Compare commits

..

2 Commits

Author SHA1 Message Date
3f7e20f702 删除diff_new.f90中冗余部分,方便后续工作 2026-02-08 00:54:23 +08:00
673dd20722 对fmisc.f90的polint修改 2026-02-07 01:56:44 +08:00
63 changed files with 69319 additions and 91874 deletions

9
.gitignore vendored
View File

@@ -1,6 +1,3 @@
__pycache__ __pycache__
GW150914 GW150914
GW150914-origin GW150914-origin
docs
*.tmp

4877
2.txt

File diff suppressed because it is too large Load Diff

445
AMSS_NCKU_ABEtest.py Normal file
View File

@@ -0,0 +1,445 @@
##################################################################
##
## AMSS-NCKU ABE Test Program (Skip TwoPuncture if data exists)
## Modified from AMSS_NCKU_Program.py
## Author: Xiaoqu
## Modified: 2026/02/01
##
##################################################################
##################################################################
## Print program introduction
import print_information
print_information.print_program_introduction()
##################################################################
import AMSS_NCKU_Input as input_data
##################################################################
## Create directories to store program run data
import os
import shutil
import sys
import time
## Set the output directory according to the input file
File_directory = os.path.join(input_data.File_directory)
## Check if output directory exists and if TwoPuncture data is available
skip_twopuncture = False
output_directory = os.path.join(File_directory, "AMSS_NCKU_output")
binary_results_directory = os.path.join(output_directory, input_data.Output_directory)
if os.path.exists(File_directory):
print( " Output directory already exists." )
print()
# Check if TwoPuncture initial data files exist
if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture"):
twopuncture_output = os.path.join(output_directory, "TwoPunctureABE")
input_par = os.path.join(output_directory, "input.par")
if os.path.exists(twopuncture_output) and os.path.exists(input_par):
print( " Found existing TwoPuncture initial data." )
print( " Do you want to skip TwoPuncture phase and reuse existing data?" )
print( " Input 'skip' to skip TwoPuncture and start ABE directly" )
print( " Input 'regenerate' to regenerate everything from scratch" )
print()
while True:
try:
inputvalue = input()
if ( inputvalue == "skip" ):
print( " Skipping TwoPuncture phase, will reuse existing initial data." )
print()
skip_twopuncture = True
break
elif ( inputvalue == "regenerate" ):
print( " Regenerating everything from scratch." )
print()
skip_twopuncture = False
break
else:
print( " Please input 'skip' or 'regenerate'." )
except ValueError:
print( " Please input 'skip' or 'regenerate'." )
else:
print( " TwoPuncture initial data not found, will regenerate everything." )
print()
# If not skipping, remove and recreate directory
if not skip_twopuncture:
shutil.rmtree(File_directory, ignore_errors=True)
os.mkdir(File_directory)
os.mkdir(output_directory)
os.mkdir(binary_results_directory)
figure_directory = os.path.join(File_directory, "figure")
os.mkdir(figure_directory)
shutil.copy("AMSS_NCKU_Input.py", File_directory)
print( " Output directory has been regenerated." )
print()
else:
# Create fresh directory structure
os.mkdir(File_directory)
shutil.copy("AMSS_NCKU_Input.py", File_directory)
os.mkdir(output_directory)
os.mkdir(binary_results_directory)
figure_directory = os.path.join(File_directory, "figure")
os.mkdir(figure_directory)
print( " Output directory has been generated." )
print()
# Ensure figure directory exists
figure_directory = os.path.join(File_directory, "figure")
if not os.path.exists(figure_directory):
os.mkdir(figure_directory)
##################################################################
## Output related parameter information
import setup
## Print and save input parameter information
setup.print_input_data( File_directory )
if not skip_twopuncture:
setup.generate_AMSSNCKU_input()
setup.print_puncture_information()
##################################################################
## Generate AMSS-NCKU program input files based on the configured parameters
if not skip_twopuncture:
print()
print( " Generating the AMSS-NCKU input parfile for the ABE executable." )
print()
## Generate cgh-related input files from the grid information
import numerical_grid
numerical_grid.append_AMSSNCKU_cgh_input()
print()
print( " The input parfile for AMSS-NCKU C++ executable file ABE has been generated." )
print( " However, the input relevant to TwoPuncture need to be appended later." )
print()
##################################################################
## Plot the initial grid configuration
if not skip_twopuncture:
print()
print( " Schematically plot the numerical grid structure." )
print()
import numerical_grid
numerical_grid.plot_initial_grid()
##################################################################
## Generate AMSS-NCKU macro files according to the numerical scheme and parameters
if not skip_twopuncture:
print()
print( " Automatically generating the macro file for AMSS-NCKU C++ executable file ABE " )
print( " (Based on the finite-difference numerical scheme) " )
print()
import generate_macrodef
generate_macrodef.generate_macrodef_h()
print( " AMSS-NCKU macro file macrodef.h has been generated. " )
generate_macrodef.generate_macrodef_fh()
print( " AMSS-NCKU macro file macrodef.fh has been generated. " )
##################################################################
# Compile the AMSS-NCKU program according to user requirements
# NOTE: ABE compilation is always performed, even when skipping TwoPuncture
print()
print( " Preparing to compile and run the AMSS-NCKU code as requested " )
print( " Compiling the AMSS-NCKU code based on the generated macro files " )
print()
AMSS_NCKU_source_path = "AMSS_NCKU_source"
AMSS_NCKU_source_copy = os.path.join(File_directory, "AMSS_NCKU_source_copy")
## If AMSS_NCKU source folder is missing, create it and prompt the user
if not os.path.exists(AMSS_NCKU_source_path):
os.makedirs(AMSS_NCKU_source_path)
print( " The AMSS-NCKU source files are incomplete; copy all source files into ./AMSS_NCKU_source. " )
print( " Press Enter to continue. " )
inputvalue = input()
# Copy AMSS-NCKU source files to prepare for compilation
# If skipping TwoPuncture and source_copy already exists, remove it first
if skip_twopuncture and os.path.exists(AMSS_NCKU_source_copy):
shutil.rmtree(AMSS_NCKU_source_copy)
shutil.copytree(AMSS_NCKU_source_path, AMSS_NCKU_source_copy)
# Copy the generated macro files into the AMSS_NCKU source folder
if not skip_twopuncture:
macrodef_h_path = os.path.join(File_directory, "macrodef.h")
macrodef_fh_path = os.path.join(File_directory, "macrodef.fh")
else:
# When skipping TwoPuncture, use existing macro files from previous run
macrodef_h_path = os.path.join(File_directory, "macrodef.h")
macrodef_fh_path = os.path.join(File_directory, "macrodef.fh")
shutil.copy2(macrodef_h_path, AMSS_NCKU_source_copy)
shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)
# Compile related programs
import makefile_and_run
## Change working directory to the target source copy
os.chdir(AMSS_NCKU_source_copy)
## Build the main AMSS-NCKU executable (ABE or ABEGPU)
makefile_and_run.makefile_ABE()
## If the initial-data method is Ansorg-TwoPuncture, build the TwoPunctureABE executable
## Only build TwoPunctureABE if not skipping TwoPuncture phase
if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ) and not skip_twopuncture:
makefile_and_run.makefile_TwoPunctureABE()
## Change current working directory back up two levels
os.chdir('..')
os.chdir('..')
print()
##################################################################
## Copy the AMSS-NCKU executable (ABE/ABEGPU) to the run directory
if (input_data.GPU_Calculation == "no"):
ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE")
elif (input_data.GPU_Calculation == "yes"):
ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABEGPU")
if not os.path.exists( ABE_file ):
print()
print( " Lack of AMSS-NCKU executable file ABE/ABEGPU; recompile AMSS_NCKU_source manually. " )
print( " When recompilation is finished, press Enter to continue. " )
inputvalue = input()
## Copy the executable ABE (or ABEGPU) into the run directory
shutil.copy2(ABE_file, output_directory)
## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory
## Only copy TwoPunctureABE if not skipping TwoPuncture phase
if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ) and not skip_twopuncture:
TwoPuncture_file = os.path.join(AMSS_NCKU_source_copy, "TwoPunctureABE")
if not os.path.exists( TwoPuncture_file ):
print()
print( " Lack of AMSS-NCKU executable file TwoPunctureABE; recompile TwoPunctureABE in AMSS_NCKU_source. " )
print( " When recompilation is finished, press Enter to continue. " )
inputvalue = input()
## Copy the TwoPunctureABE executable into the run directory
shutil.copy2(TwoPuncture_file, output_directory)
##################################################################
## If the initial-data method is TwoPuncture, generate the TwoPuncture input files
if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ) and not skip_twopuncture:
print()
print( " Initial data is chosen as Ansorg-TwoPuncture" )
print()
print()
print( " Automatically generating the input parfile for the TwoPunctureABE executable " )
print()
import generate_TwoPuncture_input
generate_TwoPuncture_input.generate_AMSSNCKU_TwoPuncture_input()
print()
print( " The input parfile for the TwoPunctureABE executable has been generated. " )
print()
## Generated AMSS-NCKU TwoPuncture input filename
AMSS_NCKU_TwoPuncture_inputfile = 'AMSS-NCKU-TwoPuncture.input'
AMSS_NCKU_TwoPuncture_inputfile_path = os.path.join( File_directory, AMSS_NCKU_TwoPuncture_inputfile )
## Copy and rename the file
shutil.copy2( AMSS_NCKU_TwoPuncture_inputfile_path, os.path.join(output_directory, 'TwoPunctureinput.par') )
## Run TwoPuncture to generate initial-data files
start_time = time.time() # Record start time
print()
print()
## Change to the output (run) directory
os.chdir(output_directory)
## Run the TwoPuncture executable
import makefile_and_run
makefile_and_run.run_TwoPunctureABE()
## Change current working directory back up two levels
os.chdir('..')
os.chdir('..')
elif (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ) and skip_twopuncture:
print()
print( " Skipping TwoPuncture execution, using existing initial data." )
print()
start_time = time.time() # Record start time for ABE only
else:
start_time = time.time() # Record start time
##################################################################
## Update puncture data based on TwoPuncture run results
if not skip_twopuncture:
import renew_puncture_parameter
renew_puncture_parameter.append_AMSSNCKU_BSSN_input(File_directory, output_directory)
## Generated AMSS-NCKU input filename
AMSS_NCKU_inputfile = 'AMSS-NCKU.input'
AMSS_NCKU_inputfile_path = os.path.join(File_directory, AMSS_NCKU_inputfile)
## Copy and rename the file
shutil.copy2( AMSS_NCKU_inputfile_path, os.path.join(output_directory, 'input.par') )
print()
print( " Successfully copy all AMSS-NCKU input parfile to target dictionary. " )
print()
else:
print()
print( " Using existing input.par file from previous run." )
print()
##################################################################
## Launch the AMSS-NCKU program
print()
print()
## Change to the run directory
os.chdir( output_directory )
import makefile_and_run
makefile_and_run.run_ABE()
## Change current working directory back up two levels
os.chdir('..')
os.chdir('..')
end_time = time.time()
elapsed_time = end_time - start_time
##################################################################
## Copy some basic input and log files out to facilitate debugging
## Path to the file that stores calculation settings
AMSS_NCKU_error_file_path = os.path.join(binary_results_directory, "setting.par")
## Copy and rename the file for easier inspection
shutil.copy( AMSS_NCKU_error_file_path, os.path.join(output_directory, "AMSSNCKU_setting_parameter") )
## Path to the error log file
AMSS_NCKU_error_file_path = os.path.join(binary_results_directory, "Error.log")
## Copy and rename the error log
shutil.copy( AMSS_NCKU_error_file_path, os.path.join(output_directory, "Error.log") )
## Primary program outputs
AMSS_NCKU_BH_data = os.path.join(binary_results_directory, "bssn_BH.dat" )
AMSS_NCKU_ADM_data = os.path.join(binary_results_directory, "bssn_ADMQs.dat" )
AMSS_NCKU_psi4_data = os.path.join(binary_results_directory, "bssn_psi4.dat" )
AMSS_NCKU_constraint_data = os.path.join(binary_results_directory, "bssn_constraint.dat")
## copy and rename the file
shutil.copy( AMSS_NCKU_BH_data, os.path.join(output_directory, "bssn_BH.dat" ) )
shutil.copy( AMSS_NCKU_ADM_data, os.path.join(output_directory, "bssn_ADMQs.dat" ) )
shutil.copy( AMSS_NCKU_psi4_data, os.path.join(output_directory, "bssn_psi4.dat" ) )
shutil.copy( AMSS_NCKU_constraint_data, os.path.join(output_directory, "bssn_constraint.dat") )
## Additional program outputs
if (input_data.Equation_Class == "BSSN-EM"):
AMSS_NCKU_phi1_data = os.path.join(binary_results_directory, "bssn_phi1.dat" )
AMSS_NCKU_phi2_data = os.path.join(binary_results_directory, "bssn_phi2.dat" )
shutil.copy( AMSS_NCKU_phi1_data, os.path.join(output_directory, "bssn_phi1.dat" ) )
shutil.copy( AMSS_NCKU_phi2_data, os.path.join(output_directory, "bssn_phi2.dat" ) )
elif (input_data.Equation_Class == "BSSN-EScalar"):
AMSS_NCKU_maxs_data = os.path.join(binary_results_directory, "bssn_maxs.dat" )
shutil.copy( AMSS_NCKU_maxs_data, os.path.join(output_directory, "bssn_maxs.dat" ) )
##################################################################
## Plot the AMSS-NCKU program results
print()
print( " Plotting the txt and binary results data from the AMSS-NCKU simulation " )
print()
import plot_xiaoqu
import plot_GW_strain_amplitude_xiaoqu
## Plot black hole trajectory
plot_xiaoqu.generate_puncture_orbit_plot( binary_results_directory, figure_directory )
plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
## Plot black hole separation vs. time
plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
## Plot gravitational waveforms (psi4 and strain amplitude)
for i in range(input_data.Detector_Number):
plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
## Plot ADM mass evolution
for i in range(input_data.Detector_Number):
plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
## Plot Hamiltonian constraint violation over time
for i in range(input_data.grid_level):
plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
## Plot stored binary data
plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
print()
print( f" This Program Cost = {elapsed_time} Seconds " )
print()
##################################################################
print()
print( " The AMSS-NCKU-Python simulation is successfully finished, thanks for using !!! " )
print()
##################################################################

View File

@@ -16,7 +16,7 @@ import numpy
File_directory = "GW150914" ## output file directory File_directory = "GW150914" ## output file directory
Output_directory = "binary_output" ## binary data file directory Output_directory = "binary_output" ## binary data file directory
## The file directory name should not be too long ## The file directory name should not be too long
MPI_processes = 2 ## number of mpi processes used in the simulation MPI_processes = 64 ## number of mpi processes used in the simulation
GPU_Calculation = "no" ## Use GPU or not GPU_Calculation = "no" ## Use GPU or not
## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface) ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
@@ -50,7 +50,7 @@ Check_Time = 100.0
Dump_Time = 100.0 ## time inteval dT for dumping binary data Dump_Time = 100.0 ## time inteval dT for dumping binary data
D2_Dump_Time = 100.0 ## dump the ascii data for 2d surface after dT' D2_Dump_Time = 100.0 ## dump the ascii data for 2d surface after dT'
Analysis_Time = 0.1 ## dump the puncture position and GW psi4 after dT" Analysis_Time = 0.1 ## dump the puncture position and GW psi4 after dT"
Evolution_Step_Number = 6 ## stop the calculation after the maximal step number Evolution_Step_Number = 10000000 ## stop the calculation after the maximal step number
Courant_Factor = 0.5 ## Courant Factor Courant_Factor = 0.5 ## Courant Factor
Dissipation = 0.15 ## Kreiss-Oliger Dissipation Strength Dissipation = 0.15 ## Kreiss-Oliger Dissipation Strength

View File

@@ -49,32 +49,32 @@ import time
File_directory = os.path.join(input_data.File_directory) File_directory = os.path.join(input_data.File_directory)
## If the specified output directory exists, ask the user whether to continue ## If the specified output directory exists, ask the user whether to continue
# if os.path.exists(File_directory): if os.path.exists(File_directory):
# print( " Output dictionary has been existed !!! " ) print( " Output dictionary has been existed !!! " )
# print( " If you want to overwrite the existing file directory, please input 'continue' in the terminal !! " ) print( " If you want to overwrite the existing file directory, please input 'continue' in the terminal !! " )
# print( " If you want to retain the existing file directory, please input 'stop' in the terminal to stop the " ) print( " If you want to retain the existing file directory, please input 'stop' in the terminal to stop the " )
# print( " simulation. Then you can reset the output dictionary in the input script file AMSS_NCKU_Input.py !!! " ) print( " simulation. Then you can reset the output dictionary in the input script file AMSS_NCKU_Input.py !!! " )
# print( ) print( )
# ## Prompt whether to overwrite the existing directory ## Prompt whether to overwrite the existing directory
# while True: while True:
# try: try:
# inputvalue = input() inputvalue = input()
# ## If the user agrees to overwrite, proceed and remove the existing directory ## If the user agrees to overwrite, proceed and remove the existing directory
# if ( inputvalue == "continue" ): if ( inputvalue == "continue" ):
# print( " Continue the calculation !!! " ) print( " Continue the calculation !!! " )
# print( ) print( )
# break break
# ## If the user chooses not to overwrite, exit and keep the existing directory ## If the user chooses not to overwrite, exit and keep the existing directory
# elif ( inputvalue == "stop" ): elif ( inputvalue == "stop" ):
# print( " Stop the calculation !!! " ) print( " Stop the calculation !!! " )
# sys.exit() sys.exit()
# ## If the user input is invalid, prompt again ## If the user input is invalid, prompt again
# else: else:
# print( " Please input your choice !!! " ) print( " Please input your choice !!! " )
# print( " Input 'continue' or 'stop' in the terminal !!! " ) print( " Input 'continue' or 'stop' in the terminal !!! " )
# except ValueError: except ValueError:
# print( " Please input your choice !!! " ) print( " Please input your choice !!! " )
# print( " Input 'continue' or 'stop' in the terminal !!! " ) print( " Input 'continue' or 'stop' in the terminal !!! " )
## Remove the existing output directory if present ## Remove the existing output directory if present
shutil.rmtree(File_directory, ignore_errors=True) shutil.rmtree(File_directory, ignore_errors=True)

View File

@@ -277,3 +277,4 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -24,7 +24,7 @@ using namespace std;
#include "misc.h" #include "misc.h"
#include "macrodef.h" #include "macrodef.h"
#include <omp.h>
#ifndef ABEtype #ifndef ABEtype
#error "not define ABEtype" #error "not define ABEtype"
#endif #endif
@@ -69,9 +69,8 @@ int main(int argc, char *argv[])
double Begin_clock, End_clock; double Begin_clock, End_clock;
if (myrank == 0) if (myrank == 0)
{ {
Begin_clock = MPI_Wtime(); Begin_clock = MPI_Wtime();
} }
if (argc > 1) if (argc > 1)

File diff suppressed because it is too large Load Diff

View File

@@ -37,51 +37,57 @@ close(77)
end program checkFFT end program checkFFT
#endif #endif
!-------------
! Optimized FFT using Intel oneMKL DFTI
! Mathematical equivalence: Standard DFT definition
! Forward (isign=1): X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
! Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
!------------- !-------------
SUBROUTINE four1(dataa,nn,isign) SUBROUTINE four1(dataa,nn,isign)
use MKL_DFTI
implicit none implicit none
INTEGER, intent(in) :: isign, nn INTEGER::isign,nn
DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa double precision,dimension(2*nn)::dataa
INTEGER::i,istep,j,m,mmax,n
type(DFTI_DESCRIPTOR), pointer :: desc double precision::tempi,tempr
integer :: status DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
n=2*nn
! Create DFTI descriptor for 1D complex-to-complex transform j=1
status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn) do i=1,n,2
if (status /= 0) return if(j.gt.i)then
tempr=dataa(j)
! Set input/output storage as interleaved complex (default) tempi=dataa(j+1)
status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE) dataa(j)=dataa(i)
if (status /= 0) then dataa(j+1)=dataa(i+1)
status = DftiFreeDescriptor(desc) dataa(i)=tempr
return dataa(i+1)=tempi
endif
m=nn
1 if ((m.ge.2).and.(j.gt.m)) then
j=j-m
m=m/2
goto 1
endif
j=j+m
enddo
mmax=2
2 if (n.gt.mmax) then
istep=2*mmax
theta=6.28318530717959d0/(isign*mmax)
wpr=-2.d0*sin(0.5d0*theta)**2
wpi=sin(theta)
wr=1.d0
wi=0.d0
do m=1,mmax,2
do i=m,n,istep
j=i+mmax
tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
dataa(j)=dataa(i)-tempr
dataa(j+1)=dataa(i+1)-tempi
dataa(i)=dataa(i)+tempr
dataa(i+1)=dataa(i+1)+tempi
enddo
wtemp=wr
wr=wr*wpr-wi*wpi+wr
wi=wi*wpr+wtemp*wpi+wi
enddo
mmax=istep
goto 2
endif endif
! Commit the descriptor
status = DftiCommitDescriptor(desc)
if (status /= 0) then
status = DftiFreeDescriptor(desc)
return
endif
! Execute FFT based on direction
if (isign == 1) then
! Forward FFT: exp(-2*pi*i*k*n/N)
status = DftiComputeForward(desc, dataa)
else
! Backward FFT: exp(+2*pi*i*k*n/N)
status = DftiComputeBackward(desc, dataa)
endif
! Free descriptor
status = DftiFreeDescriptor(desc)
return return
END SUBROUTINE four1 END SUBROUTINE four1

View File

@@ -13,7 +13,7 @@ using namespace std;
#include "MPatch.h" #include "MPatch.h"
#include "Parallel.h" #include "Parallel.h"
#include "fmisc.h" #include "fmisc.h"
#include "xh_global_interp.h"
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi) Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
{ {
@@ -341,9 +341,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
double *Shellf, int Symmetry) double *Shellf, int Symmetry)
{ {
// NOTE: we do not Synchnize variables here, make sure of that before calling this routine // NOTE: we do not Synchnize variables here, make sure of that before calling this routine
int myrank, nprocs; int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
int ordn = 2 * ghost_width; int ordn = 2 * ghost_width;
MyList<var> *varl; MyList<var> *varl;
@@ -355,18 +354,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
varl = varl->next; varl = varl->next;
} }
memset(Shellf, 0, sizeof(double) * NN * num_var); double *shellf;
shellf = new double[NN * num_var];
memset(shellf, 0, sizeof(double) * NN * num_var);
// owner_rank[j] records which MPI rank owns point j // we use weight to monitor code, later some day we can move it for optimization
// All ranks traverse the same block list so they all agree on ownership int *weight;
int *owner_rank; weight = new int[NN];
owner_rank = new int[NN]; memset(weight, 0, sizeof(int) * NN);
for (int j = 0; j < NN; j++)
owner_rank[j] = -1; double *DH, *llb, *uub;
DH = new double[dim];
double DH[dim], llb[dim], uub[dim];
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
{
DH[i] = getdX(i); DH[i] = getdX(i);
}
llb = new double[dim];
uub = new double[dim];
for (int j = 0; j < NN; j++) // run along points for (int j = 0; j < NN; j++) // run along points
{ {
@@ -394,9 +399,16 @@ void Patch::Interp_Points(MyList<var> *VarList,
while (notfind && Bp) // run along Blocks while (notfind && Bp) // run along Blocks
{ {
Block *BP = Bp->data; Block *BP = Bp->data;
bool flag = true; bool flag = true;
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
{ {
// NOTE: our dividing structure is (exclude ghost)
// -1 0
// 1 2
// so (0,1) does not belong to any part for vertex structure
// here we put (0,0.5) to left part and (0.5,1) to right part
// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
#ifdef Vertex #ifdef Vertex
#ifdef Cell #ifdef Cell
#error Both Cell and Vertex are defined #error Both Cell and Vertex are defined
@@ -421,7 +433,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
if (flag) if (flag)
{ {
notfind = false; notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank) if (myrank == BP->rank)
{ {
//---> interpolation //---> interpolation
@@ -429,187 +440,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
int k = 0; int k = 0;
while (varl) // run along variables while (varl) // run along variables
{ {
// shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], // pox,ordn,varl->data->SoA,Symmetry);
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next; varl = varl->next;
k++; k++;
} }
} weight[j] = 1;
}
if (Bp == ble)
break;
Bp = Bp->next;
}
}
// Replace MPI_Allreduce with per-owner MPI_Bcast:
// Group consecutive points by owner rank and broadcast each group.
// Since each point's data is non-zero only on the owner rank,
// Bcast from owner is equivalent to Allreduce(MPI_SUM) but much cheaper.
{
int j = 0;
while (j < NN)
{
int cur_owner = owner_rank[j];
if (cur_owner < 0)
{
if (myrank == 0)
{
cout << "ERROR: Patch::Interp_Points fails to find point (";
for (int d = 0; d < dim; d++)
{
cout << XX[d][j];
if (d < dim - 1)
cout << ",";
else
cout << ")";
}
cout << " on Patch (";
for (int d = 0; d < dim; d++)
{
cout << bbox[d] << "+" << lli[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")--";
}
cout << "(";
for (int d = 0; d < dim; d++)
{
cout << bbox[dim + d] << "-" << uui[d] * DH[d];
if (d < dim - 1)
cout << ",";
else
cout << ")" << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
j++;
continue;
}
// Find contiguous run of points with the same owner
int jstart = j;
while (j < NN && owner_rank[j] == cur_owner)
j++;
int count = (j - jstart) * num_var;
MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
}
}
delete[] owner_rank;
}
void Patch::Interp_Points(MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry,
int Nmin_consumer, int Nmax_consumer)
{
// Targeted point-to-point overload: each owner sends each point only to
// the one rank that needs it for integration (consumer), reducing
// communication volume by ~nprocs times compared to the Bcast version.
int myrank, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
// printf("here----\n");
// int zzz = 0;
int ordn = 2 * ghost_width;
MyList<var> *varl;
int num_var = 0;
varl = VarList;
while (varl)
{
num_var++;
varl = varl->next;
}
memset(Shellf, 0, sizeof(double) * NN * num_var);
// owner_rank[j] records which MPI rank owns point j
int *owner_rank;
owner_rank = new int[NN];
for (int j = 0; j < NN; j++)
owner_rank[j] = -1;
double DH[dim];
for (int i = 0; i < dim; i++)
DH[i] = getdX(i);
// --- Interpolation phase (identical to original) ---
// printf("NN: %d, num_var = %d\n", NN, num_var);
#pragma omp parallel
{
#pragma omp for
for (int j = 0; j < NN; j++)
{
double pox[dim], llb[dim], uub[dim];
MyList<var> *varl1;
for (int i = 0; i < dim; i++)
{
pox[i] = XX[i][j];
// if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
// {
// cout << "Patch::Interp_Points: point (";
// for (int k = 0; k < dim; k++)
// {
// cout << XX[k][j];
// if (k < dim - 1)
// cout << ",";
// else
// cout << ") is out of current Patch." << endl;
// }
// MPI_Abort(MPI_COMM_WORLD, 1);
// }
}
MyList<Block> *Bp = blb;
bool notfind = true;
while (notfind && Bp)
{
Block *BP = Bp->data;
bool flag = true;
for (int i = 0; i < dim; i++)
{
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
{
flag = false;
break;
}
}
// printf("flag = %d\n", flag);
if (flag)
{
notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
varl1 = VarList;
int k = 0;
while (varl1)
{
xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl1->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl1->data->SoA, Symmetry);
varl1 = varl1->next;
k++;
// zzz += 1;
}
} }
} }
if (Bp == ble) if (Bp == ble)
@@ -617,164 +455,104 @@ void Patch::Interp_Points(MyList<var> *VarList,
Bp = Bp->next; Bp = Bp->next;
} }
} }
}
// printf("Interpolation done, zzz = %d\n", zzz); MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
// --- Error check for unfound points --- int *Weight;
for (int j = 0; j < NN; j++) Weight = new int[NN];
MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
// misc::tillherecheck("print me");
for (int i = 0; i < NN; i++)
{ {
if (owner_rank[j] < 0 && myrank == 0) if (Weight[i] > 1)
{
if (myrank == 0)
cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
for (int j = 0; j < num_var; j++)
Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
}
else if (Weight[i] == 0 && myrank == 0)
{ {
cout << "ERROR: Patch::Interp_Points fails to find point ("; cout << "ERROR: Patch::Interp_Points fails to find point (";
for (int d = 0; d < dim; d++) for (int j = 0; j < dim; j++)
{ {
cout << XX[d][j]; cout << XX[j][i];
if (d < dim - 1) if (j < dim - 1)
cout << ","; cout << ",";
else else
cout << ")"; cout << ")";
} }
cout << " on Patch ("; cout << " on Patch (";
for (int d = 0; d < dim; d++) for (int j = 0; j < dim; j++)
{ {
cout << bbox[d] << "+" << lli[d] * DH[d]; cout << bbox[j] << "+" << lli[j] * getdX(j);
if (d < dim - 1) if (j < dim - 1)
cout << ","; cout << ",";
else else
cout << ")--"; cout << ")--";
} }
cout << "("; cout << "(";
for (int d = 0; d < dim; d++) for (int j = 0; j < dim; j++)
{ {
cout << bbox[dim + d] << "-" << uui[d] * DH[d]; cout << bbox[dim + j] << "-" << uui[j] * getdX(j);
if (d < dim - 1) if (j < dim - 1)
cout << ","; cout << ",";
else else
cout << ")" << endl; cout << ")" << endl;
} }
#if 0
checkBlock();
#else
cout << "splited domains:" << endl;
{
MyList<Block> *Bp = blb;
while (Bp)
{
Block *BP = Bp->data;
for (int i = 0; i < dim; i++)
{
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
}
cout << "(";
for (int j = 0; j < dim; j++)
{
cout << llb[j] << ":" << uub[j];
if (j < dim - 1)
cout << ",";
else
cout << ")" << endl;
}
if (Bp == ble)
break;
Bp = Bp->next;
}
}
#endif
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
} }
// --- Targeted point-to-point communication phase --- delete[] shellf;
// Compute consumer_rank[j] using the same deterministic formula as surface_integral delete[] weight;
int *consumer_rank = new int[NN]; delete[] Weight;
{ delete[] DH;
int mp = NN / nprocs; delete[] llb;
int Lp = NN - nprocs * mp; delete[] uub;
for (int j = 0; j < NN; j++)
{
if (j < Lp * (mp + 1))
consumer_rank[j] = j / (mp + 1);
else
consumer_rank[j] = Lp + (j - Lp * (mp + 1)) / mp;
}
}
// Count sends and recvs per rank
int *send_count = new int[nprocs];
int *recv_count = new int[nprocs];
memset(send_count, 0, sizeof(int) * nprocs);
memset(recv_count, 0, sizeof(int) * nprocs);
for (int j = 0; j < NN; j++)
{
int own = owner_rank[j];
int con = consumer_rank[j];
if (own == con)
continue; // local — no communication needed
if (own == myrank)
send_count[con]++;
if (con == myrank)
recv_count[own]++;
}
// Build send buffers: for each destination rank, pack (index, data) pairs
// Each entry: 1 int (point index j) + num_var doubles
int total_send = 0, total_recv = 0;
int *send_offset = new int[nprocs];
int *recv_offset = new int[nprocs];
for (int r = 0; r < nprocs; r++)
{
send_offset[r] = total_send;
total_send += send_count[r];
recv_offset[r] = total_recv;
total_recv += recv_count[r];
}
// Pack send buffers: each message contains (j, data[0..num_var-1]) per point
int stride = 1 + num_var; // 1 double for index + num_var doubles for data
double *sendbuf = new double[total_send * stride];
double *recvbuf = new double[total_recv * stride];
// Temporary counters for packing
int *pack_pos = new int[nprocs];
memset(pack_pos, 0, sizeof(int) * nprocs);
for (int j = 0; j < NN; j++)
{
int own = owner_rank[j];
int con = consumer_rank[j];
if (own != myrank || con == myrank)
continue;
int pos = (send_offset[con] + pack_pos[con]) * stride;
sendbuf[pos] = (double)j; // point index
for (int v = 0; v < num_var; v++)
sendbuf[pos + 1 + v] = Shellf[j * num_var + v];
pack_pos[con]++;
}
// Post non-blocking recvs and sends
int n_req = 0;
for (int r = 0; r < nprocs; r++)
{
if (recv_count[r] > 0) n_req++;
if (send_count[r] > 0) n_req++;
}
MPI_Request *reqs = new MPI_Request[n_req];
int req_idx = 0;
for (int r = 0; r < nprocs; r++)
{
if (recv_count[r] > 0)
{
MPI_Irecv(recvbuf + recv_offset[r] * stride,
recv_count[r] * stride, MPI_DOUBLE,
r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
}
}
for (int r = 0; r < nprocs; r++)
{
if (send_count[r] > 0)
{
MPI_Isend(sendbuf + send_offset[r] * stride,
send_count[r] * stride, MPI_DOUBLE,
r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
}
}
if (n_req > 0)
MPI_Waitall(n_req, reqs, MPI_STATUSES_IGNORE);
// Unpack recv buffers into Shellf
for (int i = 0; i < total_recv; i++)
{
int pos = i * stride;
int j = (int)recvbuf[pos];
for (int v = 0; v < num_var; v++)
Shellf[j * num_var + v] = recvbuf[pos + 1 + v];
}
delete[] reqs;
delete[] sendbuf;
delete[] recvbuf;
delete[] pack_pos;
delete[] send_offset;
delete[] recv_offset;
delete[] send_count;
delete[] recv_count;
delete[] consumer_rank;
delete[] owner_rank;
} }
void Patch::Interp_Points(MyList<var> *VarList, void Patch::Interp_Points(MyList<var> *VarList,
int NN, double **XX, int NN, double **XX,
@@ -784,6 +562,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
int myrank, lmyrank; int myrank, lmyrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_rank(Comm_here, &lmyrank); MPI_Comm_rank(Comm_here, &lmyrank);
int ordn = 2 * ghost_width; int ordn = 2 * ghost_width;
MyList<var> *varl; MyList<var> *varl;
int num_var = 0; int num_var = 0;
@@ -794,22 +573,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
varl = varl->next; varl = varl->next;
} }
memset(Shellf, 0, sizeof(double) * NN * num_var); double *shellf;
shellf = new double[NN * num_var];
memset(shellf, 0, sizeof(double) * NN * num_var);
// owner_rank[j] stores the global rank that owns point j // we use weight to monitor code, later some day we can move it for optimization
int *owner_rank; int *weight;
owner_rank = new int[NN]; weight = new int[NN];
for (int j = 0; j < NN; j++) memset(weight, 0, sizeof(int) * NN);
owner_rank[j] = -1;
// Build global-to-local rank translation for Comm_here double *DH, *llb, *uub;
MPI_Group world_group, local_group; DH = new double[dim];
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
MPI_Comm_group(Comm_here, &local_group);
double DH[dim], llb[dim], uub[dim];
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
{
DH[i] = getdX(i); DH[i] = getdX(i);
}
llb = new double[dim];
uub = new double[dim];
for (int j = 0; j < NN; j++) // run along points for (int j = 0; j < NN; j++) // run along points
{ {
@@ -841,6 +622,12 @@ void Patch::Interp_Points(MyList<var> *VarList,
bool flag = true; bool flag = true;
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
{ {
// NOTE: our dividing structure is (exclude ghost)
// -1 0
// 1 2
// so (0,1) does not belong to any part for vertex structure
// here we put (0,0.5) to left part and (0.5,1) to right part
// BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
#ifdef Vertex #ifdef Vertex
#ifdef Cell #ifdef Cell
#error Both Cell and Vertex are defined #error Both Cell and Vertex are defined
@@ -865,7 +652,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
if (flag) if (flag)
{ {
notfind = false; notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank) if (myrank == BP->rank)
{ {
//---> interpolation //---> interpolation
@@ -873,11 +659,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
int k = 0; int k = 0;
while (varl) // run along variables while (varl) // run along variables
{ {
xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], // shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
// pox,ordn,varl->data->SoA,Symmetry);
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next; varl = varl->next;
k++; k++;
} }
weight[j] = 1;
} }
} }
if (Bp == ble) if (Bp == ble)
@@ -886,35 +675,97 @@ void Patch::Interp_Points(MyList<var> *VarList,
} }
} }
// Collect unique global owner ranks and translate to local ranks in Comm_here MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, Comm_here);
// Then broadcast each owner's points via MPI_Bcast on Comm_here int *Weight;
{ Weight = new int[NN];
int j = 0; MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, Comm_here);
while (j < NN)
{
int cur_owner_global = owner_rank[j];
if (cur_owner_global < 0)
{
// Point not found — skip (error check disabled for sub-communicator levels)
j++;
continue;
}
// Translate global rank to local rank in Comm_here
int cur_owner_local;
MPI_Group_translate_ranks(world_group, 1, &cur_owner_global, local_group, &cur_owner_local);
// Find contiguous run of points with the same owner // misc::tillherecheck("print me");
int jstart = j; // if(lmyrank == 0) cout<<"myrank = "<<myrank<<"print me"<<endl;
while (j < NN && owner_rank[j] == cur_owner_global)
j++; for (int i = 0; i < NN; i++)
int count = (j - jstart) * num_var; {
MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner_local, Comm_here); if (Weight[i] > 1)
{
if (lmyrank == 0)
cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
for (int j = 0; j < num_var; j++)
Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
} }
#if 0 // for not involved levels, this may fail
else if(Weight[i] == 0 && lmyrank == 0)
{
cout<<"ERROR: Patch::Interp_Points fails to find point (";
for(int j=0;j<dim;j++)
{
cout<<XX[j][i];
if(j<dim-1) cout<<",";
else cout<<")";
}
cout<<" on Patch (";
for(int j=0;j<dim;j++)
{
cout<<bbox[j]<<"+"<<lli[j]*getdX(j);
if(j<dim-1) cout<<",";
else cout<<")--";
}
cout<<"(";
for(int j=0;j<dim;j++)
{
cout<<bbox[dim+j]<<"-"<<uui[j]*getdX(j);
if(j<dim-1) cout<<",";
else cout<<")"<<endl;
}
#if 0
checkBlock();
#else
cout<<"splited domains:"<<endl;
{
MyList<Block> *Bp=blb;
while(Bp)
{
Block *BP=Bp->data;
for(int i=0;i<dim;i++)
{
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
llb[i] = (feq(BP->bbox[i] ,bbox[i] ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i] : BP->bbox[i] +(ghost_width-0.5)*DH[i];
uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-(ghost_width-0.5)*DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i] ,bbox[i] ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i] : BP->bbox[i] +ghost_width*DH[i];
uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-ghost_width*DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
}
cout<<"(";
for(int j=0;j<dim;j++)
{
cout<<llb[j]<<":"<<uub[j];
if(j<dim-1) cout<<",";
else cout<<")"<<endl;
}
if(Bp == ble) break;
Bp=Bp->next;
}
}
#endif
MPI_Abort(MPI_COMM_WORLD,1);
}
#endif
} }
MPI_Group_free(&world_group); delete[] shellf;
MPI_Group_free(&local_group); delete[] weight;
delete[] owner_rank; delete[] Weight;
delete[] DH;
delete[] llb;
delete[] uub;
} }
void Patch::checkBlock() void Patch::checkBlock()
{ {
@@ -1105,7 +956,7 @@ bool Patch::Interp_ONE_Point(MyList<var> *VarList, double *XX,
{ {
// shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn], // shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
// pox,ordn,varl->data->SoA,Symmetry); // pox,ordn,varl->data->SoA,Symmetry);
xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k], f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next; varl = varl->next;
k++; k++;
@@ -1207,7 +1058,7 @@ bool Patch::Interp_ONE_Point(MyList<var> *VarList, double *XX,
// NOTE: we do not Synchnize variables here, make sure of that before calling this routine // NOTE: we do not Synchnize variables here, make sure of that before calling this routine
int myrank; int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
int ordn = 2 * ghost_width; int ordn = 2 * ghost_width;
MyList<var> *varl; MyList<var> *varl;
int num_var = 0; int num_var = 0;
@@ -1347,7 +1198,7 @@ bool Patch::Interp_ONE_Point(MyList<var> *VarList, double *XX,
{ {
// shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn], // shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
// pox,ordn,varl->data->SoA,Symmetry); // pox,ordn,varl->data->SoA,Symmetry);
xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k], f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next; varl = varl->next;
k++; k++;

View File

@@ -39,10 +39,6 @@ public:
bool Find_Point(double *XX); bool Find_Point(double *XX);
void Interp_Points(MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry,
int Nmin_consumer, int Nmax_consumer);
void Interp_Points(MyList<var> *VarList, void Interp_Points(MyList<var> *VarList,
int NN, double **XX, int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here); double *Shellf, int Symmetry, MPI_Comm Comm_here);

View File

@@ -4,7 +4,7 @@
#include "prolongrestrict.h" #include "prolongrestrict.h"
#include "misc.h" #include "misc.h"
#include "parameters.h" #include "parameters.h"
#include <omp.h>
int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion
{ {
nx = Mymax(1, shape / min_width); nx = Mymax(1, shape / min_width);
@@ -3338,7 +3338,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
{ {
int myrank; int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
// double time1 = omp_get_wtime();
int DIM = dim; int DIM = dim;
if (dir != PACK && dir != UNPACK) if (dir != PACK && dir != UNPACK)
@@ -3361,6 +3361,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
varls = varls->next; varls = varls->next;
varld = varld->next; varld = varld->next;
} }
if (varls || varld) if (varls || varld)
{ {
cout << "error in short data packer, var lists does not match." << endl; cout << "error in short data packer, var lists does not match." << endl;
@@ -3374,6 +3375,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
type = 2; type = 2;
else else
type = 3; type = 3;
while (src && dst) while (src && dst)
{ {
if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
@@ -3383,7 +3385,6 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
varld = VarListd; varld = VarListd;
while (varls && varld) while (varls && varld)
{ {
if (data) if (data)
{ {
if (dir == PACK) if (dir == PACK)
@@ -3404,7 +3405,6 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
} }
if (dir == UNPACK) // from target data to corresponding grid if (dir == UNPACK) // from target data to corresponding grid
f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn], f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
@@ -3418,14 +3418,8 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
} }
dst = dst->next; dst = dst->next;
src = src->next; src = src->next;
} }
// double time2 = omp_get_wtime();
// xxx += time2 - time1;
// if(myrank == 0){
// printf("prolong3 time = %lf\n", time2 - time1);
// }
return size_out; return size_out;
} }
int Parallel::data_packermix(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir, int Parallel::data_packermix(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir,
@@ -3520,7 +3514,7 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
int node; int node;
// double time1 = omp_get_wtime();
MPI_Request *reqs; MPI_Request *reqs;
MPI_Status *stats; MPI_Status *stats;
reqs = new MPI_Request[2 * cpusize]; reqs = new MPI_Request[2 * cpusize];
@@ -3589,9 +3583,7 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
if (rec_data[node]) if (rec_data[node])
delete[] rec_data[node]; delete[] rec_data[node];
} }
// double time2 = omp_get_wtime();
// if (myrank == 0)
// printf("transfer time = %lf\n", time2 - time1);
delete[] reqs; delete[] reqs;
delete[] stats; delete[] stats;
delete[] send_data; delete[] send_data;
@@ -3764,502 +3756,6 @@ void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
delete[] transfer_src; delete[] transfer_src;
delete[] transfer_dst; delete[] transfer_dst;
} }
// Merged Sync: collect all intra-patch and inter-patch grid segment lists,
// then issue a single transfer() call instead of N+1 separate ones.
void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
{
int cpusize;
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
MyList<Parallel::gridseg> **combined_src = new MyList<Parallel::gridseg> *[cpusize];
MyList<Parallel::gridseg> **combined_dst = new MyList<Parallel::gridseg> *[cpusize];
for (int node = 0; node < cpusize; node++)
combined_src[node] = combined_dst[node] = 0;
// Phase A: Intra-patch ghost exchange segments
MyList<Patch> *Pp = PatL;
while (Pp)
{
Patch *Pat = Pp->data;
MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
for (int node = 0; node < cpusize; node++)
{
MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
if (tsrc)
{
if (combined_src[node])
combined_src[node]->catList(tsrc);
else
combined_src[node] = tsrc;
}
if (tdst)
{
if (combined_dst[node])
combined_dst[node]->catList(tdst);
else
combined_dst[node] = tdst;
}
if (src_owned)
src_owned->destroyList();
}
if (dst_ghost)
dst_ghost->destroyList();
Pp = Pp->next;
}
// Phase B: Inter-patch buffer exchange segments
MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
for (int node = 0; node < cpusize; node++)
{
MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
if (tsrc)
{
if (combined_src[node])
combined_src[node]->catList(tsrc);
else
combined_src[node] = tsrc;
}
if (tdst)
{
if (combined_dst[node])
combined_dst[node]->catList(tdst);
else
combined_dst[node] = tdst;
}
if (src_owned)
src_owned->destroyList();
}
if (dst_buffer)
dst_buffer->destroyList();
// Phase C: Single transfer
transfer(combined_src, combined_dst, VarList, VarList, Symmetry);
// Phase D: Cleanup
for (int node = 0; node < cpusize; node++)
{
if (combined_src[node])
combined_src[node]->destroyList();
if (combined_dst[node])
combined_dst[node]->destroyList();
}
delete[] combined_src;
delete[] combined_dst;
}
// SyncCache constructor
Parallel::SyncCache::SyncCache()
: valid(false), cpusize(0), combined_src(0), combined_dst(0),
send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
lengths_valid(false)
{
}
// SyncCache invalidate: free grid segment lists but keep buffers
void Parallel::SyncCache::invalidate()
{
if (!valid)
return;
for (int i = 0; i < cpusize; i++)
{
if (combined_src[i])
combined_src[i]->destroyList();
if (combined_dst[i])
combined_dst[i]->destroyList();
combined_src[i] = combined_dst[i] = 0;
send_lengths[i] = recv_lengths[i] = 0;
}
valid = false;
lengths_valid = false;
}
// SyncCache destroy: free everything
void Parallel::SyncCache::destroy()
{
invalidate();
if (combined_src) delete[] combined_src;
if (combined_dst) delete[] combined_dst;
if (send_lengths) delete[] send_lengths;
if (recv_lengths) delete[] recv_lengths;
if (send_buf_caps) delete[] send_buf_caps;
if (recv_buf_caps) delete[] recv_buf_caps;
for (int i = 0; i < cpusize; i++)
{
if (send_bufs && send_bufs[i]) delete[] send_bufs[i];
if (recv_bufs && recv_bufs[i]) delete[] recv_bufs[i];
}
if (send_bufs) delete[] send_bufs;
if (recv_bufs) delete[] recv_bufs;
if (reqs) delete[] reqs;
if (stats) delete[] stats;
combined_src = combined_dst = 0;
send_lengths = recv_lengths = 0;
send_buf_caps = recv_buf_caps = 0;
send_bufs = recv_bufs = 0;
reqs = 0; stats = 0;
cpusize = 0; max_reqs = 0;
}
// transfer_cached: reuse pre-allocated buffers from SyncCache
void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache)
{
int myrank;
MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
int cpusize = cache.cpusize;
int req_no = 0;
int node;
for (node = 0; node < cpusize; node++)
{
if (node == myrank)
{
int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[node] = length;
if (length > 0)
{
if (length > cache.recv_buf_caps[node])
{
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
cache.recv_bufs[node] = new double[length];
cache.recv_buf_caps[node] = length;
}
data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
}
}
else
{
// send
int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
cache.send_lengths[node] = slength;
if (slength > 0)
{
if (slength > cache.send_buf_caps[node])
{
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
cache.send_bufs[node] = new double[slength];
cache.send_buf_caps[node] = slength;
}
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
}
// recv
int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
cache.recv_lengths[node] = rlength;
if (rlength > 0)
{
if (rlength > cache.recv_buf_caps[node])
{
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
cache.recv_bufs[node] = new double[rlength];
cache.recv_buf_caps[node] = rlength;
}
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
}
}
}
MPI_Waitall(req_no, cache.reqs, cache.stats);
for (node = 0; node < cpusize; node++)
if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
}
// Sync_cached: build grid segment lists on first call, reuse on subsequent calls
void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
{
if (!cache.valid)
{
int cpusize;
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
cache.cpusize = cpusize;
// Allocate cache arrays if needed
if (!cache.combined_src)
{
cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
cache.send_lengths = new int[cpusize];
cache.recv_lengths = new int[cpusize];
cache.send_bufs = new double *[cpusize];
cache.recv_bufs = new double *[cpusize];
cache.send_buf_caps = new int[cpusize];
cache.recv_buf_caps = new int[cpusize];
for (int i = 0; i < cpusize; i++)
{
cache.send_bufs[i] = cache.recv_bufs[i] = 0;
cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
}
cache.max_reqs = 2 * cpusize;
cache.reqs = new MPI_Request[cache.max_reqs];
cache.stats = new MPI_Status[cache.max_reqs];
}
for (int node = 0; node < cpusize; node++)
{
cache.combined_src[node] = cache.combined_dst[node] = 0;
cache.send_lengths[node] = cache.recv_lengths[node] = 0;
}
// Build intra-patch segments (same as Sync_merged Phase A)
MyList<Patch> *Pp = PatL;
while (Pp)
{
Patch *Pat = Pp->data;
MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
for (int node = 0; node < cpusize; node++)
{
MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
if (tsrc)
{
if (cache.combined_src[node])
cache.combined_src[node]->catList(tsrc);
else
cache.combined_src[node] = tsrc;
}
if (tdst)
{
if (cache.combined_dst[node])
cache.combined_dst[node]->catList(tdst);
else
cache.combined_dst[node] = tdst;
}
if (src_owned) src_owned->destroyList();
}
if (dst_ghost) dst_ghost->destroyList();
Pp = Pp->next;
}
// Build inter-patch segments (same as Sync_merged Phase B)
MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
for (int node = 0; node < cpusize; node++)
{
MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
if (tsrc)
{
if (cache.combined_src[node])
cache.combined_src[node]->catList(tsrc);
else
cache.combined_src[node] = tsrc;
}
if (tdst)
{
if (cache.combined_dst[node])
cache.combined_dst[node]->catList(tdst);
else
cache.combined_dst[node] = tdst;
}
if (src_owned) src_owned->destroyList();
}
if (dst_buffer) dst_buffer->destroyList();
cache.valid = true;
}
// Use cached lists with buffer-reusing transfer
transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache);
}
// Sync_start: pack and post MPI_Isend/Irecv, return immediately
void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
SyncCache &cache, AsyncSyncState &state)
{
// Ensure cache is built
if (!cache.valid)
{
// Build cache (same logic as Sync_cached)
int cpusize;
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
cache.cpusize = cpusize;
if (!cache.combined_src)
{
cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
cache.send_lengths = new int[cpusize];
cache.recv_lengths = new int[cpusize];
cache.send_bufs = new double *[cpusize];
cache.recv_bufs = new double *[cpusize];
cache.send_buf_caps = new int[cpusize];
cache.recv_buf_caps = new int[cpusize];
for (int i = 0; i < cpusize; i++)
{
cache.send_bufs[i] = cache.recv_bufs[i] = 0;
cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
}
cache.max_reqs = 2 * cpusize;
cache.reqs = new MPI_Request[cache.max_reqs];
cache.stats = new MPI_Status[cache.max_reqs];
}
for (int node = 0; node < cpusize; node++)
{
cache.combined_src[node] = cache.combined_dst[node] = 0;
cache.send_lengths[node] = cache.recv_lengths[node] = 0;
}
MyList<Patch> *Pp = PatL;
while (Pp)
{
Patch *Pat = Pp->data;
MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
for (int node = 0; node < cpusize; node++)
{
MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
if (tsrc)
{
if (cache.combined_src[node])
cache.combined_src[node]->catList(tsrc);
else
cache.combined_src[node] = tsrc;
}
if (tdst)
{
if (cache.combined_dst[node])
cache.combined_dst[node]->catList(tdst);
else
cache.combined_dst[node] = tdst;
}
if (src_owned) src_owned->destroyList();
}
if (dst_ghost) dst_ghost->destroyList();
Pp = Pp->next;
}
MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
for (int node = 0; node < cpusize; node++)
{
MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
if (tsrc)
{
if (cache.combined_src[node])
cache.combined_src[node]->catList(tsrc);
else
cache.combined_src[node] = tsrc;
}
if (tdst)
{
if (cache.combined_dst[node])
cache.combined_dst[node]->catList(tdst);
else
cache.combined_dst[node] = tdst;
}
if (src_owned) src_owned->destroyList();
}
if (dst_buffer) dst_buffer->destroyList();
cache.valid = true;
}
// Now pack and post async MPI operations
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
int cpusize = cache.cpusize;
state.req_no = 0;
state.active = true;
MyList<Parallel::gridseg> **src = cache.combined_src;
MyList<Parallel::gridseg> **dst = cache.combined_dst;
for (int node = 0; node < cpusize; node++)
{
if (node == myrank)
{
int length;
if (!cache.lengths_valid) {
length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
cache.recv_lengths[node] = length;
} else {
length = cache.recv_lengths[node];
}
if (length > 0)
{
if (length > cache.recv_buf_caps[node])
{
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
cache.recv_bufs[node] = new double[length];
cache.recv_buf_caps[node] = length;
}
data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
}
}
else
{
int slength;
if (!cache.lengths_valid) {
slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
cache.send_lengths[node] = slength;
} else {
slength = cache.send_lengths[node];
}
if (slength > 0)
{
if (slength > cache.send_buf_caps[node])
{
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
cache.send_bufs[node] = new double[slength];
cache.send_buf_caps[node] = slength;
}
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
}
int rlength;
if (!cache.lengths_valid) {
rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
cache.recv_lengths[node] = rlength;
} else {
rlength = cache.recv_lengths[node];
}
if (rlength > 0)
{
if (rlength > cache.recv_buf_caps[node])
{
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
cache.recv_bufs[node] = new double[rlength];
cache.recv_buf_caps[node] = rlength;
}
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
}
}
}
cache.lengths_valid = true;
}
// Sync_finish: wait for async MPI operations and unpack
void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
MyList<var> *VarList, int Symmetry)
{
if (!state.active)
return;
MPI_Waitall(state.req_no, cache.reqs, cache.stats);
int cpusize = cache.cpusize;
MyList<Parallel::gridseg> **src = cache.combined_src;
MyList<Parallel::gridseg> **dst = cache.combined_dst;
for (int node = 0; node < cpusize; node++)
if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
state.active = false;
}
// collect buffer grid segments or blocks for the periodic boundary condition of given patch // collect buffer grid segments or blocks for the periodic boundary condition of given patch
// --------------------------------------------------- // ---------------------------------------------------
// |con | |con | // |con | |con |

View File

@@ -81,43 +81,6 @@ namespace Parallel
int Symmetry); int Symmetry);
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry); void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry); void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
struct SyncCache {
bool valid;
int cpusize;
MyList<gridseg> **combined_src;
MyList<gridseg> **combined_dst;
int *send_lengths;
int *recv_lengths;
double **send_bufs;
double **recv_bufs;
int *send_buf_caps;
int *recv_buf_caps;
MPI_Request *reqs;
MPI_Status *stats;
int max_reqs;
bool lengths_valid;
SyncCache();
void invalidate();
void destroy();
};
void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
struct AsyncSyncState {
int req_no;
bool active;
AsyncSyncState() : req_no(0), active(false) {}
};
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
SyncCache &cache, AsyncSyncState &state);
void Sync_finish(SyncCache &cache, AsyncSyncState &state,
MyList<var> *VarList, int Symmetry);
void OutBdLow2Hi(Patch *Patc, Patch *Patf, void OutBdLow2Hi(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry); int Symmetry);

File diff suppressed because it is too large Load Diff

View File

@@ -1,8 +1,7 @@
#ifndef TWO_PUNCTURES_H #ifndef TWO_PUNCTURES_H
#define TWO_PUNCTURES_H #define TWO_PUNCTURES_H
#include <omp.h>
#define StencilSize 19 #define StencilSize 19
#define N_PlaneRelax 1 #define N_PlaneRelax 1
#define NRELAX 200 #define NRELAX 200
@@ -33,7 +32,7 @@ private:
int npoints_A, npoints_B, npoints_phi; int npoints_A, npoints_B, npoints_phi;
double target_M_plus, target_M_minus; double target_M_plus, target_M_minus;
double admMass; double admMass;
double adm_tol; double adm_tol;
@@ -43,18 +42,6 @@ private:
int ntotal; int ntotal;
// ===== Precomputed spectral derivative matrices =====
double *D1_A, *D2_A;
double *D1_B, *D2_B;
double *DF1_phi, *DF2_phi;
// ===== Pre-allocated workspace for LineRelax (per-thread) =====
int max_threads;
double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be;
double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be;
double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al;
double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al;
struct parameters struct parameters
{ {
int nvar, n1, n2, n3; int nvar, n1, n2, n3;
@@ -71,28 +58,6 @@ public:
int Newtonmaxit); int Newtonmaxit);
~TwoPunctures(); ~TwoPunctures();
// 02/07: New/modified methods
void allocate_workspace();
void free_workspace();
void precompute_derivative_matrices();
void build_cheb_deriv_matrices(int n, double *D1, double *D2);
void build_fourier_deriv_matrices(int N, double *DF1, double *DF2);
void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v);
void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q,
double *l, double *u_ws, double *d, double *y);
void LineRelax_be_omp(double *dv,
int const i, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols,
double **JFD, int tid);
void LineRelax_al_omp(double *dv,
int const j, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols,
int **cols, double **JFD, int tid);
void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols, double **JFD);
void Solve(); void Solve();
void set_initial_guess(derivs v); void set_initial_guess(derivs v);
int index(int i, int j, int k, int l, int a, int b, int c, int d); int index(int i, int j, int k, int l, int a, int b, int c, int d);
@@ -151,11 +116,23 @@ public:
double BY_KKofxyz(double x, double y, double z); double BY_KKofxyz(double x, double y, double z);
void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix); void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix);
void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u); void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u);
void relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols, double **JFD);
void LineRelax_be(double *dv,
int const i, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols,
double **JFD);
void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2, void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
int n3, derivs dv, derivs u, double *values); int n3, derivs dv, derivs u, double *values);
void LinEquations(double A, double B, double X, double R, void LinEquations(double A, double B, double X, double R,
double x, double r, double phi, double x, double r, double phi,
double y, double z, derivs dU, derivs U, double *values); double y, double z, derivs dU, derivs U, double *values);
void LineRelax_al(double *dv,
int const j, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols,
int **cols, double **JFD);
void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q); void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q);
void Save(char *fname); void Save(char *fname);
// provided by Vasileios Paschalidis (vpaschal@illinois.edu) // provided by Vasileios Paschalidis (vpaschal@illinois.edu)
@@ -164,4 +141,4 @@ public:
void SpecCoef(parameters par, int ivar, double *v, double *cf); void SpecCoef(parameters par, int ivar, double *v, double *cf);
}; };
#endif /* TWO_PUNCTURES_H */ #endif /* TWO_PUNCTURES_H */

View File

@@ -40,7 +40,7 @@ using namespace std;
#include "derivatives.h" #include "derivatives.h"
#include "ricci_gamma.h" #include "ricci_gamma.h"
#include "xh_bssn_rhs_compute.h"
//================================================================================================ //================================================================================================
// define bssn_class // define bssn_class
@@ -730,12 +730,6 @@ void bssn_class::Initialize()
PhysTime = StartTime; PhysTime = StartTime;
Setup_Black_Hole_position(); Setup_Black_Hole_position();
} }
// Initialize sync caches (per-level, for predictor and corrector)
sync_cache_pre = new Parallel::SyncCache[GH->levels];
sync_cache_cor = new Parallel::SyncCache[GH->levels];
sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
} }
//================================================================================================ //================================================================================================
@@ -987,32 +981,6 @@ bssn_class::~bssn_class()
delete Azzz; delete Azzz;
#endif #endif
// Destroy sync caches before GH
if (sync_cache_pre)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_pre[i].destroy();
delete[] sync_cache_pre;
}
if (sync_cache_cor)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_cor[i].destroy();
delete[] sync_cache_cor;
}
if (sync_cache_rp_coarse)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_rp_coarse[i].destroy();
delete[] sync_cache_rp_coarse;
}
if (sync_cache_rp_fine)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_rp_fine[i].destroy();
delete[] sync_cache_rp_fine;
}
delete GH; delete GH;
#ifdef WithShell #ifdef WithShell
delete SH; delete SH;
@@ -2029,7 +1997,6 @@ void bssn_class::Read_Ansorg()
void bssn_class::Evolve(int Steps) void bssn_class::Evolve(int Steps)
{ {
clock_t prev_clock, curr_clock; clock_t prev_clock, curr_clock;
double prev_time, curr_time;
double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0; double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
LastAnas = 0; LastAnas = 0;
#if 0 #if 0
@@ -2142,10 +2109,8 @@ void bssn_class::Evolve(int Steps)
// if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6) // if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6)
// { GH->levels=GH->movls; } // { GH->levels=GH->movls; }
if (myrank == 0){ if (myrank == 0)
curr_clock = clock(); curr_clock = clock();
curr_time = omp_get_wtime();
}
#if (PSTR == 0) #if (PSTR == 0)
RecursiveStep(0); RecursiveStep(0);
#elif (PSTR == 1 || PSTR == 2 || PSTR == 3) #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
@@ -2201,17 +2166,12 @@ void bssn_class::Evolve(int Steps)
if (myrank == 0) if (myrank == 0)
{ {
prev_clock = curr_clock; prev_clock = curr_clock;
prev_time = curr_time;
curr_clock = clock(); curr_clock = clock();
curr_time = omp_get_wtime();
cout << endl; cout << endl;
// cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime << " "
// << " Computer used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
// << " seconds! " << endl;
// // cout << endl;
cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime << " " cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime << " "
<< " Computer used " << (curr_time - prev_time) << " Computer used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
// cout << endl;
} }
if (PhysTime >= TotalTime) if (PhysTime >= TotalTime)
@@ -2221,7 +2181,6 @@ void bssn_class::Evolve(int Steps)
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif #endif
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2)) #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2437,7 +2396,6 @@ void bssn_class::RecursiveStep(int lev)
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif #endif
} }
@@ -2616,7 +2574,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif #endif
} }
@@ -2783,7 +2740,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor); fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2798,7 +2754,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2817,7 +2772,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2833,7 +2787,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -3100,7 +3053,7 @@ void bssn_class::Step(int lev, int YN)
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif #endif
if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -3205,7 +3158,21 @@ void bssn_class::Step(int lev, int YN)
} }
Pp = Pp->next; Pp = Pp->next;
} }
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell #ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
@@ -3223,9 +3190,9 @@ void bssn_class::Step(int lev, int YN)
{ {
#if (AGM == 0) #if (AGM == 0)
f_enforce_ga(cg->shape, f_enforce_ga(cg->shape,
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif #endif
@@ -3300,7 +3267,7 @@ void bssn_class::Step(int lev, int YN)
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1; ERROR = 1;
} }
// cout<<"....................................."<<endl;
// rk4 substep and boundary // rk4 substep and boundary
{ {
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList;
@@ -3349,16 +3316,25 @@ void bssn_class::Step(int lev, int YN)
#endif #endif
} }
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
#endif #endif
Parallel::AsyncSyncState async_pre; Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -3371,29 +3347,12 @@ void bssn_class::Step(int lev, int YN)
{ {
prev_clock = curr_clock; prev_clock = curr_clock;
curr_clock = clock(); curr_clock = clock();
cout << " Shell stuff synchronization used " cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
} }
} }
#endif #endif
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
@@ -3465,7 +3424,7 @@ void bssn_class::Step(int lev, int YN)
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#endif #endif
if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn], cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -3569,7 +3528,24 @@ void bssn_class::Step(int lev, int YN)
Pp = Pp->next; Pp = Pp->next;
} }
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell #ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
@@ -3587,9 +3563,9 @@ void bssn_class::Step(int lev, int YN)
{ {
#if (AGM == 0) #if (AGM == 0)
f_enforce_ga(cg->shape, f_enforce_ga(cg->shape,
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#elif (AGM == 1) #elif (AGM == 1)
if (iter_count == 3) if (iter_count == 3)
@@ -3709,16 +3685,26 @@ void bssn_class::Step(int lev, int YN)
sPp = sPp->next; sPp = sPp->next;
} }
} }
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req_cor;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#"
<< iter_count << " variables at t = "
<< PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
#endif #endif
Parallel::AsyncSyncState async_cor; Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -3731,31 +3717,12 @@ void bssn_class::Step(int lev, int YN)
{ {
prev_clock = curr_clock; prev_clock = curr_clock;
curr_clock = clock(); curr_clock = clock();
cout << " Shell stuff synchronization used " cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
} }
} }
#endif #endif
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
@@ -3978,7 +3945,7 @@ void bssn_class::Step(int lev, int YN)
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif #endif
if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -4067,7 +4034,22 @@ void bssn_class::Step(int lev, int YN)
} }
Pp = Pp->next; Pp = Pp->next;
} }
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell #ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
@@ -4085,15 +4067,15 @@ void bssn_class::Step(int lev, int YN)
{ {
#if (AGM == 0) #if (AGM == 0)
f_enforce_ga(cg->shape, f_enforce_ga(cg->shape,
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif #endif
if (f_compute_rhs_bssn_ss(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], if (f_compute_rhs_bssn_ss(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[fngfs + ShellPatch::gx], cg->fgfs[fngfs + ShellPatch::gx],
cg->fgfs[fngfs + ShellPatch::gy], cg->fgfs[fngfs + ShellPatch::gy],
cg->fgfs[fngfs + ShellPatch::gz], cg->fgfs[fngfs + ShellPatch::gz],
cg->fgfs[fngfs + ShellPatch::drhodx], cg->fgfs[fngfs + ShellPatch::drhodx],
cg->fgfs[fngfs + ShellPatch::drhody], cg->fgfs[fngfs + ShellPatch::drhody],
@@ -4208,16 +4190,25 @@ void bssn_class::Step(int lev, int YN)
} }
#endif #endif
} }
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = "
<< PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
#endif #endif
Parallel::AsyncSyncState async_pre; Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -4230,27 +4221,9 @@ void bssn_class::Step(int lev, int YN)
{ {
prev_clock = curr_clock; prev_clock = curr_clock;
curr_clock = clock(); curr_clock = clock();
cout << " Shell stuff synchronization used " cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
}
}
#endif
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
} }
} }
#endif #endif
@@ -4320,7 +4293,7 @@ void bssn_class::Step(int lev, int YN)
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#endif #endif
if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn], cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -4413,7 +4386,23 @@ void bssn_class::Step(int lev, int YN)
Pp = Pp->next; Pp = Pp->next;
} }
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell #ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
@@ -4431,9 +4420,9 @@ void bssn_class::Step(int lev, int YN)
{ {
#if (AGM == 0) #if (AGM == 0)
f_enforce_ga(cg->shape, f_enforce_ga(cg->shape,
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#elif (AGM == 1) #elif (AGM == 1)
if (iter_count == 3) if (iter_count == 3)
@@ -4553,16 +4542,25 @@ void bssn_class::Step(int lev, int YN)
sPp = sPp->next; sPp = sPp->next;
} }
} }
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req_cor;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
#endif #endif
Parallel::AsyncSyncState async_cor; Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -4575,30 +4573,11 @@ void bssn_class::Step(int lev, int YN)
{ {
prev_clock = curr_clock; prev_clock = curr_clock;
curr_clock = clock(); curr_clock = clock();
cout << " Shell stuff synchronization used " cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
} }
} }
#endif
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif #endif
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
@@ -4856,7 +4835,7 @@ void bssn_class::Step(int lev, int YN)
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif #endif
if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -4964,19 +4943,11 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation");
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
} }
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR) if (ERROR)
{ {
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
@@ -4988,6 +4959,10 @@ void bssn_class::Step(int lev, int YN)
} }
} }
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
@@ -5056,7 +5031,7 @@ void bssn_class::Step(int lev, int YN)
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#endif #endif
if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn], cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -5165,34 +5140,30 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check");
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req_cor;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req_cor); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
} }
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR) if (ERROR)
{ {
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
if (ErrorMonitor->outfile) if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << " variables at t = " << PhysTime
<< ", lev = " << lev << endl; << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
} }
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
@@ -5476,11 +5447,21 @@ void bssn_class::SHStep()
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor's error check"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor's error check");
#endif #endif
// Non-blocking error reduction overlapped with Synch to hide Allreduce latency // check error information
MPI_Request err_req;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
{ {
@@ -5492,25 +5473,12 @@ void bssn_class::SHStep()
{ {
prev_clock = curr_clock; prev_clock = curr_clock;
curr_clock = clock(); curr_clock = clock();
cout << " Shell stuff synchronization used " cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
} }
} }
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
// corrector // corrector
for (iter_count = 1; iter_count < 4; iter_count++) for (iter_count = 1; iter_count < 4; iter_count++)
{ {
@@ -5653,11 +5621,21 @@ void bssn_class::SHStep()
sPp = sPp->next; sPp = sPp->next;
} }
} }
// Non-blocking error reduction overlapped with Synch to hide Allreduce latency // check error information
MPI_Request err_req_cor;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
{ {
@@ -5669,26 +5647,12 @@ void bssn_class::SHStep()
{ {
prev_clock = curr_clock; prev_clock = curr_clock;
curr_clock = clock(); curr_clock = clock();
cout << " Shell stuff synchronization used " cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
} }
} }
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
sPp = SH->PatL; sPp = SH->PatL;
while (sPp) while (sPp)
{ {
@@ -5817,7 +5781,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// a_stream.clear(); // a_stream.clear();
@@ -5878,7 +5842,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// a_stream.clear(); // a_stream.clear();
@@ -5916,7 +5880,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
} }
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync(GH->PatL[lev], SL, Symmetry);
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// a_stream.clear(); // a_stream.clear();
@@ -5974,7 +5938,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1]; Ppc = GH->PatL[lev - 1];
@@ -6006,7 +5970,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1]; Ppc = GH->PatL[lev - 1];
@@ -6030,7 +5994,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
#endif #endif
} }
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync(GH->PatL[lev], SL, Symmetry);
} }
} }
@@ -6081,7 +6045,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1]; Ppc = GH->PatL[lev - 1];
@@ -6115,7 +6079,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1]; Ppc = GH->PatL[lev - 1];
@@ -6139,7 +6103,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#endif #endif
} }
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
} }
} }
@@ -6222,10 +6186,10 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
#else #else
Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
} }
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
} }
} }
#undef MIXOUTB #undef MIXOUTB
@@ -7351,7 +7315,7 @@ void bssn_class::Constraint_Out()
Block *cg = BP->data; Block *cg = BP->data;
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -7854,7 +7818,7 @@ void bssn_class::Interp_Constraint(bool infg)
Block *cg = BP->data; Block *cg = BP->data;
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -8112,7 +8076,7 @@ void bssn_class::Compute_Constraint()
Block *cg = BP->data; Block *cg = BP->data;
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],

View File

@@ -126,11 +126,6 @@ public:
MyList<var> *OldStateList, *DumpList; MyList<var> *OldStateList, *DumpList;
MyList<var> *ConstraintList; MyList<var> *ConstraintList;
Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor; monitor *ConVMonitor;
surface_integral *Waveshell; surface_integral *Waveshell;

View File

@@ -106,6 +106,36 @@
call getpbh(BHN,Porg,Mass) call getpbh(BHN,Porg,Mass)
#endif #endif
!!! sanity check
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
+sum(Gamx)+sum(Gamy)+sum(Gamz) &
+sum(Lap)+sum(betax)+sum(betay)+sum(betaz)
if(dX.ne.dX) then
if(sum(chi).ne.sum(chi))write(*,*)"bssn.f90: find NaN in chi"
if(sum(trK).ne.sum(trK))write(*,*)"bssn.f90: find NaN in trk"
if(sum(dxx).ne.sum(dxx))write(*,*)"bssn.f90: find NaN in dxx"
if(sum(gxy).ne.sum(gxy))write(*,*)"bssn.f90: find NaN in gxy"
if(sum(gxz).ne.sum(gxz))write(*,*)"bssn.f90: find NaN in gxz"
if(sum(dyy).ne.sum(dyy))write(*,*)"bssn.f90: find NaN in dyy"
if(sum(gyz).ne.sum(gyz))write(*,*)"bssn.f90: find NaN in gyz"
if(sum(dzz).ne.sum(dzz))write(*,*)"bssn.f90: find NaN in dzz"
if(sum(Axx).ne.sum(Axx))write(*,*)"bssn.f90: find NaN in Axx"
if(sum(Axy).ne.sum(Axy))write(*,*)"bssn.f90: find NaN in Axy"
if(sum(Axz).ne.sum(Axz))write(*,*)"bssn.f90: find NaN in Axz"
if(sum(Ayy).ne.sum(Ayy))write(*,*)"bssn.f90: find NaN in Ayy"
if(sum(Ayz).ne.sum(Ayz))write(*,*)"bssn.f90: find NaN in Ayz"
if(sum(Azz).ne.sum(Azz))write(*,*)"bssn.f90: find NaN in Azz"
if(sum(Gamx).ne.sum(Gamx))write(*,*)"bssn.f90: find NaN in Gamx"
if(sum(Gamy).ne.sum(Gamy))write(*,*)"bssn.f90: find NaN in Gamy"
if(sum(Gamz).ne.sum(Gamz))write(*,*)"bssn.f90: find NaN in Gamz"
if(sum(Lap).ne.sum(Lap))write(*,*)"bssn.f90: find NaN in Lap"
if(sum(betax).ne.sum(betax))write(*,*)"bssn.f90: find NaN in betax"
if(sum(betay).ne.sum(betay))write(*,*)"bssn.f90: find NaN in betay"
if(sum(betaz).ne.sum(betaz))write(*,*)"bssn.f90: find NaN in betaz"
gont = 1
return
endif
PI = dacos(-ONE) PI = dacos(-ONE)
@@ -602,7 +632,7 @@
gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1 gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1 gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1 gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1
! now get physical second kind of connection
Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF
Gamyxx = Gamyxx - ( - gxx * gxxy )*HALF Gamyxx = Gamyxx - ( - gxx * gxxy )*HALF
Gamzxx = Gamzxx - ( - gxx * gxxz )*HALF Gamzxx = Gamzxx - ( - gxx * gxxz )*HALF

File diff suppressed because it is too large Load Diff

View File

@@ -18,61 +18,49 @@
real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz
!~~~~~~~> Local variable: !~~~~~~~> Local variable:
integer :: i,j,k real*8, dimension(ex(1),ex(2),ex(3)) :: trA,detg
real*8 :: lgxx,lgyy,lgzz,ldetg real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz
real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
real*8 :: ltrA,lscale
real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
!~~~~~~> !~~~~~~>
do k=1,ex(3) gxx = dxx + ONE
do j=1,ex(2) gyy = dyy + ONE
do i=1,ex(1) gzz = dzz + ONE
lgxx = dxx(i,j,k) + ONE detg = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
lgyy = dyy(i,j,k) + ONE gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
lgzz = dzz(i,j,k) + ONE gupxx = ( gyy * gzz - gyz * gyz ) / detg
gupxy = - ( gxy * gzz - gyz * gxz ) / detg
gupxz = ( gxy * gyz - gyy * gxz ) / detg
gupyy = ( gxx * gzz - gxz * gxz ) / detg
gupyz = - ( gxx * gyz - gxy * gxz ) / detg
gupzz = ( gxx * gyy - gxy * gxy ) / detg
ldetg = lgxx * lgyy * lgzz & trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz &
+ gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) & + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
+ gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
- gxz(i,j,k) * lgyy * gxz(i,j,k) &
- gxy(i,j,k) * gxy(i,j,k) * lgzz &
- lgxx * gyz(i,j,k) * gyz(i,j,k)
lgupxx = ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg Axx = Axx - F1o3 * gxx * trA
lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg Axy = Axy - F1o3 * gxy * trA
lgupxz = ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg Axz = Axz - F1o3 * gxz * trA
lgupyy = ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg Ayy = Ayy - F1o3 * gyy * trA
lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg Ayz = Ayz - F1o3 * gyz * trA
lgupzz = ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg Azz = Azz - F1o3 * gzz * trA
ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) & detg = ONE / ( detg ** F1o3 )
+ lgupzz * Azz(i,j,k) &
+ TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) & gxx = gxx * detg
+ lgupyz * Ayz(i,j,k)) gxy = gxy * detg
gxz = gxz * detg
gyy = gyy * detg
gyz = gyz * detg
gzz = gzz * detg
Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA dxx = gxx - ONE
Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA dyy = gyy - ONE
Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA dzz = gzz - ONE
Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA
Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
lscale = ONE / ( ldetg ** F1o3 )
dxx(i,j,k) = lgxx * lscale - ONE
gxy(i,j,k) = gxy(i,j,k) * lscale
gxz(i,j,k) = gxz(i,j,k) * lscale
dyy(i,j,k) = lgyy * lscale - ONE
gyz(i,j,k) = gyz(i,j,k) * lscale
dzz(i,j,k) = lgzz * lscale - ONE
enddo
enddo
enddo
return return
@@ -94,71 +82,51 @@
real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz
!~~~~~~~> Local variable: !~~~~~~~> Local variable:
integer :: i,j,k real*8, dimension(ex(1),ex(2),ex(3)) :: trA
real*8 :: lgxx,lgyy,lgzz,lscale real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz
real*8 :: lgxy,lgxz,lgyz real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
real*8 :: ltrA
real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0 real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
!~~~~~~> !~~~~~~>
do k=1,ex(3) gxx = dxx + ONE
do j=1,ex(2) gyy = dyy + ONE
do i=1,ex(1) gzz = dzz + ONE
! for g
gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
! for g: normalize determinant first gupzz = ONE / ( gupzz ** F1o3 )
lgxx = dxx(i,j,k) + ONE
lgyy = dyy(i,j,k) + ONE gxx = gxx * gupzz
lgzz = dzz(i,j,k) + ONE gxy = gxy * gupzz
lgxy = gxy(i,j,k) gxz = gxz * gupzz
lgxz = gxz(i,j,k) gyy = gyy * gupzz
lgyz = gyz(i,j,k) gyz = gyz * gupzz
gzz = gzz * gupzz
lscale = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz & dxx = gxx - ONE
+ lgxz * lgxy * lgyz - lgxz * lgyy * lgxz & dyy = gyy - ONE
- lgxy * lgxy * lgzz - lgxx * lgyz * lgyz dzz = gzz - ONE
! for A
lscale = ONE / ( lscale ** F1o3 ) gupxx = ( gyy * gzz - gyz * gyz )
gupxy = - ( gxy * gzz - gyz * gxz )
gupxz = ( gxy * gyz - gyy * gxz )
gupyy = ( gxx * gzz - gxz * gxz )
gupyz = - ( gxx * gyz - gxy * gxz )
gupzz = ( gxx * gyy - gxy * gxy )
lgxx = lgxx * lscale trA = gupxx * Axx + gupyy * Ayy + gupzz * Azz &
lgxy = lgxy * lscale + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
lgxz = lgxz * lscale
lgyy = lgyy * lscale
lgyz = lgyz * lscale
lgzz = lgzz * lscale
dxx(i,j,k) = lgxx - ONE Axx = Axx - F1o3 * gxx * trA
gxy(i,j,k) = lgxy Axy = Axy - F1o3 * gxy * trA
gxz(i,j,k) = lgxz Axz = Axz - F1o3 * gxz * trA
dyy(i,j,k) = lgyy - ONE Ayy = Ayy - F1o3 * gyy * trA
gyz(i,j,k) = lgyz Ayz = Ayz - F1o3 * gyz * trA
dzz(i,j,k) = lgzz - ONE Azz = Azz - F1o3 * gzz * trA
! for A: trace-free using normalized metric (det=1, no division needed)
lgupxx = ( lgyy * lgzz - lgyz * lgyz )
lgupxy = - ( lgxy * lgzz - lgyz * lgxz )
lgupxz = ( lgxy * lgyz - lgyy * lgxz )
lgupyy = ( lgxx * lgzz - lgxz * lgxz )
lgupyz = - ( lgxx * lgyz - lgxy * lgxz )
lgupzz = ( lgxx * lgyy - lgxy * lgxy )
ltrA = lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
+ lgupzz * Azz(i,j,k) &
+ TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
+ lgupyz * Ayz(i,j,k))
Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
enddo
enddo
enddo
return return

View File

@@ -1,26 +0,0 @@
#include "xh_macrodef.h"
#include "xh_tool.h"
int f_compute_rhs_bssn(int *ex, double &T,
double *X, double *Y, double *Z,
double *chi, double *trK,
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
double *Gamx, double *Gamy, double *Gamz,
double *Lap, double *betax, double *betay, double *betaz,
double *dtSfx, double *dtSfy, double *dtSfz,
double *chi_rhs, double *trK_rhs,
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
double *rho, double *Sx, double *Sy, double *Sz,
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co
);

View File

@@ -1,66 +0,0 @@
/* tetrad notes
v:r; u: phi; w: theta
tetradtype 0
v^a = (x,y,z)
orthonormal order: v,u,w
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
tetradtype 1
orthonormal order: w,u,v
m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of PRD 85, 124062(2012)
tetradtype 2
v_a = (x,y,z)
orthonormal order: v,u,w
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
*/
#define tetradtype 2
/* Cell center or Vertex center */
#define Cell
/* ghost_width meaning:
2nd order: 2
4th order: 3
6th order: 4
8th order: 5
*/
#define ghost_width 3
/* use shell or not */
#define WithShell
/* use constraint preserving boundary condition or not
only affect Z4c
*/
#define CPBC
/* Gauge condition type
0: B^i gauge
1: David's puncture gauge
2: MB B^i gauge
3: RIT B^i gauge
4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
6: MGB1 B^i gauge
7: MGB2 B^i gauge
*/
#define GAUGE 2
/* buffer points for CPBC boundary */
#define CPBC_ghost_width (ghost_width)
/* using BSSN variable for constraint violation and psi4 calculation: 0
using ADM variable for constraint violation and psi4 calculation: 1
*/
#define ABV 0
/* Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
1: Case C of 1112.3928, V=0
2: shell with a2^2*phi0/(1+a2^2), f(R) = R+a2*R^2 induced V
3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
4: a2 = infinity and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
5: shell with phi(r) = phi0*Exp(-(r-r0)**2/sigma), V = 0
*/
#define EScalar_CC 2

View File

@@ -1,338 +0,0 @@
#ifndef SHARE_FUNC_H
#define SHARE_FUNC_H
#include <stdlib.h>
#include <stddef.h>
#include <math.h>
#include <stdio.h>
#include <omp.h>
/* 主网格0-based -> 1D */
static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
const int ex1 = ex[0], ex2 = ex[1];
return (size_t)i0 + (size_t)j0 * (size_t)ex1 + (size_t)k0 * (size_t)ex1 * (size_t)ex2;
}
/*
* fh 对应 Fortran: fh(-1:ex1, -1:ex2, -1:ex3)
* ord=2 => shift=1
* iF/jF/kF 为 Fortran 索引(可为 -1,0,1..ex
*/
static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
const int shift = 1;
const int nx = ex[0] + 2; // ex1 + ord
const int ny = ex[1] + 2;
const int ii = iF + shift; // 0..ex1+1
const int jj = jF + shift; // 0..ex2+1
const int kk = kF + shift; // 0..ex3+1
return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
}
/*
* fh 对应 Fortran: fh(-2:ex1, -2:ex2, -2:ex3)
* ord=3 => shift=2
* iF/jF/kF 是 Fortran 索引(可为负)
*/
static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
const int shift = 2; // ord=3 -> -2..ex
const int nx = ex[0] + 3; // ex1 + ord
const int ny = ex[1] + 3;
const int ii = iF + shift; // 0..ex1+2
const int jj = jF + shift; // 0..ex2+2
const int kk = kF + shift; // 0..ex3+2
return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
}
/*
* func: (1..extc1, 1..extc2, 1..extc3) 1-based in Fortran
* funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran
*
* C 里我们把:
* func 视为 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1
* funcc 用“平移下标”存为一维数组:
* iF in [-ord+1..extc1] -> ii = iF + (ord-1) in [0..extc1+ord-1]
* 总长度 nx = extc1 + ord
* 同理 ny = extc2 + ord, nz = extc3 + ord
*/
static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
const int nx = extc[0], ny = extc[1];
return (size_t)i0 + (size_t)j0 * (size_t)nx + (size_t)k0 * (size_t)nx * (size_t)ny;
}
static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
const int shift = ord - 1; // iF = -shift .. extc1
const int nx = extc[0] + ord; // [-shift..extc1] 共 extc1+ord 个
const int ny = extc[1] + ord;
const int ii = iF + shift; // 0..extc1+shift
const int jj = jF + shift; // 0..extc2+shift
const int kk = kF + shift; // 0..extc3+shift
return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
}
/*
* 等价于 Fortran:
* funcc(1:extc1,1:extc2,1:extc3)=func
* do i=0,ord-1
* funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
* enddo
* do i=0,ord-1
* funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
* enddo
* do i=0,ord-1
* funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
* enddo
*/
static inline void symmetry_bd(int ord,
const int extc[3],
const double *func,
double *funcc,
const double SoA[3])
{
const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
// 1) funcc(1:extc1,1:extc2,1:extc3) = func
// Fortran 的 (iF=1..extc1) 对应 C 的 func(i0=0..extc1-1)
for (int k0 = 0; k0 < extc3; ++k0) {
for (int j0 = 0; j0 < extc2; ++j0) {
for (int i0 = 0; i0 < extc1; ++i0) {
const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
funcc[idx_funcc_F(iF, jF, kF, ord, extc)] = func[idx_func0(i0, j0, k0, extc)];
}
}
}
// 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
for (int ii = 0; ii <= ord - 1; ++ii) {
const int iF_dst = -ii; // 0, -1, -2, ...
const int iF_src = ii + 1; // 1, 2, 3, ...
for (int kF = 1; kF <= extc3; ++kF) {
for (int jF = 1; jF <= extc2; ++jF) {
funcc[idx_funcc_F(iF_dst, jF, kF, ord, extc)] =
funcc[idx_funcc_F(iF_src, jF, kF, ord, extc)] * SoA[0];
}
}
}
// 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
// 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖
for (int jj = 0; jj <= ord - 1; ++jj) {
const int jF_dst = -jj;
const int jF_src = jj + 1;
for (int kF = 1; kF <= extc3; ++kF) {
for (int iF = -ord + 1; iF <= extc1; ++iF) {
funcc[idx_funcc_F(iF, jF_dst, kF, ord, extc)] =
funcc[idx_funcc_F(iF, jF_src, kF, ord, extc)] * SoA[1];
}
}
}
// 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
for (int kk = 0; kk <= ord - 1; ++kk) {
const int kF_dst = -kk;
const int kF_src = kk + 1;
for (int jF = -ord + 1; jF <= extc2; ++jF) {
for (int iF = -ord + 1; iF <= extc1; ++iF) {
funcc[idx_funcc_F(iF, jF, kF_dst, ord, extc)] =
funcc[idx_funcc_F(iF, jF, kF_src, ord, extc)] * SoA[2];
}
}
}
}
#endif
/* 你已有的函数idx_ex / idx_fh_F_ord2 以及 fh 的布局 */
static inline void fdderivs_xh(
int i0, int j0, int k0,
const int ex[3],
const double *fh,
int iminF, int jminF, int kminF,
int imaxF, int jmaxF, int kmaxF,
double Fdxdx, double Fdydy, double Fdzdz,
double Fdxdy, double Fdxdz, double Fdydz,
double Sdxdx, double Sdydy, double Sdzdz,
double Sdxdy, double Sdxdz, double Sdydz,
double *fxx, double *fxy, double *fxz,
double *fyy, double *fyz, double *fzz
){
const double F8 = 8.0;
const double F16 = 16.0;
const double F30 = 30.0;
const double TWO = 2.0;
const int iF = i0 + 1;
const int jF = j0 + 1;
const int kF = k0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
/* 高阶分支i±2,j±2,k±2 都在范围内 */
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
{
fxx[p] = Fdxdx * (
-fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);
fyy[p] = Fdydy * (
-fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);
fzz[p] = Fdzdz * (
-fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);
/* fxy 高阶 */
{
const double t_jm2 =
( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
const double t_jm1 =
( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
const double t_jp1 =
( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
const double t_jp2 =
( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
}
/* fxz 高阶 */
{
const double t_km2 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
const double t_km1 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
const double t_kp1 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
const double t_kp2 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}
/* fyz 高阶 */
{
const double t_km2 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
const double t_km1 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
const double t_kp1 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
const double t_kp2 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}
}
/* 二阶分支i±1,j±1,k±1 在范围内 */
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
{
fxx[p] = Sdxdx * (
fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);
fyy[p] = Sdydy * (
fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);
fzz[p] = Sdzdz * (
fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);
fxy[p] = Sdxdy * (
fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
);
fxz[p] = Sdxdz * (
fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
);
fyz[p] = Sdydz * (
fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
);
}
else {
fxx[p] = 0.0; fyy[p] = 0.0; fzz[p] = 0.0;
fxy[p] = 0.0; fxz[p] = 0.0; fyz[p] = 0.0;
}
}

View File

@@ -1,27 +0,0 @@
#include "xh_share_func.h"
void fdderivs(const int ex[3],
const double *f,
double *fxx, double *fxy, double *fxz,
double *fyy, double *fyz, double *fzz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff);
void fderivs(const int ex[3],
const double *f,
double *fx, double *fy, double *fz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff);
void kodis(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double SoA[3],
int Symmetry, double eps);
void lopsided(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double *Sfx, const double *Sfy, const double *Sfz,
int Symmetry, const double SoA[3]);

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,311 +0,0 @@
#include "../include/tool.h"
void fdderivs(const int ex[3],
const double *f,
double *fxx, double *fxy, double *fxz,
double *fyy, double *fyz, double *fzz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff)
{
(void)onoff;
const int NO_SYMM = 0, EQ_SYMM = 1;
const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
const double F1o4 = 2.5e-1; // 1/4
const double F8 = 8.0;
const double F16 = 16.0;
const double F30 = 30.0;
const double F1o12 = ONE / 12.0;
const double F1o144 = ONE / 144.0;
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
const int imaxF = ex1;
const int jmaxF = ex2;
const int kmaxF = ex3;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
/* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
const size_t nx = (size_t)ex1 + 2;
const size_t ny = (size_t)ex2 + 2;
const size_t nz = (size_t)ex3 + 2;
const size_t fh_size = nx * ny * nz;
/* 系数:按 Fortran 原式 */
const double Sdxdx = ONE / (dX * dX);
const double Sdydy = ONE / (dY * dY);
const double Sdzdz = ONE / (dZ * dZ);
const double Fdxdx = F1o12 / (dX * dX);
const double Fdydy = F1o12 / (dY * dY);
const double Fdzdz = F1o12 / (dZ * dZ);
const double Sdxdy = F1o4 / (dX * dY);
const double Sdxdz = F1o4 / (dX * dZ);
const double Sdydz = F1o4 / (dY * dZ);
const double Fdxdy = F1o144 / (dX * dY);
const double Fdxdz = F1o144 / (dX * dZ);
const double Fdydz = F1o144 / (dY * dZ);
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
if (fh_size > cap) {
free(fh);
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
// double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
// symmetry_bd(2, ex, f, fh, SoA);
const double SoA[3] = { SYM1, SYM2, SYM3 };
for (int k0 = 0; k0 < ex[2]; ++k0) {
for (int j0 = 0; j0 < ex[1]; ++j0) {
for (int i0 = 0; i0 < ex[0]; ++i0) {
const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
fh[idx_funcc_F(iF, jF, kF, 2, ex)] = f[idx_func0(i0, j0, k0, ex)];
}
}
}
// 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
for (int ii = 0; ii <= 2 - 1; ++ii) {
const int iF_dst = -ii; // 0, -1, -2, ...
const int iF_src = ii + 1; // 1, 2, 3, ...
for (int kF = 1; kF <= ex[2]; ++kF) {
for (int jF = 1; jF <= ex[1]; ++jF) {
fh[idx_funcc_F(iF_dst, jF, kF, 2, ex)] =
fh[idx_funcc_F(iF_src, jF, kF, 2, ex)] * SoA[0];
}
}
}
// 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
// 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖
for (int jj = 0; jj <= 2 - 1; ++jj) {
const int jF_dst = -jj;
const int jF_src = jj + 1;
for (int kF = 1; kF <= ex[2]; ++kF) {
for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
fh[idx_funcc_F(iF, jF_dst, kF, 2, ex)] =
fh[idx_funcc_F(iF, jF_src, kF, 2, ex)] * SoA[1];
}
}
}
// 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
for (int kk = 0; kk <= 2 - 1; ++kk) {
const int kF_dst = -kk;
const int kF_src = kk + 1;
for (int jF = -2 + 1; jF <= ex[1]; ++jF) {
for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
fh[idx_funcc_F(iF, jF, kF_dst, 2, ex)] =
fh[idx_funcc_F(iF, jF, kF_src, 2, ex)] * SoA[2];
}
}
}
/* 输出清零fxx,fyy,fzz,fxy,fxz,fyz = 0 */
// const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
// for (size_t p = 0; p < all; ++p) {
// fxx[p] = ZEO; fyy[p] = ZEO; fzz[p] = ZEO;
// fxy[p] = ZEO; fxz[p] = ZEO; fyz[p] = ZEO;
// }
/*
* Fortran:
* do k=1,ex3-1
* do j=1,ex2-1
* do i=1,ex1-1
*/
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
/* 高阶分支i±2,j±2,k±2 都在范围内 */
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
{
fxx[p] = Fdxdx * (
-fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);
fyy[p] = Fdydy * (
-fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);
fzz[p] = Fdzdz * (
-fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);
/* fxy 高阶:完全照搬 Fortran 的括号结构 */
{
const double t_jm2 =
( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
const double t_jm1 =
( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
const double t_jp1 =
( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
const double t_jp2 =
( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
}
/* fxz 高阶 */
{
const double t_km2 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
const double t_km1 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
const double t_kp1 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
const double t_kp2 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}
/* fyz 高阶 */
{
const double t_km2 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
const double t_km1 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
const double t_kp1 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
const double t_kp2 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}
}
/* 二阶分支i±1,j±1,k±1 在范围内 */
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
{
fxx[p] = Sdxdx * (
fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);
fyy[p] = Sdydy * (
fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);
fzz[p] = Sdzdz * (
fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);
fxy[p] = Sdxdy * (
fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
);
fxz[p] = Sdxdz * (
fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
);
fyz[p] = Sdydz * (
fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
);
}else{
fxx[p] = 0.0;
fyy[p] = 0.0;
fzz[p] = 0.0;
fxy[p] = 0.0;
fxz[p] = 0.0;
fyz[p] = 0.0;
}
}
}
}
// free(fh);
}

View File

@@ -1,7 +0,0 @@
#include "include/bssn_rhs_compute.h"
int main() {
// 这里可以写一些测试代码,调用 f_compute_rhs_bssn 来验证它的正确性
// 例如,定义一些小的网格和初始条件,调用函数,并检查输出是否合理。
return 0;
}

View File

@@ -1,65 +0,0 @@
SoA[0] = SYM, SoA[1] = SYM, SoA[2] = SYM;
#pragma omp for collapse(3)
for (int k0 = 0; k0 < ex[2]; ++k0) {
for (int j0 = 0; j0 < ex[1]; ++j0) {
for (int i0 = 0; i0 < ex[0]; ++i0) {
const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
fh[idx_funcc_F(iF, jF, kF, 2, ex)] = Lap[idx_func0(i0, j0, k0, ex)];
}
}
}
// 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
#pragma omp for collapse(3)
for (int ii = 0; ii <= 2 - 1; ++ii) {
const int iF_dst = -ii; // 0, -1, -2, ...
const int iF_src = ii + 1; // 1, 2, 3, ...
for (int kF = 1; kF <= ex[2]; ++kF) {
for (int jF = 1; jF <= ex[1]; ++jF) {
fh[idx_funcc_F(iF_dst, jF, kF, 2, ex)] =
fh[idx_funcc_F(iF_src, jF, kF, 2, ex)] * SoA[0];
}
}
}
// 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
// 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖
#pragma omp for collapse(3)
for (int jj = 0; jj <= 2 - 1; ++jj) {
const int jF_dst = -jj;
const int jF_src = jj + 1;
for (int kF = 1; kF <= ex[2]; ++kF) {
for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
fh[idx_funcc_F(iF, jF_dst, kF, 2, ex)] =
fh[idx_funcc_F(iF, jF_src, kF, 2, ex)] * SoA[1];
}
}
}
// 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
#pragma omp for collapse(3)
for (int kk = 0; kk <= 2 - 1; ++kk) {
const int kF_dst = -kk;
const int kF_src = kk + 1;
for (int jF = -2 + 1; jF <= ex[1]; ++jF) {
for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
fh[idx_funcc_F(iF, jF, kF_dst, 2, ex)] =
fh[idx_funcc_F(iF, jF, kF_src, 2, ex)] * SoA[2];
}
}
}
#pragma omp for collapse(3)
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
fdderivs_xh(i0, j0, k0, ex, fh, iminF, jminF, kminF, ex1, ex2, ex3,
Fdxdx, Fdydy, Fdzdz, Fdxdy, Fdxdz, Fdydz,
Sdxdx, Sdydy, Sdzdz, Sdxdy, Sdxdz, Sdydz,
fxx,fxy,fxz,fyy,fyz,fzz
);
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,311 +0,0 @@
#include "xh_tool.h"
void fdderivs(const int ex[3],
const double *f,
double *fxx, double *fxy, double *fxz,
double *fyy, double *fyz, double *fzz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff)
{
(void)onoff;
const int NO_SYMM = 0, EQ_SYMM = 1;
const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
const double F1o4 = 2.5e-1; // 1/4
const double F8 = 8.0;
const double F16 = 16.0;
const double F30 = 30.0;
const double F1o12 = ONE / 12.0;
const double F1o144 = ONE / 144.0;
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
const int imaxF = ex1;
const int jmaxF = ex2;
const int kmaxF = ex3;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
/* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
const size_t nx = (size_t)ex1 + 2;
const size_t ny = (size_t)ex2 + 2;
const size_t nz = (size_t)ex3 + 2;
const size_t fh_size = nx * ny * nz;
/* 系数:按 Fortran 原式 */
const double Sdxdx = ONE / (dX * dX);
const double Sdydy = ONE / (dY * dY);
const double Sdzdz = ONE / (dZ * dZ);
const double Fdxdx = F1o12 / (dX * dX);
const double Fdydy = F1o12 / (dY * dY);
const double Fdzdz = F1o12 / (dZ * dZ);
const double Sdxdy = F1o4 / (dX * dY);
const double Sdxdz = F1o4 / (dX * dZ);
const double Sdydz = F1o4 / (dY * dZ);
const double Fdxdy = F1o144 / (dX * dY);
const double Fdxdz = F1o144 / (dX * dZ);
const double Fdydz = F1o144 / (dY * dZ);
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
if (fh_size > cap) {
free(fh);
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
// double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
// symmetry_bd(2, ex, f, fh, SoA);
const double SoA[3] = { SYM1, SYM2, SYM3 };
for (int k0 = 0; k0 < ex[2]; ++k0) {
for (int j0 = 0; j0 < ex[1]; ++j0) {
for (int i0 = 0; i0 < ex[0]; ++i0) {
const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
fh[idx_funcc_F(iF, jF, kF, 2, ex)] = f[idx_func0(i0, j0, k0, ex)];
}
}
}
// 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
for (int ii = 0; ii <= 2 - 1; ++ii) {
const int iF_dst = -ii; // 0, -1, -2, ...
const int iF_src = ii + 1; // 1, 2, 3, ...
for (int kF = 1; kF <= ex[2]; ++kF) {
for (int jF = 1; jF <= ex[1]; ++jF) {
fh[idx_funcc_F(iF_dst, jF, kF, 2, ex)] =
fh[idx_funcc_F(iF_src, jF, kF, 2, ex)] * SoA[0];
}
}
}
// 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
// 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖
for (int jj = 0; jj <= 2 - 1; ++jj) {
const int jF_dst = -jj;
const int jF_src = jj + 1;
for (int kF = 1; kF <= ex[2]; ++kF) {
for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
fh[idx_funcc_F(iF, jF_dst, kF, 2, ex)] =
fh[idx_funcc_F(iF, jF_src, kF, 2, ex)] * SoA[1];
}
}
}
// 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
for (int kk = 0; kk <= 2 - 1; ++kk) {
const int kF_dst = -kk;
const int kF_src = kk + 1;
for (int jF = -2 + 1; jF <= ex[1]; ++jF) {
for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
fh[idx_funcc_F(iF, jF, kF_dst, 2, ex)] =
fh[idx_funcc_F(iF, jF, kF_src, 2, ex)] * SoA[2];
}
}
}
/* 输出清零fxx,fyy,fzz,fxy,fxz,fyz = 0 */
// const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
// for (size_t p = 0; p < all; ++p) {
// fxx[p] = ZEO; fyy[p] = ZEO; fzz[p] = ZEO;
// fxy[p] = ZEO; fxz[p] = ZEO; fyz[p] = ZEO;
// }
/*
* Fortran:
* do k=1,ex3-1
* do j=1,ex2-1
* do i=1,ex1-1
*/
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
/* 高阶分支i±2,j±2,k±2 都在范围内 */
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
{
fxx[p] = Fdxdx * (
-fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);
fyy[p] = Fdydy * (
-fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);
fzz[p] = Fdzdz * (
-fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);
/* fxy 高阶:完全照搬 Fortran 的括号结构 */
{
const double t_jm2 =
( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
const double t_jm1 =
( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
const double t_jp1 =
( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
const double t_jp2 =
( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
}
/* fxz 高阶 */
{
const double t_km2 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
const double t_km1 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
const double t_kp1 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
const double t_kp2 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}
/* fyz 高阶 */
{
const double t_km2 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
const double t_km1 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
const double t_kp1 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
const double t_kp2 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}
}
/* 二阶分支i±1,j±1,k±1 在范围内 */
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
{
fxx[p] = Sdxdx * (
fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);
fyy[p] = Sdydy * (
fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);
fzz[p] = Sdzdz * (
fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);
fxy[p] = Sdxdy * (
fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
);
fxz[p] = Sdxdz * (
fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
);
fyz[p] = Sdydz * (
fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
);
}else{
fxx[p] = 0.0;
fyy[p] = 0.0;
fzz[p] = 0.0;
fxy[p] = 0.0;
fxz[p] = 0.0;
fyz[p] = 0.0;
}
}
}
}
// free(fh);
}

View File

@@ -1,145 +0,0 @@
#include "xh_tool.h"
/*
* C 版 fderivs
*
* Fortran:
* subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
*
* 约定:
* f, fx, fy, fz: ex1*ex2*ex3按 idx_ex 布局
* X: ex1, Y: ex2, Z: ex3
*/
void fderivs(const int ex[3],
const double *f,
double *fx, double *fy, double *fz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff)
{
(void)onoff; // Fortran 里没用到
const double ZEO = 0.0, ONE = 1.0;
const double TWO = 2.0, EIT = 8.0;
const double F12 = 12.0;
const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 在本子程序里不直接用
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
// dX = X(2)-X(1) -> C: X[1]-X[0]
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
// SoA(1:3) = SYM1,SYM2,SYM3
const double SoA[3] = { SYM1, SYM2, SYM3 };
// fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2
const size_t nx = (size_t)ex1 + 2;
const size_t ny = (size_t)ex2 + 2;
const size_t nz = (size_t)ex3 + 2;
const size_t fh_size = nx * ny * nz;
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
if (fh_size > cap) {
free(fh);
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
// double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
// call symmetry_bd(2,ex,f,fh,SoA)
symmetry_bd(2, ex, f, fh, SoA);
const double d12dx = ONE / F12 / dX;
const double d12dy = ONE / F12 / dY;
const double d12dz = ONE / F12 / dZ;
const double d2dx = ONE / TWO / dX;
const double d2dy = ONE / TWO / dY;
const double d2dz = ONE / TWO / dZ;
// fx = fy = fz = 0
const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
for (size_t p = 0; p < all; ++p) {
fx[p] = ZEO;
fy[p] = ZEO;
fz[p] = ZEO;
}
/*
* Fortran loops:
* do k=1,ex3-1
* do j=1,ex2-1
* do i=1,ex1-1
*
* C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
*/
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
// if(i+2 <= imax .and. i-2 >= imin ... ) (全是 Fortran 索引)
if ((iF + 2) <= ex1 && (iF - 2) >= iminF &&
(jF + 2) <= ex2 && (jF - 2) >= jminF &&
(kF + 2) <= ex3 && (kF - 2) >= kminF)
{
fx[p] = d12dx * (
fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] -
EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
EIT * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)]
);
fy[p] = d12dy * (
fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] -
EIT * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
EIT * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)]
);
fz[p] = d12dz * (
fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] -
EIT * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
EIT * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] -
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)]
);
}
// elseif(i+1 <= imax .and. i-1 >= imin ...)
else if ((iF + 1) <= ex1 && (iF - 1) >= iminF &&
(jF + 1) <= ex2 && (jF - 1) >= jminF &&
(kF + 1) <= ex3 && (kF - 1) >= kminF)
{
fx[p] = d2dx * (
-fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);
fy[p] = d2dy * (
-fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);
fz[p] = d2dz * (
-fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);
}
}
}
}
// free(fh);
}

View File

@@ -1,116 +0,0 @@
#include "xh_tool.h"
/*
* C 版 kodis
*
* Fortran signature:
* subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps)
*
* 约定:
* X: ex1, Y: ex2, Z: ex3
* f, f_rhs: ex1*ex2*ex3 按 idx_ex 布局
* SoA[3]
* eps: double
*/
void kodis(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double SoA[3],
int Symmetry, double eps)
{
const double ONE = 1.0, SIX = 6.0, FIT = 15.0, TWT = 20.0;
const double cof = 64.0; // 2^6
const int NO_SYMM = 0, OCTANT = 2;
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
// Fortran: dX = X(2)-X(1) -> C: X[1]-X[0]
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
(void)ONE; // ONE 在原 Fortran 里只是参数,这里不一定用得上
// Fortran: imax=ex(1) 等是 1-based 上界
const int imaxF = ex1;
const int jmaxF = ex2;
const int kmaxF = ex3;
// Fortran: imin=jmin=kmin=1某些对称情况变 -2
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2;
if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2;
// 分配 fh大小 (ex1+3)*(ex2+3)*(ex3+3),对应 ord=3
const size_t nx = (size_t)ex1 + 3;
const size_t ny = (size_t)ex2 + 3;
const size_t nz = (size_t)ex3 + 3;
const size_t fh_size = nx * ny * nz;
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
if (fh_size > cap) {
free(fh);
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
if (!fh) return;
// Fortran: call symmetry_bd(3,ex,f,fh,SoA)
symmetry_bd(3, ex, f, fh, SoA);
/*
* Fortran loops:
* do k=1,ex3
* do j=1,ex2
* do i=1,ex1
*
* C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
* 并定义 Fortran index: iF=i0+1, ...
*/
for (int k0 = 0; k0 < ex3; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 < ex2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 < ex1; ++i0) {
const int iF = i0 + 1;
// Fortran if 条件:
// i-3 >= imin .and. i+3 <= imax 等(都是 Fortran 索引)
if ((iF - 3) >= iminF && (iF + 3) <= imaxF &&
(jF - 3) >= jminF && (jF + 3) <= jmaxF &&
(kF - 3) >= kminF && (kF + 3) <= kmaxF)
{
const size_t p = idx_ex(i0, j0, k0, ex);
// 三个方向各一份同型的 7 点组合(实际上是对称的 6th-order dissipation/filter 核)
const double Dx_term =
( (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
TWT * fh[idx_fh_F(iF , jF, kF, ex)] ) / dX;
const double Dy_term =
( (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
TWT * fh[idx_fh_F(iF, jF , kF, ex)] ) / dY;
const double Dz_term =
( (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
TWT * fh[idx_fh_F(iF, jF, kF , ex)] ) / dZ;
// Fortran:
// f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
}
}
}
}
// free(fh);
}

View File

@@ -1,262 +0,0 @@
#include "xh_tool.h"
/*
* 你需要提供 symmetry_bd 的 C 版本(或 Fortran 绑到 C 的接口)。
* Fortran: call symmetry_bd(3,ex,f,fh,SoA)
*
* 约定:
* nghost = 3
* ex[3] = {ex1,ex2,ex3}
* f = 原始网格 (ex1*ex2*ex3)
* fh = 扩展网格 ((ex1+3)*(ex2+3)*(ex3+3)),对应 Fortran 的 (-2:ex1, ...)
* SoA[3] = 输入参数
*/
void lopsided(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double *Sfx, const double *Sfy, const double *Sfz,
int Symmetry, const double SoA[3])
{
const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
const double TWO = 2.0, F6 = 6.0, F18 = 18.0;
const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
const int NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2;
(void)OCTANT; // 这里和 Fortran 一样只是定义了不用也没关系
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
// 对应 Fortran: dX = X(2)-X(1) Fortran 1-based
// C: X[1]-X[0]
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
const double d12dx = ONE / F12 / dX;
const double d12dy = ONE / F12 / dY;
const double d12dz = ONE / F12 / dZ;
// Fortran 里算了 d2dx/d2dy/d2dz 但本 subroutine 里没用到(保持一致也算出来)
const double d2dx = ONE / TWO / dX;
const double d2dy = ONE / TWO / dY;
const double d2dz = ONE / TWO / dZ;
(void)d2dx; (void)d2dy; (void)d2dz;
// Fortran:
// imax = ex(1); jmax = ex(2); kmax = ex(3)
const int imaxF = ex1;
const int jmaxF = ex2;
const int kmaxF = ex3;
// Fortran:
// imin=jmin=kmin=1; 若满足对称条件则设为 -2
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
// 分配 fh大小 (ex1+3)*(ex2+3)*(ex3+3)
const size_t nx = (size_t)ex1 + 3;
const size_t ny = (size_t)ex2 + 3;
const size_t nz = (size_t)ex3 + 3;
const size_t fh_size = nx * ny * nz;
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
if (fh_size > cap) {
free(fh);
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
if (!fh) return; // 内存不足:直接返回(你也可以改成 abort/报错)
// Fortran: call symmetry_bd(3,ex,f,fh,SoA)
symmetry_bd(3, ex, f, fh, SoA);
/*
* Fortran 主循环:
* do k=1,ex(3)-1
* do j=1,ex(2)-1
* do i=1,ex(1)-1
*
* 转成 C 0-based
* k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2
*
* 并且 Fortran 里的 i/j/k 在 fh 访问时,仍然是 Fortran 索引值:
* iF=i0+1, jF=j0+1, kF=k0+1
*/
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
// ---------------- x direction ----------------
const double sfx = Sfx[p];
if (sfx > ZEO) {
// Fortran: if(i+3 <= imax)
// iF+3 <= ex1 <=> i0+4 <= ex1 <=> i0 <= ex1-4
if (i0 <= ex1 - 4) {
f_rhs[p] += sfx * d12dx *
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
}
// elseif(i+2 <= imax) <=> i0 <= ex1-3
else if (i0 <= ex1 - 3) {
f_rhs[p] += sfx * d12dx *
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
}
// elseif(i+1 <= imax) <=> i0 <= ex1-2循环里总成立
else if (i0 <= ex1 - 2) {
f_rhs[p] -= sfx * d12dx *
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
}
} else if (sfx < ZEO) {
// Fortran: if(i-3 >= imin)
// (iF-3) >= iminF <=> (i0-2) >= iminF
if ((i0 - 2) >= iminF) {
f_rhs[p] -= sfx * d12dx *
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
}
// elseif(i-2 >= imin) <=> (i0-1) >= iminF
else if ((i0 - 1) >= iminF) {
f_rhs[p] += sfx * d12dx *
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
}
// elseif(i-1 >= imin) <=> i0 >= iminF
else if (i0 >= iminF) {
f_rhs[p] += sfx * d12dx *
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
}
}
// ---------------- y direction ----------------
const double sfy = Sfy[p];
if (sfy > ZEO) {
// jF+3 <= ex2 <=> j0+4 <= ex2 <=> j0 <= ex2-4
if (j0 <= ex2 - 4) {
f_rhs[p] += sfy * d12dy *
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
} else if (j0 <= ex2 - 3) {
f_rhs[p] += sfy * d12dy *
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
} else if (j0 <= ex2 - 2) {
f_rhs[p] -= sfy * d12dy *
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
}
} else if (sfy < ZEO) {
if ((j0 - 2) >= jminF) {
f_rhs[p] -= sfy * d12dy *
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
} else if ((j0 - 1) >= jminF) {
f_rhs[p] += sfy * d12dy *
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
} else if (j0 >= jminF) {
f_rhs[p] += sfy * d12dy *
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
}
}
// ---------------- z direction ----------------
const double sfz = Sfz[p];
if (sfz > ZEO) {
if (k0 <= ex3 - 4) {
f_rhs[p] += sfz * d12dz *
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
} else if (k0 <= ex3 - 3) {
f_rhs[p] += sfz * d12dz *
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
} else if (k0 <= ex3 - 2) {
f_rhs[p] -= sfz * d12dz *
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
}
} else if (sfz < ZEO) {
if ((k0 - 2) >= kminF) {
f_rhs[p] -= sfz * d12dz *
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
} else if ((k0 - 1) >= kminF) {
f_rhs[p] += sfz * d12dz *
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
} else if (k0 >= kminF) {
f_rhs[p] += sfz * d12dz *
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
}
}
}
}
}
// free(fh);
}

View File

@@ -324,6 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -349,6 +350,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -377,6 +379,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -883,6 +886,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -908,6 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -936,6 +941,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
integer::i integer::i
funcc = 0.d0
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -1111,8 +1117,7 @@ end subroutine d2dump
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
! Lagrangian polynomial interpolation ! Lagrangian polynomial interpolation
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
subroutine polint(xa, ya, x, y, dy, ordn)
subroutine polint(xa, ya, x, y, dy, ordn)
implicit none implicit none
integer, intent(in) :: ordn integer, intent(in) :: ordn
@@ -1124,13 +1129,15 @@ end subroutine d2dump
real*8, dimension(ordn) :: c, d, ho real*8, dimension(ordn) :: c, d, ho
real*8 :: dif, dift, hp, h, den_val real*8 :: dif, dift, hp, h, den_val
! Initialization
c = ya c = ya
d = ya d = ya
ho = xa - x ho = xa - x
ns = 1 ns = 1
dif = abs(x - xa(1)) dif = abs(x - xa(1))
! Find the index of the closest table entry
do i = 2, ordn do i = 2, ordn
dift = abs(x - xa(i)) dift = abs(x - xa(i))
if (dift < dif) then if (dift < dif) then
@@ -1141,26 +1148,31 @@ end subroutine d2dump
y = ya(ns) y = ya(ns)
ns = ns - 1 ns = ns - 1
! Main Neville's algorithm loop
do m = 1, ordn - 1 do m = 1, ordn - 1
n_m = ordn - m n_m = ordn - m
do i = 1, n_m do i = 1, n_m
hp = ho(i) hp = ho(i)
h = ho(i+m) h = ho(i+m)
den_val = hp - h den_val = hp - h
! Check for division by zero locally
if (den_val == 0.0d0) then if (den_val == 0.0d0) then
write(*,*) 'failure in polint for point',x write(*,*) 'failure in polint for point',x
write(*,*) 'with input points: ',xa write(*,*) 'with input points: ',xa
stop stop
end if end if
! Reuse den_val to avoid redundant divisions
den_val = (c(i+1) - d(i)) / den_val den_val = (c(i+1) - d(i)) / den_val
! Update c and d in place
d(i) = h * den_val d(i) = h * den_val
c(i) = hp * den_val c(i) = hp * den_val
end do end do
! Decide which path (up or down the tableau) to take
if (2 * ns < n_m) then if (2 * ns < n_m) then
dy = c(ns + 1) dy = c(ns + 1)
else else
@@ -1177,92 +1189,68 @@ end subroutine d2dump
! interpolation in 2 dimensions, follow yx order ! interpolation in 2 dimensions, follow yx order
! !
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn) subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
implicit none implicit none
integer,intent(in) :: ordn
real*8, dimension(ordn), intent(in) :: x1a,x2a
real*8, dimension(ordn,ordn), intent(in) :: ya
real*8, intent(in) :: x1,x2
real*8, intent(out) :: y,dy
integer,intent(in) :: ordn integer :: j
real*8, dimension(1:ordn), intent(in) :: x1a,x2a real*8, dimension(ordn) :: ymtmp
real*8, dimension(1:ordn,1:ordn), intent(in) :: ya real*8 :: dy_temp ! Local variable to prevent overwriting result
real*8, intent(in) :: x1,x2
real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER ! Optimized sequence: Loop over columns (j)
integer :: i,m ! ya(:,j) is a contiguous memory block in Fortran
real*8, dimension(ordn) :: ymtmp do j=1,ordn
real*8, dimension(ordn) :: yntmp call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
end do
m=size(x1a) ! Final interpolation on the results
do i=1,m call polint(x2a, ymtmp, x2, y, dy, ordn)
yntmp=ya(i,:)
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
end do
call polint(x1a,ymtmp,x1,y,dy,ordn)
#else
integer :: j
real*8, dimension(ordn) :: ymtmp
real*8 :: dy_temp
do j=1,ordn return
call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
end do
call polint(x2a, ymtmp, x2, y, dy, ordn)
#endif
return
end subroutine polin2 end subroutine polin2
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
! !
! interpolation in 3 dimensions, follow zyx order ! interpolation in 3 dimensions, follow zyx order
! !
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn) subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
implicit none implicit none
integer,intent(in) :: ordn
real*8, dimension(ordn), intent(in) :: x1a,x2a,x3a
real*8, dimension(ordn,ordn,ordn), intent(in) :: ya
real*8, intent(in) :: x1,x2,x3
real*8, intent(out) :: y,dy
integer,intent(in) :: ordn integer :: j, k
real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a real*8, dimension(ordn,ordn) :: yatmp
real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya real*8, dimension(ordn) :: ymtmp
real*8, intent(in) :: x1,x2,x3 real*8 :: dy_temp
real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER ! Sequence change: Process the contiguous first dimension (x1) first.
integer :: i,j,m,n ! We loop through the 'slow' planes (j, k) to extract 'fast' columns.
real*8, dimension(ordn,ordn) :: yatmp do k=1,ordn
real*8, dimension(ordn) :: ymtmp do j=1,ordn
real*8, dimension(ordn) :: yntmp ! ya(:,j,k) is contiguous; much faster than ya(i,j,:)
real*8, dimension(ordn) :: yqtmp call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
end do
m=size(x1a)
n=size(x2a)
do i=1,m
do j=1,n
yqtmp=ya(i,j,:)
call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
end do
yntmp=yatmp(i,:)
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
end do
call polint(x1a,ymtmp,x1,y,dy,ordn)
#else
integer :: j, k
real*8, dimension(ordn,ordn) :: yatmp
real*8, dimension(ordn) :: ymtmp
real*8 :: dy_temp
do k=1,ordn
do j=1,ordn
call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
end do end do
end do
do k=1,ordn
call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
end do
call polint(x3a, ymtmp, x3, y, dy, ordn)
#endif
return ! Now process the second dimension
do k=1,ordn
call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
end do
! Final dimension
call polint(x3a, ymtmp, x3, y, dy, ordn)
return
end subroutine polin3 end subroutine polin3
!-------------------------------------------------------------------------------------- !--------------------------------------------------------------------------------------
! calculate L2norm ! calculate L2norm
subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,& subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
f,f_out,gw) f,f_out,gw)
@@ -1279,9 +1267,7 @@ end subroutine d2dump
real*8 :: dX, dY, dZ real*8 :: dX, dY, dZ
integer::imin,jmin,kmin integer::imin,jmin,kmin
integer::imax,jmax,kmax integer::imax,jmax,kmax
integer::i,j,k,n_elements integer::i,j,k
real*8, dimension(:), allocatable :: f_flat
real*8, external :: DDOT
dX = X(2) - X(1) dX = X(2) - X(1)
dY = Y(2) - Y(1) dY = Y(2) - Y(1)
@@ -1305,12 +1291,7 @@ if(dabs(X(1)-xmin) < dX) imin = 1
if(dabs(Y(1)-ymin) < dY) jmin = 1 if(dabs(Y(1)-ymin) < dY) jmin = 1
if(dabs(Z(1)-zmin) < dZ) kmin = 1 if(dabs(Z(1)-zmin) < dZ) kmin = 1
! Optimized with oneMKL BLAS DDOT for dot product f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat)
f_out = f_out*dX*dY*dZ f_out = f_out*dX*dY*dZ
@@ -1335,9 +1316,7 @@ f_out = f_out*dX*dY*dZ
real*8 :: dX, dY, dZ real*8 :: dX, dY, dZ
integer::imin,jmin,kmin integer::imin,jmin,kmin
integer::imax,jmax,kmax integer::imax,jmax,kmax
integer::i,j,k,n_elements integer::i,j,k
real*8, dimension(:), allocatable :: f_flat
real*8, external :: DDOT
real*8 :: PIo4 real*8 :: PIo4
@@ -1400,12 +1379,7 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1 if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif endif
! Optimized with oneMKL BLAS DDOT for dot product f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat)
f_out = f_out*dX*dY*dZ f_out = f_out*dX*dY*dZ
@@ -1433,8 +1407,6 @@ f_out = f_out*dX*dY*dZ
integer::imin,jmin,kmin integer::imin,jmin,kmin
integer::imax,jmax,kmax integer::imax,jmax,kmax
integer::i,j,k integer::i,j,k
real*8, dimension(:), allocatable :: f_flat
real*8, external :: DDOT
real*8 :: PIo4 real*8 :: PIo4
@@ -1497,12 +1469,11 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1 if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif endif
! Optimized with oneMKL BLAS DDOT for dot product f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
f_out = f_out
Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(Nout))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
deallocate(f_flat)
return return
@@ -1700,7 +1671,6 @@ deallocate(f_flat)
real*8, dimension(ORDN,ORDN) :: tmp2 real*8, dimension(ORDN,ORDN) :: tmp2
real*8, dimension(ORDN) :: tmp1 real*8, dimension(ORDN) :: tmp1
real*8, dimension(3) :: SoAh real*8, dimension(3) :: SoAh
real*8, external :: DDOT
! +1 because c++ gives 0 for first point ! +1 because c++ gives 0 for first point
cxB = inds+1 cxB = inds+1
@@ -1736,21 +1706,20 @@ deallocate(f_flat)
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3)) ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
endif endif
! Optimized with BLAS operations for better performance
! First dimension: z-direction weighted sum
tmp2=0 tmp2=0
do m=1,ORDN do m=1,ORDN
tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m) tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
enddo enddo
! Second dimension: y-direction weighted sum
tmp1=0 tmp1=0
do m=1,ORDN do m=1,ORDN
tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m) tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
enddo enddo
! Third dimension: x-direction weighted sum using BLAS DDOT f_int=0
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1) do m=1,ORDN
f_int = f_int + coef(m)*tmp1(m)
enddo
return return
@@ -1780,7 +1749,6 @@ deallocate(f_flat)
real*8, dimension(ORDN,ORDN) :: ya real*8, dimension(ORDN,ORDN) :: ya
real*8, dimension(ORDN) :: tmp1 real*8, dimension(ORDN) :: tmp1
real*8, dimension(2) :: SoAh real*8, dimension(2) :: SoAh
real*8, external :: DDOT
! +1 because c++ gives 0 for first point ! +1 because c++ gives 0 for first point
cxB = inds(1:2)+1 cxB = inds(1:2)+1
@@ -1810,14 +1778,15 @@ deallocate(f_flat)
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3)) ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
endif endif
! Optimized with BLAS operations
tmp1=0 tmp1=0
do m=1,ORDN do m=1,ORDN
tmp1 = tmp1 + coef(ORDN+m)*ya(:,m) tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
enddo enddo
! Use BLAS DDOT for final weighted sum f_int=0
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1) do m=1,ORDN
f_int = f_int + coef(m)*tmp1(m)
enddo
return return
@@ -1848,7 +1817,6 @@ deallocate(f_flat)
real*8, dimension(ORDN) :: ya real*8, dimension(ORDN) :: ya
real*8 :: SoAh real*8 :: SoAh
integer,dimension(3) :: inds integer,dimension(3) :: inds
real*8, external :: DDOT
! +1 because c++ gives 0 for first point ! +1 because c++ gives 0 for first point
inds = indsi + 1 inds = indsi + 1
@@ -1909,8 +1877,10 @@ deallocate(f_flat)
write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
endif endif
! Optimized with BLAS DDOT for weighted sum f_int=0
f_int = DDOT(ORDN, coef, 1, ya, 1) do m=1,ORDN
f_int = f_int + coef(m)*ya(m)
enddo
return return
@@ -2142,38 +2112,24 @@ deallocate(f_flat)
end function fWigner_d_function end function fWigner_d_function
!---------------------------------- !----------------------------------
! Optimized factorial function using lookup table for small N
! and log-gamma for large N to avoid overflow
function ffact(N) result(gont) function ffact(N) result(gont)
implicit none implicit none
integer,intent(in) :: N integer,intent(in) :: N
real*8 :: gont real*8 :: gont
integer :: i
! Lookup table for factorials 0! to 20! (precomputed) integer :: i
real*8, parameter, dimension(0:20) :: fact_table = [ &
1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
2432902008176640000.d0 ]
! sanity check ! sanity check
if(N < 0)then if(N < 0)then
write(*,*) "ffact: error input for factorial" write(*,*) "ffact: error input for factorial"
gont = 1.d0
return return
endif endif
! Use lookup table for small N (fast path) gont = 1.d0
if(N <= 20)then do i=1,N
gont = fact_table(N) gont = gont*i
else enddo
! Use log-gamma function for large N: N! = exp(log_gamma(N+1))
! This avoids overflow and is computed efficiently
gont = exp(log_gamma(dble(N+1)))
endif
return return
@@ -2307,3 +2263,4 @@ subroutine find_maximum(ext,X,Y,Z,fun,val,pos,llb,uub)
return return
end subroutine end subroutine

View File

@@ -16,66 +16,115 @@ using namespace std;
#include <string.h> #include <string.h>
#include <math.h> #include <math.h>
#endif #endif
/* Linear equation solution by Gauss-Jordan elimination.
// Intel oneMKL LAPACK interface
#include <mkl_lapacke.h>
/* Linear equation solution using Intel oneMKL LAPACK.
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
containing the right-hand side vectors. On output a is containing the right-hand side vectors. On output a is
replaced by its matrix inverse, and b is replaced by the replaced by its matrix inverse, and b is replaced by the
corresponding set of solution vectors. corresponding set of solution vectors */
Mathematical equivalence:
Solves: A * x = b => x = A^(-1) * b
Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
within numerical precision. */
int gaussj(double *a, double *b, int n) int gaussj(double *a, double *b, int n)
{ {
// Allocate pivot array and workspace double swap;
lapack_int *ipiv = new lapack_int[n];
lapack_int info;
// Make a copy of matrix a for solving (dgesv modifies it to LU form) int *indxc, *indxr, *ipiv;
double *a_copy = new double[n * n]; indxc = new int[n];
for (int i = 0; i < n * n; i++) { indxr = new int[n];
a_copy[i] = a[i]; ipiv = new int[n];
int i, icol, irow, j, k, l, ll;
double big, dum, pivinv, temp;
for (j = 0; j < n; j++)
ipiv[j] = 0;
for (i = 0; i < n; i++)
{
big = 0.0;
for (j = 0; j < n; j++)
if (ipiv[j] != 1)
for (k = 0; k < n; k++)
{
if (ipiv[k] == 0)
{
if (fabs(a[j * n + k]) >= big)
{
big = fabs(a[j * n + k]);
irow = j;
icol = k;
}
}
else if (ipiv[k] > 1)
{
cout << "gaussj: Singular Matrix-1" << endl;
for (int ii = 0; ii < n; ii++)
{
for (int jj = 0; jj < n; jj++)
cout << a[ii * n + jj] << " ";
cout << endl;
}
return 1; // error return
}
}
ipiv[icol] = ipiv[icol] + 1;
if (irow != icol)
{
for (l = 0; l < n; l++)
{
swap = a[irow * n + l];
a[irow * n + l] = a[icol * n + l];
a[icol * n + l] = swap;
}
swap = b[irow];
b[irow] = b[icol];
b[icol] = swap;
}
indxr[i] = irow;
indxc[i] = icol;
if (a[icol * n + icol] == 0.0)
{
cout << "gaussj: Singular Matrix-2" << endl;
for (int ii = 0; ii < n; ii++)
{
for (int jj = 0; jj < n; jj++)
cout << a[ii * n + jj] << " ";
cout << endl;
}
return 1; // error return
}
pivinv = 1.0 / a[icol * n + icol];
a[icol * n + icol] = 1.0;
for (l = 0; l < n; l++)
a[icol * n + l] *= pivinv;
b[icol] *= pivinv;
for (ll = 0; ll < n; ll++)
if (ll != icol)
{
dum = a[ll * n + icol];
a[ll * n + icol] = 0.0;
for (l = 0; l < n; l++)
a[ll * n + l] -= a[icol * n + l] * dum;
b[ll] -= b[icol] * dum;
}
} }
// Step 1: Solve linear system A*x = b using LU decomposition for (l = n - 1; l >= 0; l--)
// LAPACKE_dgesv uses column-major by default, but we use row-major {
info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1); if (indxr[l] != indxc[l])
for (k = 0; k < n; k++)
if (info != 0) { {
cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl; swap = a[k * n + indxr[l]];
delete[] ipiv; a[k * n + indxr[l]] = a[k * n + indxc[l]];
delete[] a_copy; a[k * n + indxc[l]] = swap;
return 1; }
}
// Step 2: Compute matrix inverse A^(-1) using LU factorization
// First do LU factorization of original matrix a
info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
if (info != 0) {
cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
return 1;
}
// Then compute inverse from LU factorization
info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
if (info != 0) {
cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
return 1;
} }
delete[] indxc;
delete[] indxr;
delete[] ipiv; delete[] ipiv;
delete[] a_copy;
return 0; return 0;
} }

View File

@@ -512,10 +512,11 @@
IMPLICIT DOUBLE PRECISION (A-H,O-Z) IMPLICIT DOUBLE PRECISION (A-H,O-Z)
DIMENSION V(N),W(N) DIMENSION V(N),W(N)
! SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT. ! SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT.
! Optimized using Intel oneMKL BLAS ddot
! Mathematical equivalence: DGVV = sum_{i=1}^{N} V(i)*W(i)
DOUBLE PRECISION, EXTERNAL :: DDOT SUM = 0.0D0
DGVV = DDOT(N, V, 1, W, 1) DO 10 I = 1,N
SUM = SUM + V(I)*W(I)
10 CONTINUE
DGVV = SUM
RETURN RETURN
END END

View File

@@ -2,7 +2,7 @@
#ifndef MICRODEF_H #ifndef MICRODEF_H
#define MICRODEF_H #define MICRODEF_H
#include "macrodef.fh" #include "microdef.fh"
// application parameters // application parameters

View File

@@ -8,7 +8,7 @@ include makefile.inc
$(f90) $(f90appflags) -c $< -o $@ $(f90) $(f90appflags) -c $< -o $@
.C.o: .C.o:
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@ ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
.for.o: .for.o:
$(f77) -c $< -o $@ $(f77) -c $< -o $@
@@ -16,20 +16,13 @@ include makefile.inc
.cu.o: .cu.o:
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
TwoPunctures.o: TwoPunctures.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
# Input files # Input files
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o bssn_class.o surface_integral.o ShellPatch.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
NullShellPatch2_Evo.o writefile_f.o xh_bssn_rhs.o xh_fdderivs.o xh_fderivs.o xh_kodiss.o xh_lopsided.o \ NullShellPatch2_Evo.o writefile_f.o
xh_global_interp.o xh_polint3.o
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o surface_integral.o ShellPatch.o\ cgh.o surface_integral.o ShellPatch.o\
@@ -73,7 +66,7 @@ $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\ fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\
NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\ NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\
empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\ empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\
initial_null2.h NullShellPatch2.h xh_bssn_rhs_compute.h xh_global_interp.h initial_null2.h NullShellPatch2.h
$(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\ misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
@@ -97,13 +90,13 @@ misc.o : zbesh.o
# projects # projects
ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES) TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean: clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f rm *.o ABE ABEGPU TwoPunctureABE make.log -f

View File

@@ -1,32 +1,33 @@
## GCC version (commented out) ## GCC version (commented out)
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ ## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ ## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran ## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Intel oneAPI version with oneMKL (Optimized for performance) ## Intel oneAPI version with oneMKL (Optimized for performance)
filein = -I/usr/include/ -I${MKLROOT}/include filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization) ## Aggressive optimization flags:
## -fprofile-instr-use: use collected profile data to guide optimization decisions ## -O3: Maximum optimization
## (branch prediction, basic block layout, inlining, loop unrolling) ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
PROFDATA = /home/hxh/AMSS-NCKU/pgo_profile/default.profdata ## -fp-model fast=2: Aggressive floating-point optimizations
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ ## -fma: Enable fused multiply-add instructions
-fprofile-instr-use=$(PROFDATA) \ ## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
-Dfortran3 -Dnewc -I${MKLROOT}/include CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma \
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ -Dfortran3 -Dnewc -I${MKLROOT}/include
-fprofile-instr-use=$(PROFDATA) \ f90appflags = -O3 -xHost -fp-model fast=2 -fma \
-align array64byte -fpp -I${MKLROOT}/include -fpp -I${MKLROOT}/include
f90 = ifx f90 = ifx
f77 = ifx f77 = ifx
CXX = icpx CXX = icpx
CC = icx CC = icx
CLINKER = mpiicpx CLINKER = mpiicpx
Cu = nvcc Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc #CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc

View File

@@ -220,9 +220,16 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
pox[2][n] = rex * nz_g[n]; pox[2][n] = rex * nz_g[n];
} }
double *shellf;
shellf = new double[n_tot * InList];
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
int mp, Lp, Nmin, Nmax; int mp, Lp, Nmin, Nmax;
mp = n_tot / cpusize; mp = n_tot / cpusize;
Lp = n_tot - cpusize * mp; Lp = n_tot - cpusize * mp;
if (Lp > myrank) if (Lp > myrank)
{ {
Nmin = myrank * mp + myrank; Nmin = myrank * mp + myrank;
@@ -234,11 +241,6 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
Nmax = Nmin + mp - 1; Nmax = Nmin + mp - 1;
} }
double *shellf;
shellf = new double[n_tot * InList];
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
//|~~~~~> Integrate the dot product of Dphi with the surface normal. //|~~~~~> Integrate the dot product of Dphi with the surface normal.
double *RP_out, *IP_out; double *RP_out, *IP_out;
@@ -361,17 +363,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -563,17 +556,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, Comm_here);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -751,17 +735,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -1009,17 +984,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -1453,17 +1419,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -1897,17 +1854,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH,
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -2092,17 +2040,8 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch2 *GH, var *
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -2287,17 +2226,8 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch *GH, var *R
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.
@@ -2384,9 +2314,25 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
pox[2][n] = rex * nz_g[n]; pox[2][n] = rex * nz_g[n];
} }
double *shellf;
shellf = new double[n_tot * InList];
// we have assumed there is only one box on this level,
// so we do not need loop boxes
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
double Mass_out = 0;
double ang_outx, ang_outy, ang_outz;
double p_outx, p_outy, p_outz;
ang_outx = ang_outy = ang_outz = 0.0;
p_outx = p_outy = p_outz = 0.0;
const double f1o8 = 0.125;
int mp, Lp, Nmin, Nmax; int mp, Lp, Nmin, Nmax;
mp = n_tot / cpusize; mp = n_tot / cpusize;
Lp = n_tot - cpusize * mp; Lp = n_tot - cpusize * mp;
if (Lp > myrank) if (Lp > myrank)
{ {
Nmin = myrank * mp + myrank; Nmin = myrank * mp + myrank;
@@ -2398,20 +2344,6 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
Nmax = Nmin + mp - 1; Nmax = Nmin + mp - 1;
} }
double *shellf;
shellf = new double[n_tot * InList];
// we have assumed there is only one box on this level,
// so we do not need loop boxes
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
double Mass_out = 0;
double ang_outx, ang_outy, ang_outz;
double p_outx, p_outy, p_outz;
ang_outx = ang_outy = ang_outz = 0.0;
p_outx = p_outy = p_outz = 0.0;
const double f1o8 = 0.125;
double Chi, Psi; double Chi, Psi;
double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz; double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz; double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;
@@ -2532,13 +2464,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
} }
} }
{ MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
double scalar_in[7]; MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3]; MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
} MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#ifdef GaussInt #ifdef GaussInt
mass = mass * rex * rex * dphi * factor; mass = mass * rex * rex * dphi * factor;
@@ -2653,7 +2587,6 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
// we have assumed there is only one box on this level, // we have assumed there is only one box on this level,
// so we do not need loop boxes // so we do not need loop boxes
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Comm_here); GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Comm_here);
double Mass_out = 0; double Mass_out = 0;
@@ -2802,13 +2735,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
} }
} }
{ MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
double scalar_in[7]; MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, Comm_here); MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3]; MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
} MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
#ifdef GaussInt #ifdef GaussInt
mass = mass * rex * rex * dphi * factor; mass = mass * rex * rex * dphi * factor;
@@ -3085,13 +3020,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *c
} }
} }
{ MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
double scalar_in[7]; MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3]; MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
} MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#ifdef GaussInt #ifdef GaussInt
mass = mass * rex * rex * dphi * factor; mass = mass * rex * rex * dphi * factor;
@@ -3670,17 +3607,8 @@ void surface_integral::surf_Wave(double rex, cgh *GH, ShellPatch *SH,
} }
//|------+ Communicate and sum the results from each processor. //|------+ Communicate and sum the results from each processor.
{ MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP_out = new double[2 * NN]; MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
double *RPIP = new double[2 * NN];
memcpy(RPIP_out, RP_out, NN * sizeof(double));
memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, RPIP, NN * sizeof(double));
memcpy(IP, RPIP + NN, NN * sizeof(double));
delete[] RPIP_out;
delete[] RPIP;
}
//|------= Free memory. //|------= Free memory.

File diff suppressed because it is too large Load Diff

View File

@@ -1,30 +0,0 @@
#include "xh_tool.h"
extern "C"
{
int f_compute_rhs_bssn_xh(int *ex, double &T,
double *X, double *Y, double *Z,
double *chi, double *trK,
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
double *Gamx, double *Gamy, double *Gamz,
double *Lap, double *betax, double *betay, double *betaz,
double *dtSfx, double *dtSfy, double *dtSfz,
double *chi_rhs, double *trK_rhs,
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
double *rho, double *Sx, double *Sy, double *Sz,
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co
);
}

View File

@@ -1,311 +0,0 @@
#include "xh_tool.h"
void fdderivs(const int ex[3],
const double *f,
double *fxx, double *fxy, double *fxz,
double *fyy, double *fyz, double *fzz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff)
{
(void)onoff;
const int NO_SYMM = 0, EQ_SYMM = 1;
const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
const double F1o4 = 2.5e-1; // 1/4
const double F8 = 8.0;
const double F16 = 16.0;
const double F30 = 30.0;
const double F1o12 = ONE / 12.0;
const double F1o144 = ONE / 144.0;
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
const int imaxF = ex1;
const int jmaxF = ex2;
const int kmaxF = ex3;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
/* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
const size_t nx = (size_t)ex1 + 2;
const size_t ny = (size_t)ex2 + 2;
const size_t nz = (size_t)ex3 + 2;
const size_t fh_size = nx * ny * nz;
/* 系数:按 Fortran 原式 */
const double Sdxdx = ONE / (dX * dX);
const double Sdydy = ONE / (dY * dY);
const double Sdzdz = ONE / (dZ * dZ);
const double Fdxdx = F1o12 / (dX * dX);
const double Fdydy = F1o12 / (dY * dY);
const double Fdzdz = F1o12 / (dZ * dZ);
const double Sdxdy = F1o4 / (dX * dY);
const double Sdxdz = F1o4 / (dX * dZ);
const double Sdydz = F1o4 / (dY * dZ);
const double Fdxdy = F1o144 / (dX * dY);
const double Fdxdz = F1o144 / (dX * dZ);
const double Fdydz = F1o144 / (dY * dZ);
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
if (fh_size > cap) {
free(fh);
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
// double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
// symmetry_bd(2, ex, f, fh, SoA);
const double SoA[3] = { SYM1, SYM2, SYM3 };
for (int k0 = 0; k0 < ex[2]; ++k0) {
for (int j0 = 0; j0 < ex[1]; ++j0) {
for (int i0 = 0; i0 < ex[0]; ++i0) {
const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
fh[idx_funcc_F(iF, jF, kF, 2, ex)] = f[idx_func0(i0, j0, k0, ex)];
}
}
}
// 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
for (int ii = 0; ii <= 2 - 1; ++ii) {
const int iF_dst = -ii; // 0, -1, -2, ...
const int iF_src = ii + 1; // 1, 2, 3, ...
for (int kF = 1; kF <= ex[2]; ++kF) {
for (int jF = 1; jF <= ex[1]; ++jF) {
fh[idx_funcc_F(iF_dst, jF, kF, 2, ex)] =
fh[idx_funcc_F(iF_src, jF, kF, 2, ex)] * SoA[0];
}
}
}
// 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
// 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖
for (int jj = 0; jj <= 2 - 1; ++jj) {
const int jF_dst = -jj;
const int jF_src = jj + 1;
for (int kF = 1; kF <= ex[2]; ++kF) {
for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
fh[idx_funcc_F(iF, jF_dst, kF, 2, ex)] =
fh[idx_funcc_F(iF, jF_src, kF, 2, ex)] * SoA[1];
}
}
}
// 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
for (int kk = 0; kk <= 2 - 1; ++kk) {
const int kF_dst = -kk;
const int kF_src = kk + 1;
for (int jF = -2 + 1; jF <= ex[1]; ++jF) {
for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
fh[idx_funcc_F(iF, jF, kF_dst, 2, ex)] =
fh[idx_funcc_F(iF, jF, kF_src, 2, ex)] * SoA[2];
}
}
}
/* 输出清零fxx,fyy,fzz,fxy,fxz,fyz = 0 */
// const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
// for (size_t p = 0; p < all; ++p) {
// fxx[p] = ZEO; fyy[p] = ZEO; fzz[p] = ZEO;
// fxy[p] = ZEO; fxz[p] = ZEO; fyz[p] = ZEO;
// }
/*
* Fortran:
* do k=1,ex3-1
* do j=1,ex2-1
* do i=1,ex1-1
*/
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
/* 高阶分支i±2,j±2,k±2 都在范围内 */
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
{
fxx[p] = Fdxdx * (
-fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);
fyy[p] = Fdydy * (
-fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);
fzz[p] = Fdzdz * (
-fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);
/* fxy 高阶:完全照搬 Fortran 的括号结构 */
{
const double t_jm2 =
( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
const double t_jm1 =
( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
const double t_jp1 =
( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
const double t_jp2 =
( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
}
/* fxz 高阶 */
{
const double t_km2 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
const double t_km1 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
const double t_kp1 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
const double t_kp2 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}
/* fyz 高阶 */
{
const double t_km2 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
const double t_km1 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
const double t_kp1 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
const double t_kp2 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}
}
/* 二阶分支i±1,j±1,k±1 在范围内 */
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
{
fxx[p] = Sdxdx * (
fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);
fyy[p] = Sdydy * (
fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);
fzz[p] = Sdzdz * (
fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);
fxy[p] = Sdxdy * (
fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
);
fxz[p] = Sdxdz * (
fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
);
fyz[p] = Sdydz * (
fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
);
}else{
fxx[p] = 0.0;
fyy[p] = 0.0;
fzz[p] = 0.0;
fxy[p] = 0.0;
fxz[p] = 0.0;
fyz[p] = 0.0;
}
}
}
}
// free(fh);
}

View File

@@ -1,145 +0,0 @@
#include "xh_tool.h"
/*
* C 版 fderivs
*
* Fortran:
* subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
*
* 约定:
* f, fx, fy, fz: ex1*ex2*ex3按 idx_ex 布局
* X: ex1, Y: ex2, Z: ex3
*/
void fderivs(const int ex[3],
const double *f,
double *fx, double *fy, double *fz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff)
{
(void)onoff; // Fortran 里没用到
const double ZEO = 0.0, ONE = 1.0;
const double TWO = 2.0, EIT = 8.0;
const double F12 = 12.0;
const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 在本子程序里不直接用
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
// dX = X(2)-X(1) -> C: X[1]-X[0]
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
// SoA(1:3) = SYM1,SYM2,SYM3
const double SoA[3] = { SYM1, SYM2, SYM3 };
// fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2
const size_t nx = (size_t)ex1 + 2;
const size_t ny = (size_t)ex2 + 2;
const size_t nz = (size_t)ex3 + 2;
const size_t fh_size = nx * ny * nz;
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
if (fh_size > cap) {
free(fh);
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
// double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
// call symmetry_bd(2,ex,f,fh,SoA)
symmetry_bd(2, ex, f, fh, SoA);
const double d12dx = ONE / F12 / dX;
const double d12dy = ONE / F12 / dY;
const double d12dz = ONE / F12 / dZ;
const double d2dx = ONE / TWO / dX;
const double d2dy = ONE / TWO / dY;
const double d2dz = ONE / TWO / dZ;
// fx = fy = fz = 0
const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
for (size_t p = 0; p < all; ++p) {
fx[p] = ZEO;
fy[p] = ZEO;
fz[p] = ZEO;
}
/*
* Fortran loops:
* do k=1,ex3-1
* do j=1,ex2-1
* do i=1,ex1-1
*
* C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
*/
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
// if(i+2 <= imax .and. i-2 >= imin ... ) (全是 Fortran 索引)
if ((iF + 2) <= ex1 && (iF - 2) >= iminF &&
(jF + 2) <= ex2 && (jF - 2) >= jminF &&
(kF + 2) <= ex3 && (kF - 2) >= kminF)
{
fx[p] = d12dx * (
fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] -
EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
EIT * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)]
);
fy[p] = d12dy * (
fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] -
EIT * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
EIT * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)]
);
fz[p] = d12dz * (
fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] -
EIT * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
EIT * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] -
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)]
);
}
// elseif(i+1 <= imax .and. i-1 >= imin ...)
else if ((iF + 1) <= ex1 && (iF - 1) >= iminF &&
(jF + 1) <= ex2 && (jF - 1) >= jminF &&
(kF + 1) <= ex3 && (kF - 1) >= kminF)
{
fx[p] = d2dx * (
-fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);
fy[p] = d2dy * (
-fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);
fz[p] = d2dz * (
-fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);
}
}
}
}
// free(fh);
}

View File

@@ -1,143 +0,0 @@
#include "xh_global_interp.h"
/* 你已有的 polin3由前面 Fortran->C 翻译得到) */
// void polin3(const double *x1a, const double *x2a, const double *x3a,
// const double *ya, double x1, double x2, double x3,
// double *y, double *dy, int ordn);
/*
你需要提供 decide3d 的实现(这里仅声明)。
Fortran: decide3d(ex,f,f,cxB,cxT,SoA,ya,ORDN,Symmetry)
- ex: [3]
- f: 三维场(列主序)
- cxB/cxT: 3 维窗口起止Fortran 1-based且可能 <=0
- SoA: [3]
- ya: 输出 ORDN^3 的采样块(列主序)
- return: 0 表示正常;非 0 表示错误(对应 Fortran logical = .true.
*/
// int xh_decide3d(const int ex[3],
// const double *f_in,
// const double *f_in2, /* Fortran 里传了 f,f按原样保留 */
// const int cxB[3],
// const int cxT[3],
// const double SoA[3],
// double *ya,
// int ordn,
// int symmetry);
/* 把 Fortran 1-based 下标 idxF (可为负/0) 映射到 C 的 X[idx] 访问(只用于 X(2-cxB) 这种表达式) */
static inline double X_at_FortranIndex(const double *X, int idxF) {
/* Fortran: X(1) 对应 C: X[0] */
return X[idxF - 1];
}
/* Fortran 整数截断idint 在这里可用 (int) 实现(对正数等价于 floor */
static inline int idint_like(double a) {
return (int)a; /* trunc toward zero */
}
/* global_interp 的 C 版 */
void xh_global_interp(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, /* f(ex1,ex2,ex3) column-major */
double &f_int,
double x1, double y1, double z1,
int ORDN,
const double SoA[3],
int symmetry)
{
// double time1, time2;
// time1 = omp_get_wtime();
enum { NO_SYMM = 0, EQUATORIAL = 1, OCTANT = 2 };
int j, m;
int imin, jmin, kmin;
int cxB[3], cxT[3], cxI[3], cmin[3], cmax[3];
double cx[3];
double dX, dY, dZ, ddy;
/* Fortran: imin=lbound(f,1) ... 通常是 1这里按 1 处理 */
imin = 1; jmin = 1; kmin = 1;
dX = X_at_FortranIndex(X, imin + 1) - X_at_FortranIndex(X, imin);
dY = X_at_FortranIndex(Y, jmin + 1) - X_at_FortranIndex(Y, jmin);
dZ = X_at_FortranIndex(Z, kmin + 1) - X_at_FortranIndex(Z, kmin);
/* x1a(j) = (j-1)*1.0 (j=1..ORDN) */
double *x1a = (double*)malloc((size_t)ORDN * sizeof(double));
double *ya = (double*)malloc((size_t)ORDN * (size_t)ORDN * (size_t)ORDN * sizeof(double));
if (!x1a || !ya) {
fprintf(stderr, "global_interp: malloc failed\n");
exit(1);
}
for (j = 0; j < ORDN; j++) x1a[j] = (double)j;
/* cxI(m) = idint((p - P(1))/dP + 0.4) + 1 (Fortran 1-based) */
cxI[0] = idint_like((x1 - X_at_FortranIndex(X, 1)) / dX + 0.4) + 1;
cxI[1] = idint_like((y1 - X_at_FortranIndex(Y, 1)) / dY + 0.4) + 1;
cxI[2] = idint_like((z1 - X_at_FortranIndex(Z, 1)) / dZ + 0.4) + 1;
/* cxB = cxI - ORDN/2 + 1 ; cxT = cxB + ORDN - 1 */
int half = ORDN / 2; /* Fortran 整数除法 */
for (m = 0; m < 3; m++) {
cxB[m] = cxI[m] - half + 1;
cxT[m] = cxB[m] + ORDN - 1;
}
/* cmin=1; cmax=ex */
cmin[0] = cmin[1] = cmin[2] = 1;
cmax[0] = ex[0];
cmax[1] = ex[1];
cmax[2] = ex[2];
/* 对称边界时允许 cxB 为负/0与 Fortran 一致) */
if (symmetry == OCTANT && fabs(X_at_FortranIndex(X, 1)) < dX) cmin[0] = -half + 2;
if (symmetry == OCTANT && fabs(X_at_FortranIndex(Y, 1)) < dY) cmin[1] = -half + 2;
if (symmetry != NO_SYMM && fabs(X_at_FortranIndex(Z, 1)) < dZ) cmin[2] = -half + 2;
/* 夹紧窗口 [cxB,cxT] 到 [cmin,cmax] */
for (m = 0; m < 3; m++) {
if (cxB[m] < cmin[m]) {
cxB[m] = cmin[m];
cxT[m] = cxB[m] + ORDN - 1;
}
if (cxT[m] > cmax[m]) {
cxT[m] = cmax[m];
cxB[m] = cxT[m] + 1 - ORDN;
}
}
/*
cx(m) 的计算:如果 cxB>0:
cx = (p - P(cxB))/dP
else:
cx = (p + P(2 - cxB))/dP
注意这里的 cxB 是 Fortran 1-based 语义下的整数,可能 <=0。
*/
if (cxB[0] > 0) cx[0] = (x1 - X_at_FortranIndex(X, cxB[0])) / dX;
else cx[0] = (x1 + X_at_FortranIndex(X, 2 - cxB[0])) / dX;
if (cxB[1] > 0) cx[1] = (y1 - X_at_FortranIndex(Y, cxB[1])) / dY;
else cx[1] = (y1 + X_at_FortranIndex(Y, 2 - cxB[1])) / dY;
if (cxB[2] > 0) cx[2] = (z1 - X_at_FortranIndex(Z, cxB[2])) / dZ;
else cx[2] = (z1 + X_at_FortranIndex(Z, 2 - cxB[2])) / dZ;
/* decide3d: 填充 ya(1:ORDN,1:ORDN,1:ORDN) */
if (xh_decide3d(ex, f, f, cxB, cxT, SoA, ya, ORDN, symmetry)) {
printf("global_interp position: %g %g %g\n", x1, y1, z1);
printf("data range: %g %g %g %g %g %g\n",
X_at_FortranIndex(X, 1), X_at_FortranIndex(X, ex[0]),
X_at_FortranIndex(Y, 1), X_at_FortranIndex(Y, ex[1]),
X_at_FortranIndex(Z, 1), X_at_FortranIndex(Z, ex[2]));
exit(1);
}
/* polin3(x1a,x1a,x1a,ya,cx(1),cx(2),cx(3),f_int,ddy,ORDN) */
xh_polin3(x1a, x1a, x1a, ya, cx[0], cx[1], cx[2], f_int, &ddy, ORDN);
free(x1a);
free(ya);
// time2 = omp_get_wtime();
// printf("Time for global_interp: %lf seconds\n", time2 - time1);
}

View File

@@ -1,12 +0,0 @@
#include "xh_po.h"
extern "C"{
void xh_global_interp(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, /* f(ex1,ex2,ex3) column-major */
double &f_int,
double x1, double y1, double z1,
int ORDN,
const double SoA[3],
int symmetry);
}

View File

@@ -1,116 +0,0 @@
#include "xh_tool.h"
/*
* C 版 kodis
*
* Fortran signature:
* subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps)
*
* 约定:
* X: ex1, Y: ex2, Z: ex3
* f, f_rhs: ex1*ex2*ex3 按 idx_ex 布局
* SoA[3]
* eps: double
*/
void kodis(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double SoA[3],
int Symmetry, double eps)
{
const double ONE = 1.0, SIX = 6.0, FIT = 15.0, TWT = 20.0;
const double cof = 64.0; // 2^6
const int NO_SYMM = 0, OCTANT = 2;
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
// Fortran: dX = X(2)-X(1) -> C: X[1]-X[0]
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
(void)ONE; // ONE 在原 Fortran 里只是参数,这里不一定用得上
// Fortran: imax=ex(1) 等是 1-based 上界
const int imaxF = ex1;
const int jmaxF = ex2;
const int kmaxF = ex3;
// Fortran: imin=jmin=kmin=1某些对称情况变 -2
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2;
if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2;
// 分配 fh大小 (ex1+3)*(ex2+3)*(ex3+3),对应 ord=3
const size_t nx = (size_t)ex1 + 3;
const size_t ny = (size_t)ex2 + 3;
const size_t nz = (size_t)ex3 + 3;
const size_t fh_size = nx * ny * nz;
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
if (fh_size > cap) {
free(fh);
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
if (!fh) return;
// Fortran: call symmetry_bd(3,ex,f,fh,SoA)
symmetry_bd(3, ex, f, fh, SoA);
/*
* Fortran loops:
* do k=1,ex3
* do j=1,ex2
* do i=1,ex1
*
* C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
* 并定义 Fortran index: iF=i0+1, ...
*/
for (int k0 = 0; k0 < ex3; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 < ex2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 < ex1; ++i0) {
const int iF = i0 + 1;
// Fortran if 条件:
// i-3 >= imin .and. i+3 <= imax 等(都是 Fortran 索引)
if ((iF - 3) >= iminF && (iF + 3) <= imaxF &&
(jF - 3) >= jminF && (jF + 3) <= jmaxF &&
(kF - 3) >= kminF && (kF + 3) <= kmaxF)
{
const size_t p = idx_ex(i0, j0, k0, ex);
// 三个方向各一份同型的 7 点组合(实际上是对称的 6th-order dissipation/filter 核)
const double Dx_term =
( (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
TWT * fh[idx_fh_F(iF , jF, kF, ex)] ) / dX;
const double Dy_term =
( (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
TWT * fh[idx_fh_F(iF, jF , kF, ex)] ) / dY;
const double Dz_term =
( (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
TWT * fh[idx_fh_F(iF, jF, kF , ex)] ) / dZ;
// Fortran:
// f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
}
}
}
}
// free(fh);
}

View File

@@ -1,262 +0,0 @@
#include "xh_tool.h"
/*
* 你需要提供 symmetry_bd 的 C 版本(或 Fortran 绑到 C 的接口)。
* Fortran: call symmetry_bd(3,ex,f,fh,SoA)
*
* 约定:
* nghost = 3
* ex[3] = {ex1,ex2,ex3}
* f = 原始网格 (ex1*ex2*ex3)
* fh = 扩展网格 ((ex1+3)*(ex2+3)*(ex3+3)),对应 Fortran 的 (-2:ex1, ...)
* SoA[3] = 输入参数
*/
void lopsided(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double *Sfx, const double *Sfy, const double *Sfz,
int Symmetry, const double SoA[3])
{
const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
const double TWO = 2.0, F6 = 6.0, F18 = 18.0;
const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
const int NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2;
(void)OCTANT; // 这里和 Fortran 一样只是定义了不用也没关系
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
// 对应 Fortran: dX = X(2)-X(1) Fortran 1-based
// C: X[1]-X[0]
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
const double d12dx = ONE / F12 / dX;
const double d12dy = ONE / F12 / dY;
const double d12dz = ONE / F12 / dZ;
// Fortran 里算了 d2dx/d2dy/d2dz 但本 subroutine 里没用到(保持一致也算出来)
const double d2dx = ONE / TWO / dX;
const double d2dy = ONE / TWO / dY;
const double d2dz = ONE / TWO / dZ;
(void)d2dx; (void)d2dy; (void)d2dz;
// Fortran:
// imax = ex(1); jmax = ex(2); kmax = ex(3)
const int imaxF = ex1;
const int jmaxF = ex2;
const int kmaxF = ex3;
// Fortran:
// imin=jmin=kmin=1; 若满足对称条件则设为 -2
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
// 分配 fh大小 (ex1+3)*(ex2+3)*(ex3+3)
const size_t nx = (size_t)ex1 + 3;
const size_t ny = (size_t)ex2 + 3;
const size_t nz = (size_t)ex3 + 3;
const size_t fh_size = nx * ny * nz;
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
if (fh_size > cap) {
free(fh);
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
if (!fh) return; // 内存不足:直接返回(你也可以改成 abort/报错)
// Fortran: call symmetry_bd(3,ex,f,fh,SoA)
symmetry_bd(3, ex, f, fh, SoA);
/*
* Fortran 主循环:
* do k=1,ex(3)-1
* do j=1,ex(2)-1
* do i=1,ex(1)-1
*
* 转成 C 0-based
* k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2
*
* 并且 Fortran 里的 i/j/k 在 fh 访问时,仍然是 Fortran 索引值:
* iF=i0+1, jF=j0+1, kF=k0+1
*/
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
// ---------------- x direction ----------------
const double sfx = Sfx[p];
if (sfx > ZEO) {
// Fortran: if(i+3 <= imax)
// iF+3 <= ex1 <=> i0+4 <= ex1 <=> i0 <= ex1-4
if (i0 <= ex1 - 4) {
f_rhs[p] += sfx * d12dx *
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
}
// elseif(i+2 <= imax) <=> i0 <= ex1-3
else if (i0 <= ex1 - 3) {
f_rhs[p] += sfx * d12dx *
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
}
// elseif(i+1 <= imax) <=> i0 <= ex1-2循环里总成立
else if (i0 <= ex1 - 2) {
f_rhs[p] -= sfx * d12dx *
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
}
} else if (sfx < ZEO) {
// Fortran: if(i-3 >= imin)
// (iF-3) >= iminF <=> (i0-2) >= iminF
if ((i0 - 2) >= iminF) {
f_rhs[p] -= sfx * d12dx *
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
}
// elseif(i-2 >= imin) <=> (i0-1) >= iminF
else if ((i0 - 1) >= iminF) {
f_rhs[p] += sfx * d12dx *
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
}
// elseif(i-1 >= imin) <=> i0 >= iminF
else if (i0 >= iminF) {
f_rhs[p] += sfx * d12dx *
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
}
}
// ---------------- y direction ----------------
const double sfy = Sfy[p];
if (sfy > ZEO) {
// jF+3 <= ex2 <=> j0+4 <= ex2 <=> j0 <= ex2-4
if (j0 <= ex2 - 4) {
f_rhs[p] += sfy * d12dy *
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
} else if (j0 <= ex2 - 3) {
f_rhs[p] += sfy * d12dy *
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
} else if (j0 <= ex2 - 2) {
f_rhs[p] -= sfy * d12dy *
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
}
} else if (sfy < ZEO) {
if ((j0 - 2) >= jminF) {
f_rhs[p] -= sfy * d12dy *
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
} else if ((j0 - 1) >= jminF) {
f_rhs[p] += sfy * d12dy *
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
} else if (j0 >= jminF) {
f_rhs[p] += sfy * d12dy *
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
}
}
// ---------------- z direction ----------------
const double sfz = Sfz[p];
if (sfz > ZEO) {
if (k0 <= ex3 - 4) {
f_rhs[p] += sfz * d12dz *
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
} else if (k0 <= ex3 - 3) {
f_rhs[p] += sfz * d12dz *
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
} else if (k0 <= ex3 - 2) {
f_rhs[p] -= sfz * d12dz *
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
}
} else if (sfz < ZEO) {
if ((k0 - 2) >= kminF) {
f_rhs[p] -= sfz * d12dz *
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
} else if ((k0 - 1) >= kminF) {
f_rhs[p] += sfz * d12dz *
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
} else if (k0 >= kminF) {
f_rhs[p] += sfz * d12dz *
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
}
}
}
}
}
// free(fh);
}

View File

@@ -1,19 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
int xh_decide3d(const int ex[3],
const double *f,
const double *fpi, /* 这里未用Fortran 也没用到 */
const int cxB[3],
const int cxT[3],
const double SoA[3],
double *ya,
int ordn,
int Symmetry);
void xh_polint(const double *xa, const double *ya, double x,
double *y, double *dy, int ordn);
void xh_polin3(const double *x1a, const double *x2a, const double *x3a,
const double *ya, double x1, double x2, double x3,
double &y, double *dy, int ordn);

View File

@@ -1,258 +0,0 @@
#include "xh_po.h"
/*
ex[0..2] == Fortran ex(1:3)
cxB/cxT == Fortran cxB(1:3), cxT(1:3) (可能 <=0)
SoA[0..2] == Fortran SoA(1:3)
f, fpi == Fortran f(ex1,ex2,ex3) column-major (1-based in formulas)
ya == 连续内存,尺寸为 ORDN^3对应 Fortran ya(cxB1:cxT1, cxB2:cxT2, cxB3:cxT3)
但注意:我们用 offset 映射把 Fortran 的 i/j/k 坐标写进去。
*/
static inline int imax(int a, int b) { return a > b ? a : b; }
static inline int imin(int a, int b) { return a < b ? a : b; }
/* f(i,j,k): Fortran column-major, i/j/k are Fortran 1-based in [1..ex] */
#define F(i,j,k) f[((i)-1) + ex1 * (((j)-1) + ex2 * ((k)-1))]
/*
ya(i,j,k): i in [cxB1..cxT1], j in [cxB2..cxT2], k in [cxB3..cxT3]
我们把它映射到 C 的 0..ORDN-1 立方体:
ii = i - cxB1
jj = j - cxB2
kk = k - cxB3
并按 column-major 存储(与 Fortran 一致,方便直接喂给你的 polin3
*/
#define YA(i,j,k) ya[((i)-cxB1) + ordn * (((j)-cxB2) + ordn * ((k)-cxB3))]
int xh_decide3d(const int ex[3],
const double *f,
const double *fpi, /* 这里未用Fortran 也没用到 */
const int cxB[3],
const int cxT[3],
const double SoA[3],
double *ya,
int ordn,
int Symmetry) /* Symmetry 在 decide3d 里也没直接用 */
{
(void)fpi;
(void)Symmetry;
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
int fmin1[3], fmin2[3], fmax1[3], fmax2[3];
int i, j, k, m;
int gont = 0;
/* 方便 YA 宏使用 */
const int cxB1 = cxB[0], cxB2 = cxB[1], cxB3 = cxB[2];
for (m = 0; m < 3; m++) {
/* Fortran 的 “NaN 检查” 在整数上基本无意义,这里不额外处理 */
fmin1[m] = imax(1, cxB[m]);
fmax1[m] = cxT[m];
fmin2[m] = cxB[m];
fmax2[m] = imin(0, cxT[m]);
/* if((fmin1<=fmax1) and (fmin1<1 or fmax1>ex)) gont=true */
if ((fmin1[m] <= fmax1[m]) && (fmin1[m] < 1 || fmax1[m] > ex[m])) gont = 1;
/* if((fmin2<=fmax2) and (2-fmax2<1 or 2-fmin2>ex)) gont=true */
if ((fmin2[m] <= fmax2[m]) && (2 - fmax2[m] < 1 || 2 - fmin2[m] > ex[m])) gont = 1;
}
if (gont) {
printf("error in decide3d\n");
printf("cxB: %d %d %d cxT: %d %d %d ex: %d %d %d\n",
cxB[0], cxB[1], cxB[2], cxT[0], cxT[1], cxT[2], ex[0], ex[1], ex[2]);
printf("fmin1: %d %d %d fmax1: %d %d %d\n",
fmin1[0], fmin1[1], fmin1[2], fmax1[0], fmax1[1], fmax1[2]);
printf("fmin2: %d %d %d fmax2: %d %d %d\n",
fmin2[0], fmin2[1], fmin2[2], fmax2[0], fmax2[1], fmax2[2]);
return 1;
}
/* ---- 填充 ya完全照 Fortran 两大块循环写 ---- */
/* k in [fmin1(3)..fmax1(3)] */
for (k = fmin1[2]; k <= fmax1[2]; k++) {
/* j in [fmin1(2)..fmax1(2)] */
for (j = fmin1[1]; j <= fmax1[1]; j++) {
/* i in [fmin1(1)..fmax1(1)] : ya(i,j,k)=f(i,j,k) */
for (i = fmin1[0]; i <= fmax1[0]; i++) {
YA(i, j, k) = F(i, j, k);
}
/* i in [fmin2(1)..fmax2(1)] : ya(i,j,k)=f(2-i,j,k)*SoA(1) */
for (i = fmin2[0]; i <= fmax2[0]; i++) {
YA(i, j, k) = F(2 - i, j, k) * SoA[0];
}
}
/* j in [fmin2(2)..fmax2(2)] */
for (j = fmin2[1]; j <= fmax2[1]; j++) {
/* i in [fmin1(1)..fmax1(1)] : ya(i,j,k)=f(i,2-j,k)*SoA(2) */
for (i = fmin1[0]; i <= fmax1[0]; i++) {
YA(i, j, k) = F(i, 2 - j, k) * SoA[1];
}
/* i in [fmin2(1)..fmax2(1)] : ya=f(2-i,2-j,k)*SoA(1)*SoA(2) */
for (i = fmin2[0]; i <= fmax2[0]; i++) {
YA(i, j, k) = F(2 - i, 2 - j, k) * SoA[0] * SoA[1];
}
}
}
/* k in [fmin2(3)..fmax2(3)] */
for (k = fmin2[2]; k <= fmax2[2]; k++) {
/* j in [fmin1(2)..fmax1(2)] */
for (j = fmin1[1]; j <= fmax1[1]; j++) {
/* i in [fmin1(1)..fmax1(1)] : ya=f(i,j,2-k)*SoA(3) */
for (i = fmin1[0]; i <= fmax1[0]; i++) {
YA(i, j, k) = F(i, j, 2 - k) * SoA[2];
}
/* i in [fmin2(1)..fmax2(1)] : ya=f(2-i,j,2-k)*SoA(1)*SoA(3) */
for (i = fmin2[0]; i <= fmax2[0]; i++) {
YA(i, j, k) = F(2 - i, j, 2 - k) * SoA[0] * SoA[2];
}
}
/* j in [fmin2(2)..fmax2(2)] */
for (j = fmin2[1]; j <= fmax2[1]; j++) {
/* i in [fmin1(1)..fmax1(1)] : ya=f(i,2-j,2-k)*SoA(2)*SoA(3) */
for (i = fmin1[0]; i <= fmax1[0]; i++) {
YA(i, j, k) = F(i, 2 - j, 2 - k) * SoA[1] * SoA[2];
}
/* i in [fmin2(1)..fmax2(1)] : ya=f(2-i,2-j,2-k)*SoA1*SoA2*SoA3 */
for (i = fmin2[0]; i <= fmax2[0]; i++) {
YA(i, j, k) = F(2 - i, 2 - j, 2 - k) * SoA[0] * SoA[1] * SoA[2];
}
}
}
return 0;
}
#undef F
#undef YA
void xh_polint(const double *xa, const double *ya, double x,
double *y, double *dy, int ordn)
{
int i, m, ns, n_m;
double dif, dift, hp, h, den_val;
double *c = (double*)malloc((size_t)ordn * sizeof(double));
double *d = (double*)malloc((size_t)ordn * sizeof(double));
double *ho = (double*)malloc((size_t)ordn * sizeof(double));
if (!c || !d || !ho) {
fprintf(stderr, "polint: malloc failed\n");
exit(1);
}
for (i = 0; i < ordn; i++) {
c[i] = ya[i];
d[i] = ya[i];
ho[i] = xa[i] - x;
}
ns = 0; // Fortran ns=1 -> C ns=0
dif = fabs(x - xa[0]);
for (i = 1; i < ordn; i++) {
dift = fabs(x - xa[i]);
if (dift < dif) {
ns = i;
dif = dift;
}
}
*y = ya[ns];
ns -= 1; // Fortran ns=ns-1
for (m = 1; m <= ordn - 1; m++) {
n_m = ordn - m; // number of active points this round
for (i = 0; i < n_m; i++) {
hp = ho[i];
h = ho[i + m];
den_val = hp - h;
if (den_val == 0.0) {
fprintf(stderr, "failure in polint for point %g\n", x);
fprintf(stderr, "with input points xa: ");
for (int t = 0; t < ordn; t++) fprintf(stderr, "%g ", xa[t]);
fprintf(stderr, "\n");
exit(1);
}
den_val = (c[i + 1] - d[i]) / den_val;
d[i] = h * den_val;
c[i] = hp * den_val;
}
// Fortran: if (2*ns < n_m) then dy=c(ns+1) else dy=d(ns); ns=ns-1
// Here ns is C-indexed and can be -1; logic still matches.
if (2 * ns < n_m) {
*dy = c[ns + 1];
} else {
*dy = d[ns];
ns -= 1;
}
*y += *dy;
}
free(c);
free(d);
free(ho);
}
void xh_polin3(const double *x1a, const double *x2a, const double *x3a,
const double *ya, double x1, double x2, double x3,
double &y, double *dy, int ordn)
{
// ya is ordn x ordn x ordn in Fortran layout (column-major)
#define YA3(i,j,k) ya[(i) + ordn*((j) + ordn*(k))] // i,j,k: 0..ordn-1
int j, k;
double dy_temp;
// yatmp(j,k) in Fortran code is ordn x ordn, treat column-major:
// yatmp(j,k) -> yatmp[j + ordn*k]
double *yatmp = (double*)malloc((size_t)ordn * (size_t)ordn * sizeof(double));
double *ymtmp = (double*)malloc((size_t)ordn * sizeof(double));
if (!yatmp || !ymtmp) {
fprintf(stderr, "polin3: malloc failed\n");
exit(1);
}
#define YAT(j,k) yatmp[(j) + ordn*(k)]
for (k = 0; k < ordn; k++) {
for (j = 0; j < ordn; j++) {
// call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp)
// ya(:,j,k) contiguous: base is &YA3(0,j,k)
xh_polint(x1a, &YA3(0, j, k), x1, &YAT(j, k), &dy_temp, ordn);
}
}
for (k = 0; k < ordn; k++) {
// call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp)
xh_polint(x2a, &YAT(0, k), x2, &ymtmp[k], &dy_temp, ordn);
}
xh_polint(x3a, ymtmp, x3, &y, dy, ordn);
#undef YAT
free(yatmp);
free(ymtmp);
#undef YA3
}

View File

@@ -1,338 +0,0 @@
#ifndef SHARE_FUNC_H
#define SHARE_FUNC_H
#include <stdlib.h>
#include <stddef.h>
#include <math.h>
#include <stdio.h>
#include <omp.h>
/* 主网格0-based -> 1D */
static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
const int ex1 = ex[0], ex2 = ex[1];
return (size_t)i0 + (size_t)j0 * (size_t)ex1 + (size_t)k0 * (size_t)ex1 * (size_t)ex2;
}
/*
* fh 对应 Fortran: fh(-1:ex1, -1:ex2, -1:ex3)
* ord=2 => shift=1
* iF/jF/kF 为 Fortran 索引(可为 -1,0,1..ex
*/
static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
const int shift = 1;
const int nx = ex[0] + 2; // ex1 + ord
const int ny = ex[1] + 2;
const int ii = iF + shift; // 0..ex1+1
const int jj = jF + shift; // 0..ex2+1
const int kk = kF + shift; // 0..ex3+1
return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
}
/*
* fh 对应 Fortran: fh(-2:ex1, -2:ex2, -2:ex3)
* ord=3 => shift=2
* iF/jF/kF 是 Fortran 索引(可为负)
*/
static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
const int shift = 2; // ord=3 -> -2..ex
const int nx = ex[0] + 3; // ex1 + ord
const int ny = ex[1] + 3;
const int ii = iF + shift; // 0..ex1+2
const int jj = jF + shift; // 0..ex2+2
const int kk = kF + shift; // 0..ex3+2
return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
}
/*
* func: (1..extc1, 1..extc2, 1..extc3) 1-based in Fortran
* funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran
*
* C 里我们把:
* func 视为 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1
* funcc 用“平移下标”存为一维数组:
* iF in [-ord+1..extc1] -> ii = iF + (ord-1) in [0..extc1+ord-1]
* 总长度 nx = extc1 + ord
* 同理 ny = extc2 + ord, nz = extc3 + ord
*/
static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
const int nx = extc[0], ny = extc[1];
return (size_t)i0 + (size_t)j0 * (size_t)nx + (size_t)k0 * (size_t)nx * (size_t)ny;
}
static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
const int shift = ord - 1; // iF = -shift .. extc1
const int nx = extc[0] + ord; // [-shift..extc1] 共 extc1+ord 个
const int ny = extc[1] + ord;
const int ii = iF + shift; // 0..extc1+shift
const int jj = jF + shift; // 0..extc2+shift
const int kk = kF + shift; // 0..extc3+shift
return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
}
/*
* 等价于 Fortran:
* funcc(1:extc1,1:extc2,1:extc3)=func
* do i=0,ord-1
* funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
* enddo
* do i=0,ord-1
* funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
* enddo
* do i=0,ord-1
* funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
* enddo
*/
static inline void symmetry_bd(int ord,
const int extc[3],
const double *func,
double *funcc,
const double SoA[3])
{
const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
// 1) funcc(1:extc1,1:extc2,1:extc3) = func
// Fortran 的 (iF=1..extc1) 对应 C 的 func(i0=0..extc1-1)
for (int k0 = 0; k0 < extc3; ++k0) {
for (int j0 = 0; j0 < extc2; ++j0) {
for (int i0 = 0; i0 < extc1; ++i0) {
const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
funcc[idx_funcc_F(iF, jF, kF, ord, extc)] = func[idx_func0(i0, j0, k0, extc)];
}
}
}
// 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
for (int ii = 0; ii <= ord - 1; ++ii) {
const int iF_dst = -ii; // 0, -1, -2, ...
const int iF_src = ii + 1; // 1, 2, 3, ...
for (int kF = 1; kF <= extc3; ++kF) {
for (int jF = 1; jF <= extc2; ++jF) {
funcc[idx_funcc_F(iF_dst, jF, kF, ord, extc)] =
funcc[idx_funcc_F(iF_src, jF, kF, ord, extc)] * SoA[0];
}
}
}
// 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
// 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖
for (int jj = 0; jj <= ord - 1; ++jj) {
const int jF_dst = -jj;
const int jF_src = jj + 1;
for (int kF = 1; kF <= extc3; ++kF) {
for (int iF = -ord + 1; iF <= extc1; ++iF) {
funcc[idx_funcc_F(iF, jF_dst, kF, ord, extc)] =
funcc[idx_funcc_F(iF, jF_src, kF, ord, extc)] * SoA[1];
}
}
}
// 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
for (int kk = 0; kk <= ord - 1; ++kk) {
const int kF_dst = -kk;
const int kF_src = kk + 1;
for (int jF = -ord + 1; jF <= extc2; ++jF) {
for (int iF = -ord + 1; iF <= extc1; ++iF) {
funcc[idx_funcc_F(iF, jF, kF_dst, ord, extc)] =
funcc[idx_funcc_F(iF, jF, kF_src, ord, extc)] * SoA[2];
}
}
}
}
#endif
/* 你已有的函数idx_ex / idx_fh_F_ord2 以及 fh 的布局 */
static inline void fdderivs_xh(
int i0, int j0, int k0,
const int ex[3],
const double *fh,
int iminF, int jminF, int kminF,
int imaxF, int jmaxF, int kmaxF,
double Fdxdx, double Fdydy, double Fdzdz,
double Fdxdy, double Fdxdz, double Fdydz,
double Sdxdx, double Sdydy, double Sdzdz,
double Sdxdy, double Sdxdz, double Sdydz,
double *fxx, double *fxy, double *fxz,
double *fyy, double *fyz, double *fzz
){
const double F8 = 8.0;
const double F16 = 16.0;
const double F30 = 30.0;
const double TWO = 2.0;
const int iF = i0 + 1;
const int jF = j0 + 1;
const int kF = k0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
/* 高阶分支i±2,j±2,k±2 都在范围内 */
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
{
fxx[p] = Fdxdx * (
-fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);
fyy[p] = Fdydy * (
-fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);
fzz[p] = Fdzdz * (
-fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);
/* fxy 高阶 */
{
const double t_jm2 =
( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
const double t_jm1 =
( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
const double t_jp1 =
( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
const double t_jp2 =
( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
}
/* fxz 高阶 */
{
const double t_km2 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
const double t_km1 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
const double t_kp1 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
const double t_kp2 =
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}
/* fyz 高阶 */
{
const double t_km2 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
const double t_km1 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
const double t_kp1 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
const double t_kp2 =
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}
}
/* 二阶分支i±1,j±1,k±1 在范围内 */
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
{
fxx[p] = Sdxdx * (
fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
);
fyy[p] = Sdydy * (
fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
);
fzz[p] = Sdzdz * (
fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
);
fxy[p] = Sdxdy * (
fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
);
fxz[p] = Sdxdz * (
fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
);
fyz[p] = Sdydz * (
fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
);
}
else {
fxx[p] = 0.0; fyy[p] = 0.0; fzz[p] = 0.0;
fxy[p] = 0.0; fxz[p] = 0.0; fyz[p] = 0.0;
}
}

View File

@@ -1,27 +0,0 @@
#include "xh_share_func.h"
void fdderivs(const int ex[3],
const double *f,
double *fxx, double *fxy, double *fxz,
double *fyy, double *fyz, double *fzz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff);
void fderivs(const int ex[3],
const double *f,
double *fx, double *fy, double *fz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff);
void kodis(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double SoA[3],
int Symmetry, double eps);
void lopsided(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double *Sfx, const double *Sfy, const double *Sfz,
int Symmetry, const double SoA[3]);

File diff suppressed because it is too large Load Diff

View File

@@ -1,195 +1,195 @@
################################################################## ##################################################################
## ##
## Generate input file for the AMSS-NCKU TwoPuncture routine ## Generate input file for the AMSS-NCKU TwoPuncture routine
## Author: Xiaoqu ## Author: Xiaoqu
## 2024/11/27 ## 2024/11/27
## Modified: 2025/01/21 ## Modified: 2025/01/21
## ##
################################################################## ##################################################################
import numpy import numpy
import os import os
import AMSS_NCKU_Input as input_data ## import program input file import AMSS_NCKU_Input as input_data ## import program input file
import math import math
################################################################## ##################################################################
## Import binary black hole coordinates ## Import binary black hole coordinates
## If puncture data are set to "Automatically-BBH", compute initial orbital ## If puncture data are set to "Automatically-BBH", compute initial orbital
## positions and momenta according to the settings and rescale the total ## positions and momenta according to the settings and rescale the total
## binary mass to M = 1 for TwoPuncture input. ## binary mass to M = 1 for TwoPuncture input.
if (input_data.puncture_data_set == "Automatically-BBH" ): if (input_data.puncture_data_set == "Automatically-BBH" ):
mass_ratio_Q = input_data.parameter_BH[0,0] / input_data.parameter_BH[1,0] mass_ratio_Q = input_data.parameter_BH[0,0] / input_data.parameter_BH[1,0]
if ( mass_ratio_Q < 1.0 ): if ( mass_ratio_Q < 1.0 ):
print( " mass_ratio setting is wrong, please reset!!!" ) print( " mass_ratio setting is wrong, please reset!!!" )
print( " set the first black hole to be the larger mass!!!" ) print( " set the first black hole to be the larger mass!!!" )
BBH_M1 = mass_ratio_Q / ( 1.0 + mass_ratio_Q ) BBH_M1 = mass_ratio_Q / ( 1.0 + mass_ratio_Q )
BBH_M2 = 1.0 / ( 1.0 + mass_ratio_Q ) BBH_M2 = 1.0 / ( 1.0 + mass_ratio_Q )
## Load binary separation and eccentricity ## Load binary separation and eccentricity
distance = input_data.Distance distance = input_data.Distance
e0 = input_data.e0 e0 = input_data.e0
## Set binary component coordinates ## Set binary component coordinates
## Note: place the larger-mass black hole at positive y and the ## Note: place the larger-mass black hole at positive y and the
## smaller-mass black hole at negative y to follow Brugmann's convention ## smaller-mass black hole at negative y to follow Brugmann's convention
## Coordinate convention for TwoPuncture input (Brugmann): ## Coordinate convention for TwoPuncture input (Brugmann):
## -----0-----> y ## -----0-----> y
## - + ## - +
BBH_X1 = 0.0 BBH_X1 = 0.0
BBH_Y1 = distance * 1.0 / ( 1 + mass_ratio_Q ) BBH_Y1 = distance * 1.0 / ( 1 + mass_ratio_Q )
BBH_Z1 = 0.0 BBH_Z1 = 0.0
BBH_X2 = 0.0 BBH_X2 = 0.0
BBH_Y2 = - distance * mass_ratio_Q / ( 1 + mass_ratio_Q ) BBH_Y2 = - distance * mass_ratio_Q / ( 1 + mass_ratio_Q )
BBH_Z2 = 0.0 BBH_Z2 = 0.0
position_BH = numpy.zeros( (2,3) ) position_BH = numpy.zeros( (2,3) )
position_BH[0] = [BBH_X1, BBH_Y1, BBH_Z1] position_BH[0] = [BBH_X1, BBH_Y1, BBH_Z1]
position_BH[1] = [BBH_X2, BBH_Y2, BBH_Z2] position_BH[1] = [BBH_X2, BBH_Y2, BBH_Z2]
## Optionally load momentum from parameter file ## Optionally load momentum from parameter file
## momentum_BH = input_data.momentum_BH ## momentum_BH = input_data.momentum_BH
## Compute orbital momenta using the BBH_orbit_parameter module ## Compute orbital momenta using the BBH_orbit_parameter module
import BBH_orbit_parameter import BBH_orbit_parameter
## Use the dimensionless spins defined in BBH_orbit_parameter ## Use the dimensionless spins defined in BBH_orbit_parameter
BBH_S1 = BBH_orbit_parameter.S1 BBH_S1 = BBH_orbit_parameter.S1
BBH_S2 = BBH_orbit_parameter.S2 BBH_S2 = BBH_orbit_parameter.S2
momentum_BH = numpy.zeros( (2,3) ) momentum_BH = numpy.zeros( (2,3) )
## Compute initial orbital momenta from post-Newtonian-based routine ## Compute initial orbital momenta from post-Newtonian-based routine
momentum_BH[0], momentum_BH[1] = BBH_orbit_parameter.generate_BBH_orbit_parameters( BBH_M1, BBH_M2, BBH_S1, BBH_S2, distance, e0 ) momentum_BH[0], momentum_BH[1] = BBH_orbit_parameter.generate_BBH_orbit_parameters( BBH_M1, BBH_M2, BBH_S1, BBH_S2, distance, e0 )
## Set spin angular momentum input for TwoPuncture ## Set spin angular momentum input for TwoPuncture
## Note: these are dimensional angular momenta (not dimensionless); multiply ## Note: these are dimensional angular momenta (not dimensionless); multiply
## by the square of the mass scale. Here masses are scaled so total M=1. ## by the square of the mass scale. Here masses are scaled so total M=1.
## angular_momentum_BH = input_data.angular_momentum_BH ## angular_momentum_BH = input_data.angular_momentum_BH
angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) ) angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) )
for i in range(input_data.puncture_number): for i in range(input_data.puncture_number):
if ( input_data.Symmetry == "equatorial-symmetry" ): if ( input_data.Symmetry == "equatorial-symmetry" ):
if i==0: if i==0:
angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M1**2) * input_data.parameter_BH[i,2] ] angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M1**2) * input_data.parameter_BH[i,2] ]
elif i==1: elif i==1:
angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M2**2) * input_data.parameter_BH[i,2] ] angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M2**2) * input_data.parameter_BH[i,2] ]
else: else:
angular_momentum_BH[i] = [ 0.0, 0.0, (input_data.parameter_BH[i,0]**2) * input_data.parameter_BH[i,2] ] angular_momentum_BH[i] = [ 0.0, 0.0, (input_data.parameter_BH[i,0]**2) * input_data.parameter_BH[i,2] ]
elif ( input_data.Symmetry == "no-symmetry" ): elif ( input_data.Symmetry == "no-symmetry" ):
if i==0: if i==0:
angular_momentum_BH[i] = (BBH_M1**2) * input_data.dimensionless_spin_BH[i] angular_momentum_BH[i] = (BBH_M1**2) * input_data.dimensionless_spin_BH[i]
elif i==1: elif i==1:
angular_momentum_BH[i] = (BBH_M1**2) * input_data.dimensionless_spin_BH[i] angular_momentum_BH[i] = (BBH_M1**2) * input_data.dimensionless_spin_BH[i]
else: else:
angular_momentum_BH[i] = (input_data.parameter_BH[i,0]**2) * input_data.dimensionless_spin_BH[i] angular_momentum_BH[i] = (input_data.parameter_BH[i,0]**2) * input_data.dimensionless_spin_BH[i]
####################################################### #######################################################
## If puncture data are set to "Manually", read initial positions and momenta ## If puncture data are set to "Manually", read initial positions and momenta
## directly from the parameter file. Rescale the total binary mass to M=1 ## directly from the parameter file. Rescale the total binary mass to M=1
## for TwoPuncture input. ## for TwoPuncture input.
elif (input_data.puncture_data_set == "Manually" ): elif (input_data.puncture_data_set == "Manually" ):
mass_ratio_Q = input_data.parameter_BH[0,0] / input_data.parameter_BH[1,0] mass_ratio_Q = input_data.parameter_BH[0,0] / input_data.parameter_BH[1,0]
if ( mass_ratio_Q < 1.0 ): if ( mass_ratio_Q < 1.0 ):
print( " mass_ratio setting is wrong, please reset!!!" ) print( " mass_ratio setting is wrong, please reset!!!" )
print( " set the first black hole to be the larger mass!!!" ) print( " set the first black hole to be the larger mass!!!" )
BBH_M1 = mass_ratio_Q / ( 1.0 + mass_ratio_Q ) BBH_M1 = mass_ratio_Q / ( 1.0 + mass_ratio_Q )
BBH_M2 = 1.0 / ( 1.0 + mass_ratio_Q ) BBH_M2 = 1.0 / ( 1.0 + mass_ratio_Q )
parameter_BH = input_data.parameter_BH parameter_BH = input_data.parameter_BH
position_BH = input_data.position_BH position_BH = input_data.position_BH
momentum_BH = input_data.momentum_BH momentum_BH = input_data.momentum_BH
## Compute binary separation and load eccentricity ## Compute binary separation and load eccentricity
distance = math.sqrt( (position_BH[0,0]-position_BH[1,0])**2 + (position_BH[0,1]-position_BH[1,1])**2 + (position_BH[0,2]-position_BH[1,2])**2 ) distance = math.sqrt( (position_BH[0,0]-position_BH[1,0])**2 + (position_BH[0,1]-position_BH[1,1])**2 + (position_BH[0,2]-position_BH[1,2])**2 )
e0 = input_data.e0 e0 = input_data.e0
## Set spin angular momentum input for TwoPuncture ## Set spin angular momentum input for TwoPuncture
## Note: these are dimensional angular momenta (not dimensionless); multiply ## Note: these are dimensional angular momenta (not dimensionless); multiply
## by the square of the mass scale. Here masses are scaled so total M=1. ## by the square of the mass scale. Here masses are scaled so total M=1.
## angular_momentum_BH = input_data.angular_momentum_BH ## angular_momentum_BH = input_data.angular_momentum_BH
angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) ) angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) )
for i in range(input_data.puncture_number): for i in range(input_data.puncture_number):
if ( input_data.Symmetry == "equatorial-symmetry" ): if ( input_data.Symmetry == "equatorial-symmetry" ):
if i==0: if i==0:
angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M1**2) * parameter_BH[i,2] ] angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M1**2) * parameter_BH[i,2] ]
elif i==1: elif i==1:
angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M2**2) * parameter_BH[i,2] ] angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M2**2) * parameter_BH[i,2] ]
else: else:
angular_momentum_BH[i] = [ 0.0, 0.0, (parameter_BH[i,0]**2) * parameter_BH[i,2] ] angular_momentum_BH[i] = [ 0.0, 0.0, (parameter_BH[i,0]**2) * parameter_BH[i,2] ]
elif ( input_data.Symmetry == "no-symmetry" ): elif ( input_data.Symmetry == "no-symmetry" ):
if i==0: if i==0:
angular_momentum_BH[i] = (BBH_M1**2) * input_data.dimensionless_spin_BH[i] angular_momentum_BH[i] = (BBH_M1**2) * input_data.dimensionless_spin_BH[i]
elif i==1: elif i==1:
angular_momentum_BH[i] = (BBH_M2**2) * input_data.dimensionless_spin_BH[i] angular_momentum_BH[i] = (BBH_M2**2) * input_data.dimensionless_spin_BH[i]
else: else:
angular_momentum_BH[i] = (parameter_BH[i,0]**2) * input_data.dimensionless_spin_BH[i] angular_momentum_BH[i] = (parameter_BH[i,0]**2) * input_data.dimensionless_spin_BH[i]
################################################################## ##################################################################
## Write the above binary data into the AMSS-NCKU TwoPuncture input file ## Write the above binary data into the AMSS-NCKU TwoPuncture input file
def generate_AMSSNCKU_TwoPuncture_input(): def generate_AMSSNCKU_TwoPuncture_input():
file1 = open( os.path.join(input_data.File_directory, "AMSS-NCKU-TwoPuncture.input"), "w") file1 = open( os.path.join(input_data.File_directory, "AMSS-NCKU-TwoPuncture.input"), "w")
print( "# -----0-----> y", file=file1 ) print( "# -----0-----> y", file=file1 )
print( "# - + use Brugmann's convention", file=file1 ) print( "# - + use Brugmann's convention", file=file1 )
print( "ABE::mp = -1.0", file=file1 ) ## use negative values so the code solves for bare masses automatically print( "ABE::mp = -1.0", file=file1 ) ## use negative values so the code solves for bare masses automatically
print( "ABE::mm = -1.0", file=file1 ) print( "ABE::mm = -1.0", file=file1 )
print( "# b = D/2", file=file1 ) print( "# b = D/2", file=file1 )
print( "ABE::b = ", ( distance / 2.0 ), file=file1 ) print( "ABE::b = ", ( distance / 2.0 ), file=file1 )
print( "ABE::P_plusx = ", momentum_BH[0,0], file=file1 ) print( "ABE::P_plusx = ", momentum_BH[0,0], file=file1 )
print( "ABE::P_plusy = ", momentum_BH[0,1], file=file1 ) print( "ABE::P_plusy = ", momentum_BH[0,1], file=file1 )
print( "ABE::P_plusz = ", momentum_BH[0,2], file=file1 ) print( "ABE::P_plusz = ", momentum_BH[0,2], file=file1 )
print( "ABE::P_minusx = ", momentum_BH[1,0], file=file1 ) print( "ABE::P_minusx = ", momentum_BH[1,0], file=file1 )
print( "ABE::P_minusy = ", momentum_BH[1,1], file=file1 ) print( "ABE::P_minusy = ", momentum_BH[1,1], file=file1 )
print( "ABE::P_minusz = ", momentum_BH[1,2], file=file1 ) print( "ABE::P_minusz = ", momentum_BH[1,2], file=file1 )
print( "ABE::S_plusx = ", angular_momentum_BH[0,0], file=file1 ) print( "ABE::S_plusx = ", angular_momentum_BH[0,0], file=file1 )
print( "ABE::S_plusy = ", angular_momentum_BH[0,1], file=file1 ) print( "ABE::S_plusy = ", angular_momentum_BH[0,1], file=file1 )
print( "ABE::S_plusz = ", angular_momentum_BH[0,2], file=file1 ) print( "ABE::S_plusz = ", angular_momentum_BH[0,2], file=file1 )
print( "ABE::S_minusx = ", angular_momentum_BH[1,0], file=file1 ) print( "ABE::S_minusx = ", angular_momentum_BH[1,0], file=file1 )
print( "ABE::S_minusy = ", angular_momentum_BH[1,1], file=file1 ) print( "ABE::S_minusy = ", angular_momentum_BH[1,1], file=file1 )
print( "ABE::S_minusz = ", angular_momentum_BH[1,2], file=file1 ) print( "ABE::S_minusz = ", angular_momentum_BH[1,2], file=file1 )
print( "ABE::Mp = ", BBH_M1, file=file1 ) print( "ABE::Mp = ", BBH_M1, file=file1 )
print( "ABE::Mm = ", BBH_M2, file=file1 ) print( "ABE::Mm = ", BBH_M2, file=file1 )
print( "ABE::admtol = 1.e-8", file=file1 ) print( "ABE::admtol = 1.e-8", file=file1 )
print( "ABE::Newtontol = 5.e-12", file=file1 ) print( "ABE::Newtontol = 5.e-12", file=file1 )
print( "ABE::nA = 50", file=file1 ) print( "ABE::nA = 50", file=file1 )
print( "ABE::nB = 50", file=file1 ) print( "ABE::nB = 50", file=file1 )
print( "ABE::nphi = 26", file=file1 ) print( "ABE::nphi = 26", file=file1 )
print( "ABE::Newtonmaxit = 50", file=file1 ) print( "ABE::Newtonmaxit = 50", file=file1 )
file1.close() file1.close()
return file1 return file1
################################################################## ##################################################################

File diff suppressed because it is too large Load Diff

View File

@@ -1,192 +1,175 @@
################################################################## ##################################################################
## ##
## This file defines the commands used to build and run AMSS-NCKU ## This file defines the commands used to build and run AMSS-NCKU
## Author: Xiaoqu ## Author: Xiaoqu
## 2025/01/24 ## 2025/01/24
## ##
################################################################## ##################################################################
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import subprocess import subprocess
import time
## CPU core binding configuration using taskset
## taskset ensures all child processes inherit the CPU affinity mask ##################################################################
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
#NUMACTL_CPU_BIND = "taskset -c 0-111"
NUMACTL_CPU_BIND = "taskset -c 0-47" ##################################################################
NUMACTL_CPU_BIND2 = "OMP_NUM_THREADS=48 OMP_PROC_BIND=close OMP_PLACES={0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47} taskset -c 0-47"
#NUMACTL_CPU_BIND2 = "taskset -c 0-1" ## Compile the AMSS-NCKU main program ABE
## Build parallelism configuration
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores def makefile_ABE():
## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 32 print( )
print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " )
print( )
##################################################################
## Build command
if (input_data.GPU_Calculation == "no"):
################################################################## makefile_command = "make -j4" + " ABE"
elif (input_data.GPU_Calculation == "yes"):
## Compile the AMSS-NCKU main program ABE makefile_command = "make -j4" + " ABEGPU"
else:
def makefile_ABE(): print( " CPU/GPU numerical calculation setting is wrong " )
print( )
print( )
print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) ## Execute the command with subprocess.Popen and stream output
print( ) makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
## Build command with CPU binding to nohz_full cores ## Read and print output lines as they arrive
if (input_data.GPU_Calculation == "no"): for line in makefile_process.stdout:
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE" print(line, end='') # stream output in real time
elif (input_data.GPU_Calculation == "yes"):
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU" ## Wait for the process to finish
else: makefile_return_code = makefile_process.wait()
print( " CPU/GPU numerical calculation setting is wrong " ) if makefile_return_code != 0:
print( ) raise subprocess.CalledProcessError(makefile_return_code, makefile_command)
## Execute the command with subprocess.Popen and stream output print( )
makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) print( " Compilation of the AMSS-NCKU executable file ABE is finished " )
print( )
## Read and print output lines as they arrive
for line in makefile_process.stdout: return
print(line, end='') # stream output in real time
##################################################################
## Wait for the process to finish
makefile_return_code = makefile_process.wait()
if makefile_return_code != 0:
raise subprocess.CalledProcessError(makefile_return_code, makefile_command) ##################################################################
print( ) ## Compile the AMSS-NCKU TwoPuncture program TwoPunctureABE
print( " Compilation of the AMSS-NCKU executable file ABE is finished " )
print( ) def makefile_TwoPunctureABE():
return print( )
print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
################################################################## print( )
## Build command
makefile_command = "make" + " TwoPunctureABE"
##################################################################
## Execute the command with subprocess.Popen and stream output
## Compile the AMSS-NCKU TwoPuncture program TwoPunctureABE makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
def makefile_TwoPunctureABE(): ## Read and print output lines as they arrive
for line in makefile_process.stdout:
print( ) print(line, end='') # stream output in real time
print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
print( ) ## Wait for the process to finish
makefile_return_code = makefile_process.wait()
## Build command with CPU binding to nohz_full cores if makefile_return_code != 0:
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE" raise subprocess.CalledProcessError(makefile_return_code, makefile_command)
## Execute the command with subprocess.Popen and stream output print( )
makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) print( " Compilation of the AMSS-NCKU executable file TwoPunctureABE is finished " )
print( )
## Read and print output lines as they arrive
for line in makefile_process.stdout: return
print(line, end='') # stream output in real time
##################################################################
## Wait for the process to finish
makefile_return_code = makefile_process.wait()
if makefile_return_code != 0:
raise subprocess.CalledProcessError(makefile_return_code, makefile_command) ##################################################################
print( ) ## Run the AMSS-NCKU main program ABE
print( " Compilation of the AMSS-NCKU executable file TwoPunctureABE is finished " )
print( ) def run_ABE():
return print( )
print( " Running the AMSS-NCKU executable file ABE/ABEGPU " )
################################################################## print( )
## Define the command to run; cast other values to strings as needed
################################################################## if (input_data.GPU_Calculation == "no"):
mpi_command = "mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
## Run the AMSS-NCKU main program ABE mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"):
def run_ABE(): mpi_command = "mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log"
print( )
print( " Running the AMSS-NCKU executable file ABE/ABEGPU " ) ## Execute the MPI command and stream output
print( ) mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
## Define the command to run; cast other values to strings as needed ## Write ABE run output to file while printing to stdout
with open(mpi_command_outfile, 'w') as file0:
if (input_data.GPU_Calculation == "no"): ## Read and print output lines; also write each line to file
#mpi_command = NUMACTL_CPU_BIND2 + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" for line in mpi_process.stdout:
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" print(line, end='') # stream output in real time
mpi_command = """ OMP_NUM_THREADS=48 OMP_PROC_BIND=close OMP_PLACES=cores mpirun -np 1 --cpu-bind=sockets ./ABE """ file0.write(line) # write the line to file
mpi_command_outfile = "ABE_out.log" file0.flush() # flush to ensure each line is written immediately (optional)
elif (input_data.GPU_Calculation == "yes"): file0.close()
mpi_command = NUMACTL_CPU_BIND2 + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log" ## Wait for the process to finish
mpi_return_code = mpi_process.wait()
## Execute the MPI command and stream output
mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) print( )
print( " The ABE/ABEGPU simulation is finished " )
## Write ABE run output to file while printing to stdout print( )
with open(mpi_command_outfile, 'w') as file0:
## Read and print output lines; also write each line to file return
for line in mpi_process.stdout:
print(line, end='') # stream output in real time ##################################################################
file0.write(line) # write the line to file
file0.flush() # flush to ensure each line is written immediately (optional)
file0.close()
##################################################################
## Wait for the process to finish
mpi_return_code = mpi_process.wait() ## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE
print( ) def run_TwoPunctureABE():
print( " The ABE/ABEGPU simulation is finished " )
print( ) print( )
print( " Running the AMSS-NCKU executable file TwoPunctureABE " )
return print( )
################################################################## ## Define the command to run
TwoPuncture_command = "./TwoPunctureABE"
TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
################################################################## ## Execute the command with subprocess.Popen and stream output
TwoPuncture_process = subprocess.Popen(TwoPuncture_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE
## Write TwoPunctureABE run output to file while printing to stdout
def run_TwoPunctureABE(): with open(TwoPuncture_command_outfile, 'w') as file0:
tp_time1=time.time() ## Read and print output lines; also write each line to file
print( ) for line in TwoPuncture_process.stdout:
print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) print(line, end='') # stream output in real time
print( ) file0.write(line) # write the line to file
file0.flush() # flush to ensure each line is written immediately (optional)
## Define the command to run file0.close()
#TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
TwoPuncture_command = " ./TwoPunctureABE" ## Wait for the process to finish
TwoPuncture_command_outfile = "TwoPunctureABE_out.log" TwoPuncture_command_return_code = TwoPuncture_process.wait()
## Execute the command with subprocess.Popen and stream output print( )
TwoPuncture_process = subprocess.Popen(TwoPuncture_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) print( " The TwoPunctureABE simulation is finished " )
print( )
## Write TwoPunctureABE run output to file while printing to stdout
with open(TwoPuncture_command_outfile, 'w') as file0: return
## Read and print output lines; also write each line to file
for line in TwoPuncture_process.stdout: ##################################################################
print(line, end='') # stream output in real time
file0.write(line) # write the line to file
file0.flush() # flush to ensure each line is written immediately (optional)
file0.close()
## Wait for the process to finish
TwoPuncture_command_return_code = TwoPuncture_process.wait()
print( )
print( " The TwoPunctureABE simulation is finished " )
print( )
tp_time2=time.time()
et=tp_time2-tp_time1
print(f"Used time: {et}")
return
##################################################################

File diff suppressed because it is too large Load Diff

View File

@@ -1,97 +0,0 @@
# AMSS-NCKU PGO Profile Analysis Report
## 1. Profiling Environment
| Item | Value |
|------|-------|
| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
| Instrumentation Flag | `-fprofile-instr-generate` |
| Optimization Level (instrumented) | `-O2 -xHost -fma` |
| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
| Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
| Merged Profile | `default.profdata` (394 KB) |
| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
## 2. Reduced Simulation Parameters (for profiling run)
| Parameter | Production Value | Profiling Value |
|-----------|-----------------|-----------------|
| MPI_processes | 64 | 1 |
| grid_level | 9 | 4 |
| static_grid_level | 5 | 3 |
| static_grid_number | 96 | 24 |
| moving_grid_number | 48 | 16 |
| largest_box_xyz_max | 320^3 | 160^3 |
| Final_Evolution_Time | 1000.0 | 10.0 |
| Evolution_Step_Number | 10,000,000 | 1,000 |
| Detector_Number | 12 | 2 |
## 3. Profile Summary
| Metric | Value |
|--------|-------|
| Total instrumented functions | 1,392 |
| Functions with non-zero counts | 117 (8.4%) |
| Functions with zero counts | 1,275 (91.6%) |
| Maximum function entry count | 386,459,248 |
| Maximum internal block count | 370,477,680 |
| Total block count | 4,198,023,118 |
## 4. Top 20 Hotspot Functions
| Rank | Total Count | Max Block Count | Function | Category |
|------|------------|-----------------|----------|----------|
| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
## 5. Hotspot Category Breakdown
Top 20 functions account for ~98% of total execution counts:
| Category | Functions | Combined Count | Share |
|----------|-----------|---------------|-------|
| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
| Time integration | rungekutta4_rout_ | ~119M | ~3% |
| Dissipation | kodis_ | ~92M | ~2% |
| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% |
## 6. Conclusions
1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
2. **Hotspot concentration is high**: Top 5 functions alone account for ~76% of all counts, which is ideal for PGO — the compiler has strong branch/layout optimization targets.
3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
## 7. PGO Phase 2 Usage
To apply the profile, use the following flags in `makefile.inc`:
```makefile
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
-align array64byte -fpp -I${MKLROOT}/include
```

Binary file not shown.

View File

@@ -1,133 +1,133 @@
################################################################## ##################################################################
## ##
## Update puncture parameters from TwoPuncture output ## Update puncture parameters from TwoPuncture output
## Author: Xiaoqu ## Author: Xiaoqu
## 2024/12/04 ## 2024/12/04
## ##
################################################################## ##################################################################
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import numpy import numpy
import os import os
################################################################## ##################################################################
################################################################## ##################################################################
def read_TwoPuncture_Output(Output_File_directory): def read_TwoPuncture_Output(Output_File_directory):
dimensionless_mass_BH = numpy.zeros( input_data.puncture_number ) dimensionless_mass_BH = numpy.zeros( input_data.puncture_number )
bare_mass_BH = numpy.zeros( input_data.puncture_number ) ## initialize bare mass for each black hole bare_mass_BH = numpy.zeros( input_data.puncture_number ) ## initialize bare mass for each black hole
position_BH = numpy.zeros( (input_data.puncture_number, 3) ) ## initialize initial position for each black hole position_BH = numpy.zeros( (input_data.puncture_number, 3) ) ## initialize initial position for each black hole
momentum_BH = numpy.zeros( (input_data.puncture_number, 3) ) ## initialize momentum for each black hole momentum_BH = numpy.zeros( (input_data.puncture_number, 3) ) ## initialize momentum for each black hole
angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) ) ## initialize spin angular momentum for each black hole angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) ) ## initialize spin angular momentum for each black hole
# Read TwoPuncture output file # Read TwoPuncture output file
data = numpy.loadtxt( os.path.join(Output_File_directory, "puncture_parameters_new.txt") ) data = numpy.loadtxt( os.path.join(Output_File_directory, "puncture_parameters_new.txt") )
# Ensure data is parsed as a 1-D array # Ensure data is parsed as a 1-D array
data = data.reshape(-1) data = data.reshape(-1)
for i in range(input_data.puncture_number): for i in range(input_data.puncture_number):
## Read parameters for the first two punctures from TwoPuncture output ## Read parameters for the first two punctures from TwoPuncture output
## For additional punctures, read parameters from the input file ## For additional punctures, read parameters from the input file
if i<2: if i<2:
bare_mass_BH[i] = data[12*i] bare_mass_BH[i] = data[12*i]
dimensionless_mass_BH[i] = data[12*i+1] dimensionless_mass_BH[i] = data[12*i+1]
position_BH[i] = [ data[12*i+3], data[12*i+4], data[12*i+5] ] position_BH[i] = [ data[12*i+3], data[12*i+4], data[12*i+5] ]
momentum_BH[i] = [ data[12*i+6], data[12*i+7], data[12*i+8] ] momentum_BH[i] = [ data[12*i+6], data[12*i+7], data[12*i+8] ]
angular_momentum_BH[i] = [ data[12*i+9], data[12*i+10], data[12*i+11] ] angular_momentum_BH[i] = [ data[12*i+9], data[12*i+10], data[12*i+11] ]
else: else:
dimensionless_mass_BH[i] = input_data.parameter_BH[i,0] dimensionless_mass_BH[i] = input_data.parameter_BH[i,0]
bare_mass_BH[i] = input_data.parameter_BH[i,0] bare_mass_BH[i] = input_data.parameter_BH[i,0]
position_BH[i] = input_data.position_BH[i] position_BH[i] = input_data.position_BH[i]
momentum_BH[i] = input_data.momentum_BH[i] momentum_BH[i] = input_data.momentum_BH[i]
## Read angular momentum according to symmetry ## Read angular momentum according to symmetry
if ( input_data.Symmetry == "equatorial-symmetry" ): if ( input_data.Symmetry == "equatorial-symmetry" ):
angular_momentum_BH[i] = [ 0.0, 0.0, (input_data.parameter_BH[i,0]**2) * input_data.parameter_BH[i,2] ] angular_momentum_BH[i] = [ 0.0, 0.0, (input_data.parameter_BH[i,0]**2) * input_data.parameter_BH[i,2] ]
elif ( input_data.Symmetry == "no-symmetry" ): elif ( input_data.Symmetry == "no-symmetry" ):
angular_momentum_BH[i] = (dimensionless_mass_BH[i]**2) * input_data.dimensionless_spin_BH[i] angular_momentum_BH[i] = (dimensionless_mass_BH[i]**2) * input_data.dimensionless_spin_BH[i]
return bare_mass_BH, dimensionless_mass_BH, position_BH, momentum_BH, angular_momentum_BH return bare_mass_BH, dimensionless_mass_BH, position_BH, momentum_BH, angular_momentum_BH
################################################################## ##################################################################
################################################################## ##################################################################
## Append the computed puncture information into the AMSS-NCKU input file ## Append the computed puncture information into the AMSS-NCKU input file
def append_AMSSNCKU_BSSN_input(File_directory, TwoPuncture_File_directory): def append_AMSSNCKU_BSSN_input(File_directory, TwoPuncture_File_directory):
charge_Q_BH = numpy.zeros( input_data.puncture_number ) ## initialize charge for each black hole charge_Q_BH = numpy.zeros( input_data.puncture_number ) ## initialize charge for each black hole
## If using Ansorg-TwoPuncture to solve the initial-data problem, read ## If using Ansorg-TwoPuncture to solve the initial-data problem, read
## bare masses, positions and angular momenta from TwoPuncture output ## bare masses, positions and angular momenta from TwoPuncture output
if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ): if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
bare_mass_BH, dimensionless_mass_BH, position_BH, momentum_BH, angular_momentum_BH = read_TwoPuncture_Output(TwoPuncture_File_directory) bare_mass_BH, dimensionless_mass_BH, position_BH, momentum_BH, angular_momentum_BH = read_TwoPuncture_Output(TwoPuncture_File_directory)
# set charge for each black hole # set charge for each black hole
for i in range(input_data.puncture_number): for i in range(input_data.puncture_number):
charge_Q_BH[i] = dimensionless_mass_BH[i] * input_data.parameter_BH[i,1] charge_Q_BH[i] = dimensionless_mass_BH[i] * input_data.parameter_BH[i,1]
## If using another method for initial data, read parameters directly from input ## If using another method for initial data, read parameters directly from input
else: else:
position_BH = input_data.position_BH position_BH = input_data.position_BH
momentum_BH = input_data.momentum_BH momentum_BH = input_data.momentum_BH
## angular_momentum_BH = input_data.angular_momentum_BH ## angular_momentum_BH = input_data.angular_momentum_BH
angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) ) ## initialize spin angular momentum array angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) ) ## initialize spin angular momentum array
mass_BH = numpy.zeros( input_data.puncture_number ) ## initialize mass array mass_BH = numpy.zeros( input_data.puncture_number ) ## initialize mass array
## Set charge and spin angular momentum for each puncture ## Set charge and spin angular momentum for each puncture
for i in range(input_data.puncture_number): for i in range(input_data.puncture_number):
if ( input_data.Symmetry == "octant-symmetry" ): if ( input_data.Symmetry == "octant-symmetry" ):
mass_BH[i] = input_data.parameter_BH[i,0] mass_BH[i] = input_data.parameter_BH[i,0]
charge_Q_BH[i] = mass_BH[i]* input_data.parameter_BH[i,1] charge_Q_BH[i] = mass_BH[i]* input_data.parameter_BH[i,1]
angular_momentum_BH[i] = [ 0.0, 0.0, (mass_BH[i]**2) * input_data.parameter_BH[i,2] ] angular_momentum_BH[i] = [ 0.0, 0.0, (mass_BH[i]**2) * input_data.parameter_BH[i,2] ]
elif ( input_data.Symmetry == "equatorial-symmetry" ): elif ( input_data.Symmetry == "equatorial-symmetry" ):
mass_BH[i] = input_data.parameter_BH[i,0] mass_BH[i] = input_data.parameter_BH[i,0]
charge_Q_BH[i] = mass_BH[i]* input_data.parameter_BH[i,1] charge_Q_BH[i] = mass_BH[i]* input_data.parameter_BH[i,1]
angular_momentum_BH[i] = [ 0.0, 0.0, (mass_BH[i]**2) * input_data.parameter_BH[i,2] ] angular_momentum_BH[i] = [ 0.0, 0.0, (mass_BH[i]**2) * input_data.parameter_BH[i,2] ]
elif ( input_data.Symmetry == "no-symmetry" ): elif ( input_data.Symmetry == "no-symmetry" ):
mass_BH[i] = input_data.parameter_BH[i,0] mass_BH[i] = input_data.parameter_BH[i,0]
angular_momentum_BH[i] = (mass_BH[i]**2) * input_data.dimensionless_spin_BH[i] angular_momentum_BH[i] = (mass_BH[i]**2) * input_data.dimensionless_spin_BH[i]
charge_Q_BH[i] = mass_BH[i] * input_data.parameter_BH[i,1] charge_Q_BH[i] = mass_BH[i] * input_data.parameter_BH[i,1]
file1 = open( os.path.join(input_data.File_directory, "AMSS-NCKU.input"), "a") ## open file in append mode file1 = open( os.path.join(input_data.File_directory, "AMSS-NCKU.input"), "a") ## open file in append mode
## Output BSSN related settings ## Output BSSN related settings
print( file=file1 ) print( file=file1 )
print( "BSSN::chitiny = 1e-5", file=file1 ) print( "BSSN::chitiny = 1e-5", file=file1 )
print( "BSSN::time refinement start from level = ", input_data.refinement_level, file=file1 ) print( "BSSN::time refinement start from level = ", input_data.refinement_level, file=file1 )
print( "BSSN::BH_num = ", input_data.puncture_number, file=file1 ) print( "BSSN::BH_num = ", input_data.puncture_number, file=file1 )
for i in range(input_data.puncture_number): for i in range(input_data.puncture_number):
if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ): if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
print( f"BSSN::Mass[{i}] = { bare_mass_BH[i] } ", file=file1 ) print( f"BSSN::Mass[{i}] = { bare_mass_BH[i] } ", file=file1 )
else: else:
print( f"BSSN::Mass[{i}] = { mass_BH[i] } ", file=file1 ) print( f"BSSN::Mass[{i}] = { mass_BH[i] } ", file=file1 )
print( f"BSSN::Qchar[{i}] = { charge_Q_BH[i] } ", file=file1 ) print( f"BSSN::Qchar[{i}] = { charge_Q_BH[i] } ", file=file1 )
print( f"BSSN::Porgx[{i}] = { position_BH[i,0] } ", file=file1 ) print( f"BSSN::Porgx[{i}] = { position_BH[i,0] } ", file=file1 )
print( f"BSSN::Porgy[{i}] = { position_BH[i,1] } ", file=file1 ) print( f"BSSN::Porgy[{i}] = { position_BH[i,1] } ", file=file1 )
print( f"BSSN::Porgz[{i}] = { position_BH[i,2] } ", file=file1 ) print( f"BSSN::Porgz[{i}] = { position_BH[i,2] } ", file=file1 )
print( f"BSSN::Pmomx[{i}] = { momentum_BH[i,0] } ", file=file1 ) print( f"BSSN::Pmomx[{i}] = { momentum_BH[i,0] } ", file=file1 )
print( f"BSSN::Pmomy[{i}] = { momentum_BH[i,1] } ", file=file1 ) print( f"BSSN::Pmomy[{i}] = { momentum_BH[i,1] } ", file=file1 )
print( f"BSSN::Pmomz[{i}] = { momentum_BH[i,2] } ", file=file1 ) print( f"BSSN::Pmomz[{i}] = { momentum_BH[i,2] } ", file=file1 )
print( f"BSSN::Spinx[{i}] = { angular_momentum_BH[i,0] } ", file=file1 ) print( f"BSSN::Spinx[{i}] = { angular_momentum_BH[i,0] } ", file=file1 )
print( f"BSSN::Spiny[{i}] = { angular_momentum_BH[i,1] } ", file=file1 ) print( f"BSSN::Spiny[{i}] = { angular_momentum_BH[i,1] } ", file=file1 )
print( f"BSSN::Spinz[{i}] = { angular_momentum_BH[i,2] } ", file=file1 ) print( f"BSSN::Spinz[{i}] = { angular_momentum_BH[i,2] } ", file=file1 )
print( file=file1 ) print( file=file1 )
file1.close() file1.close()
return return
################################################# #################################################