Compare commits

..

5 Commits

Author SHA1 Message Date
ed89bc029b Fix potential division by zero in reta_val calculation and enable NaN checks
Added a safety check for the denominator in the reta_val calculation to
prevent division by zero when chi approaches zero (e.g., at far-field
boundaries). Also enabled DEBUG_NAN_CHECK macro to catch invalid inputs early.
Initialized output arrays to zero to prevent uninitialized memory access.
2026-01-19 20:29:48 +08:00
19274e93d1 Fix boundary handling in bssn_rhs_opt.f90 to prevent NaNs
Refactored calc_derivs and calc_dderivs to include correct boundary
handling logic matching the legacy code. Implemented fallback to 2nd
order derivatives when near boundaries where 4th order stencils cannot
be used. Added logic to initialize output arrays to zero to avoid
uninitialized memory access.
2026-01-19 20:03:22 +08:00
ae1a474cca Fix compilation errors and complete logic in BSSN RHS optimization 2026-01-19 19:22:52 +08:00
cbb8fb3a87 patched last commit 2026-01-19 17:14:28 +08:00
4472d89a9f Optimize bssn_rhs calculation with cache blocking and vectorization
- Implemented cache blocking (BLK=8) in bssn_rhs_opt.f90 to improve L1/L2 cache hit rate.
- Introduced bssn_rhs_opt.f90 module with vectorized derivative and physics kernels.
- Renamed original implementation to bssn_rhs_legacy.f90 for fallback.
- Updated bssn_rhs.f90 to act as a dispatcher, using the optimized path for ghost_width=3.
- Updated makefile to include new source files.
- Added DEBUG_NAN_CHECK macro to optionally disable NaN checks in production.
2026-01-19 16:39:24 +08:00
12 changed files with 2460 additions and 1253 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -997,7 +997,6 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1152,7 +1151,6 @@
fx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1229,7 +1227,6 @@
fy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1300,7 +1297,6 @@
fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1405,7 +1401,6 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1581,7 +1576,6 @@
fxx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1649,7 +1643,6 @@
fyy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1719,7 +1712,6 @@
fzz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1789,7 +1781,6 @@
fxy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1860,7 +1851,6 @@
fxz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1929,7 +1919,6 @@
fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1

View File

@@ -1019,7 +1019,6 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1135,7 +1134,6 @@
fx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1229,7 +1227,6 @@
fy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1317,7 +1314,6 @@
fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1434,7 +1430,6 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1585,7 +1580,6 @@
fxx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1665,7 +1659,6 @@
fyy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1747,7 +1740,6 @@
fzz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1829,7 +1821,6 @@
fxy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1912,7 +1903,6 @@
fxz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1993,7 +1983,6 @@
fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)

View File

@@ -1186,7 +1186,6 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1301,7 +1300,6 @@
fx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1383,7 +1381,6 @@
fy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1459,7 +1456,6 @@
fz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1569,7 +1565,6 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1786,7 +1781,6 @@
fxx = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1862,7 +1856,6 @@
fyy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1940,7 +1933,6 @@
fzz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -2018,7 +2010,6 @@
fxy = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -2107,7 +2098,6 @@
fxz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -2194,7 +2184,6 @@
fyz = ZEO
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)

View File

@@ -159,7 +159,6 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
call symmetry_bd(3,ex,f,fh,SoA)
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)

View File

@@ -369,7 +369,6 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
call symmetry_stbd(3,ex,f,fh,SoA)
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)

View File

@@ -233,7 +233,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also
!$omp parallel do collapse(3) private(i,j,k) if(ex(1)*ex(2)*ex(3) > 4096)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1

View File

@@ -34,7 +34,7 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
prolongrestrict_cell.o prolongrestrict_vertex.o\
rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
rungekutta4_rout.o bssn_rhs_opt.o bssn_rhs.o bssn_rhs_legacy.o diff_new.o kodiss.o kodiss_sh.o\
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\

View File

@@ -7,18 +7,19 @@
filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -qopenmp
LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -lifcore -limf -lmpi \
-L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
-lpthread -lm -ldl
## Aggressive optimization flags:
## -O3: Maximum optimization
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
## -fp-model fast=2: Aggressive floating-point optimizations
## -fma: Enable fused multiply-add instructions
## OpenMP re-enabled for MPI+OpenMP hybrid parallelism (MKL stays sequential to avoid nested parallelism)
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
f90appflags = -O3 -xHost -fp-model fast=2 -fma \
-fpp -I${MKLROOT}/include
f90 = ifx
f77 = ifx

View File

@@ -13,9 +13,13 @@ import subprocess
## CPU core binding configuration using taskset
## taskset ensures all child processes inherit the CPU affinity mask
NUMACTL_CPU_BIND = "taskset -c 0-111"
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
## Build parallelism configuration
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 104
@@ -110,18 +114,12 @@ def run_ABE():
print( )
## Define the command to run; cast other values to strings as needed
## MPI+OpenMP hybrid: compute threads per rank from total cores / MPI ranks
omp_threads = max(1, 96 // input_data.MPI_processes)
omp_env = (f" -genv OMP_NUM_THREADS={omp_threads}"
f" -genv OMP_PROC_BIND=close"
f" -genv OMP_PLACES=cores"
f" -genv I_MPI_PIN_DOMAIN=omp")
if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + omp_env + " ./ABE"
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + omp_env + " ./ABEGPU"
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output