Add C kernel for BSSN-EM (Maxwell/electromagnetic field) RHS computation

New bssn_em_rhs_c.C computes EM field RHS (E,B,Kpsi,Kphi) and stress-energy
tensor, then calls the C BSSN RHS kernel with source terms. Replaces empart.f90
when USE_CXX_EM_KERNEL=1. Supports all ghost_width orders via existing derivative
kernels. Controlled by USE_CXX_EM_KERNEL switch (default 0, experimental).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-14 11:25:08 +08:00
parent d46418f1c3
commit 8e8a93bad0
4 changed files with 464 additions and 89 deletions

View File

@@ -1,8 +1,28 @@
## Toolchain selection
## nvhpc : NVIDIA HPC SDK + CUDA-aware MPI
## intel : Intel oneAPI toolchain (legacy path)
## aocc : AMD AOCC + AOCL + OpenMPI (for AMD EPYC Zen 5, with CUDA)
TOOLCHAIN ?= intel
## GCC version (commented out)
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Intel oneAPI version with oneMKL (Optimized for performance)
filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
## Memory allocator switch
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
## 0 : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
## opt : (default) maximum performance with PGO profile-guided optimization
@@ -23,18 +43,6 @@ else
INTERP_LB_FLAGS =
endif
MKLROOT ?= /home/intel/oneapi/mkl/latest
MKL_LIBDIR ?= $(MKLROOT)/lib/intel64
MKL_INC ?= -I$(MKLROOT)/include
## AMD AOCC toolchain paths (used when TOOLCHAIN=aocc)
AOCL_ROOT ?= /home/aocc/aocl/5.2.0/aocc
OMPI_PREFIX ?= /home/aocc/aocc-openmpi
NVHPC_ROOT ?= /home/nvidia/hpc_sdk/Linux_x86_64/25.11
CUDA_HOME ?= $(NVHPC_ROOT)/cuda
CUDA_ARCH ?= sm_80
## Kernel implementation switch
## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
## 0 : fall back to original Fortran kernels
@@ -45,64 +53,36 @@ USE_CXX_KERNELS ?= 1
## 0 : use original Fortran Z4c_rhs.o
USE_CXX_Z4C_KERNELS ?= 1
## BSSN-EScalar RHS switch
## 1 (default) : use BSSN-EScalar C wrapper on the normal patch path
## 0 : keep the original Fortran BSSN-EScalar RHS for precision-safe runs
## Note: this requires USE_CXX_KERNELS=1 because the wrapper reuses the C BSSN kernel.
USE_CXX_ESCALAR_KERNEL ?= 1
## BSSN-EM RHS switch
## 1 : use BSSN-EM C kernel (bssn_em_rhs_c.C) on the normal patch path
## 0 : keep the original Fortran empart.f90 RHS for the EM fields (default)
## Note: experimental, requires USE_CXX_KERNELS=1
USE_CXX_EM_KERNEL ?= 0
## Cached transfer switch
## auto (default): enable for BSSN vacuum, keep other paths on the safe uncached path
## 1 : force cached Sync/Restrict/OutBd transfer on evolution hot paths
## 0 : force the original uncached transfer path
USE_TRANSFER_CACHE ?= auto
## RK4 kernel implementation switch
## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
## 0 : use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1
## Memory allocator switch
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
## 0 : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(TOOLCHAIN),intel)
f90 = ifx
f77 = ifx
CXX = icpx
CC = icx
CLINKER = mpiicpx
filein = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
LDLIBS = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
-lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
-lifcore -limf -liomp5 -lpthread -lm -ldl \
-L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart
else ifeq ($(TOOLCHAIN),aocc)
f90 = flang
f77 = flang
CXX = clang++
CC = clang
CLINKER = $(OMPI_PREFIX)/bin/mpicxx
filein = -I/usr/include/ -I$(AOCL_ROOT)/include -I$(CUDA_HOME)/include
LDLIBS = -L$(AOCL_ROOT)/lib -lblis -lflame -lamdlibm -lflang -lpgmath \
-ljemalloc -lpthread -lm -ldl -lomp \
-L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart
else ifeq ($(TOOLCHAIN),nvhpc)
f90 = mpifort
f77 = mpifort
CXX = mpicxx
CC = mpicc
CLINKER = mpicxx
filein = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
LDLIBS = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
-lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
-lpthread -lm -ldl \
-L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart \
-fortranlibs
endif
ifeq ($(TOOLCHAIN),intel)
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
endif
Cu = $(NVHPC_ROOT)/compilers/bin/nvcc
CUDA_LIB_PATH = -L$(CUDA_HOME)/lib64 -I$(CUDA_HOME)/include
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=$(CUDA_ARCH)
Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc