Add C implementations of shell-patch derivative kernels (WithShell support)

New files provide C equivalents of Fortran diff_new_sh.f90 and kodiss_sh.f90:
- fderivs_sh_c.C: first derivatives in shell (rho, sigma, R) coords
- fdderivs_sh_c.C: second derivatives in shell coords
- fderivs_shc_c.C: shell first derivs + chain rule to Cartesian
- fdderivs_shc_c.C: shell second derivs + chain rule to Cartesian
- kodiss_sh_c.C: Kreiss-Oliger dissipation on shell patches

Also add symmetry_stbd() C implementation and shell fh indexing to share_func.h.
Controlled by USE_CXX_SHELL_KERNELS switch (default 0, experimental).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-14 02:52:48 +08:00
parent b84561426e
commit 29b2406d57
7 changed files with 998 additions and 25 deletions

View File

@@ -41,9 +41,12 @@ TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
## AMD AOCC build flags optimized for EPYC Zen 4 (-march=znver4)
## make -> opt (PGO-guided, maximum performance)
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
CXXAPPFLAGS = -O3 -march=znver4 -ffast-math -flto \
-Dfortran3 -Dnewc -I$(AOCL_ROOT)/include $(INTERP_LB_FLAGS) \
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG)
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
f90appflags = -O3 -march=znver4 -ffast-math -flto \
-cpp -I$(AOCL_ROOT)/include $(POLINT6_FLAG)
@@ -53,11 +56,11 @@ f90appflags = -O3 -march=znver4 -ffast-math -flto \
$(f90) $(f90appflags) -c $< -o $@
.C.o:
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
# ShellPatch.C uses OpenMP for setupintintstuff search loops
ShellPatch.o: ShellPatch.C
$(CXX) $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
.for.o:
$(f77) -c $< -o $@
@@ -67,38 +70,56 @@ ShellPatch.o: ShellPatch.C
# C rewrite of BSSN RHS kernel and helpers
bssn_rhs_c.o: bssn_rhs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
fderivs_c.o: fderivs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
fdderivs_c.o: fdderivs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
kodiss_c.o: kodiss_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
lopsided_c.o: lopsided_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
lopsided_kodis_c.o: lopsided_kodis_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
# C rewrite of shell-patch derivative kernels
fderivs_sh_c.o: fderivs_sh_c.C
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
fdderivs_sh_c.o: fdderivs_sh_c.C
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
fderivs_shc_c.o: fderivs_shc_c.C
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
fdderivs_shc_c.o: fdderivs_shc_c.C
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
kodiss_sh_c.o: kodiss_sh_c.C
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
z4c_rhs_c.o: z4c_rhs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
# $(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags (AMD AOCC, no PGO)
TP_OPTFLAGS = -O3 -march=znver4 -ffast-math -flto \
-Dfortran3 -Dnewc -I$(AOCL_ROOT)/include
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \
-Dfortran3 -Dnewc -I${MKLROOT}/include
TwoPunctures.o: TwoPunctures.C
${CXX} $(TP_OPTFLAGS) -fopenmp -c $< -o $@
$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -fopenmp -c $< -o $@
$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@
# Input files
@@ -129,13 +150,24 @@ else
RK4_F90_OBJ = rungekutta4_rout.o
endif
## Shell-patch derivative kernel switch (independent from USE_CXX_KERNELS)
## 1 : use C++ rewrite of shell derivative functions (experimental)
## 0 : use original Fortran diff_new_sh.o and kodiss_sh.o (default)
USE_CXX_SHELL_KERNELS ?= 0
ifeq ($(USE_CXX_SHELL_KERNELS),1)
CFILES += fderivs_sh_c.o fdderivs_sh_c.o fderivs_shc_c.o fdderivs_shc_c.o kodiss_sh_c.o
SH_F90_OBJ =
else
SH_F90_OBJ = diff_new_sh.o kodiss_sh.o point_diff_new_sh.o
endif
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o bssn_class.o surface_integral.o ShellPatch.o\
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o surface_integral.o ShellPatch.o\
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
@@ -146,11 +178,11 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
prolongrestrict_cell.o prolongrestrict_vertex.o\
$(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
$(RK4_F90_OBJ) diff_new.o kodiss.o\
lopsidediff.o sommerfeld_rout.o getnp4.o $(SH_F90_OBJ)\
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
fadmquantites_bssn.o $(Z4C_F90_OBJ) Z4c_rhs_ss.o point_diff_new_sh.o\
fadmquantites_bssn.o $(Z4C_F90_OBJ) Z4c_rhs_ss.o\
cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\
getnpem2.o empart.o NullNews.o fourdcurvature.o\
bssn2adm.o adm_constraint.o adm_ricci_gamma.o\
@@ -172,7 +204,7 @@ tgrid.o fd_grid.o ghost_zone.o array.o round.o norm.o fuzzy.o error_exit.o miscf
linear_map.o cpm_map.o BH_diagnostics.o setup.o horizon_sequence.o find_horizons.o \
initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o
TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
@@ -186,8 +218,8 @@ $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\
NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\
empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\
initial_null2.h NullShellPatch2.h
initial_null2.h NullShellPatch2.h
$(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
rungekutta4_rout.h var.h bssn_rhs.h sommerfeld_rout.h\
@@ -197,7 +229,7 @@ $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h
empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\
initial_null2.h NullShellPatch2.h \
bssn_gpu_class.h bssn_macro.h
$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
@@ -211,12 +243,12 @@ misc.o : zbesh.o
# projects
ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(TP_OPTFLAGS) -fopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f