Fix BSSN C gauge RHS parity

Fix lower-order C lopsided boundary fallbacks
Fix eighth-order C derivative and lopsided stencils
2026-05-15 18:03:38 +08:00 · 2026-05-14 21:37:36 +08:00 · 2026-05-14 20:42:04 +08:00 · 2026-05-14 16:03:02 +08:00 · 2026-05-14 15:21:16 +08:00 · 2026-05-14 14:09:33 +08:00
5 changed files with 42 additions and 76 deletions
--- a/AMSS_NCKU_source/ShellPatch.C
+++ b/AMSS_NCKU_source/ShellPatch.C
@@ -59,7 +59,7 @@ bool shell_fast_interp_enabled()
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_SHELL_FAST_INTERP");
-    enabled = (!env || atoi(env) != 0) ? 1 : 0;
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
@@ -70,7 +70,7 @@ bool shell_parallel_interp_enabled()
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_SHELL_PARALLEL_INTERP");
-    enabled = (!env || atoi(env) != 0) ? 1 : 0;
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
@@ -27,7 +27,7 @@ using namespace std;
 #endif

 #include "TwoPunctures.h"
-#include <mkl_cblas.h>
+#include <cblas.h>

 TwoPunctures::TwoPunctures(double mp, double mm, double b,
                           double P_plusx, double P_plusy, double P_plusz,
--- a/AMSS_NCKU_source/gaussj.C
+++ b/AMSS_NCKU_source/gaussj.C
@@ -18,7 +18,7 @@ using namespace std;
 #endif

 // Intel oneMKL LAPACK interface
-#include <mkl_lapacke.h>
+#include <lapacke.h>
 /* Linear equation solution using Intel oneMKL LAPACK.
 a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
 containing the right-hand side vectors. On output a is
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -58,30 +58,10 @@ POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
 TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
 ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)

-## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
+## AMD AOCC build flags optimized for EPYC Zen 4 (-march=znver4)
 ##   make                        -> opt  (PGO-guided, maximum performance)
 ##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)
-PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata

-ifeq ($(PGO_MODE),instrument)
-## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
-CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
-              $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
-f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
-else
-## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
-## PGO has been turned off, now tested and found to be negative optimization
-## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
-
-
-CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
-              $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
-f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
-endif

 .SUFFIXES: .o .f90 .C .for .cu

@@ -89,11 +69,11 @@ endif
 	$(f90) $(f90appflags) -c $< -o $@

 .C.o:
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

 # ShellPatch.C uses OpenMP for setupintintstuff search loops
 ShellPatch.o: ShellPatch.C
-	${CXX} $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@

 .for.o:
 	$(f77) -c $< -o $@
@@ -103,59 +83,59 @@ ShellPatch.o: ShellPatch.C

 # C rewrite of BSSN RHS kernel and helpers
 bssn_rhs_c.o: bssn_rhs_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

 fderivs_c.o: fderivs_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

 fdderivs_c.o: fdderivs_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

 kodiss_c.o: kodiss_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

 lopsided_c.o: lopsided_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

 lopsided_kodis_c.o: lopsided_kodis_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

 # C rewrite of shell-patch derivative kernels
 fderivs_sh_c.o: fderivs_sh_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

 fdderivs_sh_c.o: fdderivs_sh_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

 fderivs_shc_c.o: fderivs_shc_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

 fdderivs_shc_c.o: fdderivs_shc_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

 kodiss_sh_c.o: kodiss_sh_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@


 bssn_em_rhs_c.o: bssn_em_rhs_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
 z4c_rhs_c.o: z4c_rhs_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

 #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
-#	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+#	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

 ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
 TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
 TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=$(TP_PROFDATA) \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include
+              -Dfortran3 -Dnewc -I$(AOCL_ROOT)/include

 TwoPunctures.o: TwoPunctures.C
-	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
+	$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@

 TwoPunctureABE.o: TwoPunctureABE.C
-	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
+	$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@

 # Input files

--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,33 +1,20 @@
-## GCC version (commented out)
-## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
-## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
-## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
+## AMD AOCC version with AOCL (Optimized for AMD EPYC Zen 4)

-## Intel oneAPI version with oneMKL (Optimized for performance)
-filein  = -I/usr/include/ -I${MKLROOT}/include
+## AOCL root path for includes and libraries
+AOCL_ROOT ?= /home/aocc/aocl/5.2.0/aocc

-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
-## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
+## AOCC-built OpenMPI prefix
+OMPI_PREFIX ?= /home/aocc/openmpi-5.0.10

-## Memory allocator switch
-##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
-##   0           : use system default allocator (ptmalloc)
-USE_TBBMALLOC ?= 1
-TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
-ifneq ($(wildcard $(TBBMALLOC_SO)),)
-TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
-else
-TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
-endif
-ifeq ($(USE_TBBMALLOC),1)
-LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
-endif
+filein  = -I/usr/include/ -I$(AOCL_ROOT)/include

-## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
-##   opt        : (default) maximum performance with PGO profile-guided optimization
-##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
-PGO_MODE ?= opt
+## Using AOCL BLIS + libFLAME for BLAS/LAPACK, plus AMD LibM (-lamdlibm)
+## AOCC Fortran runtime: -lflang (includes FortranRuntime)
+## AOCC OpenMP runtime: -lomp (LLVM OpenMP)
+LDLIBS  = -L$(AOCL_ROOT)/lib -lblis -lflame -lamdlibm -lflang -lpgmath -lpthread -lm -ldl -lomp
+
+## OpenMP flag, added only by rules that opt in (e.g. ShellPatch.o in makefile)
+OMP_FLAG = -fopenmp

 ## Interp_Points load balance profiling mode
 ##   off        : (default) no load balance instrumentation
@@ -76,13 +63,12 @@ USE_TRANSFER_CACHE ?= auto
 ##   0           : use original Fortran rungekutta4_rout.o
 USE_CXX_RK4 ?= 1

-f90          = ifx
-f77          = ifx
-CXX          = icpx
-CC           = icx
-CLINKER      = mpiicpx
+f90          = flang
+f77          = flang
+CXX          = clang++
+CC           = clang
+CLINKER      = $(OMPI_PREFIX)/bin/mpicxx

 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
-#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
 CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
Author	SHA1	Message	Date
CGH0S7	29c16e7333	Fix BSSN C gauge RHS parity	2026-05-15 18:03:38 +08:00
CGH0S7	8bc89c900f	Fix lower-order C lopsided boundary fallbacks	2026-05-14 21:37:36 +08:00
CGH0S7	1769d48428	Fix eighth-order C derivative and lopsided stencils	2026-05-14 20:42:04 +08:00
CGH0S7	2f518a03bb	Fix C derivative ghost-buffer indexing across FD orders	2026-05-14 16:03:02 +08:00
CGH0S7	74849effb7	Fix fourth-order C lopsided and KO stencil indexing	2026-05-14 15:21:16 +08:00
CGH0S7	b1e80f9778	Fix shell C kernel symbol names for Fortran linkage (fderivs_sh_ etc.) Shell C functions must export Fortran-compatible symbols with trailing underscore so bssn_rhs_ss.f90 and getnp4.f90 can link when WithShell is active and USE_CXX_SHELL_KERNELS=1 replaces Fortran diff_new_sh.o. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-14 14:09:33 +08:00
CGH0S7	320a3694c2	Add full GAUGE 2-7 support to Z4C C RHS kernel (z4c_rhs_c.C) Previously only GAUGE 0 and 1 were supported with a compile error for 2-7. Now supports all 8 gauge choices matching BSSN Fortran formulas: - GAUGE 2: variable-eta gamma-driver, chi-sqrt denominator - GAUGE 3: variable-eta gamma-driver, chi-linear denominator - GAUGE 4: first-order variable-eta, chi-sqrt denominator - GAUGE 5: first-order variable-eta, chi-linear denominator - GAUGE 6: Jason's rational position-dependent damping - GAUGE 7: Jason's exponential position-dependent damping Also fixes dtSf advection/dissipation guards for gauges where dtSf is active. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-14 13:01:36 +08:00
CGH0S7	ac92b75dc1	Add C kernel for BSSN-EM (Maxwell/electromagnetic field) RHS computation New bssn_em_rhs_c.C computes EM field RHS (E,B,Kpsi,Kphi) and stress-energy tensor, then calls the C BSSN RHS kernel with source terms. Replaces empart.f90 when USE_CXX_EM_KERNEL=1. Supports all ghost_width orders via existing derivative kernels. Controlled by USE_CXX_EM_KERNEL switch (default 0, experimental). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-14 11:33:44 +08:00
CGH0S7	29b2406d57	Add C implementations of shell-patch derivative kernels (WithShell support) New files provide C equivalents of Fortran diff_new_sh.f90 and kodiss_sh.f90: - fderivs_sh_c.C: first derivatives in shell (rho, sigma, R) coords - fdderivs_sh_c.C: second derivatives in shell coords - fderivs_shc_c.C: shell first derivs + chain rule to Cartesian - fdderivs_shc_c.C: shell second derivs + chain rule to Cartesian - kodiss_sh_c.C: Kreiss-Oliger dissipation on shell patches Also add symmetry_stbd() C implementation and shell fh indexing to share_func.h. Controlled by USE_CXX_SHELL_KERNELS switch (default 0, experimental). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-14 11:31:30 +08:00
CGH0S7	b84561426e	Add full FD order support (2nd/4th/6th/8th) to C derivative kernels via ghost_width dispatch Wrap each C kernel in #if (ghost_width == N) blocks matching Fortran stencil coefficients from diff_new.f90, kodiss.f90, and lopsidediff.f90. Add fast-path indexing for ord=1,4,5 in share_func.h. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-14 11:29:06 +08:00
CGH0S7	e1e4b1d0fa	Add plot-only restart script to skip recomputation when plotting is interrupted Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-12 15:01:25 +08:00
CGH0S7	3d3a3ba759	Add thread-safe ShellPatch::setupintintstuff with OpenMP Split prolongpointstru into search-only (prolongpointstru_search) and append-only (prolongpointstru_append) functions. Parallelize shell-point interpolation table construction with #pragma omp parallel for collapse(3) and per-thread linked lists (merged after the loop to avoid data races). Add OMP_FLAG = -fopenmp in makefile.inc and ShellPatch.o override rule in makefile for AOCC OpenMP runtime (-lomp already linked). Speedup: setupintintstuff ~2.2x faster on multi-core. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-10 22:09:10 +08:00
CGH0S7	b13a187219	Accelerate Shell-Patch CPU interpolation	2026-05-08 14:36:54 +08:00
CGH0S7	bf74f2f688	updated aocc openmpi path	2026-05-05 11:13:33 +08:00
CGH0S7	96c5b79a23	Migrate build system from Intel oneAPI to AMD AOCC/AOCL/OpenMPI Replace Intel compilers (ifx/icpx/icx) with AOCC (flang/clang++/clang), Intel MPI (mpiicpx) with AOCC-built OpenMPI (mpicxx), and Intel MKL with AOCL BLIS/libFLAME. Replace -xHost with -march=znver4, -ipo with -flto, -fp-model fast=2 with -ffast-math, -qopenmp with -fopenmp. Remove PGO, TBB allocator, and Intel-specific runtime libraries. Fix MKL-specific includes in TwoPunctures.C and gaussj.C to use standard CBLAS/LAPACKE headers from AOCL. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-04-28 23:03:59 +08:00