From b185f84cce31cd3e839afa90fddda29834b691d8 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Sat, 28 Feb 2026 21:12:19 +0800 Subject: [PATCH] Add switchable C RK4 kernel and build toggle (cherry picked from commit b91cfff301ad42a70935b323d4872b26176604b4) --- AMSS_NCKU_source/makefile | 64 ++++++----- AMSS_NCKU_source/makefile.inc | 6 + AMSS_NCKU_source/rungekutta4_rout_c.C | 155 ++++++++++++++++++++++++++ 3 files changed, 197 insertions(+), 28 deletions(-) create mode 100644 AMSS_NCKU_source/rungekutta4_rout_c.C diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index 9c8848f..ba12c5b 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -1,33 +1,33 @@ -include makefile.inc - -## polint(ordn=6) kernel selector: -## 1 (default): barycentric fast path -## 0 : fallback to Neville path -POLINT6_USE_BARY ?= 1 -POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY) - -## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt) -## make -> opt (PGO-guided, maximum performance) -## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data) -PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata +include makefile.inc -ifeq ($(PGO_MODE),instrument) -## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability -CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \ - -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) -f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \ - -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) -else -## opt (default): maximum performance with PGO profile data -CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=$(PROFDATA) \ - -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) -f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=$(PROFDATA) \ - -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) -endif +## polint(ordn=6) kernel selector: +## 1 (default): barycentric fast path +## 0 : fallback to Neville path +POLINT6_USE_BARY ?= 1 +POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY) + +## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt) +## make -> opt (PGO-guided, maximum performance) +## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data) +PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata + +ifeq ($(PGO_MODE),instrument) +## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability +CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \ + -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) +f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \ + -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) +else +## opt (default): maximum performance with PGO profile data +CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ + -fprofile-instr-use=$(PROFDATA) \ + -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) +f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ + -fprofile-instr-use=$(PROFDATA) \ + -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) +endif .SUFFIXES: .o .f90 .C .for .cu @@ -92,6 +92,14 @@ endif # CUDA rewrite: bssn_rhs_cuda.o replaces all CFILES (stencils are built-in) CFILES_CUDA = bssn_rhs_cuda.o +## RK4 kernel switch (independent from USE_CXX_KERNELS) +ifeq ($(USE_CXX_RK4),1) +CFILES += rungekutta4_rout_c.o +RK4_F90_OBJ = +else +RK4_F90_OBJ = rungekutta4_rout.o +endif + C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\ bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ @@ -109,7 +117,7 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\ - rungekutta4_rout.o diff_new.o kodiss.o kodiss_sh.o\ + $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\ lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\ shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\ getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\ diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index b72ee4d..eddbf3a 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -33,6 +33,12 @@ endif ## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster) ## 0 : fall back to original Fortran kernels USE_CXX_KERNELS ?= 1 + +## RK4 kernel implementation switch +## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments) +## 0 : use original Fortran rungekutta4_rout.o +USE_CXX_RK4 ?= 1 + f90 = ifx f77 = ifx CXX = icpx diff --git a/AMSS_NCKU_source/rungekutta4_rout_c.C b/AMSS_NCKU_source/rungekutta4_rout_c.C new file mode 100644 index 0000000..a017b88 --- /dev/null +++ b/AMSS_NCKU_source/rungekutta4_rout_c.C @@ -0,0 +1,155 @@ +#include "rungekutta4_rout.h" +#include +#include +#include +#include + +namespace { + +inline void rk4_stage0(std::size_t n, + const double *__restrict f0, + const double *__restrict frhs, + double *__restrict f1, + double c) { + std::size_t i = 0; +#if defined(__AVX512F__) + const __m512d vc = _mm512_set1_pd(c); + for (; i + 7 < n; i += 8) { + const __m512d v0 = _mm512_loadu_pd(f0 + i); + const __m512d vr = _mm512_loadu_pd(frhs + i); + _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, vr, v0)); + } +#elif defined(__AVX2__) + const __m256d vc = _mm256_set1_pd(c); + for (; i + 3 < n; i += 4) { + const __m256d v0 = _mm256_loadu_pd(f0 + i); + const __m256d vr = _mm256_loadu_pd(frhs + i); + _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, vr, v0)); + } +#endif +#pragma ivdep + for (; i < n; ++i) { + f1[i] = f0[i] + c * frhs[i]; + } +} + +inline void rk4_rhs_accum(std::size_t n, + const double *__restrict f1, + double *__restrict frhs) { + std::size_t i = 0; +#if defined(__AVX512F__) + const __m512d v2 = _mm512_set1_pd(2.0); + for (; i + 7 < n; i += 8) { + const __m512d v1 = _mm512_loadu_pd(f1 + i); + const __m512d vrhs = _mm512_loadu_pd(frhs + i); + _mm512_storeu_pd(frhs + i, _mm512_fmadd_pd(v2, v1, vrhs)); + } +#elif defined(__AVX2__) + const __m256d v2 = _mm256_set1_pd(2.0); + for (; i + 3 < n; i += 4) { + const __m256d v1 = _mm256_loadu_pd(f1 + i); + const __m256d vrhs = _mm256_loadu_pd(frhs + i); + _mm256_storeu_pd(frhs + i, _mm256_fmadd_pd(v2, v1, vrhs)); + } +#endif +#pragma ivdep + for (; i < n; ++i) { + frhs[i] = frhs[i] + 2.0 * f1[i]; + } +} + +inline void rk4_f1_from_f0_f1(std::size_t n, + const double *__restrict f0, + double *__restrict f1, + double c) { + std::size_t i = 0; +#if defined(__AVX512F__) + const __m512d vc = _mm512_set1_pd(c); + for (; i + 7 < n; i += 8) { + const __m512d v0 = _mm512_loadu_pd(f0 + i); + const __m512d v1 = _mm512_loadu_pd(f1 + i); + _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, v1, v0)); + } +#elif defined(__AVX2__) + const __m256d vc = _mm256_set1_pd(c); + for (; i + 3 < n; i += 4) { + const __m256d v0 = _mm256_loadu_pd(f0 + i); + const __m256d v1 = _mm256_loadu_pd(f1 + i); + _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, v1, v0)); + } +#endif +#pragma ivdep + for (; i < n; ++i) { + f1[i] = f0[i] + c * f1[i]; + } +} + +inline void rk4_stage3(std::size_t n, + const double *__restrict f0, + double *__restrict f1, + const double *__restrict frhs, + double c) { + std::size_t i = 0; +#if defined(__AVX512F__) + const __m512d vc = _mm512_set1_pd(c); + for (; i + 7 < n; i += 8) { + const __m512d v0 = _mm512_loadu_pd(f0 + i); + const __m512d v1 = _mm512_loadu_pd(f1 + i); + const __m512d vr = _mm512_loadu_pd(frhs + i); + _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, _mm512_add_pd(v1, vr), v0)); + } +#elif defined(__AVX2__) + const __m256d vc = _mm256_set1_pd(c); + for (; i + 3 < n; i += 4) { + const __m256d v0 = _mm256_loadu_pd(f0 + i); + const __m256d v1 = _mm256_loadu_pd(f1 + i); + const __m256d vr = _mm256_loadu_pd(frhs + i); + _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, _mm256_add_pd(v1, vr), v0)); + } +#endif +#pragma ivdep + for (; i < n; ++i) { + f1[i] = f0[i] + c * (f1[i] + frhs[i]); + } +} + +} // namespace + +extern "C" { + +int f_rungekutta4_rout(int *ex, double &dT, + double *f0, double *f1, double *f_rhs, + int &RK4) { + const std::size_t n = static_cast(ex[0]) * + static_cast(ex[1]) * + static_cast(ex[2]); + const double *const __restrict f0r = f0; + double *const __restrict f1r = f1; + double *const __restrict frhs = f_rhs; + + if (__builtin_expect(static_cast(RK4) > 3u, 0)) { + std::fprintf(stderr, "rungekutta4_rout_c: invalid RK4 stage %d\n", RK4); + std::abort(); + } + + switch (RK4) { + case 0: + rk4_stage0(n, f0r, frhs, f1r, 0.5 * dT); + break; + case 1: + rk4_rhs_accum(n, f1r, frhs); + rk4_f1_from_f0_f1(n, f0r, f1r, 0.5 * dT); + break; + case 2: + rk4_rhs_accum(n, f1r, frhs); + rk4_f1_from_f0_f1(n, f0r, f1r, dT); + break; + default: + rk4_stage3(n, f0r, f1r, frhs, (1.0 / 6.0) * dT); + break; + } + + return 0; +} + +} // extern "C"