From cca3c16c2bf2af85539b9202403f4639c35106b5 Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Sun, 1 Mar 2026 13:20:46 +0800
Subject: [PATCH] perf(polint): add switchable barycentric ordn=6 path

Add polint6_barycentric and let polint(ordn=6) dispatch to it at compile
time. POLINT6_USE_BARYCENTRIC defaults to 1; build with
`make POLINT6_USE_BARY=0` to fall back to polint6_neville.

The barycentric path always returns dy = 0.d0. Its node-coincidence guard
tests the stored difference x - xa(i), i.e. the same value that is later
used as divisor.
---
 AMSS_NCKU_source/fmisc.f90 | 59 +++++++++++++++++++++++++++++++++++++-
 AMSS_NCKU_source/makefile  | 40 +++++++++++++++-----------
 2 files changed, 81 insertions(+), 18 deletions(-)

diff --git a/AMSS_NCKU_source/fmisc.f90 b/AMSS_NCKU_source/fmisc.f90
index d545644..11c3467 100644
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -1111,10 +1111,13 @@ end subroutine d2dump
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-! common code for cell and vertex
+! common code for cell and vertex
 !------------------------------------------------------------------------------
 ! Lagrangian polynomial interpolation
 !------------------------------------------------------------------------------
+#ifndef POLINT6_USE_BARYCENTRIC
+#define POLINT6_USE_BARYCENTRIC 1
+#endif
 
 !DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
   subroutine polint6_neville(xa, ya, x, y, dy)
@@ -1177,6 +1180,56 @@ end subroutine d2dump
   return
   end subroutine polint6_neville
 
+!DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric
+  subroutine polint6_barycentric(xa, ya, x, y, dy)
+! 6-point Lagrange interpolation in barycentric (second) form; stops if two nodes coincide.
+! y = ya(i) when x hits xa(i); dy is always 0.d0 (this path computes no error estimate).
+  implicit none
+
+  real*8, dimension(6), intent(in) :: xa, ya
+  real*8, intent(in) :: x
+  real*8, intent(out) :: y, dy
+  integer :: i, j
+  real*8, dimension(6) :: lambda, xdist
+  real*8 :: dx, den_i, term, num, den
+
+  dy = 0.d0
+! test the stored difference that is used as divisor below, not x == xa(i)
+  do i = 1, 6
+     xdist(i) = x - xa(i)
+     if (xdist(i) == 0.d0) then
+        y = ya(i)
+        return
+     end if
+  end do
+! barycentric weights lambda(i) = 1 / prod_{j/=i} (xa(i) - xa(j))
+  do i = 1, 6
+     den_i = 1.d0
+     do j = 1, 6
+        if (j /= i) then
+           dx = xa(i) - xa(j)
+           if (dx == 0.0d0) then
+              write(*,*) 'failure in polint for point',x
+              write(*,*) 'with input points: ',xa
+              stop
+           end if
+           den_i = den_i * dx
+        end if
+     end do
+     lambda(i) = 1.d0 / den_i
+  end do
+
+  num = 0.d0
+  den = 0.d0
+  do i = 1, 6
+     term = lambda(i) / xdist(i)
+     num = num + term * ya(i)
+     den = den + term
+  end do
+  y = num / den
+  return
+  end subroutine polint6_barycentric
+
 !DIR$ ATTRIBUTES FORCEINLINE :: polint
   subroutine polint(xa, ya, x, y, dy, ordn)
   implicit none
@@ -1191,7 +1244,11 @@ end subroutine d2dump
   real*8 :: dif, dift, hp, h, den_val
 
   if (ordn == 6) then
+#if POLINT6_USE_BARYCENTRIC
+    call polint6_barycentric(xa, ya, x, y, dy)
+#else
     call polint6_neville(xa, ya, x, y, dy)
+#endif
     return
   end if
 
diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile
index aab8031..40cba90 100644
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -1,25 +1,31 @@
-include makefile.inc
-
-## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
-## make -> opt (PGO-guided, maximum performance)
-## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
-PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
+include makefile.inc
+
+## polint(ordn=6) kernel selector:
+## 1 (default): barycentric fast path
+## 0 : fallback to Neville path
+POLINT6_USE_BARY ?= 1
+POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
+
+## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
+## make -> opt (PGO-guided, maximum performance)
+## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
+PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
 
 
 
 ifeq ($(PGO_MODE),instrument)
 ## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
-CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
-f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-              -align array64byte -fpp -I${MKLROOT}/include
-else
-## opt (default): maximum performance with PGO profile data
-CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
-f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -align array64byte -fpp -I${MKLROOT}/include
-endif
+CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
+              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
+f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
+              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
+else
+## opt (default): maximum performance with PGO profile data
+CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
+              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
+f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
+              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
+endif
 
 .SUFFIXES: .o .f90 .C .for .cu
 