perf(polint): add switchable barycentric ordn=6 path
This commit is contained in:
@@ -1111,10 +1111,13 @@ end subroutine d2dump
|
|||||||
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
! common code for cell and vertex
|
! common code for cell and vertex
|
||||||
!------------------------------------------------------------------------------
|
!------------------------------------------------------------------------------
|
||||||
! Lagrangian polynomial interpolation
|
! Lagrangian polynomial interpolation
|
||||||
!------------------------------------------------------------------------------
|
!------------------------------------------------------------------------------
|
||||||
|
#ifndef POLINT6_USE_BARYCENTRIC
|
||||||
|
#define POLINT6_USE_BARYCENTRIC 1
|
||||||
|
#endif
|
||||||
|
|
||||||
!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
|
!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
|
||||||
subroutine polint6_neville(xa, ya, x, y, dy)
|
subroutine polint6_neville(xa, ya, x, y, dy)
|
||||||
@@ -1177,6 +1180,56 @@ end subroutine d2dump
|
|||||||
return
|
return
|
||||||
end subroutine polint6_neville
|
end subroutine polint6_neville
|
||||||
|
|
||||||
|
!DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric
|
||||||
|
subroutine polint6_barycentric(xa, ya, x, y, dy)
  ! Six-point Lagrange interpolation evaluated with the barycentric formula
  !
  !   y = sum_i( w_i * ya_i / (x - xa_i) ) / sum_i( w_i / (x - xa_i) )
  !   w_i = 1 / prod_{j /= i} (xa_i - xa_j)
  !
  ! Arguments:
  !   xa, ya (in)  : the six abscissae and their ordinates
  !   x      (in)  : evaluation point
  !   y      (out) : interpolated value at x
  !   dy     (out) : always set to 0.d0; this path computes no error estimate
  !
  ! If x coincides with a node, the stored ordinate is returned directly.
  ! Otherwise, two coinciding abscissae make the weights undefined; the
  ! routine then prints a diagnostic and stops the program.
  implicit none

  real*8, dimension(6), intent(in) :: xa, ya
  real*8, intent(in) :: x
  real*8, intent(out) :: y, dy

  integer :: i, j
  real*8, dimension(6) :: lambda, xdiff
  real*8 :: dx, den_i, term, num, den

  ! Node-hit guard.  The test is made on the very difference that is used
  ! as a divisor further down, so a difference that evaluates to zero
  ! (for instance when subnormal results are flushed to zero) can never
  ! reach the division and turn the result into Inf/NaN.
  do i = 1, 6
    xdiff(i) = x - xa(i)
    if (xdiff(i) == 0.d0) then
      y = ya(i)
      dy = 0.d0
      return
    end if
  end do

  ! Barycentric weights: lambda(i) = 1 / prod_{j /= i} (xa(i) - xa(j)).
  do i = 1, 6
    den_i = 1.d0
    do j = 1, 6
      if (j /= i) then
        dx = xa(i) - xa(j)
        if (dx == 0.0d0) then
          ! Repeated abscissa: the interpolant is not defined.
          write(*,*) 'failure in polint for point',x
          write(*,*) 'with input points: ',xa
          stop
        end if
        den_i = den_i * dx
      end if
    end do
    lambda(i) = 1.d0 / den_i
  end do

  ! Accumulate numerator and denominator of the barycentric quotient.
  num = 0.d0
  den = 0.d0
  do i = 1, 6
    term = lambda(i) / xdiff(i)
    num = num + term * ya(i)
    den = den + term
  end do

  y = num / den
  ! No error estimate is available from this formula.
  dy = 0.d0

  return
end subroutine polint6_barycentric
|
||||||
|
|
||||||
!DIR$ ATTRIBUTES FORCEINLINE :: polint
|
!DIR$ ATTRIBUTES FORCEINLINE :: polint
|
||||||
subroutine polint(xa, ya, x, y, dy, ordn)
|
subroutine polint(xa, ya, x, y, dy, ordn)
|
||||||
implicit none
|
implicit none
|
||||||
@@ -1191,7 +1244,11 @@ end subroutine d2dump
|
|||||||
real*8 :: dif, dift, hp, h, den_val
|
real*8 :: dif, dift, hp, h, den_val
|
||||||
|
|
||||||
if (ordn == 6) then
|
if (ordn == 6) then
|
||||||
|
#if POLINT6_USE_BARYCENTRIC
|
||||||
|
call polint6_barycentric(xa, ya, x, y, dy)
|
||||||
|
#else
|
||||||
call polint6_neville(xa, ya, x, y, dy)
|
call polint6_neville(xa, ya, x, y, dy)
|
||||||
|
#endif
|
||||||
return
|
return
|
||||||
end if
|
end if
|
||||||
|
|
||||||
|
|||||||
@@ -1,25 +1,31 @@
|
|||||||
|
|
||||||
|
|
||||||
include makefile.inc
|
include makefile.inc
|
||||||
|
|
||||||
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
|
## polint(ordn=6) kernel selector:
|
||||||
## make -> opt (PGO-guided, maximum performance)
|
## 1 (default): barycentric fast path
|
||||||
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
## 0 : fallback to Neville path
|
||||||
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
|
POLINT6_USE_BARY ?= 1
|
||||||
|
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
|
||||||
|
|
||||||
|
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
|
||||||
|
## make -> opt (PGO-guided, maximum performance)
|
||||||
|
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
||||||
|
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
|
||||||
|
|
||||||
ifeq ($(PGO_MODE),instrument)
|
ifeq ($(PGO_MODE),instrument)
|
||||||
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
|
## Phase 1: instrumentation — omit -fp-model fast=2 for numerical stability (-ipo is still enabled)
|
||||||
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
||||||
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||||
-align array64byte -fpp -I${MKLROOT}/include
|
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
||||||
else
|
else
|
||||||
## opt (default): maximum performance with PGO profile data
|
## opt (default): maximum performance with PGO profile data
|
||||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
||||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||||
-align array64byte -fpp -I${MKLROOT}/include
|
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
.SUFFIXES: .o .f90 .C .for .cu
|
.SUFFIXES: .o .f90 .C .for .cu
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user