Optimize BSSN RHS and finite difference calculations

- Integrate Intel oneMKL VML for efficient Gauge calculation in bssn_rhs.f90
- Refactor fderivs in diff_new.f90 to separate bulk/boundary loops for better vectorization
- Add optimization report in docs/optimization_report.md
This commit is contained in:
CGH0S7
2026-01-19 10:49:14 +08:00
parent 039dce4d65
commit 3914659ebb
3 changed files with 827 additions and 106 deletions

View File

@@ -24,6 +24,9 @@
Gmx_Res, Gmy_Res, Gmz_Res, &
Symmetry,Lev,eps,co) result(gont)
! calculate constraint violation when co=0
#if (GAUGE == 6 || GAUGE == 7)
use mkl_vml
#endif
implicit none
!~~~~~~> Input parameters:
@@ -97,11 +100,13 @@
#endif
#if (GAUGE == 6 || GAUGE == 7)
integer :: BHN,i,j,k
integer :: BHN,i,j,k, idx, total_points
real*8, dimension(9) :: Porg
real*8, dimension(3) :: Mass
real*8 :: r1,r2,M,A,w1,w2,C1,C2
real*8 :: r1,r2,M,A,w1,w2,C1,C2,denom_r
real*8, dimension(ex(1),ex(2),ex(3)) :: reta
real*8, allocatable :: vml_r1(:), vml_r2(:)
real*8, allocatable :: vml_res1(:), vml_res2(:)
call getpbh(BHN,Porg,Mass)
#endif
@@ -862,17 +867,41 @@
C1 = 1.d0/Mass(1) - A
C2 = 1.d0/Mass(2) - A
denom_r = ((Porg(1)-Porg(4))**2+(Porg(2)-Porg(5))**2+(Porg(3)-Porg(6))**2)
total_points = ex(1)*ex(2)*ex(3)
allocate(vml_r1(total_points), vml_r2(total_points))
allocate(vml_res1(total_points), vml_res2(total_points))
idx = 0
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
r1 = ((Porg(1)-X(i))**2+(Porg(2)-Y(j))**2+(Porg(3)-Z(k))**2)/ &
((Porg(1)-Porg(4))**2+(Porg(2)-Porg(5))**2+(Porg(3)-Porg(6))**2)
r2 = ((Porg(4)-X(i))**2+(Porg(5)-Y(j))**2+(Porg(6)-Z(k))**2)/ &
((Porg(1)-Porg(4))**2+(Porg(2)-Porg(5))**2+(Porg(3)-Porg(6))**2)
reta(i,j,k) = A + C1/(ONE+w1*r1) + C2/(ONE+w2*r2)
idx = idx + 1
r1 = ((Porg(1)-X(i))**2+(Porg(2)-Y(j))**2+(Porg(3)-Z(k))**2)/denom_r
r2 = ((Porg(4)-X(i))**2+(Porg(5)-Y(j))**2+(Porg(6)-Z(k))**2)/denom_r
! Prepare for 1/(1+w*r) -> vdInv
vml_r1(idx) = ONE + w1*r1
vml_r2(idx) = ONE + w2*r2
enddo
enddo
enddo
call vdInv(total_points, vml_r1, vml_res1)
call vdInv(total_points, vml_r2, vml_res2)
idx = 0
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
idx = idx + 1
reta(i,j,k) = A + C1*vml_res1(idx) + C2*vml_res2(idx)
enddo
enddo
enddo
deallocate(vml_r1, vml_r2, vml_res1, vml_res2)
else
write(*,*) "not support BH_num in Jason's form 1",BHN
endif
@@ -892,17 +921,41 @@
C1 = 1.d0/Mass(1) - A
C2 = 1.d0/Mass(2) - A
denom_r = ((Porg(1)-Porg(4))**2+(Porg(2)-Porg(5))**2+(Porg(3)-Porg(6))**2)
total_points = ex(1)*ex(2)*ex(3)
allocate(vml_r1(total_points), vml_r2(total_points))
allocate(vml_res1(total_points), vml_res2(total_points))
idx = 0
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
r1 = ((Porg(1)-X(i))**2+(Porg(2)-Y(j))**2+(Porg(3)-Z(k))**2)/ &
((Porg(1)-Porg(4))**2+(Porg(2)-Porg(5))**2+(Porg(3)-Porg(6))**2)
r2 = ((Porg(4)-X(i))**2+(Porg(5)-Y(j))**2+(Porg(6)-Z(k))**2)/ &
((Porg(1)-Porg(4))**2+(Porg(2)-Porg(5))**2+(Porg(3)-Porg(6))**2)
reta(i,j,k) = A + C1*dexp(-w1*r1) + C2*dexp(-w2*r2)
idx = idx + 1
r1 = ((Porg(1)-X(i))**2+(Porg(2)-Y(j))**2+(Porg(3)-Z(k))**2)/denom_r
r2 = ((Porg(4)-X(i))**2+(Porg(5)-Y(j))**2+(Porg(6)-Z(k))**2)/denom_r
! Prepare for dexp -> vdExp
vml_r1(idx) = -w1*r1
vml_r2(idx) = -w2*r2
enddo
enddo
enddo
call vdExp(total_points, vml_r1, vml_res1)
call vdExp(total_points, vml_r2, vml_res2)
idx = 0
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
idx = idx + 1
reta(i,j,k) = A + C1*vml_res1(idx) + C2*vml_res2(idx)
enddo
enddo
enddo
deallocate(vml_r1, vml_r2, vml_res1, vml_res2)
else
write(*,*) "not support BH_num in Jason's form 2",BHN
endif