From d11eaa22425c6474ae68fc05a40792d27db4a297 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Wed, 21 Jan 2026 11:22:33 +0800 Subject: [PATCH] Optimize bssn_rhs.f90: Fuse loops for metric inversion and Christoffel symbols to improve cache locality --- AMSS_NCKU_source/bssn_rhs.f90 | 159 +++++++++++++++++++--------------- 1 file changed, 89 insertions(+), 70 deletions(-) diff --git a/AMSS_NCKU_source/bssn_rhs.f90 b/AMSS_NCKU_source/bssn_rhs.f90 index e7637b0..ccb76be 100644 --- a/AMSS_NCKU_source/bssn_rhs.f90 +++ b/AMSS_NCKU_source/bssn_rhs.f90 @@ -61,7 +61,9 @@ real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: ham_Res, movx_Res, movy_Res, movz_Res real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res ! gont = 0: success; gont = 1: something wrong - integer::gont + integer::gont,i,j,k + real*8 :: val1, val2 + real*8 :: det, t_gupxx, t_gupxy, t_gupxz, t_gupyy, t_gupyz, t_gupzz !~~~~~~> Other variables: @@ -191,82 +193,99 @@ gyz * betayx + gzz * betazx & - gxz * betayy !rhs for gij -! invert tilted metric - gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - & - gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz - gupzz = ONE / gupzz - gupxx = ( gyy * gzz - gyz * gyz ) * gupzz - gupxy = - ( gxy * gzz - gyz * gxz ) * gupzz - gupxz = ( gxy * gyz - gyy * gxz ) * gupzz - gupyy = ( gxx * gzz - gxz * gxz ) * gupzz - gupyz = - ( gxx * gyz - gxy * gxz ) * gupzz - gupzz = ( gxx * gyy - gxy * gxy ) * gupzz +! fused loop for metric inversion and connections + !DIR$ SIMD + do k=1,ex(3) + do j=1,ex(2) + do i=1,ex(1) + ! 1. Metric Inversion + det = ONE / ( & + gxx(i,j,k) * gyy(i,j,k) * gzz(i,j,k) + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) + & + gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) - gxz(i,j,k) * gyy(i,j,k) * gxz(i,j,k) - & + gxy(i,j,k) * gxy(i,j,k) * gzz(i,j,k) - gxx(i,j,k) * gyz(i,j,k) * gyz(i,j,k) ) + + t_gupxx = ( gyy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gyz(i,j,k) ) * det + t_gupxy = - ( gxy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gxz(i,j,k) ) * det + t_gupxz = ( gxy(i,j,k) * gyz(i,j,k) - gyy(i,j,k) * gxz(i,j,k) ) * det + t_gupyy = ( gxx(i,j,k) * gzz(i,j,k) - gxz(i,j,k) * gxz(i,j,k) ) * det + t_gupyz = - ( gxx(i,j,k) * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) * det + t_gupzz = ( gxx(i,j,k) * gyy(i,j,k) - gxy(i,j,k) * gxy(i,j,k) ) * det - if(co == 0)then -! Gam^i_Res = Gam^i + gup^ij_,j - Gmx_Res = Gamx - (gupxx*(gupxx*gxxx+gupxy*gxyx+gupxz*gxzx)& - +gupxy*(gupxx*gxyx+gupxy*gyyx+gupxz*gyzx)& - +gupxz*(gupxx*gxzx+gupxy*gyzx+gupxz*gzzx)& - +gupxx*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)& - +gupxy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)& - +gupxz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)& - +gupxx*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)& - +gupxy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)& - +gupxz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz)) - Gmy_Res = Gamy - (gupxx*(gupxy*gxxx+gupyy*gxyx+gupyz*gxzx)& - +gupxy*(gupxy*gxyx+gupyy*gyyx+gupyz*gyzx)& - +gupxz*(gupxy*gxzx+gupyy*gyzx+gupyz*gzzx)& - +gupxy*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)& - +gupyy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)& - +gupyz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)& - +gupxy*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)& - +gupyy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)& - +gupyz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz)) - Gmz_Res = Gamz - (gupxx*(gupxz*gxxx+gupyz*gxyx+gupzz*gxzx)& - +gupxy*(gupxz*gxyx+gupyz*gyyx+gupzz*gyzx)& - +gupxz*(gupxz*gxzx+gupyz*gyzx+gupzz*gzzx)& - +gupxy*(gupxz*gxxy+gupyz*gxyy+gupzz*gxzy)& - +gupyy*(gupxz*gxyy+gupyz*gyyy+gupzz*gyzy)& - +gupyz*(gupxz*gxzy+gupyz*gyzy+gupzz*gzzy)& - +gupxz*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)& - +gupyz*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)& - +gupzz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz)) - endif + gupxx(i,j,k) = t_gupxx + gupxy(i,j,k) = t_gupxy + gupxz(i,j,k) = t_gupxz + gupyy(i,j,k) = t_gupyy + gupyz(i,j,k) = t_gupyz + gupzz(i,j,k) = t_gupzz -! second kind of connection - ! reuse fxx/fxy as scratch arrays for common terms - fxx = TWO * gxyx - gxxy - fxy = TWO * gxzx - gxxz - Gamxxx =HALF*( gupxx*gxxx + gupxy*fxx + gupxz*fxy ) - Gamyxx =HALF*( gupxy*gxxx + gupyy*fxx + gupyz*fxy ) - Gamzxx =HALF*( gupxz*gxxx + gupyz*fxx + gupzz*fxy ) + if(co == 0)then + Gmx_Res(i,j,k) = Gamx(i,j,k) - (t_gupxx*(t_gupxx*gxxx(i,j,k)+t_gupxy*gxyx(i,j,k)+t_gupxz*gxzx(i,j,k))& + +t_gupxy*(t_gupxx*gxyx(i,j,k)+t_gupxy*gyyx(i,j,k)+t_gupxz*gyzx(i,j,k))& + +t_gupxz*(t_gupxx*gxzx(i,j,k)+t_gupxy*gyzx(i,j,k)+t_gupxz*gzzx(i,j,k))& + +t_gupxx*(t_gupxy*gxxy(i,j,k)+t_gupyy*gxyy(i,j,k)+t_gupyz*gxzy(i,j,k))& + +t_gupxy*(t_gupxy*gxyy(i,j,k)+t_gupyy*gyyy(i,j,k)+t_gupyz*gyzy(i,j,k))& + +t_gupxz*(t_gupxy*gxzy(i,j,k)+t_gupyy*gyzy(i,j,k)+t_gupyz*gzzy(i,j,k))& + +t_gupxx*(t_gupxz*gxxz(i,j,k)+t_gupyz*gxyz(i,j,k)+t_gupzz*gxzz(i,j,k))& + +t_gupxy*(t_gupxz*gxyz(i,j,k)+t_gupyz*gyyz(i,j,k)+t_gupzz*gyzz(i,j,k))& + +t_gupxz*(t_gupxz*gxzz(i,j,k)+t_gupyz*gyzz(i,j,k)+t_gupzz*gzzz(i,j,k))) + Gmy_Res(i,j,k) = Gamy(i,j,k) - (t_gupxx*(t_gupxy*gxxx(i,j,k)+t_gupyy*gxyx(i,j,k)+t_gupyz*gxzx(i,j,k))& + +t_gupxy*(t_gupxy*gxyx(i,j,k)+t_gupyy*gyyx(i,j,k)+t_gupyz*gyzx(i,j,k))& + +t_gupxz*(t_gupxy*gxzx(i,j,k)+t_gupyy*gyzx(i,j,k)+t_gupyz*gzzx(i,j,k))& + +t_gupxy*(t_gupxy*gxxy(i,j,k)+t_gupyy*gxyy(i,j,k)+t_gupyz*gxzy(i,j,k))& + +t_gupyy*(t_gupxy*gxyy(i,j,k)+t_gupyy*gyyy(i,j,k)+t_gupyz*gyzy(i,j,k))& + +t_gupyz*(t_gupxy*gxzy(i,j,k)+t_gupyy*gyzy(i,j,k)+t_gupyz*gzzy(i,j,k))& + +t_gupxy*(t_gupxz*gxxz(i,j,k)+t_gupyz*gxyz(i,j,k)+t_gupzz*gxzz(i,j,k))& + +t_gupyy*(t_gupxz*gxyz(i,j,k)+t_gupyz*gyyz(i,j,k)+t_gupzz*gyzz(i,j,k))& + +t_gupyz*(t_gupxz*gxzz(i,j,k)+t_gupyz*gyzz(i,j,k)+t_gupzz*gzzz(i,j,k))) + Gmz_Res(i,j,k) = Gamz(i,j,k) - (t_gupxx*(t_gupxz*gxxx(i,j,k)+t_gupyz*gxyx(i,j,k)+t_gupzz*gxzx(i,j,k))& + +t_gupxy*(t_gupxz*gxyx(i,j,k)+t_gupyz*gyyx(i,j,k)+t_gupzz*gyzx(i,j,k))& + +t_gupxz*(t_gupxz*gxzx(i,j,k)+t_gupyz*gyzx(i,j,k)+t_gupzz*gzzx(i,j,k))& + +t_gupxy*(t_gupxz*gxxy(i,j,k)+t_gupyz*gxyy(i,j,k)+t_gupzz*gxzy(i,j,k))& + +t_gupyy*(t_gupxz*gxyy(i,j,k)+t_gupyz*gyyy(i,j,k)+t_gupzz*gyzy(i,j,k))& + +t_gupyz*(t_gupxz*gxzy(i,j,k)+t_gupyz*gyzy(i,j,k)+t_gupzz*gzzy(i,j,k))& + +t_gupxz*(t_gupxz*gxxz(i,j,k)+t_gupyz*gxyz(i,j,k)+t_gupzz*gxzz(i,j,k))& + +t_gupyz*(t_gupxz*gxyz(i,j,k)+t_gupyz*gyyz(i,j,k)+t_gupzz*gyzz(i,j,k))& + +t_gupzz*(t_gupxz*gxzz(i,j,k)+t_gupyz*gyzz(i,j,k)+t_gupzz*gzzz(i,j,k))) + endif - fxx = TWO * gxyy - gyyx - fxy = TWO * gyzy - gyyz - Gamxyy =HALF*( gupxx*fxx + gupxy*gyyy + gupxz*fxy ) - Gamyyy =HALF*( gupxy*fxx + gupyy*gyyy + gupyz*fxy ) - Gamzyy =HALF*( gupxz*fxx + gupyz*gyyy + gupzz*fxy ) + ! 2. Christoffel Symbols + val1 = TWO * gxyx(i,j,k) - gxxy(i,j,k) + val2 = TWO * gxzx(i,j,k) - gxxz(i,j,k) + Gamxxx(i,j,k) =HALF*( t_gupxx*gxxx(i,j,k) + t_gupxy*val1 + t_gupxz*val2 ) + Gamyxx(i,j,k) =HALF*( t_gupxy*gxxx(i,j,k) + t_gupyy*val1 + t_gupyz*val2 ) + Gamzxx(i,j,k) =HALF*( t_gupxz*gxxx(i,j,k) + t_gupyz*val1 + t_gupzz*val2 ) - fxx = TWO * gxzz - gzzx - fxy = TWO * gyzz - gzzy - Gamxzz =HALF*( gupxx*fxx + gupxy*fxy + gupxz*gzzz ) - Gamyzz =HALF*( gupxy*fxx + gupyy*fxy + gupyz*gzzz ) - Gamzzz =HALF*( gupxz*fxx + gupyz*fxy + gupzz*gzzz ) + val1 = TWO * gxyy(i,j,k) - gyyx(i,j,k) + val2 = TWO * gyzy(i,j,k) - gyyz(i,j,k) + Gamxyy(i,j,k) =HALF*( t_gupxx*val1 + t_gupxy*gyyy(i,j,k) + t_gupxz*val2 ) + Gamyyy(i,j,k) =HALF*( t_gupxy*val1 + t_gupyy*gyyy(i,j,k) + t_gupyz*val2 ) + Gamzyy(i,j,k) =HALF*( t_gupxz*val1 + t_gupyz*gyyy(i,j,k) + t_gupzz*val2 ) - fxx = gxzy + gyzx - gxyz - Gamxxy =HALF*( gupxx*gxxy + gupxy*gyyx + gupxz*fxx ) - Gamyxy =HALF*( gupxy*gxxy + gupyy*gyyx + gupyz*fxx ) - Gamzxy =HALF*( gupxz*gxxy + gupyz*gyyx + gupzz*fxx ) + val1 = TWO * gxzz(i,j,k) - gzzx(i,j,k) + val2 = TWO * gyzz(i,j,k) - gzzy(i,j,k) + Gamxzz(i,j,k) =HALF*( t_gupxx*val1 + t_gupxy*val2 + t_gupxz*gzzz(i,j,k) ) + Gamyzz(i,j,k) =HALF*( t_gupxy*val1 + t_gupyy*val2 + t_gupyz*gzzz(i,j,k) ) + Gamzzz(i,j,k) =HALF*( t_gupxz*val1 + t_gupyz*val2 + t_gupzz*gzzz(i,j,k) ) + + val1 = gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k) + Gamxxy(i,j,k) =HALF*( t_gupxx*gxxy(i,j,k) + t_gupxy*gyyx(i,j,k) + t_gupxz*val1 ) + Gamyxy(i,j,k) =HALF*( t_gupxy*gxxy(i,j,k) + t_gupyy*gyyx(i,j,k) + t_gupyz*val1 ) + Gamzxy(i,j,k) =HALF*( t_gupxz*gxxy(i,j,k) + t_gupyz*gyyx(i,j,k) + t_gupzz*val1 ) + + val1 = gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k) + Gamxxz(i,j,k) =HALF*( t_gupxx*gxxz(i,j,k) + t_gupxy*val1 + t_gupxz*gzzx(i,j,k) ) + Gamyxz(i,j,k) =HALF*( t_gupxy*gxxz(i,j,k) + t_gupyy*val1 + t_gupyz*gzzx(i,j,k) ) + Gamzxz(i,j,k) =HALF*( t_gupxz*gxxz(i,j,k) + t_gupyz*val1 + t_gupzz*gzzx(i,j,k) ) + + val1 = gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k) + Gamxyz(i,j,k) =HALF*( t_gupxx*val1 + t_gupxy*gyyz(i,j,k) + t_gupxz*gzzy(i,j,k) ) + Gamyyz(i,j,k) =HALF*( t_gupxy*val1 + t_gupyy*gyyz(i,j,k) + t_gupyz*gzzy(i,j,k) ) + Gamzyz(i,j,k) =HALF*( t_gupxz*val1 + t_gupyz*gyyz(i,j,k) + t_gupzz*gzzy(i,j,k) ) + enddo + enddo + enddo - fxx = gxyz + gyzx - gxzy - Gamxxz =HALF*( gupxx*gxxz + gupxy*fxx + gupxz*gzzx ) - Gamyxz =HALF*( gupxy*gxxz + gupyy*fxx + gupyz*gzzx ) - Gamzxz =HALF*( gupxz*gxxz + gupyz*fxx + gupzz*gzzx ) - fxx = gxyz + gxzy - gyzx - Gamxyz =HALF*( gupxx*fxx + gupxy*gyyz + gupxz*gzzy ) - Gamyyz =HALF*( gupxy*fxx + gupyy*gyyz + gupyz*gzzy ) - Gamzyz =HALF*( gupxz*fxx + gupyz*gyyz + gupzz*gzzy ) ! Raise indices of \tilde A_{ij} and store in R_ij Rxx = gupxx * gupxx * Axx + gupxy * gupxy * Ayy + gupxz * gupxz * Azz + &