对prolong3做访存优化
This commit is contained in:
@@ -1912,7 +1912,7 @@
|
||||
! 63/8192*f_6 - 495/8192*f_5 + 1155/4096*f_4
|
||||
!--------------------------------------------------------------------------
|
||||
#if 1
|
||||
subroutine prolong3(wei,llbc,uubc,extc,func,&
|
||||
subroutine prolong3(wei,llbc,uubc,extc,func,&
|
||||
llbf,uubf,extf,funf,&
|
||||
llbp,uubp,SoA,Symmetry)
|
||||
implicit none
|
||||
@@ -1944,6 +1944,14 @@
|
||||
integer, dimension(extf(2)) :: piy
|
||||
integer, dimension(extf(3)) :: piz
|
||||
|
||||
! ~~~> 在子程序开头新增声明 <~~~
|
||||
real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
|
||||
real*8 :: tmp_xyz_line(extc(1)) ! 存储整条 X 线上完成 Y 向融合后的结果
|
||||
real*8 :: v1, v2, v3, v4, v5, v6
|
||||
integer :: ic, jc, kc, iy, ix_offset
|
||||
real*8 :: v_m2, v_m1, v_0, v_p1, v_p2, v_p3
|
||||
real*8 :: res_line
|
||||
! ~~~> 声明end <~~~
|
||||
real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3
|
||||
real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3
|
||||
real*8, dimension(6,2), parameter :: WC = reshape((/&
|
||||
@@ -2069,119 +2077,146 @@
|
||||
call symmetry_bd(3,extc,func,funcc,SoA)
|
||||
|
||||
!~~~~~~> prolongation start...
|
||||
do k = kmino,kmaxo
|
||||
do j = jmino,jmaxo
|
||||
do i = imino,imaxo
|
||||
cxI(1) = cix(i)
|
||||
cxI(2) = ciy(j)
|
||||
cxI(3) = ciz(k)
|
||||
px = pix(i)
|
||||
py = piy(j)
|
||||
pz = piz(k)
|
||||
! ~~~> 循环体开始 <~~~
|
||||
#if 0
|
||||
if(ii/2*2==ii)then
|
||||
if(jj/2*2==jj)then
|
||||
if(kk/2*2==kk)then
|
||||
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
||||
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
||||
else
|
||||
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
||||
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
||||
endif
|
||||
else
|
||||
if(kk/2*2==kk)then
|
||||
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
||||
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
||||
else
|
||||
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
||||
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
||||
endif
|
||||
endif
|
||||
else
|
||||
if(jj/2*2==jj)then
|
||||
if(kk/2*2==kk)then
|
||||
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
||||
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
||||
else
|
||||
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
||||
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
||||
endif
|
||||
else
|
||||
if(kk/2*2==kk)then
|
||||
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
||||
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
||||
else
|
||||
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
||||
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
#else
|
||||
tmp2= WC(1,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||
WC(2,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||
WC(3,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||
WC(4,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||
WC(5,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||
WC(6,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||
do k = kmino, kmaxo
|
||||
pz = piz(k)
|
||||
kc = ciz(k)
|
||||
|
||||
tmp1= WC(1,py)*tmp2(:,1)+WC(2,py)*tmp2(:,2)+WC(3,py)*tmp2(:,3)+&
|
||||
WC(4,py)*tmp2(:,4)+WC(5,py)*tmp2(:,5)+WC(6,py)*tmp2(:,6)
|
||||
do j = jmino, jmaxo
|
||||
py = piy(j)
|
||||
jc = ciy(j)
|
||||
|
||||
funf(i,j,k)= WC(1,px)*tmp1(1)+WC(2,px)*tmp1(2)+WC(3,px)*tmp1(3)+&
|
||||
WC(4,px)*tmp1(4)+WC(5,px)*tmp1(5)+WC(6,px)*tmp1(6)
|
||||
! 1. 【降维:Z 向】对当前 (j,k) 相关的 6 条 Y 偏置线进行 Z 向插值
|
||||
! 结果存入 tmp_yz(x_index, y_offset)
|
||||
do jj = 1, 6
|
||||
iy = jc - 3 + jj
|
||||
do ii = 1, extc(1)
|
||||
tmp_yz(ii, jj) = WC(1,pz)*funcc(ii, iy, kc-2) + &
|
||||
WC(2,pz)*funcc(ii, iy, kc-1) + &
|
||||
WC(3,pz)*funcc(ii, iy, kc ) + &
|
||||
WC(4,pz)*funcc(ii, iy, kc+1) + &
|
||||
WC(5,pz)*funcc(ii, iy, kc+2) + &
|
||||
WC(6,pz)*funcc(ii, iy, kc+3)
|
||||
end do
|
||||
end do
|
||||
|
||||
! 2. 【降维:Y 向】将 Z 向结果合并,得到整条 X 轴线上的 Y-Z 融合值
|
||||
do ii = 1, extc(1)
|
||||
tmp_xyz_line(ii) = WC(1,py)*tmp_yz(ii, 1) + WC(2,py)*tmp_yz(ii, 2) + &
|
||||
WC(3,py)*tmp_yz(ii, 3) + WC(4,py)*tmp_yz(ii, 4) + &
|
||||
WC(5,py)*tmp_yz(ii, 5) + WC(6,py)*tmp_yz(ii, 6)
|
||||
end do
|
||||
|
||||
i = imino
|
||||
do while (i < imaxo)
|
||||
! 针对 i
|
||||
ic = cix(i)
|
||||
v_m2 = tmp_xyz_line(ic-2); v_m1 = tmp_xyz_line(ic-1)
|
||||
v_0 = tmp_xyz_line(ic ); v_p1 = tmp_xyz_line(ic+1)
|
||||
v_p2 = tmp_xyz_line(ic+2); v_p3 = tmp_xyz_line(ic+3)
|
||||
|
||||
! 这里的 WC(1:6, pix(i)) 被直接硬编码
|
||||
! 注意:根据你的 pix 定义,1 和 2 分别对应 C1..C6 和 C6..C1
|
||||
if (pix(i) == 2) then
|
||||
funf(i, j, k) = C6*v_m2 + C5*v_m1 + C4*v_0 + C3*v_p1 + C2*v_p2 + C1*v_p3
|
||||
funf(i+1, j, k) = C1*v_m2 + C2*v_m1 + C3*v_0 + C4*v_p1 + C5*v_p2 + C6*v_p3
|
||||
else
|
||||
funf(i, j, k) = C1*v_m2 + C2*v_m1 + C3*v_0 + C4*v_p1 + C5*v_p2 + C6*v_p3
|
||||
funf(i+1, j, k) = C6*v_m2 + C5*v_m1 + C4*v_0 + C3*v_p1 + C2*v_p2 + C1*v_p3
|
||||
end if
|
||||
|
||||
i = i + 2
|
||||
end do
|
||||
|
||||
! 处理可能存在的最后一个孤立点
|
||||
if (i == imaxo) then
|
||||
ic = cix(i); px = pix(i)
|
||||
funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + WC(2,px)*tmp_xyz_line(ic-1) + &
|
||||
WC(3,px)*tmp_xyz_line(ic ) + WC(4,px)*tmp_xyz_line(ic+1) + &
|
||||
WC(5,px)*tmp_xyz_line(ic+2) + WC(6,px)*tmp_xyz_line(ic+3)
|
||||
end if
|
||||
|
||||
|
||||
|
||||
#if 0
|
||||
do i = imino, imaxo
|
||||
px = pix(i)
|
||||
ic = cix(i)
|
||||
|
||||
! 直接从预计算好的 line 中读取连续的 6 个点
|
||||
! ic-2 到 ic+3 对应原始 6 点算子
|
||||
funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + &
|
||||
WC(2,px)*tmp_xyz_line(ic-1) + &
|
||||
WC(3,px)*tmp_xyz_line(ic ) + &
|
||||
WC(4,px)*tmp_xyz_line(ic+1) + &
|
||||
WC(5,px)*tmp_xyz_line(ic+2) + &
|
||||
WC(6,px)*tmp_xyz_line(ic+3)
|
||||
end do
|
||||
#endif
|
||||
end do
|
||||
end do
|
||||
#endif
|
||||
enddo
|
||||
enddo
|
||||
enddo
|
||||
|
||||
|
||||
! ~~~> 修改后的核心循环块 <~~~
|
||||
do k = kmino, kmaxo
|
||||
pz = piz(k)
|
||||
kc = ciz(k)
|
||||
|
||||
do j = jmino, jmaxo
|
||||
py = piy(j)
|
||||
jc = ciy(j)
|
||||
|
||||
! --- 步骤 1 & 2 融合:分段处理 X 轴,提升 Cache 命中率 ---
|
||||
! 我们将 ii 循环逻辑重组,减少对 funcc 的跨行重复访问
|
||||
do ii = 1, extc(1)
|
||||
! 1. 先做 Z 方向的 6 条线插值(针对当前的 ii 和当前的 6 个 iy)
|
||||
! 我们直接在这里把 Y 方向的加权也做了,省去 tmp_yz 数组
|
||||
! 这样 funcc 的数据读进来后立即完成所有维度的贡献,不再写回内存
|
||||
|
||||
res_line = 0.0d0
|
||||
do jj = 1, 6
|
||||
iy = jc - 3 + jj
|
||||
! 这一行代码是核心:一次性完成 Z 插值并加上 Y 的权重
|
||||
! 编译器会把 WC(jj, py) 存在寄存器里
|
||||
res_line = res_line + WC(jj, py) * ( &
|
||||
WC(1, pz) * funcc(ii, iy, kc-2) + &
|
||||
WC(2, pz) * funcc(ii, iy, kc-1) + &
|
||||
WC(3, pz) * funcc(ii, iy, kc ) + &
|
||||
WC(4, pz) * funcc(ii, iy, kc+1) + &
|
||||
WC(5, pz) * funcc(ii, iy, kc+2) + &
|
||||
WC(6, pz) * funcc(ii, iy, kc+3) )
|
||||
end do
|
||||
tmp_xyz_line(ii) = res_line
|
||||
end do
|
||||
|
||||
! --- 步骤 3:X 向插值(保持你现在的双点展开,这部分已经很快了) ---
|
||||
i = imino
|
||||
do while (i < imaxo)
|
||||
ic = cix(i)
|
||||
v_m2 = tmp_xyz_line(ic-2); v_m1 = tmp_xyz_line(ic-1)
|
||||
v_0 = tmp_xyz_line(ic ); v_p1 = tmp_xyz_line(ic+1)
|
||||
v_p2 = tmp_xyz_line(ic+2); v_p3 = tmp_xyz_line(ic+3)
|
||||
|
||||
if (pix(i) == 2) then
|
||||
funf(i, j, k) = C6*v_m2 + C5*v_m1 + C4*v_0 + C3*v_p1 + C2*v_p2 + C1*v_p3
|
||||
funf(i+1, j, k) = C1*v_m2 + C2*v_m1 + C3*v_0 + C4*v_p1 + C5*v_p2 + C6*v_p3
|
||||
else
|
||||
funf(i, j, k) = C1*v_m2 + C2*v_m1 + C3*v_0 + C4*v_p1 + C5*v_p2 + C6*v_p3
|
||||
funf(i+1, j, k) = C6*v_m2 + C5*v_m1 + C4*v_0 + C3*v_p1 + C2*v_p2 + C1*v_p3
|
||||
end if
|
||||
i = i + 2
|
||||
end do
|
||||
|
||||
if (i == imaxo) then
|
||||
ic = cix(i); px = pix(i)
|
||||
funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + WC(2,px)*tmp_xyz_line(ic-1) + &
|
||||
WC(3,px)*tmp_xyz_line(ic ) + WC(4,px)*tmp_xyz_line(ic+1) + &
|
||||
WC(5,px)*tmp_xyz_line(ic+2) + WC(6,px)*tmp_xyz_line(ic+3)
|
||||
end if
|
||||
end do
|
||||
end do
|
||||
return
|
||||
|
||||
end subroutine prolong3
|
||||
|
||||
Reference in New Issue
Block a user