perf(restrict3): shrink X-pass ii sweep to required overlap window
- compute fi_min/fi_max from output i-range and derive ii_lo/ii_hi
- replace full ii sweep (-1:extf(1)) with windowed sweep in Z/Y precompute passes
- keep stencil math unchanged; add bounds sanity check for ii window
This commit is contained in:
@@ -1955,11 +1955,11 @@
|
|||||||
|
|
||||||
real*8,dimension(3) :: CD,FD
|
real*8,dimension(3) :: CD,FD
|
||||||
real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
|
real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
|
||||||
real*8 :: tmp_xyz_line(-2:extc(1)) ! 包含 X 向 6 点模板访问所需下界
|
real*8 :: tmp_xyz_line(-2:extc(1)) ! 包含 X 向 6 点模板访问所需下界
|
||||||
real*8 :: v1, v2, v3, v4, v5, v6
|
real*8 :: v1, v2, v3, v4, v5, v6
|
||||||
integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max
|
integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max
|
||||||
real*8 :: res_line
|
real*8 :: res_line
|
||||||
real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2)) ! 包含 Y/X 向模板访问所需下界
|
real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2)) ! 包含 Y/X 向模板访问所需下界
|
||||||
if(wei.ne.3)then
|
if(wei.ne.3)then
|
||||||
write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
|
write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
|
||||||
write(*,*)"dim = ",wei
|
write(*,*)"dim = ",wei
|
||||||
@@ -2072,26 +2072,26 @@
|
|||||||
|
|
||||||
call symmetry_bd(3,extc,func,funcc,SoA)
|
call symmetry_bd(3,extc,func,funcc,SoA)
|
||||||
! 对每个 k(pz, kc 固定)预计算 Z 向插值的 2D 切片
|
! 对每个 k(pz, kc 固定)预计算 Z 向插值的 2D 切片
|
||||||
jc_min = minval(ciy(jmino:jmaxo))
|
jc_min = minval(ciy(jmino:jmaxo))
|
||||||
jc_max = maxval(ciy(jmino:jmaxo))
|
jc_max = maxval(ciy(jmino:jmaxo))
|
||||||
ic_min = minval(cix(imino:imaxo))
|
ic_min = minval(cix(imino:imaxo))
|
||||||
ic_max = maxval(cix(imino:imaxo))
|
ic_max = maxval(cix(imino:imaxo))
|
||||||
|
|
||||||
do k = kmino, kmaxo
|
do k = kmino, kmaxo
|
||||||
pz = piz(k); kc = ciz(k)
|
pz = piz(k); kc = ciz(k)
|
||||||
! --- Pass 1: Z 方向,只算一次 ---
|
! --- Pass 1: Z 方向,只算一次 ---
|
||||||
do iy = jc_min-2, jc_max+3 ! 仅需的 iy 范围(对应 jc-2:jc+3)
|
do iy = jc_min-2, jc_max+3 ! 仅需的 iy 范围(对应 jc-2:jc+3)
|
||||||
do ii = ic_min-2, ic_max+3 ! 仅需的 ii 范围(对应 cix-2:cix+3)
|
do ii = ic_min-2, ic_max+3 ! 仅需的 ii 范围(对应 cix-2:cix+3)
|
||||||
tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
|
tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
|
||||||
end do
|
end do
|
||||||
end do
|
end do
|
||||||
|
|
||||||
do j = jmino, jmaxo
|
do j = jmino, jmaxo
|
||||||
py = piy(j); jc = ciy(j)
|
py = piy(j); jc = ciy(j)
|
||||||
! --- Pass 2: Y 方向 ---
|
! --- Pass 2: Y 方向 ---
|
||||||
do ii = ic_min-2, ic_max+3
|
do ii = ic_min-2, ic_max+3
|
||||||
tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
|
tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
|
||||||
end do
|
end do
|
||||||
! --- Pass 3: X 方向 ---
|
! --- Pass 3: X 方向 ---
|
||||||
do i = imino, imaxo
|
do i = imino, imaxo
|
||||||
funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3))
|
funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3))
|
||||||
@@ -2352,9 +2352,10 @@ end do
|
|||||||
|
|
||||||
real*8,dimension(3) :: CD,FD
|
real*8,dimension(3) :: CD,FD
|
||||||
|
|
||||||
real*8 :: tmp_xz_plane(-1:extf(1), 6)
|
real*8 :: tmp_xz_plane(-1:extf(1), 6)
|
||||||
real*8 :: tmp_x_line(-1:extf(1))
|
real*8 :: tmp_x_line(-1:extf(1))
|
||||||
integer :: fi, fj, fk, ii, jj, kk
|
integer :: fi, fj, fk, ii, jj, kk
|
||||||
|
integer :: fi_min, fi_max, ii_lo, ii_hi
|
||||||
|
|
||||||
if(wei.ne.3)then
|
if(wei.ne.3)then
|
||||||
write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
|
write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
|
||||||
@@ -2436,6 +2437,18 @@ end do
|
|||||||
|
|
||||||
call symmetry_bd(2,extf,funf,funff,SoA)
|
call symmetry_bd(2,extf,funf,funff,SoA)
|
||||||
|
|
||||||
|
! 仅计算 X 向最终写回所需的窗口:
|
||||||
|
! func(i,j,k) 只访问 tmp_x_line(fi-2:fi+3)
|
||||||
|
fi_min = 2*(imino + lbc(1) - 1) - 1 - lbf(1) + 1
|
||||||
|
fi_max = 2*(imaxo + lbc(1) - 1) - 1 - lbf(1) + 1
|
||||||
|
ii_lo = fi_min - 2
|
||||||
|
ii_hi = fi_max + 3
|
||||||
|
if(ii_lo < -1 .or. ii_hi > extf(1))then
|
||||||
|
write(*,*)"restrict3: invalid ii window",ii_lo,ii_hi
|
||||||
|
write(*,*)"imino,imaxo,lbc(1),lbf(1),extf(1) = ",imino,imaxo,lbc(1),lbf(1),extf(1)
|
||||||
|
stop
|
||||||
|
endif
|
||||||
|
|
||||||
!~~~~~~> restriction start...
|
!~~~~~~> restriction start...
|
||||||
do k = kmino, kmaxo
|
do k = kmino, kmaxo
|
||||||
fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1
|
fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1
|
||||||
@@ -2446,7 +2459,7 @@ do k = kmino, kmaxo
|
|||||||
! 优化点 1: 显式展开 Z 方向计算,减少循环开销
|
! 优化点 1: 显式展开 Z 方向计算,减少循环开销
|
||||||
! 确保 ii 循环是最内层且连续访问
|
! 确保 ii 循环是最内层且连续访问
|
||||||
!DIR$ VECTOR ALWAYS
|
!DIR$ VECTOR ALWAYS
|
||||||
do ii = -1, extf(1)
|
do ii = ii_lo, ii_hi
|
||||||
! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
|
! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
|
||||||
! 这里直接硬编码 jj 的偏移,彻底消除一层循环
|
! 这里直接硬编码 jj 的偏移,彻底消除一层循环
|
||||||
tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
|
tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
|
||||||
@@ -2471,7 +2484,7 @@ do k = kmino, kmaxo
|
|||||||
|
|
||||||
! 优化点 2: 同样向量化 Y 方向压缩
|
! 优化点 2: 同样向量化 Y 方向压缩
|
||||||
!DIR$ VECTOR ALWAYS
|
!DIR$ VECTOR ALWAYS
|
||||||
do ii = -1, extf(1)
|
do ii = ii_lo, ii_hi
|
||||||
tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
|
tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
|
||||||
C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
|
C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
|
||||||
C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
|
C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
|
||||||
|
|||||||
Reference in New Issue
Block a user