From 83c826eb496b1493009564ce22a1daddc6615df0 Mon Sep 17 00:00:00 2001 From: jaunatisblue Date: Mon, 2 Mar 2026 21:20:49 +0800 Subject: [PATCH] =?UTF-8?q?prolong3:=20=E5=87=8F=E5=B0=91Z-pass=20?= =?UTF-8?q?=E5=86=97=E4=BD=99=E8=AE=A1=E7=AE=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AMSS_NCKU_source/prolongrestrict_cell.f90 | 112 +++++++++++++++++----- 1 file changed, 86 insertions(+), 26 deletions(-) diff --git a/AMSS_NCKU_source/prolongrestrict_cell.f90 b/AMSS_NCKU_source/prolongrestrict_cell.f90 index f1957df..6666321 100644 --- a/AMSS_NCKU_source/prolongrestrict_cell.f90 +++ b/AMSS_NCKU_source/prolongrestrict_cell.f90 @@ -1957,8 +1957,9 @@ real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果 real*8 :: tmp_xyz_line(extc(1)) ! 存储整条 X 线上完成 Y 向融合后的结果 real*8 :: v1, v2, v3, v4, v5, v6 - integer :: ic, jc, kc, ix_offset,ix,iy,iz + integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max real*8 :: res_line + real*8 :: tmp_z_slab(extc(1), extc(2)) ! 分配在 k 循环外 if(wei.ne.3)then write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension" write(*,*)"dim = ",wei @@ -2070,8 +2071,34 @@ endif call symmetry_bd(3,extc,func,funcc,SoA) - + ! 对每个 k(pz, kc 固定)预计算 Z 向插值的 2D 切片 +jc_min = minval(ciy(jmino:jmaxo)) +jc_max = maxval(ciy(jmino:jmaxo)) + +do k = kmino, kmaxo + pz = piz(k); kc = ciz(k) + ! --- Pass 1: Z 方向,只算一次 --- + do iy = jc_min-3, jc_max+3 ! 仅需的 iy 范围 + do ii = imini-3, imaxi+3 ! 仅需的 ii 范围 + tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3)) + end do + end do + + do j = jmino, jmaxo + py = piy(j); jc = ciy(j) + ! --- Pass 2: Y 方向 --- + do ii = imini-3, imaxi+3 + tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3)) + end do + ! --- Pass 3: X 方向 --- + do i = imino, imaxo + funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3)) + end do + end do +end do + !~~~~~~> prolongation start... +#if 0 do k = kmino, kmaxo pz = piz(k) kc = ciz(k) @@ -2105,28 +2132,7 @@ -#if 0 - ! 1. 【降维:Z 向】对当前 (j,k) 相关的 6 条 Y 偏置线进行 Z 向插值 - ! 结果存入 tmp_yz(x_index, y_offset) - do jj = 1, 6 - iy = jc - 3 + jj - do ii = 1, extc(1) - tmp_yz(ii, jj) = WC(1,pz)*funcc(ii, iy, kc-2) + & - WC(2,pz)*funcc(ii, iy, kc-1) + & - WC(3,pz)*funcc(ii, iy, kc ) + & - WC(4,pz)*funcc(ii, iy, kc+1) + & - WC(5,pz)*funcc(ii, iy, kc+2) + & - WC(6,pz)*funcc(ii, iy, kc+3) - end do - end do - ! 2. 【降维:Y 向】将 Z 向结果合并,得到整条 X 轴线上的 Y-Z 融合值 - do ii = 1, extc(1) - tmp_xyz_line(ii) = WC(1,py)*tmp_yz(ii, 1) + WC(2,py)*tmp_yz(ii, 2) + & - WC(3,py)*tmp_yz(ii, 3) + WC(4,py)*tmp_yz(ii, 4) + & - WC(5,py)*tmp_yz(ii, 5) + WC(6,py)*tmp_yz(ii, 6) - end do -#endif ! 3. 【降维:X 向】最后在最内层只处理 X 方向的 6 点加权 ! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次 do i = imino, imaxo @@ -2144,7 +2150,7 @@ end do end do end do - +#endif return end subroutine prolong3 @@ -2343,7 +2349,11 @@ integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo real*8,dimension(3) :: CD,FD - + + real*8 :: tmp_xz_plane(extf(1), 6) + real*8 :: tmp_x_line(extf(1)) + integer :: fi, fj, fk, ii, jj, kk + if(wei.ne.3)then write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension" write(*,*)"dim = ",wei @@ -2425,6 +2435,56 @@ call symmetry_bd(2,extf,funf,funff,SoA) !~~~~~~> restriction start... +do k = kmino, kmaxo + fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1 + + do j = jmino, jmaxo + fj = 2*(j + lbc(2) - 1) - 1 - lbf(2) + 1 + + ! 优化点 1: 显式展开 Z 方向计算,减少循环开销 + ! 确保 ii 循环是最内层且连续访问 + !DIR$ VECTOR ALWAYS + do ii = 1, extf(1) + ! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果 + ! 这里直接硬编码 jj 的偏移,彻底消除一层循环 + tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + & + C2*(funff(ii,fj-2,fk-1)+funff(ii,fj-2,fk+2)) + & + C3*(funff(ii,fj-2,fk )+funff(ii,fj-2,fk+1)) + tmp_xz_plane(ii, 2) = C1*(funff(ii,fj-1,fk-2)+funff(ii,fj-1,fk+3)) + & + C2*(funff(ii,fj-1,fk-1)+funff(ii,fj-1,fk+2)) + & + C3*(funff(ii,fj-1,fk )+funff(ii,fj-1,fk+1)) + tmp_xz_plane(ii, 3) = C1*(funff(ii,fj ,fk-2)+funff(ii,fj ,fk+3)) + & + C2*(funff(ii,fj ,fk-1)+funff(ii,fj ,fk+2)) + & + C3*(funff(ii,fj ,fk )+funff(ii,fj ,fk+1)) + tmp_xz_plane(ii, 4) = C1*(funff(ii,fj+1,fk-2)+funff(ii,fj+1,fk+3)) + & + C2*(funff(ii,fj+1,fk-1)+funff(ii,fj+1,fk+2)) + & + C3*(funff(ii,fj+1,fk )+funff(ii,fj+1,fk+1)) + tmp_xz_plane(ii, 5) = C1*(funff(ii,fj+2,fk-2)+funff(ii,fj+2,fk+3)) + & + C2*(funff(ii,fj+2,fk-1)+funff(ii,fj+2,fk+2)) + & + C3*(funff(ii,fj+2,fk )+funff(ii,fj+2,fk+1)) + tmp_xz_plane(ii, 6) = C1*(funff(ii,fj+3,fk-2)+funff(ii,fj+3,fk+3)) + & + C2*(funff(ii,fj+3,fk-1)+funff(ii,fj+3,fk+2)) + & + C3*(funff(ii,fj+3,fk )+funff(ii,fj+3,fk+1)) + end do + + ! 优化点 2: 同样向量化 Y 方向压缩 + !DIR$ VECTOR ALWAYS + do ii = 1, extf(1) + tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + & + C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + & + C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4)) + end do + + ! 优化点 3: 最终写入,利用已经缓存在 tmp_x_line 的数据 + do i = imino, imaxo + fi = 2*(i + lbc(1) - 1) - 1 - lbf(1) + 1 + func(i, j, k) = C1*(tmp_x_line(fi-2) + tmp_x_line(fi+3)) + & + C2*(tmp_x_line(fi-1) + tmp_x_line(fi+2)) + & + C3*(tmp_x_line(fi ) + tmp_x_line(fi+1)) + end do + end do +end do +#if 0 do k = kmino,kmaxo do j = jmino,jmaxo do i = imino,imaxo @@ -2448,7 +2508,7 @@ enddo enddo enddo - +#endif return end subroutine restrict3