From 23a82d063b6a1b1dbf84a9abbf8d1b362a9c5b30 Mon Sep 17 00:00:00 2001 From: jaunatisblue Date: Mon, 2 Mar 2026 01:16:10 +0800 Subject: [PATCH 01/13] =?UTF-8?q?=E5=AF=B9prolong3=E5=81=9A=E8=AE=BF?= =?UTF-8?q?=E5=AD=98=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AMSS_NCKU_source/prolongrestrict_cell.f90 | 397 ++++++++++++---------- 1 file changed, 216 insertions(+), 181 deletions(-) diff --git a/AMSS_NCKU_source/prolongrestrict_cell.f90 b/AMSS_NCKU_source/prolongrestrict_cell.f90 index b62e722..b879953 100644 --- a/AMSS_NCKU_source/prolongrestrict_cell.f90 +++ b/AMSS_NCKU_source/prolongrestrict_cell.f90 @@ -1912,7 +1912,7 @@ ! 63/8192*f_6 - 495/8192*f_5 + 1155/4096*f_4 !-------------------------------------------------------------------------- #if 1 - subroutine prolong3(wei,llbc,uubc,extc,func,& + subroutine prolong3(wei,llbc,uubc,extc,func,& llbf,uubf,extf,funf,& llbp,uubp,SoA,Symmetry) implicit none @@ -1932,29 +1932,37 @@ real*8, dimension(1:3) :: base integer,dimension(3) :: lbc,ubc,lbf,ubf,lbp,ubp,lbpc,ubpc ! when if=1 -> ic=0, this is different to vertex center grid - real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3)) :: funcc - integer,dimension(3) :: cxI - integer :: i,j,k,ii,jj,kk,px,py,pz - real*8, dimension(6,6) :: tmp2 - real*8, dimension(6) :: tmp1 - integer, dimension(extf(1)) :: cix - integer, dimension(extf(2)) :: ciy - integer, dimension(extf(3)) :: ciz - integer, dimension(extf(1)) :: pix - integer, dimension(extf(2)) :: piy - integer, dimension(extf(3)) :: piz - - real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3 - real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3 - real*8, dimension(6,2), parameter :: WC = reshape((/& - C1,C2,C3,C4,C5,C6,& - C6,C5,C4,C3,C2,C1/), (/6,2/)) - - integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi - integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo - integer::maxcx,maxcy,maxcz - - real*8,dimension(3) :: CD,FD + real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3)) :: funcc + integer,dimension(3) :: cxI + integer :: i,j,k,ii,jj,kk,px,py,pz + real*8, dimension(6,6) :: tmp2 + real*8, dimension(6) :: tmp1 + integer, dimension(extf(1)) :: cix + integer, dimension(extf(2)) :: ciy + integer, dimension(extf(3)) :: ciz + integer, dimension(extf(1)) :: pix + integer, dimension(extf(2)) :: piy + integer, dimension(extf(3)) :: piz + +! ~~~> 在子程序开头新增声明 <~~~ + real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果 + real*8 :: tmp_xyz_line(extc(1)) ! 存储整条 X 线上完成 Y 向融合后的结果 + real*8 :: v1, v2, v3, v4, v5, v6 + integer :: ic, jc, kc, iy, ix_offset + real*8 :: v_m2, v_m1, v_0, v_p1, v_p2, v_p3 + real*8 :: res_line +! ~~~> 声明end <~~~ + real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3 + real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3 + real*8, dimension(6,2), parameter :: WC = reshape((/& + C1,C2,C3,C4,C5,C6,& + C6,C5,C4,C3,C2,C1/), (/6,2/)) + + integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi + integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo + integer::maxcx,maxcy,maxcz + + real*8,dimension(3) :: CD,FD if(wei.ne.3)then write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension" @@ -2013,10 +2021,10 @@ kmini=lbpc(3)-lbc(3) + 1 kmaxi=ubpc(3)-lbc(3) + 1 - if(imino.lt.1.or.jmino.lt.1.or.kmino.lt.1.or.& - imini.lt.1.or.jmini.lt.1.or.kmini.lt.1.or.& - imaxo.gt.extf(1).or.jmaxo.gt.extf(2).or.kmaxo.gt.extf(3).or.& - imaxi.gt.extc(1)-2.or.jmaxi.gt.extc(2)-2.or.kmaxi.gt.extc(3)-2)then + if(imino.lt.1.or.jmino.lt.1.or.kmino.lt.1.or.& + imini.lt.1.or.jmini.lt.1.or.kmini.lt.1.or.& + imaxo.gt.extf(1).or.jmaxo.gt.extf(2).or.kmaxo.gt.extf(3).or.& + imaxi.gt.extc(1)-2.or.jmaxi.gt.extc(2)-2.or.kmaxi.gt.extc(3)-2)then write(*,*)"error in prolongation for" write(*,*)"from" write(*,*)llbc,uubc @@ -2027,161 +2035,188 @@ write(*,*)"want" write(*,*)llbp,uubp write(*,*)lbp,ubp,lbpc,ubpc - return - endif - - do i = imino,imaxo - ii = i + lbf(1) - 1 - cix(i) = ii/2 - lbc(1) + 1 - if(ii/2*2 == ii)then - pix(i) = 1 - else - pix(i) = 2 - endif - enddo - do j = jmino,jmaxo - jj = j + lbf(2) - 1 - ciy(j) = jj/2 - lbc(2) + 1 - if(jj/2*2 == jj)then - piy(j) = 1 - else - piy(j) = 2 - endif - enddo - do k = kmino,kmaxo - kk = k + lbf(3) - 1 - ciz(k) = kk/2 - lbc(3) + 1 - if(kk/2*2 == kk)then - piz(k) = 1 - else - piz(k) = 2 - endif - enddo - - maxcx = maxval(cix(imino:imaxo)) - maxcy = maxval(ciy(jmino:jmaxo)) - maxcz = maxval(ciz(kmino:kmaxo)) - if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then - write(*,*)"error in prolong" - return - endif - - call symmetry_bd(3,extc,func,funcc,SoA) - -!~~~~~~> prolongation start... - do k = kmino,kmaxo - do j = jmino,jmaxo - do i = imino,imaxo - cxI(1) = cix(i) - cxI(2) = ciy(j) - cxI(3) = ciz(k) - px = pix(i) - py = piy(j) - pz = piz(k) -#if 0 - if(ii/2*2==ii)then - if(jj/2*2==jj)then - if(kk/2*2==kk)then - tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+& - C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+& - C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+& - C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+& - C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+& - C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3) - tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6) - funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6) - else - tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+& - C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+& - C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+& - C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+& - C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+& - C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3) - tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6) - funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6) - endif - else - if(kk/2*2==kk)then - tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+& - C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+& - C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+& - C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+& - C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+& - C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3) - tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6) - funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6) - else - tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+& - C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+& - C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+& - C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+& - C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+& - C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3) - tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6) - funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6) - endif - endif - else - if(jj/2*2==jj)then - if(kk/2*2==kk)then - tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+& - C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+& - C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+& - C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+& - C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+& - C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3) - tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6) - funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6) - else - tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+& - C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+& - C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+& - C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+& - C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+& - C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3) - tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6) - funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6) - endif - else - if(kk/2*2==kk)then - tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+& - C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+& - C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+& - C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+& - C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+& - C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3) - tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6) - funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6) - else - tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+& - C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+& - C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+& - C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+& - C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+& - C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3) - tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6) - funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6) - endif - endif - endif -#else - tmp2= WC(1,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+& - WC(2,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+& - WC(3,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+& - WC(4,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+& - WC(5,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+& - WC(6,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3) - - tmp1= WC(1,py)*tmp2(:,1)+WC(2,py)*tmp2(:,2)+WC(3,py)*tmp2(:,3)+& - WC(4,py)*tmp2(:,4)+WC(5,py)*tmp2(:,5)+WC(6,py)*tmp2(:,6) - - funf(i,j,k)= WC(1,px)*tmp1(1)+WC(2,px)*tmp1(2)+WC(3,px)*tmp1(3)+& - WC(4,px)*tmp1(4)+WC(5,px)*tmp1(5)+WC(6,px)*tmp1(6) -#endif - enddo - enddo + return + endif + + do i = imino,imaxo + ii = i + lbf(1) - 1 + cix(i) = ii/2 - lbc(1) + 1 + if(ii/2*2 == ii)then + pix(i) = 1 + else + pix(i) = 2 + endif + enddo + do j = jmino,jmaxo + jj = j + lbf(2) - 1 + ciy(j) = jj/2 - lbc(2) + 1 + if(jj/2*2 == jj)then + piy(j) = 1 + else + piy(j) = 2 + endif + enddo + do k = kmino,kmaxo + kk = k + lbf(3) - 1 + ciz(k) = kk/2 - lbc(3) + 1 + if(kk/2*2 == kk)then + piz(k) = 1 + else + piz(k) = 2 + endif enddo + maxcx = maxval(cix(imino:imaxo)) + maxcy = maxval(ciy(jmino:jmaxo)) + maxcz = maxval(ciz(kmino:kmaxo)) + if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then + write(*,*)"error in prolong" + return + endif + + call symmetry_bd(3,extc,func,funcc,SoA) + +!~~~~~~> prolongation start... + ! ~~~> 循环体开始 <~~~ +#if 0 + do k = kmino, kmaxo + pz = piz(k) + kc = ciz(k) + + do j = jmino, jmaxo + py = piy(j) + jc = ciy(j) + + ! 1. 【降维:Z 向】对当前 (j,k) 相关的 6 条 Y 偏置线进行 Z 向插值 + ! 结果存入 tmp_yz(x_index, y_offset) + do jj = 1, 6 + iy = jc - 3 + jj + do ii = 1, extc(1) + tmp_yz(ii, jj) = WC(1,pz)*funcc(ii, iy, kc-2) + & + WC(2,pz)*funcc(ii, iy, kc-1) + & + WC(3,pz)*funcc(ii, iy, kc ) + & + WC(4,pz)*funcc(ii, iy, kc+1) + & + WC(5,pz)*funcc(ii, iy, kc+2) + & + WC(6,pz)*funcc(ii, iy, kc+3) + end do + end do + + ! 2. 【降维:Y 向】将 Z 向结果合并,得到整条 X 轴线上的 Y-Z 融合值 + do ii = 1, extc(1) + tmp_xyz_line(ii) = WC(1,py)*tmp_yz(ii, 1) + WC(2,py)*tmp_yz(ii, 2) + & + WC(3,py)*tmp_yz(ii, 3) + WC(4,py)*tmp_yz(ii, 4) + & + WC(5,py)*tmp_yz(ii, 5) + WC(6,py)*tmp_yz(ii, 6) + end do + + i = imino + do while (i < imaxo) + ! 针对 i + ic = cix(i) + v_m2 = tmp_xyz_line(ic-2); v_m1 = tmp_xyz_line(ic-1) + v_0 = tmp_xyz_line(ic ); v_p1 = tmp_xyz_line(ic+1) + v_p2 = tmp_xyz_line(ic+2); v_p3 = tmp_xyz_line(ic+3) + + ! 这里的 WC(1:6, pix(i)) 被直接硬编码 + ! 注意:根据你的 pix 定义,1 和 2 分别对应 C1..C6 和 C6..C1 + if (pix(i) == 2) then + funf(i, j, k) = C6*v_m2 + C5*v_m1 + C4*v_0 + C3*v_p1 + C2*v_p2 + C1*v_p3 + funf(i+1, j, k) = C1*v_m2 + C2*v_m1 + C3*v_0 + C4*v_p1 + C5*v_p2 + C6*v_p3 + else + funf(i, j, k) = C1*v_m2 + C2*v_m1 + C3*v_0 + C4*v_p1 + C5*v_p2 + C6*v_p3 + funf(i+1, j, k) = C6*v_m2 + C5*v_m1 + C4*v_0 + C3*v_p1 + C2*v_p2 + C1*v_p3 + end if + + i = i + 2 + end do + + ! 处理可能存在的最后一个孤立点 + if (i == imaxo) then + ic = cix(i); px = pix(i) + funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + WC(2,px)*tmp_xyz_line(ic-1) + & + WC(3,px)*tmp_xyz_line(ic ) + WC(4,px)*tmp_xyz_line(ic+1) + & + WC(5,px)*tmp_xyz_line(ic+2) + WC(6,px)*tmp_xyz_line(ic+3) + end if + + + +#if 0 + do i = imino, imaxo + px = pix(i) + ic = cix(i) + + ! 直接从预计算好的 line 中读取连续的 6 个点 + ! ic-2 到 ic+3 对应原始 6 点算子 + funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + & + WC(2,px)*tmp_xyz_line(ic-1) + & + WC(3,px)*tmp_xyz_line(ic ) + & + WC(4,px)*tmp_xyz_line(ic+1) + & + WC(5,px)*tmp_xyz_line(ic+2) + & + WC(6,px)*tmp_xyz_line(ic+3) + end do +#endif + end do + end do +#endif + + +! ~~~> 修改后的核心循环块 <~~~ + do k = kmino, kmaxo + pz = piz(k) + kc = ciz(k) + + do j = jmino, jmaxo + py = piy(j) + jc = ciy(j) + + ! --- 步骤 1 & 2 融合:分段处理 X 轴,提升 Cache 命中率 --- + ! 我们将 ii 循环逻辑重组,减少对 funcc 的跨行重复访问 + do ii = 1, extc(1) + ! 1. 先做 Z 方向的 6 条线插值(针对当前的 ii 和当前的 6 个 iy) + ! 我们直接在这里把 Y 方向的加权也做了,省去 tmp_yz 数组 + ! 这样 funcc 的数据读进来后立即完成所有维度的贡献,不再写回内存 + + res_line = 0.0d0 + do jj = 1, 6 + iy = jc - 3 + jj + ! 这一行代码是核心:一次性完成 Z 插值并加上 Y 的权重 + ! 编译器会把 WC(jj, py) 存在寄存器里 + res_line = res_line + WC(jj, py) * ( & + WC(1, pz) * funcc(ii, iy, kc-2) + & + WC(2, pz) * funcc(ii, iy, kc-1) + & + WC(3, pz) * funcc(ii, iy, kc ) + & + WC(4, pz) * funcc(ii, iy, kc+1) + & + WC(5, pz) * funcc(ii, iy, kc+2) + & + WC(6, pz) * funcc(ii, iy, kc+3) ) + end do + tmp_xyz_line(ii) = res_line + end do + + ! --- 步骤 3:X 向插值(保持你现在的双点展开,这部分已经很快了) --- + i = imino + do while (i < imaxo) + ic = cix(i) + v_m2 = tmp_xyz_line(ic-2); v_m1 = tmp_xyz_line(ic-1) + v_0 = tmp_xyz_line(ic ); v_p1 = tmp_xyz_line(ic+1) + v_p2 = tmp_xyz_line(ic+2); v_p3 = tmp_xyz_line(ic+3) + + if (pix(i) == 2) then + funf(i, j, k) = C6*v_m2 + C5*v_m1 + C4*v_0 + C3*v_p1 + C2*v_p2 + C1*v_p3 + funf(i+1, j, k) = C1*v_m2 + C2*v_m1 + C3*v_0 + C4*v_p1 + C5*v_p2 + C6*v_p3 + else + funf(i, j, k) = C1*v_m2 + C2*v_m1 + C3*v_0 + C4*v_p1 + C5*v_p2 + C6*v_p3 + funf(i+1, j, k) = C6*v_m2 + C5*v_m1 + C4*v_0 + C3*v_p1 + C2*v_p2 + C1*v_p3 + end if + i = i + 2 + end do + + if (i == imaxo) then + ic = cix(i); px = pix(i) + funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + WC(2,px)*tmp_xyz_line(ic-1) + & + WC(3,px)*tmp_xyz_line(ic ) + WC(4,px)*tmp_xyz_line(ic+1) + & + WC(5,px)*tmp_xyz_line(ic+2) + WC(6,px)*tmp_xyz_line(ic+3) + end if + end do + end do return end subroutine prolong3 From 75dd5353b03ad0d90b998a5dd7ad81ddbfb298ec Mon Sep 17 00:00:00 2001 From: jaunatisblue Date: Mon, 2 Mar 2026 02:01:07 +0800 Subject: [PATCH 02/13] =?UTF-8?q?=E4=BF=AE=E6=94=B9prolong?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AMSS_NCKU_source/prolongrestrict_cell.f90 | 115 ++-------------------- 1 file changed, 9 insertions(+), 106 deletions(-) diff --git a/AMSS_NCKU_source/prolongrestrict_cell.f90 b/AMSS_NCKU_source/prolongrestrict_cell.f90 index b879953..8593cad 100644 --- a/AMSS_NCKU_source/prolongrestrict_cell.f90 +++ b/AMSS_NCKU_source/prolongrestrict_cell.f90 @@ -1912,7 +1912,7 @@ ! 63/8192*f_6 - 495/8192*f_5 + 1155/4096*f_4 !-------------------------------------------------------------------------- #if 1 - subroutine prolong3(wei,llbc,uubc,extc,func,& + subroutine prolong3(wei,llbc,uubc,extc,func,& llbf,uubf,extf,funf,& llbp,uubp,SoA,Symmetry) implicit none @@ -1944,14 +1944,6 @@ integer, dimension(extf(2)) :: piy integer, dimension(extf(3)) :: piz -! ~~~> 在子程序开头新增声明 <~~~ - real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果 - real*8 :: tmp_xyz_line(extc(1)) ! 存储整条 X 线上完成 Y 向融合后的结果 - real*8 :: v1, v2, v3, v4, v5, v6 - integer :: ic, jc, kc, iy, ix_offset - real*8 :: v_m2, v_m1, v_0, v_p1, v_p2, v_p3 - real*8 :: res_line -! ~~~> 声明end <~~~ real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3 real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3 real*8, dimension(6,2), parameter :: WC = reshape((/& @@ -1963,7 +1955,11 @@ integer::maxcx,maxcy,maxcz real*8,dimension(3) :: CD,FD - + real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果 + real*8 :: tmp_xyz_line(extc(1)) ! 存储整条 X 线上完成 Y 向融合后的结果 + real*8 :: v1, v2, v3, v4, v5, v6 + integer :: ic, jc, kc, ix_offset,ix,iy,iz + if(wei.ne.3)then write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension" write(*,*)"dim = ",wei @@ -2077,9 +2073,7 @@ call symmetry_bd(3,extc,func,funcc,SoA) !~~~~~~> prolongation start... - ! ~~~> 循环体开始 <~~~ -#if 0 - do k = kmino, kmaxo + do k = kmino, kmaxo pz = piz(k) kc = ciz(k) @@ -2108,38 +2102,8 @@ WC(5,py)*tmp_yz(ii, 5) + WC(6,py)*tmp_yz(ii, 6) end do - i = imino - do while (i < imaxo) - ! 针对 i - ic = cix(i) - v_m2 = tmp_xyz_line(ic-2); v_m1 = tmp_xyz_line(ic-1) - v_0 = tmp_xyz_line(ic ); v_p1 = tmp_xyz_line(ic+1) - v_p2 = tmp_xyz_line(ic+2); v_p3 = tmp_xyz_line(ic+3) - - ! 这里的 WC(1:6, pix(i)) 被直接硬编码 - ! 注意:根据你的 pix 定义,1 和 2 分别对应 C1..C6 和 C6..C1 - if (pix(i) == 2) then - funf(i, j, k) = C6*v_m2 + C5*v_m1 + C4*v_0 + C3*v_p1 + C2*v_p2 + C1*v_p3 - funf(i+1, j, k) = C1*v_m2 + C2*v_m1 + C3*v_0 + C4*v_p1 + C5*v_p2 + C6*v_p3 - else - funf(i, j, k) = C1*v_m2 + C2*v_m1 + C3*v_0 + C4*v_p1 + C5*v_p2 + C6*v_p3 - funf(i+1, j, k) = C6*v_m2 + C5*v_m1 + C4*v_0 + C3*v_p1 + C2*v_p2 + C1*v_p3 - end if - - i = i + 2 - end do - - ! 处理可能存在的最后一个孤立点 - if (i == imaxo) then - ic = cix(i); px = pix(i) - funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + WC(2,px)*tmp_xyz_line(ic-1) + & - WC(3,px)*tmp_xyz_line(ic ) + WC(4,px)*tmp_xyz_line(ic+1) + & - WC(5,px)*tmp_xyz_line(ic+2) + WC(6,px)*tmp_xyz_line(ic+3) - end if - - - -#if 0 + ! 3. 【降维:X 向】最后在最内层只处理 X 方向的 6 点加权 + ! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次 do i = imino, imaxo px = pix(i) ic = cix(i) @@ -2153,70 +2117,9 @@ WC(5,px)*tmp_xyz_line(ic+2) + & WC(6,px)*tmp_xyz_line(ic+3) end do -#endif end do end do -#endif - -! ~~~> 修改后的核心循环块 <~~~ - do k = kmino, kmaxo - pz = piz(k) - kc = ciz(k) - - do j = jmino, jmaxo - py = piy(j) - jc = ciy(j) - - ! --- 步骤 1 & 2 融合:分段处理 X 轴,提升 Cache 命中率 --- - ! 我们将 ii 循环逻辑重组,减少对 funcc 的跨行重复访问 - do ii = 1, extc(1) - ! 1. 先做 Z 方向的 6 条线插值(针对当前的 ii 和当前的 6 个 iy) - ! 我们直接在这里把 Y 方向的加权也做了,省去 tmp_yz 数组 - ! 这样 funcc 的数据读进来后立即完成所有维度的贡献,不再写回内存 - - res_line = 0.0d0 - do jj = 1, 6 - iy = jc - 3 + jj - ! 这一行代码是核心:一次性完成 Z 插值并加上 Y 的权重 - ! 编译器会把 WC(jj, py) 存在寄存器里 - res_line = res_line + WC(jj, py) * ( & - WC(1, pz) * funcc(ii, iy, kc-2) + & - WC(2, pz) * funcc(ii, iy, kc-1) + & - WC(3, pz) * funcc(ii, iy, kc ) + & - WC(4, pz) * funcc(ii, iy, kc+1) + & - WC(5, pz) * funcc(ii, iy, kc+2) + & - WC(6, pz) * funcc(ii, iy, kc+3) ) - end do - tmp_xyz_line(ii) = res_line - end do - - ! --- 步骤 3:X 向插值(保持你现在的双点展开,这部分已经很快了) --- - i = imino - do while (i < imaxo) - ic = cix(i) - v_m2 = tmp_xyz_line(ic-2); v_m1 = tmp_xyz_line(ic-1) - v_0 = tmp_xyz_line(ic ); v_p1 = tmp_xyz_line(ic+1) - v_p2 = tmp_xyz_line(ic+2); v_p3 = tmp_xyz_line(ic+3) - - if (pix(i) == 2) then - funf(i, j, k) = C6*v_m2 + C5*v_m1 + C4*v_0 + C3*v_p1 + C2*v_p2 + C1*v_p3 - funf(i+1, j, k) = C1*v_m2 + C2*v_m1 + C3*v_0 + C4*v_p1 + C5*v_p2 + C6*v_p3 - else - funf(i, j, k) = C1*v_m2 + C2*v_m1 + C3*v_0 + C4*v_p1 + C5*v_p2 + C6*v_p3 - funf(i+1, j, k) = C6*v_m2 + C5*v_m1 + C4*v_0 + C3*v_p1 + C2*v_p2 + C1*v_p3 - end if - i = i + 2 - end do - - if (i == imaxo) then - ic = cix(i); px = pix(i) - funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + WC(2,px)*tmp_xyz_line(ic-1) + & - WC(3,px)*tmp_xyz_line(ic ) + WC(4,px)*tmp_xyz_line(ic+1) + & - WC(5,px)*tmp_xyz_line(ic+2) + WC(6,px)*tmp_xyz_line(ic+3) - end if - end do - end do return end subroutine prolong3 From f70e90f694ab5a2bcf6c4208978f9f12d18a3f28 Mon Sep 17 00:00:00 2001 From: jaunatisblue Date: Mon, 2 Mar 2026 10:31:46 +0800 Subject: [PATCH 03/13] =?UTF-8?q?prolong3=EF=BC=9A=E6=8F=90=E5=8D=87cache?= =?UTF-8?q?=E5=91=BD=E4=B8=AD=E7=8E=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AMSS_NCKU_source/prolongrestrict_cell.f90 | 30 +++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/AMSS_NCKU_source/prolongrestrict_cell.f90 b/AMSS_NCKU_source/prolongrestrict_cell.f90 index 8593cad..9074b26 100644 --- a/AMSS_NCKU_source/prolongrestrict_cell.f90 +++ b/AMSS_NCKU_source/prolongrestrict_cell.f90 @@ -1959,7 +1959,7 @@ real*8 :: tmp_xyz_line(extc(1)) ! 存储整条 X 线上完成 Y 向融合后的结果 real*8 :: v1, v2, v3, v4, v5, v6 integer :: ic, jc, kc, ix_offset,ix,iy,iz - + real*8 :: res_line if(wei.ne.3)then write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension" write(*,*)"dim = ",wei @@ -2081,6 +2081,32 @@ py = piy(j) jc = ciy(j) +! --- 步骤 1 & 2 融合:分段处理 X 轴,提升 Cache 命中率 --- + ! 我们将 ii 循环逻辑重组,减少对 funcc 的跨行重复访问 + do ii = 1, extc(1) + ! 1. 先做 Z 方向的 6 条线插值(针对当前的 ii 和当前的 6 个 iy) + ! 我们直接在这里把 Y 方向的加权也做了,省去 tmp_yz 数组 + ! 这样 funcc 的数据读进来后立即完成所有维度的贡献,不再写回内存 + + res_line = 0.0d0 + do jj = 1, 6 + iy = jc - 3 + jj + ! 这一行代码是核心:一次性完成 Z 插值并加上 Y 的权重 + ! 编译器会把 WC(jj, py) 存在寄存器里 + res_line = res_line + WC(jj, py) * ( & + WC(1, pz) * funcc(ii, iy, kc-2) + & + WC(2, pz) * funcc(ii, iy, kc-1) + & + WC(3, pz) * funcc(ii, iy, kc ) + & + WC(4, pz) * funcc(ii, iy, kc+1) + & + WC(5, pz) * funcc(ii, iy, kc+2) + & + WC(6, pz) * funcc(ii, iy, kc+3) ) + end do + tmp_xyz_line(ii) = res_line + end do + + + +#if 0 ! 1. 【降维:Z 向】对当前 (j,k) 相关的 6 条 Y 偏置线进行 Z 向插值 ! 结果存入 tmp_yz(x_index, y_offset) do jj = 1, 6 @@ -2101,7 +2127,7 @@ WC(3,py)*tmp_yz(ii, 3) + WC(4,py)*tmp_yz(ii, 4) + & WC(5,py)*tmp_yz(ii, 5) + WC(6,py)*tmp_yz(ii, 6) end do - +#endif ! 3. 【降维:X 向】最后在最内层只处理 X 方向的 6 点加权 ! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次 do i = imino, imaxo From e11363e06e4b634cbcb0c05e81c73689f8f3a22e Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Mon, 2 Mar 2026 03:21:21 +0800 Subject: [PATCH 04/13] Optimize fdderivs: skip redundant 2nd-order work in 4th-order overlap --- AMSS_NCKU_source/fdderivs_c.C | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/AMSS_NCKU_source/fdderivs_c.C b/AMSS_NCKU_source/fdderivs_c.C index 5e2f298..4ae31d4 100644 --- a/AMSS_NCKU_source/fdderivs_c.C +++ b/AMSS_NCKU_source/fdderivs_c.C @@ -141,12 +141,26 @@ void fdderivs(const int ex[3], const int j4_hi = ex2 - 3; const int k4_hi = ex3 - 3; + /* + * Strategy A: + * Avoid redundant work in overlap of 2nd/4th-order regions. + * Only compute 2nd-order on shell points that are NOT overwritten by + * the 4th-order pass. + */ + const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi); + if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) { for (int k0 = k2_lo; k0 <= k2_hi; ++k0) { const int kF = k0 + 1; for (int j0 = j2_lo; j0 <= j2_hi; ++j0) { const int jF = j0 + 1; for (int i0 = i2_lo; i0 <= i2_hi; ++i0) { + if (has4 && + i0 >= i4_lo && i0 <= i4_hi && + j0 >= j4_lo && j0 <= j4_hi && + k0 >= k4_lo && k0 <= k4_hi) { + continue; + } const int iF = i0 + 1; const size_t p = idx_ex(i0, j0, k0, ex); @@ -193,7 +207,7 @@ void fdderivs(const int ex[3], } } - if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) { + if (has4) { for (int k0 = k4_lo; k0 <= k4_hi; ++k0) { const int kF = k0 + 1; for (int j0 = j4_lo; j0 <= j4_hi; ++j0) { From 61ccef9f978bb82d97ec9343e58db9fdc9fa3d90 Mon Sep 17 00:00:00 2001 From: jaunatisblue Date: Mon, 2 Mar 2026 21:20:49 +0800 Subject: [PATCH 05/13] =?UTF-8?q?prolong3:=20=E5=87=8F=E5=B0=91Z-pass=20?= =?UTF-8?q?=E5=86=97=E4=BD=99=E8=AE=A1=E7=AE=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AMSS_NCKU_source/prolongrestrict_cell.f90 | 112 +++++++++++++++++----- 1 file changed, 86 insertions(+), 26 deletions(-) diff --git a/AMSS_NCKU_source/prolongrestrict_cell.f90 b/AMSS_NCKU_source/prolongrestrict_cell.f90 index 9074b26..c1efb36 100644 --- a/AMSS_NCKU_source/prolongrestrict_cell.f90 +++ b/AMSS_NCKU_source/prolongrestrict_cell.f90 @@ -1958,8 +1958,9 @@ real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果 real*8 :: tmp_xyz_line(extc(1)) ! 存储整条 X 线上完成 Y 向融合后的结果 real*8 :: v1, v2, v3, v4, v5, v6 - integer :: ic, jc, kc, ix_offset,ix,iy,iz + integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max real*8 :: res_line + real*8 :: tmp_z_slab(extc(1), extc(2)) ! 分配在 k 循环外 if(wei.ne.3)then write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension" write(*,*)"dim = ",wei @@ -2071,8 +2072,34 @@ endif call symmetry_bd(3,extc,func,funcc,SoA) - + ! 对每个 k(pz, kc 固定)预计算 Z 向插值的 2D 切片 +jc_min = minval(ciy(jmino:jmaxo)) +jc_max = maxval(ciy(jmino:jmaxo)) + +do k = kmino, kmaxo + pz = piz(k); kc = ciz(k) + ! --- Pass 1: Z 方向,只算一次 --- + do iy = jc_min-3, jc_max+3 ! 仅需的 iy 范围 + do ii = imini-3, imaxi+3 ! 仅需的 ii 范围 + tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3)) + end do + end do + + do j = jmino, jmaxo + py = piy(j); jc = ciy(j) + ! --- Pass 2: Y 方向 --- + do ii = imini-3, imaxi+3 + tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3)) + end do + ! --- Pass 3: X 方向 --- + do i = imino, imaxo + funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3)) + end do + end do +end do + !~~~~~~> prolongation start... +#if 0 do k = kmino, kmaxo pz = piz(k) kc = ciz(k) @@ -2106,28 +2133,7 @@ -#if 0 - ! 1. 【降维:Z 向】对当前 (j,k) 相关的 6 条 Y 偏置线进行 Z 向插值 - ! 结果存入 tmp_yz(x_index, y_offset) - do jj = 1, 6 - iy = jc - 3 + jj - do ii = 1, extc(1) - tmp_yz(ii, jj) = WC(1,pz)*funcc(ii, iy, kc-2) + & - WC(2,pz)*funcc(ii, iy, kc-1) + & - WC(3,pz)*funcc(ii, iy, kc ) + & - WC(4,pz)*funcc(ii, iy, kc+1) + & - WC(5,pz)*funcc(ii, iy, kc+2) + & - WC(6,pz)*funcc(ii, iy, kc+3) - end do - end do - ! 2. 【降维:Y 向】将 Z 向结果合并,得到整条 X 轴线上的 Y-Z 融合值 - do ii = 1, extc(1) - tmp_xyz_line(ii) = WC(1,py)*tmp_yz(ii, 1) + WC(2,py)*tmp_yz(ii, 2) + & - WC(3,py)*tmp_yz(ii, 3) + WC(4,py)*tmp_yz(ii, 4) + & - WC(5,py)*tmp_yz(ii, 5) + WC(6,py)*tmp_yz(ii, 6) - end do -#endif ! 3. 【降维:X 向】最后在最内层只处理 X 方向的 6 点加权 ! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次 do i = imino, imaxo @@ -2145,7 +2151,7 @@ end do end do end do - +#endif return end subroutine prolong3 @@ -2344,7 +2350,11 @@ integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo real*8,dimension(3) :: CD,FD - + + real*8 :: tmp_xz_plane(extf(1), 6) + real*8 :: tmp_x_line(extf(1)) + integer :: fi, fj, fk, ii, jj, kk + if(wei.ne.3)then write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension" write(*,*)"dim = ",wei @@ -2426,6 +2436,56 @@ call symmetry_bd(2,extf,funf,funff,SoA) !~~~~~~> restriction start... +do k = kmino, kmaxo + fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1 + + do j = jmino, jmaxo + fj = 2*(j + lbc(2) - 1) - 1 - lbf(2) + 1 + + ! 优化点 1: 显式展开 Z 方向计算,减少循环开销 + ! 确保 ii 循环是最内层且连续访问 + !DIR$ VECTOR ALWAYS + do ii = 1, extf(1) + ! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果 + ! 这里直接硬编码 jj 的偏移,彻底消除一层循环 + tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + & + C2*(funff(ii,fj-2,fk-1)+funff(ii,fj-2,fk+2)) + & + C3*(funff(ii,fj-2,fk )+funff(ii,fj-2,fk+1)) + tmp_xz_plane(ii, 2) = C1*(funff(ii,fj-1,fk-2)+funff(ii,fj-1,fk+3)) + & + C2*(funff(ii,fj-1,fk-1)+funff(ii,fj-1,fk+2)) + & + C3*(funff(ii,fj-1,fk )+funff(ii,fj-1,fk+1)) + tmp_xz_plane(ii, 3) = C1*(funff(ii,fj ,fk-2)+funff(ii,fj ,fk+3)) + & + C2*(funff(ii,fj ,fk-1)+funff(ii,fj ,fk+2)) + & + C3*(funff(ii,fj ,fk )+funff(ii,fj ,fk+1)) + tmp_xz_plane(ii, 4) = C1*(funff(ii,fj+1,fk-2)+funff(ii,fj+1,fk+3)) + & + C2*(funff(ii,fj+1,fk-1)+funff(ii,fj+1,fk+2)) + & + C3*(funff(ii,fj+1,fk )+funff(ii,fj+1,fk+1)) + tmp_xz_plane(ii, 5) = C1*(funff(ii,fj+2,fk-2)+funff(ii,fj+2,fk+3)) + & + C2*(funff(ii,fj+2,fk-1)+funff(ii,fj+2,fk+2)) + & + C3*(funff(ii,fj+2,fk )+funff(ii,fj+2,fk+1)) + tmp_xz_plane(ii, 6) = C1*(funff(ii,fj+3,fk-2)+funff(ii,fj+3,fk+3)) + & + C2*(funff(ii,fj+3,fk-1)+funff(ii,fj+3,fk+2)) + & + C3*(funff(ii,fj+3,fk )+funff(ii,fj+3,fk+1)) + end do + + ! 优化点 2: 同样向量化 Y 方向压缩 + !DIR$ VECTOR ALWAYS + do ii = 1, extf(1) + tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + & + C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + & + C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4)) + end do + + ! 优化点 3: 最终写入,利用已经缓存在 tmp_x_line 的数据 + do i = imino, imaxo + fi = 2*(i + lbc(1) - 1) - 1 - lbf(1) + 1 + func(i, j, k) = C1*(tmp_x_line(fi-2) + tmp_x_line(fi+3)) + & + C2*(tmp_x_line(fi-1) + tmp_x_line(fi+2)) + & + C3*(tmp_x_line(fi ) + tmp_x_line(fi+1)) + end do + end do +end do +#if 0 do k = kmino,kmaxo do j = jmino,jmaxo do i = imino,imaxo @@ -2449,7 +2509,7 @@ enddo enddo enddo - +#endif return end subroutine restrict3 From 466b084a588bb3634f76e9972956b34f03dba8a6 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Mon, 2 Mar 2026 13:59:47 +0800 Subject: [PATCH 06/13] fix prolong/restrict index bounds after cherry-pick 12e1f63 --- AMSS_NCKU_source/prolongrestrict_cell.f90 | 56 ++++++++++++----------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/AMSS_NCKU_source/prolongrestrict_cell.f90 b/AMSS_NCKU_source/prolongrestrict_cell.f90 index c1efb36..8469f94 100644 --- a/AMSS_NCKU_source/prolongrestrict_cell.f90 +++ b/AMSS_NCKU_source/prolongrestrict_cell.f90 @@ -1956,11 +1956,11 @@ real*8,dimension(3) :: CD,FD real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果 - real*8 :: tmp_xyz_line(extc(1)) ! 存储整条 X 线上完成 Y 向融合后的结果 - real*8 :: v1, v2, v3, v4, v5, v6 - integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max - real*8 :: res_line - real*8 :: tmp_z_slab(extc(1), extc(2)) ! 分配在 k 循环外 + real*8 :: tmp_xyz_line(-2:extc(1)) ! 包含 X 向 6 点模板访问所需下界 + real*8 :: v1, v2, v3, v4, v5, v6 + integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max + real*8 :: res_line + real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2)) ! 包含 Y/X 向模板访问所需下界 if(wei.ne.3)then write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension" write(*,*)"dim = ",wei @@ -2073,24 +2073,26 @@ call symmetry_bd(3,extc,func,funcc,SoA) ! 对每个 k(pz, kc 固定)预计算 Z 向插值的 2D 切片 -jc_min = minval(ciy(jmino:jmaxo)) -jc_max = maxval(ciy(jmino:jmaxo)) - -do k = kmino, kmaxo - pz = piz(k); kc = ciz(k) - ! --- Pass 1: Z 方向,只算一次 --- - do iy = jc_min-3, jc_max+3 ! 仅需的 iy 范围 - do ii = imini-3, imaxi+3 ! 仅需的 ii 范围 - tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3)) - end do - end do - - do j = jmino, jmaxo - py = piy(j); jc = ciy(j) - ! --- Pass 2: Y 方向 --- - do ii = imini-3, imaxi+3 - tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3)) - end do +jc_min = minval(ciy(jmino:jmaxo)) +jc_max = maxval(ciy(jmino:jmaxo)) +ic_min = minval(cix(imino:imaxo)) +ic_max = maxval(cix(imino:imaxo)) + +do k = kmino, kmaxo + pz = piz(k); kc = ciz(k) + ! --- Pass 1: Z 方向,只算一次 --- + do iy = jc_min-2, jc_max+3 ! 仅需的 iy 范围(对应 jc-2:jc+3) + do ii = ic_min-2, ic_max+3 ! 仅需的 ii 范围(对应 cix-2:cix+3) + tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3)) + end do + end do + + do j = jmino, jmaxo + py = piy(j); jc = ciy(j) + ! --- Pass 2: Y 方向 --- + do ii = ic_min-2, ic_max+3 + tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3)) + end do ! --- Pass 3: X 方向 --- do i = imino, imaxo funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3)) @@ -2351,8 +2353,8 @@ end do real*8,dimension(3) :: CD,FD - real*8 :: tmp_xz_plane(extf(1), 6) - real*8 :: tmp_x_line(extf(1)) + real*8 :: tmp_xz_plane(-1:extf(1), 6) + real*8 :: tmp_x_line(-1:extf(1)) integer :: fi, fj, fk, ii, jj, kk if(wei.ne.3)then @@ -2445,7 +2447,7 @@ do k = kmino, kmaxo ! 优化点 1: 显式展开 Z 方向计算,减少循环开销 ! 确保 ii 循环是最内层且连续访问 !DIR$ VECTOR ALWAYS - do ii = 1, extf(1) + do ii = -1, extf(1) ! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果 ! 这里直接硬编码 jj 的偏移,彻底消除一层循环 tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + & @@ -2470,7 +2472,7 @@ do k = kmino, kmaxo ! 优化点 2: 同样向量化 Y 方向压缩 !DIR$ VECTOR ALWAYS - do ii = 1, extf(1) + do ii = -1, extf(1) tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + & C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + & C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4)) From 95220a05c88fe9418ddb170ba34aceee9866860a Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Mon, 2 Mar 2026 17:33:26 +0800 Subject: [PATCH 07/13] optimize fdderivs core-region branch elimination for ghost_width=3 --- AMSS_NCKU_source/diff_newwb.f90 | 69 ++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/AMSS_NCKU_source/diff_newwb.f90 b/AMSS_NCKU_source/diff_newwb.f90 index e6ee09d..1fbbcd2 100644 --- a/AMSS_NCKU_source/diff_newwb.f90 +++ b/AMSS_NCKU_source/diff_newwb.f90 @@ -33,7 +33,7 @@ real*8 :: dX,dY,dZ real*8,dimension(0:ex(1),0:ex(2),0:ex(3)) :: fh real*8, dimension(3) :: SoA - integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k + integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k real*8 :: d2dx,d2dy,d2dz integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2 real*8, parameter :: ZEO=0.d0,ONE=1.d0, F60=6.d1 @@ -137,7 +137,7 @@ real*8 :: dX real*8,dimension(0:ex(1),0:ex(2),0:ex(3)) :: fh real*8, dimension(3) :: SoA - integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k + integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k real*8 :: d2dx integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2 real*8, parameter :: ZEO=0.d0,ONE=1.d0, F60=6.d1 @@ -1512,8 +1512,9 @@ real*8 :: dX,dY,dZ real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh real*8, dimension(3) :: SoA - integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k - real*8 :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz + integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k + integer :: i_core_min,i_core_max,j_core_min,j_core_max,k_core_min,k_core_max + real*8 :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz real*8 :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2 real*8, parameter :: ZEO=0.d0, ONE=1.d0, TWO=2.d0, F1o4=2.5d-1, F9=9.d0, F45=4.5d1 @@ -1560,17 +1561,55 @@ fxx = ZEO fyy = ZEO - fzz = ZEO - fxy = ZEO - fxz = ZEO - fyz = ZEO - - do k=1,ex(3) - do j=1,ex(2) - do i=1,ex(1) -!~~~~~~ fxx - if(i+2 <= imax .and. i-2 >= imin)then -! + fzz = ZEO + fxy = ZEO + fxz = ZEO + fyz = ZEO + + i_core_min = max(1, imin+2) + i_core_max = min(ex(1), imax-2) + j_core_min = max(1, jmin+2) + j_core_max = min(ex(2), jmax-2) + k_core_min = max(1, kmin+2) + k_core_max = min(ex(3), kmax-2) + + if(i_core_min <= i_core_max .and. j_core_min <= j_core_max .and. k_core_min <= k_core_max)then + do k=k_core_min,k_core_max + do j=j_core_min,j_core_max + do i=i_core_min,i_core_max +! interior points always use 4th-order stencils without branch checks + fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) & + -fh(i+2,j,k)+F16*fh(i+1,j,k) ) + fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) & + -fh(i,j+2,k)+F16*fh(i,j+1,k) ) + fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) & + -fh(i,j,k+2)+F16*fh(i,j,k+1) ) + fxy(i,j,k) = Fdxdy*( (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k)) & + -F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k)) & + +F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k)) & + - (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k))) + fxz(i,j,k) = Fdxdz*( (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2)) & + -F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1)) & + +F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1)) & + - (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2))) + fyz(i,j,k) = Fdydz*( (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2)) & + -F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1)) & + +F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1)) & + - (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2))) + enddo + enddo + enddo + endif + + do k=1,ex(3) + do j=1,ex(2) + do i=1,ex(1) + if(i>=i_core_min .and. i<=i_core_max .and. & + j>=j_core_min .and. j<=j_core_max .and. & + k>=k_core_min .and. k<=k_core_max) cycle +!~~~~~~ fxx + if(i+2 <= imax .and. i-2 >= imin)then +! ! - f(i-2) + 16 f(i-1) - 30 f(i) + 16 f(i+1) - f(i+2) ! fxx(i) = ---------------------------------------------------------- ! 12 dx^2 From 42c69fab24201199b3a42b6804f17c620f162864 Mon Sep 17 00:00:00 2001 From: ianchb Date: Sun, 1 Mar 2026 16:20:51 +0800 Subject: [PATCH 08/13] refactor(Parallel): streamline MPI communication by consolidating request handling and memory management --- AMSS_NCKU_source/Parallel.C | 741 ++++++++++++++++++++++-------------- 1 file changed, 453 insertions(+), 288 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index bd591c4..e3bb4a3 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -3883,175 +3883,263 @@ int Parallel::data_packermix(double *data, MyList *src, MyLis return size_out; } // -void Parallel::transfer(MyList **src, MyList **dst, - MyList *VarList1 /* source */, MyList *VarList2 /*target */, - int Symmetry) -{ - int myrank, cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - int node; - - MPI_Request *reqs; - MPI_Status *stats; - reqs = new MPI_Request[2 * cpusize]; - stats = new MPI_Status[2 * cpusize]; - int req_no = 0; - - double **send_data, **rec_data; - send_data = new double *[cpusize]; - rec_data = new double *[cpusize]; - int length; - - for (node = 0; node < cpusize; node++) - { - send_data[node] = rec_data[node] = 0; - if (node == myrank) - { - if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) - { - rec_data[node] = new double[length]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 1" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - } - } - else - { - // send from this cpu to cpu#node - if (length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) - { - send_data[node] = new double[length]; - if (!send_data[node]) - { - cout << "out of memory when new in short transfer, place 2" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); - } - // receive from cpu#node to this cpu - if (length = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry)) - { - rec_data[node] = new double[length]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 3" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); - } - } - } - // wait for all requests to complete - MPI_Waitall(req_no, reqs, stats); - - for (node = 0; node < cpusize; node++) - if (rec_data[node]) - data_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - - for (node = 0; node < cpusize; node++) - { - if (send_data[node]) - delete[] send_data[node]; - if (rec_data[node]) - delete[] rec_data[node]; - } - - delete[] reqs; - delete[] stats; - delete[] send_data; - delete[] rec_data; -} +void Parallel::transfer(MyList **src, MyList **dst, + MyList *VarList1 /* source */, MyList *VarList2 /*target */, + int Symmetry) +{ + int myrank, cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + int node; + + MPI_Request *reqs = new MPI_Request[2 * cpusize]; + MPI_Status *stats = new MPI_Status[2 * cpusize]; + int *req_node = new int[2 * cpusize]; + int *req_is_recv = new int[2 * cpusize]; + int *completed = new int[2 * cpusize]; + int req_no = 0; + int pending_recv = 0; + + double **send_data = new double *[cpusize]; + double **rec_data = new double *[cpusize]; + int *send_lengths = new int[cpusize]; + int *recv_lengths = new int[cpusize]; + + for (node = 0; node < cpusize; node++) + { + send_data[node] = rec_data[node] = 0; + send_lengths[node] = recv_lengths[node] = 0; + } + + // Post receives first so peers can progress rendezvous early. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + if (recv_lengths[node] > 0) + { + rec_data[node] = new double[recv_lengths[node]]; + if (!rec_data[node]) + { + cout << "out of memory when new in short transfer, place 1" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + if (recv_lengths[myrank] > 0) + { + rec_data[myrank] = new double[recv_lengths[myrank]]; + if (!rec_data[myrank]) + { + cout << "out of memory when new in short transfer, place 2" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + if (send_lengths[node] > 0) + { + send_data[node] = new double[send_lengths[node]]; + if (!send_data[node]) + { + cout << "out of memory when new in short transfer, place 3" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, reqs, &outcount, completed, stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node = req_node[idx]; + data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, reqs, stats); + + if (rec_data[myrank]) + data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + for (node = 0; node < cpusize; node++) + { + if (send_data[node]) + delete[] send_data[node]; + if (rec_data[node]) + delete[] rec_data[node]; + } + + delete[] reqs; + delete[] stats; + delete[] req_node; + delete[] req_is_recv; + delete[] completed; + delete[] send_data; + delete[] rec_data; + delete[] send_lengths; + delete[] recv_lengths; +} // -void Parallel::transfermix(MyList **src, MyList **dst, - MyList *VarList1 /* source */, MyList *VarList2 /*target */, - int Symmetry) -{ - int myrank, cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - int node; - - MPI_Request *reqs; - MPI_Status *stats; - reqs = new MPI_Request[2 * cpusize]; - stats = new MPI_Status[2 * cpusize]; - int req_no = 0; - - double **send_data, **rec_data; - send_data = new double *[cpusize]; - rec_data = new double *[cpusize]; - int length; - - for (node = 0; node < cpusize; node++) - { - send_data[node] = rec_data[node] = 0; - if (node == myrank) - { - if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) - { - rec_data[node] = new double[length]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 1" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packermix(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - } - } - else - { - // send from this cpu to cpu#node - if (length = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) - { - send_data[node] = new double[length]; - if (!send_data[node]) - { - cout << "out of memory when new in short transfer, place 2" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); - } - // receive from cpu#node to this cpu - if (length = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry)) - { - rec_data[node] = new double[length]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 3" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); - } - } - } - // wait for all requests to complete - MPI_Waitall(req_no, reqs, stats); - - for (node = 0; node < cpusize; node++) - if (rec_data[node]) - data_packermix(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - - for (node = 0; node < cpusize; node++) - { - if (send_data[node]) - delete[] send_data[node]; - if (rec_data[node]) - delete[] rec_data[node]; - } - - delete[] reqs; - delete[] stats; - delete[] send_data; - delete[] rec_data; -} +void Parallel::transfermix(MyList **src, MyList **dst, + MyList *VarList1 /* source */, MyList *VarList2 /*target */, + int Symmetry) +{ + int myrank, cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + int node; + + MPI_Request *reqs = new MPI_Request[2 * cpusize]; + MPI_Status *stats = new MPI_Status[2 * cpusize]; + int *req_node = new int[2 * cpusize]; + int *req_is_recv = new int[2 * cpusize]; + int *completed = new int[2 * cpusize]; + int req_no = 0; + int pending_recv = 0; + + double **send_data = new double *[cpusize]; + double **rec_data = new double *[cpusize]; + int *send_lengths = new int[cpusize]; + int *recv_lengths = new int[cpusize]; + + for (node = 0; node < cpusize; node++) + { + send_data[node] = rec_data[node] = 0; + send_lengths[node] = recv_lengths[node] = 0; + } + + // Post receives first so peers can progress rendezvous early. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + if (recv_lengths[node] > 0) + { + rec_data[node] = new double[recv_lengths[node]]; + if (!rec_data[node]) + { + cout << "out of memory when new in short transfer, place 1" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + if (recv_lengths[myrank] > 0) + { + rec_data[myrank] = new double[recv_lengths[myrank]]; + if (!rec_data[myrank]) + { + cout << "out of memory when new in short transfer, place 2" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + if (send_lengths[node] > 0) + { + send_data[node] = new double[send_lengths[node]]; + if (!send_data[node]) + { + cout << "out of memory when new in short transfer, place 3" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, reqs, &outcount, completed, stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node = req_node[idx]; + data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, reqs, stats); + + if (rec_data[myrank]) + data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + for (node = 0; node < cpusize; node++) + { + if (send_data[node]) + delete[] send_data[node]; + if (rec_data[node]) + delete[] rec_data[node]; + } + + delete[] reqs; + delete[] stats; + delete[] req_node; + delete[] req_is_recv; + delete[] completed; + delete[] send_data; + delete[] rec_data; + delete[] send_lengths; + delete[] recv_lengths; +} void Parallel::Sync(Patch *Pat, MyList *VarList, int Symmetry) { int cpusize; @@ -4279,73 +4367,110 @@ void Parallel::SyncCache::destroy() cpusize = 0; max_reqs = 0; } // transfer_cached: reuse pre-allocated buffers from SyncCache -void Parallel::transfer_cached(MyList **src, MyList **dst, - MyList *VarList1, MyList *VarList2, - int Symmetry, SyncCache &cache) -{ - int myrank; +void Parallel::transfer_cached(MyList **src, MyList **dst, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) +{ + int myrank; MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); int cpusize = cache.cpusize; - int req_no = 0; - int node; - - for (node = 0; node < cpusize; node++) - { - if (node == myrank) - { - int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[node] = length; - if (length > 0) - { - if (length > cache.recv_buf_caps[node]) - { - if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; - cache.recv_bufs[node] = new double[length]; - cache.recv_buf_caps[node] = length; - } - data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - } - } - else - { - // send - int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - cache.send_lengths[node] = slength; - if (slength > 0) - { - if (slength > cache.send_buf_caps[node]) - { - if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; - cache.send_bufs[node] = new double[slength]; - cache.send_buf_caps[node] = slength; - } - data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++); - } - // recv - int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[node] = rlength; - if (rlength > 0) - { - if (rlength > cache.recv_buf_caps[node]) - { - if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; - cache.recv_bufs[node] = new double[rlength]; - cache.recv_buf_caps[node] = rlength; - } - MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++); - } - } - } - - MPI_Waitall(req_no, cache.reqs, cache.stats); - - for (node = 0; node < cpusize; node++) - if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0) - data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); -} + int req_no = 0; + int pending_recv = 0; + int node; + int *req_node = new int[cache.max_reqs]; + int *req_is_recv = new int[cache.max_reqs]; + int *completed = new int[cache.max_reqs]; + + // Post receives first so peers can progress rendezvous early. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[node] = rlength; + if (rlength > 0) + { + if (rlength > cache.recv_buf_caps[node]) + { + if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; + cache.recv_bufs[node] = new double[rlength]; + cache.recv_buf_caps[node] = rlength; + } + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[myrank] = self_len; + if (self_len > 0) + { + if (self_len > cache.recv_buf_caps[myrank]) + { + if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank]; + cache.recv_bufs[myrank] = new double[self_len]; + cache.recv_buf_caps[myrank] = self_len; + } + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + cache.send_lengths[node] = slength; + if (slength > 0) + { + if (slength > cache.send_buf_caps[node]) + { + if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; + cache.send_bufs[node] = new double[slength]; + cache.send_buf_caps[node] = slength; + } + data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node_i = req_node[idx]; + data_packer(cache.recv_bufs[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); + + if (self_len > 0) + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + delete[] req_node; + delete[] req_is_recv; + delete[] completed; +} // Sync_cached: build grid segment lists on first call, reuse on subsequent calls void Parallel::Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache) { @@ -5758,9 +5883,9 @@ void Parallel::OutBdLow2Hi_cached(MyList *PatcL, MyList *PatfL, } // OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking -void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, - MyList *VarList1, MyList *VarList2, - int Symmetry, SyncCache &cache) +void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) { if (!cache.valid) { @@ -5806,60 +5931,100 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, MPI_Comm_rank(MPI_COMM_WORLD, &myrank); int cpusize = cache.cpusize; - int req_no = 0; - for (int node = 0; node < cpusize; node++) - { - if (node == myrank) - { - int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[node] = length; - if (length > 0) - { - if (length > cache.recv_buf_caps[node]) - { - if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; - cache.recv_bufs[node] = new double[length]; - cache.recv_buf_caps[node] = length; - } - data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - } - } - else - { - int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - cache.send_lengths[node] = slength; - if (slength > 0) - { - if (slength > cache.send_buf_caps[node]) - { - if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; - cache.send_bufs[node] = new double[slength]; - cache.send_buf_caps[node] = slength; - } - data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++); - } - int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[node] = rlength; - if (rlength > 0) - { - if (rlength > cache.recv_buf_caps[node]) - { - if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; - cache.recv_bufs[node] = new double[rlength]; - cache.recv_buf_caps[node] = rlength; - } - MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++); - } - } - } - - MPI_Waitall(req_no, cache.reqs, cache.stats); - - for (int node = 0; node < cpusize; node++) - if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0) - data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry); -} + int req_no = 0; + int pending_recv = 0; + int *req_node = new int[cache.max_reqs]; + int *req_is_recv = new int[cache.max_reqs]; + int *completed = new int[cache.max_reqs]; + + // Post receives first so peers can progress rendezvous early. + for (int node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[node] = rlength; + if (rlength > 0) + { + if (rlength > cache.recv_buf_caps[node]) + { + if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; + cache.recv_bufs[node] = new double[rlength]; + cache.recv_buf_caps[node] = rlength; + } + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[myrank] = self_len; + if (self_len > 0) + { + if (self_len > cache.recv_buf_caps[myrank]) + { + if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank]; + cache.recv_bufs[myrank] = new double[self_len]; + cache.recv_buf_caps[myrank] = self_len; + } + data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (int node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + cache.send_lengths[node] = slength; + if (slength > 0) + { + if (slength > cache.send_buf_caps[node]) + { + if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; + cache.send_bufs[node] = new double[slength]; + cache.send_buf_caps[node] = slength; + } + data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node_i = req_node[idx]; + data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); + + if (self_len > 0) + data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + delete[] req_node; + delete[] req_is_recv; + delete[] completed; +} // collect all buffer grid segments or blocks for given patch MyList *Parallel::build_buffer_gsl(Patch *Pat) From 7543d3e8c76d8007b287d7c88b1e7cbe5852713f Mon Sep 17 00:00:00 2001 From: ianchb Date: Mon, 2 Mar 2026 15:54:37 +0800 Subject: [PATCH 09/13] =?UTF-8?q?perf(MPatch):=20=E7=94=A8=E7=A9=BA?= =?UTF-8?q?=E9=97=B4=20bin=20=E7=B4=A2=E5=BC=95=E5=8A=A0=E9=80=9F=20Interp?= =?UTF-8?q?=5FPoints=20=E7=9A=84=20block=20=E5=BD=92=E5=B1=9E=E6=9F=A5?= =?UTF-8?q?=E6=89=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 为 Patch::Interp_Points 三个重载引入 BlockBinIndex(候选筛选 + 全扫回退) - 保持原 point-in-block 判定与后续插值/通信流程不变 - 将逐点线性扫块从 O(N_points*N_blocks) 降为近似 O(N_points*k) - 测试:bin 上限如果太大,会引入不必要的索引构建开销。将 bins 上限设为 16。 Co-authored-by: gpt-5.3-codex --- AMSS_NCKU_source/MPatch.C | 350 +++++++++++++++++++++++--------------- 1 file changed, 210 insertions(+), 140 deletions(-) diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C index 563bfcc..956e9c8 100644 --- a/AMSS_NCKU_source/MPatch.C +++ b/AMSS_NCKU_source/MPatch.C @@ -7,6 +7,7 @@ #include #include #include +#include using namespace std; #include "misc.h" @@ -17,6 +18,168 @@ using namespace std; #include "interp_lb_profile.h" #endif +namespace +{ +struct InterpBlockView +{ + Block *bp; + double llb[dim]; + double uub[dim]; +}; + +struct BlockBinIndex +{ + int bins[dim]; + double lo[dim]; + double inv[dim]; + vector views; + vector> bin_to_blocks; + bool valid; + + BlockBinIndex() : valid(false) + { + for (int i = 0; i < dim; i++) + { + bins[i] = 1; + lo[i] = 0.0; + inv[i] = 0.0; + } + } +}; + +inline int clamp_int(int v, int lo, int hi) +{ + return (v < lo) ? lo : ((v > hi) ? hi : v); +} + +inline int coord_to_bin(double x, double lo, double inv, int nb) +{ + if (nb <= 1 || inv <= 0.0) + return 0; + int b = int(floor((x - lo) * inv)); + return clamp_int(b, 0, nb - 1); +} + +inline int bin_loc(const BlockBinIndex &index, int b0, int b1, int b2) +{ + return b0 + index.bins[0] * (b1 + index.bins[1] * b2); +} + +inline bool point_in_block_view(const InterpBlockView &view, const double *pox, const double *DH) +{ + for (int i = 0; i < dim; i++) + { + if (pox[i] - view.llb[i] < -DH[i] / 2 || pox[i] - view.uub[i] > DH[i] / 2) + return false; + } + return true; +} + +void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index) +{ + index = BlockBinIndex(); + + MyList *Bp = patch->blb; + while (Bp) + { + Block *BP = Bp->data; + InterpBlockView view; + view.bp = BP; + for (int i = 0; i < dim; i++) + { +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i]; + view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i]; +#else +#ifdef Cell + view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i]; + view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i]; +#else +#error Not define Vertex nor Cell +#endif +#endif + } + index.views.push_back(view); + if (Bp == patch->ble) + break; + Bp = Bp->next; + } + + const int nblocks = int(index.views.size()); + if (nblocks <= 0) + return; + + int bins_1d = int(ceil(pow(double(nblocks), 1.0 / 3.0))); + bins_1d = clamp_int(bins_1d, 1, 32); + for (int i = 0; i < dim; i++) + { + index.bins[i] = bins_1d; + index.lo[i] = patch->bbox[i] + patch->lli[i] * DH[i]; + const double hi = patch->bbox[dim + i] - patch->uui[i] * DH[i]; + if (hi > index.lo[i] && bins_1d > 1) + index.inv[i] = bins_1d / (hi - index.lo[i]); + else + index.inv[i] = 0.0; + } + + index.bin_to_blocks.resize(index.bins[0] * index.bins[1] * index.bins[2]); + + for (int bi = 0; bi < nblocks; bi++) + { + const InterpBlockView &view = index.views[bi]; + int bmin[dim], bmax[dim]; + for (int d = 0; d < dim; d++) + { + const double low = view.llb[d] - DH[d] / 2; + const double up = view.uub[d] + DH[d] / 2; + bmin[d] = coord_to_bin(low, index.lo[d], index.inv[d], index.bins[d]); + bmax[d] = coord_to_bin(up, index.lo[d], index.inv[d], index.bins[d]); + if (bmax[d] < bmin[d]) + { + int t = bmin[d]; + bmin[d] = bmax[d]; + bmax[d] = t; + } + } + + for (int bz = bmin[2]; bz <= bmax[2]; bz++) + for (int by = bmin[1]; by <= bmax[1]; by++) + for (int bx = bmin[0]; bx <= bmax[0]; bx++) + index.bin_to_blocks[bin_loc(index, bx, by, bz)].push_back(bi); + } + + index.valid = true; +} + +int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH) +{ + if (!index.valid) + return -1; + + const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]); + const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]); + const int bz = coord_to_bin(pox[2], index.lo[2], index.inv[2], index.bins[2]); + const vector &cand = index.bin_to_blocks[bin_loc(index, bx, by, bz)]; + + for (size_t ci = 0; ci < cand.size(); ci++) + { + const int bi = cand[ci]; + if (point_in_block_view(index.views[bi], pox, DH)) + return bi; + } + + // Fallback to full scan for numerical edge cases around bin boundaries. + for (size_t bi = 0; bi < index.views.size(); bi++) + if (point_in_block_view(index.views[bi], pox, DH)) + return int(bi); + + return -1; +} +} // namespace + Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi) { @@ -367,9 +530,11 @@ void Patch::Interp_Points(MyList *VarList, for (int j = 0; j < NN; j++) owner_rank[j] = -1; - double DH[dim], llb[dim], uub[dim]; + double DH[dim]; for (int i = 0; i < dim; i++) DH[i] = getdX(i); + BlockBinIndex block_index; + build_block_bin_index(this, DH, block_index); for (int j = 0; j < NN; j++) // run along points { @@ -392,57 +557,24 @@ void Patch::Interp_Points(MyList *VarList, } } - MyList *Bp = blb; - bool notfind = true; - while (notfind && Bp) // run along Blocks + const int block_i = find_block_index_for_point(block_index, pox, DH); + if (block_i >= 0) { - Block *BP = Bp->data; - - bool flag = true; - for (int i = 0; i < dim; i++) + Block *BP = block_index.views[block_i].bp; + owner_rank[j] = BP->rank; + if (myrank == BP->rank) { -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i]; - uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i]; -#else -#ifdef Cell - llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i]; - uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i]; -#else -#error Not define Vertex nor Cell -#endif -#endif - if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2) + //---> interpolation + varl = VarList; + int k = 0; + while (varl) // run along variables { - flag = false; - break; + f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], + pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); + varl = varl->next; + k++; } } - - if (flag) - { - notfind = false; - owner_rank[j] = BP->rank; - if (myrank == BP->rank) - { - //---> interpolation - varl = VarList; - int k = 0; - while (varl) // run along variables - { - f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], - pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); - varl = varl->next; - k++; - } - } - } - if (Bp == ble) - break; - Bp = Bp->next; } } @@ -535,9 +667,11 @@ void Patch::Interp_Points(MyList *VarList, for (int j = 0; j < NN; j++) owner_rank[j] = -1; - double DH[dim], llb[dim], uub[dim]; + double DH[dim]; for (int i = 0; i < dim; i++) DH[i] = getdX(i); + BlockBinIndex block_index; + build_block_bin_index(this, DH, block_index); // --- Interpolation phase (identical to original) --- for (int j = 0; j < NN; j++) @@ -561,56 +695,23 @@ void Patch::Interp_Points(MyList *VarList, } } - MyList *Bp = blb; - bool notfind = true; - while (notfind && Bp) + const int block_i = find_block_index_for_point(block_index, pox, DH); + if (block_i >= 0) { - Block *BP = Bp->data; - - bool flag = true; - for (int i = 0; i < dim; i++) + Block *BP = block_index.views[block_i].bp; + owner_rank[j] = BP->rank; + if (myrank == BP->rank) { -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i]; - uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i]; -#else -#ifdef Cell - llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i]; - uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i]; -#else -#error Not define Vertex nor Cell -#endif -#endif - if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2) + varl = VarList; + int k = 0; + while (varl) { - flag = false; - break; + f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], + pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); + varl = varl->next; + k++; } } - - if (flag) - { - notfind = false; - owner_rank[j] = BP->rank; - if (myrank == BP->rank) - { - varl = VarList; - int k = 0; - while (varl) - { - f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], - pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); - varl = varl->next; - k++; - } - } - } - if (Bp == ble) - break; - Bp = Bp->next; } } @@ -833,9 +934,11 @@ void Patch::Interp_Points(MyList *VarList, MPI_Comm_group(MPI_COMM_WORLD, &world_group); MPI_Comm_group(Comm_here, &local_group); - double DH[dim], llb[dim], uub[dim]; + double DH[dim]; for (int i = 0; i < dim; i++) DH[i] = getdX(i); + BlockBinIndex block_index; + build_block_bin_index(this, DH, block_index); for (int j = 0; j < NN; j++) // run along points { @@ -858,57 +961,24 @@ void Patch::Interp_Points(MyList *VarList, } } - MyList *Bp = blb; - bool notfind = true; - while (notfind && Bp) // run along Blocks + const int block_i = find_block_index_for_point(block_index, pox, DH); + if (block_i >= 0) { - Block *BP = Bp->data; - - bool flag = true; - for (int i = 0; i < dim; i++) + Block *BP = block_index.views[block_i].bp; + owner_rank[j] = BP->rank; + if (myrank == BP->rank) { -#ifdef Vertex -#ifdef Cell -#error Both Cell and Vertex are defined -#endif - llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i]; - uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i]; -#else -#ifdef Cell - llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i]; - uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i]; -#else -#error Not define Vertex nor Cell -#endif -#endif - if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2) + //---> interpolation + varl = VarList; + int k = 0; + while (varl) // run along variables { - flag = false; - break; + f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], + pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); + varl = varl->next; + k++; } } - - if (flag) - { - notfind = false; - owner_rank[j] = BP->rank; - if (myrank == BP->rank) - { - //---> interpolation - varl = VarList; - int k = 0; - while (varl) // run along variables - { - f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], - pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); - varl = varl->next; - k++; - } - } - } - if (Bp == ble) - break; - Bp = Bp->next; } } From e73911f292abd48439c9997b32362139c5bbf8b3 Mon Sep 17 00:00:00 2001 From: ianchb Date: Mon, 2 Mar 2026 16:08:13 +0800 Subject: [PATCH 10/13] perf(restrict3): shrink X-pass ii sweep to required overlap window - compute fi_min/fi_max from output i-range and derive ii_lo/ii_hi - replace full ii sweep (-1:extf(1)) with windowed sweep in Z/Y precompute passes - keep stencil math unchanged; add bounds sanity check for ii window --- AMSS_NCKU_source/prolongrestrict_cell.f90 | 71 ++++++++++++++--------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/AMSS_NCKU_source/prolongrestrict_cell.f90 b/AMSS_NCKU_source/prolongrestrict_cell.f90 index 8469f94..4f6676b 100644 --- a/AMSS_NCKU_source/prolongrestrict_cell.f90 +++ b/AMSS_NCKU_source/prolongrestrict_cell.f90 @@ -1956,11 +1956,11 @@ real*8,dimension(3) :: CD,FD real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果 - real*8 :: tmp_xyz_line(-2:extc(1)) ! 包含 X 向 6 点模板访问所需下界 - real*8 :: v1, v2, v3, v4, v5, v6 - integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max - real*8 :: res_line - real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2)) ! 包含 Y/X 向模板访问所需下界 + real*8 :: tmp_xyz_line(-2:extc(1)) ! 包含 X 向 6 点模板访问所需下界 + real*8 :: v1, v2, v3, v4, v5, v6 + integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max + real*8 :: res_line + real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2)) ! 包含 Y/X 向模板访问所需下界 if(wei.ne.3)then write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension" write(*,*)"dim = ",wei @@ -2073,26 +2073,26 @@ call symmetry_bd(3,extc,func,funcc,SoA) ! 对每个 k(pz, kc 固定)预计算 Z 向插值的 2D 切片 -jc_min = minval(ciy(jmino:jmaxo)) -jc_max = maxval(ciy(jmino:jmaxo)) -ic_min = minval(cix(imino:imaxo)) -ic_max = maxval(cix(imino:imaxo)) - -do k = kmino, kmaxo - pz = piz(k); kc = ciz(k) - ! --- Pass 1: Z 方向,只算一次 --- - do iy = jc_min-2, jc_max+3 ! 仅需的 iy 范围(对应 jc-2:jc+3) - do ii = ic_min-2, ic_max+3 ! 仅需的 ii 范围(对应 cix-2:cix+3) - tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3)) - end do - end do - - do j = jmino, jmaxo - py = piy(j); jc = ciy(j) - ! --- Pass 2: Y 方向 --- - do ii = ic_min-2, ic_max+3 - tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3)) - end do +jc_min = minval(ciy(jmino:jmaxo)) +jc_max = maxval(ciy(jmino:jmaxo)) +ic_min = minval(cix(imino:imaxo)) +ic_max = maxval(cix(imino:imaxo)) + +do k = kmino, kmaxo + pz = piz(k); kc = ciz(k) + ! --- Pass 1: Z 方向,只算一次 --- + do iy = jc_min-2, jc_max+3 ! 仅需的 iy 范围(对应 jc-2:jc+3) + do ii = ic_min-2, ic_max+3 ! 仅需的 ii 范围(对应 cix-2:cix+3) + tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3)) + end do + end do + + do j = jmino, jmaxo + py = piy(j); jc = ciy(j) + ! --- Pass 2: Y 方向 --- + do ii = ic_min-2, ic_max+3 + tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3)) + end do ! --- Pass 3: X 方向 --- do i = imino, imaxo funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3)) @@ -2353,9 +2353,10 @@ end do real*8,dimension(3) :: CD,FD - real*8 :: tmp_xz_plane(-1:extf(1), 6) - real*8 :: tmp_x_line(-1:extf(1)) + real*8 :: tmp_xz_plane(-1:extf(1), 6) + real*8 :: tmp_x_line(-1:extf(1)) integer :: fi, fj, fk, ii, jj, kk + integer :: fi_min, fi_max, ii_lo, ii_hi if(wei.ne.3)then write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension" @@ -2437,6 +2438,18 @@ end do call symmetry_bd(2,extf,funf,funff,SoA) + ! 仅计算 X 向最终写回所需的窗口: + ! func(i,j,k) 只访问 tmp_x_line(fi-2:fi+3) + fi_min = 2*(imino + lbc(1) - 1) - 1 - lbf(1) + 1 + fi_max = 2*(imaxo + lbc(1) - 1) - 1 - lbf(1) + 1 + ii_lo = fi_min - 2 + ii_hi = fi_max + 3 + if(ii_lo < -1 .or. ii_hi > extf(1))then + write(*,*)"restrict3: invalid ii window",ii_lo,ii_hi + write(*,*)"imino,imaxo,lbc(1),lbf(1),extf(1) = ",imino,imaxo,lbc(1),lbf(1),extf(1) + stop + endif + !~~~~~~> restriction start... do k = kmino, kmaxo fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1 @@ -2447,7 +2460,7 @@ do k = kmino, kmaxo ! 优化点 1: 显式展开 Z 方向计算,减少循环开销 ! 确保 ii 循环是最内层且连续访问 !DIR$ VECTOR ALWAYS - do ii = -1, extf(1) + do ii = ii_lo, ii_hi ! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果 ! 这里直接硬编码 jj 的偏移,彻底消除一层循环 tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + & @@ -2472,7 +2485,7 @@ do k = kmino, kmaxo ! 优化点 2: 同样向量化 Y 方向压缩 !DIR$ VECTOR ALWAYS - do ii = -1, extf(1) + do ii = ii_lo, ii_hi tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + & C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + & C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4)) From b3c367f15b8b6f050def9bbbceeb44f255223f71 Mon Sep 17 00:00:00 2001 From: ianchb Date: Mon, 2 Mar 2026 17:17:16 +0800 Subject: [PATCH 11/13] =?UTF-8?q?prolong3=20=E6=94=B9=E4=B8=BA=E5=85=88?= =?UTF-8?q?=E7=AE=97=E5=AE=9E=E9=99=85=20stencil=20=E7=AA=97=E5=8F=A3?= =?UTF-8?q?=EF=BC=9B=E5=8F=AA=E6=9C=89=E7=AA=97=E5=8F=A3=E8=A7=A6=E5=8F=8A?= =?UTF-8?q?=E5=AF=B9=E7=A7=B0=E8=BE=B9=E7=95=8C=E6=97=B6=E6=89=8D=E8=B5=B0?= =?UTF-8?q?=E5=85=A8=E5=9F=9F=20symmetry=5Fbd=EF=BC=8C=E5=90=A6=E5=88=99?= =?UTF-8?q?=E5=8F=AA=E5=A4=8D=E5=88=B6=E5=BF=85=E9=9C=80=E7=AA=97=E5=8F=A3?= =?UTF-8?q?=E3=80=82restrict3=20=E5=90=8C=E6=A0=B7=E6=94=B9=E6=88=90?= =?UTF-8?q?=E7=AA=97=E5=8F=A3=E5=88=A4=E5=AE=9A=EF=BC=8C=E6=97=A0=E8=A7=A6?= =?UTF-8?q?=E8=BE=B9=E6=97=B6=E4=BB=85=E5=A1=AB=20ii/jj/kk=20=E5=BF=85?= =?UTF-8?q?=E9=9C=80=E7=AA=97=E5=8F=A3=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- AMSS_NCKU_source/prolongrestrict_cell.f90 | 62 ++++++++++++++++++----- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/AMSS_NCKU_source/prolongrestrict_cell.f90 b/AMSS_NCKU_source/prolongrestrict_cell.f90 index 4f6676b..9fefe48 100644 --- a/AMSS_NCKU_source/prolongrestrict_cell.f90 +++ b/AMSS_NCKU_source/prolongrestrict_cell.f90 @@ -1958,7 +1958,9 @@ real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果 real*8 :: tmp_xyz_line(-2:extc(1)) ! 包含 X 向 6 点模板访问所需下界 real*8 :: v1, v2, v3, v4, v5, v6 - integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max + integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max,kc_min,kc_max + integer :: i_lo, i_hi, j_lo, j_hi, k_lo, k_hi + logical :: need_full_symmetry real*8 :: res_line real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2)) ! 包含 Y/X 向模板访问所需下界 if(wei.ne.3)then @@ -2063,20 +2065,35 @@ endif enddo - maxcx = maxval(cix(imino:imaxo)) - maxcy = maxval(ciy(jmino:jmaxo)) - maxcz = maxval(ciz(kmino:kmaxo)) + ic_min = minval(cix(imino:imaxo)) + ic_max = maxval(cix(imino:imaxo)) + jc_min = minval(ciy(jmino:jmaxo)) + jc_max = maxval(ciy(jmino:jmaxo)) + kc_min = minval(ciz(kmino:kmaxo)) + kc_max = maxval(ciz(kmino:kmaxo)) + + maxcx = ic_max + maxcy = jc_max + maxcz = kc_max if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then write(*,*)"error in prolong" return endif - call symmetry_bd(3,extc,func,funcc,SoA) + i_lo = ic_min - 2 + i_hi = ic_max + 3 + j_lo = jc_min - 2 + j_hi = jc_max + 3 + k_lo = kc_min - 2 + k_hi = kc_max + 3 + need_full_symmetry = (i_lo < 1) .or. (j_lo < 1) .or. (k_lo < 1) + if(need_full_symmetry)then + call symmetry_bd(3,extc,func,funcc,SoA) + else + funcc(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi) = func(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi) + endif + ! 对每个 k(pz, kc 固定)预计算 Z 向插值的 2D 切片 -jc_min = minval(ciy(jmino:jmaxo)) -jc_max = maxval(ciy(jmino:jmaxo)) -ic_min = minval(cix(imino:imaxo)) -ic_max = maxval(cix(imino:imaxo)) do k = kmino, kmaxo pz = piz(k); kc = ciz(k) @@ -2357,6 +2374,8 @@ end do real*8 :: tmp_x_line(-1:extf(1)) integer :: fi, fj, fk, ii, jj, kk integer :: fi_min, fi_max, ii_lo, ii_hi + integer :: fj_min, fj_max, fk_min, fk_max, jj_lo, jj_hi, kk_lo, kk_hi + logical :: need_full_symmetry if(wei.ne.3)then write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension" @@ -2436,19 +2455,34 @@ end do stop endif - call symmetry_bd(2,extf,funf,funff,SoA) - ! 仅计算 X 向最终写回所需的窗口: ! func(i,j,k) 只访问 tmp_x_line(fi-2:fi+3) fi_min = 2*(imino + lbc(1) - 1) - 1 - lbf(1) + 1 fi_max = 2*(imaxo + lbc(1) - 1) - 1 - lbf(1) + 1 + fj_min = 2*(jmino + lbc(2) - 1) - 1 - lbf(2) + 1 + fj_max = 2*(jmaxo + lbc(2) - 1) - 1 - lbf(2) + 1 + fk_min = 2*(kmino + lbc(3) - 1) - 1 - lbf(3) + 1 + fk_max = 2*(kmaxo + lbc(3) - 1) - 1 - lbf(3) + 1 ii_lo = fi_min - 2 ii_hi = fi_max + 3 - if(ii_lo < -1 .or. ii_hi > extf(1))then - write(*,*)"restrict3: invalid ii window",ii_lo,ii_hi - write(*,*)"imino,imaxo,lbc(1),lbf(1),extf(1) = ",imino,imaxo,lbc(1),lbf(1),extf(1) + jj_lo = fj_min - 2 + jj_hi = fj_max + 3 + kk_lo = fk_min - 2 + kk_hi = fk_max + 3 + if(ii_lo < -1 .or. ii_hi > extf(1) .or. & + jj_lo < -1 .or. jj_hi > extf(2) .or. & + kk_lo < -1 .or. kk_hi > extf(3))then + write(*,*)"restrict3: invalid stencil window" + write(*,*)"ii=",ii_lo,ii_hi," jj=",jj_lo,jj_hi," kk=",kk_lo,kk_hi + write(*,*)"extf=",extf stop endif + need_full_symmetry = (ii_lo < 1) .or. (jj_lo < 1) .or. (kk_lo < 1) + if(need_full_symmetry)then + call symmetry_bd(2,extf,funf,funff,SoA) + else + funff(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi) = funf(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi) + endif !~~~~~~> restriction start... do k = kmino, kmaxo From 4012e9d0686e3941b1ae90c56b9b38c2cf551926 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Mon, 2 Mar 2026 20:48:38 +0800 Subject: [PATCH 12/13] =?UTF-8?q?perf(RestrictProlong):=20=E7=94=A8=20Rest?= =?UTF-8?q?rict=5Fcached/OutBdLow2Hi=5Fcached=20=E6=9B=BF=E6=8D=A2?= =?UTF-8?q?=E9=9D=9E=E7=BC=93=E5=AD=98=E7=89=88=E6=9C=AC=EF=BC=8CSync=5Ffi?= =?UTF-8?q?nish=20=E6=94=B9=E4=B8=BA=E6=B8=90=E8=BF=9B=E5=BC=8F=E8=A7=A3?= =?UTF-8?q?=E5=8C=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - RestrictProlong/RestrictProlong_aux 中的 Restrict() 和 OutBdLow2Hi() 替换为 _cached 版本, 复用 gridseg 列表和 MPI 缓冲区,避免每次调用重新分配 - 新增 sync_cache_restrict/sync_cache_outbd 两组 per-level 缓存 - Sync_finish 从 MPI_Waitall 改为 MPI_Waitsome 渐进式解包,降低尾延迟 - AsyncSyncState 扩展 req_node/req_is_recv/pending_recv 字段支持渐进解包 Co-Authored-By: Claude Opus 4.6 (1M context) --- AMSS_NCKU_source/Parallel.C | 959 ++++++++++++++++++---------------- AMSS_NCKU_source/Parallel.h | 5 +- AMSS_NCKU_source/bssn_class.C | 44 +- AMSS_NCKU_source/bssn_class.h | 2 + 4 files changed, 527 insertions(+), 483 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index e3bb4a3..4e5e4ec 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -3883,263 +3883,263 @@ int Parallel::data_packermix(double *data, MyList *src, MyLis return size_out; } // -void Parallel::transfer(MyList **src, MyList **dst, - MyList *VarList1 /* source */, MyList *VarList2 /*target */, - int Symmetry) -{ - int myrank, cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - int node; - - MPI_Request *reqs = new MPI_Request[2 * cpusize]; - MPI_Status *stats = new MPI_Status[2 * cpusize]; - int *req_node = new int[2 * cpusize]; - int *req_is_recv = new int[2 * cpusize]; - int *completed = new int[2 * cpusize]; - int req_no = 0; - int pending_recv = 0; - - double **send_data = new double *[cpusize]; - double **rec_data = new double *[cpusize]; - int *send_lengths = new int[cpusize]; - int *recv_lengths = new int[cpusize]; - - for (node = 0; node < cpusize; node++) - { - send_data[node] = rec_data[node] = 0; - send_lengths[node] = recv_lengths[node] = 0; - } - - // Post receives first so peers can progress rendezvous early. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - if (recv_lengths[node] > 0) - { - rec_data[node] = new double[recv_lengths[node]]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 1" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 1; - req_no++; - pending_recv++; - } - } - - // Local transfer on this rank. - recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - if (recv_lengths[myrank] > 0) - { - rec_data[myrank] = new double[recv_lengths[myrank]]; - if (!rec_data[myrank]) - { - cout << "out of memory when new in short transfer, place 2" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - } - - // Pack and post sends. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - if (send_lengths[node] > 0) - { - send_data[node] = new double[send_lengths[node]]; - if (!send_data[node]) - { - cout << "out of memory when new in short transfer, place 3" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 0; - req_no++; - } - } - - // Unpack as soon as receive completes to reduce pure wait time. - while (pending_recv > 0) - { - int outcount = 0; - MPI_Waitsome(req_no, reqs, &outcount, completed, stats); - if (outcount == MPI_UNDEFINED) break; - - for (int i = 0; i < outcount; i++) - { - int idx = completed[i]; - if (idx >= 0 && req_is_recv[idx]) - { - int recv_node = req_node[idx]; - data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); - pending_recv--; - } - } - } - - if (req_no > 0) MPI_Waitall(req_no, reqs, stats); - - if (rec_data[myrank]) - data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); - - for (node = 0; node < cpusize; node++) - { - if (send_data[node]) - delete[] send_data[node]; - if (rec_data[node]) - delete[] rec_data[node]; - } - - delete[] reqs; - delete[] stats; - delete[] req_node; - delete[] req_is_recv; - delete[] completed; - delete[] send_data; - delete[] rec_data; - delete[] send_lengths; - delete[] recv_lengths; -} +void Parallel::transfer(MyList **src, MyList **dst, + MyList *VarList1 /* source */, MyList *VarList2 /*target */, + int Symmetry) +{ + int myrank, cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + int node; + + MPI_Request *reqs = new MPI_Request[2 * cpusize]; + MPI_Status *stats = new MPI_Status[2 * cpusize]; + int *req_node = new int[2 * cpusize]; + int *req_is_recv = new int[2 * cpusize]; + int *completed = new int[2 * cpusize]; + int req_no = 0; + int pending_recv = 0; + + double **send_data = new double *[cpusize]; + double **rec_data = new double *[cpusize]; + int *send_lengths = new int[cpusize]; + int *recv_lengths = new int[cpusize]; + + for (node = 0; node < cpusize; node++) + { + send_data[node] = rec_data[node] = 0; + send_lengths[node] = recv_lengths[node] = 0; + } + + // Post receives first so peers can progress rendezvous early. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + if (recv_lengths[node] > 0) + { + rec_data[node] = new double[recv_lengths[node]]; + if (!rec_data[node]) + { + cout << "out of memory when new in short transfer, place 1" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + if (recv_lengths[myrank] > 0) + { + rec_data[myrank] = new double[recv_lengths[myrank]]; + if (!rec_data[myrank]) + { + cout << "out of memory when new in short transfer, place 2" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + if (send_lengths[node] > 0) + { + send_data[node] = new double[send_lengths[node]]; + if (!send_data[node]) + { + cout << "out of memory when new in short transfer, place 3" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, reqs, &outcount, completed, stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node = req_node[idx]; + data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, reqs, stats); + + if (rec_data[myrank]) + data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + for (node = 0; node < cpusize; node++) + { + if (send_data[node]) + delete[] send_data[node]; + if (rec_data[node]) + delete[] rec_data[node]; + } + + delete[] reqs; + delete[] stats; + delete[] req_node; + delete[] req_is_recv; + delete[] completed; + delete[] send_data; + delete[] rec_data; + delete[] send_lengths; + delete[] recv_lengths; +} // -void Parallel::transfermix(MyList **src, MyList **dst, - MyList *VarList1 /* source */, MyList *VarList2 /*target */, - int Symmetry) -{ - int myrank, cpusize; - MPI_Comm_size(MPI_COMM_WORLD, &cpusize); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - - int node; - - MPI_Request *reqs = new MPI_Request[2 * cpusize]; - MPI_Status *stats = new MPI_Status[2 * cpusize]; - int *req_node = new int[2 * cpusize]; - int *req_is_recv = new int[2 * cpusize]; - int *completed = new int[2 * cpusize]; - int req_no = 0; - int pending_recv = 0; - - double **send_data = new double *[cpusize]; - double **rec_data = new double *[cpusize]; - int *send_lengths = new int[cpusize]; - int *recv_lengths = new int[cpusize]; - - for (node = 0; node < cpusize; node++) - { - send_data[node] = rec_data[node] = 0; - send_lengths[node] = recv_lengths[node] = 0; - } - - // Post receives first so peers can progress rendezvous early. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - if (recv_lengths[node] > 0) - { - rec_data[node] = new double[recv_lengths[node]]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 1" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 1; - req_no++; - pending_recv++; - } - } - - // Local transfer on this rank. - recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - if (recv_lengths[myrank] > 0) - { - rec_data[myrank] = new double[recv_lengths[myrank]]; - if (!rec_data[myrank]) - { - cout << "out of memory when new in short transfer, place 2" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - } - - // Pack and post sends. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - if (send_lengths[node] > 0) - { - send_data[node] = new double[send_lengths[node]]; - if (!send_data[node]) - { - cout << "out of memory when new in short transfer, place 3" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 0; - req_no++; - } - } - - // Unpack as soon as receive completes to reduce pure wait time. - while (pending_recv > 0) - { - int outcount = 0; - MPI_Waitsome(req_no, reqs, &outcount, completed, stats); - if (outcount == MPI_UNDEFINED) break; - - for (int i = 0; i < outcount; i++) - { - int idx = completed[i]; - if (idx >= 0 && req_is_recv[idx]) - { - int recv_node = req_node[idx]; - data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); - pending_recv--; - } - } - } - - if (req_no > 0) MPI_Waitall(req_no, reqs, stats); - - if (rec_data[myrank]) - data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); - - for (node = 0; node < cpusize; node++) - { - if (send_data[node]) - delete[] send_data[node]; - if (rec_data[node]) - delete[] rec_data[node]; - } - - delete[] reqs; - delete[] stats; - delete[] req_node; - delete[] req_is_recv; - delete[] completed; - delete[] send_data; - delete[] rec_data; - delete[] send_lengths; - delete[] recv_lengths; -} +void Parallel::transfermix(MyList **src, MyList **dst, + MyList *VarList1 /* source */, MyList *VarList2 /*target */, + int Symmetry) +{ + int myrank, cpusize; + MPI_Comm_size(MPI_COMM_WORLD, &cpusize); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + int node; + + MPI_Request *reqs = new MPI_Request[2 * cpusize]; + MPI_Status *stats = new MPI_Status[2 * cpusize]; + int *req_node = new int[2 * cpusize]; + int *req_is_recv = new int[2 * cpusize]; + int *completed = new int[2 * cpusize]; + int req_no = 0; + int pending_recv = 0; + + double **send_data = new double *[cpusize]; + double **rec_data = new double *[cpusize]; + int *send_lengths = new int[cpusize]; + int *recv_lengths = new int[cpusize]; + + for (node = 0; node < cpusize; node++) + { + send_data[node] = rec_data[node] = 0; + send_lengths[node] = recv_lengths[node] = 0; + } + + // Post receives first so peers can progress rendezvous early. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + recv_lengths[node] = data_packermix(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + if (recv_lengths[node] > 0) + { + rec_data[node] = new double[recv_lengths[node]]; + if (!rec_data[node]) + { + cout << "out of memory when new in short transfer, place 1" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + recv_lengths[myrank] = data_packermix(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + if (recv_lengths[myrank] > 0) + { + rec_data[myrank] = new double[recv_lengths[myrank]]; + if (!rec_data[myrank]) + { + cout << "out of memory when new in short transfer, place 2" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + send_lengths[node] = data_packermix(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + if (send_lengths[node] > 0) + { + send_data[node] = new double[send_lengths[node]]; + if (!send_data[node]) + { + cout << "out of memory when new in short transfer, place 3" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, reqs, &outcount, completed, stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node = req_node[idx]; + data_packermix(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, reqs, stats); + + if (rec_data[myrank]) + data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + for (node = 0; node < cpusize; node++) + { + if (send_data[node]) + delete[] send_data[node]; + if (rec_data[node]) + delete[] rec_data[node]; + } + + delete[] reqs; + delete[] stats; + delete[] req_node; + delete[] req_is_recv; + delete[] completed; + delete[] send_data; + delete[] rec_data; + delete[] send_lengths; + delete[] recv_lengths; +} void Parallel::Sync(Patch *Pat, MyList *VarList, int Symmetry) { int cpusize; @@ -4367,111 +4367,110 @@ void Parallel::SyncCache::destroy() cpusize = 0; max_reqs = 0; } // transfer_cached: reuse pre-allocated buffers from SyncCache -void Parallel::transfer_cached(MyList **src, MyList **dst, - MyList *VarList1, MyList *VarList2, - int Symmetry, SyncCache &cache) -{ - int myrank; +void Parallel::transfer_cached(MyList **src, MyList **dst, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) +{ + int myrank; MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); int cpusize = cache.cpusize; - int req_no = 0; - int pending_recv = 0; - int node; - int *req_node = new int[cache.max_reqs]; - int *req_is_recv = new int[cache.max_reqs]; - int *completed = new int[cache.max_reqs]; - - // Post receives first so peers can progress rendezvous early. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[node] = rlength; - if (rlength > 0) - { - if (rlength > cache.recv_buf_caps[node]) - { - if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; - cache.recv_bufs[node] = new double[rlength]; - cache.recv_buf_caps[node] = rlength; - } - MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 1; - req_no++; - pending_recv++; - } - } - - // Local transfer on this rank. - int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[myrank] = self_len; - if (self_len > 0) - { - if (self_len > cache.recv_buf_caps[myrank]) - { - if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank]; - cache.recv_bufs[myrank] = new double[self_len]; - cache.recv_buf_caps[myrank] = self_len; - } - data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - } - - // Pack and post sends. - for (node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - cache.send_lengths[node] = slength; - if (slength > 0) - { - if (slength > cache.send_buf_caps[node]) - { - if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; - cache.send_bufs[node] = new double[slength]; - cache.send_buf_caps[node] = slength; - } - data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 0; - req_no++; - } - } - - // Unpack as soon as receive completes to reduce pure wait time. - while (pending_recv > 0) - { - int outcount = 0; - MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); - if (outcount == MPI_UNDEFINED) break; - - for (int i = 0; i < outcount; i++) - { - int idx = completed[i]; - if (idx >= 0 && req_is_recv[idx]) - { - int recv_node_i = req_node[idx]; - data_packer(cache.recv_bufs[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); - pending_recv--; - } - } - } - - if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); - - if (self_len > 0) - data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); - - delete[] req_node; - delete[] req_is_recv; - delete[] completed; -} -// Sync_cached: build grid segment lists on first call, reuse on subsequent calls + int req_no = 0; + int pending_recv = 0; + int node; + int *req_node = new int[cache.max_reqs]; + int *req_is_recv = new int[cache.max_reqs]; + int *completed = new int[cache.max_reqs]; + + // Post receives first so peers can progress rendezvous early. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[node] = rlength; + if (rlength > 0) + { + if (rlength > cache.recv_buf_caps[node]) + { + if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; + cache.recv_bufs[node] = new double[rlength]; + cache.recv_buf_caps[node] = rlength; + } + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + int self_len = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[myrank] = self_len; + if (self_len > 0) + { + if (self_len > cache.recv_buf_caps[myrank]) + { + if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank]; + cache.recv_bufs[myrank] = new double[self_len]; + cache.recv_buf_caps[myrank] = self_len; + } + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + cache.send_lengths[node] = slength; + if (slength > 0) + { + if (slength > cache.send_buf_caps[node]) + { + if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; + cache.send_bufs[node] = new double[slength]; + cache.send_buf_caps[node] = slength; + } + data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node_i = req_node[idx]; + data_packer(cache.recv_bufs[recv_node_i], src[recv_node_i], dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); + + if (self_len > 0) + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + delete[] req_node; + delete[] req_is_recv; + delete[] completed; +} void Parallel::Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache) { if (!cache.valid) @@ -4669,6 +4668,11 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr int cpusize = cache.cpusize; state.req_no = 0; state.active = true; + state.pending_recv = 0; + // Allocate tracking arrays + delete[] state.req_node; delete[] state.req_is_recv; + state.req_node = new int[cache.max_reqs]; + state.req_is_recv = new int[cache.max_reqs]; MyList **src = cache.combined_src; MyList **dst = cache.combined_dst; @@ -4713,6 +4717,8 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr cache.send_buf_caps[node] = slength; } data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry); + state.req_node[state.req_no] = node; + state.req_is_recv[state.req_no] = 0; MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); } int rlength; @@ -4730,29 +4736,60 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr cache.recv_bufs[node] = new double[rlength]; cache.recv_buf_caps[node] = rlength; } + state.req_node[state.req_no] = node; + state.req_is_recv[state.req_no] = 1; + state.pending_recv++; MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++); } } } cache.lengths_valid = true; } -// Sync_finish: wait for async MPI operations and unpack +// Sync_finish: progressive unpack as receives complete, then wait for sends void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state, MyList *VarList, int Symmetry) { if (!state.active) return; - MPI_Waitall(state.req_no, cache.reqs, cache.stats); - - int cpusize = cache.cpusize; + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MyList **src = cache.combined_src; MyList **dst = cache.combined_dst; - for (int node = 0; node < cpusize; node++) - if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0) - data_packer(cache.recv_bufs[node], src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry); + // Unpack local data first (no MPI needed) + if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0) + data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList, VarList, Symmetry); + // Progressive unpack of remote receives + if (state.pending_recv > 0 && state.req_no > 0) + { + int pending = state.pending_recv; + int *completed = new int[cache.max_reqs]; + while (pending > 0) + { + int outcount = 0; + MPI_Waitsome(state.req_no, cache.reqs, &outcount, completed, cache.stats); + if (outcount == MPI_UNDEFINED) break; + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && state.req_is_recv[idx]) + { + int recv_node = state.req_node[idx]; + data_packer(cache.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList, VarList, Symmetry); + pending--; + } + } + } + delete[] completed; + } + + // Wait for remaining sends + if (state.req_no > 0) MPI_Waitall(state.req_no, cache.reqs, cache.stats); + + delete[] state.req_node; state.req_node = 0; + delete[] state.req_is_recv; state.req_is_recv = 0; state.active = false; } // collect buffer grid segments or blocks for the periodic boundary condition of given patch @@ -5883,9 +5920,9 @@ void Parallel::OutBdLow2Hi_cached(MyList *PatcL, MyList *PatfL, } // OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking -void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, - MyList *VarList1, MyList *VarList2, - int Symmetry, SyncCache &cache) +void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, + MyList *VarList1, MyList *VarList2, + int Symmetry, SyncCache &cache) { if (!cache.valid) { @@ -5931,100 +5968,100 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, MPI_Comm_rank(MPI_COMM_WORLD, &myrank); int cpusize = cache.cpusize; - int req_no = 0; - int pending_recv = 0; - int *req_node = new int[cache.max_reqs]; - int *req_is_recv = new int[cache.max_reqs]; - int *completed = new int[cache.max_reqs]; - - // Post receives first so peers can progress rendezvous early. - for (int node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[node] = rlength; - if (rlength > 0) - { - if (rlength > cache.recv_buf_caps[node]) - { - if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; - cache.recv_bufs[node] = new double[rlength]; - cache.recv_buf_caps[node] = rlength; - } - MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 1; - req_no++; - pending_recv++; - } - } - - // Local transfer on this rank. - int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - cache.recv_lengths[myrank] = self_len; - if (self_len > 0) - { - if (self_len > cache.recv_buf_caps[myrank]) - { - if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank]; - cache.recv_bufs[myrank] = new double[self_len]; - cache.recv_buf_caps[myrank] = self_len; - } - data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); - } - - // Pack and post sends. - for (int node = 0; node < cpusize; node++) - { - if (node == myrank) continue; - - int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - cache.send_lengths[node] = slength; - if (slength > 0) - { - if (slength > cache.send_buf_caps[node]) - { - if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; - cache.send_bufs[node] = new double[slength]; - cache.send_buf_caps[node] = slength; - } - data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); - req_node[req_no] = node; - req_is_recv[req_no] = 0; - req_no++; - } - } - - // Unpack as soon as receive completes to reduce pure wait time. - while (pending_recv > 0) - { - int outcount = 0; - MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); - if (outcount == MPI_UNDEFINED) break; - - for (int i = 0; i < outcount; i++) - { - int idx = completed[i]; - if (idx >= 0 && req_is_recv[idx]) - { - int recv_node_i = req_node[idx]; - data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); - pending_recv--; - } - } - } - - if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); - - if (self_len > 0) - data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); - - delete[] req_node; - delete[] req_is_recv; - delete[] completed; -} + int req_no = 0; + int pending_recv = 0; + int *req_node = new int[cache.max_reqs]; + int *req_is_recv = new int[cache.max_reqs]; + int *completed = new int[cache.max_reqs]; + + // Post receives first so peers can progress rendezvous early. + for (int node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[node] = rlength; + if (rlength > 0) + { + if (rlength > cache.recv_buf_caps[node]) + { + if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node]; + cache.recv_bufs[node] = new double[rlength]; + cache.recv_buf_caps[node] = rlength; + } + MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 1; + req_no++; + pending_recv++; + } + } + + // Local transfer on this rank. + int self_len = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + cache.recv_lengths[myrank] = self_len; + if (self_len > 0) + { + if (self_len > cache.recv_buf_caps[myrank]) + { + if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank]; + cache.recv_bufs[myrank] = new double[self_len]; + cache.recv_buf_caps[myrank] = self_len; + } + data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); + } + + // Pack and post sends. + for (int node = 0; node < cpusize; node++) + { + if (node == myrank) continue; + + int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + cache.send_lengths[node] = slength; + if (slength > 0) + { + if (slength > cache.send_buf_caps[node]) + { + if (cache.send_bufs[node]) delete[] cache.send_bufs[node]; + cache.send_bufs[node] = new double[slength]; + cache.send_buf_caps[node] = slength; + } + data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no); + req_node[req_no] = node; + req_is_recv[req_no] = 0; + req_no++; + } + } + + // Unpack as soon as receive completes to reduce pure wait time. + while (pending_recv > 0) + { + int outcount = 0; + MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats); + if (outcount == MPI_UNDEFINED) break; + + for (int i = 0; i < outcount; i++) + { + int idx = completed[i]; + if (idx >= 0 && req_is_recv[idx]) + { + int recv_node_i = req_node[idx]; + data_packermix(cache.recv_bufs[recv_node_i], cache.combined_src[recv_node_i], cache.combined_dst[recv_node_i], recv_node_i, UNPACK, VarList1, VarList2, Symmetry); + pending_recv--; + } + } + } + + if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats); + + if (self_len > 0) + data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); + + delete[] req_node; + delete[] req_is_recv; + delete[] completed; +} // collect all buffer grid segments or blocks for given patch MyList *Parallel::build_buffer_gsl(Patch *Pat) diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h index e17f365..5a72797 100644 --- a/AMSS_NCKU_source/Parallel.h +++ b/AMSS_NCKU_source/Parallel.h @@ -121,7 +121,10 @@ namespace Parallel struct AsyncSyncState { int req_no; bool active; - AsyncSyncState() : req_no(0), active(false) {} + int *req_node; + int *req_is_recv; + int pending_recv; + AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {} }; void Sync_start(MyList *PatL, MyList *VarList, int Symmetry, diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index eb84f8e..432747e 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -736,6 +736,8 @@ void bssn_class::Initialize() sync_cache_cor = new Parallel::SyncCache[GH->levels]; sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels]; sync_cache_rp_fine = new Parallel::SyncCache[GH->levels]; + sync_cache_restrict = new Parallel::SyncCache[GH->levels]; + sync_cache_outbd = new Parallel::SyncCache[GH->levels]; } //================================================================================================ @@ -2213,7 +2215,7 @@ void bssn_class::Evolve(int Steps) GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2)) @@ -2429,7 +2431,7 @@ void bssn_class::RecursiveStep(int lev) if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -2608,7 +2610,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } @@ -2775,7 +2777,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } // a_stream.clear(); // a_stream.str(""); @@ -2790,7 +2792,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } // a_stream.clear(); // a_stream.str(""); @@ -2809,7 +2811,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } // a_stream.clear(); // a_stream.str(""); @@ -2825,7 +2827,7 @@ void bssn_class::ParallelStep() if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } // a_stream.clear(); // a_stream.str(""); @@ -5796,7 +5798,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry); + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]); #elif (RPB == 1) // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); @@ -5820,7 +5822,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); #endif @@ -5847,7 +5849,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]); #elif (RPB == 1) // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); @@ -5871,7 +5873,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); #endif @@ -5940,7 +5942,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, } #if (RPB == 0) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry); + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]); #elif (RPB == 1) // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); @@ -5950,7 +5952,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); #endif @@ -5962,7 +5964,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, else // no time refinement levels and for all same time levels { #if (RPB == 0) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]); #elif (RPB == 1) // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); @@ -5972,7 +5974,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); #endif @@ -6027,7 +6029,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) } #if (RPB == 0) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry); + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]); #elif (RPB == 1) // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,SynchList_pre,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry); @@ -6037,7 +6039,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); #endif @@ -6051,7 +6053,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) if (myrank == 0) cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl; #if (RPB == 0) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]); #elif (RPB == 1) // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,StateList,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry); @@ -6061,7 +6063,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); #endif @@ -6102,7 +6104,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB) #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); #endif @@ -6115,7 +6117,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB) { #if (RPB == 0) #if (MIXOUTB == 0) - Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); #endif diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h index db434e2..5a8eb2a 100644 --- a/AMSS_NCKU_source/bssn_class.h +++ b/AMSS_NCKU_source/bssn_class.h @@ -130,6 +130,8 @@ public: Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1] Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev] + Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong + Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ConVMonitor; From 5070134857bed9940c9ab0a3cb2b51c8506c6358 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Mon, 2 Mar 2026 21:14:35 +0800 Subject: [PATCH 13/13] =?UTF-8?q?perf(transfer=5Fcached):=20=E5=B0=86=20pe?= =?UTF-8?q?r-call=20new/delete=20=E7=9A=84=20req=5Fnode/req=5Fis=5Frecv/co?= =?UTF-8?q?mpleted=20=E6=95=B0=E7=BB=84=E7=A7=BB=E5=85=A5=20SyncCache=20?= =?UTF-8?q?=E5=A4=8D=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 避免 transfer_cached 每次调用分配释放 3 个临时数组,减少堆操作开销。 Co-Authored-By: Claude Opus 4.6 (1M context) --- AMSS_NCKU_source/Parallel.C | 31 +++++++++++++++++++++++-------- AMSS_NCKU_source/Parallel.h | 3 +++ 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 4e5e4ec..b87ce6d 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -4320,7 +4320,7 @@ Parallel::SyncCache::SyncCache() : valid(false), cpusize(0), combined_src(0), combined_dst(0), send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0), send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0), - lengths_valid(false) + lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0) { } // SyncCache invalidate: free grid segment lists but keep buffers @@ -4359,11 +4359,15 @@ void Parallel::SyncCache::destroy() if (recv_bufs) delete[] recv_bufs; if (reqs) delete[] reqs; if (stats) delete[] stats; + if (tc_req_node) delete[] tc_req_node; + if (tc_req_is_recv) delete[] tc_req_is_recv; + if (tc_completed) delete[] tc_completed; combined_src = combined_dst = 0; send_lengths = recv_lengths = 0; send_buf_caps = recv_buf_caps = 0; send_bufs = recv_bufs = 0; reqs = 0; stats = 0; + tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0; cpusize = 0; max_reqs = 0; } // transfer_cached: reuse pre-allocated buffers from SyncCache @@ -4379,9 +4383,9 @@ void Parallel::transfer_cached(MyList **src, MyList **src, MyList 0) data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); - - delete[] req_node; - delete[] req_is_recv; - delete[] completed; } void Parallel::Sync_cached(MyList *PatL, MyList *VarList, int Symmetry, SyncCache &cache) { @@ -4498,6 +4498,9 @@ void Parallel::Sync_cached(MyList *PatL, MyList *VarList, int Symmet cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; } for (int node = 0; node < cpusize; node++) @@ -4598,6 +4601,9 @@ void Parallel::Sync_start(MyList *PatL, MyList *VarList, int Symmetr cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; } for (int node = 0; node < cpusize; node++) @@ -5856,6 +5862,9 @@ void Parallel::Restrict_cached(MyList *PatcL, MyList *PatfL, cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; } MyList *dst = build_complete_gsl(PatcL); @@ -5902,6 +5911,9 @@ void Parallel::OutBdLow2Hi_cached(MyList *PatcL, MyList *PatfL, cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; } MyList *dst = build_buffer_gsl(PatfL); @@ -5948,6 +5960,9 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, cache.max_reqs = 2 * cpusize; cache.reqs = new MPI_Request[cache.max_reqs]; cache.stats = new MPI_Status[cache.max_reqs]; + cache.tc_req_node = new int[cache.max_reqs]; + cache.tc_req_is_recv = new int[cache.max_reqs]; + cache.tc_completed = new int[cache.max_reqs]; } MyList *dst = build_buffer_gsl(PatfL); diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h index 5a72797..0ab975c 100644 --- a/AMSS_NCKU_source/Parallel.h +++ b/AMSS_NCKU_source/Parallel.h @@ -108,6 +108,9 @@ namespace Parallel MPI_Status *stats; int max_reqs; bool lengths_valid; + int *tc_req_node; + int *tc_req_is_recv; + int *tc_completed; SyncCache(); void invalidate(); void destroy();