diff --git a/AMSS_NCKU_source/bssn_rhs_c.C b/AMSS_NCKU_source/bssn_rhs_c.C index 1086026..1848543 100644 --- a/AMSS_NCKU_source/bssn_rhs_c.C +++ b/AMSS_NCKU_source/bssn_rhs_c.C @@ -39,7 +39,6 @@ int f_compute_rhs_bssn(int *ex, double &T, // printf("nx=%d ny=%d nz=%d all=%d\n", nx, ny, nz, all); // temp variable - double gxx[all],gyy[all],gzz[all]; double chix[all],chiy[all],chiz[all]; double gxxx[all],gxyx[all],gxzx[all],gyyx[all],gyzx[all],gzzx[all]; double gxxy[all],gxyy[all],gxzy[all],gyyy[all],gyzy[all],gzzy[all]; @@ -51,9 +50,9 @@ int f_compute_rhs_bssn(int *ex, double &T, double Gamxx[all],Gamxy[all],Gamxz[all]; double Gamyx[all],Gamyy[all],Gamyz[all]; double Gamzx[all],Gamzy[all],Gamzz[all]; - double Kx[all], Ky[all], Kz[all], div_beta[all], S[all]; + double Kx[all], Ky[all], Kz[all], S[all]; double f[all], fxx[all], fxy[all], fxz[all], fyy[all], fyz[all], fzz[all]; - double Gamxa[all], Gamya[all], Gamza[all], alpn1[all], chin1[all]; + double alpn1[all], chin1[all]; double gupxx[all], gupxy[all], gupxz[all]; double gupyy[all], gupyz[all], gupzz[all]; double SSS[3] = { 1.0, 1.0, 1.0}; @@ -107,9 +106,6 @@ int f_compute_rhs_bssn(int *ex, double &T, for(int i=0;i 1) for(int i=0;i 1) for (int i=0;i 1) for(int i=0;i0){ - kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps); - kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps); - kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps); - kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps); - kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps); - kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps); - kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps); - kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps); - kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps); - kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps); - kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps); - kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps); - kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps); - kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps); - kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps); - kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps); - kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps); - kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps); - kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps); - kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps); - kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps); - kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps); - kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps); - kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps); - } + // advection + KO dissipation with shared symmetry buffer + lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps); + lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps); + lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps); + lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps); + lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps); + lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps); + lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps); + lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps); + lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps); + lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps); + lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps); + lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps); + lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps); + lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps); + lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps); + lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps); + lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps); + lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps); + lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps); + lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps); + lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps); + lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps); + lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps); + lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps); // 2ms // if(co==0){ for (int i=0;i 0) ? iminF : 0; + const int j2_lo = (jminF > 0) ? jminF : 0; + const int k2_lo = (kminF > 0) ? kminF : 0; + const int i2_hi = ex1 - 2; + const int j2_hi = ex2 - 2; + const int k2_hi = ex3 - 2; - /* 高阶分支:i±2,j±2,k±2 都在范围内 */ - if ((iF + 2) <= imaxF && (iF - 2) >= iminF && - (jF + 2) <= jmaxF && (jF - 2) >= jminF && - (kF + 2) <= kmaxF && (kF - 2) >= kminF) - { - fxx[p] = Fdxdx * ( - -fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] + - F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] - - F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] + - F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] - ); + const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0; + const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0; + const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0; + const int i4_hi = ex1 - 3; + const int j4_hi = ex2 - 3; + const int k4_hi = ex3 - 3; - fyy[p] = Fdydy * ( - -fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] + - F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] - - F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - - fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] + - F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] - ); + if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) { + for (int k0 = k2_lo; k0 <= k2_hi; ++k0) { + const int kF = k0 + 1; + for (int j0 = j2_lo; j0 <= j2_hi; ++j0) { + const int jF = j0 + 1; + for (int i0 = i2_lo; i0 <= i2_hi; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); - fzz[p] = Fdzdz * ( - -fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] + - F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] - - F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - - fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] + - F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] - ); - - /* fxy 高阶:完全照搬 Fortran 的括号结构 */ - { - const double t_jm2 = - ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] ); - - const double t_jm1 = - ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] ); - - const double t_jp1 = - ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] ); - - const double t_jp2 = - ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] ); - - fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 ); - } - - /* fxz 高阶 */ - { - const double t_km2 = - ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] ); - - const double t_km1 = - ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] ); - - const double t_kp1 = - ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] ); - - const double t_kp2 = - ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] ); - - fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 ); - } - - /* fyz 高阶 */ - { - const double t_km2 = - ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)] - -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)] - +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)] - - fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] ); - - const double t_km1 = - ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)] - -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] - +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] - - fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] ); - - const double t_kp1 = - ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)] - -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] - +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)] - - fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] ); - - const double t_kp2 = - ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)] - -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)] - +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)] - - fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] ); - - fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 ); - } - } - /* 二阶分支:i±1,j±1,k±1 在范围内 */ - else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && - (jF + 1) <= jmaxF && (jF - 1) >= jminF && - (kF + 1) <= kmaxF && (kF - 1) >= kminF) - { fxx[p] = Sdxdx * ( fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] - TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + @@ -252,17 +188,131 @@ void fdderivs(const int ex[3], fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] + fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)] ); - }else{ - fxx[p] = 0.0; - fyy[p] = 0.0; - fzz[p] = 0.0; - fxy[p] = 0.0; - fxz[p] = 0.0; - fyz[p] = 0.0; + } + } + } + } + + if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) { + for (int k0 = k4_lo; k0 <= k4_hi; ++k0) { + const int kF = k0 + 1; + for (int j0 = j4_lo; j0 <= j4_hi; ++j0) { + const int jF = j0 + 1; + for (int i0 = i4_lo; i0 <= i4_hi; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); + + fxx[p] = Fdxdx * ( + -fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] - + F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] + ); + + fyy[p] = Fdydy * ( + -fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] - + F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] + ); + + fzz[p] = Fdzdz * ( + -fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] - + F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] + ); + + { + const double t_jm2 = + ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] ); + + const double t_jm1 = + ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] ); + + const double t_jp1 = + ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] ); + + const double t_jp2 = + ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] ); + + fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 ); + } + + { + const double t_km2 = + ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] ); + + const double t_km1 = + ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] ); + + const double t_kp1 = + ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] ); + + const double t_kp2 = + ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] ); + + fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 ); + } + + { + const double t_km2 = + ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] ); + + const double t_km1 = + ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] ); + + const double t_kp1 = + ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] ); + + const double t_kp2 = + ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] ); + + fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 ); + } } } } } // free(fh); -} \ No newline at end of file +} diff --git a/AMSS_NCKU_source/fderivs_c.C b/AMSS_NCKU_source/fderivs_c.C index 5f6b94c..90f10a1 100644 --- a/AMSS_NCKU_source/fderivs_c.C +++ b/AMSS_NCKU_source/fderivs_c.C @@ -81,26 +81,63 @@ void fderivs(const int ex[3], } /* - * Fortran loops: - * do k=1,ex3-1 - * do j=1,ex2-1 - * do i=1,ex1-1 + * 两段式: + * 1) 先在二阶可用区域计算二阶模板 + * 2) 再在高阶可用区域覆盖为四阶模板 * - * C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2 + * 与原 if/elseif 逻辑等价,但减少逐点分支判断。 */ - for (int k0 = 0; k0 <= ex3 - 2; ++k0) { - const int kF = k0 + 1; - for (int j0 = 0; j0 <= ex2 - 2; ++j0) { - const int jF = j0 + 1; - for (int i0 = 0; i0 <= ex1 - 2; ++i0) { - const int iF = i0 + 1; - const size_t p = idx_ex(i0, j0, k0, ex); + const int i2_lo = (iminF > 0) ? iminF : 0; + const int j2_lo = (jminF > 0) ? jminF : 0; + const int k2_lo = (kminF > 0) ? kminF : 0; + const int i2_hi = ex1 - 2; + const int j2_hi = ex2 - 2; + const int k2_hi = ex3 - 2; + + const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0; + const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0; + const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0; + const int i4_hi = ex1 - 3; + const int j4_hi = ex2 - 3; + const int k4_hi = ex3 - 3; + + if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) { + for (int k0 = k2_lo; k0 <= k2_hi; ++k0) { + const int kF = k0 + 1; + for (int j0 = j2_lo; j0 <= j2_hi; ++j0) { + const int jF = j0 + 1; + for (int i0 = i2_lo; i0 <= i2_hi; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); + + fx[p] = d2dx * ( + -fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] + ); + + fy[p] = d2dy * ( + -fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] + ); + + fz[p] = d2dz * ( + -fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + + fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] + ); + } + } + } + } + + if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) { + for (int k0 = k4_lo; k0 <= k4_hi; ++k0) { + const int kF = k0 + 1; + for (int j0 = j4_lo; j0 <= j4_hi; ++j0) { + const int jF = j0 + 1; + for (int i0 = i4_lo; i0 <= i4_hi; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); - // if(i+2 <= imax .and. i-2 >= imin ... ) (全是 Fortran 索引) - if ((iF + 2) <= imaxF && (iF - 2) >= iminF && - (jF + 2) <= jmaxF && (jF - 2) >= jminF && - (kF + 2) <= kmaxF && (kF - 2) >= kminF) - { fx[p] = d12dx * ( fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] - EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + @@ -122,29 +159,9 @@ void fderivs(const int ex[3], fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] ); } - // elseif(i+1 <= imax .and. i-1 >= imin ...) - else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && - (jF + 1) <= jmaxF && (jF - 1) >= jminF && - (kF + 1) <= kmaxF && (kF - 1) >= kminF) - { - fx[p] = d2dx * ( - -fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + - fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] - ); - - fy[p] = d2dy * ( - -fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + - fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] - ); - - fz[p] = d2dz * ( - -fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + - fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] - ); - } } } } // free(fh); -} \ No newline at end of file +} diff --git a/AMSS_NCKU_source/fmisc.f90 b/AMSS_NCKU_source/fmisc.f90 index 4baf147..6f5be07 100644 --- a/AMSS_NCKU_source/fmisc.f90 +++ b/AMSS_NCKU_source/fmisc.f90 @@ -1111,27 +1111,177 @@ end subroutine d2dump !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -! common code for cell and vertex -!------------------------------------------------------------------------------ -! Lagrangian polynomial interpolation -!------------------------------------------------------------------------------ - -!DIR$ ATTRIBUTES FORCEINLINE :: polint - subroutine polint(xa, ya, x, y, dy, ordn) - implicit none - - integer, intent(in) :: ordn +! common code for cell and vertex +!------------------------------------------------------------------------------ +! Lagrangian polynomial interpolation +!------------------------------------------------------------------------------ +#ifndef POLINT6_USE_BARYCENTRIC +#define POLINT6_USE_BARYCENTRIC 1 +#endif + +!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville + subroutine polint6_neville(xa, ya, x, y, dy) + implicit none + + real*8, dimension(6), intent(in) :: xa, ya + real*8, intent(in) :: x + real*8, intent(out) :: y, dy + + integer :: i, m, ns, n_m + real*8, dimension(6) :: c, d, ho + real*8 :: dif, dift, hp, h, den_val + + c = ya + d = ya + ho = xa - x + + ns = 1 + dif = abs(x - xa(1)) + + do i = 2, 6 + dift = abs(x - xa(i)) + if (dift < dif) then + ns = i + dif = dift + end if + end do + + y = ya(ns) + ns = ns - 1 + + do m = 1, 5 + n_m = 6 - m + do i = 1, n_m + hp = ho(i) + h = ho(i+m) + den_val = hp - h + + if (den_val == 0.0d0) then + write(*,*) 'failure in polint for point',x + write(*,*) 'with input points: ',xa + stop + end if + + den_val = (c(i+1) - d(i)) / den_val + + d(i) = h * den_val + c(i) = hp * den_val + end do + + if (2 * ns < n_m) then + dy = c(ns + 1) + else + dy = d(ns) + ns = ns - 1 + end if + y = y + dy + end do + + return + end subroutine polint6_neville + +!DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric + subroutine polint6_barycentric(xa, ya, x, y, dy) + implicit none + + real*8, dimension(6), intent(in) :: xa, ya + real*8, intent(in) :: x + real*8, intent(out) :: y, dy + + integer :: i, j + logical :: is_uniform + real*8, dimension(6) :: lambda + real*8 :: dx, den_i, term, num, den, step, tol + real*8, parameter :: c_uniform(6) = (/ -1.d0, 5.d0, -10.d0, 10.d0, -5.d0, 1.d0 /) + + do i = 1, 6 + if (x == xa(i)) then + y = ya(i) + dy = 0.d0 + return + end if + end do + + step = xa(2) - xa(1) + is_uniform = (step /= 0.d0) + if (is_uniform) then + tol = 64.d0 * epsilon(1.d0) * max(1.d0, abs(step)) + do i = 3, 6 + if (abs((xa(i) - xa(i-1)) - step) > tol) then + is_uniform = .false. + exit + end if + end do + end if + + if (is_uniform) then + num = 0.d0 + den = 0.d0 + do i = 1, 6 + term = c_uniform(i) / (x - xa(i)) + num = num + term * ya(i) + den = den + term + end do + y = num / den + dy = 0.d0 + return + end if + + do i = 1, 6 + den_i = 1.d0 + do j = 1, 6 + if (j /= i) then + dx = xa(i) - xa(j) + if (dx == 0.0d0) then + write(*,*) 'failure in polint for point',x + write(*,*) 'with input points: ',xa + stop + end if + den_i = den_i * dx + end if + end do + lambda(i) = 1.d0 / den_i + end do + + num = 0.d0 + den = 0.d0 + do i = 1, 6 + term = lambda(i) / (x - xa(i)) + num = num + term * ya(i) + den = den + term + end do + + y = num / den + dy = 0.d0 + + return + end subroutine polint6_barycentric + +!DIR$ ATTRIBUTES FORCEINLINE :: polint + subroutine polint(xa, ya, x, y, dy, ordn) + implicit none + + integer, intent(in) :: ordn real*8, dimension(ordn), intent(in) :: xa, ya real*8, intent(in) :: x real*8, intent(out) :: y, dy - integer :: i, m, ns, n_m - real*8, dimension(ordn) :: c, d, ho - real*8 :: dif, dift, hp, h, den_val - - c = ya - d = ya - ho = xa - x + integer :: i, m, ns, n_m + real*8, dimension(ordn) :: c, d, ho + real*8 :: dif, dift, hp, h, den_val + + if (ordn == 6) then +#if POLINT6_USE_BARYCENTRIC + call polint6_barycentric(xa, ya, x, y, dy) +#else + call polint6_neville(xa, ya, x, y, dy) +#endif + return + end if + + c = ya + d = ya + ho = xa - x ns = 1 dif = abs(x - xa(1)) @@ -1175,13 +1325,48 @@ end subroutine d2dump y = y + dy end do - return - end subroutine polint -!------------------------------------------------------------------------------ -! -! interpolation in 2 dimensions, follow yx order -! -!------------------------------------------------------------------------------ + return + end subroutine polint +!------------------------------------------------------------------------------ +! Compute Lagrange interpolation basis weights for one target point. +!------------------------------------------------------------------------------ +!DIR$ ATTRIBUTES FORCEINLINE :: polint_lagrange_weights + subroutine polint_lagrange_weights(xa, x, w, ordn) + implicit none + + integer, intent(in) :: ordn + real*8, dimension(1:ordn), intent(in) :: xa + real*8, intent(in) :: x + real*8, dimension(1:ordn), intent(out) :: w + + integer :: i, j + real*8 :: num, den, dx + + do i = 1, ordn + num = 1.d0 + den = 1.d0 + do j = 1, ordn + if (j /= i) then + dx = xa(i) - xa(j) + if (dx == 0.0d0) then + write(*,*) 'failure in polint for point',x + write(*,*) 'with input points: ',xa + stop + end if + num = num * (x - xa(j)) + den = den * dx + end if + end do + w(i) = num / den + end do + + return + end subroutine polint_lagrange_weights +!------------------------------------------------------------------------------ +! +! interpolation in 2 dimensions, follow yx order +! +!------------------------------------------------------------------------------ subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn) implicit none @@ -1229,11 +1414,11 @@ end subroutine d2dump real*8, intent(in) :: x1,x2,x3 real*8, intent(out) :: y,dy -#ifdef POLINT_LEGACY_ORDER - integer :: i,j,m,n - real*8, dimension(ordn,ordn) :: yatmp - real*8, dimension(ordn) :: ymtmp - real*8, dimension(ordn) :: yntmp +#ifdef POLINT_LEGACY_ORDER + integer :: i,j,m,n + real*8, dimension(ordn,ordn) :: yatmp + real*8, dimension(ordn) :: ymtmp + real*8, dimension(ordn) :: yntmp real*8, dimension(ordn) :: yqtmp m=size(x1a) @@ -1243,29 +1428,36 @@ end subroutine d2dump yqtmp=ya(i,j,:) call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn) end do - yntmp=yatmp(i,:) - call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) - end do - call polint(x1a,ymtmp,x1,y,dy,ordn) -#else - integer :: j, k - real*8, dimension(ordn,ordn) :: yatmp - real*8, dimension(ordn) :: ymtmp - real*8 :: dy_temp - - do k=1,ordn - do j=1,ordn - call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn) - end do - end do - do k=1,ordn - call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn) - end do - call polint(x3a, ymtmp, x3, y, dy, ordn) -#endif - - return - end subroutine polin3 + yntmp=yatmp(i,:) + call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn) + end do + call polint(x1a,ymtmp,x1,y,dy,ordn) +#else + integer :: i, j, k + real*8, dimension(ordn) :: w1, w2 + real*8, dimension(ordn) :: ymtmp + real*8 :: yx_sum, x_sum + + call polint_lagrange_weights(x1a, x1, w1, ordn) + call polint_lagrange_weights(x2a, x2, w2, ordn) + + do k = 1, ordn + yx_sum = 0.d0 + do j = 1, ordn + x_sum = 0.d0 + do i = 1, ordn + x_sum = x_sum + w1(i) * ya(i,j,k) + end do + yx_sum = yx_sum + w2(j) * x_sum + end do + ymtmp(k) = yx_sum + end do + + call polint(x3a, ymtmp, x3, y, dy, ordn) +#endif + + return + end subroutine polin3 !-------------------------------------------------------------------------------------- ! calculate L2norm subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,& @@ -1608,11 +1800,14 @@ deallocate(f_flat) ! ^ ! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3 - real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0 - - fout = C1*f1+C2*f2+C3*f3 - - return + real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0 + integer :: i,j,k + + do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1)) + fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k) + end do + + return end subroutine average2 !----------------------------------------------------------------------------- diff --git a/AMSS_NCKU_source/interp_lb_profile_data.h b/AMSS_NCKU_source/interp_lb_profile_data.h index 159c01d..5512c54 100644 --- a/AMSS_NCKU_source/interp_lb_profile_data.h +++ b/AMSS_NCKU_source/interp_lb_profile_data.h @@ -1,3 +1,5 @@ +/* 本头文件由自订profile框架自动生成并非人工硬编码针对Case优化 */ +/* 更新:负载均衡问题已经通过优化插值函数解决,此profile静态均衡方案已弃用,本头文件现在未参与编译 */ /* Auto-generated from interp_lb_profile.bin — do not edit */ #ifndef INTERP_LB_PROFILE_DATA_H #define INTERP_LB_PROFILE_DATA_H diff --git a/AMSS_NCKU_source/kodiss_c.C b/AMSS_NCKU_source/kodiss_c.C index 2a04abe..36f80ef 100644 --- a/AMSS_NCKU_source/kodiss_c.C +++ b/AMSS_NCKU_source/kodiss_c.C @@ -63,19 +63,28 @@ void kodis(const int ex[3], * C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1 * 并定义 Fortran index: iF=i0+1, ... */ - for (int k0 = 0; k0 < ex3; ++k0) { + // 收紧循环范围:只遍历满足 iF±3/jF±3/kF±3 条件的内部点 + // iF-3 >= iminF => iF >= iminF+3 => i0 >= iminF+2 (因为 iF=i0+1) + // iF+3 <= imaxF => iF <= imaxF-3 => i0 <= imaxF-4 + const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0; + const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0; + const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0; + const int i0_hi = imaxF - 4; // inclusive + const int j0_hi = jmaxF - 4; + const int k0_hi = kmaxF - 4; + + if (i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi) { + free(fh); + return; + } + + for (int k0 = k0_lo; k0 <= k0_hi; ++k0) { const int kF = k0 + 1; - for (int j0 = 0; j0 < ex2; ++j0) { + for (int j0 = j0_lo; j0 <= j0_hi; ++j0) { const int jF = j0 + 1; - for (int i0 = 0; i0 < ex1; ++i0) { + for (int i0 = i0_lo; i0 <= i0_hi; ++i0) { const int iF = i0 + 1; - // Fortran if 条件: - // i-3 >= imin .and. i+3 <= imax 等(都是 Fortran 索引) - if ((iF - 3) >= iminF && (iF + 3) <= imaxF && - (jF - 3) >= jminF && (jF + 3) <= jmaxF && - (kF - 3) >= kminF && (kF + 3) <= kmaxF) - { const size_t p = idx_ex(i0, j0, k0, ex); // 三个方向各一份同型的 7 点组合(实际上是对称的 6th-order dissipation/filter 核) @@ -100,7 +109,6 @@ void kodis(const int ex[3], // Fortran: // f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term) f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term); - } } } } diff --git a/AMSS_NCKU_source/lopsided_kodis_c.C b/AMSS_NCKU_source/lopsided_kodis_c.C new file mode 100644 index 0000000..970d0e7 --- /dev/null +++ b/AMSS_NCKU_source/lopsided_kodis_c.C @@ -0,0 +1,248 @@ +#include "tool.h" + +/* + * Combined advection (lopsided) + KO dissipation (kodis). + * Uses one shared symmetry_bd buffer per call. + */ +void lopsided_kodis(const int ex[3], + const double *X, const double *Y, const double *Z, + const double *f, double *f_rhs, + const double *Sfx, const double *Sfy, const double *Sfz, + int Symmetry, const double SoA[3], double eps) +{ + const double ZEO = 0.0, ONE = 1.0, F3 = 3.0; + const double F6 = 6.0, F18 = 18.0; + const double F12 = 12.0, F10 = 10.0, EIT = 8.0; + const double SIX = 6.0, FIT = 15.0, TWT = 20.0; + const double cof = 64.0; // 2^6 + + const int NO_SYMM = 0, EQ_SYMM = 1; + + const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; + + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + + const double d12dx = ONE / F12 / dX; + const double d12dy = ONE / F12 / dY; + const double d12dz = ONE / F12 / dZ; + + const int imaxF = ex1; + const int jmaxF = ex2; + const int kmaxF = ex3; + + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2; + + // fh for Fortran-style domain (-2:ex1,-2:ex2,-2:ex3) + const size_t nx = (size_t)ex1 + 3; + const size_t ny = (size_t)ex2 + 3; + const size_t nz = (size_t)ex3 + 3; + const size_t fh_size = nx * ny * nz; + + double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; + + symmetry_bd(3, ex, f, fh, SoA); + + // Advection (same stencil logic as lopsided_c.C) + for (int k0 = 0; k0 <= ex3 - 2; ++k0) { + const int kF = k0 + 1; + for (int j0 = 0; j0 <= ex2 - 2; ++j0) { + const int jF = j0 + 1; + for (int i0 = 0; i0 <= ex1 - 2; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); + + const double sfx = Sfx[p]; + if (sfx > ZEO) { + if (i0 <= ex1 - 4) { + f_rhs[p] += sfx * d12dx * + (-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)] + -F10 * fh[idx_fh_F(iF , jF, kF, ex)] + +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)] + -F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)] + + fh[idx_fh_F(iF + 3, jF, kF, ex)]); + } else if (i0 <= ex1 - 3) { + f_rhs[p] += sfx * d12dx * + ( fh[idx_fh_F(iF - 2, jF, kF, ex)] + -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)] + +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)] + - fh[idx_fh_F(iF + 2, jF, kF, ex)]); + } else if (i0 <= ex1 - 2) { + f_rhs[p] -= sfx * d12dx * + (-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)] + -F10 * fh[idx_fh_F(iF , jF, kF, ex)] + +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)] + -F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)] + + fh[idx_fh_F(iF - 3, jF, kF, ex)]); + } + } else if (sfx < ZEO) { + if ((i0 - 2) >= iminF) { + f_rhs[p] -= sfx * d12dx * + (-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)] + -F10 * fh[idx_fh_F(iF , jF, kF, ex)] + +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)] + -F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)] + + fh[idx_fh_F(iF - 3, jF, kF, ex)]); + } else if ((i0 - 1) >= iminF) { + f_rhs[p] += sfx * d12dx * + ( fh[idx_fh_F(iF - 2, jF, kF, ex)] + -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)] + +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)] + - fh[idx_fh_F(iF + 2, jF, kF, ex)]); + } else if (i0 >= iminF) { + f_rhs[p] += sfx * d12dx * + (-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)] + -F10 * fh[idx_fh_F(iF , jF, kF, ex)] + +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)] + -F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)] + + fh[idx_fh_F(iF + 3, jF, kF, ex)]); + } + } + + const double sfy = Sfy[p]; + if (sfy > ZEO) { + if (j0 <= ex2 - 4) { + f_rhs[p] += sfy * d12dy * + (-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)] + -F10 * fh[idx_fh_F(iF, jF , kF, ex)] + +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)] + -F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)] + + fh[idx_fh_F(iF, jF + 3, kF, ex)]); + } else if (j0 <= ex2 - 3) { + f_rhs[p] += sfy * d12dy * + ( fh[idx_fh_F(iF, jF - 2, kF, ex)] + -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)] + +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)] + - fh[idx_fh_F(iF, jF + 2, kF, ex)]); + } else if (j0 <= ex2 - 2) { + f_rhs[p] -= sfy * d12dy * + (-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)] + -F10 * fh[idx_fh_F(iF, jF , kF, ex)] + +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)] + -F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)] + + fh[idx_fh_F(iF, jF - 3, kF, ex)]); + } + } else if (sfy < ZEO) { + if ((j0 - 2) >= jminF) { + f_rhs[p] -= sfy * d12dy * + (-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)] + -F10 * fh[idx_fh_F(iF, jF , kF, ex)] + +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)] + -F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)] + + fh[idx_fh_F(iF, jF - 3, kF, ex)]); + } else if ((j0 - 1) >= jminF) { + f_rhs[p] += sfy * d12dy * + ( fh[idx_fh_F(iF, jF - 2, kF, ex)] + -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)] + +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)] + - fh[idx_fh_F(iF, jF + 2, kF, ex)]); + } else if (j0 >= jminF) { + f_rhs[p] += sfy * d12dy * + (-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)] + -F10 * fh[idx_fh_F(iF, jF , kF, ex)] + +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)] + -F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)] + + fh[idx_fh_F(iF, jF + 3, kF, ex)]); + } + } + + const double sfz = Sfz[p]; + if (sfz > ZEO) { + if (k0 <= ex3 - 4) { + f_rhs[p] += sfz * d12dz * + (-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)] + -F10 * fh[idx_fh_F(iF, jF, kF , ex)] + +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)] + -F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)] + + fh[idx_fh_F(iF, jF, kF + 3, ex)]); + } else if (k0 <= ex3 - 3) { + f_rhs[p] += sfz * d12dz * + ( fh[idx_fh_F(iF, jF, kF - 2, ex)] + -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)] + +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)] + - fh[idx_fh_F(iF, jF, kF + 2, ex)]); + } else if (k0 <= ex3 - 2) { + f_rhs[p] -= sfz * d12dz * + (-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)] + -F10 * fh[idx_fh_F(iF, jF, kF , ex)] + +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)] + -F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)] + + fh[idx_fh_F(iF, jF, kF - 3, ex)]); + } + } else if (sfz < ZEO) { + if ((k0 - 2) >= kminF) { + f_rhs[p] -= sfz * d12dz * + (-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)] + -F10 * fh[idx_fh_F(iF, jF, kF , ex)] + +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)] + -F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)] + + fh[idx_fh_F(iF, jF, kF - 3, ex)]); + } else if ((k0 - 1) >= kminF) { + f_rhs[p] += sfz * d12dz * + ( fh[idx_fh_F(iF, jF, kF - 2, ex)] + -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)] + +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)] + - fh[idx_fh_F(iF, jF, kF + 2, ex)]); + } else if (k0 >= kminF) { + f_rhs[p] += sfz * d12dz * + (-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)] + -F10 * fh[idx_fh_F(iF, jF, kF , ex)] + +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)] + -F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)] + + fh[idx_fh_F(iF, jF, kF + 3, ex)]); + } + } + } + } + } + + // KO dissipation (same domain restriction as kodiss_c.C) + if (eps > ZEO) { + const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0; + const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0; + const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0; + const int i0_hi = imaxF - 4; // inclusive + const int j0_hi = jmaxF - 4; + const int k0_hi = kmaxF - 4; + + if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) { + for (int k0 = k0_lo; k0 <= k0_hi; ++k0) { + const int kF = k0 + 1; + for (int j0 = j0_lo; j0 <= j0_hi; ++j0) { + const int jF = j0 + 1; + for (int i0 = i0_lo; i0 <= i0_hi; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); + + const double Dx_term = + ((fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) - + SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) + + FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) - + TWT * fh[idx_fh_F(iF, jF, kF, ex)]) / dX; + + const double Dy_term = + ((fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) - + SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) + + FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) - + TWT * fh[idx_fh_F(iF, jF, kF, ex)]) / dY; + + const double Dz_term = + ((fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) - + SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) + + FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) - + TWT * fh[idx_fh_F(iF, jF, kF, ex)]) / dZ; + + f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term); + } + } + } + } + } + + free(fh); +} diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index 7849f37..425effa 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -1,27 +1,35 @@ -include makefile.inc - -## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt) -## make -> opt (PGO-guided, maximum performance) -## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data) -PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata +include makefile.inc + +## polint(ordn=6) kernel selector: +## 1 (default): barycentric fast path +## 0 : fallback to Neville path +POLINT6_USE_BARY ?= 1 +POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY) + +## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt) +## make -> opt (PGO-guided, maximum performance) +## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data) +PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata ifeq ($(PGO_MODE),instrument) ## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability -CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \ - -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) -f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \ - -align array64byte -fpp -I${MKLROOT}/include -else -## opt (default): maximum performance with PGO profile data -CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=$(PROFDATA) \ - -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) -f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=$(PROFDATA) \ - -align array64byte -fpp -I${MKLROOT}/include -endif +CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \ + -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) +f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \ + -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) +else +## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \ +## PGO has been turned off, now tested and found to be negative optimization +## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization + + +CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ + -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) +f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ + -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) +endif .SUFFIXES: .o .f90 .C .for .cu @@ -50,11 +58,14 @@ fdderivs_c.o: fdderivs_c.C kodiss_c.o: kodiss_c.C ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ -lopsided_c.o: lopsided_c.C - ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ - -interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h - ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ +lopsided_c.o: lopsided_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +lopsided_kodis_c.o: lopsided_kodis_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata @@ -71,13 +82,21 @@ TwoPunctureABE.o: TwoPunctureABE.C # Input files ## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran) -ifeq ($(USE_CXX_KERNELS),0) -# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below -CFILES = -else -# C++ mode (default): C rewrite of bssn_rhs and helper kernels -CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o -endif +ifeq ($(USE_CXX_KERNELS),0) +# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below +CFILES = +else +# C++ mode (default): C rewrite of bssn_rhs and helper kernels +CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o +endif + +## RK4 kernel switch (independent from USE_CXX_KERNELS) +ifeq ($(USE_CXX_RK4),1) +CFILES += rungekutta4_rout_c.o +RK4_F90_OBJ = +else +RK4_F90_OBJ = rungekutta4_rout.o +endif C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\ @@ -94,12 +113,12 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o NullShellPatch2_Evo.o \ bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o -F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ - prolongrestrict_cell.o prolongrestrict_vertex.o\ - rungekutta4_rout.o diff_new.o kodiss.o kodiss_sh.o\ - lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\ - shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\ - getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\ +F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ + prolongrestrict_cell.o prolongrestrict_vertex.o\ + $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\ + lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\ + shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\ + getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\ fadmquantites_bssn.o Z4c_rhs.o Z4c_rhs_ss.o point_diff_new_sh.o\ cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\ getnpem2.o empart.o NullNews.o fourdcurvature.o\ diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index ad83ceb..331cff1 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -10,6 +10,20 @@ filein = -I/usr/include/ -I${MKLROOT}/include ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 +## Memory allocator switch +## 1 (default) : link Intel oneTBB allocator (libtbbmalloc) +## 0 : use system default allocator (ptmalloc) +USE_TBBMALLOC ?= 1 +TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so +ifneq ($(wildcard $(TBBMALLOC_SO)),) +TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed +else +TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed +endif +ifeq ($(USE_TBBMALLOC),1) +LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS) +endif + ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags) ## opt : (default) maximum performance with PGO profile-guided optimization ## instrument : PGO Phase 1 instrumentation to collect fresh profile data @@ -33,6 +47,12 @@ endif ## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster) ## 0 : fall back to original Fortran kernels USE_CXX_KERNELS ?= 1 + +## RK4 kernel implementation switch +## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments) +## 0 : use original Fortran rungekutta4_rout.o +USE_CXX_RK4 ?= 1 + f90 = ifx f77 = ifx CXX = icpx diff --git a/AMSS_NCKU_source/prolongrestrict_cell.f90 b/AMSS_NCKU_source/prolongrestrict_cell.f90 index 17fc43d..b62e722 100644 --- a/AMSS_NCKU_source/prolongrestrict_cell.f90 +++ b/AMSS_NCKU_source/prolongrestrict_cell.f90 @@ -1932,19 +1932,29 @@ real*8, dimension(1:3) :: base integer,dimension(3) :: lbc,ubc,lbf,ubf,lbp,ubp,lbpc,ubpc ! when if=1 -> ic=0, this is different to vertex center grid - real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3)) :: funcc - integer,dimension(3) :: cxI - integer :: i,j,k,ii,jj,kk - real*8, dimension(6,6) :: tmp2 - real*8, dimension(6) :: tmp1 - - real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3 - real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3 - - integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi - integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo - - real*8,dimension(3) :: CD,FD + real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3)) :: funcc + integer,dimension(3) :: cxI + integer :: i,j,k,ii,jj,kk,px,py,pz + real*8, dimension(6,6) :: tmp2 + real*8, dimension(6) :: tmp1 + integer, dimension(extf(1)) :: cix + integer, dimension(extf(2)) :: ciy + integer, dimension(extf(3)) :: ciz + integer, dimension(extf(1)) :: pix + integer, dimension(extf(2)) :: piy + integer, dimension(extf(3)) :: piz + + real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3 + real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3 + real*8, dimension(6,2), parameter :: WC = reshape((/& + C1,C2,C3,C4,C5,C6,& + C6,C5,C4,C3,C2,C1/), (/6,2/)) + + integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi + integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo + integer::maxcx,maxcy,maxcz + + real*8,dimension(3) :: CD,FD if(wei.ne.3)then write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension" @@ -2003,10 +2013,10 @@ kmini=lbpc(3)-lbc(3) + 1 kmaxi=ubpc(3)-lbc(3) + 1 - if(imino.lt.1.or.jmino.lt.1.or.kmino.lt.1.or.& - imini.lt.1.or.jmini.lt.1.or.kmini.lt.1.or.& - imaxo.gt.extf(1).or.jmaxo.gt.extf(2).or.kmaxo.gt.extf(3).or.& - imaxi.gt.extc(1)-2.or.jmaxi.gt.extc(2)-2.or.kmaxi.gt.extc(3)-2)then + if(imino.lt.1.or.jmino.lt.1.or.kmino.lt.1.or.& + imini.lt.1.or.jmini.lt.1.or.kmini.lt.1.or.& + imaxo.gt.extf(1).or.jmaxo.gt.extf(2).or.kmaxo.gt.extf(3).or.& + imaxi.gt.extc(1)-2.or.jmaxi.gt.extc(2)-2.or.kmaxi.gt.extc(3)-2)then write(*,*)"error in prolongation for" write(*,*)"from" write(*,*)llbc,uubc @@ -2017,33 +2027,61 @@ write(*,*)"want" write(*,*)llbp,uubp write(*,*)lbp,ubp,lbpc,ubpc - return - endif - - call symmetry_bd(3,extc,func,funcc,SoA) - -!~~~~~~> prolongation start... - do k = kmino,kmaxo - do j = jmino,jmaxo - do i = imino,imaxo - cxI(1) = i - cxI(2) = j - cxI(3) = k -! change to coarse level reference -!|---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*---| -!|=======x===============x===============x===============x=======| - cxI = (cxI+lbf-1)/2 -! change to array index - cxI = cxI - lbc + 1 - - if(any(cxI+3 > extc)) write(*,*)"error in prolong" - ii=i+lbf(1)-1 - jj=j+lbf(2)-1 - kk=k+lbf(3)-1 -#if 0 - if(ii/2*2==ii)then - if(jj/2*2==jj)then - if(kk/2*2==kk)then + return + endif + + do i = imino,imaxo + ii = i + lbf(1) - 1 + cix(i) = ii/2 - lbc(1) + 1 + if(ii/2*2 == ii)then + pix(i) = 1 + else + pix(i) = 2 + endif + enddo + do j = jmino,jmaxo + jj = j + lbf(2) - 1 + ciy(j) = jj/2 - lbc(2) + 1 + if(jj/2*2 == jj)then + piy(j) = 1 + else + piy(j) = 2 + endif + enddo + do k = kmino,kmaxo + kk = k + lbf(3) - 1 + ciz(k) = kk/2 - lbc(3) + 1 + if(kk/2*2 == kk)then + piz(k) = 1 + else + piz(k) = 2 + endif + enddo + + maxcx = maxval(cix(imino:imaxo)) + maxcy = maxval(ciy(jmino:jmaxo)) + maxcz = maxval(ciz(kmino:kmaxo)) + if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then + write(*,*)"error in prolong" + return + endif + + call symmetry_bd(3,extc,func,funcc,SoA) + +!~~~~~~> prolongation start... + do k = kmino,kmaxo + do j = jmino,jmaxo + do i = imino,imaxo + cxI(1) = cix(i) + cxI(2) = ciy(j) + cxI(3) = ciz(k) + px = pix(i) + py = piy(j) + pz = piz(k) +#if 0 + if(ii/2*2==ii)then + if(jj/2*2==jj)then + if(kk/2*2==kk)then tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+& C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+& C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+& @@ -2126,35 +2164,20 @@ endif endif endif -#else - if(kk/2*2==kk)then - tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+& - C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+& - C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+& - C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+& - C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+& - C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3) - else - tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+& - C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+& - C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+& - C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+& - C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+& - C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3) - endif - - if(jj/2*2==jj)then - tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6) - else - tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6) - endif - - if(ii/2*2==ii)then - funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6) - else - funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6) - endif -#endif +#else + tmp2= WC(1,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+& + WC(2,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+& + WC(3,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+& + WC(4,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+& + WC(5,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+& + WC(6,pz)*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3) + + tmp1= WC(1,py)*tmp2(:,1)+WC(2,py)*tmp2(:,2)+WC(3,py)*tmp2(:,3)+& + WC(4,py)*tmp2(:,4)+WC(5,py)*tmp2(:,5)+WC(6,py)*tmp2(:,6) + + funf(i,j,k)= WC(1,px)*tmp1(1)+WC(2,px)*tmp1(2)+WC(3,px)*tmp1(3)+& + WC(4,px)*tmp1(4)+WC(5,px)*tmp1(5)+WC(6,px)*tmp1(6) +#endif enddo enddo enddo diff --git a/AMSS_NCKU_source/rungekutta4_rout_c.C b/AMSS_NCKU_source/rungekutta4_rout_c.C new file mode 100644 index 0000000..fd39480 --- /dev/null +++ b/AMSS_NCKU_source/rungekutta4_rout_c.C @@ -0,0 +1,212 @@ +#include "rungekutta4_rout.h" +#include +#include +#include +#include +#include + +namespace { + +inline void rk4_stage0(std::size_t n, + const double *__restrict f0, + const double *__restrict frhs, + double *__restrict f1, + double c) { + std::size_t i = 0; +#if defined(__AVX512F__) + const __m512d vc = _mm512_set1_pd(c); + for (; i + 7 < n; i += 8) { + const __m512d v0 = _mm512_loadu_pd(f0 + i); + const __m512d vr = _mm512_loadu_pd(frhs + i); + _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, vr, v0)); + } +#elif defined(__AVX2__) + const __m256d vc = _mm256_set1_pd(c); + for (; i + 3 < n; i += 4) { + const __m256d v0 = _mm256_loadu_pd(f0 + i); + const __m256d vr = _mm256_loadu_pd(frhs + i); + _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, vr, v0)); + } +#endif +#pragma ivdep + for (; i < n; ++i) { + f1[i] = f0[i] + c * frhs[i]; + } +} + +inline void rk4_rhs_accum(std::size_t n, + const double *__restrict f1, + double *__restrict frhs) { + std::size_t i = 0; +#if defined(__AVX512F__) + const __m512d v2 = _mm512_set1_pd(2.0); + for (; i + 7 < n; i += 8) { + const __m512d v1 = _mm512_loadu_pd(f1 + i); + const __m512d vrhs = _mm512_loadu_pd(frhs + i); + _mm512_storeu_pd(frhs + i, _mm512_fmadd_pd(v2, v1, vrhs)); + } +#elif defined(__AVX2__) + const __m256d v2 = _mm256_set1_pd(2.0); + for (; i + 3 < n; i += 4) { + const __m256d v1 = _mm256_loadu_pd(f1 + i); + const __m256d vrhs = _mm256_loadu_pd(frhs + i); + _mm256_storeu_pd(frhs + i, _mm256_fmadd_pd(v2, v1, vrhs)); + } +#endif +#pragma ivdep + for (; i < n; ++i) { + frhs[i] = frhs[i] + 2.0 * f1[i]; + } +} + +inline void rk4_f1_from_f0_f1(std::size_t n, + const double *__restrict f0, + double *__restrict f1, + double c) { + std::size_t i = 0; +#if defined(__AVX512F__) + const __m512d vc = _mm512_set1_pd(c); + for (; i + 7 < n; i += 8) { + const __m512d v0 = _mm512_loadu_pd(f0 + i); + const __m512d v1 = _mm512_loadu_pd(f1 + i); + _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, v1, v0)); + } +#elif defined(__AVX2__) + const __m256d vc = _mm256_set1_pd(c); + for (; i + 3 < n; i += 4) { + const __m256d v0 = _mm256_loadu_pd(f0 + i); + const __m256d v1 = _mm256_loadu_pd(f1 + i); + _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, v1, v0)); + } +#endif +#pragma ivdep + for (; i < n; ++i) { + f1[i] = f0[i] + c * f1[i]; + } +} + +inline void rk4_stage3(std::size_t n, + const double *__restrict f0, + double *__restrict f1, + const double *__restrict frhs, + double c) { + std::size_t i = 0; +#if defined(__AVX512F__) + const __m512d vc = _mm512_set1_pd(c); + for (; i + 7 < n; i += 8) { + const __m512d v0 = _mm512_loadu_pd(f0 + i); + const __m512d v1 = _mm512_loadu_pd(f1 + i); + const __m512d vr = _mm512_loadu_pd(frhs + i); + _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, _mm512_add_pd(v1, vr), v0)); + } +#elif defined(__AVX2__) + const __m256d vc = _mm256_set1_pd(c); + for (; i + 3 < n; i += 4) { + const __m256d v0 = _mm256_loadu_pd(f0 + i); + const __m256d v1 = _mm256_loadu_pd(f1 + i); + const __m256d vr = _mm256_loadu_pd(frhs + i); + _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, _mm256_add_pd(v1, vr), v0)); + } +#endif +#pragma ivdep + for (; i < n; ++i) { + f1[i] = f0[i] + c * (f1[i] + frhs[i]); + } +} + +} // namespace + +extern "C" { + +void f_rungekutta4_scalar(double &dT, double &f0, double &f1, double &f_rhs, int &RK4) { + constexpr double F1o6 = 1.0 / 6.0; + constexpr double HLF = 0.5; + constexpr double TWO = 2.0; + + switch (RK4) { + case 0: + f1 = f0 + HLF * dT * f_rhs; + break; + case 1: + f_rhs = f_rhs + TWO * f1; + f1 = f0 + HLF * dT * f1; + break; + case 2: + f_rhs = f_rhs + TWO * f1; + f1 = f0 + dT * f1; + break; + case 3: + f1 = f0 + F1o6 * dT * (f1 + f_rhs); + break; + default: + std::fprintf(stderr, "rungekutta4_scalar_c: invalid RK4 stage %d\n", RK4); + std::abort(); + } +} + +void rungekutta4_cplxscalar_(double &dT, + std::complex &f0, + std::complex &f1, + std::complex &f_rhs, + int &RK4) { + constexpr double F1o6 = 1.0 / 6.0; + constexpr double HLF = 0.5; + constexpr double TWO = 2.0; + + switch (RK4) { + case 0: + f1 = f0 + HLF * dT * f_rhs; + break; + case 1: + f_rhs = f_rhs + TWO * f1; + f1 = f0 + HLF * dT * f1; + break; + case 2: + f_rhs = f_rhs + TWO * f1; + f1 = f0 + dT * f1; + break; + case 3: + f1 = f0 + F1o6 * dT * (f1 + f_rhs); + break; + default: + std::fprintf(stderr, "rungekutta4_cplxscalar_c: invalid RK4 stage %d\n", RK4); + std::abort(); + } +} + +int f_rungekutta4_rout(int *ex, double &dT, + double *f0, double *f1, double *f_rhs, + int &RK4) { + const std::size_t n = static_cast(ex[0]) * + static_cast(ex[1]) * + static_cast(ex[2]); + const double *const __restrict f0r = f0; + double *const __restrict f1r = f1; + double *const __restrict frhs = f_rhs; + + if (__builtin_expect(static_cast(RK4) > 3u, 0)) { + std::fprintf(stderr, "rungekutta4_rout_c: invalid RK4 stage %d\n", RK4); + std::abort(); + } + + switch (RK4) { + case 0: + rk4_stage0(n, f0r, frhs, f1r, 0.5 * dT); + break; + case 1: + rk4_rhs_accum(n, f1r, frhs); + rk4_f1_from_f0_f1(n, f0r, f1r, 0.5 * dT); + break; + case 2: + rk4_rhs_accum(n, f1r, frhs); + rk4_f1_from_f0_f1(n, f0r, f1r, dT); + break; + default: + rk4_stage3(n, f0r, f1r, frhs, (1.0 / 6.0) * dT); + break; + } + + return 0; +} + +} // extern "C" diff --git a/AMSS_NCKU_source/share_func.h b/AMSS_NCKU_source/share_func.h index 5051448..2b509b8 100644 --- a/AMSS_NCKU_source/share_func.h +++ b/AMSS_NCKU_source/share_func.h @@ -5,6 +5,7 @@ #include #include #include +#include /* 主网格:0-based -> 1D */ static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) { const int ex1 = ex[0], ex2 = ex[1]; @@ -87,60 +88,159 @@ static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc * funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3) * enddo */ +static inline void symmetry_bd_impl(int ord, + int shift, + const int extc[3], + const double *__restrict func, + double *__restrict funcc, + const double SoA[3]) +{ + const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2]; + const int nx = extc1 + ord; + const int ny = extc2 + ord; + + const size_t snx = (size_t)nx; + const size_t splane = (size_t)nx * (size_t)ny; + const size_t interior_i = (size_t)shift + 1u; /* iF = 1 */ + const size_t interior_j = ((size_t)shift + 1u) * snx; /* jF = 1 */ + const size_t interior_k = ((size_t)shift + 1u) * splane; /* kF = 1 */ + const size_t interior0 = interior_k + interior_j + interior_i; + + /* 1) funcc(1:extc1,1:extc2,1:extc3) = func */ + for (int k0 = 0; k0 < extc3; ++k0) { + const double *src_k = func + (size_t)k0 * (size_t)extc2 * (size_t)extc1; + const size_t dst_k0 = interior0 + (size_t)k0 * splane; + for (int j0 = 0; j0 < extc2; ++j0) { + const double *src = src_k + (size_t)j0 * (size_t)extc1; + double *dst = funcc + dst_k0 + (size_t)j0 * snx; + memcpy(dst, src, (size_t)extc1 * sizeof(double)); + } + } + + /* 2) funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1) */ + const double s1 = SoA[0]; + if (s1 == 1.0) { + for (int ii = 0; ii < ord; ++ii) { + const size_t dst_i = (size_t)(shift - ii); + const size_t src_i = (size_t)(shift + ii + 1); + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = interior_k + (size_t)k0 * splane + interior_j; + for (int j0 = 0; j0 < extc2; ++j0) { + const size_t off = kbase + (size_t)j0 * snx; + funcc[off + dst_i] = funcc[off + src_i]; + } + } + } + } else if (s1 == -1.0) { + for (int ii = 0; ii < ord; ++ii) { + const size_t dst_i = (size_t)(shift - ii); + const size_t src_i = (size_t)(shift + ii + 1); + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = interior_k + (size_t)k0 * splane + interior_j; + for (int j0 = 0; j0 < extc2; ++j0) { + const size_t off = kbase + (size_t)j0 * snx; + funcc[off + dst_i] = -funcc[off + src_i]; + } + } + } + } else { + for (int ii = 0; ii < ord; ++ii) { + const size_t dst_i = (size_t)(shift - ii); + const size_t src_i = (size_t)(shift + ii + 1); + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = interior_k + (size_t)k0 * splane + interior_j; + for (int j0 = 0; j0 < extc2; ++j0) { + const size_t off = kbase + (size_t)j0 * snx; + funcc[off + dst_i] = funcc[off + src_i] * s1; + } + } + } + } + + /* 3) funcc(:,-j,1:extc3) = funcc(:,j+1,1:extc3)*SoA(2) */ + const double s2 = SoA[1]; + if (s2 == 1.0) { + for (int jj = 0; jj < ord; ++jj) { + const size_t dst_j = (size_t)(shift - jj) * snx; + const size_t src_j = (size_t)(shift + jj + 1) * snx; + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = interior_k + (size_t)k0 * splane; + double *dst = funcc + kbase + dst_j; + const double *src = funcc + kbase + src_j; + for (int i = 0; i < nx; ++i) dst[i] = src[i]; + } + } + } else if (s2 == -1.0) { + for (int jj = 0; jj < ord; ++jj) { + const size_t dst_j = (size_t)(shift - jj) * snx; + const size_t src_j = (size_t)(shift + jj + 1) * snx; + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = interior_k + (size_t)k0 * splane; + double *dst = funcc + kbase + dst_j; + const double *src = funcc + kbase + src_j; + for (int i = 0; i < nx; ++i) dst[i] = -src[i]; + } + } + } else { + for (int jj = 0; jj < ord; ++jj) { + const size_t dst_j = (size_t)(shift - jj) * snx; + const size_t src_j = (size_t)(shift + jj + 1) * snx; + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = interior_k + (size_t)k0 * splane; + double *dst = funcc + kbase + dst_j; + const double *src = funcc + kbase + src_j; + for (int i = 0; i < nx; ++i) dst[i] = src[i] * s2; + } + } + } + + /* 4) funcc(:,:,-k) = funcc(:,:,k+1)*SoA(3) */ + const double s3 = SoA[2]; + if (s3 == 1.0) { + for (int kk = 0; kk < ord; ++kk) { + const size_t dst_k = (size_t)(shift - kk) * splane; + const size_t src_k = (size_t)(shift + kk + 1) * splane; + double *dst = funcc + dst_k; + const double *src = funcc + src_k; + for (size_t p = 0; p < splane; ++p) dst[p] = src[p]; + } + } else if (s3 == -1.0) { + for (int kk = 0; kk < ord; ++kk) { + const size_t dst_k = (size_t)(shift - kk) * splane; + const size_t src_k = (size_t)(shift + kk + 1) * splane; + double *dst = funcc + dst_k; + const double *src = funcc + src_k; + for (size_t p = 0; p < splane; ++p) dst[p] = -src[p]; + } + } else { + for (int kk = 0; kk < ord; ++kk) { + const size_t dst_k = (size_t)(shift - kk) * splane; + const size_t src_k = (size_t)(shift + kk + 1) * splane; + double *dst = funcc + dst_k; + const double *src = funcc + src_k; + for (size_t p = 0; p < splane; ++p) dst[p] = src[p] * s3; + } + } +} + static inline void symmetry_bd(int ord, const int extc[3], const double *func, double *funcc, const double SoA[3]) { - const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2]; + if (ord <= 0) return; - // 1) funcc(1:extc1,1:extc2,1:extc3) = func - // Fortran 的 (iF=1..extc1) 对应 C 的 func(i0=0..extc1-1) - for (int k0 = 0; k0 < extc3; ++k0) { - for (int j0 = 0; j0 < extc2; ++j0) { - for (int i0 = 0; i0 < extc1; ++i0) { - const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; - funcc[idx_funcc_F(iF, jF, kF, ord, extc)] = func[idx_func0(i0, j0, k0, extc)]; - } - } + /* Fast paths used by current C kernels: ord=2 (derivs), ord=3 (lopsided/KO). */ + if (ord == 2) { + symmetry_bd_impl(2, 1, extc, func, funcc, SoA); + return; + } + if (ord == 3) { + symmetry_bd_impl(3, 2, extc, func, funcc, SoA); + return; } - // 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1) - for (int ii = 0; ii <= ord - 1; ++ii) { - const int iF_dst = -ii; // 0, -1, -2, ... - const int iF_src = ii + 1; // 1, 2, 3, ... - for (int kF = 1; kF <= extc3; ++kF) { - for (int jF = 1; jF <= extc2; ++jF) { - funcc[idx_funcc_F(iF_dst, jF, kF, ord, extc)] = - funcc[idx_funcc_F(iF_src, jF, kF, ord, extc)] * SoA[0]; - } - } - } - - // 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2) - // 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖 - for (int jj = 0; jj <= ord - 1; ++jj) { - const int jF_dst = -jj; - const int jF_src = jj + 1; - for (int kF = 1; kF <= extc3; ++kF) { - for (int iF = -ord + 1; iF <= extc1; ++iF) { - funcc[idx_funcc_F(iF, jF_dst, kF, ord, extc)] = - funcc[idx_funcc_F(iF, jF_src, kF, ord, extc)] * SoA[1]; - } - } - } - - // 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3) - for (int kk = 0; kk <= ord - 1; ++kk) { - const int kF_dst = -kk; - const int kF_src = kk + 1; - for (int jF = -ord + 1; jF <= extc2; ++jF) { - for (int iF = -ord + 1; iF <= extc1; ++iF) { - funcc[idx_funcc_F(iF, jF, kF_dst, ord, extc)] = - funcc[idx_funcc_F(iF, jF, kF_src, ord, extc)] * SoA[2]; - } - } - } + symmetry_bd_impl(ord, ord - 1, extc, func, funcc, SoA); } #endif diff --git a/AMSS_NCKU_source/tool.h b/AMSS_NCKU_source/tool.h index 154b5ec..2c75748 100644 --- a/AMSS_NCKU_source/tool.h +++ b/AMSS_NCKU_source/tool.h @@ -24,4 +24,10 @@ void lopsided(const int ex[3], const double *X, const double *Y, const double *Z, const double *f, double *f_rhs, const double *Sfx, const double *Sfy, const double *Sfz, - int Symmetry, const double SoA[3]); \ No newline at end of file + int Symmetry, const double SoA[3]); + +void lopsided_kodis(const int ex[3], + const double *X, const double *Y, const double *Z, + const double *f, double *f_rhs, + const double *Sfx, const double *Sfy, const double *Sfz, + int Symmetry, const double SoA[3], double eps); diff --git a/makefile_and_run.py b/makefile_and_run.py index 1a0b937..5682476 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -43,7 +43,8 @@ def get_last_n_cores_per_socket(n=32): cpu_str = ",".join(segments) total = len(segments) * n print(f" CPU binding: taskset -c {cpu_str} ({total} cores, last {n} per socket)") - return f"taskset -c {cpu_str}" + #return f"taskset -c {cpu_str}" + return f"" ## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total) @@ -69,7 +70,7 @@ def makefile_ABE(): ## Build command with CPU binding to nohz_full cores if (input_data.GPU_Calculation == "no"): - makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=optimize ABE" + makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off ABE" elif (input_data.GPU_Calculation == "yes"): makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU" else: diff --git a/pgo_profile/PGO_Profile_Analysis.md b/pgo_profile/PGO_Profile_Analysis.md deleted file mode 100644 index bff40c0..0000000 --- a/pgo_profile/PGO_Profile_Analysis.md +++ /dev/null @@ -1,97 +0,0 @@ -# AMSS-NCKU PGO Profile Analysis Report - -## 1. Profiling Environment - -| Item | Value | -|------|-------| -| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) | -| Instrumentation Flag | `-fprofile-instr-generate` | -| Optimization Level (instrumented) | `-O2 -xHost -fma` | -| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) | -| Profile File | `default_9725750769337483397_0.profraw` (327 KB) | -| Merged Profile | `default.profdata` (394 KB) | -| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` | - -## 2. Reduced Simulation Parameters (for profiling run) - -| Parameter | Production Value | Profiling Value | -|-----------|-----------------|-----------------| -| MPI_processes | 64 | 1 | -| grid_level | 9 | 4 | -| static_grid_level | 5 | 3 | -| static_grid_number | 96 | 24 | -| moving_grid_number | 48 | 16 | -| largest_box_xyz_max | 320^3 | 160^3 | -| Final_Evolution_Time | 1000.0 | 10.0 | -| Evolution_Step_Number | 10,000,000 | 1,000 | -| Detector_Number | 12 | 2 | - -## 3. Profile Summary - -| Metric | Value | -|--------|-------| -| Total instrumented functions | 1,392 | -| Functions with non-zero counts | 117 (8.4%) | -| Functions with zero counts | 1,275 (91.6%) | -| Maximum function entry count | 386,459,248 | -| Maximum internal block count | 370,477,680 | -| Total block count | 4,198,023,118 | - -## 4. Top 20 Hotspot Functions - -| Rank | Total Count | Max Block Count | Function | Category | -|------|------------|-----------------|----------|----------| -| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation | -| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation | -| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution | -| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary | -| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil | -| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision | -| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator | -| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation | -| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences | -| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility | -| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives | -| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction | -| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation | -| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy | -| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation | -| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights | -| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation | -| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary | -| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) | -| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation | - -## 5. Hotspot Category Breakdown - -Top 20 functions account for ~98% of total execution counts: - -| Category | Functions | Combined Count | Share | -|----------|-----------|---------------|-------| -| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% | -| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% | -| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% | -| Time integration | rungekutta4_rout_ | ~119M | ~3% | -| Dissipation | kodis_ | ~92M | ~2% | -| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% | - -## 6. Conclusions - -1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts. -2. **Hotspot concentration is high**: Top 5 functions alone account for ~76% of all counts, which is ideal for PGO — the compiler has strong branch/layout optimization targets. -3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout. -4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization. -5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs. - -## 7. PGO Phase 2 Usage - -To apply the profile, use the following flags in `makefile.inc`: - -```makefile -CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \ - -Dfortran3 -Dnewc -I${MKLROOT}/include -f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \ - -align array64byte -fpp -I${MKLROOT}/include -``` diff --git a/pgo_profile/default.profdata b/pgo_profile/default.profdata index e875ff0..53ed651 100644 Binary files a/pgo_profile/default.profdata and b/pgo_profile/default.profdata differ diff --git a/pgo_profile/default.profdata.backup b/pgo_profile/default.profdata.backup deleted file mode 100644 index dfac738..0000000 Binary files a/pgo_profile/default.profdata.backup and /dev/null differ diff --git a/pgo_profile/default.profdata.backup2 b/pgo_profile/default.profdata.backup2 deleted file mode 100644 index c09d078..0000000 Binary files a/pgo_profile/default.profdata.backup2 and /dev/null differ diff --git a/pgo_profile/default.profdatabackup3 b/pgo_profile/default.profdatabackup3 deleted file mode 100644 index 156744d..0000000 Binary files a/pgo_profile/default.profdatabackup3 and /dev/null differ diff --git a/pgo_profile/default_9725750769337483397_0.profraw b/pgo_profile/default_9725750769337483397_0.profraw deleted file mode 100644 index c9c2485..0000000 Binary files a/pgo_profile/default_9725750769337483397_0.profraw and /dev/null differ diff --git a/pgo_profile/default_9725923726611433605_0.profraw b/pgo_profile/default_9725923726611433605_0.profraw deleted file mode 100644 index e38d300..0000000 Binary files a/pgo_profile/default_9725923726611433605_0.profraw and /dev/null differ diff --git a/pgo_profile/default_9726420327935033477_0.profraw b/pgo_profile/default_9726420327935033477_0.profraw deleted file mode 100644 index e46d05a..0000000 Binary files a/pgo_profile/default_9726420327935033477_0.profraw and /dev/null differ diff --git a/pgo_profile/default.profdata-f b/pgo_profile/default_9726853898452064389_0.profdata similarity index 67% rename from pgo_profile/default.profdata-f rename to pgo_profile/default_9726853898452064389_0.profdata index c09d078..53ed651 100644 Binary files a/pgo_profile/default.profdata-f and b/pgo_profile/default_9726853898452064389_0.profdata differ diff --git a/pgo_profile/default_15874826282416242821_0_58277.profraw b/pgo_profile/default_9726853898452064389_0.profraw similarity index 54% rename from pgo_profile/default_15874826282416242821_0_58277.profraw rename to pgo_profile/default_9726853898452064389_0.profraw index 9aa82f8..3d91484 100644 Binary files a/pgo_profile/default_15874826282416242821_0_58277.profraw and b/pgo_profile/default_9726853898452064389_0.profraw differ