diff --git a/AMSS_NCKU_source/fdderivs_sh_c.C b/AMSS_NCKU_source/fdderivs_sh_c.C new file mode 100644 index 0000000..d061231 --- /dev/null +++ b/AMSS_NCKU_source/fdderivs_sh_c.C @@ -0,0 +1,321 @@ +#include "macrodef.h" +#include "share_func.h" + +/* + * fdderivs_sh — second derivatives on shell patch in (rho, sigma, R) coords. + * Same stencil coefficients as Cartesian fdderivs. Uses symmetry_stbd. + */ +void fdderivs_sh(const int ex[3], + const double *f, + double *fxx, double *fxy, double *fxz, + double *fyy, double *fyz, double *fzz, + const double *X, const double *Y, const double *Z, + double SYM1, double SYM2, double SYM3, + int Symmetry, int onoff, int sst) +{ + (void)SYM3; (void)onoff; (void)sst; + + const int NO_SYMM=0, EQ_SYMM=1, OCTANT=2; + const double ZEO=0.0, ONE=1.0, TWO=2.0, F1o4=2.5e-1; + const double F8=8.0, F16=16.0, F30=30.0, F1o12=ONE/12.0, F1o144=ONE/144.0; + const double F9=9.0, F45=45.0, F60=60.0, F27=27.0, F270=270.0, F490=490.0; + const double F1o180=ONE/180.0, F1o3600=ONE/3600.0; + const double F32=32.0, F128=128.0, F168=168.0, F672=672.0, F840=840.0; + const double F1008=1008.0, F8064=8064.0, F14350=14350.0; + const double F1o5040=ONE/5040.0, F1o705600=ONE/705600.0; + + const int ex1=ex[0], ex2=ex[1], ex3=ex[2]; + const double dX=X[1]-X[0], dY=Y[1]-Y[0], dZ=Z[1]-Z[0]; + const int imaxF=ex1, jmaxF=ex2, kmaxF=ex3; + const double SoA[2]={SYM1,SYM2}; + +#if (ghost_width == 2) + { + const int ord=1; + int iminF=1,jminF=1,kminF=1; + if(Symmetry==OCTANT&&fabs(X[0])cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;} + double *fh=fh_buf;if(!fh)return; + symmetry_stbd(ord,ex,f,fh,SoA); + + const double Sdxdx=ONE/(dX*dX),Sdydy=ONE/(dY*dY),Sdzdz=ONE/(dZ*dZ); + const double Sdxdy=F1o4/(dX*dY),Sdxdz=F1o4/(dX*dZ),Sdydz=F1o4/(dY*dZ); + const size_t all=(size_t)ex1*ex2*ex3; + for(size_t p=0;p0)?iminF:0,j2_lo=(jminF>0)?jminF:0,k2_lo=0,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2; + #define FH(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)] + if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){ + for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1; + for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1; + for(int i0=i2_lo;i0<=i2_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF)); + fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF)); + fzz[p]=Sdzdz*(FH(iF,jF,kF-1)-TWO*FH(iF,jF,kF)+FH(iF,jF,kF+1)); + fxy[p]=Sdxdy*(FH(iF-1,jF-1,kF)-FH(iF+1,jF-1,kF)-FH(iF-1,jF+1,kF)+FH(iF+1,jF+1,kF)); + fxz[p]=Sdxdz*(FH(iF-1,jF,kF-1)-FH(iF+1,jF,kF-1)-FH(iF-1,jF,kF+1)+FH(iF+1,jF,kF+1)); + fyz[p]=Sdydz*(FH(iF,jF-1,kF-1)-FH(iF,jF+1,kF-1)-FH(iF,jF-1,kF+1)+FH(iF,jF+1,kF+1)); + }}} + } + #undef FH + return; + } +#elif (ghost_width == 3) + { + const int ord=2; + int iminF=1,jminF=1,kminF=1; + if(Symmetry==OCTANT&&fabs(X[0])cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;} + double *fh=fh_buf;if(!fh)return; + symmetry_stbd(ord,ex,f,fh,SoA); + + const double Sdxdx=ONE/(dX*dX),Sdydy=ONE/(dY*dY),Sdzdz=ONE/(dZ*dZ); + const double Fdxdx=F1o12/(dX*dX),Fdydy=F1o12/(dY*dY),Fdzdz=F1o12/(dZ*dZ); + const double Sdxdy=F1o4/(dX*dY),Sdxdz=F1o4/(dX*dZ),Sdydz=F1o4/(dY*dZ); + const double Fdxdy=F1o144/(dX*dY),Fdxdz=F1o144/(dX*dZ),Fdydz=F1o144/(dY*dZ); + const size_t all=(size_t)ex1*ex2*ex3; + for(size_t p=0;p0)?iminF:0,j2_lo=(jminF>0)?jminF:0,k2_lo=0,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2; + const int i4_lo=(iminF+1>0)?iminF+1:0,j4_lo=(jminF+1>0)?jminF+1:0,k4_lo=0,i4_hi=ex1-3,j4_hi=ex2-3,k4_hi=ex3-3; + const int has4=(i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi); + #define FH(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)] + + if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){ + for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1; + for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1; + for(int i0=i2_lo;i0<=i2_hi;++i0){ + if(has4&&i0>=i4_lo&&i0<=i4_hi&&j0>=j4_lo&&j0<=j4_hi&&k0>=k4_lo&&k0<=k4_hi)continue; + const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF)); + fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF)); + fzz[p]=Sdzdz*(FH(iF,jF,kF-1)-TWO*FH(iF,jF,kF)+FH(iF,jF,kF+1)); + fxy[p]=Sdxdy*(FH(iF-1,jF-1,kF)-FH(iF+1,jF-1,kF)-FH(iF-1,jF+1,kF)+FH(iF+1,jF+1,kF)); + fxz[p]=Sdxdz*(FH(iF-1,jF,kF-1)-FH(iF+1,jF,kF-1)-FH(iF-1,jF,kF+1)+FH(iF+1,jF,kF+1)); + fyz[p]=Sdydz*(FH(iF,jF-1,kF-1)-FH(iF,jF+1,kF-1)-FH(iF,jF-1,kF+1)+FH(iF,jF+1,kF+1)); + }}} + } + if(has4){ + for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1; + for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1; + for(int i0=i4_lo;i0<=i4_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fxx[p]=Fdxdx*(-FH(iF-2,jF,kF)+F16*FH(iF-1,jF,kF)-F30*FH(iF,jF,kF)-FH(iF+2,jF,kF)+F16*FH(iF+1,jF,kF)); + fyy[p]=Fdydy*(-FH(iF,jF-2,kF)+F16*FH(iF,jF-1,kF)-F30*FH(iF,jF,kF)-FH(iF,jF+2,kF)+F16*FH(iF,jF+1,kF)); + fzz[p]=Fdzdz*(-FH(iF,jF,kF-2)+F16*FH(iF,jF,kF-1)-F30*FH(iF,jF,kF)-FH(iF,jF,kF+2)+F16*FH(iF,jF,kF+1)); + {const double t_jm2=(FH(iF-2,jF-2,kF)-F8*FH(iF-1,jF-2,kF)+F8*FH(iF+1,jF-2,kF)-FH(iF+2,jF-2,kF)); + const double t_jm1=(FH(iF-2,jF-1,kF)-F8*FH(iF-1,jF-1,kF)+F8*FH(iF+1,jF-1,kF)-FH(iF+2,jF-1,kF)); + const double t_jp1=(FH(iF-2,jF+1,kF)-F8*FH(iF-1,jF+1,kF)+F8*FH(iF+1,jF+1,kF)-FH(iF+2,jF+1,kF)); + const double t_jp2=(FH(iF-2,jF+2,kF)-F8*FH(iF-1,jF+2,kF)+F8*FH(iF+1,jF+2,kF)-FH(iF+2,jF+2,kF)); + fxy[p]=Fdxdy*(t_jm2-F8*t_jm1+F8*t_jp1-t_jp2);} + {const double t_km2=(FH(iF-2,jF,kF-2)-F8*FH(iF-1,jF,kF-2)+F8*FH(iF+1,jF,kF-2)-FH(iF+2,jF,kF-2)); + const double t_km1=(FH(iF-2,jF,kF-1)-F8*FH(iF-1,jF,kF-1)+F8*FH(iF+1,jF,kF-1)-FH(iF+2,jF,kF-1)); + const double t_kp1=(FH(iF-2,jF,kF+1)-F8*FH(iF-1,jF,kF+1)+F8*FH(iF+1,jF,kF+1)-FH(iF+2,jF,kF+1)); + const double t_kp2=(FH(iF-2,jF,kF+2)-F8*FH(iF-1,jF,kF+2)+F8*FH(iF+1,jF,kF+2)-FH(iF+2,jF,kF+2)); + fxz[p]=Fdxdz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);} + {const double t_km2=(FH(iF,jF-2,kF-2)-F8*FH(iF,jF-1,kF-2)+F8*FH(iF,jF+1,kF-2)-FH(iF,jF+2,kF-2)); + const double t_km1=(FH(iF,jF-2,kF-1)-F8*FH(iF,jF-1,kF-1)+F8*FH(iF,jF+1,kF-1)-FH(iF,jF+2,kF-1)); + const double t_kp1=(FH(iF,jF-2,kF+1)-F8*FH(iF,jF-1,kF+1)+F8*FH(iF,jF+1,kF+1)-FH(iF,jF+2,kF+1)); + const double t_kp2=(FH(iF,jF-2,kF+2)-F8*FH(iF,jF-1,kF+2)+F8*FH(iF,jF+1,kF+2)-FH(iF,jF+2,kF+2)); + fyz[p]=Fdydz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);} + }}} + } + #undef FH + return; + } +#elif (ghost_width == 4) + { + const int ord=3; + int iminF=1,jminF=1,kminF=1; + if(Symmetry==OCTANT&&fabs(X[0])cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;} + double *fh=fh_buf;if(!fh)return; + symmetry_stbd(ord,ex,f,fh,SoA); + + const double Sdxdx=ONE/(dX*dX),Sdydy=ONE/(dY*dY),Sdzdz=ONE/(dZ*dZ); + const double Fdxdx=F1o12/(dX*dX),Fdydy=F1o12/(dY*dY),Fdzdz=F1o12/(dZ*dZ); + const double Xdxdx=F1o180/(dX*dX),Xdydy=F1o180/(dY*dY),Xdzdz=F1o180/(dZ*dZ); + const double Sdxdy=F1o4/(dX*dY),Sdxdz=F1o4/(dX*dZ),Sdydz=F1o4/(dY*dZ); + const double Fdxdy=F1o144/(dX*dY),Fdxdz=F1o144/(dX*dZ),Fdydz=F1o144/(dY*dZ); + const double Xdxdy=F1o3600/(dX*dY),Xdxdz=F1o3600/(dX*dZ),Xdydz=F1o3600/(dY*dZ); + const size_t all=(size_t)ex1*ex2*ex3; + for(size_t p=0;p0)?iminF+1:0,j2_lo=(jminF+1>0)?jminF+1:0,k2_lo=0,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2; + const int i4_lo=(iminF+2>0)?iminF+2:0,j4_lo=(jminF+2>0)?jminF+2:0,k4_lo=0,i4_hi=ex1-3,j4_hi=ex2-3,k4_hi=ex3-3; + const int i6_lo=(iminF+3>0)?iminF+3:0,j6_lo=(jminF+3>0)?jminF+3:0,k6_lo=0,i6_hi=ex1-4,j6_hi=ex2-4,k6_hi=ex3-4; + const int has4=(i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi),has6=(i6_lo<=i6_hi&&j6_lo<=j6_hi&&k6_lo<=k6_hi); + #define FH(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)] + + if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1; + for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1; + for(int i0=i2_lo;i0<=i2_hi;++i0){_Bool in4=has4&&i0>=i4_lo&&i0<=i4_hi&&j0>=j4_lo&&j0<=j4_hi&&k0>=k4_lo&&k0<=k4_hi;if(in4)continue; + const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF)); + fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF)); + fzz[p]=Sdzdz*(FH(iF,jF,kF-1)-TWO*FH(iF,jF,kF)+FH(iF,jF,kF+1)); + fxy[p]=Sdxdy*(FH(iF-1,jF-1,kF)-FH(iF+1,jF-1,kF)-FH(iF-1,jF+1,kF)+FH(iF+1,jF+1,kF)); + fxz[p]=Sdxdz*(FH(iF-1,jF,kF-1)-FH(iF+1,jF,kF-1)-FH(iF-1,jF,kF+1)+FH(iF+1,jF,kF+1)); + fyz[p]=Sdydz*(FH(iF,jF-1,kF-1)-FH(iF,jF+1,kF-1)-FH(iF,jF-1,kF+1)+FH(iF,jF+1,kF+1)); + }}}} + if(has4){for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1; + for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1; + for(int i0=i4_lo;i0<=i4_hi;++i0){if(has6&&i0>=i6_lo&&i0<=i6_hi&&j0>=j6_lo&&j0<=j6_hi&&k0>=k6_lo&&k0<=k6_hi)continue; + const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fxx[p]=Fdxdx*(-FH(iF-2,jF,kF)+F16*FH(iF-1,jF,kF)-F30*FH(iF,jF,kF)-FH(iF+2,jF,kF)+F16*FH(iF+1,jF,kF)); + fyy[p]=Fdydy*(-FH(iF,jF-2,kF)+F16*FH(iF,jF-1,kF)-F30*FH(iF,jF,kF)-FH(iF,jF+2,kF)+F16*FH(iF,jF+1,kF)); + fzz[p]=Fdzdz*(-FH(iF,jF,kF-2)+F16*FH(iF,jF,kF-1)-F30*FH(iF,jF,kF)-FH(iF,jF,kF+2)+F16*FH(iF,jF,kF+1)); + {const double t_jm2=(FH(iF-2,jF-2,kF)-F8*FH(iF-1,jF-2,kF)+F8*FH(iF+1,jF-2,kF)-FH(iF+2,jF-2,kF)); + const double t_jm1=(FH(iF-2,jF-1,kF)-F8*FH(iF-1,jF-1,kF)+F8*FH(iF+1,jF-1,kF)-FH(iF+2,jF-1,kF)); + const double t_jp1=(FH(iF-2,jF+1,kF)-F8*FH(iF-1,jF+1,kF)+F8*FH(iF+1,jF+1,kF)-FH(iF+2,jF+1,kF)); + const double t_jp2=(FH(iF-2,jF+2,kF)-F8*FH(iF-1,jF+2,kF)+F8*FH(iF+1,jF+2,kF)-FH(iF+2,jF+2,kF)); + fxy[p]=Fdxdy*(t_jm2-F8*t_jm1+F8*t_jp1-t_jp2);} + {const double t_km2=(FH(iF-2,jF,kF-2)-F8*FH(iF-1,jF,kF-2)+F8*FH(iF+1,jF,kF-2)-FH(iF+2,jF,kF-2)); + const double t_km1=(FH(iF-2,jF,kF-1)-F8*FH(iF-1,jF,kF-1)+F8*FH(iF+1,jF,kF-1)-FH(iF+2,jF,kF-1)); + const double t_kp1=(FH(iF-2,jF,kF+1)-F8*FH(iF-1,jF,kF+1)+F8*FH(iF+1,jF,kF+1)-FH(iF+2,jF,kF+1)); + const double t_kp2=(FH(iF-2,jF,kF+2)-F8*FH(iF-1,jF,kF+2)+F8*FH(iF+1,jF,kF+2)-FH(iF+2,jF,kF+2)); + fxz[p]=Fdxdz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);} + {const double t_km2=(FH(iF,jF-2,kF-2)-F8*FH(iF,jF-1,kF-2)+F8*FH(iF,jF+1,kF-2)-FH(iF,jF+2,kF-2)); + const double t_km1=(FH(iF,jF-2,kF-1)-F8*FH(iF,jF-1,kF-1)+F8*FH(iF,jF+1,kF-1)-FH(iF,jF+2,kF-1)); + const double t_kp1=(FH(iF,jF-2,kF+1)-F8*FH(iF,jF-1,kF+1)+F8*FH(iF,jF+1,kF+1)-FH(iF,jF+2,kF+1)); + const double t_kp2=(FH(iF,jF-2,kF+2)-F8*FH(iF,jF-1,kF+2)+F8*FH(iF,jF+1,kF+2)-FH(iF,jF+2,kF+2)); + fyz[p]=Fdydz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);} + }}}} + if(has6){for(int k0=k6_lo;k0<=k6_hi;++k0){const int kF=k0+1; + for(int j0=j6_lo;j0<=j6_hi;++j0){const int jF=j0+1; + for(int i0=i6_lo;i0<=i6_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fxx[p]=Xdxdx*(TWO*FH(iF-3,jF,kF)-F27*FH(iF-2,jF,kF)+F270*FH(iF-1,jF,kF)-F490*FH(iF,jF,kF)+F270*FH(iF+1,jF,kF)-F27*FH(iF+2,jF,kF)+TWO*FH(iF+3,jF,kF)); + fyy[p]=Xdydy*(TWO*FH(iF,jF-3,kF)-F27*FH(iF,jF-2,kF)+F270*FH(iF,jF-1,kF)-F490*FH(iF,jF,kF)+F270*FH(iF,jF+1,kF)-F27*FH(iF,jF+2,kF)+TWO*FH(iF,jF+3,kF)); + fzz[p]=Xdzdz*(TWO*FH(iF,jF,kF-3)-F27*FH(iF,jF,kF-2)+F270*FH(iF,jF,kF-1)-F490*FH(iF,jF,kF)+F270*FH(iF,jF,kF+1)-F27*FH(iF,jF,kF+2)+TWO*FH(iF,jF,kF+3)); + #define XS6(JF,KFDUMMY) (-FH(iF-3,JF,KFDUMMY)+F9*FH(iF-2,JF,KFDUMMY)-F45*FH(iF-1,JF,KFDUMMY)+F45*FH(iF+1,JF,KFDUMMY)-F9*FH(iF+2,JF,KFDUMMY)+FH(iF+3,JF,KFDUMMY)) + fxy[p]=Xdxdy*(-XS6(jF-3,kF)+F9*XS6(jF-2,kF)-F45*XS6(jF-1,kF)+F45*XS6(jF+1,kF)-F9*XS6(jF+2,kF)+XS6(jF+3,kF)); + fxz[p]=Xdxdz*(-XS6(jF,kF-3)+F9*XS6(jF,kF-2)-F45*XS6(jF,kF-1)+F45*XS6(jF,kF+1)-F9*XS6(jF,kF+2)+XS6(jF,kF+3)); + #undef XS6 + #define YS6(JF,KFDUMMY) (-FH(iF,JF-3,KFDUMMY)+F9*FH(iF,JF-2,KFDUMMY)-F45*FH(iF,JF-1,KFDUMMY)+F45*FH(iF,JF+1,KFDUMMY)-F9*FH(iF,JF+2,KFDUMMY)+FH(iF,JF+3,KFDUMMY)) + fyz[p]=Xdydz*(-YS6(jF,kF-3)+F9*YS6(jF,kF-2)-F45*YS6(jF,kF-1)+F45*YS6(jF,kF+1)-F9*YS6(jF,kF+2)+YS6(jF,kF+3)); + #undef YS6 + }}}} + #undef FH + return; + } +#elif (ghost_width == 5) + { + /* 8th-order shell second derivatives — inherits 8th-order stencil coeffs from Cartesian */ + const int ord=4; + int iminF=1,jminF=1,kminF=1; + if(Symmetry==OCTANT&&fabs(X[0])cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;} + double *fh=fh_buf;if(!fh)return; + symmetry_stbd(ord,ex,f,fh,SoA); + + const double Sdxdx=ONE/(dX*dX),Sdydy=ONE/(dY*dY),Sdzdz=ONE/(dZ*dZ); + const double Fdxdx=F1o12/(dX*dX),Fdydy=F1o12/(dY*dY),Fdzdz=F1o12/(dZ*dZ); + const double Xdxdx=F1o180/(dX*dX),Xdydy=F1o180/(dY*dY),Xdzdz=F1o180/(dZ*dZ); + const double Edxdx=F1o5040/(dX*dX),Edydy=F1o5040/(dY*dY),Edzdz=F1o5040/(dZ*dZ); + const double Sdxdy=F1o4/(dX*dY),Sdxdz=F1o4/(dX*dZ),Sdydz=F1o4/(dY*dZ); + const double Fdxdy=F1o144/(dX*dY),Fdxdz=F1o144/(dX*dZ),Fdydz=F1o144/(dY*dZ); + const double Xdxdy=F1o3600/(dX*dY),Xdxdz=F1o3600/(dX*dZ),Xdydz=F1o3600/(dY*dZ); + const double Edxdy=F1o705600/(dX*dY),Edxdz=F1o705600/(dX*dZ),Edydz=F1o705600/(dY*dZ); + const size_t all=(size_t)ex1*ex2*ex3; + for(size_t p=0;p0)?iminF+2:0,j2_lo=(jminF+2>0)?jminF+2:0,k2_lo=0,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2; + const int i4_lo=(iminF+3>0)?iminF+3:0,j4_lo=(jminF+3>0)?jminF+3:0,k4_lo=0,i4_hi=ex1-3,j4_hi=ex2-3,k4_hi=ex3-3; + const int i6_lo=(iminF+4>0)?iminF+4:0,j6_lo=(jminF+4>0)?jminF+4:0,k6_lo=0,i6_hi=ex1-4,j6_hi=ex2-4,k6_hi=ex3-4; + const int i8_lo=(iminF+5>0)?iminF+5:0,j8_lo=(jminF+5>0)?jminF+5:0,k8_lo=0,i8_hi=ex1-5,j8_hi=ex2-5,k8_hi=ex3-5; + const int has4=(i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi),has6=(i6_lo<=i6_hi&&j6_lo<=j6_hi&&k6_lo<=k6_hi),has8=(i8_lo<=i8_hi&&j8_lo<=j8_hi&&k8_lo<=k8_hi); + #define FH(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)] + + /* 2nd-order pass */ + if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1; + for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1; + for(int i0=i2_lo;i0<=i2_hi;++i0){_Bool in4=has4&&i0>=i4_lo&&i0<=i4_hi&&j0>=j4_lo&&j0<=j4_hi&&k0>=k4_lo&&k0<=k4_hi;if(in4)continue; + const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF)); + fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF)); + fzz[p]=Sdzdz*(FH(iF,jF,kF-1)-TWO*FH(iF,jF,kF)+FH(iF,jF,kF+1)); + fxy[p]=Sdxdy*(FH(iF-1,jF-1,kF)-FH(iF+1,jF-1,kF)-FH(iF-1,jF+1,kF)+FH(iF+1,jF+1,kF)); + fxz[p]=Sdxdz*(FH(iF-1,jF,kF-1)-FH(iF+1,jF,kF-1)-FH(iF-1,jF,kF+1)+FH(iF+1,jF,kF+1)); + fyz[p]=Sdydz*(FH(iF,jF-1,kF-1)-FH(iF,jF+1,kF-1)-FH(iF,jF-1,kF+1)+FH(iF,jF+1,kF+1)); + }}}} + /* 4th-order pass */ + if(has4){for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1; + for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1; + for(int i0=i4_lo;i0<=i4_hi;++i0){_Bool in6=has6&&i0>=i6_lo&&i0<=i6_hi&&j0>=j6_lo&&j0<=j6_hi&&k0>=k6_lo&&k0<=k6_hi;if(in6)continue; + const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fxx[p]=Fdxdx*(-FH(iF-2,jF,kF)+F16*FH(iF-1,jF,kF)-F30*FH(iF,jF,kF)-FH(iF+2,jF,kF)+F16*FH(iF+1,jF,kF)); + fyy[p]=Fdydy*(-FH(iF,jF-2,kF)+F16*FH(iF,jF-1,kF)-F30*FH(iF,jF,kF)-FH(iF,jF+2,kF)+F16*FH(iF,jF+1,kF)); + fzz[p]=Fdzdz*(-FH(iF,jF,kF-2)+F16*FH(iF,jF,kF-1)-F30*FH(iF,jF,kF)-FH(iF,jF,kF+2)+F16*FH(iF,jF,kF+1)); + {const double t_jm2=(FH(iF-2,jF-2,kF)-F8*FH(iF-1,jF-2,kF)+F8*FH(iF+1,jF-2,kF)-FH(iF+2,jF-2,kF)); + const double t_jm1=(FH(iF-2,jF-1,kF)-F8*FH(iF-1,jF-1,kF)+F8*FH(iF+1,jF-1,kF)-FH(iF+2,jF-1,kF)); + const double t_jp1=(FH(iF-2,jF+1,kF)-F8*FH(iF-1,jF+1,kF)+F8*FH(iF+1,jF+1,kF)-FH(iF+2,jF+1,kF)); + const double t_jp2=(FH(iF-2,jF+2,kF)-F8*FH(iF-1,jF+2,kF)+F8*FH(iF+1,jF+2,kF)-FH(iF+2,jF+2,kF)); + fxy[p]=Fdxdy*(t_jm2-F8*t_jm1+F8*t_jp1-t_jp2);} + {const double t_km2=(FH(iF-2,jF,kF-2)-F8*FH(iF-1,jF,kF-2)+F8*FH(iF+1,jF,kF-2)-FH(iF+2,jF,kF-2)); + const double t_km1=(FH(iF-2,jF,kF-1)-F8*FH(iF-1,jF,kF-1)+F8*FH(iF+1,jF,kF-1)-FH(iF+2,jF,kF-1)); + const double t_kp1=(FH(iF-2,jF,kF+1)-F8*FH(iF-1,jF,kF+1)+F8*FH(iF+1,jF,kF+1)-FH(iF+2,jF,kF+1)); + const double t_kp2=(FH(iF-2,jF,kF+2)-F8*FH(iF-1,jF,kF+2)+F8*FH(iF+1,jF,kF+2)-FH(iF+2,jF,kF+2)); + fxz[p]=Fdxdz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);} + {const double t_km2=(FH(iF,jF-2,kF-2)-F8*FH(iF,jF-1,kF-2)+F8*FH(iF,jF+1,kF-2)-FH(iF,jF+2,kF-2)); + const double t_km1=(FH(iF,jF-2,kF-1)-F8*FH(iF,jF-1,kF-1)+F8*FH(iF,jF+1,kF-1)-FH(iF,jF+2,kF-1)); + const double t_kp1=(FH(iF,jF-2,kF+1)-F8*FH(iF,jF-1,kF+1)+F8*FH(iF,jF+1,kF+1)-FH(iF,jF+2,kF+1)); + const double t_kp2=(FH(iF,jF-2,kF+2)-F8*FH(iF,jF-1,kF+2)+F8*FH(iF,jF+1,kF+2)-FH(iF,jF+2,kF+2)); + fyz[p]=Fdydz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);} + }}}} + /* 6th-order pass */ + if(has6){for(int k0=k6_lo;k0<=k6_hi;++k0){const int kF=k0+1; + for(int j0=j6_lo;j0<=j6_hi;++j0){const int jF=j0+1; + for(int i0=i6_lo;i0<=i6_hi;++i0){if(has8&&i0>=i8_lo&&i0<=i8_hi&&j0>=j8_lo&&j0<=j8_hi&&k0>=k8_lo&&k0<=k8_hi)continue; + const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fxx[p]=Xdxdx*(TWO*FH(iF-3,jF,kF)-F27*FH(iF-2,jF,kF)+F270*FH(iF-1,jF,kF)-F490*FH(iF,jF,kF)+F270*FH(iF+1,jF,kF)-F27*FH(iF+2,jF,kF)+TWO*FH(iF+3,jF,kF)); + fyy[p]=Xdydy*(TWO*FH(iF,jF-3,kF)-F27*FH(iF,jF-2,kF)+F270*FH(iF,jF-1,kF)-F490*FH(iF,jF,kF)+F270*FH(iF,jF+1,kF)-F27*FH(iF,jF+2,kF)+TWO*FH(iF,jF+3,kF)); + fzz[p]=Xdzdz*(TWO*FH(iF,jF,kF-3)-F27*FH(iF,jF,kF-2)+F270*FH(iF,jF,kF-1)-F490*FH(iF,jF,kF)+F270*FH(iF,jF,kF+1)-F27*FH(iF,jF,kF+2)+TWO*FH(iF,jF,kF+3)); + #define XS6_8(JF,KFDUMMY) (-FH(iF-3,JF,KFDUMMY)+F9*FH(iF-2,JF,KFDUMMY)-F45*FH(iF-1,JF,KFDUMMY)+F45*FH(iF+1,JF,KFDUMMY)-F9*FH(iF+2,JF,KFDUMMY)+FH(iF+3,JF,KFDUMMY)) + fxy[p]=Xdxdy*(-XS6_8(jF-3,kF)+F9*XS6_8(jF-2,kF)-F45*XS6_8(jF-1,kF)+F45*XS6_8(jF+1,kF)-F9*XS6_8(jF+2,kF)+XS6_8(jF+3,kF)); + fxz[p]=Xdxdz*(-XS6_8(jF,kF-3)+F9*XS6_8(jF,kF-2)-F45*XS6_8(jF,kF-1)+F45*XS6_8(jF,kF+1)-F9*XS6_8(jF,kF+2)+XS6_8(jF,kF+3)); + #undef XS6_8 + #define YS6_8(JF,KFDUMMY) (-FH(iF,JF-3,KFDUMMY)+F9*FH(iF,JF-2,KFDUMMY)-F45*FH(iF,JF-1,KFDUMMY)+F45*FH(iF,JF+1,KFDUMMY)-F9*FH(iF,JF+2,KFDUMMY)+FH(iF,JF+3,KFDUMMY)) + fyz[p]=Xdydz*(-YS6_8(jF,kF-3)+F9*YS6_8(jF,kF-2)-F45*YS6_8(jF,kF-1)+F45*YS6_8(jF,kF+1)-F9*YS6_8(jF,kF+2)+YS6_8(jF,kF+3)); + #undef YS6_8 + }}}} + /* 8th-order pass */ + if(has8){for(int k0=k8_lo;k0<=k8_hi;++k0){const int kF=k0+1; + for(int j0=j8_lo;j0<=j8_hi;++j0){const int jF=j0+1; + for(int i0=i8_lo;i0<=i8_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fxx[p]=Edxdx*(-(double)9*FH(iF-4,jF,kF)+F128*FH(iF-3,jF,kF)-F1008*FH(iF-2,jF,kF)+F8064*FH(iF-1,jF,kF)-F14350*FH(iF,jF,kF)+F8064*FH(iF+1,jF,kF)-F1008*FH(iF+2,jF,kF)+F128*FH(iF+3,jF,kF)-(double)9*FH(iF+4,jF,kF)); + fyy[p]=Edydy*(-(double)9*FH(iF,jF-4,kF)+F128*FH(iF,jF-3,kF)-F1008*FH(iF,jF-2,kF)+F8064*FH(iF,jF-1,kF)-F14350*FH(iF,jF,kF)+F8064*FH(iF,jF+1,kF)-F1008*FH(iF,jF+2,kF)+F128*FH(iF,jF+3,kF)-(double)9*FH(iF,jF+4,kF)); + fzz[p]=Edzdz*(-(double)9*FH(iF,jF,kF-4)+F128*FH(iF,jF,kF-3)-F1008*FH(iF,jF,kF-2)+F8064*FH(iF,jF,kF-1)-F14350*FH(iF,jF,kF)+F8064*FH(iF,jF,kF+1)-F1008*FH(iF,jF,kF+2)+F128*FH(iF,jF,kF+3)-(double)9*FH(iF,jF,kF+4)); + #define XS8(JF,KFDUMMY) (+(double)3*FH(iF-4,JF,KFDUMMY)-F32*FH(iF-3,JF,KFDUMMY)+F168*FH(iF-2,JF,KFDUMMY)-F672*FH(iF-1,JF,KFDUMMY)+F672*FH(iF+1,JF,KFDUMMY)-F168*FH(iF+2,JF,KFDUMMY)+F32*FH(iF+3,JF,KFDUMMY)-(double)3*FH(iF+4,JF,KFDUMMY)) + fxy[p]=Edxdy*(+(double)3*XS8(jF-4,kF)-F32*XS8(jF-3,kF)+F168*XS8(jF-2,kF)-F672*XS8(jF-1,kF)+F672*XS8(jF+1,kF)-F168*XS8(jF+2,kF)+F32*XS8(jF+3,kF)-(double)3*XS8(jF+4,kF)); + fxz[p]=Edxdz*(+(double)3*XS8(jF,kF-4)-F32*XS8(jF,kF-3)+F168*XS8(jF,kF-2)-F672*XS8(jF,kF-1)+F672*XS8(jF,kF+1)-F168*XS8(jF,kF+2)+F32*XS8(jF,kF+3)-(double)3*XS8(jF,kF+4)); + #undef XS8 + #define YS8(JF,KFDUMMY) (+(double)3*FH(iF,JF-4,KFDUMMY)-F32*FH(iF,JF-3,KFDUMMY)+F168*FH(iF,JF-2,KFDUMMY)-F672*FH(iF,JF-1,KFDUMMY)+F672*FH(iF,JF+1,KFDUMMY)-F168*FH(iF,JF+2,KFDUMMY)+F32*FH(iF,JF+3,KFDUMMY)-(double)3*FH(iF,JF+4,KFDUMMY)) + fyz[p]=Edydz*(+(double)3*YS8(jF,kF-4)-F32*YS8(jF,kF-3)+F168*YS8(jF,kF-2)-F672*YS8(jF,kF-1)+F672*YS8(jF,kF+1)-F168*YS8(jF,kF+2)+F32*YS8(jF,kF+3)-(double)3*YS8(jF,kF+4)); + #undef YS8 + }}}} + #undef FH + return; + } +#else +#error "fdderivs_sh_c.C: unsupported ghost_width" +#endif +} diff --git a/AMSS_NCKU_source/fdderivs_shc_c.C b/AMSS_NCKU_source/fdderivs_shc_c.C new file mode 100644 index 0000000..dfd49bf --- /dev/null +++ b/AMSS_NCKU_source/fdderivs_shc_c.C @@ -0,0 +1,115 @@ +#include "macrodef.h" +#include "share_func.h" +#include + +/* Forward declarations */ +void fderivs_sh(const int ex[3], const double *f, + double *fx, double *fy, double *fz, + const double *X, const double *Y, const double *Z, + double SYM1, double SYM2, double SYM3, + int Symmetry, int onoff, int sst); + +void fdderivs_sh(const int ex[3], const double *f, + double *fxx, double *fxy, double *fxz, + double *fyy, double *fyz, double *fzz, + const double *X, const double *Y, const double *Z, + double SYM1, double SYM2, double SYM3, + int Symmetry, int onoff, int sst); + +/* + * fdderivs_shc — shell second derivatives converted to Cartesian via chain rule. + * + * Calls fderivs_sh and fdderivs_sh internally, then applies: + * f_{,ij} = sum_{a,b} (dx^a/dx^i)(dx^b/dx^j) f_{,ab} + sum_a (d^2 x^a/dx^i dx^j) f_{,a} + * where a,b ∈ {rho, sigma, R}. + */ +extern "C" { + +void f_fdderivs_shc(int *ex, + double *f, + double *fxx, double *fxy, double *fxz, + double *fyy, double *fyz, double *fzz, + double *crho, double *sigma, double *R, + double &SYM1, double &SYM2, double &SYM3, + int &Symmetry, int &Lev, int &sst, + double *drhodx, double *drhody, double *drhodz, + double *dsigmadx, double *dsigmady, double *dsigmadz, + double *dRdx, double *dRdy, double *dRdz, + double *drhodxx, double *drhodxy, double *drhodxz, + double *drhodyy, double *drhodyz, double *drhodzz, + double *dsigmadxx, double *dsigmadxy, double *dsigmadxz, + double *dsigmadyy, double *dsigmadyz, double *dsigmadzz, + double *dRdxx, double *dRdxy, double *dRdxz, + double *dRdyy, double *dRdyz, double *dRdzz) +{ + const int ex3[3] = { ex[0], ex[1], ex[2] }; + const size_t n = (size_t)ex[0] * (size_t)ex[1] * (size_t)ex[2]; + + double *gx = (double*)malloc(n * sizeof(double)); + double *gy = (double*)malloc(n * sizeof(double)); + double *gz = (double*)malloc(n * sizeof(double)); + double *gxx = (double*)malloc(n * sizeof(double)); + double *gxy = (double*)malloc(n * sizeof(double)); + double *gxz = (double*)malloc(n * sizeof(double)); + double *gyy = (double*)malloc(n * sizeof(double)); + double *gyz = (double*)malloc(n * sizeof(double)); + double *gzz = (double*)malloc(n * sizeof(double)); + + if (!gx||!gy||!gz||!gxx||!gxy||!gxz||!gyy||!gyz||!gzz) { + free(gx);free(gy);free(gz);free(gxx);free(gxy);free(gxz);free(gyy);free(gyz);free(gzz); + return; + } + + fderivs_sh(ex3, f, gx, gy, gz, crho, sigma, R, SYM1, SYM2, SYM3, Symmetry, Lev, sst); + fdderivs_sh(ex3, f, gxx, gxy, gxz, gyy, gyz, gzz, crho, sigma, R, SYM1, SYM2, SYM3, Symmetry, Lev, sst); + + for (size_t i = 0; i < n; ++i) { + const double rx=drhodx[i], ry=drhody[i], rz=drhodz[i]; + const double sx=dsigmadx[i], sy=dsigmady[i], sz=dsigmadz[i]; + const double Rx=dRdx[i], Ry=dRdy[i], Rz=dRdz[i]; + const double rxx=drhodxx[i], rxy=drhodxy[i], rxz=drhodxz[i]; + const double ryy=drhodyy[i], ryz=drhodyz[i], rzz=drhodzz[i]; + const double sxx=dsigmadxx[i], sxy=dsigmadxy[i], sxz=dsigmadxz[i]; + const double syy=dsigmadyy[i], syz=dsigmadyz[i], szz=dsigmadzz[i]; + const double Rxx=dRdxx[i], Rxy=dRdxy[i], Rxz=dRdxz[i]; + const double Ryy=dRdyy[i], Ryz=dRdyz[i], Rzz=dRdzz[i]; + + const double Gr=gx[i], Gs=gy[i], GR=gz[i]; + const double Grr=gxx[i], Grs=gxy[i], GrR=gxz[i]; + const double Gss=gyy[i], GsR=gyz[i], GRR=gzz[i]; + + /* fxx */ + fxx[i] = rx*rx*Grr + sx*sx*Gss + Rx*Rx*GRR + + 2.0*(rx*sx*Grs + rx*Rx*GrR + sx*Rx*GsR) + + rxx*Gr + sxx*Gs + Rxx*GR; + + /* fxy */ + fxy[i] = rx*ry*Grr + sx*sy*Gss + Rx*Ry*GRR + + rx*sy*Grs + ry*sx*Grs + rx*Ry*GrR + ry*Rx*GrR + sx*Ry*GsR + sy*Rx*GsR + + rxy*Gr + sxy*Gs + Rxy*GR; + + /* fxz */ + fxz[i] = rx*rz*Grr + sx*sz*Gss + Rx*Rz*GRR + + rx*sz*Grs + rz*sx*Grs + rx*Rz*GrR + rz*Rx*GrR + sx*Rz*GsR + sz*Rx*GsR + + rxz*Gr + sxz*Gs + Rxz*GR; + + /* fyy */ + fyy[i] = ry*ry*Grr + sy*sy*Gss + Ry*Ry*GRR + + 2.0*(ry*sy*Grs + ry*Ry*GrR + sy*Ry*GsR) + + ryy*Gr + syy*Gs + Ryy*GR; + + /* fyz */ + fyz[i] = ry*rz*Grr + sy*sz*Gss + Ry*Rz*GRR + + ry*sz*Grs + rz*sy*Grs + ry*Rz*GrR + rz*Ry*GrR + sy*Rz*GsR + sz*Ry*GsR + + ryz*Gr + syz*Gs + Ryz*GR; + + /* fzz */ + fzz[i] = rz*rz*Grr + sz*sz*Gss + Rz*Rz*GRR + + 2.0*(rz*sz*Grs + rz*Rz*GrR + sz*Rz*GsR) + + rzz*Gr + szz*Gs + Rzz*GR; + } + + free(gx);free(gy);free(gz);free(gxx);free(gxy);free(gxz);free(gyy);free(gyz);free(gzz); +} + +} // extern "C" diff --git a/AMSS_NCKU_source/fderivs_sh_c.C b/AMSS_NCKU_source/fderivs_sh_c.C new file mode 100644 index 0000000..de07157 --- /dev/null +++ b/AMSS_NCKU_source/fderivs_sh_c.C @@ -0,0 +1,234 @@ +#include "macrodef.h" +#include "share_func.h" + +/* + * C 版 fderivs_sh — first derivatives on shell patch in (rho, sigma, R) coords. + * + * Same stencil coefficients as Cartesian fderivs, but: + * - Uses symmetry_stbd (ghost on BOTH sides of x/y, none in z) + * - fh buffer: (-ord+1:ex+ord) in x/y, (1:ex) in z + * - SoA is 2-element only (x/y), no z-symmetry + * - sst parameter (shell surface type, not used in stencil computation) + */ +void fderivs_sh(const int ex[3], + const double *f, + double *fx, double *fy, double *fz, + const double *X, const double *Y, const double *Z, + double SYM1, double SYM2, double SYM3, + int Symmetry, int onoff, int sst) +{ + (void)SYM3; (void)onoff; (void)sst; + + const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, EIT = 8.0; + const double F9 = 9.0, F12 = 12.0, F45 = 45.0, F60 = 60.0; + const double F32 = 32.0, F168 = 168.0, F672 = 672.0, F840 = 840.0; + + const int NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2; + const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3; + const double SoA[2] = { SYM1, SYM2 }; + +#if (ghost_width == 2) + { + const int ord = 1; + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = 0; + if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = 0; + if ((sst==2||sst==4) && fabs(Y[0]) < dY) jminF = 0; // EQ reflection + + const size_t nx = (size_t)ex1 + 2 * ord; + const size_t ny = (size_t)ex2 + 2 * ord; + const size_t nz = (size_t)ex3; + const size_t fh_size = nx * ny * nz; + static double *fh_buf = NULL; static size_t cap = 0; + if (fh_size > cap) { free(fh_buf); fh_buf = (double*)aligned_alloc(64, fh_size*sizeof(double)); cap = fh_size; } + double *fh = fh_buf; if (!fh) return; + symmetry_stbd(ord, ex, f, fh, SoA); + + const double d2dx = ONE/TWO/dX, d2dy = ONE/TWO/dY, d2dz = ONE/TWO/dZ; + const size_t all = (size_t)ex1*ex2*ex3; + for (size_t p=0;p0)?iminF:0, j2_lo=(jminF>0)?jminF:0, k2_lo=0; + const int i2_hi=ex1-2, j2_hi=ex2-2, k2_hi=ex3-2; + if (i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi) { + for (int k0=k2_lo;k0<=k2_hi;++k0) { const int kF=k0+1; + for (int j0=j2_lo;j0<=j2_hi;++j0) { const int jF=j0+1; + for (int i0=i2_lo;i0<=i2_hi;++i0) { const int iF=i0+1; + const size_t p=idx_ex(i0,j0,k0,ex); + fx[p]=d2dx*(-fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)]); + fy[p]=d2dy*(-fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)]); + fz[p]=d2dz*(-fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)]); + }}} + } + return; + } +#elif (ghost_width == 3) + { + const int ord = 2; + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -1; + if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -1; + if ((sst==2||sst==4) && fabs(Y[0]) < dY) jminF = -1; + + const size_t nx=(size_t)ex1+2*ord, ny=(size_t)ex2+2*ord, nz=(size_t)ex3; + const size_t fh_size=nx*ny*nz; + static double *fh_buf=NULL; static size_t cap=0; + if (fh_size>cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;} + double *fh=fh_buf; if(!fh)return; + symmetry_stbd(ord,ex,f,fh,SoA); + + const double d12dx=ONE/F12/dX, d12dy=ONE/F12/dY, d12dz=ONE/F12/dZ; + const double d2dx=ONE/TWO/dX, d2dy=ONE/TWO/dY, d2dz=ONE/TWO/dZ; + const size_t all=(size_t)ex1*ex2*ex3; + for(size_t p=0;p0)?iminF:0, j2_lo=(jminF>0)?jminF:0, k2_lo=0; + const int i2_hi=ex1-2, j2_hi=ex2-2, k2_hi=ex3-2; + const int i4_lo=(iminF+1>0)?iminF+1:0, j4_lo=(jminF+1>0)?jminF+1:0, k4_lo=0; + const int i4_hi=ex1-3, j4_hi=ex2-3, k4_hi=ex3-3; + + if (i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi) { + for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1; + for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1; + for(int i0=i2_lo;i0<=i2_hi;++i0){const int iF=i0+1; + const size_t p=idx_ex(i0,j0,k0,ex); + fx[p]=d2dx*(-fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)]); + fy[p]=d2dy*(-fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)]); + fz[p]=d2dz*(-fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)]); + }}} + } + if (i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi) { + for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1; + for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1; + for(int i0=i4_lo;i0<=i4_hi;++i0){const int iF=i0+1; + const size_t p=idx_ex(i0,j0,k0,ex); + fx[p]=d12dx*(fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]-EIT*fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+EIT*fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)]-fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)]); + fy[p]=d12dy*(fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]-EIT*fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+EIT*fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)]-fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)]); + fz[p]=d12dz*(fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]-EIT*fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+EIT*fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)]-fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)]); + }}} + } + return; + } +#elif (ghost_width == 4) + { + const int ord = 3; + int iminF=1,jminF=1,kminF=1; + if(Symmetry==OCTANT&&fabs(X[0])cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;} + double *fh=fh_buf;if(!fh)return; + symmetry_stbd(ord,ex,f,fh,SoA); + + const double d60dx=ONE/F60/dX,d60dy=ONE/F60/dY,d60dz=ONE/F60/dZ; + const double d12dx=ONE/F12/dX,d12dy=ONE/F12/dY,d12dz=ONE/F12/dZ; + const double d2dx=ONE/TWO/dX,d2dy=ONE/TWO/dY,d2dz=ONE/TWO/dZ; + const size_t all=(size_t)ex1*ex2*ex3; + for(size_t p=0;p0)?iminF+1:0,j2_lo=(jminF+1>0)?jminF+1:0,k2_lo=0,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2; + const int i4_lo=(iminF+2>0)?iminF+2:0,j4_lo=(jminF+2>0)?jminF+2:0,k4_lo=0,i4_hi=ex1-3,j4_hi=ex2-3,k4_hi=ex3-3; + const int i6_lo=(iminF+3>0)?iminF+3:0,j6_lo=(jminF+3>0)?jminF+3:0,k6_lo=0,i6_hi=ex1-4,j6_hi=ex2-4,k6_hi=ex3-4; + + if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){ + for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1; + for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1; + for(int i0=i2_lo;i0<=i2_hi;++i0){const int iF=i0+1; + const size_t p=idx_ex(i0,j0,k0,ex); + fx[p]=d2dx*(-fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)]); + fy[p]=d2dy*(-fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)]); + fz[p]=d2dz*(-fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)]); + }}} + } + if(i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi){ + for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1; + for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1; + for(int i0=i4_lo;i0<=i4_hi;++i0){const int iF=i0+1; + const size_t p=idx_ex(i0,j0,k0,ex); + fx[p]=d12dx*(fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]-EIT*fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+EIT*fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)]-fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)]); + fy[p]=d12dy*(fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]-EIT*fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+EIT*fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)]-fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)]); + fz[p]=d12dz*(fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]-EIT*fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+EIT*fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)]-fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)]); + }}} + } + if(i6_lo<=i6_hi&&j6_lo<=j6_hi&&k6_lo<=k6_hi){ + for(int k0=k6_lo;k0<=k6_hi;++k0){const int kF=k0+1; + for(int j0=j6_lo;j0<=j6_hi;++j0){const int jF=j0+1; + for(int i0=i6_lo;i0<=i6_hi;++i0){const int iF=i0+1; + const size_t p=idx_ex(i0,j0,k0,ex); + fx[p]=d60dx*(-fh[idx_fh_stbd(iF-3,jF,kF,ord,ex)]+F9*fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]-F45*fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+F45*fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)]-F9*fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+3,jF,kF,ord,ex)]); + fy[p]=d60dy*(-fh[idx_fh_stbd(iF,jF-3,kF,ord,ex)]+F9*fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]-F45*fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+F45*fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)]-F9*fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+3,kF,ord,ex)]); + fz[p]=d60dz*(-fh[idx_fh_stbd(iF,jF,kF-3,ord,ex)]+F9*fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]-F45*fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+F45*fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)]-F9*fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+3,ord,ex)]); + }}} + } + return; + } +#elif (ghost_width == 5) + { + const int ord = 4; + int iminF=1,jminF=1,kminF=1; + if(Symmetry==OCTANT&&fabs(X[0])cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;} + double *fh=fh_buf;if(!fh)return; + symmetry_stbd(ord,ex,f,fh,SoA); + + const double d840dx=ONE/F840/dX,d840dy=ONE/F840/dY,d840dz=ONE/F840/dZ; + const double d60dx=ONE/F60/dX,d60dy=ONE/F60/dY,d60dz=ONE/F60/dZ; + const double d12dx=ONE/F12/dX,d12dy=ONE/F12/dY,d12dz=ONE/F12/dZ; + const double d2dx=ONE/TWO/dX,d2dy=ONE/TWO/dY,d2dz=ONE/TWO/dZ; + const size_t all=(size_t)ex1*ex2*ex3; + for(size_t p=0;p0)?iminF+2:0,j2_lo=(jminF+2>0)?jminF+2:0,k2_lo=0,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2; + const int i4_lo=(iminF+3>0)?iminF+3:0,j4_lo=(jminF+3>0)?jminF+3:0,k4_lo=0,i4_hi=ex1-3,j4_hi=ex2-3,k4_hi=ex3-3; + const int i6_lo=(iminF+4>0)?iminF+4:0,j6_lo=(jminF+4>0)?jminF+4:0,k6_lo=0,i6_hi=ex1-4,j6_hi=ex2-4,k6_hi=ex3-4; + const int i8_lo=(iminF+5>0)?iminF+5:0,j8_lo=(jminF+5>0)?jminF+5:0,k8_lo=0,i8_hi=ex1-5,j8_hi=ex2-5,k8_hi=ex3-5; + + #define FH_S(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)] + if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1; + for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1; + for(int i0=i2_lo;i0<=i2_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fx[p]=d2dx*(-FH_S(iF-1,jF,kF)+FH_S(iF+1,jF,kF)); + fy[p]=d2dy*(-FH_S(iF,jF-1,kF)+FH_S(iF,jF+1,kF)); + fz[p]=d2dz*(-FH_S(iF,jF,kF-1)+FH_S(iF,jF,kF+1));}}}} + + if(i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi){for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1; + for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1; + for(int i0=i4_lo;i0<=i4_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fx[p]=d12dx*(FH_S(iF-2,jF,kF)-EIT*FH_S(iF-1,jF,kF)+EIT*FH_S(iF+1,jF,kF)-FH_S(iF+2,jF,kF)); + fy[p]=d12dy*(FH_S(iF,jF-2,kF)-EIT*FH_S(iF,jF-1,kF)+EIT*FH_S(iF,jF+1,kF)-FH_S(iF,jF+2,kF)); + fz[p]=d12dz*(FH_S(iF,jF,kF-2)-EIT*FH_S(iF,jF,kF-1)+EIT*FH_S(iF,jF,kF+1)-FH_S(iF,jF,kF+2));}}}} + + if(i6_lo<=i6_hi&&j6_lo<=j6_hi&&k6_lo<=k6_hi){for(int k0=k6_lo;k0<=k6_hi;++k0){const int kF=k0+1; + for(int j0=j6_lo;j0<=j6_hi;++j0){const int jF=j0+1; + for(int i0=i6_lo;i0<=i6_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fx[p]=d60dx*(-FH_S(iF-3,jF,kF)+F9*FH_S(iF-2,jF,kF)-F45*FH_S(iF-1,jF,kF)+F45*FH_S(iF+1,jF,kF)-F9*FH_S(iF+2,jF,kF)+FH_S(iF+3,jF,kF)); + fy[p]=d60dy*(-FH_S(iF,jF-3,kF)+F9*FH_S(iF,jF-2,kF)-F45*FH_S(iF,jF-1,kF)+F45*FH_S(iF,jF+1,kF)-F9*FH_S(iF,jF+2,kF)+FH_S(iF,jF+3,kF)); + fz[p]=d60dz*(-FH_S(iF,jF,kF-3)+F9*FH_S(iF,jF,kF-2)-F45*FH_S(iF,jF,kF-1)+F45*FH_S(iF,jF,kF+1)-F9*FH_S(iF,jF,kF+2)+FH_S(iF,jF,kF+3));}}}} + + if(i8_lo<=i8_hi&&j8_lo<=j8_hi&&k8_lo<=k8_hi){for(int k0=k8_lo;k0<=k8_hi;++k0){const int kF=k0+1; + for(int j0=j8_lo;j0<=j8_hi;++j0){const int jF=j0+1; + for(int i0=i8_lo;i0<=i8_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + fx[p]=d840dx*(+(double)3*FH_S(iF-4,jF,kF)-F32*FH_S(iF-3,jF,kF)+F168*FH_S(iF-2,jF,kF)-F672*FH_S(iF-1,jF,kF)+F672*FH_S(iF+1,jF,kF)-F168*FH_S(iF+2,jF,kF)+F32*FH_S(iF+3,jF,kF)-(double)3*FH_S(iF+4,jF,kF)); + fy[p]=d840dy*(+(double)3*FH_S(iF,jF-4,kF)-F32*FH_S(iF,jF-3,kF)+F168*FH_S(iF,jF-2,kF)-F672*FH_S(iF,jF-1,kF)+F672*FH_S(iF,jF+1,kF)-F168*FH_S(iF,jF+2,kF)+F32*FH_S(iF,jF+3,kF)-(double)3*FH_S(iF,jF+4,kF)); + fz[p]=d840dz*(+(double)3*FH_S(iF,jF,kF-4)-F32*FH_S(iF,jF,kF-3)+F168*FH_S(iF,jF,kF-2)-F672*FH_S(iF,jF,kF-1)+F672*FH_S(iF,jF,kF+1)-F168*FH_S(iF,jF,kF+2)+F32*FH_S(iF,jF,kF+3)-(double)3*FH_S(iF,jF,kF+4));}}}} + #undef FH_S + return; + } +#else +#error "fderivs_sh_c.C: unsupported ghost_width" +#endif +} diff --git a/AMSS_NCKU_source/fderivs_shc_c.C b/AMSS_NCKU_source/fderivs_shc_c.C new file mode 100644 index 0000000..beec39c --- /dev/null +++ b/AMSS_NCKU_source/fderivs_shc_c.C @@ -0,0 +1,55 @@ +#include "macrodef.h" +#include "share_func.h" +#include + +/* + * fderivs_shc — shell first derivatives converted to Cartesian via chain rule. + * + * Calls fderivs_sh internally, then: + * fx = drhodx * df/drho + dsigmadx * df/dsigma + dRdx * df/dR + * fy = drhody * df/drho + dsigmady * df/dsigma + dRdy * df/dR + * fz = drhodz * df/drho + dsigmadz * df/dsigma + dRdz * df/dR + */ + +// Forward declaration (defined in fderivs_sh_c.C) +void fderivs_sh(const int ex[3], const double *f, + double *fx, double *fy, double *fz, + const double *X, const double *Y, const double *Z, + double SYM1, double SYM2, double SYM3, + int Symmetry, int onoff, int sst); + +extern "C" { + +void f_fderivs_shc(int *ex, + double *f, + double *fx, double *fy, double *fz, + double *crho, double *sigma, double *R, + double &SYM1, double &SYM2, double &SYM3, + int &Symmetry, int &Lev, int &sst, + double *drhodx, double *drhody, double *drhodz, + double *dsigmadx, double *dsigmady, double *dsigmadz, + double *dRdx, double *dRdy, double *dRdz) +{ + const int ex3[3] = { ex[0], ex[1], ex[2] }; + const size_t n = (size_t)ex[0] * (size_t)ex[1] * (size_t)ex[2]; + + // Temporary shell-coordinate derivatives + double *gx = (double*)malloc(n * sizeof(double)); + double *gy = (double*)malloc(n * sizeof(double)); + double *gz = (double*)malloc(n * sizeof(double)); + if (!gx || !gy || !gz) { free(gx); free(gy); free(gz); return; } + + // Compute shell-coordinate derivatives + fderivs_sh(ex3, f, gx, gy, gz, crho, sigma, R, SYM1, SYM2, SYM3, Symmetry, Lev, sst); + + // Chain rule to Cartesian + for (size_t i = 0; i < n; ++i) { + fx[i] = drhodx[i] * gx[i] + dsigmadx[i] * gy[i] + dRdx[i] * gz[i]; + fy[i] = drhody[i] * gx[i] + dsigmady[i] * gy[i] + dRdy[i] * gz[i]; + fz[i] = drhodz[i] * gx[i] + dsigmadz[i] * gy[i] + dRdz[i] * gz[i]; + } + + free(gx); free(gy); free(gz); +} + +} // extern "C" diff --git a/AMSS_NCKU_source/kodiss_sh_c.C b/AMSS_NCKU_source/kodiss_sh_c.C new file mode 100644 index 0000000..579c98a --- /dev/null +++ b/AMSS_NCKU_source/kodiss_sh_c.C @@ -0,0 +1,136 @@ +#include "macrodef.h" +#include "share_func.h" + +/* + * kodis_sh — Kreiss-Oliger dissipation on shell patches. + * Same stencil coefficients as Cartesian kodis. Uses symmetry_stbd. + */ +void kodis_sh(const int ex[3], + const double *X, const double *Y, const double *Z, + const double *f, double *f_rhs, + const double SoAi[2], + int Symmetry, double eps, int sst) +{ + (void)sst; + const double ZEO=0.0; + const int ex1=ex[0], ex2=ex[1], ex3=ex[2]; + const double dX=X[1]-X[0], dY=Y[1]-Y[0], dZ=Z[1]-Z[0]; + const int imaxF=ex1, jmaxF=ex2, kmaxF=ex3; + const double SoA[2]={SoAi[0],SoAi[1]}; + +#if (ghost_width == 2) + { + const int ord=2, r=2; + const double cof=16.0, F4=4.0, F6=6.0; + const int NO_SYMM=0, OCTANT=2; + int iminF=1,jminF=1,kminF=1; + if(Symmetry==OCTANT&&fabs(X[0])0)?iminF+1:0,j0_lo=(jminF+1>0)?jminF+1:0,k0_lo=0; + const int i0_hi=imaxF-3,j0_hi=jmaxF-3,k0_hi=kmaxF-3; + if(!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)){ + for(int k0=k0_lo;k0<=k0_hi;++k0){const int kF=k0+1; + for(int j0=j0_lo;j0<=j0_hi;++j0){const int jF=j0+1; + for(int i0=i0_lo;i0<=i0_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + const double Dx=((fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)])-F4*(fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)])+F6*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dX; + const double Dy=((fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)])-F4*(fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)])+F6*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dY; + const double Dz=((fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)])-F4*(fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)])+F6*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dZ; + f_rhs[p]-=(eps/cof)*(Dx+Dy+Dz); + }}} + } + free(fh);return; + } +#elif (ghost_width == 3) + { + const int ord=3, r=3; + const double cof=64.0,SIX=6.0,FIT=15.0,TWT=20.0; + const int NO_SYMM=0,OCTANT=2; + int iminF=1,jminF=1,kminF=1; + if(Symmetry==OCTANT&&fabs(X[0])0)?iminF+2:0,j0_lo=(jminF+2>0)?jminF+2:0,k0_lo=0; + const int i0_hi=imaxF-4,j0_hi=jmaxF-4,k0_hi=kmaxF-4; + if(!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)){ + for(int k0=k0_lo;k0<=k0_hi;++k0){const int kF=k0+1; + for(int j0=j0_lo;j0<=j0_hi;++j0){const int jF=j0+1; + for(int i0=i0_lo;i0<=i0_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + const double Dx=((fh[idx_fh_stbd(iF-3,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+3,jF,kF,ord,ex)])-SIX*(fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)])+FIT*(fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)])-TWT*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dX; + const double Dy=((fh[idx_fh_stbd(iF,jF-3,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+3,kF,ord,ex)])-SIX*(fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)])+FIT*(fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)])-TWT*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dY; + const double Dz=((fh[idx_fh_stbd(iF,jF,kF-3,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+3,ord,ex)])-SIX*(fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)])+FIT*(fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)])-TWT*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dZ; + f_rhs[p]+=(eps/cof)*(Dx+Dy+Dz); + }}} + } + free(fh);return; + } +#elif (ghost_width == 4) + { + const int ord=4, r=4; + const double cof=256.0,F8=8.0,F28=28.0,F56=56.0,F70=70.0; + const int NO_SYMM=0,OCTANT=2; + int iminF=1,jminF=1,kminF=1; + if(Symmetry==OCTANT&&fabs(X[0])0)?iminF+3:0,j0_lo=(jminF+3>0)?jminF+3:0,k0_lo=0; + const int i0_hi=imaxF-5,j0_hi=jmaxF-5,k0_hi=kmaxF-5; + if(!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)){ + for(int k0=k0_lo;k0<=k0_hi;++k0){const int kF=k0+1; + for(int j0=j0_lo;j0<=j0_hi;++j0){const int jF=j0+1; + for(int i0=i0_lo;i0<=i0_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + const double Dx=((fh[idx_fh_stbd(iF-4,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+4,jF,kF,ord,ex)])-F8*(fh[idx_fh_stbd(iF-3,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+3,jF,kF,ord,ex)])+F28*(fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)])-F56*(fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)])+F70*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dX; + const double Dy=((fh[idx_fh_stbd(iF,jF-4,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+4,kF,ord,ex)])-F8*(fh[idx_fh_stbd(iF,jF-3,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+3,kF,ord,ex)])+F28*(fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)])-F56*(fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)])+F70*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dY; + const double Dz=((fh[idx_fh_stbd(iF,jF,kF-4,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+4,ord,ex)])-F8*(fh[idx_fh_stbd(iF,jF,kF-3,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+3,ord,ex)])+F28*(fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)])-F56*(fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)])+F70*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dZ; + f_rhs[p]-=(eps/cof)*(Dx+Dy+Dz); + }}} + } + free(fh);return; + } +#elif (ghost_width == 5) + { + const int ord=5, r=5; + const double cof=1024.0,F10=10.0,F45k=45.0,F120=120.0,F210=210.0,F252=252.0; + const int NO_SYMM=0,OCTANT=2; + int iminF=1,jminF=1,kminF=1; + if(Symmetry==OCTANT&&fabs(X[0])0)?iminF+4:0,j0_lo=(jminF+4>0)?jminF+4:0,k0_lo=0; + const int i0_hi=imaxF-6,j0_hi=jmaxF-6,k0_hi=kmaxF-6; + if(!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)){ + for(int k0=k0_lo;k0<=k0_hi;++k0){const int kF=k0+1; + for(int j0=j0_lo;j0<=j0_hi;++j0){const int jF=j0+1; + for(int i0=i0_lo;i0<=i0_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex); + const double Dx=((fh[idx_fh_stbd(iF-5,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+5,jF,kF,ord,ex)])-F10*(fh[idx_fh_stbd(iF-4,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+4,jF,kF,ord,ex)])+F45k*(fh[idx_fh_stbd(iF-3,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+3,jF,kF,ord,ex)])-F120*(fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)])+F210*(fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)])-F252*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dX; + const double Dy=((fh[idx_fh_stbd(iF,jF-5,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+5,kF,ord,ex)])-F10*(fh[idx_fh_stbd(iF,jF-4,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+4,kF,ord,ex)])+F45k*(fh[idx_fh_stbd(iF,jF-3,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+3,kF,ord,ex)])-F120*(fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)])+F210*(fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)])-F252*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dY; + const double Dz=((fh[idx_fh_stbd(iF,jF,kF-5,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+5,ord,ex)])-F10*(fh[idx_fh_stbd(iF,jF,kF-4,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+4,ord,ex)])+F45k*(fh[idx_fh_stbd(iF,jF,kF-3,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+3,ord,ex)])-F120*(fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)])+F210*(fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)])-F252*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dZ; + f_rhs[p]+=(eps/cof)*(Dx+Dy+Dz); + }}} + } + free(fh);return; + } +#else +#error "kodiss_sh_c.C: unsupported ghost_width" +#endif +} diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index 40827d6..bb7dcce 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -1,5 +1,5 @@ - - + + include makefile.inc -include AMSS_NCKU_build.mk @@ -44,9 +44,9 @@ ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KE ## make -> opt (PGO-guided, maximum performance) ## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data) PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata - -ifeq ($(PGO_MODE),instrument) -## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability + +ifeq ($(PGO_MODE),instrument) +## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \ -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \ $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) @@ -64,65 +64,81 @@ CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) endif - -.SUFFIXES: .o .f90 .C .for .cu - -.f90.o: - $(f90) $(f90appflags) -c $< -o $@ - -.C.o: - ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ - -# ShellPatch.C uses OpenMP for setupintintstuff search loops -ShellPatch.o: ShellPatch.C - ${CXX} $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@ - -.for.o: - $(f77) -c $< -o $@ - -.cu.o: - $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) - -# C rewrite of BSSN RHS kernel and helpers + +.SUFFIXES: .o .f90 .C .for .cu + +.f90.o: + $(f90) $(f90appflags) -c $< -o $@ + +.C.o: + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +# ShellPatch.C uses OpenMP for setupintintstuff search loops +ShellPatch.o: ShellPatch.C + ${CXX} $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@ + +.for.o: + $(f77) -c $< -o $@ + +.cu.o: + $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) + +# C rewrite of BSSN RHS kernel and helpers bssn_rhs_c.o: bssn_rhs_c.C ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ fderivs_c.o: fderivs_c.C ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ - -fdderivs_c.o: fdderivs_c.C - ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ - -kodiss_c.o: kodiss_c.C - ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ - + +fdderivs_c.o: fdderivs_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +kodiss_c.o: kodiss_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + lopsided_c.o: lopsided_c.C ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ lopsided_kodis_c.o: lopsided_kodis_c.C ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ +# C rewrite of shell-patch derivative kernels +fderivs_sh_c.o: fderivs_sh_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +fdderivs_sh_c.o: fdderivs_sh_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +fderivs_shc_c.o: fderivs_shc_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +fdderivs_shc_c.o: fdderivs_shc_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +kodiss_sh_c.o: kodiss_sh_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + z4c_rhs_c.o: z4c_rhs_c.C ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h # ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ - -## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS -TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata -TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=$(TP_PROFDATA) \ - -Dfortran3 -Dnewc -I${MKLROOT}/include - -TwoPunctures.o: TwoPunctures.C - ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ - -TwoPunctureABE.o: TwoPunctureABE.C - ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ - -# Input files - -## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran) + +## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS +TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata +TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ + -fprofile-instr-use=$(TP_PROFDATA) \ + -Dfortran3 -Dnewc -I${MKLROOT}/include + +TwoPunctures.o: TwoPunctures.C + ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ + +TwoPunctureABE.o: TwoPunctureABE.C + ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ + +# Input files + +## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran) ifeq ($(USE_CXX_KERNELS),0) # Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below CFILES = @@ -148,95 +164,106 @@ RK4_F90_OBJ = else RK4_F90_OBJ = rungekutta4_rout.o endif - -C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ - cgh.o bssn_class.o surface_integral.o ShellPatch.o\ - bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ - bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ - Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ - NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o - -C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ - cgh.o surface_integral.o ShellPatch.o\ - bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ - bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ - Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ - NullShellPatch2_Evo.o \ - bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o - + +## Shell-patch derivative kernel switch (independent from USE_CXX_KERNELS) +## 1 : use C++ rewrite of shell derivative functions (experimental) +## 0 : use original Fortran diff_new_sh.o and kodiss_sh.o (default) +USE_CXX_SHELL_KERNELS ?= 0 +ifeq ($(USE_CXX_SHELL_KERNELS),1) +CFILES += fderivs_sh_c.o fdderivs_sh_c.o fderivs_shc_c.o fdderivs_shc_c.o kodiss_sh_c.o +SH_F90_OBJ = +else +SH_F90_OBJ = diff_new_sh.o kodiss_sh.o point_diff_new_sh.o +endif + +C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ + cgh.o bssn_class.o surface_integral.o ShellPatch.o\ + bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ + bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ + Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ + NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o + +C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ + cgh.o surface_integral.o ShellPatch.o\ + bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ + bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ + Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ + NullShellPatch2_Evo.o \ + bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o + F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\ - $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\ - lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\ + $(RK4_F90_OBJ) diff_new.o kodiss.o\ + lopsidediff.o sommerfeld_rout.o getnp4.o $(SH_F90_OBJ)\ shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\ getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\ - fadmquantites_bssn.o $(Z4C_F90_OBJ) Z4c_rhs_ss.o point_diff_new_sh.o\ + fadmquantites_bssn.o $(Z4C_F90_OBJ) Z4c_rhs_ss.o\ cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\ getnpem2.o empart.o NullNews.o fourdcurvature.o\ bssn2adm.o adm_constraint.o adm_ricci_gamma.o\ - scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\ - NullNews2.o tool_f.o - -ifeq ($(USE_CXX_KERNELS),0) -# Fortran mode: include original bssn_rhs.o -F90FILES = $(F90FILES_BASE) bssn_rhs.o -else -# C++ mode (default): bssn_rhs.o replaced by C++ kernel -F90FILES = $(F90FILES_BASE) -endif - -F77FILES = zbesh.o - -AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \ -tgrid.o fd_grid.o ghost_zone.o array.o round.o norm.o fuzzy.o error_exit.o miscfp.o \ -linear_map.o cpm_map.o BH_diagnostics.o setup.o horizon_sequence.o find_horizons.o \ -initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o - -TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o - -CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o - -# file dependences -$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh - -$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ - misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\ - rungekutta4_rout.h var.h bssn_class.h bssn_rhs.h sommerfeld_rout.h\ - cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\ - fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\ - NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\ - empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\ - initial_null2.h NullShellPatch2.h - -$(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ - misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\ - rungekutta4_rout.h var.h bssn_rhs.h sommerfeld_rout.h\ - cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\ - fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\ - NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\ - empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\ - initial_null2.h NullShellPatch2.h \ - bssn_gpu_class.h bssn_macro.h - -$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h - -$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h - -TwoPunctureFILES: TwoPunctures.h - -$(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h - -misc.o : zbesh.o - -# projects -ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) - -ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) - -TwoPunctureABE: $(TwoPunctureFILES) - $(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) - -clean: - rm *.o ABE ABEGPU TwoPunctureABE make.log -f + scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\ + NullNews2.o tool_f.o + +ifeq ($(USE_CXX_KERNELS),0) +# Fortran mode: include original bssn_rhs.o +F90FILES = $(F90FILES_BASE) bssn_rhs.o +else +# C++ mode (default): bssn_rhs.o replaced by C++ kernel +F90FILES = $(F90FILES_BASE) +endif + +F77FILES = zbesh.o + +AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \ +tgrid.o fd_grid.o ghost_zone.o array.o round.o norm.o fuzzy.o error_exit.o miscfp.o \ +linear_map.o cpm_map.o BH_diagnostics.o setup.o horizon_sequence.o find_horizons.o \ +initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o + +TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o + +CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o + +# file dependences +$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh + +$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ + misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\ + rungekutta4_rout.h var.h bssn_class.h bssn_rhs.h sommerfeld_rout.h\ + cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\ + fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\ + NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\ + empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\ + initial_null2.h NullShellPatch2.h + +$(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ + misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\ + rungekutta4_rout.h var.h bssn_rhs.h sommerfeld_rout.h\ + cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\ + fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\ + NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\ + empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\ + initial_null2.h NullShellPatch2.h \ + bssn_gpu_class.h bssn_macro.h + +$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h + +$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h + +TwoPunctureFILES: TwoPunctures.h + +$(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h + +misc.o : zbesh.o + +# projects +ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) + $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) + +ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) + $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) + +TwoPunctureABE: $(TwoPunctureFILES) + $(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) + +clean: + rm *.o ABE ABEGPU TwoPunctureABE make.log -f diff --git a/AMSS_NCKU_source/share_func.h b/AMSS_NCKU_source/share_func.h index 0fe2630..659d1fc 100644 --- a/AMSS_NCKU_source/share_func.h +++ b/AMSS_NCKU_source/share_func.h @@ -289,4 +289,84 @@ static inline void symmetry_bd(int ord, symmetry_bd_impl(ord, ord - 1, extc, func, funcc, SoA); } + +/* + * symmetry_stbd — shell-patch (staggered boundary) ghost fill. + * + * Fortran: funcc(-ord+1:extc1+ord, -ord+1:extc2+ord, extc3) + * Only 2 SoA values (x/y). No z symmetry fill. + * Ghost on BOTH positive and negative sides of x and y. + * Reflection uses i+2 (skips boundary) instead of i+1. + * nx = extc1 + 2*ord, ny = extc2 + 2*ord + */ +static inline void symmetry_stbd(int ord, + const int extc[3], + const double *func, + double *funcc, + const double SoA[2]) +{ + const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2]; + const int nx = extc1 + 2 * ord; + const int ny = extc2 + 2 * ord; + const int sh = ord - 1; + const size_t snx = (size_t)nx; + const size_t splane = snx * (size_t)ny; + + /* 1) Copy interior: funcc(1:extc1, 1:extc2, 1:extc3) = func */ + for (int k0 = 0; k0 < extc3; ++k0) { + const double *src = func + (size_t)k0 * (size_t)extc2 * (size_t)extc1; + const size_t kbase = (size_t)k0 * splane; + for (int j0 = 0; j0 < extc2; ++j0) { + double *dst = funcc + kbase + (size_t)(sh + j0) * snx + (size_t)sh; + const double *s = src + (size_t)j0 * (size_t)extc1; + for (int i0 = 0; i0 < extc1; ++i0) dst[i0] = s[i0]; + } + } + + /* 2) x-direction ghost fill */ + const double s1 = SoA[0]; + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = (size_t)k0 * splane; + for (int j0 = 0; j0 < extc2; ++j0) { + const size_t off = kbase + (size_t)(sh + j0) * snx; + /* left side: funcc(-i) = funcc(i+2) * s1 */ + for (int i = 0; i < ord; ++i) { + funcc[off + (size_t)(sh - i)] = funcc[off + (size_t)(sh + i + 2)] * s1; + /* right side: funcc(extc1+1+i) = funcc(extc1-1-i) * s1 */ + funcc[off + (size_t)(sh + extc1 + 1 + i)] = funcc[off + (size_t)(sh + extc1 - 1 - i)] * s1; + } + } + } + + /* 3) y-direction ghost fill */ + const double s2 = SoA[1]; + for (int i = 0; i < nx; ++i) { + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = (size_t)k0 * splane; + /* bottom: funcc(:,-i,:) = funcc(:,i+2,:) * s2 */ + for (int jj = 0; jj < ord; ++jj) { + funcc[kbase + (size_t)(sh - jj) * snx + (size_t)i] = + funcc[kbase + (size_t)(sh + jj + 2) * snx + (size_t)i] * s2; + /* top: funcc(:,extc2+1+jj,:) = funcc(:,extc2-1-jj,:) * s2 */ + funcc[kbase + (size_t)(sh + extc2 + 1 + jj) * snx + (size_t)i] = + funcc[kbase + (size_t)(sh + extc2 - 1 - jj) * snx + (size_t)i] * s2; + } + } + } +} + +/* + * Indexing for shell fh buffer: Fortran fh(-ord+1:extc1+ord, -ord+1:extc2+ord, extc3) + * C 0-based: ii = iF + ord - 1 + * nx = extc1 + 2*ord, ny = extc2 + 2*ord + */ +static inline size_t idx_fh_stbd(int iF, int jF, int kF, int ord, const int extc[3]) { + const int sh = ord - 1; + const int nx = extc[0] + 2 * ord; + const int ny = extc[1] + 2 * ord; + const int ii = iF + sh; + const int jj = jF + sh; + const int kk = kF - 1; // Fortran 1-based kF → C 0-based + return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny; +} #endif