From 8a9c775705739319cdd3e9086db1919123edb2c8 Mon Sep 17 00:00:00 2001 From: wingrew <1934247510@qq.com> Date: Wed, 25 Feb 2026 18:59:33 +0800 Subject: [PATCH] Replace Fortran bssn_rhs with C implementation and add C helper kernels - Modify bssn_rhs_c.C to use existing project headers (macrodef.h, bssn_rhs.h) - Update makefile: remove bssn_rhs.o from F90FILES, add CFILES with OpenMP - Keep Fortran helper files (diff_new.f90, kodiss.f90, lopsidediff.f90) for other Fortran callers [copilot: fix compiling errors & a nan error] Co-authored-by: ianchb Co-authored-by: copilot-swe-agent[bot] <198982749+copilot@users.noreply.github.com> --- AMSS_NCKU_source/bssn_rhs_c.C | 1249 +++++++++++++++++++++++++++++++++ AMSS_NCKU_source/fdderivs_c.C | 264 +++++++ AMSS_NCKU_source/fderivs_c.C | 145 ++++ AMSS_NCKU_source/kodiss_c.C | 109 +++ AMSS_NCKU_source/lopsided_c.C | 255 +++++++ AMSS_NCKU_source/makefile | 34 +- AMSS_NCKU_source/makefile.inc | 2 +- AMSS_NCKU_source/share_func.h | 147 ++++ AMSS_NCKU_source/tool.h | 27 + 9 files changed, 2224 insertions(+), 8 deletions(-) create mode 100644 AMSS_NCKU_source/bssn_rhs_c.C create mode 100644 AMSS_NCKU_source/fdderivs_c.C create mode 100644 AMSS_NCKU_source/fderivs_c.C create mode 100644 AMSS_NCKU_source/kodiss_c.C create mode 100644 AMSS_NCKU_source/lopsided_c.C create mode 100644 AMSS_NCKU_source/share_func.h create mode 100644 AMSS_NCKU_source/tool.h diff --git a/AMSS_NCKU_source/bssn_rhs_c.C b/AMSS_NCKU_source/bssn_rhs_c.C new file mode 100644 index 0000000..99254a2 --- /dev/null +++ b/AMSS_NCKU_source/bssn_rhs_c.C @@ -0,0 +1,1249 @@ +#include "macrodef.h" +#include "bssn_rhs.h" +#include "share_func.h" +#include "tool.h" +// 0-based i,j,k +// #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny)) +// ex(1)=nx, ex(2)=ny, ex(3)=nz + +// 用法:a[ IDX_F(i,j,k,nx,ny) ] + +// C function that calculates the right-hand side for BSSN equations +int f_compute_rhs_bssn(int *ex, double &T, + double *X, double *Y, double *Z, + double *chi, double *trK, + double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, + double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, + double *Gamx, double *Gamy, double *Gamz, + double *Lap, double *betax, double *betay, double *betaz, + double *dtSfx, double *dtSfy, double *dtSfz, + double *chi_rhs, double *trK_rhs, + double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, + double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, + double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, + double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, + double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, + double *rho, double *Sx, double *Sy, double *Sz, + double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, + double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, + double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, + double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, + double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, + double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, + double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, + int &Symmetry, int &Lev, double &eps, int &co + ) // return gont +{ + double t0 = omp_get_wtime(); + int nx = ex[0], ny = ex[1], nz=ex[2]; + int all = nx*ny*nz; + // printf("nx=%d ny=%d nz=%d all=%d\n", nx, ny, nz, all); + + // temp variable + double gxx[all],gyy[all],gzz[all]; + double chix[all],chiy[all],chiz[all]; + double gxxx[all],gxyx[all],gxzx[all],gyyx[all],gyzx[all],gzzx[all]; + double gxxy[all],gxyy[all],gxzy[all],gyyy[all],gyzy[all],gzzy[all]; + double gxxz[all],gxyz[all],gxzz[all],gyyz[all],gyzz[all],gzzz[all]; + double Lapx[all], Lapy[all], Lapz[all]; + double betaxx[all], betaxy[all], betaxz[all]; + double betayx[all], betayy[all], betayz[all]; + double betazx[all], betazy[all], betazz[all]; + double Gamxx[all],Gamxy[all],Gamxz[all]; + double Gamyx[all],Gamyy[all],Gamyz[all]; + double Gamzx[all],Gamzy[all],Gamzz[all]; + double Kx[all], Ky[all], Kz[all], div_beta[all], S[all]; + double f[all], fxx[all], fxy[all], fxz[all], fyy[all], fyz[all], fzz[all]; + double Gamxa[all], Gamya[all], Gamza[all], alpn1[all], chin1[all]; + double gupxx[all], gupxy[all], gupxz[all]; + double gupyy[all], gupyz[all], gupzz[all]; + double SSS[3] = { 1.0, 1.0, 1.0}; + double AAS[3] = {-1.0, -1.0, 1.0}; + double ASA[3] = {-1.0, 1.0, -1.0}; + double SAA[3] = { 1.0, -1.0, -1.0}; + double ASS[3] = {-1.0, 1.0, 1.0}; + double SAS[3] = { 1.0, -1.0, 1.0}; + double SSA[3] = { 1.0, 1.0, -1.0}; + double dX, dY, dZ, PI; + const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, FOUR = 4.0; + const double EIGHT = 8.0, HALF = 0.5, THR = 3.0; + const double SYM = 1.0, ANTI = -1.0; + const double FF = 0.75, eta = 2.0; + const double F1o3 = 1.0/3.0, F2o3 = 2.0/3.0, F3o2 = 1.5, F1o6 = 1.0/6.0; + const double F16 = 16.0, F8 = 8.0; + #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5) + double reta[all]; + /* 使用时:reta[idx],其中 idx = i + nx*(j + ny*k) (Fortran列主序) */ + #endif + PI = acos(-1.0); + dX = X[1] - X[0]; + dY = Y[1] - Y[0]; + dZ = Z[1] - Z[0]; + + #pragma omp parallel + { + int tid = omp_get_thread_num(); // 当前线程号(从 0 开始) + int nthr = omp_get_num_threads(); // 当前并行区里的总线程数 + int local = all / nthr; + int start = tid * local; + int end = (tid == nthr - 1) ? all : start + local; + // 1ms // + for(int i=start;i0){ + for(int _task=tid; _task<16; _task+=nthr){ + switch(_task){ + case 0: kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps); break; + case 1: kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps); break; + case 2: kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps); break; + case 3: kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps); break; + case 4: kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps); break; + case 5: kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps); break; + case 6: kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps); break; + case 7: kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps); break; + case 8: + kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps); + kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps); + break; + case 9: + kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps); + kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps); + break; + case 10: + kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps); + kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps); + break; + case 11: + kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps); + kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps); + break; + case 12: + kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps); + kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps); + break; + case 13: + kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps); + kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps); + break; + case 14: + kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps); + kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps); + break; + case 15: + kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps); + kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps); + break; + } + } + } + // 2ms // + if(co==0){ + for (int i=start;i NO_SYMM && fabs(Z[0]) < dZ) kminF = -1; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1; + + const double SoA[3] = { SYM1, SYM2, SYM3 }; + + /* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */ + const size_t nx = (size_t)ex1 + 2; + const size_t ny = (size_t)ex2 + 2; + const size_t nz = (size_t)ex3 + 2; + const size_t fh_size = nx * ny * nz; + + static thread_local double *fh = NULL; + static thread_local size_t cap = 0; + + if (fh_size > cap) { + free(fh); + fh = (double*)aligned_alloc(64, fh_size * sizeof(double)); + cap = fh_size; + } + // double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; + + symmetry_bd(2, ex, f, fh, SoA); + + /* 系数:按 Fortran 原式 */ + const double Sdxdx = ONE / (dX * dX); + const double Sdydy = ONE / (dY * dY); + const double Sdzdz = ONE / (dZ * dZ); + + const double Fdxdx = F1o12 / (dX * dX); + const double Fdydy = F1o12 / (dY * dY); + const double Fdzdz = F1o12 / (dZ * dZ); + + const double Sdxdy = F1o4 / (dX * dY); + const double Sdxdz = F1o4 / (dX * dZ); + const double Sdydz = F1o4 / (dY * dZ); + + const double Fdxdy = F1o144 / (dX * dY); + const double Fdxdz = F1o144 / (dX * dZ); + const double Fdydz = F1o144 / (dY * dZ); + + /* 输出清零:fxx,fyy,fzz,fxy,fxz,fyz = 0 */ + const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3; + + /* + * Fortran: + * do k=1,ex3-1 + * do j=1,ex2-1 + * do i=1,ex1-1 + */ + + for (int k0 = 0; k0 <= ex3 - 2; ++k0) { + const int kF = k0 + 1; + for (int j0 = 0; j0 <= ex2 - 2; ++j0) { + const int jF = j0 + 1; + for (int i0 = 0; i0 <= ex1 - 2; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); + + /* 高阶分支:i±2,j±2,k±2 都在范围内 */ + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + fxx[p] = Fdxdx * ( + -fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] - + F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] + ); + + fyy[p] = Fdydy * ( + -fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] - + F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] + ); + + fzz[p] = Fdzdz * ( + -fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] - + F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] + ); + + /* fxy 高阶:完全照搬 Fortran 的括号结构 */ + { + const double t_jm2 = + ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] ); + + const double t_jm1 = + ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] ); + + const double t_jp1 = + ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] ); + + const double t_jp2 = + ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] ); + + fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 ); + } + + /* fxz 高阶 */ + { + const double t_km2 = + ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] ); + + const double t_km1 = + ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] ); + + const double t_kp1 = + ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] ); + + const double t_kp2 = + ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] ); + + fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 ); + } + + /* fyz 高阶 */ + { + const double t_km2 = + ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] ); + + const double t_km1 = + ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] ); + + const double t_kp1 = + ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] ); + + const double t_kp2 = + ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] ); + + fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 ); + } + } + /* 二阶分支:i±1,j±1,k±1 在范围内 */ + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + fxx[p] = Sdxdx * ( + fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] - + TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] + ); + + fyy[p] = Sdydy * ( + fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] - + TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] + ); + + fzz[p] = Sdzdz * ( + fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] - + TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] + ); + + fxy[p] = Sdxdy * ( + fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] - + fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] - + fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] + + fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)] + ); + + fxz[p] = Sdxdz * ( + fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] - + fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] - + fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] + + fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)] + ); + + fyz[p] = Sdydz * ( + fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] - + fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] - + fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] + + fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)] + ); + }else{ + fxx[p] = 0.0; + fyy[p] = 0.0; + fzz[p] = 0.0; + fxy[p] = 0.0; + fxz[p] = 0.0; + fyz[p] = 0.0; + } + } + } + } + + // free(fh); +} \ No newline at end of file diff --git a/AMSS_NCKU_source/fderivs_c.C b/AMSS_NCKU_source/fderivs_c.C new file mode 100644 index 0000000..47c9c6c --- /dev/null +++ b/AMSS_NCKU_source/fderivs_c.C @@ -0,0 +1,145 @@ +#include "tool.h" + +/* + * C 版 fderivs + * + * Fortran: + * subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff) + * + * 约定: + * f, fx, fy, fz: ex1*ex2*ex3,按 idx_ex 布局 + * X: ex1, Y: ex2, Z: ex3 + */ +void fderivs(const int ex[3], + const double *f, + double *fx, double *fy, double *fz, + const double *X, const double *Y, const double *Z, + double SYM1, double SYM2, double SYM3, + int Symmetry, int onoff) +{ + (void)onoff; // Fortran 里没用到 + + const double ZEO = 0.0, ONE = 1.0; + const double TWO = 2.0, EIT = 8.0; + const double F12 = 12.0; + + const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 在本子程序里不直接用 + + const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; + + // dX = X(2)-X(1) -> C: X[1]-X[0] + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + + // Fortran 1-based bounds + const int imaxF = ex1; + const int jmaxF = ex2; + const int kmaxF = ex3; + + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1; + + // SoA(1:3) = SYM1,SYM2,SYM3 + const double SoA[3] = { SYM1, SYM2, SYM3 }; + + // fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 + const size_t nx = (size_t)ex1 + 2; + const size_t ny = (size_t)ex2 + 2; + const size_t nz = (size_t)ex3 + 2; + const size_t fh_size = nx * ny * nz; + static thread_local double *fh = NULL; + static thread_local size_t cap = 0; + + if (fh_size > cap) { + free(fh); + fh = (double*)aligned_alloc(64, fh_size * sizeof(double)); + cap = fh_size; + } + // double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; + + // call symmetry_bd(2,ex,f,fh,SoA) + symmetry_bd(2, ex, f, fh, SoA); + + const double d12dx = ONE / F12 / dX; + const double d12dy = ONE / F12 / dY; + const double d12dz = ONE / F12 / dZ; + + const double d2dx = ONE / TWO / dX; + const double d2dy = ONE / TWO / dY; + const double d2dz = ONE / TWO / dZ; + + // fx = fy = fz = 0 + const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3; + + /* + * Fortran loops: + * do k=1,ex3-1 + * do j=1,ex2-1 + * do i=1,ex1-1 + * + * C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2 + */ + for (int k0 = 0; k0 <= ex3 - 2; ++k0) { + const int kF = k0 + 1; + for (int j0 = 0; j0 <= ex2 - 2; ++j0) { + const int jF = j0 + 1; + for (int i0 = 0; i0 <= ex1 - 2; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); + + // if(i+2 <= imax .and. i-2 >= imin ... ) (全是 Fortran 索引) + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + fx[p] = d12dx * ( + fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] - + EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + + EIT * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] + ); + + fy[p] = d12dy * ( + fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] - + EIT * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + + EIT * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] - + fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] + ); + + fz[p] = d12dz * ( + fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] - + EIT * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + + EIT * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] - + fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] + ); + } + // elseif(i+1 <= imax .and. i-1 >= imin ...) + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + fx[p] = d2dx * ( + -fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] + ); + + fy[p] = d2dy * ( + -fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] + ); + + fz[p] = d2dz * ( + -fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + + fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] + ); + } + } + } + } + + // free(fh); +} \ No newline at end of file diff --git a/AMSS_NCKU_source/kodiss_c.C b/AMSS_NCKU_source/kodiss_c.C new file mode 100644 index 0000000..2a04abe --- /dev/null +++ b/AMSS_NCKU_source/kodiss_c.C @@ -0,0 +1,109 @@ +#include "tool.h" + +/* + * C 版 kodis + * + * Fortran signature: + * subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps) + * + * 约定: + * X: ex1, Y: ex2, Z: ex3 + * f, f_rhs: ex1*ex2*ex3 按 idx_ex 布局 + * SoA[3] + * eps: double + */ +void kodis(const int ex[3], + const double *X, const double *Y, const double *Z, + const double *f, double *f_rhs, + const double SoA[3], + int Symmetry, double eps) +{ + const double ONE = 1.0, SIX = 6.0, FIT = 15.0, TWT = 20.0; + const double cof = 64.0; // 2^6 + const int NO_SYMM = 0, OCTANT = 2; + + const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; + + // Fortran: dX = X(2)-X(1) -> C: X[1]-X[0] + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + (void)ONE; // ONE 在原 Fortran 里只是参数,这里不一定用得上 + + // Fortran: imax=ex(1) 等是 1-based 上界 + const int imaxF = ex1; + const int jmaxF = ex2; + const int kmaxF = ex3; + + // Fortran: imin=jmin=kmin=1,某些对称情况变 -2 + int iminF = 1, jminF = 1, kminF = 1; + + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2; + if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2; + if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2; + + // 分配 fh:大小 (ex1+3)*(ex2+3)*(ex3+3),对应 ord=3 + const size_t nx = (size_t)ex1 + 3; + const size_t ny = (size_t)ex2 + 3; + const size_t nz = (size_t)ex3 + 3; + const size_t fh_size = nx * ny * nz; + + double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; + + // Fortran: call symmetry_bd(3,ex,f,fh,SoA) + symmetry_bd(3, ex, f, fh, SoA); + + /* + * Fortran loops: + * do k=1,ex3 + * do j=1,ex2 + * do i=1,ex1 + * + * C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1 + * 并定义 Fortran index: iF=i0+1, ... + */ + for (int k0 = 0; k0 < ex3; ++k0) { + const int kF = k0 + 1; + for (int j0 = 0; j0 < ex2; ++j0) { + const int jF = j0 + 1; + for (int i0 = 0; i0 < ex1; ++i0) { + const int iF = i0 + 1; + + // Fortran if 条件: + // i-3 >= imin .and. i+3 <= imax 等(都是 Fortran 索引) + if ((iF - 3) >= iminF && (iF + 3) <= imaxF && + (jF - 3) >= jminF && (jF + 3) <= jmaxF && + (kF - 3) >= kminF && (kF + 3) <= kmaxF) + { + const size_t p = idx_ex(i0, j0, k0, ex); + + // 三个方向各一份同型的 7 点组合(实际上是对称的 6th-order dissipation/filter 核) + const double Dx_term = + ( (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) - + SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) + + FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) - + TWT * fh[idx_fh_F(iF , jF, kF, ex)] ) / dX; + + const double Dy_term = + ( (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) - + SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) + + FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) - + TWT * fh[idx_fh_F(iF, jF , kF, ex)] ) / dY; + + const double Dz_term = + ( (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) - + SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) + + FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) - + TWT * fh[idx_fh_F(iF, jF, kF , ex)] ) / dZ; + + // Fortran: + // f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term) + f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term); + } + } + } + } + + free(fh); +} \ No newline at end of file diff --git a/AMSS_NCKU_source/lopsided_c.C b/AMSS_NCKU_source/lopsided_c.C new file mode 100644 index 0000000..fe2c6e4 --- /dev/null +++ b/AMSS_NCKU_source/lopsided_c.C @@ -0,0 +1,255 @@ +#include "tool.h" +/* + * 你需要提供 symmetry_bd 的 C 版本(或 Fortran 绑到 C 的接口)。 + * Fortran: call symmetry_bd(3,ex,f,fh,SoA) + * + * 约定: + * nghost = 3 + * ex[3] = {ex1,ex2,ex3} + * f = 原始网格 (ex1*ex2*ex3) + * fh = 扩展网格 ((ex1+3)*(ex2+3)*(ex3+3)),对应 Fortran 的 (-2:ex1, ...) + * SoA[3] = 输入参数 + */ +void lopsided(const int ex[3], + const double *X, const double *Y, const double *Z, + const double *f, double *f_rhs, + const double *Sfx, const double *Sfy, const double *Sfz, + int Symmetry, const double SoA[3]) +{ + const double ZEO = 0.0, ONE = 1.0, F3 = 3.0; + const double TWO = 2.0, F6 = 6.0, F18 = 18.0; + const double F12 = 12.0, F10 = 10.0, EIT = 8.0; + + const int NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2; + (void)OCTANT; // 这里和 Fortran 一样只是定义了不用也没关系 + + const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; + + // 对应 Fortran: dX = X(2)-X(1) (Fortran 1-based) + // C: X[1]-X[0] + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + + const double d12dx = ONE / F12 / dX; + const double d12dy = ONE / F12 / dY; + const double d12dz = ONE / F12 / dZ; + + // Fortran 里算了 d2dx/d2dy/d2dz 但本 subroutine 里没用到(保持一致也算出来) + const double d2dx = ONE / TWO / dX; + const double d2dy = ONE / TWO / dY; + const double d2dz = ONE / TWO / dZ; + (void)d2dx; (void)d2dy; (void)d2dz; + + // Fortran: + // imax = ex(1); jmax = ex(2); kmax = ex(3) + const int imaxF = ex1; + const int jmaxF = ex2; + const int kmaxF = ex3; + + // Fortran: + // imin=jmin=kmin=1; 若满足对称条件则设为 -2 + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2; + + // 分配 fh:大小 (ex1+3)*(ex2+3)*(ex3+3) + const size_t nx = (size_t)ex1 + 3; + const size_t ny = (size_t)ex2 + 3; + const size_t nz = (size_t)ex3 + 3; + const size_t fh_size = nx * ny * nz; + + double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; // 内存不足:直接返回(你也可以改成 abort/报错) + + // Fortran: call symmetry_bd(3,ex,f,fh,SoA) + symmetry_bd(3, ex, f, fh, SoA); + + /* + * Fortran 主循环: + * do k=1,ex(3)-1 + * do j=1,ex(2)-1 + * do i=1,ex(1)-1 + * + * 转成 C 0-based: + * k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2 + * + * 并且 Fortran 里的 i/j/k 在 fh 访问时,仍然是 Fortran 索引值: + * iF=i0+1, jF=j0+1, kF=k0+1 + */ + for (int k0 = 0; k0 <= ex3 - 2; ++k0) { + const int kF = k0 + 1; + for (int j0 = 0; j0 <= ex2 - 2; ++j0) { + const int jF = j0 + 1; + for (int i0 = 0; i0 <= ex1 - 2; ++i0) { + const int iF = i0 + 1; + + const size_t p = idx_ex(i0, j0, k0, ex); + + // ---------------- x direction ---------------- + const double sfx = Sfx[p]; + if (sfx > ZEO) { + // Fortran: if(i+3 <= imax) + // iF+3 <= ex1 <=> i0+4 <= ex1 <=> i0 <= ex1-4 + if (i0 <= ex1 - 4) { + f_rhs[p] += sfx * d12dx * + (-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)] + -F10 * fh[idx_fh_F(iF , jF, kF, ex)] + +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)] + -F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)] + + fh[idx_fh_F(iF + 3, jF, kF, ex)]); + } + // elseif(i+2 <= imax) <=> i0 <= ex1-3 + else if (i0 <= ex1 - 3) { + f_rhs[p] += sfx * d12dx * + ( fh[idx_fh_F(iF - 2, jF, kF, ex)] + -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)] + +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)] + - fh[idx_fh_F(iF + 2, jF, kF, ex)]); + } + // elseif(i+1 <= imax) <=> i0 <= ex1-2(循环里总成立) + else if (i0 <= ex1 - 2) { + f_rhs[p] -= sfx * d12dx * + (-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)] + -F10 * fh[idx_fh_F(iF , jF, kF, ex)] + +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)] + -F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)] + + fh[idx_fh_F(iF - 3, jF, kF, ex)]); + } + } else if (sfx < ZEO) { + // Fortran: if(i-3 >= imin) + // (iF-3) >= iminF <=> (i0-2) >= iminF + if ((i0 - 2) >= iminF) { + f_rhs[p] -= sfx * d12dx * + (-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)] + -F10 * fh[idx_fh_F(iF , jF, kF, ex)] + +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)] + -F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)] + + fh[idx_fh_F(iF - 3, jF, kF, ex)]); + } + // elseif(i-2 >= imin) <=> (i0-1) >= iminF + else if ((i0 - 1) >= iminF) { + f_rhs[p] += sfx * d12dx * + ( fh[idx_fh_F(iF - 2, jF, kF, ex)] + -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)] + +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)] + - fh[idx_fh_F(iF + 2, jF, kF, ex)]); + } + // elseif(i-1 >= imin) <=> i0 >= iminF + else if (i0 >= iminF) { + f_rhs[p] += sfx * d12dx * + (-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)] + -F10 * fh[idx_fh_F(iF , jF, kF, ex)] + +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)] + -F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)] + + fh[idx_fh_F(iF + 3, jF, kF, ex)]); + } + } + + // ---------------- y direction ---------------- + const double sfy = Sfy[p]; + if (sfy > ZEO) { + // jF+3 <= ex2 <=> j0+4 <= ex2 <=> j0 <= ex2-4 + if (j0 <= ex2 - 4) { + f_rhs[p] += sfy * d12dy * + (-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)] + -F10 * fh[idx_fh_F(iF, jF , kF, ex)] + +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)] + -F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)] + + fh[idx_fh_F(iF, jF + 3, kF, ex)]); + } else if (j0 <= ex2 - 3) { + f_rhs[p] += sfy * d12dy * + ( fh[idx_fh_F(iF, jF - 2, kF, ex)] + -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)] + +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)] + - fh[idx_fh_F(iF, jF + 2, kF, ex)]); + } else if (j0 <= ex2 - 2) { + f_rhs[p] -= sfy * d12dy * + (-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)] + -F10 * fh[idx_fh_F(iF, jF , kF, ex)] + +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)] + -F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)] + + fh[idx_fh_F(iF, jF - 3, kF, ex)]); + } + } else if (sfy < ZEO) { + if ((j0 - 2) >= jminF) { + f_rhs[p] -= sfy * d12dy * + (-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)] + -F10 * fh[idx_fh_F(iF, jF , kF, ex)] + +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)] + -F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)] + + fh[idx_fh_F(iF, jF - 3, kF, ex)]); + } else if ((j0 - 1) >= jminF) { + f_rhs[p] += sfy * d12dy * + ( fh[idx_fh_F(iF, jF - 2, kF, ex)] + -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)] + +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)] + - fh[idx_fh_F(iF, jF + 2, kF, ex)]); + } else if (j0 >= jminF) { + f_rhs[p] += sfy * d12dy * + (-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)] + -F10 * fh[idx_fh_F(iF, jF , kF, ex)] + +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)] + -F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)] + + fh[idx_fh_F(iF, jF + 3, kF, ex)]); + } + } + + // ---------------- z direction ---------------- + const double sfz = Sfz[p]; + if (sfz > ZEO) { + if (k0 <= ex3 - 4) { + f_rhs[p] += sfz * d12dz * + (-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)] + -F10 * fh[idx_fh_F(iF, jF, kF , ex)] + +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)] + -F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)] + + fh[idx_fh_F(iF, jF, kF + 3, ex)]); + } else if (k0 <= ex3 - 3) { + f_rhs[p] += sfz * d12dz * + ( fh[idx_fh_F(iF, jF, kF - 2, ex)] + -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)] + +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)] + - fh[idx_fh_F(iF, jF, kF + 2, ex)]); + } else if (k0 <= ex3 - 2) { + f_rhs[p] -= sfz * d12dz * + (-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)] + -F10 * fh[idx_fh_F(iF, jF, kF , ex)] + +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)] + -F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)] + + fh[idx_fh_F(iF, jF, kF - 3, ex)]); + } + } else if (sfz < ZEO) { + if ((k0 - 2) >= kminF) { + f_rhs[p] -= sfz * d12dz * + (-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)] + -F10 * fh[idx_fh_F(iF, jF, kF , ex)] + +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)] + -F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)] + + fh[idx_fh_F(iF, jF, kF - 3, ex)]); + } else if ((k0 - 1) >= kminF) { + f_rhs[p] += sfz * d12dz * + ( fh[idx_fh_F(iF, jF, kF - 2, ex)] + -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)] + +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)] + - fh[idx_fh_F(iF, jF, kF + 2, ex)]); + } else if (k0 >= kminF) { + f_rhs[p] += sfz * d12dz * + (-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)] + -F10 * fh[idx_fh_F(iF, jF, kF , ex)] + +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)] + -F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)] + + fh[idx_fh_F(iF, jF, kF + 3, ex)]); + } + } + } + } + } + free(fh); +} + + + + + diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index c5750b8..c6d465f 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -37,6 +37,22 @@ endif .cu.o: $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) +# C rewrite of BSSN RHS kernel and helpers +bssn_rhs_c.o: bssn_rhs_c.C + ${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@ + +fderivs_c.o: fderivs_c.C + ${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@ + +fdderivs_c.o: fdderivs_c.C + ${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@ + +kodiss_c.o: kodiss_c.C + ${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@ + +lopsided_c.o: lopsided_c.C + ${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@ + ## TwoPunctureABE uses fixed optimal flags, independent of CXXAPPFLAGS (which may be PGO-instrumented) TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo -Dfortran3 -Dnewc -I${MKLROOT}/include @@ -47,6 +63,10 @@ TwoPunctureABE.o: TwoPunctureABE.C ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ # Input files + +# C rewrite files +CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o + C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\ bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ @@ -64,7 +84,7 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\ - rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\ + rungekutta4_rout.o diff_new.o kodiss.o kodiss_sh.o\ lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\ shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\ getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\ @@ -87,7 +107,7 @@ TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o # file dependences -$(C++FILES) $(C++FILESGPU) $(F90FILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh +$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\ @@ -110,7 +130,7 @@ $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h $(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h -$(C++FILES) $(C++FILES_GPU) $(AHFDOBJS) $(CUDAFILES): macrodef.h +$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h TwoPunctureFILES: TwoPunctures.h @@ -119,11 +139,11 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h misc.o : zbesh.o # projects -ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) +ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) + $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) +ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) + $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) TwoPunctureABE: $(TwoPunctureFILES) $(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index db28baf..17dcbd2 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -8,7 +8,7 @@ filein = -I/usr/include/ -I${MKLROOT}/include ## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library -LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl +LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags) ## opt : (default) maximum performance with PGO profile-guided optimization diff --git a/AMSS_NCKU_source/share_func.h b/AMSS_NCKU_source/share_func.h new file mode 100644 index 0000000..f504a07 --- /dev/null +++ b/AMSS_NCKU_source/share_func.h @@ -0,0 +1,147 @@ +#ifndef SHARE_FUNC_H +#define SHARE_FUNC_H + +#include +#include +#include +#include +#include +/* 主网格:0-based -> 1D */ +static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) { + const int ex1 = ex[0], ex2 = ex[1]; + return (size_t)i0 + (size_t)j0 * (size_t)ex1 + (size_t)k0 * (size_t)ex1 * (size_t)ex2; +} + +/* + * fh 对应 Fortran: fh(-1:ex1, -1:ex2, -1:ex3) + * ord=2 => shift=1 + * iF/jF/kF 为 Fortran 索引(可为 -1,0,1..ex) + */ +static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) { + const int shift = 1; + const int nx = ex[0] + 2; // ex1 + ord + const int ny = ex[1] + 2; + + const int ii = iF + shift; // 0..ex1+1 + const int jj = jF + shift; // 0..ex2+1 + const int kk = kF + shift; // 0..ex3+1 + + return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny; +} + +/* + * fh 对应 Fortran: fh(-2:ex1, -2:ex2, -2:ex3) + * ord=3 => shift=2 + * iF/jF/kF 是 Fortran 索引(可为负) + */ +static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) { + const int shift = 2; // ord=3 -> -2..ex + const int nx = ex[0] + 3; // ex1 + ord + const int ny = ex[1] + 3; + + const int ii = iF + shift; // 0..ex1+2 + const int jj = jF + shift; // 0..ex2+2 + const int kk = kF + shift; // 0..ex3+2 + + return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny; +} + +/* + * func: (1..extc1, 1..extc2, 1..extc3) 1-based in Fortran + * funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran + * + * C 里我们把: + * func 视为 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1 + * funcc 用“平移下标”存为一维数组: + * iF in [-ord+1..extc1] -> ii = iF + (ord-1) in [0..extc1+ord-1] + * 总长度 nx = extc1 + ord + * 同理 ny = extc2 + ord, nz = extc3 + ord + */ + +static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) { + const int nx = extc[0], ny = extc[1]; + return (size_t)i0 + (size_t)j0 * (size_t)nx + (size_t)k0 * (size_t)nx * (size_t)ny; +} + +static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) { + const int shift = ord - 1; // iF = -shift .. extc1 + const int nx = extc[0] + ord; // [-shift..extc1] 共 extc1+ord 个 + const int ny = extc[1] + ord; + + const int ii = iF + shift; // 0..extc1+shift + const int jj = jF + shift; // 0..extc2+shift + const int kk = kF + shift; // 0..extc3+shift + + return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny; +} + +/* + * 等价于 Fortran: + * funcc(1:extc1,1:extc2,1:extc3)=func + * do i=0,ord-1 + * funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1) + * enddo + * do i=0,ord-1 + * funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2) + * enddo + * do i=0,ord-1 + * funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3) + * enddo + */ +static inline void symmetry_bd(int ord, + const int extc[3], + const double *func, + double *funcc, + const double SoA[3]) +{ + const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2]; + + // 1) funcc(1:extc1,1:extc2,1:extc3) = func + // Fortran 的 (iF=1..extc1) 对应 C 的 func(i0=0..extc1-1) + for (int k0 = 0; k0 < extc3; ++k0) { + for (int j0 = 0; j0 < extc2; ++j0) { + for (int i0 = 0; i0 < extc1; ++i0) { + const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; + funcc[idx_funcc_F(iF, jF, kF, ord, extc)] = func[idx_func0(i0, j0, k0, extc)]; + } + } + } + + // 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1) + for (int ii = 0; ii <= ord - 1; ++ii) { + const int iF_dst = -ii; // 0, -1, -2, ... + const int iF_src = ii + 1; // 1, 2, 3, ... + for (int kF = 1; kF <= extc3; ++kF) { + for (int jF = 1; jF <= extc2; ++jF) { + funcc[idx_funcc_F(iF_dst, jF, kF, ord, extc)] = + funcc[idx_funcc_F(iF_src, jF, kF, ord, extc)] * SoA[0]; + } + } + } + + // 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2) + // 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖 + for (int jj = 0; jj <= ord - 1; ++jj) { + const int jF_dst = -jj; + const int jF_src = jj + 1; + for (int kF = 1; kF <= extc3; ++kF) { + for (int iF = -ord + 1; iF <= extc1; ++iF) { + funcc[idx_funcc_F(iF, jF_dst, kF, ord, extc)] = + funcc[idx_funcc_F(iF, jF_src, kF, ord, extc)] * SoA[1]; + } + } + } + + // 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3) + for (int kk = 0; kk <= ord - 1; ++kk) { + const int kF_dst = -kk; + const int kF_src = kk + 1; + for (int jF = -ord + 1; jF <= extc2; ++jF) { + for (int iF = -ord + 1; iF <= extc1; ++iF) { + funcc[idx_funcc_F(iF, jF, kF_dst, ord, extc)] = + funcc[idx_funcc_F(iF, jF, kF_src, ord, extc)] * SoA[2]; + } + } + } +} +#endif diff --git a/AMSS_NCKU_source/tool.h b/AMSS_NCKU_source/tool.h new file mode 100644 index 0000000..154b5ec --- /dev/null +++ b/AMSS_NCKU_source/tool.h @@ -0,0 +1,27 @@ +#include "share_func.h" +void fdderivs(const int ex[3], + const double *f, + double *fxx, double *fxy, double *fxz, + double *fyy, double *fyz, double *fzz, + const double *X, const double *Y, const double *Z, + double SYM1, double SYM2, double SYM3, + int Symmetry, int onoff); + +void fderivs(const int ex[3], + const double *f, + double *fx, double *fy, double *fz, + const double *X, const double *Y, const double *Z, + double SYM1, double SYM2, double SYM3, + int Symmetry, int onoff); + +void kodis(const int ex[3], + const double *X, const double *Y, const double *Z, + const double *f, double *f_rhs, + const double SoA[3], + int Symmetry, double eps); + +void lopsided(const int ex[3], + const double *X, const double *Y, const double *Z, + const double *f, double *f_rhs, + const double *Sfx, const double *Sfy, const double *Sfz, + int Symmetry, const double SoA[3]); \ No newline at end of file