Remove OpenMP from C rewrite kernel

The C rewrite introduced OpenMP parallelism. Remove all OpenMP.
This commit is contained in:
2026-02-25 13:15:24 +00:00
parent 09b937c022
commit 284ab80baf
5 changed files with 111 additions and 206 deletions

View File

@@ -34,7 +34,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
int &Symmetry, int &Lev, double &eps, int &co
) // return gont
{
double t0 = omp_get_wtime();
int nx = ex[0], ny = ex[1], nz=ex[2];
int all = nx*ny*nz;
// printf("nx=%d ny=%d nz=%d all=%d\n", nx, ny, nz, all);
@@ -80,15 +79,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
dY = Y[1] - Y[0];
dZ = Z[1] - Z[0];
#pragma omp parallel
{
int tid = omp_get_thread_num(); // 当前线程号(从 0 开始)
int nthr = omp_get_num_threads(); // 当前并行区里的总线程数
int local = all / nthr;
int start = tid * local;
int end = (tid == nthr - 1) ? all : start + local;
// 1ms //
for(int i=start;i<end;i+=1){
for(int i=0;i<all;i+=1){
alpn1[i] = Lap[i] + 1.0;
chin1[i] = chi[i] + 1.0;
gxx[i] = dxx[i] + 1.0;
@@ -96,27 +88,21 @@ int f_compute_rhs_bssn(int *ex, double &T,
gzz[i] = dzz[i] + 1.0;
}
// 9ms //
for(int _task=tid; _task<12; _task+=nthr){
switch(_task){
case 0: fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev); break;
case 1: fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev); break;
case 2: fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev); break;
case 3: fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); break;
case 4: fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev); break;
case 5: fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev); break;
case 6: fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev); break;
case 7: fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev); break;
case 8: fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev); break;
case 9: fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev); break;
case 10: fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); break;
case 11: fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); break;
}
}
fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev);
fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev);
fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev);
fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev);
fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev);
fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev);
fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
// 3ms //
//omp_barrier(); // 确保所有线程都完成了 betax, betay, betaz 的导数计算
#pragma omp barrier
for(int i=start;i<end;i+=1){
for(int i=0;i<all;i+=1){
div_beta[i] = betaxx[i] + betayy[i] + betazz[i];
chi_rhs[i] = F2o3 * chin1[i] * (alpn1[i] * trK[i] - div_beta[i]);
gxx_rhs[i] = -TWO * alpn1[i] * Axx[i] - F2o3 * gxx[i] * div_beta[i] +
@@ -136,7 +122,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ gzz[i] * betazx[i] - gxz[i] * betayy[i];
}
// 1ms //
for(int i=start;i<end;i+=1){
for(int i=0;i<all;i+=1){
double det = gxx[i] * gyy[i] * gzz[i] + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] -
gxz[i] * gyy[i] * gxz[i] - gxy[i] * gxy[i] * gzz[i] - gxx[i] * gyz[i] * gyz[i];
gupxx[i] = (gyy[i] * gzz[i] - gyz[i] * gyz[i]) / det;
@@ -148,7 +134,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
}
// 2.2ms //
if(co==0){
for (int i=start;i<end;i+=1) {
for (int i=0;i<all;i+=1) {
Gmx_Res[i] = Gamx[i] - (
gupxx[i] * (gupxx[i]*gxxx[i] + gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i]) +
gupxy[i] * (gupxx[i]*gxyx[i] + gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i]) +
@@ -193,7 +179,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
}
}
// 5ms //
for (int i=start;i<end;i+=1) {
for (int i=0;i<all;i+=1) {
Gamxxx[i] = HALF * ( gupxx[i]*gxxx[i]
+ gupxy[i]*(TWO*gxyx[i] - gxxy[i])
@@ -269,7 +255,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
}
// 1.8ms //
for (int i=start;i<end;i+=1) {
for (int i=0;i<all;i+=1) {
Rxx[i] = gupxx[i]*gupxx[i]*Axx[i]
+ gupxy[i]*gupxy[i]*Ayy[i]
@@ -314,7 +300,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i];
}
// 4ms //
for(int i=start;i<end;i+=1){
for(int i=0;i<all;i+=1){
Gamx_rhs[i] = - TWO * ( Lapx[i] * Rxx[i] + Lapy[i] * Rxy[i] + Lapz[i] * Rxz[i] ) +
TWO * alpn1[i] * (
-F3o2/chin1[i] * ( chix[i] * Rxx[i] + chiy[i] * Rxy[i] + chiz[i] * Rxz[i] ) -
@@ -345,23 +331,18 @@ int f_compute_rhs_bssn(int *ex, double &T,
);
}
// 22.3ms //
for(int _task=tid; _task<6; _task+=nthr){
switch(_task){
case 0: fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev); break;
case 1: fdderivs(ex,betay,gxxy,gxyy,gxzy,gyyy,gyzy,gzzy,
X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev); break;
case 2: fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,
X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev); break;
case 3: fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev); break;
case 4: fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev); break;
case 5: fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev); break;
}
}
fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev);
fdderivs(ex,betay,gxxy,gxyy,gxzy,gyyy,gyzy,gzzy,
X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev);
fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,
X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev);
fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev);
fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev);
fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev);
// 3.5ms //
#pragma omp barrier
for(int i=start;i<end;i+=1){
for(int i=0;i<all;i+=1){
fxx[i] = gxxx[i] + gxyy[i] + gxzz[i];
fxy[i] = gxyx[i] + gyyy[i] + gyzz[i];
fxz[i] = gxzx[i] + gyzy[i] + gzzz[i];
@@ -375,7 +356,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ TWO * ( gupxy[i]*Gamzxy[i] + gupxz[i]*Gamzxz[i] + gupyz[i]*Gamzyz[i] );
}
// 3.9ms //
for(int i=start;i<end;i+=1){
for(int i=0;i<all;i+=1){
Gamx_rhs[i] = Gamx_rhs[i]
+ F2o3 * Gamxa[i] * div_beta[i]
- Gamxa[i] * betaxx[i] - Gamya[i] * betaxy[i] - Gamza[i] * betaxz[i]
@@ -398,7 +379,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ TWO * ( gupxy[i] * gxyz[i] + gupxz[i] * gxzz[i] + gupyz[i] * gyzz[i] );
}
// 4.4ms //
for (int i=start;i<end;i+=1) {
for (int i=0;i<all;i+=1) {
gxxx[i] = gxx[i]*Gamxxx[i] + gxy[i]*Gamyxx[i] + gxz[i]*Gamzxx[i];
gxyx[i] = gxx[i]*Gamxxy[i] + gxy[i]*Gamyxy[i] + gxz[i]*Gamzxy[i];
gxzx[i] = gxx[i]*Gamxxz[i] + gxy[i]*Gamyxz[i] + gxz[i]*Gamzxz[i];
@@ -421,69 +402,51 @@ int f_compute_rhs_bssn(int *ex, double &T,
gzzz[i] = gxz[i]*Gamxzz[i] + gyz[i]*Gamyzz[i] + gzz[i]*Gamzzz[i];
}
// 22.2ms //
if(tid==0){
fdderivs(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
}
// 0.7ms //
#pragma omp barrier
for (int i = start; i < end; i += 1) {
for (int i = 0; i < all; i += 1) {
Rxx[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO;
}
// 23ms //
if(tid==0){
fdderivs(ex,dyy,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
}
#pragma omp barrier
for (int i = start; i < end; i += 1) {
for (int i = 0; i < all; i += 1) {
Ryy[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO;
}
// 23ms //
if(tid==0){
fdderivs(ex,dzz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
}
#pragma omp barrier
for (int i = start; i < end; i += 1) {
for (int i = 0; i < all; i += 1) {
Rzz[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO;
}
// 23ms //
if(tid==0){
fdderivs(ex,gxy,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,ANTI, ANTI,SYM ,Symmetry,Lev);
}
#pragma omp barrier
for (int i = start; i < end; i += 1) {
for (int i = 0; i < all; i += 1) {
Rxy[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO;
}
// 23ms //
if(tid==0){
fdderivs(ex,gxz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,ANTI ,SYM ,ANTI,Symmetry,Lev);
}
#pragma omp barrier
for (int i = start; i < end; i += 1) {
for (int i = 0; i < all; i += 1) {
Rxz[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO;
}
// 23ms //
if(tid==0){
fdderivs(ex,gyz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,ANTI ,ANTI,Symmetry,Lev);
}
#pragma omp barrier
for (int i = start; i < end; i += 1) {
for (int i = 0; i < all; i += 1) {
Ryz[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO;
}
// 14ms //
/* 假设 all = ex1*ex2*ex3所有量都是 length=all 的 double 数组(已按同一扁平化规则排布) */
for (int i = start; i < end; i += 1) {
for (int i = 0; i < all; i += 1) {
Rxx[i] =
-HALF * Rxx[i]
+ gxx[i] * Gamxx[i] + gxy[i] * Gamyx[i] + gxz[i] * Gamzx[i]
@@ -735,13 +698,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
}
// 22.3ms //
if(tid==0){
fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
}
// 7ms //
#pragma omp barrier
for (int i=start;i<end;i+=1) {
for (int i=0;i<all;i+=1) {
fxx[i] = fxx[i] - Gamxxx[i] * chix[i] - Gamyxx[i] * chiy[i] - Gamzxx[i] * chiz[i];
fxy[i] = fxy[i] - Gamxxy[i] * chix[i] - Gamyxy[i] * chiy[i] - Gamzxy[i] * chiz[i];
fxz[i] = fxz[i] - Gamxxz[i] * chix[i] - Gamyxz[i] * chiy[i] - Gamzxz[i] * chiz[i];
@@ -765,16 +725,11 @@ int f_compute_rhs_bssn(int *ex, double &T,
}
// 24ms //
for(int _task=tid; _task<2; _task+=nthr){
switch(_task){
case 0: fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); break;
case 1: fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); break;
}
}
fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
// 6ms //
#pragma omp barrier
for (int i=start;i<end;i+=1) {
for (int i=0;i<all;i+=1) {
/* gxxx,gxxy,gxxz (这里是“升指标后的chi导数/chi”那类量你沿用原变量名即可) */
gxxx[i] = (gupxx[i] * chix[i] + gupxy[i] * chiy[i] + gupxz[i] * chiz[i]) / chin1[i];
gxxy[i] = (gupxy[i] * chix[i] + gupyy[i] * chiy[i] + gupyz[i] * chiz[i]) / chin1[i];
@@ -815,12 +770,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
fyz[i] = fyz[i] - Gamxyz[i] * Lapx[i] - Gamyyz[i] * Lapy[i] - Gamzyz[i] * Lapz[i];
}
// 1ms //
for (int i=start;i<end;i+=1) {
for (int i=0;i<all;i+=1) {
trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] );
}
// 2.5ms //
for (int i=start;i<end;i+=1) {
for (int i=0;i<all;i+=1) {
S[i] = chin1[i] * (
gupxx[i] * Sxx[i] + gupyy[i] * Syy[i] + gupzz[i] * Szz[i]
@@ -879,7 +834,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
fzz[i] = alpn1[i] * (Rzz[i] - EIGHT * PI * Szz[i]) - fzz[i];
}
// 8ms //
for (int i=start;i<end;i+=1) {
for (int i=0;i<all;i+=1) {
/* Aij_rhs = fij - gij * f */
Axx_rhs[i] = fxx[i] - gxx[i] * f[i];
@@ -1011,7 +966,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
}
// 1ms //
for(int i=start;i<end;i+=1){
for(int i=0;i<all;i+=1){
betax_rhs[i] = FF * dtSfx[i];
betay_rhs[i] = FF * dtSfy[i];
betaz_rhs[i] = FF * dtSfz[i];
@@ -1036,101 +991,60 @@ int f_compute_rhs_bssn(int *ex, double &T,
#endif
}
// 26ms //
for(int _task=tid; _task<16; _task+=nthr){
switch(_task){
case 0:
lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA);
break;
case 1:
lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS);
lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS);
break;
case 2:
lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA);
lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS);
break;
case 3:
lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS);
break;
case 4:
lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA);
lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA);
break;
case 5:
lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS);
break;
case 6:
lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS);
break;
case 7:
lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS);
lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA);
break;
case 8: lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA); break;
case 9: lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS); break;
case 10: lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA); break;
case 11: lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS); break;
case 12: lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS); break;
case 13: lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS); break;
case 14: lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS); break;
case 15: lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS); break;
}
}
lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA);
lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS);
lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA);
lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS);
lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS);
lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA);
lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA);
lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS);
lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS);
lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS);
lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA);
lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA);
lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA);
lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS);
lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS);
// 20ms //
#pragma omp barrier
if(eps>0){
for(int _task=tid; _task<16; _task+=nthr){
switch(_task){
case 0: kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps); break;
case 1: kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps); break;
case 2: kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps); break;
case 3: kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps); break;
case 4: kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps); break;
case 5: kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps); break;
case 6: kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps); break;
case 7: kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps); break;
case 8:
kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps);
break;
case 9:
kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps);
kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps);
break;
case 10:
kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps);
kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps);
break;
case 11:
kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps);
break;
case 12:
kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps);
kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps);
break;
case 13:
kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps);
break;
case 14:
kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps);
kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps);
break;
case 15:
kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps);
kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps);
break;
}
}
kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps);
kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps);
kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps);
kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps);
kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps);
kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps);
kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps);
kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps);
kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps);
kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps);
kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps);
kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps);
kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps);
kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps);
kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps);
}
// 2ms //
if(co==0){
for (int i=start;i<end;i+=1) {
for (int i=0;i<all;i+=1) {
ham_Res[i] =
gupxx[i] * Rxx[i] + gupyy[i] * Ryy[i] + gupzz[i] * Rzz[i]
+ TWO * ( gupxy[i] * Rxy[i] + gupxz[i] * Rxz[i] + gupyz[i] * Ryz[i] );
@@ -1174,20 +1088,15 @@ int f_compute_rhs_bssn(int *ex, double &T,
}
// 1ms //
for(int _task=tid; _task<6; _task+=nthr){
switch(_task){
case 0: fderivs(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0); break;
case 1: fderivs(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0); break;
case 2: fderivs(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0); break;
case 3: fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0); break;
case 4: fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0); break;
case 5: fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0); break;
}
}
fderivs(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
fderivs(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0);
fderivs(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0);
fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0);
fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
}
// 7ms //
#pragma omp barrier
for (int i=start;i<end;i+=1) {
for (int i=0;i<all;i+=1) {
gxxx[i] = gxxx[i] - ( Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]
+ Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]) - chix[i]*Axx[i]/chin1[i];
gxyx[i] = gxyx[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
@@ -1240,10 +1149,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
}
}
// double t1 = omp_get_wtime();
// printf("Time for BSSN RHS computation: %.6f seconds\n", (t1-t0));
return 0; // success
}

View File

@@ -41,8 +41,8 @@ void fdderivs(const int ex[3],
const size_t nz = (size_t)ex3 + 2;
const size_t fh_size = nx * ny * nz;
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
static double *fh = NULL;
static size_t cap = 0;
if (fh_size > cap) {
free(fh);

View File

@@ -50,8 +50,8 @@ void fderivs(const int ex[3],
const size_t ny = (size_t)ex2 + 2;
const size_t nz = (size_t)ex3 + 2;
const size_t fh_size = nx * ny * nz;
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
static double *fh = NULL;
static size_t cap = 0;
if (fh_size > cap) {
free(fh);

View File

@@ -39,19 +39,19 @@ endif
# C rewrite of BSSN RHS kernel and helpers
bssn_rhs_c.o: bssn_rhs_c.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
fderivs_c.o: fderivs_c.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
fdderivs_c.o: fdderivs_c.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
kodiss_c.o: kodiss_c.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
lopsided_c.o: lopsided_c.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags, independent of CXXAPPFLAGS (which may be PGO-instrumented)
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo -Dfortran3 -Dnewc -I${MKLROOT}/include

View File

@@ -5,7 +5,6 @@
#include <stddef.h>
#include <math.h>
#include <stdio.h>
#include <omp.h>
/* 主网格0-based -> 1D */
static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
const int ex1 = ex[0], ex2 = ex[1];