Remove OpenMP from C rewrite kernel

The C rewrite introduced OpenMP parallelism. Remove all OpenMP.
This commit is contained in:
2026-02-25 13:15:24 +00:00
parent 09b937c022
commit 284ab80baf
5 changed files with 111 additions and 206 deletions

View File

@@ -34,7 +34,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
int &Symmetry, int &Lev, double &eps, int &co int &Symmetry, int &Lev, double &eps, int &co
) // return gont ) // return gont
{ {
double t0 = omp_get_wtime();
int nx = ex[0], ny = ex[1], nz=ex[2]; int nx = ex[0], ny = ex[1], nz=ex[2];
int all = nx*ny*nz; int all = nx*ny*nz;
// printf("nx=%d ny=%d nz=%d all=%d\n", nx, ny, nz, all); // printf("nx=%d ny=%d nz=%d all=%d\n", nx, ny, nz, all);
@@ -80,15 +79,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
dY = Y[1] - Y[0]; dY = Y[1] - Y[0];
dZ = Z[1] - Z[0]; dZ = Z[1] - Z[0];
#pragma omp parallel
{
int tid = omp_get_thread_num(); // 当前线程号(从 0 开始)
int nthr = omp_get_num_threads(); // 当前并行区里的总线程数
int local = all / nthr;
int start = tid * local;
int end = (tid == nthr - 1) ? all : start + local;
// 1ms // // 1ms //
for(int i=start;i<end;i+=1){ for(int i=0;i<all;i+=1){
alpn1[i] = Lap[i] + 1.0; alpn1[i] = Lap[i] + 1.0;
chin1[i] = chi[i] + 1.0; chin1[i] = chi[i] + 1.0;
gxx[i] = dxx[i] + 1.0; gxx[i] = dxx[i] + 1.0;
@@ -96,27 +88,21 @@ int f_compute_rhs_bssn(int *ex, double &T,
gzz[i] = dzz[i] + 1.0; gzz[i] = dzz[i] + 1.0;
} }
// 9ms // // 9ms //
for(int _task=tid; _task<12; _task+=nthr){ fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev);
switch(_task){ fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev);
case 0: fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev); break; fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev);
case 1: fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev); break; fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
case 2: fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev); break; fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
case 3: fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); break; fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev);
case 4: fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev); break; fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev);
case 5: fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev); break; fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
case 6: fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev); break; fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev);
case 7: fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev); break; fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
case 8: fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev); break; fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
case 9: fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev); break; fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
case 10: fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); break;
case 11: fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); break;
}
}
// 3ms // // 3ms //
//omp_barrier(); // 确保所有线程都完成了 betax, betay, betaz 的导数计算 for(int i=0;i<all;i+=1){
#pragma omp barrier
for(int i=start;i<end;i+=1){
div_beta[i] = betaxx[i] + betayy[i] + betazz[i]; div_beta[i] = betaxx[i] + betayy[i] + betazz[i];
chi_rhs[i] = F2o3 * chin1[i] * (alpn1[i] * trK[i] - div_beta[i]); chi_rhs[i] = F2o3 * chin1[i] * (alpn1[i] * trK[i] - div_beta[i]);
gxx_rhs[i] = -TWO * alpn1[i] * Axx[i] - F2o3 * gxx[i] * div_beta[i] + gxx_rhs[i] = -TWO * alpn1[i] * Axx[i] - F2o3 * gxx[i] * div_beta[i] +
@@ -136,7 +122,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ gzz[i] * betazx[i] - gxz[i] * betayy[i]; + gzz[i] * betazx[i] - gxz[i] * betayy[i];
} }
// 1ms // // 1ms //
for(int i=start;i<end;i+=1){ for(int i=0;i<all;i+=1){
double det = gxx[i] * gyy[i] * gzz[i] + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] - double det = gxx[i] * gyy[i] * gzz[i] + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] -
gxz[i] * gyy[i] * gxz[i] - gxy[i] * gxy[i] * gzz[i] - gxx[i] * gyz[i] * gyz[i]; gxz[i] * gyy[i] * gxz[i] - gxy[i] * gxy[i] * gzz[i] - gxx[i] * gyz[i] * gyz[i];
gupxx[i] = (gyy[i] * gzz[i] - gyz[i] * gyz[i]) / det; gupxx[i] = (gyy[i] * gzz[i] - gyz[i] * gyz[i]) / det;
@@ -148,7 +134,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
} }
// 2.2ms // // 2.2ms //
if(co==0){ if(co==0){
for (int i=start;i<end;i+=1) { for (int i=0;i<all;i+=1) {
Gmx_Res[i] = Gamx[i] - ( Gmx_Res[i] = Gamx[i] - (
gupxx[i] * (gupxx[i]*gxxx[i] + gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i]) + gupxx[i] * (gupxx[i]*gxxx[i] + gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i]) +
gupxy[i] * (gupxx[i]*gxyx[i] + gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i]) + gupxy[i] * (gupxx[i]*gxyx[i] + gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i]) +
@@ -193,7 +179,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
} }
} }
// 5ms // // 5ms //
for (int i=start;i<end;i+=1) { for (int i=0;i<all;i+=1) {
Gamxxx[i] = HALF * ( gupxx[i]*gxxx[i] Gamxxx[i] = HALF * ( gupxx[i]*gxxx[i]
+ gupxy[i]*(TWO*gxyx[i] - gxxy[i]) + gupxy[i]*(TWO*gxyx[i] - gxxy[i])
@@ -269,7 +255,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
} }
// 1.8ms // // 1.8ms //
for (int i=start;i<end;i+=1) { for (int i=0;i<all;i+=1) {
Rxx[i] = gupxx[i]*gupxx[i]*Axx[i] Rxx[i] = gupxx[i]*gupxx[i]*Axx[i]
+ gupxy[i]*gupxy[i]*Ayy[i] + gupxy[i]*gupxy[i]*Ayy[i]
@@ -314,7 +300,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i]; + ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i];
} }
// 4ms // // 4ms //
for(int i=start;i<end;i+=1){ for(int i=0;i<all;i+=1){
Gamx_rhs[i] = - TWO * ( Lapx[i] * Rxx[i] + Lapy[i] * Rxy[i] + Lapz[i] * Rxz[i] ) + Gamx_rhs[i] = - TWO * ( Lapx[i] * Rxx[i] + Lapy[i] * Rxy[i] + Lapz[i] * Rxz[i] ) +
TWO * alpn1[i] * ( TWO * alpn1[i] * (
-F3o2/chin1[i] * ( chix[i] * Rxx[i] + chiy[i] * Rxy[i] + chiz[i] * Rxz[i] ) - -F3o2/chin1[i] * ( chix[i] * Rxx[i] + chiy[i] * Rxy[i] + chiz[i] * Rxz[i] ) -
@@ -345,23 +331,18 @@ int f_compute_rhs_bssn(int *ex, double &T,
); );
} }
// 22.3ms // // 22.3ms //
for(int _task=tid; _task<6; _task+=nthr){ fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,
switch(_task){ X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev);
case 0: fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx, fdderivs(ex,betay,gxxy,gxyy,gxzy,gyyy,gyzy,gzzy,
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev); break; X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev);
case 1: fdderivs(ex,betay,gxxy,gxyy,gxzy,gyyy,gyzy,gzzy, fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,
X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev); break; X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev);
case 2: fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz, fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev);
X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev); break; fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev);
case 3: fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev); break; fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev);
case 4: fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev); break;
case 5: fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev); break;
}
}
// 3.5ms // // 3.5ms //
#pragma omp barrier for(int i=0;i<all;i+=1){
for(int i=start;i<end;i+=1){
fxx[i] = gxxx[i] + gxyy[i] + gxzz[i]; fxx[i] = gxxx[i] + gxyy[i] + gxzz[i];
fxy[i] = gxyx[i] + gyyy[i] + gyzz[i]; fxy[i] = gxyx[i] + gyyy[i] + gyzz[i];
fxz[i] = gxzx[i] + gyzy[i] + gzzz[i]; fxz[i] = gxzx[i] + gyzy[i] + gzzz[i];
@@ -375,7 +356,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ TWO * ( gupxy[i]*Gamzxy[i] + gupxz[i]*Gamzxz[i] + gupyz[i]*Gamzyz[i] ); + TWO * ( gupxy[i]*Gamzxy[i] + gupxz[i]*Gamzxz[i] + gupyz[i]*Gamzyz[i] );
} }
// 3.9ms // // 3.9ms //
for(int i=start;i<end;i+=1){ for(int i=0;i<all;i+=1){
Gamx_rhs[i] = Gamx_rhs[i] Gamx_rhs[i] = Gamx_rhs[i]
+ F2o3 * Gamxa[i] * div_beta[i] + F2o3 * Gamxa[i] * div_beta[i]
- Gamxa[i] * betaxx[i] - Gamya[i] * betaxy[i] - Gamza[i] * betaxz[i] - Gamxa[i] * betaxx[i] - Gamya[i] * betaxy[i] - Gamza[i] * betaxz[i]
@@ -398,7 +379,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ TWO * ( gupxy[i] * gxyz[i] + gupxz[i] * gxzz[i] + gupyz[i] * gyzz[i] ); + TWO * ( gupxy[i] * gxyz[i] + gupxz[i] * gxzz[i] + gupyz[i] * gyzz[i] );
} }
// 4.4ms // // 4.4ms //
for (int i=start;i<end;i+=1) { for (int i=0;i<all;i+=1) {
gxxx[i] = gxx[i]*Gamxxx[i] + gxy[i]*Gamyxx[i] + gxz[i]*Gamzxx[i]; gxxx[i] = gxx[i]*Gamxxx[i] + gxy[i]*Gamyxx[i] + gxz[i]*Gamzxx[i];
gxyx[i] = gxx[i]*Gamxxy[i] + gxy[i]*Gamyxy[i] + gxz[i]*Gamzxy[i]; gxyx[i] = gxx[i]*Gamxxy[i] + gxy[i]*Gamyxy[i] + gxz[i]*Gamzxy[i];
gxzx[i] = gxx[i]*Gamxxz[i] + gxy[i]*Gamyxz[i] + gxz[i]*Gamzxz[i]; gxzx[i] = gxx[i]*Gamxxz[i] + gxy[i]*Gamyxz[i] + gxz[i]*Gamzxz[i];
@@ -421,69 +402,51 @@ int f_compute_rhs_bssn(int *ex, double &T,
gzzz[i] = gxz[i]*Gamxzz[i] + gyz[i]*Gamyzz[i] + gzz[i]*Gamzzz[i]; gzzz[i] = gxz[i]*Gamxzz[i] + gyz[i]*Gamyzz[i] + gzz[i]*Gamzzz[i];
} }
// 22.2ms // // 22.2ms //
if(tid==0){
fdderivs(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev); fdderivs(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
}
// 0.7ms // // 0.7ms //
#pragma omp barrier for (int i = 0; i < all; i += 1) {
for (int i = start; i < end; i += 1) {
Rxx[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i] Rxx[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO; + (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO;
} }
// 23ms // // 23ms //
if(tid==0){
fdderivs(ex,dyy,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev); fdderivs(ex,dyy,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
} for (int i = 0; i < all; i += 1) {
#pragma omp barrier
for (int i = start; i < end; i += 1) {
Ryy[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i] Ryy[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO; + (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO;
} }
// 23ms // // 23ms //
if(tid==0){
fdderivs(ex,dzz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev); fdderivs(ex,dzz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
} for (int i = 0; i < all; i += 1) {
#pragma omp barrier
for (int i = start; i < end; i += 1) {
Rzz[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i] Rzz[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO; + (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO;
} }
// 23ms // // 23ms //
if(tid==0){
fdderivs(ex,gxy,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,ANTI, ANTI,SYM ,Symmetry,Lev); fdderivs(ex,gxy,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,ANTI, ANTI,SYM ,Symmetry,Lev);
} for (int i = 0; i < all; i += 1) {
#pragma omp barrier
for (int i = start; i < end; i += 1) {
Rxy[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i] Rxy[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO; + (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO;
} }
// 23ms // // 23ms //
if(tid==0){
fdderivs(ex,gxz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,ANTI ,SYM ,ANTI,Symmetry,Lev); fdderivs(ex,gxz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,ANTI ,SYM ,ANTI,Symmetry,Lev);
} for (int i = 0; i < all; i += 1) {
#pragma omp barrier
for (int i = start; i < end; i += 1) {
Rxz[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i] Rxz[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO; + (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO;
} }
// 23ms // // 23ms //
if(tid==0){
fdderivs(ex,gyz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,ANTI ,ANTI,Symmetry,Lev); fdderivs(ex,gyz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,ANTI ,ANTI,Symmetry,Lev);
} for (int i = 0; i < all; i += 1) {
#pragma omp barrier
for (int i = start; i < end; i += 1) {
Ryz[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i] Ryz[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO; + (gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i]) * TWO;
} }
// 14ms // // 14ms //
/* 假设 all = ex1*ex2*ex3所有量都是 length=all 的 double 数组(已按同一扁平化规则排布) */ /* 假设 all = ex1*ex2*ex3所有量都是 length=all 的 double 数组(已按同一扁平化规则排布) */
for (int i = start; i < end; i += 1) { for (int i = 0; i < all; i += 1) {
Rxx[i] = Rxx[i] =
-HALF * Rxx[i] -HALF * Rxx[i]
+ gxx[i] * Gamxx[i] + gxy[i] * Gamyx[i] + gxz[i] * Gamzx[i] + gxx[i] * Gamxx[i] + gxy[i] * Gamyx[i] + gxz[i] * Gamzx[i]
@@ -735,13 +698,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
} }
// 22.3ms // // 22.3ms //
if(tid==0){
fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
}
// 7ms // // 7ms //
#pragma omp barrier for (int i=0;i<all;i+=1) {
for (int i=start;i<end;i+=1) {
fxx[i] = fxx[i] - Gamxxx[i] * chix[i] - Gamyxx[i] * chiy[i] - Gamzxx[i] * chiz[i]; fxx[i] = fxx[i] - Gamxxx[i] * chix[i] - Gamyxx[i] * chiy[i] - Gamzxx[i] * chiz[i];
fxy[i] = fxy[i] - Gamxxy[i] * chix[i] - Gamyxy[i] * chiy[i] - Gamzxy[i] * chiz[i]; fxy[i] = fxy[i] - Gamxxy[i] * chix[i] - Gamyxy[i] * chiy[i] - Gamzxy[i] * chiz[i];
fxz[i] = fxz[i] - Gamxxz[i] * chix[i] - Gamyxz[i] * chiy[i] - Gamzxz[i] * chiz[i]; fxz[i] = fxz[i] - Gamxxz[i] * chix[i] - Gamyxz[i] * chiy[i] - Gamzxz[i] * chiz[i];
@@ -765,16 +725,11 @@ int f_compute_rhs_bssn(int *ex, double &T,
} }
// 24ms // // 24ms //
for(int _task=tid; _task<2; _task+=nthr){ fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
switch(_task){ fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
case 0: fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); break;
case 1: fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); break;
}
}
// 6ms // // 6ms //
#pragma omp barrier for (int i=0;i<all;i+=1) {
for (int i=start;i<end;i+=1) {
/* gxxx,gxxy,gxxz (这里是“升指标后的chi导数/chi”那类量你沿用原变量名即可) */ /* gxxx,gxxy,gxxz (这里是“升指标后的chi导数/chi”那类量你沿用原变量名即可) */
gxxx[i] = (gupxx[i] * chix[i] + gupxy[i] * chiy[i] + gupxz[i] * chiz[i]) / chin1[i]; gxxx[i] = (gupxx[i] * chix[i] + gupxy[i] * chiy[i] + gupxz[i] * chiz[i]) / chin1[i];
gxxy[i] = (gupxy[i] * chix[i] + gupyy[i] * chiy[i] + gupyz[i] * chiz[i]) / chin1[i]; gxxy[i] = (gupxy[i] * chix[i] + gupyy[i] * chiy[i] + gupyz[i] * chiz[i]) / chin1[i];
@@ -815,12 +770,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
fyz[i] = fyz[i] - Gamxyz[i] * Lapx[i] - Gamyyz[i] * Lapy[i] - Gamzyz[i] * Lapz[i]; fyz[i] = fyz[i] - Gamxyz[i] * Lapx[i] - Gamyyz[i] * Lapy[i] - Gamzyz[i] * Lapz[i];
} }
// 1ms // // 1ms //
for (int i=start;i<end;i+=1) { for (int i=0;i<all;i+=1) {
trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i] trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] ); + TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] );
} }
// 2.5ms // // 2.5ms //
for (int i=start;i<end;i+=1) { for (int i=0;i<all;i+=1) {
S[i] = chin1[i] * ( S[i] = chin1[i] * (
gupxx[i] * Sxx[i] + gupyy[i] * Syy[i] + gupzz[i] * Szz[i] gupxx[i] * Sxx[i] + gupyy[i] * Syy[i] + gupzz[i] * Szz[i]
@@ -879,7 +834,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
fzz[i] = alpn1[i] * (Rzz[i] - EIGHT * PI * Szz[i]) - fzz[i]; fzz[i] = alpn1[i] * (Rzz[i] - EIGHT * PI * Szz[i]) - fzz[i];
} }
// 8ms // // 8ms //
for (int i=start;i<end;i+=1) { for (int i=0;i<all;i+=1) {
/* Aij_rhs = fij - gij * f */ /* Aij_rhs = fij - gij * f */
Axx_rhs[i] = fxx[i] - gxx[i] * f[i]; Axx_rhs[i] = fxx[i] - gxx[i] * f[i];
@@ -1011,7 +966,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
} }
// 1ms // // 1ms //
for(int i=start;i<end;i+=1){ for(int i=0;i<all;i+=1){
betax_rhs[i] = FF * dtSfx[i]; betax_rhs[i] = FF * dtSfx[i];
betay_rhs[i] = FF * dtSfy[i]; betay_rhs[i] = FF * dtSfy[i];
betaz_rhs[i] = FF * dtSfz[i]; betaz_rhs[i] = FF * dtSfz[i];
@@ -1036,101 +991,60 @@ int f_compute_rhs_bssn(int *ex, double &T,
#endif #endif
} }
// 26ms // // 26ms //
for(int _task=tid; _task<16; _task+=nthr){ lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS);
switch(_task){ lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA);
case 0: lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS);
lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS); lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA); lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA);
break; lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS);
case 1: lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS); lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS);
lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS); lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA);
break; lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA);
case 2: lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA); lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS);
lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS); lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS);
break; lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS);
case 3: lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS);
lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS); lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA);
lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS); lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA);
break; lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS);
case 4: lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA);
lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA); lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA); lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS);
break; lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS);
case 5: lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS);
lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS); lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS);
lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS);
break;
case 6:
lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS);
lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS);
break;
case 7:
lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS);
lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA);
break;
case 8: lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA); break;
case 9: lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS); break;
case 10: lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA); break;
case 11: lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS); break;
case 12: lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS); break;
case 13: lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS); break;
case 14: lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS); break;
case 15: lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS); break;
}
}
// 20ms // // 20ms //
#pragma omp barrier
if(eps>0){ if(eps>0){
for(int _task=tid; _task<16; _task+=nthr){ kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps);
switch(_task){ kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps);
case 0: kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps); break; kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps);
case 1: kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps); break; kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps);
case 2: kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps); break; kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps);
case 3: kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps); break; kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps);
case 4: kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps); break; kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps);
case 5: kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps); break; kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps);
case 6: kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps); break; kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps);
case 7: kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps); break; kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps);
case 8: kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps);
kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps); kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps);
kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps); kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps);
break; kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps);
case 9: kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps); kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps);
kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps); kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps);
break; kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps);
case 10: kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps); kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps);
kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps); kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps);
break; kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps);
case 11: kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps);
kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps); kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps);
kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps);
break;
case 12:
kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps);
kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps);
break;
case 13:
kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps);
kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps);
break;
case 14:
kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps);
kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps);
break;
case 15:
kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps);
kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps);
break;
}
}
} }
// 2ms // // 2ms //
if(co==0){ if(co==0){
for (int i=start;i<end;i+=1) { for (int i=0;i<all;i+=1) {
ham_Res[i] = ham_Res[i] =
gupxx[i] * Rxx[i] + gupyy[i] * Ryy[i] + gupzz[i] * Rzz[i] gupxx[i] * Rxx[i] + gupyy[i] * Ryy[i] + gupzz[i] * Rzz[i]
+ TWO * ( gupxy[i] * Rxy[i] + gupxz[i] * Rxz[i] + gupyz[i] * Ryz[i] ); + TWO * ( gupxy[i] * Rxy[i] + gupxz[i] * Rxz[i] + gupyz[i] * Ryz[i] );
@@ -1174,20 +1088,15 @@ int f_compute_rhs_bssn(int *ex, double &T,
} }
// 1ms // // 1ms //
for(int _task=tid; _task<6; _task+=nthr){ fderivs(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
switch(_task){ fderivs(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0);
case 0: fderivs(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0); break; fderivs(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0);
case 1: fderivs(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0); break; fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
case 2: fderivs(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0); break; fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0);
case 3: fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0); break; fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
case 4: fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0); break;
case 5: fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0); break;
}
}
} }
// 7ms // // 7ms //
#pragma omp barrier for (int i=0;i<all;i+=1) {
for (int i=start;i<end;i+=1) {
gxxx[i] = gxxx[i] - ( Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i] gxxx[i] = gxxx[i] - ( Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]
+ Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]) - chix[i]*Axx[i]/chin1[i]; + Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]) - chix[i]*Axx[i]/chin1[i];
gxyx[i] = gxyx[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i] gxyx[i] = gxyx[i] - ( Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
@@ -1240,10 +1149,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i]; movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
} }
}
// double t1 = omp_get_wtime();
// printf("Time for BSSN RHS computation: %.6f seconds\n", (t1-t0));
return 0; // success return 0; // success
} }

View File

@@ -41,8 +41,8 @@ void fdderivs(const int ex[3],
const size_t nz = (size_t)ex3 + 2; const size_t nz = (size_t)ex3 + 2;
const size_t fh_size = nx * ny * nz; const size_t fh_size = nx * ny * nz;
static thread_local double *fh = NULL; static double *fh = NULL;
static thread_local size_t cap = 0; static size_t cap = 0;
if (fh_size > cap) { if (fh_size > cap) {
free(fh); free(fh);

View File

@@ -50,8 +50,8 @@ void fderivs(const int ex[3],
const size_t ny = (size_t)ex2 + 2; const size_t ny = (size_t)ex2 + 2;
const size_t nz = (size_t)ex3 + 2; const size_t nz = (size_t)ex3 + 2;
const size_t fh_size = nx * ny * nz; const size_t fh_size = nx * ny * nz;
static thread_local double *fh = NULL; static double *fh = NULL;
static thread_local size_t cap = 0; static size_t cap = 0;
if (fh_size > cap) { if (fh_size > cap) {
free(fh); free(fh);

View File

@@ -39,19 +39,19 @@ endif
# C rewrite of BSSN RHS kernel and helpers # C rewrite of BSSN RHS kernel and helpers
bssn_rhs_c.o: bssn_rhs_c.C bssn_rhs_c.o: bssn_rhs_c.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@ ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
fderivs_c.o: fderivs_c.C fderivs_c.o: fderivs_c.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@ ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
fdderivs_c.o: fdderivs_c.C fdderivs_c.o: fdderivs_c.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@ ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
kodiss_c.o: kodiss_c.C kodiss_c.o: kodiss_c.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@ ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
lopsided_c.o: lopsided_c.C lopsided_c.o: lopsided_c.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@ ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags, independent of CXXAPPFLAGS (which may be PGO-instrumented) ## TwoPunctureABE uses fixed optimal flags, independent of CXXAPPFLAGS (which may be PGO-instrumented)
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo -Dfortran3 -Dnewc -I${MKLROOT}/include TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo -Dfortran3 -Dnewc -I${MKLROOT}/include

View File

@@ -5,7 +5,6 @@
#include <stddef.h> #include <stddef.h>
#include <math.h> #include <math.h>
#include <stdio.h> #include <stdio.h>
#include <omp.h>
/* 主网格0-based -> 1D */ /* 主网格0-based -> 1D */
static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) { static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
const int ex1 = ex[0], ex2 = ex[1]; const int ex1 = ex[0], ex2 = ex[1];