From 726d743376c99eddc7c743580afedc888c8aae69 Mon Sep 17 00:00:00 2001 From: ianchb Date: Tue, 14 Apr 2026 19:20:12 +0800 Subject: [PATCH] Fuse Ricci assembly and optimize trK/Aij gauge kernels --- AMSS_NCKU_source/bssn_rhs_cuda.cu | 474 ++++++++++++++++++++++++------ 1 file changed, 383 insertions(+), 91 deletions(-) diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 5cd91cb..046ff91 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -3368,6 +3368,219 @@ void kern_phase11_ricci_offdiag( } } +/* Phase 11: fused Ricci assembly (diag + off-diag) */ +__global__ __launch_bounds__(128, 2) +void kern_phase11_ricci_fused( + const double* __restrict__ gxx, const double* __restrict__ gxy, + const double* __restrict__ gxz, const double* __restrict__ gyy, + const double* __restrict__ gyz, const double* __restrict__ gzz, + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ Gamxa, const double* __restrict__ Gamya, + const double* __restrict__ Gamza, + const double* __restrict__ Gamxx, const double* __restrict__ Gamxy, + const double* __restrict__ Gamxz, + const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d, + const double* __restrict__ Gamyz_d, + const double* __restrict__ Gamzx, const double* __restrict__ Gamzy, + const double* __restrict__ Gamzz_d, + const double* __restrict__ Gxxx, const double* __restrict__ Gxxy, + const double* __restrict__ Gxxz, const double* __restrict__ Gxyy, + const double* __restrict__ Gxyz, const double* __restrict__ Gxzz, + const double* __restrict__ Gyxx, const double* __restrict__ Gyxy, + const double* __restrict__ Gyxz, const double* __restrict__ Gyyy, + const double* __restrict__ Gyyz, const double* __restrict__ Gyzz, + const double* __restrict__ Gzxx, const double* __restrict__ Gzxy, + const double* __restrict__ Gzxz, const double* __restrict__ Gzyy, + const double* __restrict__ Gzyz, const double* __restrict__ Gzzz, + const double* __restrict__ lxxx, const double* __restrict__ lxyx, + const double* __restrict__ lxzx, const double* __restrict__ lyyx, + const double* __restrict__ lyzx, const double* __restrict__ lzzx, + const double* __restrict__ lxxy, const double* __restrict__ lxyy, + const double* __restrict__ lxzy, const double* __restrict__ lyyy, + const double* __restrict__ lyzy, const double* __restrict__ lzzy, + const double* __restrict__ lxxz, const double* __restrict__ lxyz, + const double* __restrict__ lxzz, const double* __restrict__ lyyz, + const double* __restrict__ lyzz, const double* __restrict__ lzzz, + double* __restrict__ Rxx, double* __restrict__ Rxy, double* __restrict__ Rxz, + double* __restrict__ Ryy, double* __restrict__ Ryz, double* __restrict__ Rzz) +{ + const double H = 0.5, TWO = 2.0; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += blockDim.x * gridDim.x) { + double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i]; + double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i]; + + Rxx[i] = -H*Rxx[i] + + gxx[i]*Gamxx[i]+gxy[i]*Gamyx[i]+gxz[i]*Gamzx[i] + + Gamxa[i]*lxxx[i]+Gamya[i]*lxyx[i]+Gamza[i]*lxzx[i] + + uxx*(TWO*(Gxxx[i]*lxxx[i]+Gyxx[i]*lxyx[i]+Gzxx[i]*lxzx[i]) + +(Gxxx[i]*lxxx[i]+Gyxx[i]*lxxy[i]+Gzxx[i]*lxxz[i])) + + uxy*(TWO*(Gxxx[i]*lxyx[i]+Gyxx[i]*lyyx[i]+Gzxx[i]*lyzx[i] + +Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i]) + +(Gxxy[i]*lxxx[i]+Gyxy[i]*lxxy[i]+Gzxy[i]*lxxz[i]) + +(Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i])) + + uxz*(TWO*(Gxxx[i]*lxzx[i]+Gyxx[i]*lyzx[i]+Gzxx[i]*lzzx[i] + +Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i]) + +(Gxxz[i]*lxxx[i]+Gyxz[i]*lxxy[i]+Gzxz[i]*lxxz[i]) + +(Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i])) + + uyy*(TWO*(Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i]) + +(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i])) + + uyz*(TWO*(Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i] + +Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i]) + +(Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i]) + +(Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i])) + + uzz*(TWO*(Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i]) + +(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i])); + + Ryy[i] = -H*Ryy[i] + + gxy[i]*Gamxy[i]+gyy[i]*Gamyy_d[i]+gyz[i]*Gamzy[i] + + Gamxa[i]*lxyy[i]+Gamya[i]*lyyy[i]+Gamza[i]*lyzy[i] + + uxx*(TWO*(Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i]) + +(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i])) + + uxy*(TWO*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i] + +Gxyy[i]*lxxy[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxzy[i]) + +(Gxyy[i]*lxyx[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxyz[i]) + +(Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i])) + + uxz*(TWO*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i] + +Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i]) + +(Gxyz[i]*lxyx[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxyz[i]) + +(Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])) + + uyy*(TWO*(Gxyy[i]*lxyy[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyzy[i]) + +(Gxyy[i]*lyyx[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyyz[i])) + + uyz*(TWO*(Gxyy[i]*lxzy[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lzzy[i] + +Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i]) + +(Gxyz[i]*lyyx[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyyz[i]) + +(Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i])) + + uzz*(TWO*(Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i]) + +(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i])); + + Rzz[i] = -H*Rzz[i] + + gxz[i]*Gamxz[i]+gyz[i]*Gamyz_d[i]+gzz[i]*Gamzz_d[i] + + Gamxa[i]*lxzz[i]+Gamya[i]*lyzz[i]+Gamza[i]*lzzz[i] + + uxx*(TWO*(Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i]) + +(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i])) + + uxy*(TWO*(Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i] + +Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i]) + +(Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i]) + +(Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i])) + + uxz*(TWO*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i] + +Gxzz[i]*lxxz[i]+Gyzz[i]*lxyz[i]+Gzzz[i]*lxzz[i]) + +(Gxzz[i]*lxzx[i]+Gyzz[i]*lxzy[i]+Gzzz[i]*lxzz[i]) + +(Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i])) + + uyy*(TWO*(Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i]) + +(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i])) + + uyz*(TWO*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i] + +Gxzz[i]*lxyz[i]+Gyzz[i]*lyyz[i]+Gzzz[i]*lyzz[i]) + +(Gxzz[i]*lyzx[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lyzz[i]) + +(Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i])) + + uzz*(TWO*(Gxzz[i]*lxzz[i]+Gyzz[i]*lyzz[i]+Gzzz[i]*lzzz[i]) + +(Gxzz[i]*lzzx[i]+Gyzz[i]*lzzy[i]+Gzzz[i]*lzzz[i])); + + Rxy[i] = H*( + -Rxy[i] + +gxx[i]*Gamxy[i]+gxy[i]*Gamyy_d[i]+gxz[i]*Gamzy[i] + +gxy[i]*Gamxx[i]+gyy[i]*Gamyx[i]+gyz[i]*Gamzx[i] + +Gamxa[i]*lxyx[i]+Gamya[i]*lyyx[i]+Gamza[i]*lyzx[i] + +Gamxa[i]*lxxy[i]+Gamya[i]*lxyy[i]+Gamza[i]*lxzy[i]) + +uxx*(Gxxx[i]*lxxy[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxzy[i] + +Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i] + +Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i]) + +uxy*(Gxxx[i]*lxyy[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyzy[i] + +Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i] + +Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i] + +Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i] + +Gxyy[i]*lxxx[i]+Gyyy[i]*lxyx[i]+Gzyy[i]*lxzx[i] + +Gxxx[i]*lyyx[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyyz[i]) + +uxz*(Gxxx[i]*lxzy[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lzzy[i] + +Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i] + +Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i] + +Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i] + +Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i] + +Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i]) + +uyy*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i] + +Gxyy[i]*lxyx[i]+Gyyy[i]*lyyx[i]+Gzyy[i]*lyzx[i] + +Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i]) + +uyz*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i] + +Gxyy[i]*lxzx[i]+Gyyy[i]*lyzx[i]+Gzyy[i]*lzzx[i] + +Gxxz[i]*lyyx[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyyz[i] + +Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i] + +Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i] + +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i]) + +uzz*(Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i] + +Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i] + +Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]); + + Rxz[i] = H*( + -Rxz[i] + +gxx[i]*Gamxz[i]+gxy[i]*Gamyz_d[i]+gxz[i]*Gamzz_d[i] + +gxz[i]*Gamxx[i]+gyz[i]*Gamyx[i]+gzz[i]*Gamzx[i] + +Gamxa[i]*lxzx[i]+Gamya[i]*lyzx[i]+Gamza[i]*lzzx[i] + +Gamxa[i]*lxxz[i]+Gamya[i]*lxyz[i]+Gamza[i]*lxzz[i]) + +uxx*(Gxxx[i]*lxxz[i]+Gyxx[i]*lxyz[i]+Gzxx[i]*lxzz[i] + +Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i] + +Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i]) + +uxy*(Gxxx[i]*lxyz[i]+Gyxx[i]*lyyz[i]+Gzxx[i]*lyzz[i] + +Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i] + +Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i] + +Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i] + +Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i] + +Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i]) + +uxz*(Gxxx[i]*lxzz[i]+Gyxx[i]*lyzz[i]+Gzxx[i]*lzzz[i] + +Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i] + +Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i] + +Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i] + +Gxzz[i]*lxxx[i]+Gyzz[i]*lxyx[i]+Gzzz[i]*lxzx[i] + +Gxxx[i]*lzzx[i]+Gyxx[i]*lzzy[i]+Gzxx[i]*lzzz[i]) + +uyy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i] + +Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i] + +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i]) + +uyz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i] + +Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i] + +Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i] + +Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i] + +Gxzz[i]*lxyx[i]+Gyzz[i]*lyyx[i]+Gzzz[i]*lyzx[i] + +Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i]) + +uzz*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i] + +Gxzz[i]*lxzx[i]+Gyzz[i]*lyzx[i]+Gzzz[i]*lzzx[i] + +Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i]); + + Ryz[i] = H*( + -Ryz[i] + +gxy[i]*Gamxz[i]+gyy[i]*Gamyz_d[i]+gyz[i]*Gamzz_d[i] + +gxz[i]*Gamxy[i]+gyz[i]*Gamyy_d[i]+gzz[i]*Gamzy[i] + +Gamxa[i]*lxzy[i]+Gamya[i]*lyzy[i]+Gamza[i]*lzzy[i] + +Gamxa[i]*lxyz[i]+Gamya[i]*lyyz[i]+Gamza[i]*lyzz[i]) + +uxx*(Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i] + +Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i] + +Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i]) + +uxy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i] + +Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i] + +Gxyy[i]*lxzx[i]+Gyyy[i]*lxzy[i]+Gzyy[i]*lxzz[i] + +Gxyy[i]*lxxz[i]+Gyyy[i]*lxyz[i]+Gzyy[i]*lxzz[i] + +Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i] + +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i]) + +uxz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i] + +Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i] + +Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i] + +Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i] + +Gxzz[i]*lxxy[i]+Gyzz[i]*lxyy[i]+Gzzz[i]*lxzy[i] + +Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i]) + +uyy*(Gxyy[i]*lxyz[i]+Gyyy[i]*lyyz[i]+Gzyy[i]*lyzz[i] + +Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i] + +Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i]) + +uyz*(Gxyy[i]*lxzz[i]+Gyyy[i]*lyzz[i]+Gzyy[i]*lzzz[i] + +Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i] + +Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i] + +Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i] + +Gxzz[i]*lxyy[i]+Gyzz[i]*lyyy[i]+Gzzz[i]*lyzy[i] + +Gxyy[i]*lzzx[i]+Gyyy[i]*lzzy[i]+Gzyy[i]*lzzz[i]) + +uzz*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i] + +Gxzz[i]*lxzy[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lzzy[i] + +Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i]); + } +} + /* Phase 13: chi correction to Ricci tensor * After fdderivs(chi), subtract Christoffel*chi_deriv, compute conformal factor f, * then add chi contribution to Rxx..Rzz. @@ -3629,8 +3842,8 @@ void kern_phase13_chi_correction( } } -/* Phase 15: trK_rhs, Aij_rhs, gauge (after fdderivs(Lap) and fderivs(chi)) - * Also updates Christoffel with physical chi correction, computes Lap_rhs, beta_rhs, dtSf_rhs. +/* Phase 15: trK_rhs, Aij_rhs, gauge. + * Also updates Christoffel with physical chi correction and computes Lap second derivatives on the fly. */ __global__ __launch_bounds__(128, 4) void kern_phase15_trK_Aij_gauge( @@ -3674,11 +3887,6 @@ void kern_phase15_trK_Aij_gauge( double* __restrict__ Gzxx, double* __restrict__ Gzxy, double* __restrict__ Gzxz, double* __restrict__ Gzyy, double* __restrict__ Gzyz, double* __restrict__ Gzzz, - /* fxx..fzz = fdderivs(Lap) output */ - double* __restrict__ fxx, double* __restrict__ fxy, - double* __restrict__ fxz, double* __restrict__ fyy, - double* __restrict__ fyz, double* __restrict__ fzz, - /* dtSfx_rhs..dtSfz_rhs = fderivs(chi) output, then overwritten */ double* __restrict__ dtSfx_rhs, double* __restrict__ dtSfy_rhs, double* __restrict__ dtSfz_rhs, double* __restrict__ trK_rhs, @@ -3697,11 +3905,148 @@ void kern_phase15_trK_Aij_gauge( const double PI_V=3.14159265358979323846; const double F16=16.0, F8=8.0; for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int i0 = i % nx; + const int j0 = (i / nx) % ny; + const int k0 = i / (nx * ny); + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i]; double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i]; double a=alpn1[i], c1=chin1[i]; double cx=chix[i],cy=chiy[i],cz=chiz[i]; double lx=Lapx[i],ly=Lapy[i],lz=Lapz[i]; + double fxx_v = 0.0, fxy_v = 0.0, fxz_v = 0.0; + double fyy_v = 0.0, fyz_v = 0.0, fzz_v = 0.0; + + if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) { + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + const double c = fetch_sym_ord2_direct(alpn1, iF, jF, kF, 1, 1, 1); + fxx_v = d_gp.Fdxdx * ( + -fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF, 1, 1, 1) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF, 1, 1, 1)); + fyy_v = d_gp.Fdydy * ( + -fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF, 1, 1, 1) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF, 1, 1, 1)); + fzz_v = d_gp.Fdzdz * ( + -fetch_sym_ord2_direct(alpn1, iF, jF, kF - 2, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(alpn1, iF, jF, kF - 1, 1, 1, 1) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(alpn1, iF, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF, kF + 2, 1, 1, 1)); + + const double t_jm2 = + fetch_sym_ord2_direct(alpn1, iF - 2, jF - 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF - 2, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF - 2, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF - 2, kF, 1, 1, 1); + const double t_jm1 = + fetch_sym_ord2_direct(alpn1, iF - 2, jF - 1, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF - 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF - 1, kF, 1, 1, 1); + const double t_jp1 = + fetch_sym_ord2_direct(alpn1, iF - 2, jF + 1, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF + 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF + 1, kF, 1, 1, 1); + const double t_jp2 = + fetch_sym_ord2_direct(alpn1, iF - 2, jF + 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF + 2, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF + 2, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF + 2, kF, 1, 1, 1); + fxy_v = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2); + + const double t_km2_x = + fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF - 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF - 2, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF - 2, 1, 1, 1); + const double t_km1_x = + fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF - 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF - 1, 1, 1, 1); + const double t_kp1_x = + fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF + 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF + 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF + 1, 1, 1, 1); + const double t_kp2_x = + fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF + 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF + 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF + 2, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF + 2, 1, 1, 1); + fxz_v = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x); + + const double t_km2_y = + fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF - 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF - 2, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF - 2, 1, 1, 1); + const double t_km1_y = + fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF - 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF - 1, 1, 1, 1); + const double t_kp1_y = + fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF + 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF + 1, 1, 1, 1); + const double t_kp2_y = + fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF + 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 2, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF + 2, 1, 1, 1); + fyz_v = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y); + } + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + const double c = fetch_sym_ord2_direct(alpn1, iF, jF, kF, 1, 1, 1); + fxx_v = d_gp.Sdxdx * ( + fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF, 1, 1, 1) + - 2.0 * c + + fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF, 1, 1, 1)); + fyy_v = d_gp.Sdydy * ( + fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF, 1, 1, 1) + - 2.0 * c + + fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF, 1, 1, 1)); + fzz_v = d_gp.Sdzdz * ( + fetch_sym_ord2_direct(alpn1, iF, jF, kF - 1, 1, 1, 1) + - 2.0 * c + + fetch_sym_ord2_direct(alpn1, iF, jF, kF + 1, 1, 1, 1)); + fxy_v = d_gp.Sdxdy * ( + fetch_sym_ord2_direct(alpn1, iF - 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF - 1, jF + 1, kF, 1, 1, 1) + + fetch_sym_ord2_direct(alpn1, iF + 1, jF + 1, kF, 1, 1, 1)); + fxz_v = d_gp.Sdxdz * ( + fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF + 1, 1, 1, 1) + + fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF + 1, 1, 1, 1)); + fyz_v = d_gp.Sdydz * ( + fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1) + + fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1)); + } + } /* raised chi/chi */ double gx=(uxx*cx+uxy*cy+uxz*cz)/c1; @@ -3728,17 +4073,17 @@ void kern_phase15_trK_Aij_gauge( Gyyz[i]-=(cz/c1-gyz[i]*gy)*H; Gzyz[i]-=(cy/c1-gyz[i]*gz)*H; - /* fxx..fzz correction: subtract Gamma*Lap_deriv */ - fxx[i]-=Gxxx[i]*lx+Gyxx[i]*ly+Gzxx[i]*lz; - fyy[i]-=Gxyy[i]*lx+Gyyy[i]*ly+Gzyy[i]*lz; - fzz[i]-=Gxzz[i]*lx+Gyzz[i]*ly+Gzzz[i]*lz; - fxy[i]-=Gxxy[i]*lx+Gyxy[i]*ly+Gzxy[i]*lz; - fxz[i]-=Gxxz[i]*lx+Gyxz[i]*ly+Gzxz[i]*lz; - fyz[i]-=Gxyz_o[i]*lx+Gyyz[i]*ly+Gzyz[i]*lz; + /* Lap second-derivative correction: subtract Gamma*Lap_deriv */ + fxx_v -= Gxxx[i]*lx+Gyxx[i]*ly+Gzxx[i]*lz; + fyy_v -= Gxyy[i]*lx+Gyyy[i]*ly+Gzyy[i]*lz; + fzz_v -= Gxzz[i]*lx+Gyzz[i]*ly+Gzzz[i]*lz; + fxy_v -= Gxxy[i]*lx+Gyxy[i]*ly+Gzxy[i]*lz; + fxz_v -= Gxxz[i]*lx+Gyxz[i]*ly+Gzxz[i]*lz; + fyz_v -= Gxyz_o[i]*lx+Gyyz[i]*ly+Gzyz[i]*lz; /* D^i D_i alpha */ - double DDA = uxx*fxx[i]+uyy*fyy[i]+uzz*fzz[i] - +TWO*(uxy*fxy[i]+uxz*fxz[i]+uyz*fyz[i]); + double DDA = uxx*fxx_v+uyy*fyy_v+uzz*fzz_v + +TWO*(uxy*fxy_v+uxz*fxz_v+uyz*fyz_v); /* trace of S_ij (physical) */ double S_v = c1*(uxx*Sxx_m[i]+uyy*Syy_m[i]+uzz*Szz_m[i] @@ -3774,25 +4119,25 @@ void kern_phase15_trK_Aij_gauge( /* f_arr = -(1/3) * (DDA + alpha/chi * (2/3*K^2 - AijAij - 16pi*rho + 8pi*S)) */ double f_v = F2o3*trK_v*trK_v - AijAij - F16*PI_V*rho[i] + EIGHT*PI_V*S_v; - f_arr[i] = -F1o3*(uxx*fxx[i]+uyy*fyy[i]+uzz*fzz[i] - +TWO*(uxy*fxy[i]+uxz*fxz[i]+uyz*fyz[i]) + f_arr[i] = -F1o3*(uxx*fxx_v+uyy*fyy_v+uzz*fzz_v + +TWO*(uxy*fxy_v+uxz*fxz_v+uyz*fyz_v) +(a/c1)*f_v); /* fij = alpha*(Rij - 8pi*Sij) - D_iD_j alpha */ - double fxx_v=a*(Rxx[i]-EIGHT*PI_V*Sxx_m[i])-fxx[i]; - double fxy_v=a*(Rxy[i]-EIGHT*PI_V*Sxy_m[i])-fxy[i]; - double fxz_v=a*(Rxz[i]-EIGHT*PI_V*Sxz_m[i])-fxz[i]; - double fyy_v=a*(Ryy[i]-EIGHT*PI_V*Syy_m[i])-fyy[i]; - double fyz_v=a*(Ryz[i]-EIGHT*PI_V*Syz_m[i])-fyz[i]; - double fzz_v=a*(Rzz[i]-EIGHT*PI_V*Szz_m[i])-fzz[i]; + double fij_xx=a*(Rxx[i]-EIGHT*PI_V*Sxx_m[i])-fxx_v; + double fij_xy=a*(Rxy[i]-EIGHT*PI_V*Sxy_m[i])-fxy_v; + double fij_xz=a*(Rxz[i]-EIGHT*PI_V*Sxz_m[i])-fxz_v; + double fij_yy=a*(Ryy[i]-EIGHT*PI_V*Syy_m[i])-fyy_v; + double fij_yz=a*(Ryz[i]-EIGHT*PI_V*Syz_m[i])-fyz_v; + double fij_zz=a*(Rzz[i]-EIGHT*PI_V*Szz_m[i])-fzz_v; /* Aij_rhs = chi*(fij - gij*f) */ - Axx_rhs[i]=fxx_v-gxx[i]*f_arr[i]; - Ayy_rhs[i]=fyy_v-gyy[i]*f_arr[i]; - Azz_rhs[i]=fzz_v-gzz[i]*f_arr[i]; - Axy_rhs[i]=fxy_v-gxy[i]*f_arr[i]; - Axz_rhs[i]=fxz_v-gxz[i]*f_arr[i]; - Ayz_rhs[i]=fyz_v-gyz[i]*f_arr[i]; + Axx_rhs[i]=fij_xx-gxx[i]*f_arr[i]; + Ayy_rhs[i]=fij_yy-gyy[i]*f_arr[i]; + Azz_rhs[i]=fij_zz-gzz[i]*f_arr[i]; + Axy_rhs[i]=fij_xy-gxy[i]*f_arr[i]; + Axz_rhs[i]=fij_xz-gxz[i]*f_arr[i]; + Ayz_rhs[i]=fij_yz-gyz[i]*f_arr[i]; /* A_il A^l_j */ double AA_xx=uxx*Axx[i]*Axx[i]+uyy*Axy[i]*Axy[i]+uzz*Axz[i]*Axz[i] @@ -4327,7 +4672,7 @@ static void launch_rhs_pipeline(int all, double eps, int co) src_fields, dst_fields, soa_signs, all); } - kern_phase11_ricci_diag<<>>( + kern_phase11_ricci_fused<<>>( D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz), D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), D(S_Gamxa),D(S_Gamya),D(S_Gamza), @@ -4343,25 +4688,8 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz), - D(S_Rxx),D(S_Ryy),D(S_Rzz)); - - kern_phase11_ricci_offdiag<<>>( - D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz), - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_Gamxa),D(S_Gamya),D(S_Gamza), - D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), - D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), - D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), - D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz), - D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz), - D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz), - D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz), - D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz), - D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz), - D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), - D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), - D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz), - D(S_Rxy),D(S_Rxz),D(S_Ryz)); + D(S_Rxx),D(S_Rxy),D(S_Rxz), + D(S_Ryy),D(S_Ryz),D(S_Rzz)); kern_phase12_13_chi_correction_fused<<>>( D(S_chi), D(S_chin1), @@ -4378,12 +4706,6 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz)); - kern_phase14_lap_chi_derivs<<>>( - D(S_Lap), D(S_chi), - D(S_fxx), D(S_fxy), D(S_fxz), - D(S_fyy), D(S_fyz), D(S_fzz), - D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs)); - kern_phase15_trK_Aij_gauge<<>>( D(S_alpn1), D(S_chin1), D(S_chix), D(S_chiy), D(S_chiz), @@ -4407,8 +4729,6 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), - D(S_fxx), D(S_fxy), D(S_fxz), - D(S_fyy), D(S_fyz), D(S_fzz), D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs), D(S_trK_rhs), D(S_Axx_rhs), D(S_Axy_rhs), D(S_Axz_rhs), @@ -4961,8 +5281,8 @@ int f_compute_rhs_bssn(int *ex, double &T, src_fields, dst_fields, soa_signs, all); } - /* Phase 11: Ricci assembly (diagonal + off-diagonal) */ - kern_phase11_ricci_diag<<>>( + /* Phase 11: fused Ricci assembly */ + kern_phase11_ricci_fused<<>>( D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz), D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), D(S_Gamxa),D(S_Gamya),D(S_Gamza), @@ -4978,25 +5298,8 @@ int f_compute_rhs_bssn(int *ex, double &T, D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz), - D(S_Rxx),D(S_Ryy),D(S_Rzz)); - - kern_phase11_ricci_offdiag<<>>( - D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz), - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_Gamxa),D(S_Gamya),D(S_Gamza), - D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), - D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), - D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), - D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz), - D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz), - D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz), - D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz), - D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz), - D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz), - D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), - D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), - D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz), - D(S_Rxy),D(S_Rxz),D(S_Ryz)); + D(S_Rxx),D(S_Rxy),D(S_Rxz), + D(S_Ryy),D(S_Ryz),D(S_Rzz)); /* ============================================================ */ /* Phase 12/13: chi fdderivs + chi correction */ @@ -5017,16 +5320,7 @@ int f_compute_rhs_bssn(int *ex, double &T, D(S_Ryy), D(S_Ryz), D(S_Rzz)); /* ============================================================ */ - /* Phase 14: fdderivs(Lap) + fderivs(chi) */ - /* ============================================================ */ - kern_phase14_lap_chi_derivs<<>>( - D(S_Lap), D(S_chi), - D(S_fxx), D(S_fxy), D(S_fxz), - D(S_fyy), D(S_fyz), D(S_fzz), - D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs)); - - /* ============================================================ */ - /* Phase 15: trK_rhs, Aij_rhs, gauge */ + /* Phase 14/15: fused trK_rhs, Aij_rhs, gauge */ /* ============================================================ */ kern_phase15_trK_Aij_gauge<<>>( D(S_alpn1), D(S_chin1), @@ -5051,8 +5345,6 @@ int f_compute_rhs_bssn(int *ex, double &T, D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), - D(S_fxx), D(S_fxy), D(S_fxz), - D(S_fyy), D(S_fyz), D(S_fzz), D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs), D(S_trK_rhs), D(S_Axx_rhs), D(S_Axy_rhs), D(S_Axz_rhs),