/*
 * bssn_rhs_cuda.cu — GPU implementation of f_compute_rhs_bssn
 *
 * Drop-in replacement for bssn_rhs_c.C.
 * Compile with nvcc, link bssn_rhs_cuda.o in place of bssn_rhs_c.o.
 */
|
|
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cmath>
|
|
#include <cuda_runtime.h>
|
|
#include "macrodef.h"
|
|
#include "bssn_rhs.h"
|
|
|
|
/* ------------------------------------------------------------------ */
/* Error checking                                                      */
/* ------------------------------------------------------------------ */
/*
 * Abort the process on any failing CUDA runtime call, printing file/line
 * and the runtime's error string.  The status is captured in a reserved
 * identifier (err_) so the macro cannot shadow a caller-side variable
 * named `err` appearing inside the argument expression.
 */
#define CUDA_CHECK(call) do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA error %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)
|
|
|
/* ------------------------------------------------------------------ */
/* Physical / gauge constants (matching bssn_rhs_c.C) */
/* ------------------------------------------------------------------ */
static const double PI_VAL = 3.14159265358979323846;  /* pi */
/* NOTE(review): presumably gauge/Gamma-driver parameters — values taken
   verbatim from bssn_rhs_c.C; their exact roles are not visible in this
   chunk, confirm against the reference implementation. */
static const double FF_VAL = 0.75;
static const double ETA_VAL = 2.0;
|
|
|
/* ------------------------------------------------------------------ */
/* Constant memory for grid parameters and stencil coefficients */
/* ------------------------------------------------------------------ */
/*
 * All per-call grid metadata and precomputed finite-difference
 * coefficients, copied to device constant memory (d_gp) so every kernel
 * reads them via the broadcast constant cache.  Field order is part of
 * the host<->device ABI — do not reorder.
 */
struct GridParams {
    int ex[3]; /* nx, ny, nz */
    int all; /* nx*ny*nz */
    double dX, dY, dZ;
    /* fderivs coefficients */
    double d12dx, d12dy, d12dz; /* 1/(12*dX) etc */
    double d2dx, d2dy, d2dz; /* 1/(2*dX) etc */
    /* fdderivs coefficients */
    double Fdxdx, Fdydy, Fdzdz; /* 1/(12*dX^2) etc */
    double Sdxdx, Sdydy, Sdzdz; /* 1/(dX^2) etc */
    double Fdxdy, Fdxdz, Fdydz; /* 1/(144*dX*dY) etc */
    double Sdxdy, Sdxdz, Sdydz; /* 1/(4*dX*dY) etc */
    /* symmetry bounds (Fortran 1-based) */
    int iminF, jminF, kminF;
    int imaxF, jmaxF, kmaxF;
    /* symmetry bounds for ord=3 (lopsided/kodis) */
    int iminF3, jminF3, kminF3;
    int Symmetry;
    double eps;          /* KO dissipation strength */
    int co;              /* constraint-output flag (see phase-3 kernel) */
    /* padded sizes */
    int fh2_nx, fh2_ny, fh2_nz; /* (nx+2), (ny+2), (nz+2) for ord=2 */
    int fh3_nx, fh3_ny, fh3_nz; /* (nx+3), (ny+3), (nz+3) for ord=3 */
};

/* Device-side copy of the grid parameters; presumably filled by the host
   wrapper via cudaMemcpyToSymbol before any kernel launch (the fill is
   not visible in this chunk — confirm). */
__constant__ GridParams d_gp;
|
|
|
/* ------------------------------------------------------------------ */
/* Device indexing helpers */
/* ------------------------------------------------------------------ */
/* Flat offset of 0-based grid point (i0,j0,k0) in an "all"-sized array. */
__device__ __forceinline__ int idx_ex_d(int i0, int j0, int k0) {
    const int nx = d_gp.ex[0];
    const int ny = d_gp.ex[1];
    return i0 + nx * (j0 + ny * k0);
}
|
|
|
/* ord=2 ghost-padded array: flat offset of Fortran-style index
   (iF,jF,kF), where the one-cell ghost margin means iF >= -1. */
__device__ __forceinline__ int idx_fh2(int iF, int jF, int kF) {
    const int sx = d_gp.fh2_nx;
    const int sy = d_gp.fh2_ny;
    return (iF + 1) + sx * ((jF + 1) + sy * (kF + 1));
}
|
|
|
/* ord=3 ghost-padded array: flat offset of Fortran-style index
   (iF,jF,kF), where the two-cell ghost margin means iF >= -2. */
__device__ __forceinline__ int idx_fh3(int iF, int jF, int kF) {
    const int sx = d_gp.fh3_nx;
    const int sy = d_gp.fh3_ny;
    return (iF + 2) + sx * ((jF + 2) + sy * (kF + 2));
}
|
|
|
/* ------------------------------------------------------------------ */
/* GPU buffer management */
/* ------------------------------------------------------------------ */
/*
 * Array slot indices — all arrays live in one big cudaMalloc block.
 * INPUT arrays (H2D): 39 slots
 * OUTPUT arrays (D2H): 52 slots
 * TEMPORARY arrays (GPU-only): ~65 slots
 * Plus 2 extended arrays for ghost-padded stencils (fh_ord2, fh_ord3)
 */

/* Total number of "all"-sized slots (capacity; NUM_USED_SLOTS <= this) */
#define NUM_SLOTS 160

/* Persistent device workspace: allocated once per grid size by
   ensure_gpu_buffers() and reused across calls to avoid per-call
   cudaMalloc/cudaFree churn. */
struct GpuBuffers {
    double *d_mem; /* single big allocation (NUM_USED_SLOTS * all doubles) */
    double *d_fh2; /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */
    double *d_fh3; /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */
    double *slot[NUM_SLOTS]; /* pointers into d_mem, one per slot enum value */
    int prev_nx, prev_ny, prev_nz; /* grid size of the current allocation */
    bool initialized; /* true once the pointers above are valid */
};

/* Single process-wide instance; no locking — assumes single-threaded
   host use. */
static GpuBuffers g_buf = { nullptr, nullptr, nullptr, {}, 0, 0, 0, false };
|
|
|
/* Slot assignments — INPUT (H2D) */
/* The enumerator ORDER defines the memory layout inside d_mem; it is the
   contract between the H2D upload, the kernels, and the D2H download.
   Never reorder or insert in the middle — append before NUM_USED_SLOTS. */
enum {
    S_chi=0, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
    S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz,
    S_Gamx, S_Gamy, S_Gamz,
    S_Lap, S_betax, S_betay, S_betaz,
    S_dtSfx, S_dtSfy, S_dtSfz,
    S_rho, S_Sx, S_Sy, S_Sz,
    S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz,
    S_X, S_Y, S_Z, /* coordinate arrays — only nx/ny/nz long */
    /* 37 input slots so far; X/Y/Z are special-sized */

    /* OUTPUT (D2H) */
    S_chi_rhs, S_trK_rhs,
    S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs,
    S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs,
    S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs,
    S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs,
    S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs,
    S_Gamxxx, S_Gamxxy, S_Gamxxz, S_Gamxyy, S_Gamxyz, S_Gamxzz,
    S_Gamyxx, S_Gamyxy, S_Gamyxz, S_Gamyyy, S_Gamyyz, S_Gamyzz,
    S_Gamzxx, S_Gamzxy, S_Gamzxz, S_Gamzyy, S_Gamzyz, S_Gamzzz,
    S_Rxx, S_Rxy, S_Rxz, S_Ryy, S_Ryz, S_Rzz,
    S_ham_Res, S_movx_Res, S_movy_Res, S_movz_Res,
    S_Gmx_Res, S_Gmy_Res, S_Gmz_Res,

    /* TEMPORARY (GPU-only) */
    S_gxx, S_gyy, S_gzz, /* physical metric = dxx+1 etc */
    S_alpn1, S_chin1,    /* 1+Lap, 1+chi (see kern_phase1_prep) */
    S_chix, S_chiy, S_chiz,
    S_gxxx, S_gxyx, S_gxzx, S_gyyx, S_gyzx, S_gzzx,
    S_gxxy, S_gxyy, S_gxzy, S_gyyy, S_gyzy, S_gzzy,
    S_gxxz, S_gxyz, S_gxzz, S_gyyz, S_gyzz, S_gzzz,
    S_Lapx, S_Lapy, S_Lapz,
    S_betaxx, S_betaxy, S_betaxz,
    S_betayx, S_betayy, S_betayz,
    S_betazx, S_betazy, S_betazz,
    S_Gamxx, S_Gamxy, S_Gamxz,
    S_Gamyx, S_Gamyy_t, S_Gamyz_t,
    S_Gamzx, S_Gamzy, S_Gamzz_t,
    S_Kx, S_Ky, S_Kz,
    S_div_beta, S_S_arr, S_f_arr,
    S_fxx, S_fxy, S_fxz, S_fyy, S_fyz, S_fzz,
    S_Gamxa, S_Gamya, S_Gamza,
    S_gupxx, S_gupxy, S_gupxz,
    S_gupyy, S_gupyz, S_gupzz,
    NUM_USED_SLOTS
};

static_assert(NUM_USED_SLOTS <= NUM_SLOTS, "Increase NUM_SLOTS");
|
|
|
|
/* (Re)allocate the persistent GPU workspace when the grid size changes.
   All "all"-sized slots live in one big allocation; the two ghost-padded
   scratch arrays are allocated separately.  No-op when the current
   allocation already matches (nx,ny,nz). */
static void ensure_gpu_buffers(int nx, int ny, int nz) {
    const bool unchanged = g_buf.initialized &&
                           g_buf.prev_nx == nx &&
                           g_buf.prev_ny == ny &&
                           g_buf.prev_nz == nz;
    if (unchanged)
        return;

    /* Release any previous allocation before resizing. */
    if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
    if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
    if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }

    const size_t all    = (size_t)nx * ny * nz;
    const size_t n_fh2  = (size_t)(nx + 2) * (ny + 2) * (nz + 2);
    const size_t n_fh3  = (size_t)(nx + 3) * (ny + 3) * (nz + 3);

    CUDA_CHECK(cudaMalloc(&g_buf.d_mem, NUM_USED_SLOTS * all * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, n_fh2 * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, n_fh3 * sizeof(double)));

    /* Carve the big block into per-slot views. */
    for (int s = 0; s < NUM_USED_SLOTS; ++s)
        g_buf.slot[s] = g_buf.d_mem + s * all;

    g_buf.prev_nx = nx;
    g_buf.prev_ny = ny;
    g_buf.prev_nz = nz;
    g_buf.initialized = true;
}
|
|
|
|
/* ================================================================== */
/* A. Symmetry boundary kernels (ord=2 and ord=3) */
/* ================================================================== */

/* Step 1: copy the interior grid into the ord=2 ghost-padded scratch
   array.  (SoA0..SoA2 are accepted only so all symbd kernels share a
   uniform launch signature; they are unused here.) */
__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
                                              double * __restrict__ fh,
                                              double SoA0, double SoA1, double SoA2)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int stride = blockDim.x * gridDim.x;
    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < d_gp.all; tid += stride) {
        const int i0 = tid % nx;
        const int j0 = (tid / nx) % ny;
        const int k0 = tid / (nx * ny);
        /* interior point (i0,j0,k0) maps to Fortran (i0+1, j0+1, k0+1) */
        fh[idx_fh2(i0 + 1, j0 + 1, k0 + 1)] = func[tid];
    }
}
|
|
|
/* Step 2: x-direction symmetry ghosts (ord=2, two layers).
   Fortran: do ii=0,ord-1: funcc(-ii,jF,kF) = funcc(ii+1,jF,kF)*SoA(1)
   over jF=1..ny, kF=1..nz. */
__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0)
{
    const int ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int plane  = ny * nz;      /* one (jF,kF) sheet */
    const int work   = plane * 2;    /* two ghost layers */
    const int stride = blockDim.x * gridDim.x;
    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < work; tid += stride) {
        const int ii  = tid / plane;     /* ghost layer: 0 or 1 */
        const int rem = tid % plane;
        const int jF  = rem % ny + 1;
        const int kF  = rem / ny + 1;
        /* mirror iF = ii+1 into iF = -ii with the symmetry sign */
        fh[idx_fh2(-ii, jF, kF)] = fh[idx_fh2(ii + 1, jF, kF)] * SoA0;
    }
}
|
|
|
/* Step 3: y-direction symmetry ghosts (ord=2).  The i range already
   includes the freshly-filled x ghosts (iF = -1 .. nx). */
__global__ void kern_symbd_jghost_ord2(double * __restrict__ fh, double SoA1)
{
    const int nx = d_gp.ex[0], nz = d_gp.ex[2];
    const int irange = nx + 2;       /* iF = -1 .. nx */
    const int plane  = irange * nz;
    const int work   = plane * 2;
    const int stride = blockDim.x * gridDim.x;
    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < work; tid += stride) {
        const int jj  = tid / plane;
        const int rem = tid % plane;
        const int iF  = rem % irange - 1;
        const int kF  = rem / irange + 1;
        fh[idx_fh2(iF, -jj, kF)] = fh[idx_fh2(iF, jj + 1, kF)] * SoA1;
    }
}
|
|
|
/* Step 4: z-direction symmetry ghosts (ord=2).  Both the i and j ranges
   include already-filled ghosts (iF, jF = -1 .. n). */
__global__ void kern_symbd_kghost_ord2(double * __restrict__ fh, double SoA2)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int irange = nx + 2;
    const int jrange = ny + 2;
    const int plane  = irange * jrange;
    const int work   = plane * 2;
    const int stride = blockDim.x * gridDim.x;
    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < work; tid += stride) {
        const int kk  = tid / plane;
        const int rem = tid % plane;
        const int iF  = rem % irange - 1;
        const int jF  = rem / irange - 1;
        fh[idx_fh2(iF, jF, -kk)] = fh[idx_fh2(iF, jF, kk + 1)] * SoA2;
    }
}
|
|
|
/* ---- ord=3 variants (for lopsided / kodis) ---- */

/* Copy the interior grid into the ord=3 (two-cell-ghost) scratch array. */
__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
                                              double * __restrict__ fh)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int stride = blockDim.x * gridDim.x;
    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < d_gp.all; tid += stride) {
        const int i0 = tid % nx;
        const int j0 = (tid / nx) % ny;
        const int k0 = tid / (nx * ny);
        fh[idx_fh3(i0 + 1, j0 + 1, k0 + 1)] = func[tid];
    }
}
|
|
|
/* x-direction symmetry ghosts, three layers (ord=3). */
__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0)
{
    const int ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int plane  = ny * nz;
    const int work   = plane * 3;    /* three ghost layers */
    const int stride = blockDim.x * gridDim.x;
    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < work; tid += stride) {
        const int ii  = tid / plane;     /* 0, 1 or 2 */
        const int rem = tid % plane;
        const int jF  = rem % ny + 1;
        const int kF  = rem / ny + 1;
        fh[idx_fh3(-ii, jF, kF)] = fh[idx_fh3(ii + 1, jF, kF)] * SoA0;
    }
}
|
|
|
/* y-direction symmetry ghosts, three layers (ord=3); the i range
   includes the x ghosts (iF = -2 .. nx). */
__global__ void kern_symbd_jghost_ord3(double * __restrict__ fh, double SoA1)
{
    const int nx = d_gp.ex[0], nz = d_gp.ex[2];
    const int irange = nx + 3;
    const int plane  = irange * nz;
    const int work   = plane * 3;
    const int stride = blockDim.x * gridDim.x;
    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < work; tid += stride) {
        const int jj  = tid / plane;
        const int rem = tid % plane;
        const int iF  = rem % irange - 2;
        const int kF  = rem / irange + 1;
        fh[idx_fh3(iF, -jj, kF)] = fh[idx_fh3(iF, jj + 1, kF)] * SoA1;
    }
}
|
|
|
/* z-direction symmetry ghosts, three layers (ord=3); the i and j ranges
   include already-filled ghosts (iF, jF = -2 .. n). */
__global__ void kern_symbd_kghost_ord3(double * __restrict__ fh, double SoA2)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int irange = nx + 3;
    const int jrange = ny + 3;
    const int plane  = irange * jrange;
    const int work   = plane * 3;
    const int stride = blockDim.x * gridDim.x;
    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < work; tid += stride) {
        const int kk  = tid / plane;
        const int rem = tid % plane;
        const int iF  = rem % irange - 2;
        const int jF  = rem / irange - 2;
        fh[idx_fh3(iF, jF, -kk)] = fh[idx_fh3(iF, jF, kk + 1)] * SoA2;
    }
}
|
|
|
/* ================================================================== */
/* B. Stencil kernels */
/* ================================================================== */

/* ---- First derivatives (ord=2, 4th/2nd order) ---- */
/*
 * Computes (fx,fy,fz) = grad(f) from the ord=2 ghost-padded array fh.
 * Each point uses the 4th-order centered stencil (1,-8,0,8,-1)/12 when
 * the +-2 neighbourhood lies inside [iminF..imaxF] etc., falls back to
 * 2nd-order centered (-1,0,1)/2 for a +-1 neighbourhood, and writes 0
 * otherwise.  The outermost layer (i0 == nx-1 etc.) is always zeroed.
 */
__global__ __launch_bounds__(128, 4)
void kern_fderivs(const double * __restrict__ fh,
                  double * __restrict__ fx,
                  double * __restrict__ fy,
                  double * __restrict__ fz)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        int i0 = tid % nx;
        int j0 = (tid / nx) % ny;
        int k0 = tid / (nx * ny);

        /* boundary points: leave as zero */
        if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
            fx[tid] = 0.0; fy[tid] = 0.0; fz[tid] = 0.0;
            continue;
        }

        /* Fortran 1-based index of this point */
        int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;

        /* 4th-order centered stencil where the +-2 halo fits */
        if ((iF+2) <= imaxF && (iF-2) >= iminF &&
            (jF+2) <= jmaxF && (jF-2) >= jminF &&
            (kF+2) <= kmaxF && (kF-2) >= kminF)
        {
            fx[tid] = d_gp.d12dx * (
                fh[idx_fh2(iF-2,jF,kF)] - 8.0*fh[idx_fh2(iF-1,jF,kF)]
                + 8.0*fh[idx_fh2(iF+1,jF,kF)] - fh[idx_fh2(iF+2,jF,kF)]);
            fy[tid] = d_gp.d12dy * (
                fh[idx_fh2(iF,jF-2,kF)] - 8.0*fh[idx_fh2(iF,jF-1,kF)]
                + 8.0*fh[idx_fh2(iF,jF+1,kF)] - fh[idx_fh2(iF,jF+2,kF)]);
            fz[tid] = d_gp.d12dz * (
                fh[idx_fh2(iF,jF,kF-2)] - 8.0*fh[idx_fh2(iF,jF,kF-1)]
                + 8.0*fh[idx_fh2(iF,jF,kF+1)] - fh[idx_fh2(iF,jF,kF+2)]);
        }
        /* 2nd-order centered fallback where only the +-1 halo fits */
        else if ((iF+1) <= imaxF && (iF-1) >= iminF &&
                 (jF+1) <= jmaxF && (jF-1) >= jminF &&
                 (kF+1) <= kmaxF && (kF-1) >= kminF)
        {
            fx[tid] = d_gp.d2dx * (
                -fh[idx_fh2(iF-1,jF,kF)] + fh[idx_fh2(iF+1,jF,kF)]);
            fy[tid] = d_gp.d2dy * (
                -fh[idx_fh2(iF,jF-1,kF)] + fh[idx_fh2(iF,jF+1,kF)]);
            fz[tid] = d_gp.d2dz * (
                -fh[idx_fh2(iF,jF,kF-1)] + fh[idx_fh2(iF,jF,kF+1)]);
        }
        else {
            fx[tid] = 0.0; fy[tid] = 0.0; fz[tid] = 0.0;
        }
    }
}
|
|
|
/* ---- Second derivatives (ord=2, 4th/2nd order) ---- */
/*
 * Computes the full Hessian of f (fxx..fzz) from the ord=2 ghost-padded
 * array fh.  Diagonal terms use the 4th-order (-1,16,-30,16,-1)/12h^2
 * stencil; cross terms nest two 4-point first-derivative stencils
 * (hence the 1/(144*h1*h2) Fdxdy-style coefficients).  Where the +-2
 * halo does not fit, falls back to the 2nd-order 3-point / 4-corner
 * stencils; otherwise writes 0.  Outermost layer is always zeroed.
 */
__global__ __launch_bounds__(128, 4)
void kern_fdderivs(const double * __restrict__ fh,
                   double * __restrict__ fxx, double * __restrict__ fxy,
                   double * __restrict__ fxz, double * __restrict__ fyy,
                   double * __restrict__ fyz, double * __restrict__ fzz)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        int i0 = tid % nx;
        int j0 = (tid / nx) % ny;
        int k0 = tid / (nx * ny);

        /* boundary layer: zero all six outputs */
        if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
            fxx[tid]=0; fxy[tid]=0; fxz[tid]=0;
            fyy[tid]=0; fyz[tid]=0; fzz[tid]=0;
            continue;
        }

        int iF = i0+1, jF = j0+1, kF = k0+1;

        if ((iF+2)<=imaxF && (iF-2)>=iminF &&
            (jF+2)<=jmaxF && (jF-2)>=jminF &&
            (kF+2)<=kmaxF && (kF-2)>=kminF)
        {
            /* 4th-order diagonal */
            double c = fh[idx_fh2(iF,jF,kF)];
            fxx[tid] = d_gp.Fdxdx*(
                -fh[idx_fh2(iF-2,jF,kF)] + 16.0*fh[idx_fh2(iF-1,jF,kF)]
                -30.0*c + 16.0*fh[idx_fh2(iF+1,jF,kF)] - fh[idx_fh2(iF+2,jF,kF)]);
            fyy[tid] = d_gp.Fdydy*(
                -fh[idx_fh2(iF,jF-2,kF)] + 16.0*fh[idx_fh2(iF,jF-1,kF)]
                -30.0*c + 16.0*fh[idx_fh2(iF,jF+1,kF)] - fh[idx_fh2(iF,jF+2,kF)]);
            fzz[tid] = d_gp.Fdzdz*(
                -fh[idx_fh2(iF,jF,kF-2)] + 16.0*fh[idx_fh2(iF,jF,kF-1)]
                -30.0*c + 16.0*fh[idx_fh2(iF,jF,kF+1)] - fh[idx_fh2(iF,jF,kF+2)]);

            /* 4th-order cross: fxy — each t_* is an x first-derivative
               stencil evaluated on a fixed j-row, then the same 4-point
               stencil is applied across rows */
            {
                double t_jm2 = fh[idx_fh2(iF-2,jF-2,kF)] - 8.0*fh[idx_fh2(iF-1,jF-2,kF)]
                             + 8.0*fh[idx_fh2(iF+1,jF-2,kF)] - fh[idx_fh2(iF+2,jF-2,kF)];
                double t_jm1 = fh[idx_fh2(iF-2,jF-1,kF)] - 8.0*fh[idx_fh2(iF-1,jF-1,kF)]
                             + 8.0*fh[idx_fh2(iF+1,jF-1,kF)] - fh[idx_fh2(iF+2,jF-1,kF)];
                double t_jp1 = fh[idx_fh2(iF-2,jF+1,kF)] - 8.0*fh[idx_fh2(iF-1,jF+1,kF)]
                             + 8.0*fh[idx_fh2(iF+1,jF+1,kF)] - fh[idx_fh2(iF+2,jF+1,kF)];
                double t_jp2 = fh[idx_fh2(iF-2,jF+2,kF)] - 8.0*fh[idx_fh2(iF-1,jF+2,kF)]
                             + 8.0*fh[idx_fh2(iF+1,jF+2,kF)] - fh[idx_fh2(iF+2,jF+2,kF)];
                fxy[tid] = d_gp.Fdxdy*(t_jm2 - 8.0*t_jm1 + 8.0*t_jp1 - t_jp2);
            }
            /* 4th-order cross: fxz */
            {
                double t_km2 = fh[idx_fh2(iF-2,jF,kF-2)] - 8.0*fh[idx_fh2(iF-1,jF,kF-2)]
                             + 8.0*fh[idx_fh2(iF+1,jF,kF-2)] - fh[idx_fh2(iF+2,jF,kF-2)];
                double t_km1 = fh[idx_fh2(iF-2,jF,kF-1)] - 8.0*fh[idx_fh2(iF-1,jF,kF-1)]
                             + 8.0*fh[idx_fh2(iF+1,jF,kF-1)] - fh[idx_fh2(iF+2,jF,kF-1)];
                double t_kp1 = fh[idx_fh2(iF-2,jF,kF+1)] - 8.0*fh[idx_fh2(iF-1,jF,kF+1)]
                             + 8.0*fh[idx_fh2(iF+1,jF,kF+1)] - fh[idx_fh2(iF+2,jF,kF+1)];
                double t_kp2 = fh[idx_fh2(iF-2,jF,kF+2)] - 8.0*fh[idx_fh2(iF-1,jF,kF+2)]
                             + 8.0*fh[idx_fh2(iF+1,jF,kF+2)] - fh[idx_fh2(iF+2,jF,kF+2)];
                fxz[tid] = d_gp.Fdxdz*(t_km2 - 8.0*t_km1 + 8.0*t_kp1 - t_kp2);
            }
            /* 4th-order cross: fyz */
            {
                double t_km2 = fh[idx_fh2(iF,jF-2,kF-2)] - 8.0*fh[idx_fh2(iF,jF-1,kF-2)]
                             + 8.0*fh[idx_fh2(iF,jF+1,kF-2)] - fh[idx_fh2(iF,jF+2,kF-2)];
                double t_km1 = fh[idx_fh2(iF,jF-2,kF-1)] - 8.0*fh[idx_fh2(iF,jF-1,kF-1)]
                             + 8.0*fh[idx_fh2(iF,jF+1,kF-1)] - fh[idx_fh2(iF,jF+2,kF-1)];
                double t_kp1 = fh[idx_fh2(iF,jF-2,kF+1)] - 8.0*fh[idx_fh2(iF,jF-1,kF+1)]
                             + 8.0*fh[idx_fh2(iF,jF+1,kF+1)] - fh[idx_fh2(iF,jF+2,kF+1)];
                double t_kp2 = fh[idx_fh2(iF,jF-2,kF+2)] - 8.0*fh[idx_fh2(iF,jF-1,kF+2)]
                             + 8.0*fh[idx_fh2(iF,jF+1,kF+2)] - fh[idx_fh2(iF,jF+2,kF+2)];
                fyz[tid] = d_gp.Fdydz*(t_km2 - 8.0*t_km1 + 8.0*t_kp1 - t_kp2);
            }
        }
        /* 2nd-order fallback where only the +-1 halo fits */
        else if ((iF+1)<=imaxF && (iF-1)>=iminF &&
                 (jF+1)<=jmaxF && (jF-1)>=jminF &&
                 (kF+1)<=kmaxF && (kF-1)>=kminF)
        {
            double c = fh[idx_fh2(iF,jF,kF)];
            fxx[tid] = d_gp.Sdxdx*(fh[idx_fh2(iF-1,jF,kF)] - 2.0*c + fh[idx_fh2(iF+1,jF,kF)]);
            fyy[tid] = d_gp.Sdydy*(fh[idx_fh2(iF,jF-1,kF)] - 2.0*c + fh[idx_fh2(iF,jF+1,kF)]);
            fzz[tid] = d_gp.Sdzdz*(fh[idx_fh2(iF,jF,kF-1)] - 2.0*c + fh[idx_fh2(iF,jF,kF+1)]);
            fxy[tid] = d_gp.Sdxdy*(fh[idx_fh2(iF-1,jF-1,kF)] - fh[idx_fh2(iF+1,jF-1,kF)]
                                  -fh[idx_fh2(iF-1,jF+1,kF)] + fh[idx_fh2(iF+1,jF+1,kF)]);
            fxz[tid] = d_gp.Sdxdz*(fh[idx_fh2(iF-1,jF,kF-1)] - fh[idx_fh2(iF+1,jF,kF-1)]
                                  -fh[idx_fh2(iF-1,jF,kF+1)] + fh[idx_fh2(iF+1,jF,kF+1)]);
            fyz[tid] = d_gp.Sdydz*(fh[idx_fh2(iF,jF-1,kF-1)] - fh[idx_fh2(iF,jF+1,kF-1)]
                                  -fh[idx_fh2(iF,jF-1,kF+1)] + fh[idx_fh2(iF,jF+1,kF+1)]);
        }
        else {
            fxx[tid]=0; fxy[tid]=0; fxz[tid]=0;
            fyy[tid]=0; fyz[tid]=0; fzz[tid]=0;
        }
    }
}
|
|
|
/* ---- Lopsided (upwind advection) kernel ---- */
/*
 * Accumulates the shift-advection contribution  Sf^i d_i f  into f_rhs
 * (note: "+=", the caller must have initialized f_rhs) using 3rd-order
 * lopsided stencils biased in the upwind direction of each Sf component,
 * on the ord=3 ghost-padded array fh.  Near the far (non-symmetry)
 * boundary the stencil degrades: centered 4th-order, then the
 * opposite-sided lopsided stencil (applied with flipped sign).
 * The outermost layer (i0 == nx-1 etc.) is skipped entirely.
 *
 * NOTE(review): the lower-bound guards here compare the 0-based index
 * i0/j0/k0 against iminF3/jminF3/kminF3, whereas kern_kodis compares the
 * Fortran 1-based iF/jF/kF against the same bounds — the two conventions
 * differ by one; confirm which one matches the Fortran reference.
 */
__global__ __launch_bounds__(128, 4)
void kern_lopsided(const double * __restrict__ fh,
                   double * __restrict__ f_rhs,
                   const double * __restrict__ Sfx,
                   const double * __restrict__ Sfy,
                   const double * __restrict__ Sfz)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        int i0 = tid % nx;
        int j0 = (tid / nx) % ny;
        int k0 = tid / (nx * ny);

        /* outermost layer: no advection contribution */
        if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) continue;

        int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
        double val = 0.0;

        /* --- x direction --- */
        double sfx = Sfx[tid];
        if (sfx > 0.0) {
            /* forward-biased lopsided stencil (needs iF+3) */
            if (i0 <= nx - 4) {
                val += sfx * d_gp.d12dx * (
                    -3.0*fh[idx_fh3(iF-1,jF,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF+1,jF,kF)] - 6.0*fh[idx_fh3(iF+2,jF,kF)]
                    + fh[idx_fh3(iF+3,jF,kF)]);
            } else if (i0 <= nx - 3) {
                /* centered 4th-order fallback */
                val += sfx * d_gp.d12dx * (
                    fh[idx_fh3(iF-2,jF,kF)] - 8.0*fh[idx_fh3(iF-1,jF,kF)]
                    +8.0*fh[idx_fh3(iF+1,jF,kF)] - fh[idx_fh3(iF+2,jF,kF)]);
            } else if (i0 <= nx - 2) {
                /* backward-biased stencil, sign-flipped */
                val -= sfx * d_gp.d12dx * (
                    -3.0*fh[idx_fh3(iF+1,jF,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF-1,jF,kF)] - 6.0*fh[idx_fh3(iF-2,jF,kF)]
                    + fh[idx_fh3(iF-3,jF,kF)]);
            }
        } else if (sfx < 0.0) {
            /* mirror image of the above for upwind = -x */
            if ((i0 - 2) >= iminF) {
                val -= sfx * d_gp.d12dx * (
                    -3.0*fh[idx_fh3(iF+1,jF,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF-1,jF,kF)] - 6.0*fh[idx_fh3(iF-2,jF,kF)]
                    + fh[idx_fh3(iF-3,jF,kF)]);
            } else if ((i0 - 1) >= iminF) {
                val += sfx * d_gp.d12dx * (
                    fh[idx_fh3(iF-2,jF,kF)] - 8.0*fh[idx_fh3(iF-1,jF,kF)]
                    +8.0*fh[idx_fh3(iF+1,jF,kF)] - fh[idx_fh3(iF+2,jF,kF)]);
            } else if (i0 >= iminF) {
                val += sfx * d_gp.d12dx * (
                    -3.0*fh[idx_fh3(iF-1,jF,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF+1,jF,kF)] - 6.0*fh[idx_fh3(iF+2,jF,kF)]
                    + fh[idx_fh3(iF+3,jF,kF)]);
            }
        }

        /* --- y direction (same structure as x) --- */
        double sfy = Sfy[tid];
        if (sfy > 0.0) {
            if (j0 <= ny - 4) {
                val += sfy * d_gp.d12dy * (
                    -3.0*fh[idx_fh3(iF,jF-1,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF+1,kF)] - 6.0*fh[idx_fh3(iF,jF+2,kF)]
                    + fh[idx_fh3(iF,jF+3,kF)]);
            } else if (j0 <= ny - 3) {
                val += sfy * d_gp.d12dy * (
                    fh[idx_fh3(iF,jF-2,kF)] - 8.0*fh[idx_fh3(iF,jF-1,kF)]
                    +8.0*fh[idx_fh3(iF,jF+1,kF)] - fh[idx_fh3(iF,jF+2,kF)]);
            } else if (j0 <= ny - 2) {
                val -= sfy * d_gp.d12dy * (
                    -3.0*fh[idx_fh3(iF,jF+1,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF-1,kF)] - 6.0*fh[idx_fh3(iF,jF-2,kF)]
                    + fh[idx_fh3(iF,jF-3,kF)]);
            }
        } else if (sfy < 0.0) {
            if ((j0 - 2) >= jminF) {
                val -= sfy * d_gp.d12dy * (
                    -3.0*fh[idx_fh3(iF,jF+1,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF-1,kF)] - 6.0*fh[idx_fh3(iF,jF-2,kF)]
                    + fh[idx_fh3(iF,jF-3,kF)]);
            } else if ((j0 - 1) >= jminF) {
                val += sfy * d_gp.d12dy * (
                    fh[idx_fh3(iF,jF-2,kF)] - 8.0*fh[idx_fh3(iF,jF-1,kF)]
                    +8.0*fh[idx_fh3(iF,jF+1,kF)] - fh[idx_fh3(iF,jF+2,kF)]);
            } else if (j0 >= jminF) {
                val += sfy * d_gp.d12dy * (
                    -3.0*fh[idx_fh3(iF,jF-1,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF+1,kF)] - 6.0*fh[idx_fh3(iF,jF+2,kF)]
                    + fh[idx_fh3(iF,jF+3,kF)]);
            }
        }

        /* --- z direction (same structure as x) --- */
        double sfz = Sfz[tid];
        if (sfz > 0.0) {
            if (k0 <= nz - 4) {
                val += sfz * d_gp.d12dz * (
                    -3.0*fh[idx_fh3(iF,jF,kF-1)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF,kF+1)] - 6.0*fh[idx_fh3(iF,jF,kF+2)]
                    + fh[idx_fh3(iF,jF,kF+3)]);
            } else if (k0 <= nz - 3) {
                val += sfz * d_gp.d12dz * (
                    fh[idx_fh3(iF,jF,kF-2)] - 8.0*fh[idx_fh3(iF,jF,kF-1)]
                    +8.0*fh[idx_fh3(iF,jF,kF+1)] - fh[idx_fh3(iF,jF,kF+2)]);
            } else if (k0 <= nz - 2) {
                val -= sfz * d_gp.d12dz * (
                    -3.0*fh[idx_fh3(iF,jF,kF+1)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF,kF-1)] - 6.0*fh[idx_fh3(iF,jF,kF-2)]
                    + fh[idx_fh3(iF,jF,kF-3)]);
            }
        } else if (sfz < 0.0) {
            if ((k0 - 2) >= kminF) {
                val -= sfz * d_gp.d12dz * (
                    -3.0*fh[idx_fh3(iF,jF,kF+1)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF,kF-1)] - 6.0*fh[idx_fh3(iF,jF,kF-2)]
                    + fh[idx_fh3(iF,jF,kF-3)]);
            } else if ((k0 - 1) >= kminF) {
                val += sfz * d_gp.d12dz * (
                    fh[idx_fh3(iF,jF,kF-2)] - 8.0*fh[idx_fh3(iF,jF,kF-1)]
                    +8.0*fh[idx_fh3(iF,jF,kF+1)] - fh[idx_fh3(iF,jF,kF+2)]);
            } else if (k0 >= kminF) {
                val += sfz * d_gp.d12dz * (
                    -3.0*fh[idx_fh3(iF,jF,kF-1)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF,kF+1)] - 6.0*fh[idx_fh3(iF,jF,kF+2)]
                    + fh[idx_fh3(iF,jF,kF+3)]);
            }
        }

        f_rhs[tid] += val;
    }
}
|
|
|
/* ---- KO dissipation kernel (ord=3, 6th-order) ---- */
/*
 * Accumulates Kreiss-Oliger dissipation into f_rhs:
 *   f_rhs += (eps/64) * (D6x f / dX + D6y f / dY + D6z f / dZ)
 * where D6 is the 6th-difference stencil (1,-6,15,-20,15,-6,1) applied
 * on the ord=3 ghost-padded array fh.  Points whose full 7-point stencil
 * does not fit inside [iminF3..imaxF] etc. are left untouched.
 * NOTE(review): each difference is divided by the FIRST power of the
 * grid spacing — assumed to match bssn_rhs_c.C; confirm.
 */
__global__ __launch_bounds__(128, 4)
void kern_kodis(const double * __restrict__ fh,
                double * __restrict__ f_rhs,
                double eps_val)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3;
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const double cof = 64.0;   /* standard KO normalization 2^(2p) for p=3 */

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        int i0 = tid % nx;
        int j0 = (tid / nx) % ny;
        int k0 = tid / (nx * ny);
        int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;

        /* NOTE(review): bounds use Fortran 1-based iF against iminF3,
           unlike kern_lopsided which uses 0-based i0 — see note above. */
        if ((iF-3) >= iminF && (iF+3) <= imaxF &&
            (jF-3) >= jminF && (jF+3) <= jmaxF &&
            (kF-3) >= kminF && (kF+3) <= kmaxF)
        {
            double Dx = (fh[idx_fh3(iF-3,jF,kF)] + fh[idx_fh3(iF+3,jF,kF)])
                       - 6.0*(fh[idx_fh3(iF-2,jF,kF)] + fh[idx_fh3(iF+2,jF,kF)])
                       +15.0*(fh[idx_fh3(iF-1,jF,kF)] + fh[idx_fh3(iF+1,jF,kF)])
                       -20.0* fh[idx_fh3(iF,jF,kF)];
            Dx /= d_gp.dX;

            double Dy = (fh[idx_fh3(iF,jF-3,kF)] + fh[idx_fh3(iF,jF+3,kF)])
                       - 6.0*(fh[idx_fh3(iF,jF-2,kF)] + fh[idx_fh3(iF,jF+2,kF)])
                       +15.0*(fh[idx_fh3(iF,jF-1,kF)] + fh[idx_fh3(iF,jF+1,kF)])
                       -20.0* fh[idx_fh3(iF,jF,kF)];
            Dy /= d_gp.dY;

            double Dz = (fh[idx_fh3(iF,jF,kF-3)] + fh[idx_fh3(iF,jF,kF+3)])
                       - 6.0*(fh[idx_fh3(iF,jF,kF-2)] + fh[idx_fh3(iF,jF,kF+2)])
                       +15.0*(fh[idx_fh3(iF,jF,kF-1)] + fh[idx_fh3(iF,jF,kF+1)])
                       -20.0* fh[idx_fh3(iF,jF,kF)];
            Dz /= d_gp.dZ;

            f_rhs[tid] += (eps_val / cof) * (Dx + Dy + Dz);
        }
    }
}
|
|
|
/* ================================================================== */
/* Host wrapper helpers */
/* ================================================================== */
/* Launch configuration: fixed block size, ceil-divide for the grid. */
static const int BLK = 128;
static inline int grid(int nElems) {
    const int nBlocks = (nElems + BLK - 1) / BLK;   /* ceil(nElems/BLK) */
    return nBlocks;
}
|
|
|
/* symmetry_bd on GPU for ord=2, then launch fderivs kernel.
 * d_f       : input field (device, "all"-sized)
 * d_fx/fy/fz: output first derivatives (device, "all"-sized)
 * SoA0..2   : per-direction symmetry signs applied at the ghost planes
 * all       : nx*ny*nz, used to size the launches
 * The four symbd launches must run in order (each reads ghosts the
 * previous one filled); they do on the default stream.  A
 * cudaGetLastError() check surfaces launch-configuration errors that
 * would otherwise be silently dropped. */
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
                        double SoA0, double SoA1, double SoA2, int all)
{
    double *fh = g_buf.d_fh2;
    kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
    kern_symbd_ighost_ord2<<<grid(all), BLK>>>(fh, SoA0);
    kern_symbd_jghost_ord2<<<grid(all), BLK>>>(fh, SoA1);
    kern_symbd_kghost_ord2<<<grid(all), BLK>>>(fh, SoA2);
    kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
    CUDA_CHECK(cudaGetLastError());  /* catch bad launch configurations */
}
|
|
|
/* symmetry_bd on GPU for ord=2, then launch fdderivs kernel.
 * Fills the ord=2 ghost array from d_f (with symmetry signs SoA0..2)
 * and computes all six second derivatives into d_fxx..d_fzz.
 * Adds cudaGetLastError() so launch-configuration errors are not
 * silently dropped. */
static void gpu_fdderivs(double *d_f,
                         double *d_fxx, double *d_fxy, double *d_fxz,
                         double *d_fyy, double *d_fyz, double *d_fzz,
                         double SoA0, double SoA1, double SoA2, int all)
{
    double *fh = g_buf.d_fh2;
    kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
    kern_symbd_ighost_ord2<<<grid(all), BLK>>>(fh, SoA0);
    kern_symbd_jghost_ord2<<<grid(all), BLK>>>(fh, SoA1);
    kern_symbd_kghost_ord2<<<grid(all), BLK>>>(fh, SoA2);
    kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
    CUDA_CHECK(cudaGetLastError());  /* catch bad launch configurations */
}
|
|
|
/* symmetry_bd on GPU for ord=3, then launch lopsided kernel.
 * Fills the ord=3 ghost array from d_f (symmetry signs SoA0..2) and
 * accumulates the upwind advection term Sf^i d_i f into d_f_rhs.
 * Adds cudaGetLastError() so launch-configuration errors are not
 * silently dropped. */
static void gpu_lopsided(double *d_f, double *d_f_rhs,
                         double *d_Sfx, double *d_Sfy, double *d_Sfz,
                         double SoA0, double SoA1, double SoA2, int all)
{
    double *fh = g_buf.d_fh3;
    kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
    kern_symbd_ighost_ord3<<<grid(all), BLK>>>(fh, SoA0);
    kern_symbd_jghost_ord3<<<grid(all), BLK>>>(fh, SoA1);
    kern_symbd_kghost_ord3<<<grid(all), BLK>>>(fh, SoA2);
    kern_lopsided<<<grid(all), BLK>>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz);
    CUDA_CHECK(cudaGetLastError());  /* catch bad launch configurations */
}
|
|
|
/* symmetry_bd on GPU for ord=3, then launch kodis kernel.
 * Fills the ord=3 ghost array from d_f (symmetry signs SoA0..2) and
 * accumulates Kreiss-Oliger dissipation (strength eps_val) into
 * d_f_rhs.  Adds cudaGetLastError() so launch-configuration errors are
 * not silently dropped. */
static void gpu_kodis(double *d_f, double *d_f_rhs,
                      double SoA0, double SoA1, double SoA2,
                      double eps_val, int all)
{
    double *fh = g_buf.d_fh3;
    kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
    kern_symbd_ighost_ord3<<<grid(all), BLK>>>(fh, SoA0);
    kern_symbd_jghost_ord3<<<grid(all), BLK>>>(fh, SoA1);
    kern_symbd_kghost_ord3<<<grid(all), BLK>>>(fh, SoA2);
    kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
    CUDA_CHECK(cudaGetLastError());  /* catch bad launch configurations */
}
|
|
|
/* ================================================================== */
/* C. Point-wise computation kernels */
/* ================================================================== */

/* Phase 1: form the "+1" working variables — alpn1 = 1+Lap,
   chin1 = 1+chi, and gii = dii + 1 for the three metric diagonals. */
__global__ void kern_phase1_prep(
    const double* __restrict__ Lap, const double* __restrict__ chi,
    const double* __restrict__ dxx, const double* __restrict__ dyy,
    const double* __restrict__ dzz,
    double* __restrict__ alpn1, double* __restrict__ chin1,
    double* __restrict__ gxx, double* __restrict__ gyy, double* __restrict__ gzz)
{
    const int stride = blockDim.x * gridDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_gp.all; idx += stride) {
        alpn1[idx] = Lap[idx] + 1.0;
        chin1[idx] = chi[idx] + 1.0;
        gxx[idx]   = dxx[idx] + 1.0;
        gyy[idx]   = dyy[idx] + 1.0;
        gzz[idx]   = dzz[idx] + 1.0;
    }
}
|
|
|
/* Phase 2a: chi_rhs, gij_rhs, div_beta */
/*
 * Computes div(beta) and the Lie-derivative part of the BSSN evolution
 * equations (advection terms are added separately by kern_lopsided):
 *   chi_rhs = (2/3) chi (alpha K - div beta)
 *   gij_rhs = -2 alpha Aij + gik dj(beta^k) + gjk di(beta^k)
 *             - (2/3) gij div beta
 * The off-diagonal forms below are the same formula with the gij terms
 * algebraically collected, e.g. for xy:
 *   +(1/3) gxy db - gxy betazz  ==  gxy(bx,x + by,y) - (2/3) gxy db.
 * Naming: beta{a}{b} is assumed to be d_b beta^a (e.g. betaxy = dy of
 * beta^x) — consistent with the derivative-slot naming; confirm against
 * the host driver that fills these slots.
 */
__global__ void kern_phase2_metric_rhs(
    const double* __restrict__ alpn1, const double* __restrict__ chin1,
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ trK,
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    const double* __restrict__ betaxx, const double* __restrict__ betaxy,
    const double* __restrict__ betaxz, const double* __restrict__ betayx,
    const double* __restrict__ betayy, const double* __restrict__ betayz,
    const double* __restrict__ betazx, const double* __restrict__ betazy,
    const double* __restrict__ betazz,
    double* __restrict__ div_beta,
    double* __restrict__ chi_rhs, double* __restrict__ gxx_rhs,
    double* __restrict__ gyy_rhs, double* __restrict__ gzz_rhs,
    double* __restrict__ gxy_rhs, double* __restrict__ gyz_rhs,
    double* __restrict__ gxz_rhs)
{
    const double F2o3 = 2.0/3.0, F1o3 = 1.0/3.0, TWO = 2.0;
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) {
        /* div beta = dx beta^x + dy beta^y + dz beta^z */
        double db = betaxx[i] + betayy[i] + betazz[i];
        div_beta[i] = db;
        chi_rhs[i] = F2o3 * chin1[i] * (alpn1[i] * trK[i] - db);
        /* diagonal components */
        gxx_rhs[i] = -TWO*alpn1[i]*Axx[i] - F2o3*gxx[i]*db
            + TWO*(gxx[i]*betaxx[i] + gxy[i]*betayx[i] + gxz[i]*betazx[i]);
        gyy_rhs[i] = -TWO*alpn1[i]*Ayy[i] - F2o3*gyy[i]*db
            + TWO*(gxy[i]*betaxy[i] + gyy[i]*betayy[i] + gyz[i]*betazy[i]);
        gzz_rhs[i] = -TWO*alpn1[i]*Azz[i] - F2o3*gzz[i]*db
            + TWO*(gxz[i]*betaxz[i] + gyz[i]*betayz[i] + gzz[i]*betazz[i]);
        /* off-diagonal components (collected form — see header) */
        gxy_rhs[i] = -TWO*alpn1[i]*Axy[i] + F1o3*gxy[i]*db
            + gxx[i]*betaxy[i] + gxz[i]*betazy[i] + gyy[i]*betayx[i]
            + gyz[i]*betazx[i] - gxy[i]*betazz[i];
        gyz_rhs[i] = -TWO*alpn1[i]*Ayz[i] + F1o3*gyz[i]*db
            + gxy[i]*betaxz[i] + gyy[i]*betayz[i] + gxz[i]*betaxy[i]
            + gzz[i]*betazy[i] - gyz[i]*betaxx[i];
        gxz_rhs[i] = -TWO*alpn1[i]*Axz[i] + F1o3*gxz[i]*db
            + gxx[i]*betaxz[i] + gxy[i]*betayz[i] + gyz[i]*betayx[i]
            + gzz[i]*betazx[i] - gxz[i]*betayy[i];
    }
}
|
|
|
/* Phase 2b: invert the symmetric 3-metric via adjugate / determinant. */
__global__ void kern_phase2_inverse(
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    double* __restrict__ gupxx, double* __restrict__ gupxy,
    double* __restrict__ gupxz, double* __restrict__ gupyy,
    double* __restrict__ gupyz, double* __restrict__ gupzz)
{
    const int stride = blockDim.x * gridDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_gp.all; idx += stride) {
        /* load the six independent components of [[a,b,c],[b,d,e],[c,e,f]] */
        const double a = gxx[idx], b = gxy[idx], c = gxz[idx];
        const double d = gyy[idx], e = gyz[idx], f = gzz[idx];
        const double det = a*d*f + b*e*c + c*b*e
                         - c*d*c - b*b*f - a*e*e;
        const double oodet = 1.0 / det;
        /* cofactor expansion (inverse is symmetric as well) */
        gupxx[idx] =  (d*f - e*e) * oodet;
        gupxy[idx] = -(b*f - e*c) * oodet;
        gupxz[idx] =  (b*e - d*c) * oodet;
        gupyy[idx] =  (a*f - c*c) * oodet;
        gupyz[idx] = -(a*e - b*c) * oodet;
        gupzz[idx] =  (a*d - b*b) * oodet;
    }
}
|
|
|
/* Phase 3: Gamma constraint residuals (co==0 only) */
|
|
__global__ void kern_phase3_gamma_constraint(
    const double* __restrict__ Gamx, const double* __restrict__ Gamy,
    const double* __restrict__ Gamz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    /* g{ab}{c} = first derivative d_c g_{ab} of the metric */
    const double* __restrict__ gxxx, const double* __restrict__ gxyx,
    const double* __restrict__ gxzx, const double* __restrict__ gyyx,
    const double* __restrict__ gyzx, const double* __restrict__ gzzx,
    const double* __restrict__ gxxy, const double* __restrict__ gxyy,
    const double* __restrict__ gxzy, const double* __restrict__ gyyy,
    const double* __restrict__ gyzy, const double* __restrict__ gzzy,
    const double* __restrict__ gxxz, const double* __restrict__ gxyz,
    const double* __restrict__ gxzz, const double* __restrict__ gyyz,
    const double* __restrict__ gyzz, const double* __restrict__ gzzz,
    double* __restrict__ Gmx_Res, double* __restrict__ Gmy_Res,
    double* __restrict__ Gmz_Res)
{
    /* Residual of the Gamma constraint at every grid point:
     *
     *   Gm^i_Res = Gam^i - gup^{ia} gup^{jb} d_j g_{ab}
     *
     * i.e. the evolved Gam^i minus its value recomputed from the inverse
     * metric and the metric first derivatives.  Each residual below is
     * grouped first by the "a" index (outer factor gup^{ia}) within three
     * blocks for the derivative direction j = x, y, z. */
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) {
        double uxx=gupxx[i], uxy=gupxy[i], uxz=gupxz[i];
        double uyy=gupyy[i], uyz=gupyz[i], uzz=gupzz[i];

        /* i = x: rows use gup^{xa}; inner sums are gup^{jb} d_j g_{ab}. */
        Gmx_Res[i] = Gamx[i] - (
            /* j = x block */
            uxx*(uxx*gxxx[i]+uxy*gxyx[i]+uxz*gxzx[i]) +
            uxy*(uxx*gxyx[i]+uxy*gyyx[i]+uxz*gyzx[i]) +
            uxz*(uxx*gxzx[i]+uxy*gyzx[i]+uxz*gzzx[i]) +
            /* j = y block */
            uxx*(uxy*gxxy[i]+uyy*gxyy[i]+uyz*gxzy[i]) +
            uxy*(uxy*gxyy[i]+uyy*gyyy[i]+uyz*gyzy[i]) +
            uxz*(uxy*gxzy[i]+uyy*gyzy[i]+uyz*gzzy[i]) +
            /* j = z block */
            uxx*(uxz*gxxz[i]+uyz*gxyz[i]+uzz*gxzz[i]) +
            uxy*(uxz*gxyz[i]+uyz*gyyz[i]+uzz*gyzz[i]) +
            uxz*(uxz*gxzz[i]+uyz*gyzz[i]+uzz*gzzz[i]));

        /* i = y: same structure with gup^{ya} outer factors. */
        Gmy_Res[i] = Gamy[i] - (
            uxx*(uxy*gxxx[i]+uyy*gxyx[i]+uyz*gxzx[i]) +
            uxy*(uxy*gxyx[i]+uyy*gyyx[i]+uyz*gyzx[i]) +
            uxz*(uxy*gxzx[i]+uyy*gyzx[i]+uyz*gzzx[i]) +
            uxy*(uxy*gxxy[i]+uyy*gxyy[i]+uyz*gxzy[i]) +
            uyy*(uxy*gxyy[i]+uyy*gyyy[i]+uyz*gyzy[i]) +
            uyz*(uxy*gxzy[i]+uyy*gyzy[i]+uyz*gzzy[i]) +
            uxy*(uxz*gxxz[i]+uyz*gxyz[i]+uzz*gxzz[i]) +
            uyy*(uxz*gxyz[i]+uyz*gyyz[i]+uzz*gyzz[i]) +
            uyz*(uxz*gxzz[i]+uyz*gyzz[i]+uzz*gzzz[i]));

        /* i = z: same structure with gup^{za} outer factors. */
        Gmz_Res[i] = Gamz[i] - (
            uxx*(uxz*gxxx[i]+uyz*gxyx[i]+uzz*gxzx[i]) +
            uxy*(uxz*gxyx[i]+uyz*gyyx[i]+uzz*gyzx[i]) +
            uxz*(uxz*gxzx[i]+uyz*gyzx[i]+uzz*gzzx[i]) +
            uxy*(uxz*gxxy[i]+uyz*gxyy[i]+uzz*gxzy[i]) +
            uyy*(uxz*gxyy[i]+uyz*gyyy[i]+uzz*gyzy[i]) +
            uyz*(uxz*gxzy[i]+uyz*gyzy[i]+uzz*gzzy[i]) +
            uxz*(uxz*gxxz[i]+uyz*gxyz[i]+uzz*gxzz[i]) +
            uyz*(uxz*gxyz[i]+uyz*gyyz[i]+uzz*gyzz[i]) +
            uzz*(uxz*gxzz[i]+uyz*gyzz[i]+uzz*gzzz[i]));
    }
}
|
|
|
|
/* Phase 4: 18 Christoffel symbols */
|
|
__global__ __launch_bounds__(128, 4)
void kern_phase4_christoffel(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    /* g{ab}{c} = first derivative d_c g_{ab} of the metric */
    const double* __restrict__ gxxx, const double* __restrict__ gxyx,
    const double* __restrict__ gxzx, const double* __restrict__ gyyx,
    const double* __restrict__ gyzx, const double* __restrict__ gzzx,
    const double* __restrict__ gxxy, const double* __restrict__ gxyy,
    const double* __restrict__ gxzy, const double* __restrict__ gyyy,
    const double* __restrict__ gyzy, const double* __restrict__ gzzy,
    const double* __restrict__ gxxz, const double* __restrict__ gxyz,
    const double* __restrict__ gxzz, const double* __restrict__ gyyz,
    const double* __restrict__ gyzz, const double* __restrict__ gzzz,
    /* outputs: G{i}{jk} = Gamma^i_{jk} (18 independent components) */
    double* __restrict__ Gxxx, double* __restrict__ Gxxy, double* __restrict__ Gxxz,
    double* __restrict__ Gxyy, double* __restrict__ Gxyz, double* __restrict__ Gxzz,
    double* __restrict__ Gyxx, double* __restrict__ Gyxy, double* __restrict__ Gyxz,
    double* __restrict__ Gyyy, double* __restrict__ Gyyz, double* __restrict__ Gyzz,
    double* __restrict__ Gzxx, double* __restrict__ Gzxy, double* __restrict__ Gzxz,
    double* __restrict__ Gzyy, double* __restrict__ Gzyz, double* __restrict__ Gzzz_o)
{
    /* Christoffel symbols of the metric:
     *
     *   Gamma^i_{jk} = (1/2) gup^{ia} ( d_j g_{ak} + d_k g_{aj} - d_a g_{jk} )
     *
     * Outputs are grouped by the symmetric lower index pair (xx, yy, zz,
     * xy, xz, yz); within each group the three lines differ only in the
     * gup row (x, y, z) used for the upper index.
     * __launch_bounds__(128, 4) caps register use to sustain occupancy. */
    const double H = 0.5, TWO = 2.0;
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) {
        double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i];
        double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i];
        /* Gamma^i_{xx}: 2 d_x g_{ax} - d_a g_{xx} */
        Gxxx[i]=H*(uxx*gxxx[i]+uxy*(TWO*gxyx[i]-gxxy[i])+uxz*(TWO*gxzx[i]-gxxz[i]));
        Gyxx[i]=H*(uxy*gxxx[i]+uyy*(TWO*gxyx[i]-gxxy[i])+uyz*(TWO*gxzx[i]-gxxz[i]));
        Gzxx[i]=H*(uxz*gxxx[i]+uyz*(TWO*gxyx[i]-gxxy[i])+uzz*(TWO*gxzx[i]-gxxz[i]));
        /* Gamma^i_{yy}: 2 d_y g_{ay} - d_a g_{yy} */
        Gxyy[i]=H*(uxx*(TWO*gxyy[i]-gyyx[i])+uxy*gyyy[i]+uxz*(TWO*gyzy[i]-gyyz[i]));
        Gyyy[i]=H*(uxy*(TWO*gxyy[i]-gyyx[i])+uyy*gyyy[i]+uyz*(TWO*gyzy[i]-gyyz[i]));
        Gzyy[i]=H*(uxz*(TWO*gxyy[i]-gyyx[i])+uyz*gyyy[i]+uzz*(TWO*gyzy[i]-gyyz[i]));
        /* Gamma^i_{zz}: 2 d_z g_{az} - d_a g_{zz} */
        Gxzz[i]=H*(uxx*(TWO*gxzz[i]-gzzx[i])+uxy*(TWO*gyzz[i]-gzzy[i])+uxz*gzzz[i]);
        Gyzz[i]=H*(uxy*(TWO*gxzz[i]-gzzx[i])+uyy*(TWO*gyzz[i]-gzzy[i])+uyz*gzzz[i]);
        Gzzz_o[i]=H*(uxz*(TWO*gxzz[i]-gzzx[i])+uyz*(TWO*gyzz[i]-gzzy[i])+uzz*gzzz[i]);
        /* Gamma^i_{xy}: d_x g_{ay} + d_y g_{ax} - d_a g_{xy} */
        Gxxy[i]=H*(uxx*gxxy[i]+uxy*gyyx[i]+uxz*(gxzy[i]+gyzx[i]-gxyz[i]));
        Gyxy[i]=H*(uxy*gxxy[i]+uyy*gyyx[i]+uyz*(gxzy[i]+gyzx[i]-gxyz[i]));
        Gzxy[i]=H*(uxz*gxxy[i]+uyz*gyyx[i]+uzz*(gxzy[i]+gyzx[i]-gxyz[i]));
        /* Gamma^i_{xz}: d_x g_{az} + d_z g_{ax} - d_a g_{xz} */
        Gxxz[i]=H*(uxx*gxxz[i]+uxy*(gxyz[i]+gyzx[i]-gxzy[i])+uxz*gzzx[i]);
        Gyxz[i]=H*(uxy*gxxz[i]+uyy*(gxyz[i]+gyzx[i]-gxzy[i])+uyz*gzzx[i]);
        Gzxz[i]=H*(uxz*gxxz[i]+uyz*(gxyz[i]+gyzx[i]-gxzy[i])+uzz*gzzx[i]);
        /* Gamma^i_{yz}: d_y g_{az} + d_z g_{ay} - d_a g_{yz} */
        Gxyz[i]=H*(uxx*(gxyz[i]+gxzy[i]-gyzx[i])+uxy*gyyz[i]+uxz*gzzy[i]);
        Gyyz[i]=H*(uxy*(gxyz[i]+gxzy[i]-gyzx[i])+uyy*gyyz[i]+uyz*gzzy[i]);
        Gzyz[i]=H*(uxz*(gxyz[i]+gxzy[i]-gyzx[i])+uyz*gyyz[i]+uzz*gzzy[i]);
    }
}
|
|
|
|
/* Phase 5: A^ij = gup^ia gup^jb A_ab (stored temporarily in Rxx..Rzz) */
|
|
__global__ void kern_phase5_raise_A(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    double* __restrict__ Rxx, double* __restrict__ Rxy, double* __restrict__ Rxz,
    double* __restrict__ Ryy, double* __restrict__ Ryz, double* __restrict__ Rzz)
{
    /* Raise both indices of the symmetric tensor A_{ab}:
     *   A^{ij} = gup^{ia} gup^{jb} A_{ab}
     * The result is parked in the Ricci work arrays Rxx..Rzz, which are
     * not yet needed at this stage of the pipeline. */
    const int stride = blockDim.x * gridDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += stride) {
        const double hxx = gupxx[i], hxy = gupxy[i], hxz = gupxz[i];
        const double hyy = gupyy[i], hyz = gupyz[i], hzz = gupzz[i];
        const double a11 = Axx[i], a12 = Axy[i], a13 = Axz[i];
        const double a22 = Ayy[i], a23 = Ayz[i], a33 = Azz[i];

        /* Diagonal components: squares on the diagonal A's, doubled cross
         * terms for the off-diagonal A's. */
        Rxx[i] = hxx*hxx*a11 + hxy*hxy*a22 + hxz*hxz*a33
               + 2.0*(hxx*hxy*a12 + hxx*hxz*a13 + hxy*hxz*a23);
        Ryy[i] = hxy*hxy*a11 + hyy*hyy*a22 + hyz*hyz*a33
               + 2.0*(hxy*hyy*a12 + hxy*hyz*a13 + hyy*hyz*a23);
        Rzz[i] = hxz*hxz*a11 + hyz*hyz*a22 + hzz*hzz*a33
               + 2.0*(hxz*hyz*a12 + hxz*hzz*a13 + hyz*hzz*a23);

        /* Off-diagonal components: symmetrized pair products. */
        Rxy[i] = hxx*hxy*a11 + hxy*hyy*a22 + hxz*hyz*a33
               + (hxx*hyy + hxy*hxy)*a12 + (hxx*hyz + hxz*hxy)*a13 + (hxy*hyz + hxz*hyy)*a23;
        Rxz[i] = hxx*hxz*a11 + hxy*hyz*a22 + hxz*hzz*a33
               + (hxx*hyz + hxy*hxz)*a12 + (hxx*hzz + hxz*hxz)*a13 + (hxy*hzz + hxz*hyz)*a23;
        Ryz[i] = hxy*hxz*a11 + hyy*hyz*a22 + hyz*hzz*a33
               + (hxy*hyz + hyy*hxz)*a12 + (hxy*hzz + hyz*hxz)*a13 + (hyy*hzz + hyz*hyz)*a23;
    }
}
|
|
|
|
/* Phase 6: Gamma_rhs part 1 (before fdderivs(beta) and fderivs(Gamma)) */
|
|
__global__ __launch_bounds__(128, 4)
void kern_phase6_gamma_rhs_part1(
    /* Lap{j} = d_j alpha (lapse first derivatives) */
    const double* __restrict__ Lapx, const double* __restrict__ Lapy,
    const double* __restrict__ Lapz,
    const double* __restrict__ alpn1, const double* __restrict__ chin1,
    /* chi{j} = d_j chi */
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    /* K{j} = d_j trK */
    const double* __restrict__ Kx, const double* __restrict__ Ky,
    const double* __restrict__ Kz,
    /* S{j}: matter momentum source terms */
    const double* __restrict__ Sx, const double* __restrict__ Sy,
    const double* __restrict__ Sz,
    /* Rxx..Rzz: at this phase these hold A^{ij} from kern_phase5_raise_A */
    const double* __restrict__ Rxx, const double* __restrict__ Rxy,
    const double* __restrict__ Rxz, const double* __restrict__ Ryy,
    const double* __restrict__ Ryz, const double* __restrict__ Rzz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs,
    double* __restrict__ Gamz_rhs)
{
    /* First part of the Gam^i right-hand side (derivative-free terms only;
     * the shift second-derivative and advection pieces are added later by
     * kern_phase8_gamma_rhs_part2):
     *
     *   Gam^i_rhs = -2 A^{ij} d_j alpha
     *             + 2 alpha * ( -(3/2) A^{ij} d_j chi / chi
     *                           - gup^{ij} ( (2/3) d_j trK + 8 pi S_j )
     *                           + Gamma^i_{jk} A^{jk} )
     *
     * NOTE(review): PI_V duplicates the file-scope PI_VAL constant, kept
     * local so the value is visible to device code. */
    const double TWO=2.0, F3o2=1.5, F2o3=2.0/3.0, EIGHT=8.0;
    const double PI_V = 3.14159265358979323846;
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) {
        double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i];
        double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i];
        double lx=Lapx[i],ly=Lapy[i],lz=Lapz[i];
        double a=alpn1[i], c1=chin1[i];
        double cx=chix[i],cy=chiy[i],cz=chiz[i];

        Gamx_rhs[i] = -TWO*(lx*Rxx[i]+ly*Rxy[i]+lz*Rxz[i])
            + TWO*a*(
                -F3o2/c1*(cx*Rxx[i]+cy*Rxy[i]+cz*Rxz[i])
                -uxx*(F2o3*Kx[i]+EIGHT*PI_V*Sx[i])
                -uxy*(F2o3*Ky[i]+EIGHT*PI_V*Sy[i])
                -uxz*(F2o3*Kz[i]+EIGHT*PI_V*Sz[i])
                /* Gamma^x_{jk} A^{jk}: off-diagonal pairs counted twice */
                +Gxxx[i]*Rxx[i]+Gxyy[i]*Ryy[i]+Gxzz[i]*Rzz[i]
                +TWO*(Gxxy[i]*Rxy[i]+Gxxz[i]*Rxz[i]+Gxyz[i]*Ryz[i]));

        Gamy_rhs[i] = -TWO*(lx*Rxy[i]+ly*Ryy[i]+lz*Ryz[i])
            + TWO*a*(
                -F3o2/c1*(cx*Rxy[i]+cy*Ryy[i]+cz*Ryz[i])
                -uxy*(F2o3*Kx[i]+EIGHT*PI_V*Sx[i])
                -uyy*(F2o3*Ky[i]+EIGHT*PI_V*Sy[i])
                -uyz*(F2o3*Kz[i]+EIGHT*PI_V*Sz[i])
                +Gyxx[i]*Rxx[i]+Gyyy[i]*Ryy[i]+Gyzz[i]*Rzz[i]
                +TWO*(Gyxy[i]*Rxy[i]+Gyxz[i]*Rxz[i]+Gyyz[i]*Ryz[i]));

        Gamz_rhs[i] = -TWO*(lx*Rxz[i]+ly*Ryz[i]+lz*Rzz[i])
            + TWO*a*(
                -F3o2/c1*(cx*Rxz[i]+cy*Ryz[i]+cz*Rzz[i])
                -uxz*(F2o3*Kx[i]+EIGHT*PI_V*Sx[i])
                -uyz*(F2o3*Ky[i]+EIGHT*PI_V*Sy[i])
                -uzz*(F2o3*Kz[i]+EIGHT*PI_V*Sz[i])
                +Gzxx[i]*Rxx[i]+Gzyy[i]*Ryy[i]+Gzzz[i]*Rzz[i]
                +TWO*(Gzxy[i]*Rxy[i]+Gzxz[i]*Rxz[i]+Gzyz[i]*Ryz[i]));
    }
}
|
|
|
|
/* Phase 8: Gamma_rhs part 2 — after fdderivs(beta) and fderivs(Gamma)
|
|
* Computes: fxx=div(beta_xx), Gamxa, then updates Gamx_rhs etc.
|
|
* Input arrays gxxx..gzzz here hold fdderivs(beta) results,
|
|
* Gamxx..Gamzz hold fderivs(Gamma) results.
|
|
*/
|
|
__global__ __launch_bounds__(128, 4)
void kern_phase8_gamma_rhs_part2(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ div_beta,
    /* fdderivs(betax) -> gxxx,gxyx,gxzx,gyyx,gyzx,gzzx */
    const double* __restrict__ bxx_xx, const double* __restrict__ bxx_xy,
    const double* __restrict__ bxx_xz, const double* __restrict__ bxx_yy,
    const double* __restrict__ bxx_yz, const double* __restrict__ bxx_zz,
    /* fdderivs(betay) -> gxxy,gxyy,gxzy,gyyy,gyzy,gzzy */
    const double* __restrict__ bxy_xx, const double* __restrict__ bxy_xy,
    const double* __restrict__ bxy_xz, const double* __restrict__ bxy_yy,
    const double* __restrict__ bxy_yz, const double* __restrict__ bxy_zz,
    /* fdderivs(betaz) -> gxxz,gxyz,gxzz,gyyz,gyzz,gzzz */
    const double* __restrict__ bxz_xx, const double* __restrict__ bxz_xy,
    const double* __restrict__ bxz_xz, const double* __restrict__ bxz_yy,
    const double* __restrict__ bxz_yz, const double* __restrict__ bxz_zz,
    /* fderivs(Gamx) -> Gamxx,Gamxy,Gamxz
     * NOTE(review): the nine Gam?? derivative arrays below are NOT read in
     * this kernel's body — presumably the beta^j d_j Gam^i advection term is
     * applied elsewhere; confirm against the CPU reference. */
    const double* __restrict__ Gamxx, const double* __restrict__ Gamxy,
    const double* __restrict__ Gamxz,
    /* fderivs(Gamy) -> Gamyx,Gamyy,Gamyz */
    const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d,
    const double* __restrict__ Gamyz_d,
    /* fderivs(Gamz) -> Gamzx,Gamzy,Gamzz */
    const double* __restrict__ Gamzx, const double* __restrict__ Gamzy,
    const double* __restrict__ Gamzz_d,
    /* Christoffel symbols */
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    /* betaij first derivs */
    const double* __restrict__ betaxx, const double* __restrict__ betaxy,
    const double* __restrict__ betaxz, const double* __restrict__ betayx,
    const double* __restrict__ betayy, const double* __restrict__ betayz,
    const double* __restrict__ betazx, const double* __restrict__ betazy,
    const double* __restrict__ betazz,
    double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs,
    double* __restrict__ Gamz_rhs,
    double* __restrict__ Gamxa_out, double* __restrict__ Gamya_out,
    double* __restrict__ Gamza_out)
{
    /* Second part of the Gam^i right-hand side (accumulates into the values
     * produced by kern_phase6_gamma_rhs_part1):
     *  - Ga^i = gup^{jk} Gamma^i_{jk}, also exported via Gam?a_out for the
     *    Ricci assembly kernels;
     *  - (2/3) Ga^i div(beta) - Ga^j d_j beta^i;
     *  - (1/3) gup^{ij} d_j d_k beta^k + gup^{jk} d_j d_k beta^i. */
    const double TWO=2.0, F2o3=2.0/3.0, F1o3=1.0/3.0;
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) {
        double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i];
        double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i];
        /* d_i (d_k beta^k): divergence of the shift second derivatives */
        double fxx_v = bxx_xx[i]+bxy_xy[i]+bxz_xz[i];
        double fxy_v = bxx_xy[i]+bxy_yy[i]+bxz_yz[i];
        double fxz_v = bxx_xz[i]+bxy_yz[i]+bxz_zz[i];
        /* Ga^i = gup^{jk} Gamma^i_{jk} (off-diagonal jk counted twice) */
        double Ga_x = uxx*Gxxx[i]+uyy*Gxyy[i]+uzz*Gxzz[i]
                     +TWO*(uxy*Gxxy[i]+uxz*Gxxz[i]+uyz*Gxyz[i]);
        double Ga_y = uxx*Gyxx[i]+uyy*Gyyy[i]+uzz*Gyzz[i]
                     +TWO*(uxy*Gyxy[i]+uxz*Gyxz[i]+uyz*Gyyz[i]);
        double Ga_z = uxx*Gzxx[i]+uyy*Gzyy[i]+uzz*Gzzz[i]
                     +TWO*(uxy*Gzxy[i]+uxz*Gzxz[i]+uyz*Gzyz[i]);
        Gamxa_out[i]=Ga_x; Gamya_out[i]=Ga_y; Gamza_out[i]=Ga_z;
        double db = div_beta[i];
        Gamx_rhs[i] += F2o3*Ga_x*db
            - Ga_x*betaxx[i] - Ga_y*betaxy[i] - Ga_z*betaxz[i]
            + F1o3*(uxx*fxx_v+uxy*fxy_v+uxz*fxz_v)
            + uxx*bxx_xx[i]+uyy*bxx_yy[i]+uzz*bxx_zz[i]
            + TWO*(uxy*bxx_xy[i]+uxz*bxx_xz[i]+uyz*bxx_yz[i]);
        Gamy_rhs[i] += F2o3*Ga_y*db
            - Ga_x*betayx[i] - Ga_y*betayy[i] - Ga_z*betayz[i]
            + F1o3*(uxy*fxx_v+uyy*fxy_v+uyz*fxz_v)
            + uxx*bxy_xx[i]+uyy*bxy_yy[i]+uzz*bxy_zz[i]
            + TWO*(uxy*bxy_xy[i]+uxz*bxy_xz[i]+uyz*bxy_yz[i]);
        Gamz_rhs[i] += F2o3*Ga_z*db
            - Ga_x*betazx[i] - Ga_y*betazy[i] - Ga_z*betazz[i]
            + F1o3*(uxz*fxx_v+uyz*fxy_v+uzz*fxz_v)
            + uxx*bxz_xx[i]+uyy*bxz_yy[i]+uzz*bxz_zz[i]
            + TWO*(uxy*bxz_xy[i]+uxz*bxz_xz[i]+uyz*bxz_yz[i]);
    }
}
|
|
|
|
/* Phase 9: Christoffel contract — compute g_{ia} Gamma^a_{bc} products
|
|
* Overwrites gxxx..gzzz with lowered Christoffel products needed for Ricci.
|
|
*/
|
|
__global__ __launch_bounds__(128, 4)
void kern_phase9_christoffel_contract(
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    /* output: lowered products g_{ia} Gamma^a_{bc} */
    double* __restrict__ o_gxxx, double* __restrict__ o_gxyx,
    double* __restrict__ o_gxzx, double* __restrict__ o_gyyx,
    double* __restrict__ o_gyzx, double* __restrict__ o_gzzx,
    double* __restrict__ o_gxxy, double* __restrict__ o_gxyy,
    double* __restrict__ o_gxzy, double* __restrict__ o_gyyy,
    double* __restrict__ o_gyzy, double* __restrict__ o_gzzy,
    double* __restrict__ o_gxxz, double* __restrict__ o_gxyz,
    double* __restrict__ o_gxzz, double* __restrict__ o_gyyz,
    double* __restrict__ o_gyzz, double* __restrict__ o_gzzz)
{
    /* Lower the upper index of the Christoffel symbols:
     *   Gamma_{i,bc} = g_{ia} Gamma^a_{bc}
     * Output naming: o_g{bc}{i} holds the component with lowered index i and
     * symmetric pair bc.  Each row i is a 3-vector dot of the metric row
     * (g_{ix}, g_{iy}, g_{iz}) against (Gamma^x_{bc}, Gamma^y_{bc},
     * Gamma^z_{bc}), for all six bc pairs. */
    const int stride = blockDim.x * gridDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += stride) {
        /* Prefetch the metric row values and all 18 Christoffel components
         * once per point; each is reused several times below. */
        const double p11 = gxx[i], p12 = gxy[i], p13 = gxz[i];
        const double p22 = gyy[i], p23 = gyz[i], p33 = gzz[i];
        const double cxxx = Gxxx[i], cxxy = Gxxy[i], cxxz = Gxxz[i];
        const double cxyy = Gxyy[i], cxyz = Gxyz[i], cxzz = Gxzz[i];
        const double cyxx = Gyxx[i], cyxy = Gyxy[i], cyxz = Gyxz[i];
        const double cyyy = Gyyy[i], cyyz = Gyyz[i], cyzz = Gyzz[i];
        const double czxx = Gzxx[i], czxy = Gzxy[i], czxz = Gzxz[i];
        const double czyy = Gzyy[i], czyz = Gzyz[i], czzz = Gzzz[i];

        /* lowered index i = x */
        o_gxxx[i] = p11*cxxx + p12*cyxx + p13*czxx;
        o_gxyx[i] = p11*cxxy + p12*cyxy + p13*czxy;
        o_gxzx[i] = p11*cxxz + p12*cyxz + p13*czxz;
        o_gyyx[i] = p11*cxyy + p12*cyyy + p13*czyy;
        o_gyzx[i] = p11*cxyz + p12*cyyz + p13*czyz;
        o_gzzx[i] = p11*cxzz + p12*cyzz + p13*czzz;

        /* lowered index i = y */
        o_gxxy[i] = p12*cxxx + p22*cyxx + p23*czxx;
        o_gxyy[i] = p12*cxxy + p22*cyxy + p23*czxy;
        o_gxzy[i] = p12*cxxz + p22*cyxz + p23*czxz;
        o_gyyy[i] = p12*cxyy + p22*cyyy + p23*czyy;
        o_gyzy[i] = p12*cxyz + p22*cyyz + p23*czyz;
        o_gzzy[i] = p12*cxzz + p22*cyzz + p23*czzz;

        /* lowered index i = z */
        o_gxxz[i] = p13*cxxx + p23*cyxx + p33*czxx;
        o_gxyz[i] = p13*cxxy + p23*cyxy + p33*czxy;
        o_gxzz[i] = p13*cxxz + p23*cyxz + p33*czxz;
        o_gyyz[i] = p13*cxyy + p23*cyyy + p33*czyy;
        o_gyzz[i] = p13*cxyz + p23*cyyz + p33*czyz;
        o_gzzz[i] = p13*cxzz + p23*cyzz + p33*czzz;
    }
}
|
|
|
|
/* Phase 10: After fdderivs of a metric component, contract with gup^{ij}
|
|
* R_comp = gup^xx*fxx + gup^yy*fyy + gup^zz*fzz + 2*(gup^xy*fxy + gup^xz*fxz + gup^yz*fyz)
|
|
*/
|
|
__global__ void kern_phase10_ricci_contract(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ fxx, const double* __restrict__ fxy,
    const double* __restrict__ fxz, const double* __restrict__ fyy,
    const double* __restrict__ fyz, const double* __restrict__ fzz,
    double* __restrict__ R_comp)
{
    /* Full contraction of the inverse metric with a symmetric second-
     * derivative tensor f_{ab} (one metric component's fdderivs output):
     *   R_comp = gup^{ab} f_{ab}
     * with the three off-diagonal pairs counted twice by symmetry. */
    const int stride = blockDim.x * gridDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += stride) {
        const double diag = gupxx[i]*fxx[i] + gupyy[i]*fyy[i] + gupzz[i]*fzz[i];
        const double off  = gupxy[i]*fxy[i] + gupxz[i]*fxz[i] + gupyz[i]*fyz[i];
        R_comp[i] = diag + 2.0*off;
    }
}
|
|
|
|
/* Phase 11a: Ricci diagonal assembly (Rxx, Ryy, Rzz) */
|
|
__global__ __launch_bounds__(128, 4)
void kern_phase11_ricci_diag(
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    /* Gam{i}a = gup^{jk} Gamma^i_{jk}, from kern_phase8_gamma_rhs_part2 */
    const double* __restrict__ Gamxa, const double* __restrict__ Gamya,
    const double* __restrict__ Gamza,
    /* Gam{i}{j} = d_j Gam^i (first derivatives of the evolved Gammas) */
    const double* __restrict__ Gamxx, const double* __restrict__ Gamxy,
    const double* __restrict__ Gamxz,
    const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d,
    const double* __restrict__ Gamyz_d,
    const double* __restrict__ Gamzx, const double* __restrict__ Gamzy,
    const double* __restrict__ Gamzz_d,
    /* G{i}{jk} = Gamma^i_{jk} */
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    /* lowered Christoffel products l{bc}{i} = g_{ia} Gamma^a_{bc},
     * from kern_phase9_christoffel_contract */
    const double* __restrict__ lxxx, const double* __restrict__ lxyx,
    const double* __restrict__ lxzx, const double* __restrict__ lyyx,
    const double* __restrict__ lyzx, const double* __restrict__ lzzx,
    const double* __restrict__ lxxy, const double* __restrict__ lxyy,
    const double* __restrict__ lxzy, const double* __restrict__ lyyy,
    const double* __restrict__ lyzy, const double* __restrict__ lzzy,
    const double* __restrict__ lxxz, const double* __restrict__ lxyz,
    const double* __restrict__ lxzz, const double* __restrict__ lyyz,
    const double* __restrict__ lyzz, const double* __restrict__ lzzz,
    double* __restrict__ Rxx, double* __restrict__ Ryy, double* __restrict__ Rzz)
{
    /* Assemble the diagonal Ricci components in place.  On entry Rxx, Ryy,
     * Rzz hold the gup^{ab} d_a d_b g_{ii} contractions from
     * kern_phase10_ricci_contract; each is folded in with weight -1/2 and
     * the Gamma-derivative and Christoffel-product terms are added.
     * Expressions are kept exactly as generated (term order matters for
     * bitwise reproducibility against the CPU version); do not reassociate. */
    const double H = 0.5, TWO = 2.0;
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) {
        double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i];
        double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i];
        /* Rxx: -1/2 laplacian term + g_{xa} d_x Gam^a + Gam^a l_{xa,x}
         *      + gup-weighted quadratic Christoffel terms */
        Rxx[i] = -H*Rxx[i]
            + gxx[i]*Gamxx[i]+gxy[i]*Gamyx[i]+gxz[i]*Gamzx[i]
            + Gamxa[i]*lxxx[i]+Gamya[i]*lxyx[i]+Gamza[i]*lxzx[i]
            + uxx*(TWO*(Gxxx[i]*lxxx[i]+Gyxx[i]*lxyx[i]+Gzxx[i]*lxzx[i])
                  +(Gxxx[i]*lxxx[i]+Gyxx[i]*lxxy[i]+Gzxx[i]*lxxz[i]))
            + uxy*(TWO*(Gxxx[i]*lxyx[i]+Gyxx[i]*lyyx[i]+Gzxx[i]*lyzx[i]
                       +Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i])
                  +(Gxxy[i]*lxxx[i]+Gyxy[i]*lxxy[i]+Gzxy[i]*lxxz[i])
                  +(Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i]))
            + uxz*(TWO*(Gxxx[i]*lxzx[i]+Gyxx[i]*lyzx[i]+Gzxx[i]*lzzx[i]
                       +Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i])
                  +(Gxxz[i]*lxxx[i]+Gyxz[i]*lxxy[i]+Gzxz[i]*lxxz[i])
                  +(Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i]))
            + uyy*(TWO*(Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i])
                  +(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i]))
            + uyz*(TWO*(Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i]
                       +Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i])
                  +(Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i])
                  +(Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i]))
            + uzz*(TWO*(Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i])
                  +(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i]));

        /* Ryy: same structure, lower index pair yy */
        Ryy[i] = -H*Ryy[i]
            + gxy[i]*Gamxy[i]+gyy[i]*Gamyy_d[i]+gyz[i]*Gamzy[i]
            + Gamxa[i]*lxyy[i]+Gamya[i]*lyyy[i]+Gamza[i]*lyzy[i]
            + uxx*(TWO*(Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i])
                  +(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i]))
            + uxy*(TWO*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i]
                       +Gxyy[i]*lxxy[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxzy[i])
                  +(Gxyy[i]*lxyx[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxyz[i])
                  +(Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i]))
            + uxz*(TWO*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i]
                       +Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i])
                  +(Gxyz[i]*lxyx[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxyz[i])
                  +(Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i]))
            + uyy*(TWO*(Gxyy[i]*lxyy[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyzy[i])
                  +(Gxyy[i]*lyyx[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyyz[i]))
            + uyz*(TWO*(Gxyy[i]*lxzy[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lzzy[i]
                       +Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i])
                  +(Gxyz[i]*lyyx[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyyz[i])
                  +(Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i]))
            + uzz*(TWO*(Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i])
                  +(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i]));

        /* Rzz: same structure, lower index pair zz */
        Rzz[i] = -H*Rzz[i]
            + gxz[i]*Gamxz[i]+gyz[i]*Gamyz_d[i]+gzz[i]*Gamzz_d[i]
            + Gamxa[i]*lxzz[i]+Gamya[i]*lyzz[i]+Gamza[i]*lzzz[i]
            + uxx*(TWO*(Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i])
                  +(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i]))
            + uxy*(TWO*(Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i]
                       +Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i])
                  +(Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i])
                  +(Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]))
            + uxz*(TWO*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i]
                       +Gxzz[i]*lxxz[i]+Gyzz[i]*lxyz[i]+Gzzz[i]*lxzz[i])
                  +(Gxzz[i]*lxzx[i]+Gyzz[i]*lxzy[i]+Gzzz[i]*lxzz[i])
                  +(Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i]))
            + uyy*(TWO*(Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i])
                  +(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i]))
            + uyz*(TWO*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i]
                       +Gxzz[i]*lxyz[i]+Gyzz[i]*lyyz[i]+Gzzz[i]*lyzz[i])
                  +(Gxzz[i]*lyzx[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lyzz[i])
                  +(Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i]))
            + uzz*(TWO*(Gxzz[i]*lxzz[i]+Gyzz[i]*lyzz[i]+Gzzz[i]*lzzz[i])
                  +(Gxzz[i]*lzzx[i]+Gyzz[i]*lzzy[i]+Gzzz[i]*lzzz[i]));
    }
}
|
|
|
|
/* Phase 11b: Ricci off-diagonal assembly (Rxy, Rxz, Ryz) */
|
|
__global__ __launch_bounds__(128, 4)
|
|
void kern_phase11_ricci_offdiag(
|
|
const double* __restrict__ gxx, const double* __restrict__ gxy,
|
|
const double* __restrict__ gxz, const double* __restrict__ gyy,
|
|
const double* __restrict__ gyz, const double* __restrict__ gzz,
|
|
const double* __restrict__ gupxx, const double* __restrict__ gupxy,
|
|
const double* __restrict__ gupxz, const double* __restrict__ gupyy,
|
|
const double* __restrict__ gupyz, const double* __restrict__ gupzz,
|
|
const double* __restrict__ Gamxa, const double* __restrict__ Gamya,
|
|
const double* __restrict__ Gamza,
|
|
const double* __restrict__ Gamxx, const double* __restrict__ Gamxy,
|
|
const double* __restrict__ Gamxz,
|
|
const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d,
|
|
const double* __restrict__ Gamyz_d,
|
|
const double* __restrict__ Gamzx, const double* __restrict__ Gamzy,
|
|
const double* __restrict__ Gamzz_d,
|
|
const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
|
|
const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
|
|
const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
|
|
const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
|
|
const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
|
|
const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
|
|
const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
|
|
const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
|
|
const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
|
|
const double* __restrict__ lxxx, const double* __restrict__ lxyx,
|
|
const double* __restrict__ lxzx, const double* __restrict__ lyyx,
|
|
const double* __restrict__ lyzx, const double* __restrict__ lzzx,
|
|
const double* __restrict__ lxxy, const double* __restrict__ lxyy,
|
|
const double* __restrict__ lxzy, const double* __restrict__ lyyy,
|
|
const double* __restrict__ lyzy, const double* __restrict__ lzzy,
|
|
const double* __restrict__ lxxz, const double* __restrict__ lxyz,
|
|
const double* __restrict__ lxzz, const double* __restrict__ lyyz,
|
|
const double* __restrict__ lyzz, const double* __restrict__ lzzz,
|
|
double* __restrict__ Rxy, double* __restrict__ Rxz, double* __restrict__ Ryz)
|
|
{
|
|
const double H = 0.5, TWO = 2.0;
|
|
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) {
|
|
double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i];
|
|
double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i];
|
|
|
|
/* Rxy */
|
|
Rxy[i] = H*(
|
|
-Rxy[i]
|
|
+gxx[i]*Gamxy[i]+gxy[i]*Gamyy_d[i]+gxz[i]*Gamzy[i]
|
|
+gxy[i]*Gamxx[i]+gyy[i]*Gamyx[i]+gyz[i]*Gamzx[i]
|
|
+Gamxa[i]*lxyx[i]+Gamya[i]*lyyx[i]+Gamza[i]*lyzx[i]
|
|
+Gamxa[i]*lxxy[i]+Gamya[i]*lxyy[i]+Gamza[i]*lxzy[i])
|
|
+uxx*(Gxxx[i]*lxxy[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxzy[i]
|
|
+Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i]
|
|
+Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i])
|
|
+uxy*(Gxxx[i]*lxyy[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyzy[i]
|
|
+Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i]
|
|
+Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i]
|
|
+Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i]
|
|
+Gxyy[i]*lxxx[i]+Gyyy[i]*lxyx[i]+Gzyy[i]*lxzx[i]
|
|
+Gxxx[i]*lyyx[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyyz[i])
|
|
+uxz*(Gxxx[i]*lxzy[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lzzy[i]
|
|
+Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i]
|
|
+Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i]
|
|
+Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i]
|
|
+Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i]
|
|
+Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i])
|
|
+uyy*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i]
|
|
+Gxyy[i]*lxyx[i]+Gyyy[i]*lyyx[i]+Gzyy[i]*lyzx[i]
|
|
+Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i])
|
|
+uyz*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i]
|
|
+Gxyy[i]*lxzx[i]+Gyyy[i]*lyzx[i]+Gzyy[i]*lzzx[i]
|
|
+Gxxz[i]*lyyx[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyyz[i]
|
|
+Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i]
|
|
+Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i]
|
|
+Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])
|
|
+uzz*(Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i]
|
|
+Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i]
|
|
+Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]);
|
|
|
|
/* Rxz */
|
|
Rxz[i] = H*(
|
|
-Rxz[i]
|
|
+gxx[i]*Gamxz[i]+gxy[i]*Gamyz_d[i]+gxz[i]*Gamzz_d[i]
|
|
+gxz[i]*Gamxx[i]+gyz[i]*Gamyx[i]+gzz[i]*Gamzx[i]
|
|
+Gamxa[i]*lxzx[i]+Gamya[i]*lyzx[i]+Gamza[i]*lzzx[i]
|
|
+Gamxa[i]*lxxz[i]+Gamya[i]*lxyz[i]+Gamza[i]*lxzz[i])
|
|
+uxx*(Gxxx[i]*lxxz[i]+Gyxx[i]*lxyz[i]+Gzxx[i]*lxzz[i]
|
|
+Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i]
|
|
+Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i])
|
|
+uxy*(Gxxx[i]*lxyz[i]+Gyxx[i]*lyyz[i]+Gzxx[i]*lyzz[i]
|
|
+Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i]
|
|
+Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i]
|
|
+Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i]
|
|
+Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i]
|
|
+Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i])
|
|
+uxz*(Gxxx[i]*lxzz[i]+Gyxx[i]*lyzz[i]+Gzxx[i]*lzzz[i]
|
|
+Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i]
|
|
+Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i]
|
|
+Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i]
|
|
+Gxzz[i]*lxxx[i]+Gyzz[i]*lxyx[i]+Gzzz[i]*lxzx[i]
|
|
+Gxxx[i]*lzzx[i]+Gyxx[i]*lzzy[i]+Gzxx[i]*lzzz[i])
|
|
+uyy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i]
|
|
+Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i]
|
|
+Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])
|
|
+uyz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i]
|
|
+Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i]
|
|
+Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]
|
|
+Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i]
|
|
+Gxzz[i]*lxyx[i]+Gyzz[i]*lyyx[i]+Gzzz[i]*lyzx[i]
|
|
+Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i])
|
|
+uzz*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i]
|
|
+Gxzz[i]*lxzx[i]+Gyzz[i]*lyzx[i]+Gzzz[i]*lzzx[i]
|
|
+Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i]);
|
|
|
|
/* Ryz */
|
|
Ryz[i] = H*(
|
|
-Ryz[i]
|
|
+gxy[i]*Gamxz[i]+gyy[i]*Gamyz_d[i]+gyz[i]*Gamzz_d[i]
|
|
+gxz[i]*Gamxy[i]+gyz[i]*Gamyy_d[i]+gzz[i]*Gamzy[i]
|
|
+Gamxa[i]*lxzy[i]+Gamya[i]*lyzy[i]+Gamza[i]*lzzy[i]
|
|
+Gamxa[i]*lxyz[i]+Gamya[i]*lyyz[i]+Gamza[i]*lyzz[i])
|
|
+uxx*(Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i]
|
|
+Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i]
|
|
+Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i])
|
|
+uxy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i]
|
|
+Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i]
|
|
+Gxyy[i]*lxzx[i]+Gyyy[i]*lxzy[i]+Gzyy[i]*lxzz[i]
|
|
+Gxyy[i]*lxxz[i]+Gyyy[i]*lxyz[i]+Gzyy[i]*lxzz[i]
|
|
+Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i]
|
|
+Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])
|
|
+uxz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i]
|
|
+Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i]
|
|
+Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i]
|
|
+Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i]
|
|
+Gxzz[i]*lxxy[i]+Gyzz[i]*lxyy[i]+Gzzz[i]*lxzy[i]
|
|
+Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i])
|
|
+uyy*(Gxyy[i]*lxyz[i]+Gyyy[i]*lyyz[i]+Gzyy[i]*lyzz[i]
|
|
+Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i]
|
|
+Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i])
|
|
+uyz*(Gxyy[i]*lxzz[i]+Gyyy[i]*lyzz[i]+Gzyy[i]*lzzz[i]
|
|
+Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i]
|
|
+Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i]
|
|
+Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i]
|
|
+Gxzz[i]*lxyy[i]+Gyzz[i]*lyyy[i]+Gzzz[i]*lyzy[i]
|
|
+Gxyy[i]*lzzx[i]+Gyyy[i]*lzzy[i]+Gzyy[i]*lzzz[i])
|
|
+uzz*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i]
|
|
+Gxzz[i]*lxzy[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lzzy[i]
|
|
+Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i]);
|
|
}
|
|
}
|
|
|
|
/* Phase 13: chi correction to Ricci tensor
|
|
* After fdderivs(chi), subtract Christoffel*chi_deriv, compute conformal factor f,
|
|
* then add chi contribution to Rxx..Rzz.
|
|
*/
|
|
__global__ __launch_bounds__(128, 4)
void kern_phase13_chi_correction(
    const double* __restrict__ chin1,
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    double* __restrict__ fxx, double* __restrict__ fxy,
    double* __restrict__ fxz, double* __restrict__ fyy,
    double* __restrict__ fyz, double* __restrict__ fzz,
    double* __restrict__ Rxx, double* __restrict__ Rxy,
    double* __restrict__ Rxz, double* __restrict__ Ryy,
    double* __restrict__ Ryz, double* __restrict__ Rzz)
/*
 * Phase 13: add the conformal-factor (chi) contribution to the Ricci tensor.
 *
 * On entry the fij arrays hold the raw second partial derivatives of chi
 * (fdderivs output) and chix/chiy/chiz hold its first derivatives.  The
 * kernel (a) converts fij to covariant second derivatives by subtracting
 * Gamma^k_ij * chi_,k (fij is updated in place and is an output), then
 * (b) forms the scalar f_val and folds the chi correction into Rxx..Rzz.
 *
 * One grid-stride loop over all d_gp.all points; 1-D launch, no shared
 * memory, no inter-thread communication.
 *
 * Changes vs. previous version (behavior-identical arithmetic):
 *  - removed the unused local constant H
 *  - the repeated F3o2/c1 division (six occurrences) is hoisted to one
 *  - corrected fij values are kept in registers instead of being re-read
 *    from global memory after the in-place write
 */
{
    const double TWO = 2.0, F3o2 = 1.5;
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all;
         i += blockDim.x*gridDim.x) {
        double cx = chix[i], cy = chiy[i], cz = chiz[i], c1 = chin1[i];

        /* D_i D_j chi = partial_i partial_j chi - Gamma^k_ij chi_,k.
         * Keep the corrected values in registers and write back once. */
        double txx = fxx[i] - (Gxxx[i]*cx + Gyxx[i]*cy + Gzxx[i]*cz);
        double txy = fxy[i] - (Gxxy[i]*cx + Gyxy[i]*cy + Gzxy[i]*cz);
        double txz = fxz[i] - (Gxxz[i]*cx + Gyxz[i]*cy + Gzxz[i]*cz);
        double tyy = fyy[i] - (Gxyy[i]*cx + Gyyy[i]*cy + Gzyy[i]*cz);
        double tyz = fyz[i] - (Gxyz[i]*cx + Gyyz[i]*cy + Gzyz[i]*cz);
        double tzz = fzz[i] - (Gxzz[i]*cx + Gyzz[i]*cy + Gzzz[i]*cz);
        fxx[i] = txx; fxy[i] = txy; fxz[i] = txz;
        fyy[i] = tyy; fyz[i] = tyz; fzz[i] = tzz;

        double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i];
        double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i];

        /* hoist the common 3/(2*chi) factor: one division instead of six */
        double r = F3o2/c1;
        /* f_val = gup^ij (D_i D_j chi - 3/(2 chi) chi_,i chi_,j) */
        double f_val = uxx*(txx - r*cx*cx)
                     + uyy*(tyy - r*cy*cy)
                     + uzz*(tzz - r*cz*cz)
                     + TWO*uxy*(txy - r*cx*cy)
                     + TWO*uxz*(txz - r*cx*cz)
                     + TWO*uyz*(tyz - r*cy*cz);

        /* chi contribution to the Ricci tensor, scaled by 1/(2 chi) */
        double inv2c = 1.0/(c1*TWO);
        Rxx[i] += (txx - cx*cx*inv2c + gxx[i]*f_val)*inv2c;
        Ryy[i] += (tyy - cy*cy*inv2c + gyy[i]*f_val)*inv2c;
        Rzz[i] += (tzz - cz*cz*inv2c + gzz[i]*f_val)*inv2c;
        Rxy[i] += (txy - cx*cy*inv2c + gxy[i]*f_val)*inv2c;
        Rxz[i] += (txz - cx*cz*inv2c + gxz[i]*f_val)*inv2c;
        Ryz[i] += (tyz - cy*cz*inv2c + gyz[i]*f_val)*inv2c;
    }
}
|
|
|
|
/* Phase 15: trK_rhs, Aij_rhs, gauge (after fdderivs(Lap) and fderivs(chi))
|
|
* Also updates Christoffel with physical chi correction, computes Lap_rhs, beta_rhs, dtSf_rhs.
|
|
*/
|
|
__global__ __launch_bounds__(128, 4)
void kern_phase15_trK_Aij_gauge(
    /* gauge/conformal state: lapse, chi and its first derivatives */
    const double* __restrict__ alpn1, const double* __restrict__ chin1,
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    /* conformal metric (lower indices) */
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    /* conformal inverse metric (upper indices) */
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ trK,
    /* trace-free extrinsic curvature */
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    /* first derivatives of the lapse (fderivs output) */
    const double* __restrict__ Lapx, const double* __restrict__ Lapy,
    const double* __restrict__ Lapz,
    const double* __restrict__ div_beta,
    /* beta{j}{i}: shift derivatives; usage below pairs beta?x with A_x?
     * so beta{j}{i} appears to be d(beta^j)/dx^i — confirm against the
     * fderivs caller */
    const double* __restrict__ betaxx, const double* __restrict__ betaxy,
    const double* __restrict__ betaxz, const double* __restrict__ betayx,
    const double* __restrict__ betayy, const double* __restrict__ betayz,
    const double* __restrict__ betazx, const double* __restrict__ betazy,
    const double* __restrict__ betazz,
    /* matter sources */
    const double* __restrict__ rho,
    const double* __restrict__ Sx_m, const double* __restrict__ Sy_m,
    const double* __restrict__ Sz_m,
    const double* __restrict__ Sxx_m, const double* __restrict__ Sxy_m,
    const double* __restrict__ Sxz_m, const double* __restrict__ Syy_m,
    const double* __restrict__ Syz_m, const double* __restrict__ Szz_m,
    /* shift driver auxiliary variable B^i */
    const double* __restrict__ dtSfx, const double* __restrict__ dtSfy,
    const double* __restrict__ dtSfz,
    /* Ricci tensor (already chi-corrected in phase 13) */
    const double* __restrict__ Rxx, const double* __restrict__ Rxy,
    const double* __restrict__ Rxz, const double* __restrict__ Ryy,
    const double* __restrict__ Ryz, const double* __restrict__ Rzz,
    /* Christoffel symbols — updated IN PLACE with the physical-chi
     * correction before being used below */
    double* __restrict__ Gxxx, double* __restrict__ Gxxy,
    double* __restrict__ Gxxz, double* __restrict__ Gxyy,
    double* __restrict__ Gxyz_o, double* __restrict__ Gxzz,
    double* __restrict__ Gyxx, double* __restrict__ Gyxy,
    double* __restrict__ Gyxz, double* __restrict__ Gyyy,
    double* __restrict__ Gyyz, double* __restrict__ Gyzz,
    double* __restrict__ Gzxx, double* __restrict__ Gzxy,
    double* __restrict__ Gzxz, double* __restrict__ Gzyy,
    double* __restrict__ Gzyz, double* __restrict__ Gzzz,
    /* fxx..fzz = fdderivs(Lap) output; corrected in place to D_iD_j alpha */
    double* __restrict__ fxx, double* __restrict__ fxy,
    double* __restrict__ fxz, double* __restrict__ fyy,
    double* __restrict__ fyz, double* __restrict__ fzz,
    /* dtSfx_rhs..dtSfz_rhs = fderivs(chi) output, then overwritten */
    double* __restrict__ dtSfx_rhs, double* __restrict__ dtSfy_rhs,
    double* __restrict__ dtSfz_rhs,
    double* __restrict__ trK_rhs,
    double* __restrict__ Axx_rhs, double* __restrict__ Axy_rhs,
    double* __restrict__ Axz_rhs, double* __restrict__ Ayy_rhs,
    double* __restrict__ Ayz_rhs, double* __restrict__ Azz_rhs,
    double* __restrict__ Lap_rhs,
    double* __restrict__ betax_rhs, double* __restrict__ betay_rhs,
    double* __restrict__ betaz_rhs,
    /* Gam*_rhs: read under GAUGE == 0 — presumably filled by an earlier
     * phase; verify ordering in the host driver */
    double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs,
    double* __restrict__ Gamz_rhs,
    /* f_arr: scratch holding the trace part of the A_ij source.
     * NOTE(review): S_arr is never referenced in this kernel body. */
    double* __restrict__ f_arr, double* __restrict__ S_arr)
/*
 * Phase 15: assemble trK_rhs, Aij_rhs and the gauge right-hand sides.
 * One grid-stride loop over all d_gp.all points; 1-D launch, no shared
 * memory.  Order within an iteration matters: the Christoffel arrays and
 * fij are updated in place and then read by the expressions that follow.
 */
{
    const double TWO=2.0, FOUR=4.0, EIGHT=8.0, H=0.5;
    /* NOTE(review): F3o2 and F8 are declared but unused in this kernel */
    const double F1o3=1.0/3.0, F2o3=2.0/3.0, F3o2=1.5;
    const double PI_V=3.14159265358979323846;
    const double F16=16.0, F8=8.0;
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) {
        double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i];
        double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i];
        double a=alpn1[i], c1=chin1[i];
        double cx=chix[i],cy=chiy[i],cz=chiz[i];
        double lx=Lapx[i],ly=Lapy[i],lz=Lapz[i];

        /* raised chi gradient divided by chi: g^ij chi_,j / chi */
        double gx=(uxx*cx+uxy*cy+uxz*cz)/c1;
        double gy=(uxy*cx+uyy*cy+uyz*cz)/c1;
        double gz=(uxz*cx+uyz*cy+uzz*cz)/c1;

        /* Christoffel physical correction (in place):
         * Gamma^k_ij -= 1/2 * (delta^k_i chi_,j/chi + delta^k_j chi_,i/chi
         *                      - g_ij g^kl chi_,l/chi) */
        Gxxx[i]-=((cx+cx)/c1-gxx[i]*gx)*H;
        Gyxx[i]-=(0.0-gxx[i]*gy)*H;
        Gzxx[i]-=(0.0-gxx[i]*gz)*H;
        Gxyy[i]-=(0.0-gyy[i]*gx)*H;
        Gyyy[i]-=((cy+cy)/c1-gyy[i]*gy)*H;
        Gzyy[i]-=(0.0-gyy[i]*gz)*H;
        Gxzz[i]-=(0.0-gzz[i]*gx)*H;
        Gyzz[i]-=(0.0-gzz[i]*gy)*H;
        Gzzz[i]-=((cz+cz)/c1-gzz[i]*gz)*H;
        Gxxy[i]-=(cy/c1-gxy[i]*gx)*H;
        Gyxy[i]-=(cx/c1-gxy[i]*gy)*H;
        Gzxy[i]-=(0.0-gxy[i]*gz)*H;
        Gxxz[i]-=(cz/c1-gxz[i]*gx)*H;
        Gyxz[i]-=(0.0-gxz[i]*gy)*H;
        Gzxz[i]-=(cx/c1-gxz[i]*gz)*H;
        Gxyz_o[i]-=(0.0-gyz[i]*gx)*H;
        Gyyz[i]-=(cz/c1-gyz[i]*gy)*H;
        Gzyz[i]-=(cy/c1-gyz[i]*gz)*H;

        /* fij -> covariant second derivative of the lapse:
         * D_i D_j alpha = partial_i partial_j alpha - Gamma^k_ij alpha_,k
         * (uses the corrected Christoffels written just above) */
        fxx[i]-=Gxxx[i]*lx+Gyxx[i]*ly+Gzxx[i]*lz;
        fyy[i]-=Gxyy[i]*lx+Gyyy[i]*ly+Gzyy[i]*lz;
        fzz[i]-=Gxzz[i]*lx+Gyzz[i]*ly+Gzzz[i]*lz;
        fxy[i]-=Gxxy[i]*lx+Gyxy[i]*ly+Gzxy[i]*lz;
        fxz[i]-=Gxxz[i]*lx+Gyxz[i]*ly+Gzxz[i]*lz;
        fyz[i]-=Gxyz_o[i]*lx+Gyyz[i]*ly+Gzyz[i]*lz;

        /* D^i D_i alpha (conformal contraction) */
        double DDA = uxx*fxx[i]+uyy*fyy[i]+uzz*fzz[i]
                    +TWO*(uxy*fxy[i]+uxz*fxz[i]+uyz*fyz[i]);

        /* trace of S_ij (physical: conformal contraction scaled by chi) */
        double S_v = c1*(uxx*Sxx_m[i]+uyy*Syy_m[i]+uzz*Szz_m[i]
                    +TWO*(uxy*Sxy_m[i]+uxz*Sxz_m[i]+uyz*Syz_m[i]));

        /* A^ij A_ij — full double contraction with the inverse metric */
        double AijAij =
            uxx*(uxx*Axx[i]*Axx[i]+uyy*Axy[i]*Axy[i]+uzz*Axz[i]*Axz[i]
                +TWO*(uxy*Axx[i]*Axy[i]+uxz*Axx[i]*Axz[i]+uyz*Axy[i]*Axz[i]))
            +uyy*(uxx*Axy[i]*Axy[i]+uyy*Ayy[i]*Ayy[i]+uzz*Ayz[i]*Ayz[i]
                +TWO*(uxy*Axy[i]*Ayy[i]+uxz*Axy[i]*Ayz[i]+uyz*Ayy[i]*Ayz[i]))
            +uzz*(uxx*Axz[i]*Axz[i]+uyy*Ayz[i]*Ayz[i]+uzz*Azz[i]*Azz[i]
                +TWO*(uxy*Axz[i]*Ayz[i]+uxz*Axz[i]*Azz[i]+uyz*Ayz[i]*Azz[i]))
            +TWO*(
             uxy*(uxx*Axx[i]*Axy[i]+uyy*Axy[i]*Ayy[i]+uzz*Axz[i]*Ayz[i]
                +uxy*(Axx[i]*Ayy[i]+Axy[i]*Axy[i])
                +uxz*(Axx[i]*Ayz[i]+Axz[i]*Axy[i])
                +uyz*(Axy[i]*Ayz[i]+Axz[i]*Ayy[i]))
            +uxz*(uxx*Axx[i]*Axz[i]+uyy*Axy[i]*Ayz[i]+uzz*Axz[i]*Azz[i]
                +uxy*(Axx[i]*Ayz[i]+Axy[i]*Axz[i])
                +uxz*(Axx[i]*Azz[i]+Axz[i]*Axz[i])
                +uyz*(Axy[i]*Azz[i]+Axz[i]*Ayz[i]))
            +uyz*(uxx*Axy[i]*Axz[i]+uyy*Ayy[i]*Ayz[i]+uzz*Ayz[i]*Azz[i]
                +uxy*(Axy[i]*Ayz[i]+Ayy[i]*Axz[i])
                +uxz*(Axy[i]*Azz[i]+Ayz[i]*Axz[i])
                +uyz*(Ayy[i]*Azz[i]+Ayz[i]*Ayz[i])));

        double trK_v = trK[i];
        double db = div_beta[i];

        /* trK_rhs step 1: store D^iD_i alpha * chin1 (negated below) */
        trK_rhs[i] = c1 * DDA;

        /* f_arr = -(1/3) * (DDA + alpha/chi * (2/3*K^2 - AijAij - 16pi*rho + 8pi*S)) */
        double f_v = F2o3*trK_v*trK_v - AijAij - F16*PI_V*rho[i] + EIGHT*PI_V*S_v;
        f_arr[i] = -F1o3*(uxx*fxx[i]+uyy*fyy[i]+uzz*fzz[i]
                    +TWO*(uxy*fxy[i]+uxz*fxz[i]+uyz*fyz[i])
                    +(a/c1)*f_v);

        /* fij = alpha*(Rij - 8pi*Sij) - D_iD_j alpha */
        double fxx_v=a*(Rxx[i]-EIGHT*PI_V*Sxx_m[i])-fxx[i];
        double fxy_v=a*(Rxy[i]-EIGHT*PI_V*Sxy_m[i])-fxy[i];
        double fxz_v=a*(Rxz[i]-EIGHT*PI_V*Sxz_m[i])-fxz[i];
        double fyy_v=a*(Ryy[i]-EIGHT*PI_V*Syy_m[i])-fyy[i];
        double fyz_v=a*(Ryz[i]-EIGHT*PI_V*Syz_m[i])-fyz[i];
        double fzz_v=a*(Rzz[i]-EIGHT*PI_V*Szz_m[i])-fzz[i];

        /* Aij_rhs step 1: trace-free part fij - gij*f (chi factor applied
         * in the final assembly below) */
        Axx_rhs[i]=fxx_v-gxx[i]*f_arr[i];
        Ayy_rhs[i]=fyy_v-gyy[i]*f_arr[i];
        Azz_rhs[i]=fzz_v-gzz[i]*f_arr[i];
        Axy_rhs[i]=fxy_v-gxy[i]*f_arr[i];
        Axz_rhs[i]=fxz_v-gxz[i]*f_arr[i];
        Ayz_rhs[i]=fyz_v-gyz[i]*f_arr[i];

        /* A_il A^l_j (one raised index via gup) */
        double AA_xx=uxx*Axx[i]*Axx[i]+uyy*Axy[i]*Axy[i]+uzz*Axz[i]*Axz[i]
                    +TWO*(uxy*Axx[i]*Axy[i]+uxz*Axx[i]*Axz[i]+uyz*Axy[i]*Axz[i]);
        double AA_yy=uxx*Axy[i]*Axy[i]+uyy*Ayy[i]*Ayy[i]+uzz*Ayz[i]*Ayz[i]
                    +TWO*(uxy*Axy[i]*Ayy[i]+uxz*Axy[i]*Ayz[i]+uyz*Ayy[i]*Ayz[i]);
        double AA_zz=uxx*Axz[i]*Axz[i]+uyy*Ayz[i]*Ayz[i]+uzz*Azz[i]*Azz[i]
                    +TWO*(uxy*Axz[i]*Ayz[i]+uxz*Axz[i]*Azz[i]+uyz*Ayz[i]*Azz[i]);
        double AA_xy=uxx*Axx[i]*Axy[i]+uyy*Axy[i]*Ayy[i]+uzz*Axz[i]*Ayz[i]
                    +uxy*(Axx[i]*Ayy[i]+Axy[i]*Axy[i])
                    +uxz*(Axx[i]*Ayz[i]+Axz[i]*Axy[i])
                    +uyz*(Axy[i]*Ayz[i]+Axz[i]*Ayy[i]);
        double AA_xz=uxx*Axx[i]*Axz[i]+uyy*Axy[i]*Ayz[i]+uzz*Axz[i]*Azz[i]
                    +uxy*(Axx[i]*Ayz[i]+Axy[i]*Axz[i])
                    +uxz*(Axx[i]*Azz[i]+Axz[i]*Axz[i])
                    +uyz*(Axy[i]*Azz[i]+Axz[i]*Ayz[i]);
        double AA_yz=uxx*Axy[i]*Axz[i]+uyy*Ayy[i]*Ayz[i]+uzz*Ayz[i]*Azz[i]
                    +uxy*(Axy[i]*Ayz[i]+Ayy[i]*Axz[i])
                    +uxz*(Axy[i]*Azz[i]+Ayz[i]*Axz[i])
                    +uyz*(Ayy[i]*Azz[i]+Ayz[i]*Ayz[i]);

        /* trK_rhs final:
         * dK/dt = -D^iD_i alpha + alpha*(K^2/3 + A_ij A^ij + 4pi(rho+S))
         * (the sum uxx*AA_xx + ... + 2*(...) contracts A_il A^l_j with gup) */
        trK_rhs[i] = -trK_rhs[i]
            + a*(F1o3*trK_v*trK_v
                +uxx*AA_xx+uyy*AA_yy+uzz*AA_zz
                +TWO*(uxy*AA_xy+uxz*AA_xz+uyz*AA_yz)
                +FOUR*PI_V*(rho[i]+S_v));

        /* Aij_rhs final: chi*(trace-free source) + alpha*(K*A_ij - 2 A_il A^l_j)
         * + Lie-derivative advection terms in the shift derivatives.
         * Diagonal: 2*A_ik dbeta^k/dx_i - 2/3 A_ii div(beta).
         * Off-diagonal terms are the algebraically folded equivalent of
         * A_ik dbeta^k_j + A_jk dbeta^k_i - 2/3 A_ij div(beta). */
        Axx_rhs[i]=c1*Axx_rhs[i]+a*(trK_v*Axx[i]-TWO*AA_xx)
            +TWO*(Axx[i]*betaxx[i]+Axy[i]*betayx[i]+Axz[i]*betazx[i])-F2o3*Axx[i]*db;
        Ayy_rhs[i]=c1*Ayy_rhs[i]+a*(trK_v*Ayy[i]-TWO*AA_yy)
            +TWO*(Axy[i]*betaxy[i]+Ayy[i]*betayy[i]+Ayz[i]*betazy[i])-F2o3*Ayy[i]*db;
        Azz_rhs[i]=c1*Azz_rhs[i]+a*(trK_v*Azz[i]-TWO*AA_zz)
            +TWO*(Axz[i]*betaxz[i]+Ayz[i]*betayz[i]+Azz[i]*betazz[i])-F2o3*Azz[i]*db;
        Axy_rhs[i]=c1*Axy_rhs[i]+a*(trK_v*Axy[i]-TWO*AA_xy)
            +Axx[i]*betaxy[i]+Axz[i]*betazy[i]+Ayy[i]*betayx[i]
            +Ayz[i]*betazx[i]+F1o3*Axy[i]*db-Axy[i]*betazz[i];
        Ayz_rhs[i]=c1*Ayz_rhs[i]+a*(trK_v*Ayz[i]-TWO*AA_yz)
            +Axy[i]*betaxz[i]+Ayy[i]*betayz[i]+Axz[i]*betaxy[i]
            +Azz[i]*betazy[i]+F1o3*Ayz[i]*db-Ayz[i]*betaxx[i];
        Axz_rhs[i]=c1*Axz_rhs[i]+a*(trK_v*Axz[i]-TWO*AA_xz)
            +Axx[i]*betaxz[i]+Axy[i]*betayz[i]+Ayz[i]*betayx[i]
            +Azz[i]*betazx[i]+F1o3*Axz[i]*db-Axz[i]*betayy[i];

        /* gauge: 1+log lapse (d alpha/dt = -2 alpha K) and
         * Gamma-driver shift (d beta^i/dt = 3/4 B^i) */
        Lap_rhs[i] = -TWO*a*trK_v;
        betax_rhs[i] = 0.75*dtSfx[i];
        betay_rhs[i] = 0.75*dtSfy[i];
        betaz_rhs[i] = 0.75*dtSfz[i];
#if (GAUGE == 0)
        /* B^i driver: dB^i/dt = dGamma^i/dt - eta*B^i (eta = 2 here) */
        dtSfx_rhs[i] = Gamx_rhs[i] - 2.0*dtSfx[i];
        dtSfy_rhs[i] = Gamy_rhs[i] - 2.0*dtSfy[i];
        dtSfz_rhs[i] = Gamz_rhs[i] - 2.0*dtSfz[i];
#endif
    }
}
|
|
|
|
/* Phase 18: Hamilton & momentum constraints (co==0 only) */
|
|
__global__ __launch_bounds__(128, 4)
void kern_phase18_constraints(
    const double* __restrict__ chin1,
    /* first derivatives of chi */
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    /* conformal inverse metric */
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ trK,
    /* trace-free extrinsic curvature */
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    /* Ricci tensor (chi-corrected) */
    const double* __restrict__ Rxx, const double* __restrict__ Rxy,
    const double* __restrict__ Rxz, const double* __restrict__ Ryy,
    const double* __restrict__ Ryz, const double* __restrict__ Rzz,
    /* matter sources */
    const double* __restrict__ rho,
    const double* __restrict__ Sx_m, const double* __restrict__ Sy_m,
    const double* __restrict__ Sz_m,
    /* first derivatives of trK */
    const double* __restrict__ Kx, const double* __restrict__ Ky,
    const double* __restrict__ Kz,
    /* Christoffel symbols (read-only in this phase) */
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    /* dA/dx arrays (fderivs of Aij); dA{ij}_{k} = partial_k A_ij */
    const double* __restrict__ dAxx_x, const double* __restrict__ dAxx_y,
    const double* __restrict__ dAxx_z,
    const double* __restrict__ dAxy_x, const double* __restrict__ dAxy_y,
    const double* __restrict__ dAxy_z,
    const double* __restrict__ dAxz_x, const double* __restrict__ dAxz_y,
    const double* __restrict__ dAxz_z,
    const double* __restrict__ dAyy_x, const double* __restrict__ dAyy_y,
    const double* __restrict__ dAyy_z,
    const double* __restrict__ dAyz_x, const double* __restrict__ dAyz_y,
    const double* __restrict__ dAyz_z,
    const double* __restrict__ dAzz_x, const double* __restrict__ dAzz_y,
    const double* __restrict__ dAzz_z,
    /* outputs: constraint residuals */
    double* __restrict__ ham_Res,
    double* __restrict__ movx_Res, double* __restrict__ movy_Res,
    double* __restrict__ movz_Res)
/*
 * Phase 18: Hamiltonian and momentum constraint residuals (host calls
 * this only when co == 0).  One grid-stride loop over all d_gp.all
 * points; 1-D launch, no shared memory, purely pointwise.
 */
{
    const double TWO=2.0, F2o3=2.0/3.0, F8=8.0, F16=16.0;
    const double PI_V=3.14159265358979323846;
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all;
         i += blockDim.x*gridDim.x)
    {
        double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i];
        double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i];
        double c1=chin1[i];

        /* Hamiltonian constraint: scalar curvature R = gup^ij R_ij */
        double R_sc = uxx*Rxx[i]+uyy*Ryy[i]+uzz*Rzz[i]
                    +TWO*(uxy*Rxy[i]+uxz*Rxz[i]+uyz*Ryz[i]);
        /* AijAij = A_ij A^ij (same contraction as in phase15) */
        double AijAij =
            uxx*(uxx*Axx[i]*Axx[i]+uyy*Axy[i]*Axy[i]+uzz*Axz[i]*Axz[i]
                +TWO*(uxy*Axx[i]*Axy[i]+uxz*Axx[i]*Axz[i]+uyz*Axy[i]*Axz[i]))
            +uyy*(uxx*Axy[i]*Axy[i]+uyy*Ayy[i]*Ayy[i]+uzz*Ayz[i]*Ayz[i]
                +TWO*(uxy*Axy[i]*Ayy[i]+uxz*Axy[i]*Ayz[i]+uyz*Ayy[i]*Ayz[i]))
            +uzz*(uxx*Axz[i]*Axz[i]+uyy*Ayz[i]*Ayz[i]+uzz*Azz[i]*Azz[i]
                +TWO*(uxy*Axz[i]*Ayz[i]+uxz*Axz[i]*Azz[i]+uyz*Ayz[i]*Azz[i]))
            +TWO*(uxy*(uxx*Axx[i]*Axy[i]+uyy*Axy[i]*Ayy[i]+uzz*Axz[i]*Ayz[i]
                +uxy*(Axx[i]*Ayy[i]+Axy[i]*Axy[i])
                +uxz*(Axx[i]*Ayz[i]+Axz[i]*Axy[i])
                +uyz*(Axy[i]*Ayz[i]+Axz[i]*Ayy[i]))
            +uxz*(uxx*Axx[i]*Axz[i]+uyy*Axy[i]*Ayz[i]+uzz*Axz[i]*Azz[i]
                +uxy*(Axx[i]*Ayz[i]+Axy[i]*Axz[i])
                +uxz*(Axx[i]*Azz[i]+Axz[i]*Axz[i])
                +uyz*(Axy[i]*Azz[i]+Axz[i]*Ayz[i]))
            +uyz*(uxx*Axy[i]*Axz[i]+uyy*Ayy[i]*Ayz[i]+uzz*Ayz[i]*Azz[i]
                +uxy*(Axy[i]*Ayz[i]+Ayy[i]*Axz[i])
                +uxz*(Axy[i]*Azz[i]+Ayz[i]*Axz[i])
                +uyz*(Ayy[i]*Azz[i]+Ayz[i]*Ayz[i])));

        /* H = chi*R + 2/3 K^2 - A_ij A^ij - 16 pi rho */
        ham_Res[i] = c1*R_sc + F2o3*trK[i]*trK[i] - AijAij - F16*PI_V*rho[i];

        /* Momentum constraints: need covariant derivative of A */
        double cx=chix[i],cy=chiy[i],cz=chiz[i];
        /* m{k}_{ij} = partial_k A_ij - Gamma^l_ki A_lj - Gamma^l_kj A_il
         *           - chi_,k A_ij / chi
         * (for the diagonal pairs the two Gamma terms coincide, hence the
         * doubled identical sums below) */
        double mx_xx = dAxx_x[i]-(Gxxx[i]*Axx[i]+Gyxx[i]*Axy[i]+Gzxx[i]*Axz[i]
                    +Gxxx[i]*Axx[i]+Gyxx[i]*Axy[i]+Gzxx[i]*Axz[i])-cx*Axx[i]/c1;
        double mx_xy = dAxy_x[i]-(Gxxy[i]*Axx[i]+Gyxy[i]*Axy[i]+Gzxy[i]*Axz[i]
                    +Gxxx[i]*Axy[i]+Gyxx[i]*Ayy[i]+Gzxx[i]*Ayz[i])-cx*Axy[i]/c1;
        double mx_xz = dAxz_x[i]-(Gxxz[i]*Axx[i]+Gyxz[i]*Axy[i]+Gzxz[i]*Axz[i]
                    +Gxxx[i]*Axz[i]+Gyxx[i]*Ayz[i]+Gzxx[i]*Azz[i])-cx*Axz[i]/c1;
        double mx_yy = dAyy_x[i]-(Gxxy[i]*Axy[i]+Gyxy[i]*Ayy[i]+Gzxy[i]*Ayz[i]
                    +Gxxy[i]*Axy[i]+Gyxy[i]*Ayy[i]+Gzxy[i]*Ayz[i])-cx*Ayy[i]/c1;
        double mx_yz = dAyz_x[i]-(Gxxz[i]*Axy[i]+Gyxz[i]*Ayy[i]+Gzxz[i]*Ayz[i]
                    +Gxxy[i]*Axz[i]+Gyxy[i]*Ayz[i]+Gzxy[i]*Azz[i])-cx*Ayz[i]/c1;
        double mx_zz = dAzz_x[i]-(Gxxz[i]*Axz[i]+Gyxz[i]*Ayz[i]+Gzxz[i]*Azz[i]
                    +Gxxz[i]*Axz[i]+Gyxz[i]*Ayz[i]+Gzxz[i]*Azz[i])-cx*Azz[i]/c1;

        /* same, derivative direction y */
        double my_xx = dAxx_y[i]-(Gxxy[i]*Axx[i]+Gyxy[i]*Axy[i]+Gzxy[i]*Axz[i]
                    +Gxxy[i]*Axx[i]+Gyxy[i]*Axy[i]+Gzxy[i]*Axz[i])-cy*Axx[i]/c1;
        double my_xy = dAxy_y[i]-(Gxyy[i]*Axx[i]+Gyyy[i]*Axy[i]+Gzyy[i]*Axz[i]
                    +Gxxy[i]*Axy[i]+Gyxy[i]*Ayy[i]+Gzxy[i]*Ayz[i])-cy*Axy[i]/c1;
        double my_xz = dAxz_y[i]-(Gxyz[i]*Axx[i]+Gyyz[i]*Axy[i]+Gzyz[i]*Axz[i]
                    +Gxxy[i]*Axz[i]+Gyxy[i]*Ayz[i]+Gzxy[i]*Azz[i])-cy*Axz[i]/c1;
        double my_yy = dAyy_y[i]-(Gxyy[i]*Axy[i]+Gyyy[i]*Ayy[i]+Gzyy[i]*Ayz[i]
                    +Gxyy[i]*Axy[i]+Gyyy[i]*Ayy[i]+Gzyy[i]*Ayz[i])-cy*Ayy[i]/c1;
        double my_yz = dAyz_y[i]-(Gxyz[i]*Axy[i]+Gyyz[i]*Ayy[i]+Gzyz[i]*Ayz[i]
                    +Gxyy[i]*Axz[i]+Gyyy[i]*Ayz[i]+Gzyy[i]*Azz[i])-cy*Ayz[i]/c1;
        double my_zz = dAzz_y[i]-(Gxyz[i]*Axz[i]+Gyyz[i]*Ayz[i]+Gzyz[i]*Azz[i]
                    +Gxyz[i]*Axz[i]+Gyyz[i]*Ayz[i]+Gzyz[i]*Azz[i])-cy*Azz[i]/c1;

        /* same, derivative direction z */
        double mz_xx = dAxx_z[i]-(Gxxz[i]*Axx[i]+Gyxz[i]*Axy[i]+Gzxz[i]*Axz[i]
                    +Gxxz[i]*Axx[i]+Gyxz[i]*Axy[i]+Gzxz[i]*Axz[i])-cz*Axx[i]/c1;
        double mz_xy = dAxy_z[i]-(Gxyz[i]*Axx[i]+Gyyz[i]*Axy[i]+Gzyz[i]*Axz[i]
                    +Gxxz[i]*Axy[i]+Gyxz[i]*Ayy[i]+Gzxz[i]*Ayz[i])-cz*Axy[i]/c1;
        double mz_xz = dAxz_z[i]-(Gxzz[i]*Axx[i]+Gyzz[i]*Axy[i]+Gzzz[i]*Axz[i]
                    +Gxxz[i]*Axz[i]+Gyxz[i]*Ayz[i]+Gzxz[i]*Azz[i])-cz*Axz[i]/c1;
        double mz_yy = dAyy_z[i]-(Gxyz[i]*Axy[i]+Gyyz[i]*Ayy[i]+Gzyz[i]*Ayz[i]
                    +Gxyz[i]*Axy[i]+Gyyz[i]*Ayy[i]+Gzyz[i]*Ayz[i])-cz*Ayy[i]/c1;
        double mz_yz = dAyz_z[i]-(Gxzz[i]*Axy[i]+Gyzz[i]*Ayy[i]+Gzzz[i]*Ayz[i]
                    +Gxyz[i]*Axz[i]+Gyyz[i]*Ayz[i]+Gzyz[i]*Azz[i])-cz*Ayz[i]/c1;
        double mz_zz = dAzz_z[i]-(Gxzz[i]*Axz[i]+Gyzz[i]*Ayz[i]+Gzzz[i]*Azz[i]
                    +Gxzz[i]*Axz[i]+Gyzz[i]*Ayz[i]+Gzzz[i]*Azz[i])-cz*Azz[i]/c1;

        /* M^i = gup^jk m{j}_{ki} - 2/3 partial_i K - 8 pi S_i
         * (the three-row sums below spell out the symmetric gup^jk
         * contraction term by term) */
        movx_Res[i] = uxx*mx_xx+uyy*my_xy+uzz*mz_xz
            +uxy*mx_xy+uxz*mx_xz+uyz*my_xz
            +uxy*my_xx+uxz*mz_xx+uyz*mz_xy
            - F2o3*Kx[i] - F8*PI_V*Sx_m[i];
        movy_Res[i] = uxx*mx_xy+uyy*my_yy+uzz*mz_yz
            +uxy*mx_yy+uxz*mx_yz+uyz*my_yz
            +uxy*my_xy+uxz*mz_xy+uyz*mz_yy
            - F2o3*Ky[i] - F8*PI_V*Sy_m[i];
        movz_Res[i] = uxx*mx_xz+uyy*my_yz+uzz*mz_zz
            +uxy*mx_yz+uxz*mx_zz+uyz*my_zz
            +uxy*my_xz+uxz*mz_xz+uyz*mz_yz
            - F2o3*Kz[i] - F8*PI_V*Sz_m[i];
    }
}
|
|
|
|
/* ================================================================== */
|
|
/* Main host function — drop-in replacement for bssn_rhs_c.C */
|
|
/* ================================================================== */
|
|
|
|
extern "C"
|
|
int f_compute_rhs_bssn(int *ex, double &T,
|
|
double *X, double *Y, double *Z,
|
|
double *chi, double *trK,
|
|
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
|
|
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
|
|
double *Gamx, double *Gamy, double *Gamz,
|
|
double *Lap, double *betax, double *betay, double *betaz,
|
|
double *dtSfx, double *dtSfy, double *dtSfz,
|
|
double *chi_rhs, double *trK_rhs,
|
|
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs,
|
|
double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
|
|
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs,
|
|
double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
|
|
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
|
|
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
|
|
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
|
|
double *rho, double *Sx, double *Sy, double *Sz,
|
|
double *Sxx, double *Sxy_m, double *Sxz, double *Syy, double *Syz_m, double *Szz,
|
|
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy,
|
|
double *Gamxyz, double *Gamxzz,
|
|
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy,
|
|
double *Gamyyz, double *Gamyzz,
|
|
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy,
|
|
double *Gamzyz, double *Gamzzz,
|
|
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
|
|
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
|
|
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
|
|
int &Symmetry, int &Lev, double &eps, int &co)
|
|
{
|
|
const int nx = ex[0], ny = ex[1], nz = ex[2];
|
|
const int all = nx * ny * nz;
|
|
const double dX = X[1]-X[0], dY = Y[1]-Y[0], dZ = Z[1]-Z[0];
|
|
const int NO_SYMM = 0, EQ_SYMM = 1;
|
|
const double SYM = 1.0, ANTI = -1.0;
|
|
|
|
/* --- Allocate GPU buffers --- */
|
|
ensure_gpu_buffers(nx, ny, nz);
|
|
|
|
/* --- Setup GridParams --- */
|
|
GridParams gp;
|
|
gp.ex[0]=nx; gp.ex[1]=ny; gp.ex[2]=nz;
|
|
gp.all=all; gp.dX=dX; gp.dY=dY; gp.dZ=dZ;
|
|
gp.d12dx=1.0/(12.0*dX); gp.d12dy=1.0/(12.0*dY); gp.d12dz=1.0/(12.0*dZ);
|
|
gp.d2dx=1.0/(2.0*dX); gp.d2dy=1.0/(2.0*dY); gp.d2dz=1.0/(2.0*dZ);
|
|
gp.Fdxdx=1.0/(12.0*dX*dX); gp.Fdydy=1.0/(12.0*dY*dY); gp.Fdzdz=1.0/(12.0*dZ*dZ);
|
|
gp.Sdxdx=1.0/(dX*dX); gp.Sdydy=1.0/(dY*dY); gp.Sdzdz=1.0/(dZ*dZ);
|
|
gp.Fdxdy=1.0/(144.0*dX*dY); gp.Fdxdz=1.0/(144.0*dX*dZ); gp.Fdydz=1.0/(144.0*dY*dZ);
|
|
gp.Sdxdy=0.25/(dX*dY); gp.Sdxdz=0.25/(dX*dZ); gp.Sdydz=0.25/(dY*dZ);
|
|
gp.imaxF=nx; gp.jmaxF=ny; gp.kmaxF=nz;
|
|
gp.iminF=1; gp.jminF=1; gp.kminF=1;
|
|
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = -1;
|
|
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = -1;
|
|
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = -1;
|
|
gp.iminF3=1; gp.jminF3=1; gp.kminF3=1;
|
|
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = -2;
|
|
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = -2;
|
|
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = -2;
|
|
gp.Symmetry=Symmetry; gp.eps=eps; gp.co=co;
|
|
gp.fh2_nx=nx+2; gp.fh2_ny=ny+2; gp.fh2_nz=nz+2;
|
|
gp.fh3_nx=nx+3; gp.fh3_ny=ny+3; gp.fh3_nz=nz+3;
|
|
CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
|
|
|
|
/* --- Shorthand for device slot pointers --- */
|
|
#define D(s) g_buf.slot[s]
|
|
const size_t bytes = (size_t)all * sizeof(double);
|
|
|
|
/* --- H2D: copy all input arrays --- */
|
|
CUDA_CHECK(cudaMemcpy(D(S_chi), chi, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_trK), trK, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_dxx), dxx, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_gxy), gxy, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_gxz), gxz, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_dyy), dyy, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_gyz), gyz, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_dzz), dzz, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Axx), Axx, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Axy), Axy, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Axz), Axz, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Ayy), Ayy, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Ayz), Ayz, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Azz), Azz, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Gamx), Gamx, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Gamy), Gamy, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Gamz), Gamz, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Lap), Lap, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_betax), betax, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_betay), betay, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_betaz), betaz, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_dtSfx), dtSfx, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_dtSfy), dtSfy, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_dtSfz), dtSfz, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_rho), rho, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Sx), Sx, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Sy), Sy, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Sz), Sz, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Sxx), Sxx, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Sxy), Sxy_m, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Sxz), Sxz, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Syy), Syy, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Syz), Syz_m, bytes, cudaMemcpyHostToDevice));
|
|
CUDA_CHECK(cudaMemcpy(D(S_Szz), Szz, bytes, cudaMemcpyHostToDevice));
|
|
|
|
/* ============================================================ */
|
|
/* Phase 1: prep — alpn1, chin1, gxx, gyy, gzz */
|
|
/* ============================================================ */
|
|
kern_phase1_prep<<<grid(all),BLK>>>(
|
|
D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz),
|
|
D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz));
|
|
|
|
/* 12x fderivs */
|
|
gpu_fderivs(D(S_betax), D(S_betaxx),D(S_betaxy),D(S_betaxz), ANTI,SYM,SYM, all);
|
|
gpu_fderivs(D(S_betay), D(S_betayx),D(S_betayy),D(S_betayz), SYM,ANTI,SYM, all);
|
|
gpu_fderivs(D(S_betaz), D(S_betazx),D(S_betazy),D(S_betazz), SYM,SYM,ANTI, all);
|
|
gpu_fderivs(D(S_chi), D(S_chix),D(S_chiy),D(S_chiz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_dxx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_gxy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all);
|
|
gpu_fderivs(D(S_gxz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all);
|
|
gpu_fderivs(D(S_dyy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_gyz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all);
|
|
gpu_fderivs(D(S_dzz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_Lap), D(S_Lapx),D(S_Lapy),D(S_Lapz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_trK), D(S_Kx),D(S_Ky),D(S_Kz), SYM,SYM,SYM, all);
|
|
|
|
/* ============================================================ */
|
|
/* Phase 2: metric RHS + inverse */
|
|
/* ============================================================ */
|
|
kern_phase2_metric_rhs<<<grid(all),BLK>>>(
|
|
D(S_alpn1), D(S_chin1),
|
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
|
D(S_trK),
|
|
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
|
|
D(S_betaxx), D(S_betaxy), D(S_betaxz),
|
|
D(S_betayx), D(S_betayy), D(S_betayz),
|
|
D(S_betazx), D(S_betazy), D(S_betazz),
|
|
D(S_div_beta),
|
|
D(S_chi_rhs), D(S_gxx_rhs), D(S_gyy_rhs), D(S_gzz_rhs),
|
|
D(S_gxy_rhs), D(S_gyz_rhs), D(S_gxz_rhs));
|
|
|
|
kern_phase2_inverse<<<grid(all),BLK>>>(
|
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz));
|
|
|
|
/* Phase 3: Gamma constraint (co==0) */
|
|
if (co == 0) {
|
|
kern_phase3_gamma_constraint<<<grid(all),BLK>>>(
|
|
D(S_Gamx), D(S_Gamy), D(S_Gamz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
|
|
D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
|
|
D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
|
|
D(S_Gmx_Res), D(S_Gmy_Res), D(S_Gmz_Res));
|
|
}
|
|
|
|
/* Phase 4: Christoffel symbols */
|
|
kern_phase4_christoffel<<<grid(all),BLK>>>(
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
|
|
D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
|
|
D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz));
|
|
|
|
/* Phase 5: Raise A index (stored in Rxx..Rzz temporarily) */
|
|
kern_phase5_raise_A<<<grid(all),BLK>>>(
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
|
|
D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz));
|
|
|
|
/* Phase 6: Gamma_rhs part 1 */
|
|
kern_phase6_gamma_rhs_part1<<<grid(all),BLK>>>(
|
|
D(S_Lapx), D(S_Lapy), D(S_Lapz),
|
|
D(S_alpn1), D(S_chin1),
|
|
D(S_chix), D(S_chiy), D(S_chiz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_Kx), D(S_Ky), D(S_Kz),
|
|
D(S_Sx), D(S_Sy), D(S_Sz),
|
|
D(S_Rxx), D(S_Rxy), D(S_Rxz),
|
|
D(S_Ryy), D(S_Ryz), D(S_Rzz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
|
D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs));
|
|
|
|
/* Phase 7: fdderivs(beta) + fderivs(Gamma) */
|
|
gpu_fdderivs(D(S_betax), D(S_gxxx),D(S_gxyx),D(S_gxzx),
|
|
D(S_gyyx),D(S_gyzx),D(S_gzzx), ANTI,SYM,SYM, all);
|
|
gpu_fdderivs(D(S_betay), D(S_gxxy),D(S_gxyy),D(S_gxzy),
|
|
D(S_gyyy),D(S_gyzy),D(S_gzzy), SYM,ANTI,SYM, all);
|
|
gpu_fdderivs(D(S_betaz), D(S_gxxz),D(S_gxyz),D(S_gxzz),
|
|
D(S_gyyz),D(S_gyzz),D(S_gzzz), SYM,SYM,ANTI, all);
|
|
gpu_fderivs(D(S_Gamx), D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), ANTI,SYM,SYM, all);
|
|
gpu_fderivs(D(S_Gamy), D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), SYM,ANTI,SYM, all);
|
|
gpu_fderivs(D(S_Gamz), D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), SYM,SYM,ANTI, all);
|
|
|
|
/* Phase 8: Gamma_rhs part 2 */
|
|
kern_phase8_gamma_rhs_part2<<<grid(all),BLK>>>(
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_div_beta),
|
|
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
|
|
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
|
|
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
|
|
D(S_Gamxx),D(S_Gamxy),D(S_Gamxz),
|
|
D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t),
|
|
D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t),
|
|
D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
|
|
D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
|
|
D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
|
|
D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
|
|
D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
|
|
D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
|
|
D(S_betaxx),D(S_betaxy),D(S_betaxz),
|
|
D(S_betayx),D(S_betayy),D(S_betayz),
|
|
D(S_betazx),D(S_betazy),D(S_betazz),
|
|
D(S_Gamx_rhs),D(S_Gamy_rhs),D(S_Gamz_rhs),
|
|
D(S_Gamxa),D(S_Gamya),D(S_Gamza));
|
|
|
|
/* Phase 9: Christoffel contract (lowered products for Ricci) */
|
|
kern_phase9_christoffel_contract<<<grid(all),BLK>>>(
|
|
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
|
|
D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
|
|
D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
|
|
D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
|
|
D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
|
|
D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
|
|
D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
|
|
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
|
|
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
|
|
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
|
|
|
|
/* Phase 10: 6x fdderivs(metric) + Ricci contract */
|
|
gpu_fdderivs(D(S_dxx), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
|
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
|
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxx));
|
|
|
|
gpu_fdderivs(D(S_dyy), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
|
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
|
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryy));
|
|
|
|
gpu_fdderivs(D(S_dzz), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
|
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
|
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rzz));
|
|
|
|
gpu_fdderivs(D(S_gxy), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,ANTI,SYM, all);
|
|
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
|
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxy));
|
|
|
|
gpu_fdderivs(D(S_gxz), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,SYM,ANTI, all);
|
|
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
|
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxz));
|
|
|
|
gpu_fdderivs(D(S_gyz), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
D(S_fyy),D(S_fyz),D(S_fzz), SYM,ANTI,ANTI, all);
|
|
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
|
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryz));
|
|
|
|
/* Phase 11: Ricci assembly (diagonal + off-diagonal) */
|
|
kern_phase11_ricci_diag<<<grid(all),BLK>>>(
|
|
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
|
|
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
D(S_Gamxa),D(S_Gamya),D(S_Gamza),
|
|
D(S_Gamxx),D(S_Gamxy),D(S_Gamxz),
|
|
D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t),
|
|
D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t),
|
|
D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
|
|
D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
|
|
D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
|
|
D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
|
|
D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
|
|
D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
|
|
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
|
|
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
|
|
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
|
|
D(S_Rxx),D(S_Ryy),D(S_Rzz));
|
|
|
|
kern_phase11_ricci_offdiag<<<grid(all),BLK>>>(
|
|
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
|
|
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
D(S_Gamxa),D(S_Gamya),D(S_Gamza),
|
|
D(S_Gamxx),D(S_Gamxy),D(S_Gamxz),
|
|
D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t),
|
|
D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t),
|
|
D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
|
|
D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
|
|
D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
|
|
D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
|
|
D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
|
|
D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
|
|
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
|
|
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
|
|
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
|
|
D(S_Rxy),D(S_Rxz),D(S_Ryz));
|
|
|
|
/* ============================================================ */
|
|
/* Phase 12: fdderivs(chi) */
|
|
/* ============================================================ */
|
|
gpu_fdderivs(D(S_chi), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
|
|
|
/* ============================================================ */
|
|
/* Phase 13: chi correction to Ricci */
|
|
/* ============================================================ */
|
|
kern_phase13_chi_correction<<<grid(all),BLK>>>(
|
|
D(S_chin1),
|
|
D(S_chix), D(S_chiy), D(S_chiz),
|
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
|
D(S_fxx), D(S_fxy), D(S_fxz),
|
|
D(S_fyy), D(S_fyz), D(S_fzz),
|
|
D(S_Rxx), D(S_Rxy), D(S_Rxz),
|
|
D(S_Ryy), D(S_Ryz), D(S_Rzz));
|
|
|
|
/* ============================================================ */
|
|
/* Phase 14: fdderivs(Lap) + fderivs(chi) */
|
|
/* ============================================================ */
|
|
gpu_fdderivs(D(S_Lap), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_chi), D(S_dtSfx_rhs),D(S_dtSfy_rhs),D(S_dtSfz_rhs),
|
|
SYM,SYM,SYM, all);
|
|
|
|
/* ============================================================ */
|
|
/* Phase 15: trK_rhs, Aij_rhs, gauge */
|
|
/* ============================================================ */
|
|
kern_phase15_trK_Aij_gauge<<<grid(all),BLK>>>(
|
|
D(S_alpn1), D(S_chin1),
|
|
D(S_chix), D(S_chiy), D(S_chiz),
|
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_trK),
|
|
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
|
|
D(S_Lapx), D(S_Lapy), D(S_Lapz),
|
|
D(S_div_beta),
|
|
D(S_betaxx), D(S_betaxy), D(S_betaxz),
|
|
D(S_betayx), D(S_betayy), D(S_betayz),
|
|
D(S_betazx), D(S_betazy), D(S_betazz),
|
|
D(S_rho),
|
|
D(S_Sx), D(S_Sy), D(S_Sz),
|
|
D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz),
|
|
D(S_dtSfx), D(S_dtSfy), D(S_dtSfz),
|
|
D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
|
D(S_fxx), D(S_fxy), D(S_fxz),
|
|
D(S_fyy), D(S_fyz), D(S_fzz),
|
|
D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs),
|
|
D(S_trK_rhs),
|
|
D(S_Axx_rhs), D(S_Axy_rhs), D(S_Axz_rhs),
|
|
D(S_Ayy_rhs), D(S_Ayz_rhs), D(S_Azz_rhs),
|
|
D(S_Lap_rhs),
|
|
D(S_betax_rhs), D(S_betay_rhs), D(S_betaz_rhs),
|
|
D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs),
|
|
D(S_f_arr), D(S_S_arr));
|
|
|
|
/* ============================================================ */
|
|
/* Phase 16: 23x lopsided (advection) */
|
|
/* ============================================================ */
|
|
gpu_lopsided(D(S_gxx), D(S_gxx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
|
gpu_lopsided(D(S_Gamz), D(S_Gamz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, all);
|
|
gpu_lopsided(D(S_gxy), D(S_gxy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, all);
|
|
gpu_lopsided(D(S_Lap), D(S_Lap_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
|
gpu_lopsided(D(S_gxz), D(S_gxz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, all);
|
|
gpu_lopsided(D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, all);
|
|
gpu_lopsided(D(S_gyy), D(S_gyy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
|
gpu_lopsided(D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, all);
|
|
gpu_lopsided(D(S_gyz), D(S_gyz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, all);
|
|
gpu_lopsided(D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, all);
|
|
gpu_lopsided(D(S_gzz), D(S_gzz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
|
gpu_lopsided(D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, all);
|
|
gpu_lopsided(D(S_Axx), D(S_Axx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
|
gpu_lopsided(D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, all);
|
|
gpu_lopsided(D(S_Axy), D(S_Axy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, all);
|
|
gpu_lopsided(D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, all);
|
|
gpu_lopsided(D(S_Axz), D(S_Axz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, all);
|
|
gpu_lopsided(D(S_Ayy), D(S_Ayy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
|
gpu_lopsided(D(S_Ayz), D(S_Ayz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, all);
|
|
gpu_lopsided(D(S_Azz), D(S_Azz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
|
gpu_lopsided(D(S_chi), D(S_chi_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
|
gpu_lopsided(D(S_trK), D(S_trK_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
|
gpu_lopsided(D(S_Gamx), D(S_Gamx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, all);
|
|
gpu_lopsided(D(S_Gamy), D(S_Gamy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, all);
|
|
|
|
/* ============================================================ */
|
|
/* Phase 17: 24x KO dissipation (eps > 0) */
|
|
/* ============================================================ */
|
|
if (eps > 0) {
|
|
gpu_kodis(D(S_chi), D(S_chi_rhs), SYM,SYM,SYM, eps, all);
|
|
gpu_kodis(D(S_trK), D(S_trK_rhs), SYM,SYM,SYM, eps, all);
|
|
gpu_kodis(D(S_dxx), D(S_gxx_rhs), SYM,SYM,SYM, eps, all);
|
|
gpu_kodis(D(S_gxy), D(S_gxy_rhs), ANTI,ANTI,SYM, eps, all);
|
|
gpu_kodis(D(S_gxz), D(S_gxz_rhs), ANTI,SYM,ANTI, eps, all);
|
|
gpu_kodis(D(S_dyy), D(S_gyy_rhs), SYM,SYM,SYM, eps, all);
|
|
gpu_kodis(D(S_gyz), D(S_gyz_rhs), SYM,ANTI,ANTI, eps, all);
|
|
gpu_kodis(D(S_dzz), D(S_gzz_rhs), SYM,SYM,SYM, eps, all);
|
|
gpu_kodis(D(S_Axx), D(S_Axx_rhs), SYM,SYM,SYM, eps, all);
|
|
gpu_kodis(D(S_dtSfz), D(S_dtSfz_rhs), SYM,SYM,ANTI, eps, all);
|
|
gpu_kodis(D(S_Axy), D(S_Axy_rhs), ANTI,ANTI,SYM, eps, all);
|
|
gpu_kodis(D(S_dtSfy), D(S_dtSfy_rhs), SYM,ANTI,SYM, eps, all);
|
|
gpu_kodis(D(S_Axz), D(S_Axz_rhs), ANTI,SYM,ANTI, eps, all);
|
|
gpu_kodis(D(S_dtSfx), D(S_dtSfx_rhs), ANTI,SYM,SYM, eps, all);
|
|
gpu_kodis(D(S_Ayy), D(S_Ayy_rhs), SYM,SYM,SYM, eps, all);
|
|
gpu_kodis(D(S_betaz), D(S_betaz_rhs), SYM,SYM,ANTI, eps, all);
|
|
gpu_kodis(D(S_Ayz), D(S_Ayz_rhs), SYM,ANTI,ANTI, eps, all);
|
|
gpu_kodis(D(S_betay), D(S_betay_rhs), SYM,ANTI,SYM, eps, all);
|
|
gpu_kodis(D(S_Azz), D(S_Azz_rhs), SYM,SYM,SYM, eps, all);
|
|
gpu_kodis(D(S_betax), D(S_betax_rhs), ANTI,SYM,SYM, eps, all);
|
|
gpu_kodis(D(S_Gamx), D(S_Gamx_rhs), ANTI,SYM,SYM, eps, all);
|
|
gpu_kodis(D(S_Lap), D(S_Lap_rhs), SYM,SYM,SYM, eps, all);
|
|
gpu_kodis(D(S_Gamy), D(S_Gamy_rhs), SYM,ANTI,SYM, eps, all);
|
|
gpu_kodis(D(S_Gamz), D(S_Gamz_rhs), SYM,SYM,ANTI, eps, all);
|
|
}
|
|
|
|
/* ============================================================ */
|
|
/* Phase 18: Hamilton & momentum constraints (co==0) */
|
|
/* ============================================================ */
|
|
if (co == 0) {
|
|
/* 6x fderivs on Aij — reuse gxxx..gzzz slots for dA/dx output */
|
|
gpu_fderivs(D(S_Axx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_Axy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all);
|
|
gpu_fderivs(D(S_Axz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all);
|
|
gpu_fderivs(D(S_Ayy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_Ayz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all);
|
|
gpu_fderivs(D(S_Azz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all);
|
|
|
|
kern_phase18_constraints<<<grid(all),BLK>>>(
|
|
D(S_chin1),
|
|
D(S_chix), D(S_chiy), D(S_chiz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_trK),
|
|
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
|
|
D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
|
|
D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz),
|
|
D(S_Kx), D(S_Ky), D(S_Kz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
|
/* dA/dx arrays */
|
|
D(S_gxxx), D(S_gxxy), D(S_gxxz),
|
|
D(S_gxyx), D(S_gxyy), D(S_gxyz),
|
|
D(S_gxzx), D(S_gxzy), D(S_gxzz),
|
|
D(S_gyyx), D(S_gyyy), D(S_gyyz),
|
|
D(S_gyzx), D(S_gyzy), D(S_gyzz),
|
|
D(S_gzzx), D(S_gzzy), D(S_gzzz),
|
|
D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res));
|
|
}
|
|
|
|
/* ============================================================ */
|
|
/* D2H: copy all output arrays back to host */
|
|
/* ============================================================ */
|
|
CUDA_CHECK(cudaMemcpy(chi_rhs, D(S_chi_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(trK_rhs, D(S_trK_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(gxx_rhs, D(S_gxx_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(gxy_rhs, D(S_gxy_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(gxz_rhs, D(S_gxz_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(gyy_rhs, D(S_gyy_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(gyz_rhs, D(S_gyz_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(gzz_rhs, D(S_gzz_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Axx_rhs, D(S_Axx_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Axy_rhs, D(S_Axy_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Axz_rhs, D(S_Axz_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Ayy_rhs, D(S_Ayy_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Ayz_rhs, D(S_Ayz_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Azz_rhs, D(S_Azz_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamx_rhs, D(S_Gamx_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamy_rhs, D(S_Gamy_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamz_rhs, D(S_Gamz_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Lap_rhs, D(S_Lap_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(betax_rhs, D(S_betax_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(betay_rhs, D(S_betay_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(betaz_rhs, D(S_betaz_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(dtSfx_rhs, D(S_dtSfx_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(dtSfy_rhs, D(S_dtSfy_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(dtSfz_rhs, D(S_dtSfz_rhs), bytes, cudaMemcpyDeviceToHost));
|
|
|
|
/* Christoffel symbols */
|
|
CUDA_CHECK(cudaMemcpy(Gamxxx, D(S_Gamxxx), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamxxy, D(S_Gamxxy), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamxxz, D(S_Gamxxz), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamxyy, D(S_Gamxyy), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamxyz, D(S_Gamxyz), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamxzz, D(S_Gamxzz), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamyxx, D(S_Gamyxx), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamyxy, D(S_Gamyxy), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamyxz, D(S_Gamyxz), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamyyy, D(S_Gamyyy), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamyyz, D(S_Gamyyz), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamyzz, D(S_Gamyzz), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamzxx, D(S_Gamzxx), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamzxy, D(S_Gamzxy), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamzxz, D(S_Gamzxz), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamzyy, D(S_Gamzyy), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamzyz, D(S_Gamzyz), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gamzzz, D(S_Gamzzz), bytes, cudaMemcpyDeviceToHost));
|
|
|
|
/* Ricci tensor */
|
|
CUDA_CHECK(cudaMemcpy(Rxx, D(S_Rxx), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Rxy, D(S_Rxy), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Rxz, D(S_Rxz), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Ryy, D(S_Ryy), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Ryz, D(S_Ryz), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Rzz, D(S_Rzz), bytes, cudaMemcpyDeviceToHost));
|
|
|
|
/* Constraint residuals (only meaningful when co==0) */
|
|
if (co == 0) {
|
|
CUDA_CHECK(cudaMemcpy(ham_Res, D(S_ham_Res), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(movx_Res, D(S_movx_Res), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(movy_Res, D(S_movy_Res), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(movz_Res, D(S_movz_Res), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gmx_Res, D(S_Gmx_Res), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gmy_Res, D(S_Gmy_Res), bytes, cudaMemcpyDeviceToHost));
|
|
CUDA_CHECK(cudaMemcpy(Gmz_Res, D(S_Gmz_Res), bytes, cudaMemcpyDeviceToHost));
|
|
}
|
|
|
|
#undef D
|
|
return 0;
|
|
} |