add fused symmetry packing kernels for orders 2 and 3 in BSSN RHS
This commit is contained in:
@@ -274,10 +274,10 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
|
||||
/* A. Symmetry boundary kernels (ord=2 and ord=3) */
|
||||
/* ================================================================== */
|
||||
|
||||
/* Step 1: Copy interior into ghost-padded array */
|
||||
__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
|
||||
double * __restrict__ fh,
|
||||
double SoA0, double SoA1, double SoA2)
|
||||
/* Step 1: Copy interior into ghost-padded array */
|
||||
__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
|
||||
double * __restrict__ fh,
|
||||
double SoA0, double SoA1, double SoA2)
|
||||
{
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny;
|
||||
@@ -290,12 +290,47 @@ __global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
|
||||
int k0 = tid / (nx * ny);
|
||||
int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
|
||||
fh[(iF+1) + (jF+1)*fnx + (kF+1)*fnx*fny] = func[tid];
|
||||
}
|
||||
}
|
||||
|
||||
/* Step 2: Fill i-ghosts (x-direction symmetry) */
|
||||
__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
/* Fused symmetry pack (ord=2): fill full fh from interior func in one pass. */
|
||||
__global__ void kern_symbd_pack_ord2(const double * __restrict__ func,
|
||||
double * __restrict__ fh,
|
||||
double SoA0, double SoA1, double SoA2)
|
||||
{
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1];
|
||||
const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny, fnz = d_gp.fh2_nz;
|
||||
const int total = fnx * fny * fnz;
|
||||
|
||||
for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
tid < total;
|
||||
tid += blockDim.x * gridDim.x)
|
||||
{
|
||||
int ii = tid % fnx;
|
||||
int jj = (tid / fnx) % fny;
|
||||
int kk = tid / (fnx * fny);
|
||||
|
||||
int iF = ii - 1; /* -1 .. nx */
|
||||
int jF = jj - 1; /* -1 .. ny */
|
||||
int kF = kk - 1; /* -1 .. nz */
|
||||
|
||||
int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */
|
||||
int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */
|
||||
int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */
|
||||
|
||||
double sign = 1.0;
|
||||
if (iF <= 0) sign *= SoA0;
|
||||
if (jF <= 0) sign *= SoA1;
|
||||
if (kF <= 0) sign *= SoA2;
|
||||
|
||||
int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny;
|
||||
fh[tid] = sign * func[src];
|
||||
}
|
||||
}
|
||||
|
||||
/* Step 2: Fill i-ghosts (x-direction symmetry) */
|
||||
__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0)
|
||||
{
|
||||
const int ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny;
|
||||
/* ord=2: fill iF=0 and iF=-1, i.e. ghost layers ii=0 from ii=2, ii=1 from ii=1 */
|
||||
@@ -369,9 +404,9 @@ __global__ void kern_symbd_kghost_ord2(double * __restrict__ fh, double SoA2)
|
||||
|
||||
/* ---- ord=3 variants (for lopsided / kodis) ---- */
|
||||
|
||||
__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
|
||||
double * __restrict__ fh)
|
||||
{
|
||||
__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
|
||||
double * __restrict__ fh)
|
||||
{
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny;
|
||||
for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
@@ -383,11 +418,46 @@ __global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
|
||||
int k0 = tid / (nx * ny);
|
||||
int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
|
||||
fh[(iF+2) + (jF+2)*fnx + (kF+2)*fnx*fny] = func[tid];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
/* Fused symmetry pack (ord=3): fill full fh from interior func in one pass. */
|
||||
__global__ void kern_symbd_pack_ord3(const double * __restrict__ func,
|
||||
double * __restrict__ fh,
|
||||
double SoA0, double SoA1, double SoA2)
|
||||
{
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1];
|
||||
const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny, fnz = d_gp.fh3_nz;
|
||||
const int total = fnx * fny * fnz;
|
||||
|
||||
for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
tid < total;
|
||||
tid += blockDim.x * gridDim.x)
|
||||
{
|
||||
int ii = tid % fnx;
|
||||
int jj = (tid / fnx) % fny;
|
||||
int kk = tid / (fnx * fny);
|
||||
|
||||
int iF = ii - 2; /* -2 .. nx */
|
||||
int jF = jj - 2; /* -2 .. ny */
|
||||
int kF = kk - 2; /* -2 .. nz */
|
||||
|
||||
int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */
|
||||
int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */
|
||||
int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */
|
||||
|
||||
double sign = 1.0;
|
||||
if (iF <= 0) sign *= SoA0;
|
||||
if (jF <= 0) sign *= SoA1;
|
||||
if (kF <= 0) sign *= SoA2;
|
||||
|
||||
int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny;
|
||||
fh[tid] = sign * func[src];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0)
|
||||
{
|
||||
const int ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny;
|
||||
int total = ny * nz;
|
||||
@@ -813,21 +883,16 @@ static inline int grid(size_t n) {
|
||||
}
|
||||
|
||||
/* symmetry_bd on GPU for ord=2, then launch fderivs kernel */
|
||||
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
|
||||
double SoA0, double SoA1, double SoA2, int all)
|
||||
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
|
||||
double SoA0, double SoA1, double SoA2, int all)
|
||||
{
|
||||
double *fh = g_buf.d_fh2;
|
||||
const size_t nx = (size_t)g_buf.prev_nx;
|
||||
const size_t ny = (size_t)g_buf.prev_ny;
|
||||
const size_t nz = (size_t)g_buf.prev_nz;
|
||||
const size_t w_ighost = 2ull * ny * nz;
|
||||
const size_t w_jghost = 2ull * (nx + 2ull) * nz;
|
||||
const size_t w_kghost = 2ull * (nx + 2ull) * (ny + 2ull);
|
||||
const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull);
|
||||
|
||||
kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||
kern_symbd_ighost_ord2<<<grid(w_ighost), BLK>>>(fh, SoA0);
|
||||
kern_symbd_jghost_ord2<<<grid(w_jghost), BLK>>>(fh, SoA1);
|
||||
kern_symbd_kghost_ord2<<<grid(w_kghost), BLK>>>(fh, SoA2);
|
||||
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||
kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
|
||||
}
|
||||
|
||||
@@ -841,14 +906,9 @@ static void gpu_fdderivs(double *d_f,
|
||||
const size_t nx = (size_t)g_buf.prev_nx;
|
||||
const size_t ny = (size_t)g_buf.prev_ny;
|
||||
const size_t nz = (size_t)g_buf.prev_nz;
|
||||
const size_t w_ighost = 2ull * ny * nz;
|
||||
const size_t w_jghost = 2ull * (nx + 2ull) * nz;
|
||||
const size_t w_kghost = 2ull * (nx + 2ull) * (ny + 2ull);
|
||||
const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull);
|
||||
|
||||
kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||
kern_symbd_ighost_ord2<<<grid(w_ighost), BLK>>>(fh, SoA0);
|
||||
kern_symbd_jghost_ord2<<<grid(w_jghost), BLK>>>(fh, SoA1);
|
||||
kern_symbd_kghost_ord2<<<grid(w_kghost), BLK>>>(fh, SoA2);
|
||||
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||
kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
|
||||
}
|
||||
|
||||
@@ -861,14 +921,9 @@ static void gpu_lopsided(double *d_f, double *d_f_rhs,
|
||||
const size_t nx = (size_t)g_buf.prev_nx;
|
||||
const size_t ny = (size_t)g_buf.prev_ny;
|
||||
const size_t nz = (size_t)g_buf.prev_nz;
|
||||
const size_t w_ighost = 3ull * ny * nz;
|
||||
const size_t w_jghost = 3ull * (nx + 3ull) * nz;
|
||||
const size_t w_kghost = 3ull * (nx + 3ull) * (ny + 3ull);
|
||||
const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull);
|
||||
|
||||
kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
|
||||
kern_symbd_ighost_ord3<<<grid(w_ighost), BLK>>>(fh, SoA0);
|
||||
kern_symbd_jghost_ord3<<<grid(w_jghost), BLK>>>(fh, SoA1);
|
||||
kern_symbd_kghost_ord3<<<grid(w_kghost), BLK>>>(fh, SoA2);
|
||||
kern_symbd_pack_ord3<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||
kern_lopsided<<<grid(all), BLK>>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz);
|
||||
}
|
||||
|
||||
@@ -881,14 +936,9 @@ static void gpu_kodis(double *d_f, double *d_f_rhs,
|
||||
const size_t nx = (size_t)g_buf.prev_nx;
|
||||
const size_t ny = (size_t)g_buf.prev_ny;
|
||||
const size_t nz = (size_t)g_buf.prev_nz;
|
||||
const size_t w_ighost = 3ull * ny * nz;
|
||||
const size_t w_jghost = 3ull * (nx + 3ull) * nz;
|
||||
const size_t w_kghost = 3ull * (nx + 3ull) * (ny + 3ull);
|
||||
const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull);
|
||||
|
||||
kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
|
||||
kern_symbd_ighost_ord3<<<grid(w_ighost), BLK>>>(fh, SoA0);
|
||||
kern_symbd_jghost_ord3<<<grid(w_jghost), BLK>>>(fh, SoA1);
|
||||
kern_symbd_kghost_ord3<<<grid(w_kghost), BLK>>>(fh, SoA2);
|
||||
kern_symbd_pack_ord3<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||
kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user