add fused symmetry packing kernels for orders 2 and 3 in BSSN RHS

This commit is contained in:
2026-02-28 15:35:14 +08:00
parent 94f40627aa
commit abf2f640e4

View File

@@ -293,6 +293,41 @@ __global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
}
}
/* Fused symmetry pack (ord=2): fill full fh from interior func in one pass. */
__global__ void kern_symbd_pack_ord2(const double * __restrict__ func,
double * __restrict__ fh,
double SoA0, double SoA1, double SoA2)
{
const int nx = d_gp.ex[0], ny = d_gp.ex[1];
const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny, fnz = d_gp.fh2_nz;
const int total = fnx * fny * fnz;
for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
tid < total;
tid += blockDim.x * gridDim.x)
{
int ii = tid % fnx;
int jj = (tid / fnx) % fny;
int kk = tid / (fnx * fny);
int iF = ii - 1; /* -1 .. nx */
int jF = jj - 1; /* -1 .. ny */
int kF = kk - 1; /* -1 .. nz */
int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */
int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */
int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */
double sign = 1.0;
if (iF <= 0) sign *= SoA0;
if (jF <= 0) sign *= SoA1;
if (kF <= 0) sign *= SoA2;
int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny;
fh[tid] = sign * func[src];
}
}
/* Step 2: Fill i-ghosts (x-direction symmetry) */
__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0)
{
@@ -386,6 +421,41 @@ __global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
}
}
/* Fused symmetry pack (ord=3): fill full fh from interior func in one pass. */
__global__ void kern_symbd_pack_ord3(const double * __restrict__ func,
double * __restrict__ fh,
double SoA0, double SoA1, double SoA2)
{
const int nx = d_gp.ex[0], ny = d_gp.ex[1];
const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny, fnz = d_gp.fh3_nz;
const int total = fnx * fny * fnz;
for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
tid < total;
tid += blockDim.x * gridDim.x)
{
int ii = tid % fnx;
int jj = (tid / fnx) % fny;
int kk = tid / (fnx * fny);
int iF = ii - 2; /* -2 .. nx */
int jF = jj - 2; /* -2 .. ny */
int kF = kk - 2; /* -2 .. nz */
int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */
int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */
int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */
double sign = 1.0;
if (iF <= 0) sign *= SoA0;
if (jF <= 0) sign *= SoA1;
if (kF <= 0) sign *= SoA2;
int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny;
fh[tid] = sign * func[src];
}
}
__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0)
{
const int ny = d_gp.ex[1], nz = d_gp.ex[2];
@@ -820,14 +890,9 @@ static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
const size_t nx = (size_t)g_buf.prev_nx;
const size_t ny = (size_t)g_buf.prev_ny;
const size_t nz = (size_t)g_buf.prev_nz;
const size_t w_ighost = 2ull * ny * nz;
const size_t w_jghost = 2ull * (nx + 2ull) * nz;
const size_t w_kghost = 2ull * (nx + 2ull) * (ny + 2ull);
const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull);
kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
kern_symbd_ighost_ord2<<<grid(w_ighost), BLK>>>(fh, SoA0);
kern_symbd_jghost_ord2<<<grid(w_jghost), BLK>>>(fh, SoA1);
kern_symbd_kghost_ord2<<<grid(w_kghost), BLK>>>(fh, SoA2);
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
}
@@ -841,14 +906,9 @@ static void gpu_fdderivs(double *d_f,
const size_t nx = (size_t)g_buf.prev_nx;
const size_t ny = (size_t)g_buf.prev_ny;
const size_t nz = (size_t)g_buf.prev_nz;
const size_t w_ighost = 2ull * ny * nz;
const size_t w_jghost = 2ull * (nx + 2ull) * nz;
const size_t w_kghost = 2ull * (nx + 2ull) * (ny + 2ull);
const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull);
kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
kern_symbd_ighost_ord2<<<grid(w_ighost), BLK>>>(fh, SoA0);
kern_symbd_jghost_ord2<<<grid(w_jghost), BLK>>>(fh, SoA1);
kern_symbd_kghost_ord2<<<grid(w_kghost), BLK>>>(fh, SoA2);
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
}
@@ -861,14 +921,9 @@ static void gpu_lopsided(double *d_f, double *d_f_rhs,
const size_t nx = (size_t)g_buf.prev_nx;
const size_t ny = (size_t)g_buf.prev_ny;
const size_t nz = (size_t)g_buf.prev_nz;
const size_t w_ighost = 3ull * ny * nz;
const size_t w_jghost = 3ull * (nx + 3ull) * nz;
const size_t w_kghost = 3ull * (nx + 3ull) * (ny + 3ull);
const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull);
kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
kern_symbd_ighost_ord3<<<grid(w_ighost), BLK>>>(fh, SoA0);
kern_symbd_jghost_ord3<<<grid(w_jghost), BLK>>>(fh, SoA1);
kern_symbd_kghost_ord3<<<grid(w_kghost), BLK>>>(fh, SoA2);
kern_symbd_pack_ord3<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
kern_lopsided<<<grid(all), BLK>>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz);
}
@@ -881,14 +936,9 @@ static void gpu_kodis(double *d_f, double *d_f_rhs,
const size_t nx = (size_t)g_buf.prev_nx;
const size_t ny = (size_t)g_buf.prev_ny;
const size_t nz = (size_t)g_buf.prev_nz;
const size_t w_ighost = 3ull * ny * nz;
const size_t w_jghost = 3ull * (nx + 3ull) * nz;
const size_t w_kghost = 3ull * (nx + 3ull) * (ny + 3ull);
const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull);
kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
kern_symbd_ighost_ord3<<<grid(w_ighost), BLK>>>(fh, SoA0);
kern_symbd_jghost_ord3<<<grid(w_jghost), BLK>>>(fh, SoA1);
kern_symbd_kghost_ord3<<<grid(w_kghost), BLK>>>(fh, SoA2);
kern_symbd_pack_ord3<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
}