From abf2f640e4e46f54d6cff03a4838f39d4786cdf3 Mon Sep 17 00:00:00 2001 From: ianchb Date: Sat, 28 Feb 2026 15:35:14 +0800 Subject: [PATCH] add fused symmetry packing kernels for orders 2 and 3 in BSSN RHS --- AMSS_NCKU_source/bssn_rhs_cuda.cu | 146 ++++++++++++++++++++---------- 1 file changed, 98 insertions(+), 48 deletions(-) diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 94b073a..9d072c6 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -274,10 +274,10 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) { /* A. Symmetry boundary kernels (ord=2 and ord=3) */ /* ================================================================== */ -/* Step 1: Copy interior into ghost-padded array */ -__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func, - double * __restrict__ fh, - double SoA0, double SoA1, double SoA2) +/* Step 1: Copy interior into ghost-padded array */ +__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func, + double * __restrict__ fh, + double SoA0, double SoA1, double SoA2) { const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny; @@ -290,12 +290,47 @@ __global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func, int k0 = tid / (nx * ny); int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; fh[(iF+1) + (jF+1)*fnx + (kF+1)*fnx*fny] = func[tid]; - } -} - -/* Step 2: Fill i-ghosts (x-direction symmetry) */ -__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0) -{ + } +} + +/* Fused symmetry pack (ord=2): fill full fh from interior func in one pass. */ +__global__ void kern_symbd_pack_ord2(const double * __restrict__ func, + double * __restrict__ fh, + double SoA0, double SoA1, double SoA2) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1]; + const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny, fnz = d_gp.fh2_nz; + const int total = fnx * fny * fnz; + + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < total; + tid += blockDim.x * gridDim.x) + { + int ii = tid % fnx; + int jj = (tid / fnx) % fny; + int kk = tid / (fnx * fny); + + int iF = ii - 1; /* -1 .. nx */ + int jF = jj - 1; /* -1 .. ny */ + int kF = kk - 1; /* -1 .. nz */ + + int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */ + int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */ + int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */ + + double sign = 1.0; + if (iF <= 0) sign *= SoA0; + if (jF <= 0) sign *= SoA1; + if (kF <= 0) sign *= SoA2; + + int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny; + fh[tid] = sign * func[src]; + } +} + +/* Step 2: Fill i-ghosts (x-direction symmetry) */ +__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0) +{ const int ny = d_gp.ex[1], nz = d_gp.ex[2]; const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny; /* ord=2: fill iF=0 and iF=-1, i.e. ghost layers ii=0 from ii=2, ii=1 from ii=1 */ @@ -369,9 +404,9 @@ __global__ void kern_symbd_kghost_ord2(double * __restrict__ fh, double SoA2) /* ---- ord=3 variants (for lopsided / kodis) ---- */ -__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func, - double * __restrict__ fh) -{ +__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func, + double * __restrict__ fh) +{ const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny; for (int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -383,11 +418,46 @@ __global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func, int k0 = tid / (nx * ny); int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; fh[(iF+2) + (jF+2)*fnx + (kF+2)*fnx*fny] = func[tid]; - } -} - -__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0) -{ + } +} + +/* Fused symmetry pack (ord=3): fill full fh from interior func in one pass. */ +__global__ void kern_symbd_pack_ord3(const double * __restrict__ func, + double * __restrict__ fh, + double SoA0, double SoA1, double SoA2) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1]; + const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny, fnz = d_gp.fh3_nz; + const int total = fnx * fny * fnz; + + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < total; + tid += blockDim.x * gridDim.x) + { + int ii = tid % fnx; + int jj = (tid / fnx) % fny; + int kk = tid / (fnx * fny); + + int iF = ii - 2; /* -2 .. nx */ + int jF = jj - 2; /* -2 .. ny */ + int kF = kk - 2; /* -2 .. nz */ + + int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */ + int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */ + int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */ + + double sign = 1.0; + if (iF <= 0) sign *= SoA0; + if (jF <= 0) sign *= SoA1; + if (kF <= 0) sign *= SoA2; + + int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny; + fh[tid] = sign * func[src]; + } +} + +__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0) +{ const int ny = d_gp.ex[1], nz = d_gp.ex[2]; const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny; int total = ny * nz; @@ -813,21 +883,16 @@ static inline int grid(size_t n) { } /* symmetry_bd on GPU for ord=2, then launch fderivs kernel */ -static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz, - double SoA0, double SoA1, double SoA2, int all) +static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz, + double SoA0, double SoA1, double SoA2, int all) { double *fh = g_buf.d_fh2; const size_t nx = (size_t)g_buf.prev_nx; const size_t ny = (size_t)g_buf.prev_ny; const size_t nz = (size_t)g_buf.prev_nz; - const size_t w_ighost = 2ull * ny * nz; - const size_t w_jghost = 2ull * (nx + 2ull) * nz; - const size_t w_kghost = 2ull * (nx + 2ull) * (ny + 2ull); + const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull); - kern_symbd_copy_interior_ord2<<>>(d_f, fh, SoA0, SoA1, SoA2); - kern_symbd_ighost_ord2<<>>(fh, SoA0); - kern_symbd_jghost_ord2<<>>(fh, SoA1); - kern_symbd_kghost_ord2<<>>(fh, SoA2); + kern_symbd_pack_ord2<<>>(d_f, fh, SoA0, SoA1, SoA2); kern_fderivs<<>>(fh, d_fx, d_fy, d_fz); } @@ -841,14 +906,9 @@ static void gpu_fdderivs(double *d_f, const size_t nx = (size_t)g_buf.prev_nx; const size_t ny = (size_t)g_buf.prev_ny; const size_t nz = (size_t)g_buf.prev_nz; - const size_t w_ighost = 2ull * ny * nz; - const size_t w_jghost = 2ull * (nx + 2ull) * nz; - const size_t w_kghost = 2ull * (nx + 2ull) * (ny + 2ull); + const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull); - kern_symbd_copy_interior_ord2<<>>(d_f, fh, SoA0, SoA1, SoA2); - kern_symbd_ighost_ord2<<>>(fh, SoA0); - kern_symbd_jghost_ord2<<>>(fh, SoA1); - kern_symbd_kghost_ord2<<>>(fh, SoA2); + kern_symbd_pack_ord2<<>>(d_f, fh, SoA0, SoA1, SoA2); kern_fdderivs<<>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz); } @@ -861,14 +921,9 @@ static void gpu_lopsided(double *d_f, double *d_f_rhs, const size_t nx = (size_t)g_buf.prev_nx; const size_t ny = (size_t)g_buf.prev_ny; const size_t nz = (size_t)g_buf.prev_nz; - const size_t w_ighost = 3ull * ny * nz; - const size_t w_jghost = 3ull * (nx + 3ull) * nz; - const size_t w_kghost = 3ull * (nx + 3ull) * (ny + 3ull); + const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull); - kern_symbd_copy_interior_ord3<<>>(d_f, fh); - kern_symbd_ighost_ord3<<>>(fh, SoA0); - kern_symbd_jghost_ord3<<>>(fh, SoA1); - kern_symbd_kghost_ord3<<>>(fh, SoA2); + kern_symbd_pack_ord3<<>>(d_f, fh, SoA0, SoA1, SoA2); kern_lopsided<<>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz); } @@ -881,14 +936,9 @@ static void gpu_kodis(double *d_f, double *d_f_rhs, const size_t nx = (size_t)g_buf.prev_nx; const size_t ny = (size_t)g_buf.prev_ny; const size_t nz = (size_t)g_buf.prev_nz; - const size_t w_ighost = 3ull * ny * nz; - const size_t w_jghost = 3ull * (nx + 3ull) * nz; - const size_t w_kghost = 3ull * (nx + 3ull) * (ny + 3ull); + const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull); - kern_symbd_copy_interior_ord3<<>>(d_f, fh); - kern_symbd_ighost_ord3<<>>(fh, SoA0); - kern_symbd_jghost_ord3<<>>(fh, SoA1); - kern_symbd_kghost_ord3<<>>(fh, SoA2); + kern_symbd_pack_ord3<<>>(d_f, fh, SoA0, SoA1, SoA2); kern_kodis<<>>(fh, d_f_rhs, eps_val); }