From ad999e4c5a4a27b0d4774c3c0da412ee33d2b8bd Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 9 Apr 2026 14:28:36 +0800 Subject: [PATCH] Add guarded GPU prolong3 path scaffold --- AMSS_NCKU_source/Parallel.C | 28 ++- AMSS_NCKU_source/bssn_cuda_ops.cu | 284 ++++++++++++++++++++++++++++++ AMSS_NCKU_source/bssn_cuda_ops.h | 6 + 3 files changed, 312 insertions(+), 6 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 59db417..3ecb14b 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -1,11 +1,20 @@ -#include "Parallel.h" +#include "Parallel.h" #include "fmisc.h" #include "prolongrestrict.h" +#include "bssn_cuda_ops.h" #include "misc.h" #include "parameters.h" #include +#if defined(__GNUC__) || defined(__clang__) +extern int bssn_cuda_prolong3_pack(int wei, + const double *llbc, const double *uubc, const int *extc, const double *func, + const double *llbf, const double *uubf, const int *extf, double *funf, + const double *llbp, const double *uubp, + const double *SoA, int symmetry) __attribute__((weak)); +#endif + namespace { const char *g_parallel_transfer_context = "Parallel::transfer"; @@ -3885,11 +3894,18 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); break; - case 3: - f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], - dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, - dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); - } + case 3: + if (!bssn_cuda_prolong3_pack || + bssn_cuda_prolong3_pack(DIM, + src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], + dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, + dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry)) + { + f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn], + dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, + dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); + } + } if (dir == UNPACK) // from target data to corresponding grid f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn], dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index 7101227..68eb9ba 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -5,6 +5,7 @@ #include #include #include +#include namespace { @@ -191,6 +192,78 @@ __device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k return sign * f[index3(i, j, k, nx, ny)]; } +__device__ inline double load_prolong_cell_padded(const double *f, + int i, int j, int k, + int sx, int sy) +{ + return f[(i + 2) + (j + 2) * sx + (k + 2) * sx * sy]; +} + +__global__ void prolong3_cell_kernel(const double *funcc, double *funf, + int sxc, int syc, + int nxf, int nyf, int nzf, + int lbc0, int lbc1, int lbc2, + int lbf0, int lbf1, int lbf2, + int ibegin, int iend, + int jbegin, int jend, + int kbegin, int kend) +{ + const double C1 = 7.7e1 / 8.192e3; + const double C2 = -6.93e2 / 8.192e3; + const double C3 = 3.465e3 / 4.096e3; + const double C4 = 1.155e3 / 4.096e3; + const double C5 = -4.95e2 / 8.192e3; + const double C6 = 6.3e1 / 8.192e3; + const double w_even[6] = {C1, C2, C3, C4, C5, C6}; + const double w_odd[6] = {C6, C5, C4, C3, C2, C1}; + + const int n = nxf * nyf * nzf; + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) + { + const int plane = nxf * nyf; + const int k = idx / plane; + const int rem = idx - k * plane; + const int j = rem / nxf; + const int i = rem - j * nxf; + + if (i < ibegin || i > iend || j < jbegin || j > jend || k < kbegin || k > kend) + continue; + + const int ii = i + lbf0; + const int jj = j + lbf1; + const int kk = k + lbf2; + + const int ic0 = ii / 2 - lbc0 - 1; + const int jc0 = jj / 2 - lbc1 - 1; + const int kc0 = kk / 2 - lbc2 - 1; + + const double *wx = (ii & 1) ? w_odd : w_even; + const double *wy = (jj & 1) ? w_odd : w_even; + const double *wz = (kk & 1) ? w_odd : w_even; + + double value = 0.0; + for (int ox = 0; ox < 6; ++ox) + { + const int cx = ic0 + ox; + double yz = 0.0; + for (int oy = 0; oy < 6; ++oy) + { + const int cy = jc0 + oy; + double zsum = 0.0; + for (int oz = 0; oz < 6; ++oz) + { + const int cz = kc0 + oz; + zsum += wz[oz] * load_prolong_cell_padded(funcc, cx, cy, cz, sxc, syc); + } + yz += wy[oy] * zsum; + } + value += wx[ox] * yz; + } + + funf[idx] = value; + } +} + __global__ void rk4_kernel(int n, double dT, const double *f0, double *f1, @@ -697,3 +770,214 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny) } return ok ? 0 : 1; } + +int bssn_cuda_prolong3_pack(int wei, + const double *llbc, const double *uubc, const int *extc, const double *func, + const double *llbf, const double *uubf, const int *extf, double *funf, + const double *llbp, const double *uubp, + const double *SoA, int symmetry) +{ + if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA) + return 1; + + // The current input runs with equatorial symmetry enabled. + // The symmetry-aware prolong CUDA path is not numerically stable yet, + // so force a safe fallback to the original Fortran implementation. + if (symmetry != 0) + return 1; + + auto idint_like = [](double x) -> int { + return static_cast(std::trunc(x)); + }; + + double base[3]; + double CD[3]; + double FD[3]; + int lbc[3], ubc[3], lbf[3], ubf[3], lbp[3], ubp[3], lbpc[3], ubpc[3]; + for (int d = 0; d < 3; ++d) + { + CD[d] = (uubc[d] - llbc[d]) / static_cast(extc[d]); + FD[d] = (uubf[d] - llbf[d]) / static_cast(extf[d]); + if (std::fabs(CD[d] - 2.0 * FD[d]) > 1.0e-10) + return 1; + + if (llbc[d] <= llbf[d]) + { + base[d] = llbc[d]; + } + else + { + int j = idint_like((llbc[d] - llbf[d]) / FD[d] + 0.4); + base[d] = (j / 2 * 2 == j) ? llbf[d] : (llbf[d] - CD[d] / 2.0); + } + + lbf[d] = idint_like((llbf[d] - base[d]) / FD[d] + 0.4) + 1; + ubf[d] = idint_like((uubf[d] - base[d]) / FD[d] + 0.4); + lbc[d] = idint_like((llbc[d] - base[d]) / CD[d] + 0.4) + 1; + ubc[d] = idint_like((uubc[d] - base[d]) / CD[d] + 0.4); + lbp[d] = idint_like((llbp[d] - base[d]) / FD[d] + 0.4) + 1; + lbpc[d] = idint_like((llbp[d] - base[d]) / CD[d] + 0.4) + 1; + ubp[d] = idint_like((uubp[d] - base[d]) / FD[d] + 0.4); + ubpc[d] = idint_like((uubp[d] - base[d]) / CD[d] + 0.4); + (void)ubc[d]; + (void)ubf[d]; + } + + const int imino = lbp[0] - lbf[0] + 1; + const int imaxo = ubp[0] - lbf[0] + 1; + const int jmino = lbp[1] - lbf[1] + 1; + const int jmaxo = ubp[1] - lbf[1] + 1; + const int kmino = lbp[2] - lbf[2] + 1; + const int kmaxo = ubp[2] - lbf[2] + 1; + + const int imini = lbpc[0] - lbc[0] + 1; + const int imaxi = ubpc[0] - lbc[0] + 1; + const int jmini = lbpc[1] - lbc[1] + 1; + const int jmaxi = ubpc[1] - lbc[1] + 1; + const int kmini = lbpc[2] - lbc[2] + 1; + const int kmaxi = ubpc[2] - lbc[2] + 1; + + if (imino < 1 || jmino < 1 || kmino < 1 || + imini < 1 || jmini < 1 || kmini < 1 || + imaxo > extf[0] || jmaxo > extf[1] || kmaxo > extf[2] || + imaxi > extc[0] - 2 || jmaxi > extc[1] - 2 || kmaxi > extc[2] - 2) + { + return 1; + } + + auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int { + int ii = fine_idx + lbf_d - 1; + return ii / 2 - lbc_d + 1; + }; + const int ic_min = coarse_center_index(imino, lbf[0], lbc[0]); + const int jc_min = coarse_center_index(jmino, lbf[1], lbc[1]); + const int kc_min = coarse_center_index(kmino, lbf[2], lbc[2]); + + // Current CUDA prolong path only supports the same fast path as the + // optimized Fortran code: interior stencil access without symmetry_bd(). + // If the stencil touches the symmetry boundary, fall back to Fortran. + if (ic_min - 2 < 1 || jc_min - 2 < 1 || kc_min - 2 < 1) + return 1; + + struct ProlongCache + { + CachedBuffer coarse; + CachedBuffer fine; + }; + static thread_local ProlongCache cache; + + int nxc = extc[0], nyc = extc[1], nzc = extc[2]; + int nxf = extf[0], nyf = extf[1], nzf = extf[2]; + int sxc = nxc + 3; + int syc = nyc + 3; + int szc = nzc + 3; + int coarse_points = sxc * syc * szc; + int fine_points = nxf * nyf * nzf; + const size_t coarse_bytes = static_cast(coarse_points) * sizeof(double); + const size_t fine_bytes = static_cast(fine_points) * sizeof(double); + + std::vector funcc_host(static_cast(coarse_points), 0.0); + auto coarse_index = [sxc, syc](int fi, int fj, int fk) -> size_t { + return static_cast(fi + 2) + + static_cast(fj + 2) * static_cast(sxc) + + static_cast(fk + 2) * static_cast(sxc) * static_cast(syc); + }; + auto func_index = [nxc, nyc](int i, int j, int k) -> size_t { + return static_cast(i - 1) + + static_cast(j - 1) * static_cast(nxc) + + static_cast(k - 1) * static_cast(nxc) * static_cast(nyc); + }; + + for (int k = 1; k <= nzc; ++k) + { + for (int j = 1; j <= nyc; ++j) + { + for (int i = 1; i <= nxc; ++i) + { + funcc_host[coarse_index(i, j, k)] = func[func_index(i, j, k)]; + } + } + } + + for (int offset = 0; offset < 3; ++offset) + { + int target_i = -offset; + int source_i = offset + 2; + for (int k = 1; k <= nzc; ++k) + { + for (int j = 1; j <= nyc; ++j) + { + funcc_host[coarse_index(target_i, j, k)] = funcc_host[coarse_index(source_i, j, k)] * SoA[0]; + } + } + } + + for (int offset = 0; offset < 3; ++offset) + { + int target_j = -offset; + int source_j = offset + 2; + for (int k = 1; k <= nzc; ++k) + { + for (int i = -2; i <= nxc; ++i) + { + funcc_host[coarse_index(i, target_j, k)] = funcc_host[coarse_index(i, source_j, k)] * SoA[1]; + } + } + } + + for (int offset = 0; offset < 3; ++offset) + { + int target_k = -offset; + int source_k = offset + 2; + for (int j = -2; j <= nyc; ++j) + { + for (int i = -2; i <= nxc; ++i) + { + funcc_host[coarse_index(i, j, target_k)] = funcc_host[coarse_index(i, j, source_k)] * SoA[2]; + } + } + } + + double *d_func = nullptr; + if (!copy_to_device(cache.coarse, funcc_host.data(), coarse_bytes)) + return 1; + d_func = cache.coarse.ptr; + + if (!ensure_capacity(cache.fine, fine_bytes)) + { + return 1; + } + + dim3 block(256); + dim3 grid(div_up(fine_points, static_cast(block.x))); + double *d_funf = cache.fine.ptr; + int ibegin = imino - 1, iend = imaxo - 1; + int jbegin = jmino - 1, jend = jmaxo - 1; + int kbegin = kmino - 1, kend = kmaxo - 1; + void *args[] = {&d_func, &d_funf, + &sxc, &syc, + &nxf, &nyf, &nzf, + &lbc[0], &lbc[1], &lbc[2], + &lbf[0], &lbf[1], &lbf[2], + &ibegin, &iend, + &jbegin, &jend, + &kbegin, &kend}; + + if (!launch_kernel(grid, block, (const void *)prolong3_cell_kernel, args)) + return 1; + + cudaError_t sync_err = cudaDeviceSynchronize(); + if (sync_err != cudaSuccess) + { + report_cuda_error("cudaDeviceSynchronize prolong3", sync_err); + return 1; + } + + cudaError_t err = cudaMemcpy(funf, cache.fine.ptr, fine_bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + report_cuda_error("cudaMemcpy(D2H) prolong3", err); + return 1; + } + return 0; +} diff --git a/AMSS_NCKU_source/bssn_cuda_ops.h b/AMSS_NCKU_source/bssn_cuda_ops.h index e45bcaa..080ae95 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.h +++ b/AMSS_NCKU_source/bssn_cuda_ops.h @@ -23,4 +23,10 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, int bssn_cuda_lowerbound(int *ex, double *chi, double tinny); +int bssn_cuda_prolong3_pack(int wei, + const double *llbc, const double *uubc, const int *extc, const double *func, + const double *llbf, const double *uubf, const int *extf, double *funf, + const double *llbp, const double *uubp, + const double *SoA, int symmetry); + #endif