Add guarded GPU prolong3 path scaffold
This commit is contained in:
@@ -1,11 +1,20 @@
|
||||
|
||||
#include "Parallel.h"
|
||||
#include "Parallel.h"
|
||||
#include "fmisc.h"
|
||||
#include "prolongrestrict.h"
|
||||
#include "bssn_cuda_ops.h"
|
||||
#include "misc.h"
|
||||
#include "parameters.h"
|
||||
#include <cstring>
|
||||
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
extern int bssn_cuda_prolong3_pack(int wei,
|
||||
const double *llbc, const double *uubc, const int *extc, const double *func,
|
||||
const double *llbf, const double *uubf, const int *extf, double *funf,
|
||||
const double *llbp, const double *uubp,
|
||||
const double *SoA, int symmetry) __attribute__((weak));
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
const char *g_parallel_transfer_context = "Parallel::transfer";
|
||||
@@ -3885,11 +3894,18 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
||||
src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
|
||||
dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
|
||||
break;
|
||||
case 3:
|
||||
f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
|
||||
dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
|
||||
dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
|
||||
}
|
||||
case 3:
|
||||
if (!bssn_cuda_prolong3_pack ||
|
||||
bssn_cuda_prolong3_pack(DIM,
|
||||
src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
|
||||
dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
|
||||
dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry))
|
||||
{
|
||||
f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
|
||||
dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
|
||||
dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
|
||||
}
|
||||
}
|
||||
if (dir == UNPACK) // from target data to corresponding grid
|
||||
f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
|
||||
dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#include <cstdio>
|
||||
#include <cuda_runtime.h>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -191,6 +192,78 @@ __device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k
|
||||
return sign * f[index3(i, j, k, nx, ny)];
|
||||
}
|
||||
|
||||
__device__ inline double load_prolong_cell_padded(const double *f,
|
||||
int i, int j, int k,
|
||||
int sx, int sy)
|
||||
{
|
||||
return f[(i + 2) + (j + 2) * sx + (k + 2) * sx * sy];
|
||||
}
|
||||
|
||||
__global__ void prolong3_cell_kernel(const double *funcc, double *funf,
|
||||
int sxc, int syc,
|
||||
int nxf, int nyf, int nzf,
|
||||
int lbc0, int lbc1, int lbc2,
|
||||
int lbf0, int lbf1, int lbf2,
|
||||
int ibegin, int iend,
|
||||
int jbegin, int jend,
|
||||
int kbegin, int kend)
|
||||
{
|
||||
const double C1 = 7.7e1 / 8.192e3;
|
||||
const double C2 = -6.93e2 / 8.192e3;
|
||||
const double C3 = 3.465e3 / 4.096e3;
|
||||
const double C4 = 1.155e3 / 4.096e3;
|
||||
const double C5 = -4.95e2 / 8.192e3;
|
||||
const double C6 = 6.3e1 / 8.192e3;
|
||||
const double w_even[6] = {C1, C2, C3, C4, C5, C6};
|
||||
const double w_odd[6] = {C6, C5, C4, C3, C2, C1};
|
||||
|
||||
const int n = nxf * nyf * nzf;
|
||||
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
||||
{
|
||||
const int plane = nxf * nyf;
|
||||
const int k = idx / plane;
|
||||
const int rem = idx - k * plane;
|
||||
const int j = rem / nxf;
|
||||
const int i = rem - j * nxf;
|
||||
|
||||
if (i < ibegin || i > iend || j < jbegin || j > jend || k < kbegin || k > kend)
|
||||
continue;
|
||||
|
||||
const int ii = i + lbf0;
|
||||
const int jj = j + lbf1;
|
||||
const int kk = k + lbf2;
|
||||
|
||||
const int ic0 = ii / 2 - lbc0 - 1;
|
||||
const int jc0 = jj / 2 - lbc1 - 1;
|
||||
const int kc0 = kk / 2 - lbc2 - 1;
|
||||
|
||||
const double *wx = (ii & 1) ? w_odd : w_even;
|
||||
const double *wy = (jj & 1) ? w_odd : w_even;
|
||||
const double *wz = (kk & 1) ? w_odd : w_even;
|
||||
|
||||
double value = 0.0;
|
||||
for (int ox = 0; ox < 6; ++ox)
|
||||
{
|
||||
const int cx = ic0 + ox;
|
||||
double yz = 0.0;
|
||||
for (int oy = 0; oy < 6; ++oy)
|
||||
{
|
||||
const int cy = jc0 + oy;
|
||||
double zsum = 0.0;
|
||||
for (int oz = 0; oz < 6; ++oz)
|
||||
{
|
||||
const int cz = kc0 + oz;
|
||||
zsum += wz[oz] * load_prolong_cell_padded(funcc, cx, cy, cz, sxc, syc);
|
||||
}
|
||||
yz += wy[oy] * zsum;
|
||||
}
|
||||
value += wx[ox] * yz;
|
||||
}
|
||||
|
||||
funf[idx] = value;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void rk4_kernel(int n, double dT,
|
||||
const double *f0,
|
||||
double *f1,
|
||||
@@ -697,3 +770,214 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
|
||||
}
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
int bssn_cuda_prolong3_pack(int wei,
|
||||
const double *llbc, const double *uubc, const int *extc, const double *func,
|
||||
const double *llbf, const double *uubf, const int *extf, double *funf,
|
||||
const double *llbp, const double *uubp,
|
||||
const double *SoA, int symmetry)
|
||||
{
|
||||
if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA)
|
||||
return 1;
|
||||
|
||||
// The current input runs with equatorial symmetry enabled.
|
||||
// The symmetry-aware prolong CUDA path is not numerically stable yet,
|
||||
// so force a safe fallback to the original Fortran implementation.
|
||||
if (symmetry != 0)
|
||||
return 1;
|
||||
|
||||
auto idint_like = [](double x) -> int {
|
||||
return static_cast<int>(std::trunc(x));
|
||||
};
|
||||
|
||||
double base[3];
|
||||
double CD[3];
|
||||
double FD[3];
|
||||
int lbc[3], ubc[3], lbf[3], ubf[3], lbp[3], ubp[3], lbpc[3], ubpc[3];
|
||||
for (int d = 0; d < 3; ++d)
|
||||
{
|
||||
CD[d] = (uubc[d] - llbc[d]) / static_cast<double>(extc[d]);
|
||||
FD[d] = (uubf[d] - llbf[d]) / static_cast<double>(extf[d]);
|
||||
if (std::fabs(CD[d] - 2.0 * FD[d]) > 1.0e-10)
|
||||
return 1;
|
||||
|
||||
if (llbc[d] <= llbf[d])
|
||||
{
|
||||
base[d] = llbc[d];
|
||||
}
|
||||
else
|
||||
{
|
||||
int j = idint_like((llbc[d] - llbf[d]) / FD[d] + 0.4);
|
||||
base[d] = (j / 2 * 2 == j) ? llbf[d] : (llbf[d] - CD[d] / 2.0);
|
||||
}
|
||||
|
||||
lbf[d] = idint_like((llbf[d] - base[d]) / FD[d] + 0.4) + 1;
|
||||
ubf[d] = idint_like((uubf[d] - base[d]) / FD[d] + 0.4);
|
||||
lbc[d] = idint_like((llbc[d] - base[d]) / CD[d] + 0.4) + 1;
|
||||
ubc[d] = idint_like((uubc[d] - base[d]) / CD[d] + 0.4);
|
||||
lbp[d] = idint_like((llbp[d] - base[d]) / FD[d] + 0.4) + 1;
|
||||
lbpc[d] = idint_like((llbp[d] - base[d]) / CD[d] + 0.4) + 1;
|
||||
ubp[d] = idint_like((uubp[d] - base[d]) / FD[d] + 0.4);
|
||||
ubpc[d] = idint_like((uubp[d] - base[d]) / CD[d] + 0.4);
|
||||
(void)ubc[d];
|
||||
(void)ubf[d];
|
||||
}
|
||||
|
||||
const int imino = lbp[0] - lbf[0] + 1;
|
||||
const int imaxo = ubp[0] - lbf[0] + 1;
|
||||
const int jmino = lbp[1] - lbf[1] + 1;
|
||||
const int jmaxo = ubp[1] - lbf[1] + 1;
|
||||
const int kmino = lbp[2] - lbf[2] + 1;
|
||||
const int kmaxo = ubp[2] - lbf[2] + 1;
|
||||
|
||||
const int imini = lbpc[0] - lbc[0] + 1;
|
||||
const int imaxi = ubpc[0] - lbc[0] + 1;
|
||||
const int jmini = lbpc[1] - lbc[1] + 1;
|
||||
const int jmaxi = ubpc[1] - lbc[1] + 1;
|
||||
const int kmini = lbpc[2] - lbc[2] + 1;
|
||||
const int kmaxi = ubpc[2] - lbc[2] + 1;
|
||||
|
||||
if (imino < 1 || jmino < 1 || kmino < 1 ||
|
||||
imini < 1 || jmini < 1 || kmini < 1 ||
|
||||
imaxo > extf[0] || jmaxo > extf[1] || kmaxo > extf[2] ||
|
||||
imaxi > extc[0] - 2 || jmaxi > extc[1] - 2 || kmaxi > extc[2] - 2)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int {
|
||||
int ii = fine_idx + lbf_d - 1;
|
||||
return ii / 2 - lbc_d + 1;
|
||||
};
|
||||
const int ic_min = coarse_center_index(imino, lbf[0], lbc[0]);
|
||||
const int jc_min = coarse_center_index(jmino, lbf[1], lbc[1]);
|
||||
const int kc_min = coarse_center_index(kmino, lbf[2], lbc[2]);
|
||||
|
||||
// Current CUDA prolong path only supports the same fast path as the
|
||||
// optimized Fortran code: interior stencil access without symmetry_bd().
|
||||
// If the stencil touches the symmetry boundary, fall back to Fortran.
|
||||
if (ic_min - 2 < 1 || jc_min - 2 < 1 || kc_min - 2 < 1)
|
||||
return 1;
|
||||
|
||||
struct ProlongCache
|
||||
{
|
||||
CachedBuffer coarse;
|
||||
CachedBuffer fine;
|
||||
};
|
||||
static thread_local ProlongCache cache;
|
||||
|
||||
int nxc = extc[0], nyc = extc[1], nzc = extc[2];
|
||||
int nxf = extf[0], nyf = extf[1], nzf = extf[2];
|
||||
int sxc = nxc + 3;
|
||||
int syc = nyc + 3;
|
||||
int szc = nzc + 3;
|
||||
int coarse_points = sxc * syc * szc;
|
||||
int fine_points = nxf * nyf * nzf;
|
||||
const size_t coarse_bytes = static_cast<size_t>(coarse_points) * sizeof(double);
|
||||
const size_t fine_bytes = static_cast<size_t>(fine_points) * sizeof(double);
|
||||
|
||||
std::vector<double> funcc_host(static_cast<size_t>(coarse_points), 0.0);
|
||||
auto coarse_index = [sxc, syc](int fi, int fj, int fk) -> size_t {
|
||||
return static_cast<size_t>(fi + 2) +
|
||||
static_cast<size_t>(fj + 2) * static_cast<size_t>(sxc) +
|
||||
static_cast<size_t>(fk + 2) * static_cast<size_t>(sxc) * static_cast<size_t>(syc);
|
||||
};
|
||||
auto func_index = [nxc, nyc](int i, int j, int k) -> size_t {
|
||||
return static_cast<size_t>(i - 1) +
|
||||
static_cast<size_t>(j - 1) * static_cast<size_t>(nxc) +
|
||||
static_cast<size_t>(k - 1) * static_cast<size_t>(nxc) * static_cast<size_t>(nyc);
|
||||
};
|
||||
|
||||
for (int k = 1; k <= nzc; ++k)
|
||||
{
|
||||
for (int j = 1; j <= nyc; ++j)
|
||||
{
|
||||
for (int i = 1; i <= nxc; ++i)
|
||||
{
|
||||
funcc_host[coarse_index(i, j, k)] = func[func_index(i, j, k)];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int offset = 0; offset < 3; ++offset)
|
||||
{
|
||||
int target_i = -offset;
|
||||
int source_i = offset + 2;
|
||||
for (int k = 1; k <= nzc; ++k)
|
||||
{
|
||||
for (int j = 1; j <= nyc; ++j)
|
||||
{
|
||||
funcc_host[coarse_index(target_i, j, k)] = funcc_host[coarse_index(source_i, j, k)] * SoA[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int offset = 0; offset < 3; ++offset)
|
||||
{
|
||||
int target_j = -offset;
|
||||
int source_j = offset + 2;
|
||||
for (int k = 1; k <= nzc; ++k)
|
||||
{
|
||||
for (int i = -2; i <= nxc; ++i)
|
||||
{
|
||||
funcc_host[coarse_index(i, target_j, k)] = funcc_host[coarse_index(i, source_j, k)] * SoA[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int offset = 0; offset < 3; ++offset)
|
||||
{
|
||||
int target_k = -offset;
|
||||
int source_k = offset + 2;
|
||||
for (int j = -2; j <= nyc; ++j)
|
||||
{
|
||||
for (int i = -2; i <= nxc; ++i)
|
||||
{
|
||||
funcc_host[coarse_index(i, j, target_k)] = funcc_host[coarse_index(i, j, source_k)] * SoA[2];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
double *d_func = nullptr;
|
||||
if (!copy_to_device(cache.coarse, funcc_host.data(), coarse_bytes))
|
||||
return 1;
|
||||
d_func = cache.coarse.ptr;
|
||||
|
||||
if (!ensure_capacity(cache.fine, fine_bytes))
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
dim3 block(256);
|
||||
dim3 grid(div_up(fine_points, static_cast<int>(block.x)));
|
||||
double *d_funf = cache.fine.ptr;
|
||||
int ibegin = imino - 1, iend = imaxo - 1;
|
||||
int jbegin = jmino - 1, jend = jmaxo - 1;
|
||||
int kbegin = kmino - 1, kend = kmaxo - 1;
|
||||
void *args[] = {&d_func, &d_funf,
|
||||
&sxc, &syc,
|
||||
&nxf, &nyf, &nzf,
|
||||
&lbc[0], &lbc[1], &lbc[2],
|
||||
&lbf[0], &lbf[1], &lbf[2],
|
||||
&ibegin, &iend,
|
||||
&jbegin, &jend,
|
||||
&kbegin, &kend};
|
||||
|
||||
if (!launch_kernel(grid, block, (const void *)prolong3_cell_kernel, args))
|
||||
return 1;
|
||||
|
||||
cudaError_t sync_err = cudaDeviceSynchronize();
|
||||
if (sync_err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaDeviceSynchronize prolong3", sync_err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
cudaError_t err = cudaMemcpy(funf, cache.fine.ptr, fine_bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(D2H) prolong3", err);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -23,4 +23,10 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
|
||||
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny);
|
||||
|
||||
int bssn_cuda_prolong3_pack(int wei,
|
||||
const double *llbc, const double *uubc, const int *extc, const double *func,
|
||||
const double *llbf, const double *uubf, const int *extf, double *funf,
|
||||
const double *llbp, const double *uubp,
|
||||
const double *SoA, int symmetry);
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user