Stabilize GPU AMR prolong/restrict paths
This commit is contained in:
@@ -14,6 +14,11 @@ extern int bssn_cuda_prolong3_pack(int wei,
|
||||
const double *llbf, const double *uubf, const int *extf, double *funf,
|
||||
const double *llbp, const double *uubp,
|
||||
const double *SoA, int symmetry) __attribute__((weak));
|
||||
extern int bssn_cuda_restrict3_pack(int wei,
|
||||
const double *llbc, const double *uubc, const int *extc, double *func,
|
||||
const double *llbf, const double *uubf, const int *extf, const double *funf,
|
||||
const double *llbr, const double *uubr,
|
||||
const double *SoA, int symmetry) __attribute__((weak));
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
@@ -4048,11 +4053,18 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
||||
src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
|
||||
dst->data->llb, dst->data->uub);
|
||||
break;
|
||||
case 2:
|
||||
f_restrict3(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
|
||||
src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
|
||||
dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
|
||||
break;
|
||||
case 2:
|
||||
if (!bssn_cuda_restrict3_pack ||
|
||||
bssn_cuda_restrict3_pack(DIM,
|
||||
dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
|
||||
src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
|
||||
dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry))
|
||||
{
|
||||
f_restrict3(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
|
||||
src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
|
||||
dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
if (!bssn_cuda_prolong3_pack ||
|
||||
bssn_cuda_prolong3_pack(DIM,
|
||||
|
||||
@@ -423,14 +423,23 @@ __device__ inline double load_prolong_cell_padded(const double *f,
|
||||
return f[(i + 2) + (j + 2) * sx + (k + 2) * sx * sy];
|
||||
}
|
||||
|
||||
__device__ inline double load_restrict_cell_padded(const double *f,
|
||||
int i, int j, int k,
|
||||
int sx, int sy)
|
||||
{
|
||||
return f[(i + 1) + (j + 1) * sx + (k + 1) * sx * sy];
|
||||
}
|
||||
|
||||
__global__ void prolong3_cell_kernel(const double *funcc, double *funf,
|
||||
int sxc, int syc,
|
||||
int nxc, int nyc, int nzc,
|
||||
int nxf, int nyf, int nzf,
|
||||
int lbc0, int lbc1, int lbc2,
|
||||
int lbf0, int lbf1, int lbf2,
|
||||
int ibegin, int iend,
|
||||
int jbegin, int jend,
|
||||
int kbegin, int kend)
|
||||
int kbegin, int kend,
|
||||
int *error_flag)
|
||||
{
|
||||
const double C1 = 7.7e1 / 8.192e3;
|
||||
const double C2 = -6.93e2 / 8.192e3;
|
||||
@@ -477,6 +486,15 @@ __global__ void prolong3_cell_kernel(const double *funcc, double *funf,
|
||||
for (int oz = 0; oz < 6; ++oz)
|
||||
{
|
||||
const int cz = kc0 + oz;
|
||||
if (cx < -2 || cx > nxc || cy < -2 || cy > nyc || cz < -2 || cz > nzc)
|
||||
{
|
||||
if (error_flag)
|
||||
*error_flag = 1;
|
||||
zsum = 0.0;
|
||||
yz = 0.0;
|
||||
value = 0.0;
|
||||
goto prolong3_cell_kernel_next_point;
|
||||
}
|
||||
zsum += wz[oz] * load_prolong_cell_padded(funcc, cx, cy, cz, sxc, syc);
|
||||
}
|
||||
yz += wy[oy] * zsum;
|
||||
@@ -485,6 +503,74 @@ __global__ void prolong3_cell_kernel(const double *funcc, double *funf,
|
||||
}
|
||||
|
||||
funf[idx] = value;
|
||||
prolong3_cell_kernel_next_point:
|
||||
;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void restrict3_cell_kernel(const double *funf, double *func,
|
||||
int sxf, int syf,
|
||||
int nxf, int nyf, int nzf,
|
||||
int nxc, int nyc, int nzc,
|
||||
int lbc0, int lbc1, int lbc2,
|
||||
int lbf0, int lbf1, int lbf2,
|
||||
int ibegin, int iend,
|
||||
int jbegin, int jend,
|
||||
int kbegin, int kend,
|
||||
int *error_flag)
|
||||
{
|
||||
const double C1 = 3.0 / 256.0;
|
||||
const double C2 = -25.0 / 256.0;
|
||||
const double C3 = 75.0 / 128.0;
|
||||
const double w[6] = {C1, C2, C3, C3, C2, C1};
|
||||
|
||||
const int n = nxc * nyc * nzc;
|
||||
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
||||
{
|
||||
const int plane = nxc * nyc;
|
||||
const int k = idx / plane;
|
||||
const int rem = idx - k * plane;
|
||||
const int j = rem / nxc;
|
||||
const int i = rem - j * nxc;
|
||||
|
||||
if (i < ibegin || i > iend || j < jbegin || j > jend || k < kbegin || k > kend)
|
||||
continue;
|
||||
|
||||
const int fi = 2 * (i + lbc0) - lbf0;
|
||||
const int fj = 2 * (j + lbc1) - lbf1;
|
||||
const int fk = 2 * (k + lbc2) - lbf2;
|
||||
|
||||
double value = 0.0;
|
||||
for (int ox = 0; ox < 6; ++ox)
|
||||
{
|
||||
const int fx = fi - 2 + ox;
|
||||
double yz = 0.0;
|
||||
for (int oy = 0; oy < 6; ++oy)
|
||||
{
|
||||
const int fy = fj - 2 + oy;
|
||||
double zsum = 0.0;
|
||||
for (int oz = 0; oz < 6; ++oz)
|
||||
{
|
||||
const int fz = fk - 2 + oz;
|
||||
if (fx < -1 || fx > nxf || fy < -1 || fy > nyf || fz < -1 || fz > nzf)
|
||||
{
|
||||
if (error_flag)
|
||||
*error_flag = 1;
|
||||
zsum = 0.0;
|
||||
yz = 0.0;
|
||||
value = 0.0;
|
||||
goto restrict3_cell_kernel_next_point;
|
||||
}
|
||||
zsum += w[oz] * load_restrict_cell_padded(funf, fx, fy, fz, sxf, syf);
|
||||
}
|
||||
yz += w[oy] * zsum;
|
||||
}
|
||||
value += w[ox] * yz;
|
||||
}
|
||||
|
||||
func[idx] = value;
|
||||
restrict3_cell_kernel_next_point:
|
||||
;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1287,13 +1373,14 @@ int bssn_cuda_prolong3_pack(int wei,
|
||||
const double *llbp, const double *uubp,
|
||||
const double *SoA, int symmetry)
|
||||
{
|
||||
const int no_symm = 0;
|
||||
const int eq_symm = 1;
|
||||
|
||||
if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA)
|
||||
return 1;
|
||||
|
||||
// The symmetry-aware prolong CUDA path is still not equivalent to the
|
||||
// active Cell/ghost_width=3 Fortran implementation, so keep the safe
|
||||
// fallback for all symmetry-enabled cases.
|
||||
if (symmetry != 0)
|
||||
// Keep the CPU fallback for octant and any unrecognized symmetry modes.
|
||||
if (symmetry != no_symm && symmetry != eq_symm)
|
||||
return 1;
|
||||
|
||||
auto idint_like = [](double x) -> int {
|
||||
@@ -1355,28 +1442,55 @@ int bssn_cuda_prolong3_pack(int wei,
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Keep thin prolongation slabs on the established CPU path for now.
|
||||
// The GPU kernel is only enabled for chunkier regions where the current
|
||||
// device pack path is known to be safe.
|
||||
if (extf[0] <= 2 * wei || extf[1] <= 2 * wei || extf[2] <= 2 * wei)
|
||||
return 1;
|
||||
|
||||
auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int {
|
||||
int ii = fine_idx + lbf_d - 1;
|
||||
return ii / 2 - lbc_d + 1;
|
||||
};
|
||||
const int ic_min = coarse_center_index(imino, lbf[0], lbc[0]);
|
||||
const int ic_max = coarse_center_index(imaxo, lbf[0], lbc[0]);
|
||||
const int jc_min = coarse_center_index(jmino, lbf[1], lbc[1]);
|
||||
const int jc_max = coarse_center_index(jmaxo, lbf[1], lbc[1]);
|
||||
const int kc_min = coarse_center_index(kmino, lbf[2], lbc[2]);
|
||||
const int kc_max = coarse_center_index(kmaxo, lbf[2], lbc[2]);
|
||||
const bool need_equatorial_z_reflect = (symmetry == eq_symm && kc_min - 2 < 1);
|
||||
|
||||
// Current CUDA prolong path only supports the same fast path as the
|
||||
// optimized Fortran code: interior stencil access without symmetry_bd().
|
||||
if (ic_min - 2 < 1 || jc_min - 2 < 1 || kc_min - 2 < 1)
|
||||
// Match the active Fortran implementation's later stencil safety check.
|
||||
if (ic_max + 3 > extc[0] || jc_max + 3 > extc[1] || kc_max + 3 > extc[2] ||
|
||||
ic_min < 1 || jc_min < 1 || kc_min < 0)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Match the active Cell-centered Fortran fast path: only the z<0
|
||||
// reflection needed by equatorial symmetry is handled here.
|
||||
if (ic_min - 2 < 1 || jc_min - 2 < 1 || (kc_min - 2 < 1 && !need_equatorial_z_reflect))
|
||||
return 1;
|
||||
|
||||
struct ProlongCache
|
||||
{
|
||||
CachedBuffer coarse;
|
||||
CachedBuffer fine;
|
||||
CachedIntBuffer error_flag;
|
||||
};
|
||||
static thread_local ProlongCache cache;
|
||||
|
||||
int nxc = extc[0], nyc = extc[1], nzc = extc[2];
|
||||
int src_nxc = extc[0], src_nyc = extc[1], src_nzc = extc[2];
|
||||
int nxf = extf[0], nyf = extf[1], nzf = extf[2];
|
||||
const int stage_ix_lo = ic_min - 2;
|
||||
const int stage_ix_hi = ic_max + 3;
|
||||
const int stage_iy_lo = jc_min - 2;
|
||||
const int stage_iy_hi = jc_max + 3;
|
||||
const int stage_iz_lo = need_equatorial_z_reflect ? 1 : (kc_min - 2);
|
||||
const int stage_iz_hi = kc_max + 3;
|
||||
int nxc = stage_ix_hi - stage_ix_lo + 1;
|
||||
int nyc = stage_iy_hi - stage_iy_lo + 1;
|
||||
int nzc = stage_iz_hi - stage_iz_lo + 1;
|
||||
int sxc = nxc + 3;
|
||||
int syc = nyc + 3;
|
||||
int szc = nzc + 3;
|
||||
@@ -1391,58 +1505,39 @@ int bssn_cuda_prolong3_pack(int wei,
|
||||
static_cast<size_t>(fj + 2) * static_cast<size_t>(sxc) +
|
||||
static_cast<size_t>(fk + 2) * static_cast<size_t>(sxc) * static_cast<size_t>(syc);
|
||||
};
|
||||
auto func_index = [nxc, nyc](int i, int j, int k) -> size_t {
|
||||
auto func_index = [src_nxc, src_nyc](int i, int j, int k) -> size_t {
|
||||
return static_cast<size_t>(i - 1) +
|
||||
static_cast<size_t>(j - 1) * static_cast<size_t>(nxc) +
|
||||
static_cast<size_t>(k - 1) * static_cast<size_t>(nxc) * static_cast<size_t>(nyc);
|
||||
static_cast<size_t>(j - 1) * static_cast<size_t>(src_nxc) +
|
||||
static_cast<size_t>(k - 1) * static_cast<size_t>(src_nxc) * static_cast<size_t>(src_nyc);
|
||||
};
|
||||
|
||||
for (int k = 1; k <= nzc; ++k)
|
||||
for (int k = stage_iz_lo; k <= stage_iz_hi; ++k)
|
||||
{
|
||||
for (int j = 1; j <= nyc; ++j)
|
||||
const int lk = k - stage_iz_lo + 1;
|
||||
for (int j = stage_iy_lo; j <= stage_iy_hi; ++j)
|
||||
{
|
||||
for (int i = 1; i <= nxc; ++i)
|
||||
const int lj = j - stage_iy_lo + 1;
|
||||
for (int i = stage_ix_lo; i <= stage_ix_hi; ++i)
|
||||
{
|
||||
funcc_host[coarse_index(i, j, k)] = func[func_index(i, j, k)];
|
||||
const int li = i - stage_ix_lo + 1;
|
||||
funcc_host[coarse_index(li, lj, lk)] = func[func_index(i, j, k)];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int offset = 0; offset < 3; ++offset)
|
||||
if (need_equatorial_z_reflect)
|
||||
{
|
||||
int target_i = -offset;
|
||||
int source_i = offset + 2;
|
||||
for (int k = 1; k <= nzc; ++k)
|
||||
for (int offset = 0; offset < 3; ++offset)
|
||||
{
|
||||
const int target_k = -offset;
|
||||
const int source_k = offset + 1;
|
||||
for (int j = 1; j <= nyc; ++j)
|
||||
{
|
||||
funcc_host[coarse_index(target_i, j, k)] = funcc_host[coarse_index(source_i, j, k)] * SoA[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int offset = 0; offset < 3; ++offset)
|
||||
{
|
||||
int target_j = -offset;
|
||||
int source_j = offset + 2;
|
||||
for (int k = 1; k <= nzc; ++k)
|
||||
{
|
||||
for (int i = -2; i <= nxc; ++i)
|
||||
{
|
||||
funcc_host[coarse_index(i, target_j, k)] = funcc_host[coarse_index(i, source_j, k)] * SoA[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int offset = 0; offset < 3; ++offset)
|
||||
{
|
||||
int target_k = -offset;
|
||||
int source_k = offset + 2;
|
||||
for (int j = -2; j <= nyc; ++j)
|
||||
{
|
||||
for (int i = -2; i <= nxc; ++i)
|
||||
{
|
||||
funcc_host[coarse_index(i, j, target_k)] = funcc_host[coarse_index(i, j, source_k)] * SoA[2];
|
||||
for (int i = 1; i <= nxc; ++i)
|
||||
{
|
||||
funcc_host[coarse_index(i, j, target_k)] =
|
||||
funcc_host[coarse_index(i, j, source_k)] * SoA[2];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1456,21 +1551,38 @@ int bssn_cuda_prolong3_pack(int wei,
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
if (!ensure_capacity(cache.error_flag, sizeof(int)))
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
int zero = 0;
|
||||
cudaError_t err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(H2D) prolong3_error_flag", err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
dim3 block(256);
|
||||
dim3 grid(div_up(fine_points, static_cast<int>(block.x)));
|
||||
double *d_funf = cache.fine.ptr;
|
||||
int lbc_local[3] = {lbc[0] + stage_ix_lo - 1,
|
||||
lbc[1] + stage_iy_lo - 1,
|
||||
lbc[2] + stage_iz_lo - 1};
|
||||
int ibegin = imino - 1, iend = imaxo - 1;
|
||||
int jbegin = jmino - 1, jend = jmaxo - 1;
|
||||
int kbegin = kmino - 1, kend = kmaxo - 1;
|
||||
void *args[] = {&d_func, &d_funf,
|
||||
&sxc, &syc,
|
||||
&nxc, &nyc, &nzc,
|
||||
&nxf, &nyf, &nzf,
|
||||
&lbc[0], &lbc[1], &lbc[2],
|
||||
&lbc_local[0], &lbc_local[1], &lbc_local[2],
|
||||
&lbf[0], &lbf[1], &lbf[2],
|
||||
&ibegin, &iend,
|
||||
&jbegin, &jend,
|
||||
&kbegin, &kend};
|
||||
&kbegin, &kend,
|
||||
&cache.error_flag.ptr};
|
||||
|
||||
if (!launch_kernel(grid, block, (const void *)prolong3_cell_kernel, args))
|
||||
return 1;
|
||||
@@ -1478,11 +1590,33 @@ int bssn_cuda_prolong3_pack(int wei,
|
||||
cudaError_t sync_err = cudaDeviceSynchronize();
|
||||
if (sync_err != cudaSuccess)
|
||||
{
|
||||
std::fprintf(stderr,
|
||||
"prolong3 debug: symmetry=%d extc=(%d,%d,%d) extf=(%d,%d,%d) "
|
||||
"imino=%d imaxo=%d jmino=%d jmaxo=%d kmino=%d kmaxo=%d "
|
||||
"ic_min=%d ic_max=%d jc_min=%d jc_max=%d kc_min=%d kc_max=%d "
|
||||
"lbc=(%d,%d,%d) lbf=(%d,%d,%d)\n",
|
||||
symmetry,
|
||||
extc[0], extc[1], extc[2],
|
||||
extf[0], extf[1], extf[2],
|
||||
imino, imaxo, jmino, jmaxo, kmino, kmaxo,
|
||||
ic_min, ic_max, jc_min, jc_max, kc_min, kc_max,
|
||||
lbc[0], lbc[1], lbc[2],
|
||||
lbf[0], lbf[1], lbf[2]);
|
||||
report_cuda_error("cudaDeviceSynchronize prolong3", sync_err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
cudaError_t err = cudaMemcpy(funf, cache.fine.ptr, fine_bytes, cudaMemcpyDeviceToHost);
|
||||
int host_error_flag = 0;
|
||||
err = cudaMemcpy(&host_error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(D2H) prolong3_error_flag", err);
|
||||
return 1;
|
||||
}
|
||||
if (host_error_flag)
|
||||
return 1;
|
||||
|
||||
err = cudaMemcpy(funf, cache.fine.ptr, fine_bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(D2H) prolong3", err);
|
||||
@@ -1490,3 +1624,249 @@ int bssn_cuda_prolong3_pack(int wei,
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bssn_cuda_restrict3_pack(int wei,
|
||||
const double *llbc, const double *uubc, const int *extc, double *func,
|
||||
const double *llbf, const double *uubf, const int *extf, const double *funf,
|
||||
const double *llbr, const double *uubr,
|
||||
const double *SoA, int symmetry)
|
||||
{
|
||||
const int no_symm = 0;
|
||||
const int eq_symm = 1;
|
||||
|
||||
if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbr || !uubr || !SoA)
|
||||
return 1;
|
||||
|
||||
if (symmetry != no_symm && symmetry != eq_symm)
|
||||
return 1;
|
||||
|
||||
auto idint_like = [](double x) -> int {
|
||||
return static_cast<int>(std::trunc(x));
|
||||
};
|
||||
|
||||
double base[3];
|
||||
double CD[3];
|
||||
double FD[3];
|
||||
int lbc[3], ubc[3], lbf[3], ubf[3], lbr_i[3], ubr_i[3], lbrf[3], ubrf[3];
|
||||
for (int d = 0; d < 3; ++d)
|
||||
{
|
||||
CD[d] = (uubc[d] - llbc[d]) / static_cast<double>(extc[d]);
|
||||
FD[d] = (uubf[d] - llbf[d]) / static_cast<double>(extf[d]);
|
||||
if (std::fabs(CD[d] - 2.0 * FD[d]) > 1.0e-10)
|
||||
return 1;
|
||||
|
||||
if (llbc[d] <= llbf[d])
|
||||
{
|
||||
base[d] = llbc[d];
|
||||
}
|
||||
else
|
||||
{
|
||||
int j = idint_like((llbc[d] - llbf[d]) / FD[d] + 0.4);
|
||||
base[d] = (j / 2 * 2 == j) ? llbf[d] : (llbf[d] - CD[d] / 2.0);
|
||||
}
|
||||
|
||||
lbf[d] = idint_like((llbf[d] - base[d]) / FD[d] + 0.4) + 1;
|
||||
ubf[d] = idint_like((uubf[d] - base[d]) / FD[d] + 0.4);
|
||||
lbc[d] = idint_like((llbc[d] - base[d]) / CD[d] + 0.4) + 1;
|
||||
ubc[d] = idint_like((uubc[d] - base[d]) / CD[d] + 0.4);
|
||||
lbr_i[d] = idint_like((llbr[d] - base[d]) / CD[d] + 0.4) + 1;
|
||||
lbrf[d] = idint_like((llbr[d] - base[d]) / FD[d] + 0.4) + 1;
|
||||
ubr_i[d] = idint_like((uubr[d] - base[d]) / CD[d] + 0.4);
|
||||
ubrf[d] = idint_like((uubr[d] - base[d]) / FD[d] + 0.4);
|
||||
(void)ubc[d];
|
||||
(void)ubf[d];
|
||||
}
|
||||
|
||||
const int imino = lbr_i[0] - lbc[0] + 1;
|
||||
const int imaxo = ubr_i[0] - lbc[0] + 1;
|
||||
const int jmino = lbr_i[1] - lbc[1] + 1;
|
||||
const int jmaxo = ubr_i[1] - lbc[1] + 1;
|
||||
const int kmino = lbr_i[2] - lbc[2] + 1;
|
||||
const int kmaxo = ubr_i[2] - lbc[2] + 1;
|
||||
|
||||
const int imini = lbrf[0] - lbf[0] + 1;
|
||||
const int imaxi = ubrf[0] - lbf[0] + 1;
|
||||
const int jmini = lbrf[1] - lbf[1] + 1;
|
||||
const int jmaxi = ubrf[1] - lbf[1] + 1;
|
||||
const int kmini = lbrf[2] - lbf[2] + 1;
|
||||
const int kmaxi = ubrf[2] - lbf[2] + 1;
|
||||
|
||||
if (imino < 1 || jmino < 1 || kmino < 1 ||
|
||||
imini < 1 || jmini < 1 || kmini < 1 ||
|
||||
imaxo > extc[0] || jmaxo > extc[1] || kmaxo > extc[2] ||
|
||||
imaxi > extf[0] - 2 || jmaxi > extf[1] - 2 || kmaxi > extf[2] - 2)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Keep thin restrict slabs on the established CPU path for now.
|
||||
if (extc[0] <= 2 * wei || extc[1] <= 2 * wei || extc[2] <= 2 * wei ||
|
||||
extf[0] <= 2 * wei || extf[1] <= 2 * wei || extf[2] <= 2 * wei)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int fi_min = 2 * (imino + lbc[0] - 1) - lbf[0];
|
||||
const int fi_max = 2 * (imaxo + lbc[0] - 1) - lbf[0];
|
||||
const int fj_min = 2 * (jmino + lbc[1] - 1) - lbf[1];
|
||||
const int fj_max = 2 * (jmaxo + lbc[1] - 1) - lbf[1];
|
||||
const int fk_min = 2 * (kmino + lbc[2] - 1) - lbf[2];
|
||||
const int fk_max = 2 * (kmaxo + lbc[2] - 1) - lbf[2];
|
||||
|
||||
const int ii_lo = fi_min - 2;
|
||||
const int ii_hi = fi_max + 3;
|
||||
const int jj_lo = fj_min - 2;
|
||||
const int jj_hi = fj_max + 3;
|
||||
const int kk_lo = fk_min - 2;
|
||||
const int kk_hi = fk_max + 3;
|
||||
|
||||
if (ii_hi > extf[0] || jj_hi > extf[1] || kk_hi > extf[2] ||
|
||||
ii_lo < -1 || jj_lo < -1 || kk_lo < -1)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
const bool need_equatorial_z_reflect = (symmetry == eq_symm && kk_lo < 1);
|
||||
// Keep symmetry-reflected restrict windows on the CPU path for now.
|
||||
if (ii_lo < 1 || jj_lo < 1 || kk_lo < 1)
|
||||
return 1;
|
||||
|
||||
struct RestrictCache
|
||||
{
|
||||
CachedBuffer fine;
|
||||
CachedBuffer coarse;
|
||||
CachedIntBuffer error_flag;
|
||||
};
|
||||
static thread_local RestrictCache cache;
|
||||
|
||||
int nxc = extc[0];
|
||||
int nyc = extc[1];
|
||||
int nzc = extc[2];
|
||||
int src_nxf = extf[0];
|
||||
int src_nyf = extf[1];
|
||||
int src_nzf = extf[2];
|
||||
const int stage_fx_lo = ii_lo;
|
||||
const int stage_fx_hi = ii_hi;
|
||||
const int stage_fy_lo = jj_lo;
|
||||
const int stage_fy_hi = jj_hi;
|
||||
const int stage_fz_lo = kk_lo;
|
||||
const int stage_fz_hi = kk_hi;
|
||||
int nxf = stage_fx_hi - stage_fx_lo + 1;
|
||||
int nyf = stage_fy_hi - stage_fy_lo + 1;
|
||||
int nzf = stage_fz_hi - stage_fz_lo + 1;
|
||||
int sxf = nxf + 2;
|
||||
int syf = nyf + 2;
|
||||
int szf = nzf + 2;
|
||||
int fine_points = sxf * syf * szf;
|
||||
int coarse_points = nxc * nyc * nzc;
|
||||
const size_t fine_bytes = static_cast<size_t>(fine_points) * sizeof(double);
|
||||
const size_t coarse_bytes = static_cast<size_t>(coarse_points) * sizeof(double);
|
||||
|
||||
std::vector<double> funff_host(static_cast<size_t>(fine_points), 0.0);
|
||||
auto fine_index = [sxf, syf](int fi, int fj, int fk) -> size_t {
|
||||
return static_cast<size_t>(fi + 1) +
|
||||
static_cast<size_t>(fj + 1) * static_cast<size_t>(sxf) +
|
||||
static_cast<size_t>(fk + 1) * static_cast<size_t>(sxf) * static_cast<size_t>(syf);
|
||||
};
|
||||
auto src_index = [src_nxf, src_nyf](int i, int j, int k) -> size_t {
|
||||
return static_cast<size_t>(i - 1) +
|
||||
static_cast<size_t>(j - 1) * static_cast<size_t>(src_nxf) +
|
||||
static_cast<size_t>(k - 1) * static_cast<size_t>(src_nxf) * static_cast<size_t>(src_nyf);
|
||||
};
|
||||
|
||||
for (int k = stage_fz_lo; k <= stage_fz_hi; ++k)
|
||||
{
|
||||
const int lk = k - stage_fz_lo + 1;
|
||||
for (int j = stage_fy_lo; j <= stage_fy_hi; ++j)
|
||||
{
|
||||
const int lj = j - stage_fy_lo + 1;
|
||||
for (int i = stage_fx_lo; i <= stage_fx_hi; ++i)
|
||||
{
|
||||
const int li = i - stage_fx_lo + 1;
|
||||
funff_host[fine_index(li, lj, lk)] = funf[src_index(i, j, k)];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(void)need_equatorial_z_reflect;
|
||||
|
||||
if (!copy_to_device(cache.fine, funff_host.data(), fine_bytes))
|
||||
return 1;
|
||||
if (!ensure_capacity(cache.coarse, coarse_bytes))
|
||||
return 1;
|
||||
if (!ensure_capacity(cache.error_flag, sizeof(int)))
|
||||
return 1;
|
||||
|
||||
int zero = 0;
|
||||
cudaError_t err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(H2D) restrict3_error_flag", err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
dim3 block(256);
|
||||
dim3 grid(div_up(coarse_points, static_cast<int>(block.x)));
|
||||
double *d_funf = cache.fine.ptr;
|
||||
double *d_func = cache.coarse.ptr;
|
||||
int lbf_local[3] = {lbf[0] + stage_fx_lo - 1,
|
||||
lbf[1] + stage_fy_lo - 1,
|
||||
lbf[2] + stage_fz_lo - 1};
|
||||
int ibegin = imino - 1, iend = imaxo - 1;
|
||||
int jbegin = jmino - 1, jend = jmaxo - 1;
|
||||
int kbegin = kmino - 1, kend = kmaxo - 1;
|
||||
void *args[] = {&d_funf, &d_func,
|
||||
&sxf, &syf,
|
||||
&nxf, &nyf, &nzf,
|
||||
&nxc, &nyc, &nzc,
|
||||
&lbc[0], &lbc[1], &lbc[2],
|
||||
&lbf_local[0], &lbf_local[1], &lbf_local[2],
|
||||
&ibegin, &iend,
|
||||
&jbegin, &jend,
|
||||
&kbegin, &kend,
|
||||
&cache.error_flag.ptr};
|
||||
|
||||
if (!launch_kernel(grid, block, (const void *)restrict3_cell_kernel, args))
|
||||
return 1;
|
||||
|
||||
cudaError_t sync_err = cudaDeviceSynchronize();
|
||||
if (sync_err != cudaSuccess)
|
||||
{
|
||||
std::fprintf(stderr,
|
||||
"restrict3 debug: symmetry=%d extc=(%d,%d,%d) extf=(%d,%d,%d) "
|
||||
"imino=%d imaxo=%d jmino=%d jmaxo=%d kmino=%d kmaxo=%d "
|
||||
"imini=%d imaxi=%d jmini=%d jmaxi=%d kmini=%d kmaxi=%d "
|
||||
"lbc=(%d,%d,%d) lbf=(%d,%d,%d) "
|
||||
"fi=[%d,%d] fj=[%d,%d] fk=[%d,%d] window=[%d:%d,%d:%d,%d:%d]\n",
|
||||
symmetry,
|
||||
extc[0], extc[1], extc[2],
|
||||
extf[0], extf[1], extf[2],
|
||||
imino, imaxo, jmino, jmaxo, kmino, kmaxo,
|
||||
imini, imaxi, jmini, jmaxi, kmini, kmaxi,
|
||||
lbc[0], lbc[1], lbc[2],
|
||||
lbf[0], lbf[1], lbf[2],
|
||||
fi_min, fi_max, fj_min, fj_max, fk_min, fk_max,
|
||||
ii_lo, ii_hi, jj_lo, jj_hi, kk_lo, kk_hi);
|
||||
report_cuda_error("cudaDeviceSynchronize restrict3", sync_err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int host_error_flag = 0;
|
||||
err = cudaMemcpy(&host_error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(D2H) restrict3_error_flag", err);
|
||||
return 1;
|
||||
}
|
||||
if (host_error_flag)
|
||||
return 1;
|
||||
|
||||
err = cudaMemcpy(func, cache.coarse.ptr, coarse_bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(D2H) restrict3", err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -33,6 +33,12 @@ int bssn_cuda_prolong3_pack(int wei,
|
||||
const double *llbp, const double *uubp,
|
||||
const double *SoA, int symmetry);
|
||||
|
||||
int bssn_cuda_restrict3_pack(int wei,
|
||||
const double *llbc, const double *uubc, const int *extc, double *func,
|
||||
const double *llbf, const double *uubf, const int *extf, const double *funf,
|
||||
const double *llbr, const double *uubr,
|
||||
const double *SoA, int symmetry);
|
||||
|
||||
int bssn_cuda_interp_points_batch(const int *ex,
|
||||
const double *X, const double *Y, const double *Z,
|
||||
const double *const *fields,
|
||||
|
||||
Reference in New Issue
Block a user