Refine stable GPU AMR staging path
This commit is contained in:
@@ -262,6 +262,65 @@ inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *sr
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline bool copy_region_to_padded_stage(CachedBuffer &dst,
|
||||||
|
const double *src,
|
||||||
|
const int src_shape[3],
|
||||||
|
int src_i0, int src_j0, int src_k0,
|
||||||
|
int region_nx, int region_ny, int region_nz,
|
||||||
|
int dst_sx, int dst_sy, int dst_sz,
|
||||||
|
int dst_x0, int dst_y0, int dst_z0,
|
||||||
|
const char *where)
|
||||||
|
{
|
||||||
|
if (!src || !where || region_nx <= 0 || region_ny <= 0 || region_nz <= 0 ||
|
||||||
|
dst_sx <= 0 || dst_sy <= 0 || dst_sz <= 0)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t dst_bytes =
|
||||||
|
static_cast<size_t>(dst_sx) * static_cast<size_t>(dst_sy) * static_cast<size_t>(dst_sz) * sizeof(double);
|
||||||
|
if (!ensure_capacity(dst, dst_bytes))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
const double *device_src = bssn_gpu_find_device_buffer(src);
|
||||||
|
const void *copy_src = device_src ? static_cast<const void *>(device_src) : static_cast<const void *>(src);
|
||||||
|
cudaMemcpyKind kind = device_src ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
|
||||||
|
|
||||||
|
if (!device_src)
|
||||||
|
{
|
||||||
|
const int src_count = src_shape[0] * src_shape[1] * src_shape[2];
|
||||||
|
bssn_gpu_prepare_host_buffer(src, src_count);
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaMemcpy3DParms parms = {};
|
||||||
|
parms.srcPtr = make_cudaPitchedPtr(const_cast<void *>(copy_src),
|
||||||
|
static_cast<size_t>(src_shape[0]) * sizeof(double),
|
||||||
|
static_cast<size_t>(src_shape[0]),
|
||||||
|
static_cast<size_t>(src_shape[1]));
|
||||||
|
parms.dstPtr = make_cudaPitchedPtr(dst.ptr,
|
||||||
|
static_cast<size_t>(dst_sx) * sizeof(double),
|
||||||
|
static_cast<size_t>(dst_sx),
|
||||||
|
static_cast<size_t>(dst_sy));
|
||||||
|
parms.srcPos = make_cudaPos(static_cast<size_t>(src_i0) * sizeof(double),
|
||||||
|
static_cast<size_t>(src_j0),
|
||||||
|
static_cast<size_t>(src_k0));
|
||||||
|
parms.dstPos = make_cudaPos(static_cast<size_t>(dst_x0) * sizeof(double),
|
||||||
|
static_cast<size_t>(dst_y0),
|
||||||
|
static_cast<size_t>(dst_z0));
|
||||||
|
parms.extent = make_cudaExtent(static_cast<size_t>(region_nx) * sizeof(double),
|
||||||
|
static_cast<size_t>(region_ny),
|
||||||
|
static_cast<size_t>(region_nz));
|
||||||
|
parms.kind = kind;
|
||||||
|
|
||||||
|
cudaError_t err = cudaMemcpy3D(&parms);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
report_cuda_error(where, err);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
inline int interp_idint_like_host(double x)
|
inline int interp_idint_like_host(double x)
|
||||||
{
|
{
|
||||||
return static_cast<int>(x);
|
return static_cast<int>(x);
|
||||||
@@ -508,6 +567,30 @@ prolong3_cell_kernel_next_point:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__ void prolong3_fill_equatorial_reflect_kernel(double *funcc,
|
||||||
|
int sxc, int syc,
|
||||||
|
int nxc, int nyc,
|
||||||
|
double z_soa)
|
||||||
|
{
|
||||||
|
const int n = nxc * nyc * 3;
|
||||||
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
||||||
|
{
|
||||||
|
const int plane = nxc * nyc;
|
||||||
|
const int zoff = idx / plane;
|
||||||
|
const int rem = idx - zoff * plane;
|
||||||
|
const int j = rem / nxc;
|
||||||
|
const int i = rem - j * nxc;
|
||||||
|
|
||||||
|
const int li = i + 1;
|
||||||
|
const int lj = j + 1;
|
||||||
|
const int source_k = zoff + 1;
|
||||||
|
const int target_k = -zoff;
|
||||||
|
|
||||||
|
funcc[(li + 2) + (lj + 2) * sxc + (target_k + 2) * sxc * syc] =
|
||||||
|
z_soa * funcc[(li + 2) + (lj + 2) * sxc + (source_k + 2) * sxc * syc];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
__global__ void restrict3_cell_kernel(const double *funf, double *func,
|
__global__ void restrict3_cell_kernel(const double *funf, double *func,
|
||||||
int sxf, int syf,
|
int sxf, int syf,
|
||||||
int nxf, int nyf, int nzf,
|
int nxf, int nyf, int nzf,
|
||||||
@@ -1442,10 +1525,13 @@ int bssn_cuda_prolong3_pack(int wei,
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Keep thin prolongation slabs on the established CPU path for now.
|
const bool thin_x = (extf[0] <= 2 * wei);
|
||||||
// The GPU kernel is only enabled for chunkier regions where the current
|
const bool thin_y = (extf[1] <= 2 * wei);
|
||||||
// device pack path is known to be safe.
|
const bool thin_z = (extf[2] <= 2 * wei);
|
||||||
if (extf[0] <= 2 * wei || extf[1] <= 2 * wei || extf[2] <= 2 * wei)
|
// Keep thin slabs on the CPU path until the dedicated slab kernel is
|
||||||
|
// fully validated. The first z-thin specialization still triggers illegal
|
||||||
|
// accesses on the common extf=(48,48,6) case.
|
||||||
|
if (thin_x || thin_y || thin_z)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int {
|
auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int {
|
||||||
@@ -1480,7 +1566,6 @@ int bssn_cuda_prolong3_pack(int wei,
|
|||||||
};
|
};
|
||||||
static thread_local ProlongCache cache;
|
static thread_local ProlongCache cache;
|
||||||
|
|
||||||
int src_nxc = extc[0], src_nyc = extc[1], src_nzc = extc[2];
|
|
||||||
int nxf = extf[0], nyf = extf[1], nzf = extf[2];
|
int nxf = extf[0], nyf = extf[1], nzf = extf[2];
|
||||||
const int stage_ix_lo = ic_min - 2;
|
const int stage_ix_lo = ic_min - 2;
|
||||||
const int stage_ix_hi = ic_max + 3;
|
const int stage_ix_hi = ic_max + 3;
|
||||||
@@ -1494,59 +1579,50 @@ int bssn_cuda_prolong3_pack(int wei,
|
|||||||
int sxc = nxc + 3;
|
int sxc = nxc + 3;
|
||||||
int syc = nyc + 3;
|
int syc = nyc + 3;
|
||||||
int szc = nzc + 3;
|
int szc = nzc + 3;
|
||||||
int coarse_points = sxc * syc * szc;
|
|
||||||
int fine_points = nxf * nyf * nzf;
|
int fine_points = nxf * nyf * nzf;
|
||||||
const size_t coarse_bytes = static_cast<size_t>(coarse_points) * sizeof(double);
|
|
||||||
const size_t fine_bytes = static_cast<size_t>(fine_points) * sizeof(double);
|
const size_t fine_bytes = static_cast<size_t>(fine_points) * sizeof(double);
|
||||||
|
if (!copy_region_to_padded_stage(cache.coarse,
|
||||||
std::vector<double> funcc_host(static_cast<size_t>(coarse_points), 0.0);
|
func,
|
||||||
auto coarse_index = [sxc, syc](int fi, int fj, int fk) -> size_t {
|
extc,
|
||||||
return static_cast<size_t>(fi + 2) +
|
stage_ix_lo - 1,
|
||||||
static_cast<size_t>(fj + 2) * static_cast<size_t>(sxc) +
|
stage_iy_lo - 1,
|
||||||
static_cast<size_t>(fk + 2) * static_cast<size_t>(sxc) * static_cast<size_t>(syc);
|
stage_iz_lo - 1,
|
||||||
};
|
nxc,
|
||||||
auto func_index = [src_nxc, src_nyc](int i, int j, int k) -> size_t {
|
nyc,
|
||||||
return static_cast<size_t>(i - 1) +
|
nzc,
|
||||||
static_cast<size_t>(j - 1) * static_cast<size_t>(src_nxc) +
|
sxc,
|
||||||
static_cast<size_t>(k - 1) * static_cast<size_t>(src_nxc) * static_cast<size_t>(src_nyc);
|
syc,
|
||||||
};
|
szc,
|
||||||
|
3,
|
||||||
for (int k = stage_iz_lo; k <= stage_iz_hi; ++k)
|
3,
|
||||||
|
3,
|
||||||
|
"cudaMemcpy3D prolong3 stage"))
|
||||||
{
|
{
|
||||||
const int lk = k - stage_iz_lo + 1;
|
return 1;
|
||||||
for (int j = stage_iy_lo; j <= stage_iy_hi; ++j)
|
|
||||||
{
|
|
||||||
const int lj = j - stage_iy_lo + 1;
|
|
||||||
for (int i = stage_ix_lo; i <= stage_ix_hi; ++i)
|
|
||||||
{
|
|
||||||
const int li = i - stage_ix_lo + 1;
|
|
||||||
funcc_host[coarse_index(li, lj, lk)] = func[func_index(i, j, k)];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
double *d_func = cache.coarse.ptr;
|
||||||
|
|
||||||
if (need_equatorial_z_reflect)
|
if (need_equatorial_z_reflect)
|
||||||
{
|
{
|
||||||
for (int offset = 0; offset < 3; ++offset)
|
dim3 reflect_block(256);
|
||||||
|
dim3 reflect_grid(div_up(nxc * nyc * 3, static_cast<int>(reflect_block.x)));
|
||||||
|
double z_soa = SoA[2];
|
||||||
|
void *reflect_args[] = {
|
||||||
|
&d_func,
|
||||||
|
&sxc,
|
||||||
|
&syc,
|
||||||
|
&nxc,
|
||||||
|
&nyc,
|
||||||
|
&z_soa};
|
||||||
|
if (!launch_kernel(reflect_grid, reflect_block,
|
||||||
|
(const void *)prolong3_fill_equatorial_reflect_kernel,
|
||||||
|
reflect_args))
|
||||||
{
|
{
|
||||||
const int target_k = -offset;
|
return 1;
|
||||||
const int source_k = offset + 1;
|
|
||||||
for (int j = 1; j <= nyc; ++j)
|
|
||||||
{
|
|
||||||
for (int i = 1; i <= nxc; ++i)
|
|
||||||
{
|
|
||||||
funcc_host[coarse_index(i, j, target_k)] =
|
|
||||||
funcc_host[coarse_index(i, j, source_k)] * SoA[2];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
double *d_func = nullptr;
|
|
||||||
if (!copy_to_device(cache.coarse, funcc_host.data(), coarse_bytes))
|
|
||||||
return 1;
|
|
||||||
d_func = cache.coarse.ptr;
|
|
||||||
|
|
||||||
if (!ensure_capacity(cache.fine, fine_bytes))
|
if (!ensure_capacity(cache.fine, fine_bytes))
|
||||||
{
|
{
|
||||||
return 1;
|
return 1;
|
||||||
@@ -1742,9 +1818,6 @@ int bssn_cuda_restrict3_pack(int wei,
|
|||||||
int nxc = extc[0];
|
int nxc = extc[0];
|
||||||
int nyc = extc[1];
|
int nyc = extc[1];
|
||||||
int nzc = extc[2];
|
int nzc = extc[2];
|
||||||
int src_nxf = extf[0];
|
|
||||||
int src_nyf = extf[1];
|
|
||||||
int src_nzf = extf[2];
|
|
||||||
const int stage_fx_lo = ii_lo;
|
const int stage_fx_lo = ii_lo;
|
||||||
const int stage_fx_hi = ii_hi;
|
const int stage_fx_hi = ii_hi;
|
||||||
const int stage_fy_lo = jj_lo;
|
const int stage_fy_lo = jj_lo;
|
||||||
@@ -1757,40 +1830,26 @@ int bssn_cuda_restrict3_pack(int wei,
|
|||||||
int sxf = nxf + 2;
|
int sxf = nxf + 2;
|
||||||
int syf = nyf + 2;
|
int syf = nyf + 2;
|
||||||
int szf = nzf + 2;
|
int szf = nzf + 2;
|
||||||
int fine_points = sxf * syf * szf;
|
|
||||||
int coarse_points = nxc * nyc * nzc;
|
int coarse_points = nxc * nyc * nzc;
|
||||||
const size_t fine_bytes = static_cast<size_t>(fine_points) * sizeof(double);
|
|
||||||
const size_t coarse_bytes = static_cast<size_t>(coarse_points) * sizeof(double);
|
const size_t coarse_bytes = static_cast<size_t>(coarse_points) * sizeof(double);
|
||||||
|
|
||||||
std::vector<double> funff_host(static_cast<size_t>(fine_points), 0.0);
|
|
||||||
auto fine_index = [sxf, syf](int fi, int fj, int fk) -> size_t {
|
|
||||||
return static_cast<size_t>(fi + 1) +
|
|
||||||
static_cast<size_t>(fj + 1) * static_cast<size_t>(sxf) +
|
|
||||||
static_cast<size_t>(fk + 1) * static_cast<size_t>(sxf) * static_cast<size_t>(syf);
|
|
||||||
};
|
|
||||||
auto src_index = [src_nxf, src_nyf](int i, int j, int k) -> size_t {
|
|
||||||
return static_cast<size_t>(i - 1) +
|
|
||||||
static_cast<size_t>(j - 1) * static_cast<size_t>(src_nxf) +
|
|
||||||
static_cast<size_t>(k - 1) * static_cast<size_t>(src_nxf) * static_cast<size_t>(src_nyf);
|
|
||||||
};
|
|
||||||
|
|
||||||
for (int k = stage_fz_lo; k <= stage_fz_hi; ++k)
|
|
||||||
{
|
|
||||||
const int lk = k - stage_fz_lo + 1;
|
|
||||||
for (int j = stage_fy_lo; j <= stage_fy_hi; ++j)
|
|
||||||
{
|
|
||||||
const int lj = j - stage_fy_lo + 1;
|
|
||||||
for (int i = stage_fx_lo; i <= stage_fx_hi; ++i)
|
|
||||||
{
|
|
||||||
const int li = i - stage_fx_lo + 1;
|
|
||||||
funff_host[fine_index(li, lj, lk)] = funf[src_index(i, j, k)];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
(void)need_equatorial_z_reflect;
|
(void)need_equatorial_z_reflect;
|
||||||
|
|
||||||
if (!copy_to_device(cache.fine, funff_host.data(), fine_bytes))
|
if (!copy_region_to_padded_stage(cache.fine,
|
||||||
|
funf,
|
||||||
|
extf,
|
||||||
|
stage_fx_lo - 1,
|
||||||
|
stage_fy_lo - 1,
|
||||||
|
stage_fz_lo - 1,
|
||||||
|
nxf,
|
||||||
|
nyf,
|
||||||
|
nzf,
|
||||||
|
sxf,
|
||||||
|
syf,
|
||||||
|
szf,
|
||||||
|
2,
|
||||||
|
2,
|
||||||
|
2,
|
||||||
|
"cudaMemcpy3D restrict3 stage"))
|
||||||
return 1;
|
return 1;
|
||||||
if (!ensure_capacity(cache.coarse, coarse_bytes))
|
if (!ensure_capacity(cache.coarse, coarse_bytes))
|
||||||
return 1;
|
return 1;
|
||||||
|
|||||||
Reference in New Issue
Block a user