1411 lines
46 KiB
Plaintext
1411 lines
46 KiB
Plaintext
#include "bssn_cuda_ops.h"
|
|
#include "bssn_gpu.h"
|
|
|
|
#include <cmath>
|
|
#include <cstdio>
|
|
#include <cuda_runtime.h>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
|
|
namespace {
|
|
|
|
inline void report_cuda_error(const char *where, cudaError_t err)
|
|
{
|
|
if (err != cudaSuccess)
|
|
std::fprintf(stderr, "CUDA error at %s: %s\n", where, cudaGetErrorString(err));
|
|
}
|
|
|
|
inline int count_points(const int ex[3])
|
|
{
|
|
return ex[0] * ex[1] * ex[2];
|
|
}
|
|
|
|
inline int div_up(int a, int b)
|
|
{
|
|
return (a + b - 1) / b;
|
|
}
|
|
|
|
struct DeviceArrays
|
|
{
|
|
double *x = nullptr;
|
|
double *y = nullptr;
|
|
double *z = nullptr;
|
|
double *a = nullptr;
|
|
double *b = nullptr;
|
|
double *c = nullptr;
|
|
double *d = nullptr;
|
|
};
|
|
|
|
struct CachedBuffer
|
|
{
|
|
double *ptr = nullptr;
|
|
size_t capacity = 0;
|
|
};
|
|
|
|
struct CachedIntBuffer
|
|
{
|
|
int *ptr = nullptr;
|
|
size_t capacity = 0;
|
|
};
|
|
|
|
inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
|
|
{
|
|
if (bytes <= buffer.capacity && buffer.ptr)
|
|
return true;
|
|
|
|
if (buffer.ptr)
|
|
{
|
|
cudaError_t free_err = cudaFree(buffer.ptr);
|
|
if (free_err != cudaSuccess)
|
|
report_cuda_error("cudaFree", free_err);
|
|
buffer.ptr = nullptr;
|
|
buffer.capacity = 0;
|
|
}
|
|
|
|
cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMalloc", err);
|
|
return false;
|
|
}
|
|
|
|
buffer.capacity = bytes;
|
|
return true;
|
|
}
|
|
|
|
inline bool ensure_capacity(CachedIntBuffer &buffer, size_t bytes)
|
|
{
|
|
if (bytes <= buffer.capacity && buffer.ptr)
|
|
return true;
|
|
|
|
if (buffer.ptr)
|
|
{
|
|
cudaError_t free_err = cudaFree(buffer.ptr);
|
|
if (free_err != cudaSuccess)
|
|
report_cuda_error("cudaFree", free_err);
|
|
buffer.ptr = nullptr;
|
|
buffer.capacity = 0;
|
|
}
|
|
|
|
cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMalloc", err);
|
|
return false;
|
|
}
|
|
|
|
buffer.capacity = bytes;
|
|
return true;
|
|
}
|
|
|
|
inline bool copy_to_device(CachedIntBuffer &dst, const int *src, size_t bytes)
|
|
{
|
|
if (!ensure_capacity(dst, bytes))
|
|
return false;
|
|
|
|
cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D int)", err);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes)
|
|
{
|
|
if (!ensure_capacity(dst, bytes))
|
|
return false;
|
|
|
|
cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D)", err);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *src, size_t bytes)
|
|
{
|
|
if (!ensure_capacity(dst, bytes))
|
|
return false;
|
|
|
|
const double *device_src = bssn_gpu_find_device_buffer(src);
|
|
cudaMemcpyKind kind = cudaMemcpyHostToDevice;
|
|
const void *copy_src = src;
|
|
if (device_src)
|
|
{
|
|
copy_src = device_src;
|
|
kind = cudaMemcpyDeviceToDevice;
|
|
}
|
|
|
|
cudaError_t err = cudaMemcpy(dst.ptr, copy_src, bytes, kind);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error(kind == cudaMemcpyDeviceToDevice ? "cudaMemcpy(D2D)" : "cudaMemcpy(H2D)", err);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
inline int interp_idint_like_host(double x)
|
|
{
|
|
return static_cast<int>(x);
|
|
}
|
|
|
|
inline void lagrange_weights_ord6_host(double x, double *w)
|
|
{
|
|
static const double denom[6] = {-120.0, 24.0, -12.0, 12.0, -24.0, 120.0};
|
|
for (int i = 0; i < 6; ++i)
|
|
{
|
|
double num = 1.0;
|
|
for (int j = 0; j < 6; ++j)
|
|
{
|
|
if (j != i)
|
|
num *= (x - static_cast<double>(j));
|
|
}
|
|
w[i] = num / denom[i];
|
|
}
|
|
}
|
|
|
|
inline bool map_interp_index_host(int logical_idx, int n, int *mapped_idx, int *reflected)
|
|
{
|
|
int idx = logical_idx;
|
|
*reflected = 0;
|
|
if (idx < 0)
|
|
{
|
|
idx = -idx;
|
|
*reflected = 1;
|
|
}
|
|
if (idx < 0 || idx >= n)
|
|
return false;
|
|
*mapped_idx = idx;
|
|
return true;
|
|
}
|
|
|
|
inline bool compute_interp_window_host(double x, const double *coord, int n,
|
|
int ordn, int allow_reflect,
|
|
int *start_idx, double *cx)
|
|
{
|
|
const double dx = coord[1] - coord[0];
|
|
const int center = interp_idint_like_host((x - coord[0]) / dx + 0.4);
|
|
const int cmin = allow_reflect ? (-ordn / 2 + 1) : 0;
|
|
const int cmax = n - 1;
|
|
|
|
int begin = center - ordn / 2 + 1;
|
|
int end = begin + ordn - 1;
|
|
if (begin < cmin)
|
|
{
|
|
begin = cmin;
|
|
end = begin + ordn - 1;
|
|
}
|
|
if (end > cmax)
|
|
{
|
|
end = cmax;
|
|
begin = end + 1 - ordn;
|
|
}
|
|
|
|
if (begin >= 0)
|
|
*cx = (x - coord[begin]) / dx;
|
|
else
|
|
*cx = (x + coord[-begin]) / dx;
|
|
|
|
*start_idx = begin;
|
|
return (begin >= cmin && end <= cmax);
|
|
}
|
|
|
|
__global__ void enforce_ga_kernel(int n,
|
|
double *dxx, double *gxy, double *gxz,
|
|
double *dyy, double *gyz, double *dzz,
|
|
double *Axx, double *Axy, double *Axz,
|
|
double *Ayy, double *Ayz, double *Azz)
|
|
{
|
|
const double one = 1.0;
|
|
const double two = 2.0;
|
|
const double one_third = 1.0 / 3.0;
|
|
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
double lgxx = dxx[idx] + one;
|
|
double lgyy = dyy[idx] + one;
|
|
double lgzz = dzz[idx] + one;
|
|
double lgxy = gxy[idx];
|
|
double lgxz = gxz[idx];
|
|
double lgyz = gyz[idx];
|
|
|
|
double det = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz
|
|
- lgxz * lgyy * lgxz - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz;
|
|
double scale = 1.0 / cbrt(det);
|
|
|
|
lgxx *= scale;
|
|
lgxy *= scale;
|
|
lgxz *= scale;
|
|
lgyy *= scale;
|
|
lgyz *= scale;
|
|
lgzz *= scale;
|
|
|
|
dxx[idx] = lgxx - one;
|
|
gxy[idx] = lgxy;
|
|
gxz[idx] = lgxz;
|
|
dyy[idx] = lgyy - one;
|
|
gyz[idx] = lgyz;
|
|
dzz[idx] = lgzz - one;
|
|
|
|
double gupxx = (lgyy * lgzz - lgyz * lgyz);
|
|
double gupxy = -(lgxy * lgzz - lgyz * lgxz);
|
|
double gupxz = (lgxy * lgyz - lgyy * lgxz);
|
|
double gupyy = (lgxx * lgzz - lgxz * lgxz);
|
|
double gupyz = -(lgxx * lgyz - lgxy * lgxz);
|
|
double gupzz = (lgxx * lgyy - lgxy * lgxy);
|
|
|
|
double trA = gupxx * Axx[idx] + gupyy * Ayy[idx] + gupzz * Azz[idx]
|
|
+ two * (gupxy * Axy[idx] + gupxz * Axz[idx] + gupyz * Ayz[idx]);
|
|
|
|
Axx[idx] -= one_third * lgxx * trA;
|
|
Axy[idx] -= one_third * lgxy * trA;
|
|
Axz[idx] -= one_third * lgxz * trA;
|
|
Ayy[idx] -= one_third * lgyy * trA;
|
|
Ayz[idx] -= one_third * lgyz * trA;
|
|
Azz[idx] -= one_third * lgzz * trA;
|
|
}
|
|
}
|
|
|
|
__device__ inline int index3(int i, int j, int k, int nx, int ny)
|
|
{
|
|
return i + j * nx + k * nx * ny;
|
|
}
|
|
|
|
__device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k,
|
|
int nx, int ny, int nz,
|
|
const double soa[3], int symmetry)
|
|
{
|
|
double sign = 1.0;
|
|
if (i < 0)
|
|
{
|
|
i = -i - 1;
|
|
sign *= soa[0];
|
|
}
|
|
if (j < 0)
|
|
{
|
|
j = -j - 1;
|
|
sign *= soa[1];
|
|
}
|
|
if (k < 0)
|
|
{
|
|
k = -k - 1;
|
|
sign *= soa[2];
|
|
}
|
|
if (i >= nx) i = nx - 1;
|
|
if (j >= ny) j = ny - 1;
|
|
if (k >= nz) k = nz - 1;
|
|
(void)symmetry;
|
|
return sign * f[index3(i, j, k, nx, ny)];
|
|
}
|
|
|
|
__device__ inline double load_prolong_cell_padded(const double *f,
|
|
int i, int j, int k,
|
|
int sx, int sy)
|
|
{
|
|
return f[(i + 2) + (j + 2) * sx + (k + 2) * sx * sy];
|
|
}
|
|
|
|
__global__ void prolong3_cell_kernel(const double *funcc, double *funf,
|
|
int sxc, int syc,
|
|
int nxf, int nyf, int nzf,
|
|
int lbc0, int lbc1, int lbc2,
|
|
int lbf0, int lbf1, int lbf2,
|
|
int ibegin, int iend,
|
|
int jbegin, int jend,
|
|
int kbegin, int kend)
|
|
{
|
|
const double C1 = 7.7e1 / 8.192e3;
|
|
const double C2 = -6.93e2 / 8.192e3;
|
|
const double C3 = 3.465e3 / 4.096e3;
|
|
const double C4 = 1.155e3 / 4.096e3;
|
|
const double C5 = -4.95e2 / 8.192e3;
|
|
const double C6 = 6.3e1 / 8.192e3;
|
|
const double w_even[6] = {C1, C2, C3, C4, C5, C6};
|
|
const double w_odd[6] = {C6, C5, C4, C3, C2, C1};
|
|
|
|
const int n = nxf * nyf * nzf;
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
const int plane = nxf * nyf;
|
|
const int k = idx / plane;
|
|
const int rem = idx - k * plane;
|
|
const int j = rem / nxf;
|
|
const int i = rem - j * nxf;
|
|
|
|
if (i < ibegin || i > iend || j < jbegin || j > jend || k < kbegin || k > kend)
|
|
continue;
|
|
|
|
const int ii = i + lbf0;
|
|
const int jj = j + lbf1;
|
|
const int kk = k + lbf2;
|
|
|
|
const int ic0 = ii / 2 - lbc0 - 1;
|
|
const int jc0 = jj / 2 - lbc1 - 1;
|
|
const int kc0 = kk / 2 - lbc2 - 1;
|
|
|
|
const double *wx = (ii & 1) ? w_odd : w_even;
|
|
const double *wy = (jj & 1) ? w_odd : w_even;
|
|
const double *wz = (kk & 1) ? w_odd : w_even;
|
|
|
|
double value = 0.0;
|
|
for (int ox = 0; ox < 6; ++ox)
|
|
{
|
|
const int cx = ic0 + ox;
|
|
double yz = 0.0;
|
|
for (int oy = 0; oy < 6; ++oy)
|
|
{
|
|
const int cy = jc0 + oy;
|
|
double zsum = 0.0;
|
|
for (int oz = 0; oz < 6; ++oz)
|
|
{
|
|
const int cz = kc0 + oz;
|
|
zsum += wz[oz] * load_prolong_cell_padded(funcc, cx, cy, cz, sxc, syc);
|
|
}
|
|
yz += wy[oy] * zsum;
|
|
}
|
|
value += wx[ox] * yz;
|
|
}
|
|
|
|
funf[idx] = value;
|
|
}
|
|
}
|
|
|
|
__global__ void interp_points_ord6_kernel(int num_points, int num_var,
|
|
int nx, int ny,
|
|
const double *const *fields,
|
|
const double *soa_flat,
|
|
const int *stencil_idx,
|
|
const int *stencil_reflect,
|
|
const double *stencil_weights,
|
|
double *out,
|
|
int *error_flag)
|
|
{
|
|
const int total = num_points * num_var;
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += blockDim.x * gridDim.x)
|
|
{
|
|
const int point_id = idx / num_var;
|
|
const int var_id = idx - point_id * num_var;
|
|
const double *field = fields[var_id];
|
|
const double *soa = soa_flat + 3 * var_id;
|
|
const int *idxp = stencil_idx + point_id * 18;
|
|
const int *refp = stencil_reflect + point_id * 18;
|
|
const double *wp = stencil_weights + point_id * 18;
|
|
|
|
double value = 0.0;
|
|
#pragma unroll
|
|
for (int kz = 0; kz < 6; ++kz)
|
|
{
|
|
const int z_idx = idxp[12 + kz];
|
|
const double z_sign = refp[12 + kz] ? soa[2] : 1.0;
|
|
double yz_sum = 0.0;
|
|
#pragma unroll
|
|
for (int jy = 0; jy < 6; ++jy)
|
|
{
|
|
const int y_idx = idxp[6 + jy];
|
|
const double y_sign = refp[6 + jy] ? soa[1] : 1.0;
|
|
double x_sum = 0.0;
|
|
#pragma unroll
|
|
for (int ix = 0; ix < 6; ++ix)
|
|
{
|
|
const int x_idx = idxp[ix];
|
|
const double x_sign = refp[ix] ? soa[0] : 1.0;
|
|
const double sign = x_sign * y_sign * z_sign;
|
|
x_sum += wp[ix] * sign * field[index3(x_idx, y_idx, z_idx, nx, ny)];
|
|
}
|
|
yz_sum += wp[6 + jy] * x_sum;
|
|
}
|
|
value += wp[12 + kz] * yz_sum;
|
|
}
|
|
out[idx] = value;
|
|
}
|
|
(void)error_flag;
|
|
}
|
|
|
|
__global__ void rk4_kernel(int n, double dT,
|
|
const double *f0,
|
|
double *f1,
|
|
double *rhs,
|
|
int stage)
|
|
{
|
|
const double half = 0.5;
|
|
const double one_sixth = 1.0 / 6.0;
|
|
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
if (stage == 0)
|
|
{
|
|
f1[idx] = f0[idx] + half * dT * rhs[idx];
|
|
}
|
|
else if (stage == 1)
|
|
{
|
|
rhs[idx] += 2.0 * f1[idx];
|
|
f1[idx] = f0[idx] + half * dT * f1[idx];
|
|
}
|
|
else if (stage == 2)
|
|
{
|
|
rhs[idx] += 2.0 * f1[idx];
|
|
f1[idx] = f0[idx] + dT * f1[idx];
|
|
}
|
|
else
|
|
{
|
|
f1[idx] = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]);
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void lowerbound_kernel(int n, double *chi, double tinny)
|
|
{
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
if (chi[idx] < tinny)
|
|
chi[idx] = tinny;
|
|
}
|
|
}
|
|
|
|
__global__ void copy_physical_boundary_kernel(int nx, int ny, int nz,
|
|
int has_xmin, int has_ymin, int has_zmin,
|
|
int has_xmax, int has_ymax, int has_zmax,
|
|
const double *src, double *dst)
|
|
{
|
|
const int n = nx * ny * nz;
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
const int plane = nx * ny;
|
|
const int k = idx / plane;
|
|
const int rem = idx - k * plane;
|
|
const int j = rem / nx;
|
|
const int i = rem - j * nx;
|
|
|
|
if ((has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
|
|
(has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
|
|
(has_zmin && k == 0) || (has_zmax && k == nz - 1))
|
|
{
|
|
dst[idx] = src[idx];
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
|
|
const double *X, const double *Y, const double *Z,
|
|
double xmin, double ymin, double zmin,
|
|
double xmax, double ymax, double zmax,
|
|
int has_xmin, int has_ymin, int has_zmin,
|
|
int has_xmax, int has_ymax, int has_zmax,
|
|
int imin, int jmin, int kmin,
|
|
double propspeed,
|
|
const double *f0,
|
|
double *target,
|
|
double soa0, double soa1, double soa2,
|
|
int symmetry)
|
|
{
|
|
const double one = 1.0;
|
|
const double two = 2.0;
|
|
const int n = nx * ny * nz;
|
|
const double dX = X[1] - X[0];
|
|
const double dY = Y[1] - Y[0];
|
|
const double dZ = Z[1] - Z[0];
|
|
const double d2dx = one / (two * dX);
|
|
const double d2dy = one / (two * dY);
|
|
const double d2dz = one / (two * dZ);
|
|
const double soa[3] = {soa0, soa1, soa2};
|
|
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
const int plane = nx * ny;
|
|
const int k = idx / plane;
|
|
const int rem = idx - k * plane;
|
|
const int j = rem / nx;
|
|
const int i = rem - j * nx;
|
|
|
|
const bool on_boundary =
|
|
(has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
|
|
(has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
|
|
(has_zmin && k == 0) || (has_zmax && k == nz - 1);
|
|
|
|
if (!on_boundary)
|
|
continue;
|
|
|
|
const double radius = sqrt(X[i] * X[i] + Y[j] * Y[j] + Z[k] * Z[k]);
|
|
if (radius == 0.0)
|
|
continue;
|
|
|
|
const double wx = propspeed * X[i] / radius;
|
|
const double wy = propspeed * Y[j] / radius;
|
|
const double wz = propspeed * Z[k] / radius;
|
|
|
|
double fx = 0.0;
|
|
double fy = 0.0;
|
|
double fz = 0.0;
|
|
|
|
if (wx > 0.0)
|
|
{
|
|
if (i - 2 >= imin)
|
|
fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
|
|
else if (i - 1 >= imin)
|
|
fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
else if (wx < 0.0)
|
|
{
|
|
if (i + 2 <= nx - 1)
|
|
fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
else if (i + 1 <= nx - 1)
|
|
fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
|
|
if (wy > 0.0)
|
|
{
|
|
if (j - 2 >= jmin)
|
|
fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
|
|
else if (j - 1 >= jmin)
|
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
else if (wy < 0.0)
|
|
{
|
|
if (j + 2 <= ny - 1)
|
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
else if (j + 1 <= ny - 1)
|
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
|
|
if (wz > 0.0)
|
|
{
|
|
if (k - 2 >= kmin)
|
|
fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
|
|
else if (k - 1 >= kmin)
|
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
else if (wz < 0.0)
|
|
{
|
|
if (k + 2 <= nz - 1)
|
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
else if (k + 1 <= nz - 1)
|
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
|
|
}
|
|
|
|
target[idx] = -propspeed * (fx * X[i] + fy * Y[j] + fz * Z[k] + f0[idx]) / radius;
|
|
}
|
|
}
|
|
|
|
inline bool launch_kernel(dim3 grid, dim3 block, const void *kernel, void **args)
|
|
{
|
|
cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaLaunchKernel", err);
|
|
return false;
|
|
}
|
|
err = cudaPeekAtLastError();
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaPeekAtLastError", err);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
} // namespace
|
|
|
|
int bssn_cuda_enforce_ga(int *ex,
|
|
double *dxx, double *gxy, double *gxz,
|
|
double *dyy, double *gyz, double *dzz,
|
|
double *Axx, double *Axy, double *Axz,
|
|
double *Ayy, double *Ayz, double *Azz)
|
|
{
|
|
struct EnforceGaCache
|
|
{
|
|
CachedBuffer dxx, gxy, gxz, dyy, gyz, dzz;
|
|
CachedBuffer Axx, Axy, Axz, Ayy, Ayz, Azz;
|
|
};
|
|
static thread_local EnforceGaCache cache;
|
|
|
|
int n = count_points(ex);
|
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
|
dim3 block(256);
|
|
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
|
|
|
bool ok = copy_to_device_preferring_device(cache.dxx, dxx, bytes) &&
|
|
copy_to_device_preferring_device(cache.gxy, gxy, bytes) &&
|
|
copy_to_device_preferring_device(cache.gxz, gxz, bytes) &&
|
|
copy_to_device_preferring_device(cache.dyy, dyy, bytes) &&
|
|
copy_to_device_preferring_device(cache.gyz, gyz, bytes) &&
|
|
copy_to_device_preferring_device(cache.dzz, dzz, bytes) &&
|
|
copy_to_device_preferring_device(cache.Axx, Axx, bytes) &&
|
|
copy_to_device_preferring_device(cache.Axy, Axy, bytes) &&
|
|
copy_to_device_preferring_device(cache.Axz, Axz, bytes) &&
|
|
copy_to_device_preferring_device(cache.Ayy, Ayy, bytes) &&
|
|
copy_to_device_preferring_device(cache.Ayz, Ayz, bytes) &&
|
|
copy_to_device_preferring_device(cache.Azz, Azz, bytes);
|
|
|
|
if (ok)
|
|
{
|
|
double *d_dxx = cache.dxx.ptr, *d_gxy = cache.gxy.ptr, *d_gxz = cache.gxz.ptr;
|
|
double *d_dyy = cache.dyy.ptr, *d_gyz = cache.gyz.ptr, *d_dzz = cache.dzz.ptr;
|
|
double *d_Axx = cache.Axx.ptr, *d_Axy = cache.Axy.ptr, *d_Axz = cache.Axz.ptr;
|
|
double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr;
|
|
void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
|
|
&d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
|
|
ok = launch_kernel(grid, block, (const void *)enforce_ga_kernel, args);
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
// The next GPU RHS stage consumes these fields immediately.
|
|
// Keep them device-resident and expose the mapping so gpu_rhs() can
|
|
// reuse them via D2D copies instead of forcing an intermediate D2H round-trip.
|
|
bssn_gpu_register_device_buffer(dxx, cache.dxx.ptr);
|
|
bssn_gpu_register_device_buffer(gxy, cache.gxy.ptr);
|
|
bssn_gpu_register_device_buffer(gxz, cache.gxz.ptr);
|
|
bssn_gpu_register_device_buffer(dyy, cache.dyy.ptr);
|
|
bssn_gpu_register_device_buffer(gyz, cache.gyz.ptr);
|
|
bssn_gpu_register_device_buffer(dzz, cache.dzz.ptr);
|
|
bssn_gpu_register_device_buffer(Axx, cache.Axx.ptr);
|
|
bssn_gpu_register_device_buffer(Axy, cache.Axy.ptr);
|
|
bssn_gpu_register_device_buffer(Axz, cache.Axz.ptr);
|
|
bssn_gpu_register_device_buffer(Ayy, cache.Ayy.ptr);
|
|
bssn_gpu_register_device_buffer(Ayz, cache.Ayz.ptr);
|
|
bssn_gpu_register_device_buffer(Azz, cache.Azz.ptr);
|
|
}
|
|
|
|
return ok ? 0 : 1;
|
|
}
|
|
|
|
int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|
const double *X, const double *Y, const double *Z,
|
|
double xmin, double ymin, double zmin,
|
|
double xmax, double ymax, double zmax,
|
|
const double *state0,
|
|
const double *boundary_src,
|
|
double *stage_data,
|
|
double *rhs_accum,
|
|
double propspeed,
|
|
const double SoA[3],
|
|
int symmetry,
|
|
int lev,
|
|
int rk_stage,
|
|
bool download_to_host)
|
|
{
|
|
struct Rk4VarCache
|
|
{
|
|
CachedBuffer X, Y, Z;
|
|
CachedBuffer state0, boundary, stage, rhs;
|
|
const double *host_X = nullptr;
|
|
const double *host_Y = nullptr;
|
|
const double *host_Z = nullptr;
|
|
const double *host_state0 = nullptr;
|
|
double *host_rhs = nullptr;
|
|
int nx = 0;
|
|
int ny = 0;
|
|
int nz = 0;
|
|
bool rhs_resident = false;
|
|
};
|
|
static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map;
|
|
Rk4VarCache &cache = cache_map[state0];
|
|
|
|
int nx = ex[0];
|
|
int ny = ex[1];
|
|
int nz = ex[2];
|
|
int n = count_points(ex);
|
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
|
const size_t bytes_x = static_cast<size_t>(nx) * sizeof(double);
|
|
const size_t bytes_y = static_cast<size_t>(ny) * sizeof(double);
|
|
const size_t bytes_z = static_cast<size_t>(nz) * sizeof(double);
|
|
dim3 block(256);
|
|
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
|
|
|
const bool need_bam_boundary = (lev == 0);
|
|
const bool need_coord_copy = need_bam_boundary;
|
|
const bool need_boundary_input = need_bam_boundary && (rk_stage != 0);
|
|
const bool need_stage_input = (rk_stage != 0);
|
|
bool ok = true;
|
|
if (need_coord_copy &&
|
|
(cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
|
|
cache.nx != nx || cache.ny != ny || cache.nz != nz))
|
|
{
|
|
ok = copy_to_device(cache.X, X, bytes_x) &&
|
|
copy_to_device(cache.Y, Y, bytes_y) &&
|
|
copy_to_device(cache.Z, Z, bytes_z);
|
|
if (ok)
|
|
{
|
|
cache.host_X = X;
|
|
cache.host_Y = Y;
|
|
cache.host_Z = Z;
|
|
cache.nx = nx;
|
|
cache.ny = ny;
|
|
cache.nz = nz;
|
|
}
|
|
}
|
|
|
|
const bool refresh_state0 =
|
|
(rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz;
|
|
const bool refresh_rhs =
|
|
(rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
|
|
double *stage_ptr = nullptr;
|
|
const double *mapped_state0_ptr = refresh_state0 ? bssn_gpu_find_device_buffer(state0) : cache.state0.ptr;
|
|
const double *mapped_boundary_ptr = need_boundary_input ? bssn_gpu_find_device_buffer(boundary_src) : nullptr;
|
|
const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr;
|
|
const double *mapped_rhs_ptr = refresh_rhs ? bssn_gpu_find_device_buffer(rhs_accum) : cache.rhs.ptr;
|
|
|
|
if (refresh_state0 && !mapped_state0_ptr)
|
|
bssn_gpu_prepare_host_buffer(state0, n);
|
|
if (need_boundary_input && !mapped_boundary_ptr)
|
|
bssn_gpu_prepare_host_buffer(boundary_src, n);
|
|
if (need_stage_input && !mapped_stage_ptr)
|
|
bssn_gpu_prepare_host_buffer(stage_data, n);
|
|
if (refresh_rhs && !mapped_rhs_ptr)
|
|
bssn_gpu_prepare_host_buffer(rhs_accum, n);
|
|
|
|
ok = ok &&
|
|
(!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
|
|
(!need_boundary_input || copy_to_device_preferring_device(cache.boundary, boundary_src, bytes)) &&
|
|
(!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
|
|
|
|
if (ok && need_stage_input)
|
|
{
|
|
if (mapped_stage_ptr)
|
|
{
|
|
stage_ptr = const_cast<double *>(mapped_stage_ptr);
|
|
}
|
|
else
|
|
{
|
|
ok = copy_to_device_preferring_device(cache.stage, stage_data, bytes);
|
|
stage_ptr = cache.stage.ptr;
|
|
}
|
|
}
|
|
else if (ok)
|
|
{
|
|
ok = ensure_capacity(cache.stage, bytes);
|
|
stage_ptr = cache.stage.ptr;
|
|
}
|
|
|
|
if (!ok)
|
|
return 1;
|
|
|
|
if (refresh_state0)
|
|
{
|
|
cache.host_state0 = state0;
|
|
bssn_gpu_register_device_buffer(state0, cache.state0.ptr);
|
|
}
|
|
if (refresh_rhs)
|
|
{
|
|
cache.host_rhs = rhs_accum;
|
|
cache.rhs_resident = true;
|
|
bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr);
|
|
}
|
|
|
|
double dX = X[1] - X[0];
|
|
double dY = Y[1] - Y[0];
|
|
double dZ = Z[1] - Z[0];
|
|
const int no_symm = 0, eq_symm = 1, octant = 2;
|
|
int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
|
|
int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
|
|
int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
|
|
int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
|
|
int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
|
|
int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);
|
|
double soa0 = SoA[0];
|
|
double soa1 = SoA[1];
|
|
double soa2 = SoA[2];
|
|
|
|
if (need_bam_boundary)
|
|
{
|
|
int imin = 1;
|
|
int jmin = 1;
|
|
int kmin = 1;
|
|
if (symmetry > no_symm && std::fabs(Z[0]) < dZ) kmin = 0;
|
|
if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0;
|
|
if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0;
|
|
|
|
double *d_X = cache.X.ptr, *d_Y = cache.Y.ptr, *d_Z = cache.Z.ptr;
|
|
double *d_state0 = cache.state0.ptr, *d_boundary = cache.boundary.ptr;
|
|
double *d_rhs = cache.rhs.ptr;
|
|
double *bam_target = (rk_stage == 0) ? d_rhs : stage_ptr;
|
|
const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
|
|
void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
|
|
&xmin, &ymin, &zmin, &xmax, &ymax, &zmax,
|
|
&has_xmin, &has_ymin, &has_zmin,
|
|
&has_xmax, &has_ymax, &has_zmax,
|
|
&imin, &jmin, &kmin, &propspeed,
|
|
&bam_source, &bam_target,
|
|
&soa0, &soa1, &soa2,
|
|
&symmetry};
|
|
ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
|
|
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
|
|
ok = launch_kernel(grid, block, (const void *)rk4_kernel, args);
|
|
}
|
|
|
|
if (ok && lev > 0)
|
|
{
|
|
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
|
|
void *args[] = {&nx, &ny, &nz,
|
|
&has_xmin, &has_ymin, &has_zmin,
|
|
&has_xmax, &has_ymax, &has_zmax,
|
|
&d_state0, &d_stage};
|
|
ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
bssn_gpu_register_device_buffer(stage_data, stage_ptr);
|
|
if (download_to_host)
|
|
{
|
|
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
|
ok = err == cudaSuccess;
|
|
}
|
|
}
|
|
|
|
return ok ? 0 : 1;
|
|
}
|
|
|
|
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host)
|
|
{
|
|
static thread_local CachedBuffer d_chi;
|
|
|
|
int n = count_points(ex);
|
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
|
dim3 block(256);
|
|
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
|
|
|
double *device_chi = nullptr;
|
|
const double *mapped = bssn_gpu_find_device_buffer(chi);
|
|
bool ok = true;
|
|
if (mapped)
|
|
{
|
|
device_chi = const_cast<double *>(mapped);
|
|
}
|
|
else
|
|
{
|
|
ok = copy_to_device_preferring_device(d_chi, chi, bytes);
|
|
device_chi = d_chi.ptr;
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
double *ptr = device_chi;
|
|
void *args[] = {&n, &ptr, &tinny};
|
|
ok = launch_kernel(grid, block, (const void *)lowerbound_kernel, args);
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
bssn_gpu_register_device_buffer(chi, device_chi);
|
|
if (download_to_host)
|
|
{
|
|
bssn_gpu_prepare_host_buffer(chi, n);
|
|
cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
|
|
ok = err == cudaSuccess;
|
|
}
|
|
}
|
|
return ok ? 0 : 1;
|
|
}
|
|
|
|
int bssn_cuda_download_buffer(int *ex, double *host_ptr)
|
|
{
|
|
const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
|
|
if (!device_ptr)
|
|
return 1;
|
|
|
|
const int n = count_points(ex);
|
|
bssn_gpu_prepare_host_buffer(host_ptr, n);
|
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
|
cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(D2H) buffered download", err);
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
int bssn_cuda_interp_points_batch(const int *ex,
|
|
const double *X, const double *Y, const double *Z,
|
|
const double *const *fields,
|
|
const double *soa_flat,
|
|
int num_var,
|
|
const double *px, const double *py, const double *pz,
|
|
int num_points,
|
|
int ordn,
|
|
int symmetry,
|
|
double *out)
|
|
{
|
|
if (!ex || !X || !Y || !Z || !fields || !soa_flat || !px || !py || !pz || !out)
|
|
return 1;
|
|
if (num_var <= 0 || num_points <= 0 || ordn != 6)
|
|
return 1;
|
|
if (ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
|
|
return 1;
|
|
|
|
struct InterpBatchCache
|
|
{
|
|
struct StencilCacheEntry
|
|
{
|
|
const double *X;
|
|
const double *Y;
|
|
const double *Z;
|
|
const double *px;
|
|
const double *py;
|
|
const double *pz;
|
|
int nx;
|
|
int ny;
|
|
int nz;
|
|
int num_points;
|
|
int ordn;
|
|
int symmetry;
|
|
CachedBuffer weights;
|
|
CachedIntBuffer indices;
|
|
CachedIntBuffer reflect;
|
|
|
|
StencilCacheEntry()
|
|
: X(nullptr), Y(nullptr), Z(nullptr),
|
|
px(nullptr), py(nullptr), pz(nullptr),
|
|
nx(0), ny(0), nz(0), num_points(0), ordn(0), symmetry(0) {}
|
|
};
|
|
|
|
CachedBuffer out;
|
|
CachedBuffer soa;
|
|
CachedBuffer field_ptrs;
|
|
CachedIntBuffer error_flag;
|
|
std::vector<CachedBuffer> host_field_copies;
|
|
std::vector<StencilCacheEntry> stencil_entries;
|
|
};
|
|
static thread_local InterpBatchCache cache;
|
|
|
|
const int nx = ex[0];
|
|
const int ny = ex[1];
|
|
const int nz = ex[2];
|
|
const int field_points = count_points(ex);
|
|
const size_t field_bytes = static_cast<size_t>(field_points) * sizeof(double);
|
|
const size_t out_bytes = static_cast<size_t>(num_points) * static_cast<size_t>(num_var) * sizeof(double);
|
|
const size_t soa_bytes = static_cast<size_t>(3 * num_var) * sizeof(double);
|
|
const size_t ptr_bytes = static_cast<size_t>(num_var) * sizeof(double *);
|
|
const size_t point_stencil_doubles = static_cast<size_t>(num_points) * 18;
|
|
const size_t point_stencil_ints = static_cast<size_t>(num_points) * 18;
|
|
const size_t weights_bytes = point_stencil_doubles * sizeof(double);
|
|
const size_t indices_bytes = point_stencil_ints * sizeof(int);
|
|
|
|
bool ok = true;
|
|
InterpBatchCache::StencilCacheEntry *stencil_cache = nullptr;
|
|
for (size_t i = 0; i < cache.stencil_entries.size(); ++i)
|
|
{
|
|
InterpBatchCache::StencilCacheEntry &entry = cache.stencil_entries[i];
|
|
if (entry.X == X && entry.Y == Y && entry.Z == Z &&
|
|
entry.px == px && entry.py == py && entry.pz == pz &&
|
|
entry.nx == nx && entry.ny == ny && entry.nz == nz &&
|
|
entry.num_points == num_points && entry.ordn == ordn &&
|
|
entry.symmetry == symmetry)
|
|
{
|
|
stencil_cache = &entry;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!stencil_cache)
|
|
{
|
|
cache.stencil_entries.push_back(InterpBatchCache::StencilCacheEntry());
|
|
stencil_cache = &cache.stencil_entries.back();
|
|
stencil_cache->X = X;
|
|
stencil_cache->Y = Y;
|
|
stencil_cache->Z = Z;
|
|
stencil_cache->px = px;
|
|
stencil_cache->py = py;
|
|
stencil_cache->pz = pz;
|
|
stencil_cache->nx = nx;
|
|
stencil_cache->ny = ny;
|
|
stencil_cache->nz = nz;
|
|
stencil_cache->num_points = num_points;
|
|
stencil_cache->ordn = ordn;
|
|
stencil_cache->symmetry = symmetry;
|
|
|
|
std::vector<double> host_weights(point_stencil_doubles);
|
|
std::vector<int> host_indices(point_stencil_ints);
|
|
std::vector<int> host_reflect(point_stencil_ints);
|
|
const double dx = X[1] - X[0];
|
|
const double dy = Y[1] - Y[0];
|
|
const double dz = Z[1] - Z[0];
|
|
const int allow_reflect_x = (symmetry == 2 && std::fabs(X[0]) < dx);
|
|
const int allow_reflect_y = (symmetry == 2 && std::fabs(Y[0]) < dy);
|
|
const int allow_reflect_z = (symmetry != 0 && std::fabs(Z[0]) < dz);
|
|
for (int p = 0; p < num_points; ++p)
|
|
{
|
|
int start_x = 0, start_y = 0, start_z = 0;
|
|
double cx = 0.0, cy = 0.0, cz = 0.0;
|
|
const bool ok_x = compute_interp_window_host(px[p], X, nx, ordn, allow_reflect_x, &start_x, &cx);
|
|
const bool ok_y = compute_interp_window_host(py[p], Y, ny, ordn, allow_reflect_y, &start_y, &cy);
|
|
const bool ok_z = compute_interp_window_host(pz[p], Z, nz, ordn, allow_reflect_z, &start_z, &cz);
|
|
if (!ok_x || !ok_y || !ok_z)
|
|
return 1;
|
|
|
|
lagrange_weights_ord6_host(cx, host_weights.data() + p * 18);
|
|
lagrange_weights_ord6_host(cy, host_weights.data() + p * 18 + 6);
|
|
lagrange_weights_ord6_host(cz, host_weights.data() + p * 18 + 12);
|
|
|
|
for (int i = 0; i < 6; ++i)
|
|
{
|
|
if (!map_interp_index_host(start_x + i, nx, &host_indices[p * 18 + i], &host_reflect[p * 18 + i]))
|
|
return 1;
|
|
if (!map_interp_index_host(start_y + i, ny, &host_indices[p * 18 + 6 + i], &host_reflect[p * 18 + 6 + i]))
|
|
return 1;
|
|
if (!map_interp_index_host(start_z + i, nz, &host_indices[p * 18 + 12 + i], &host_reflect[p * 18 + 12 + i]))
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
ok = ok &&
|
|
copy_to_device(stencil_cache->weights, host_weights.data(), weights_bytes) &&
|
|
copy_to_device(stencil_cache->indices, host_indices.data(), indices_bytes) &&
|
|
copy_to_device(stencil_cache->reflect, host_reflect.data(), indices_bytes);
|
|
if (!ok)
|
|
return 1;
|
|
}
|
|
|
|
ok = ok &&
|
|
copy_to_device(cache.soa, soa_flat, soa_bytes) &&
|
|
ensure_capacity(cache.out, out_bytes) &&
|
|
ensure_capacity(cache.field_ptrs, ptr_bytes) &&
|
|
ensure_capacity(cache.error_flag, sizeof(int));
|
|
if (!ok)
|
|
return 1;
|
|
|
|
if (static_cast<int>(cache.host_field_copies.size()) < num_var)
|
|
cache.host_field_copies.resize(num_var);
|
|
|
|
std::vector<const double *> device_fields(num_var);
|
|
for (int v = 0; v < num_var; ++v)
|
|
{
|
|
const double *device_field = bssn_gpu_find_device_buffer(fields[v]);
|
|
if (!device_field)
|
|
{
|
|
ok = copy_to_device(cache.host_field_copies[v], fields[v], field_bytes);
|
|
device_field = cache.host_field_copies[v].ptr;
|
|
}
|
|
device_fields[v] = device_field;
|
|
if (!ok || !device_fields[v])
|
|
return 1;
|
|
}
|
|
|
|
int zero = 0;
|
|
cudaError_t err = cudaMemcpy(cache.field_ptrs.ptr, device_fields.data(), ptr_bytes, cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D) field_ptrs", err);
|
|
return 1;
|
|
}
|
|
err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D) interp_error_flag", err);
|
|
return 1;
|
|
}
|
|
|
|
dim3 block(128);
|
|
dim3 grid(div_up(num_points * num_var, static_cast<int>(block.x)));
|
|
if (grid.x > 4096)
|
|
grid.x = 4096;
|
|
|
|
int nx_local = nx;
|
|
int ny_local = ny;
|
|
const double *dsoa = cache.soa.ptr;
|
|
const double *const *dfields = reinterpret_cast<const double *const *>(cache.field_ptrs.ptr);
|
|
const double *dweights = stencil_cache->weights.ptr;
|
|
const int *dindices = stencil_cache->indices.ptr;
|
|
const int *dreflect = stencil_cache->reflect.ptr;
|
|
double *dout = cache.out.ptr;
|
|
int *derror = cache.error_flag.ptr;
|
|
|
|
void *args[] = {
|
|
&num_points, &num_var,
|
|
&nx_local, &ny_local,
|
|
&dfields,
|
|
&dsoa,
|
|
&dindices,
|
|
&dreflect,
|
|
&dweights,
|
|
&dout,
|
|
&derror};
|
|
ok = launch_kernel(grid, block, (const void *)interp_points_ord6_kernel, args);
|
|
if (!ok)
|
|
return 1;
|
|
|
|
int error_flag = 0;
|
|
err = cudaMemcpy(&error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(D2H) interp_error_flag", err);
|
|
return 1;
|
|
}
|
|
if (error_flag != 0)
|
|
return 1;
|
|
|
|
err = cudaMemcpy(out, cache.out.ptr, out_bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(D2H) interp_out", err);
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
int bssn_cuda_prolong3_pack(int wei,
|
|
const double *llbc, const double *uubc, const int *extc, const double *func,
|
|
const double *llbf, const double *uubf, const int *extf, double *funf,
|
|
const double *llbp, const double *uubp,
|
|
const double *SoA, int symmetry)
|
|
{
|
|
if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA)
|
|
return 1;
|
|
|
|
// The symmetry-aware prolong CUDA path is still not equivalent to the
|
|
// active Cell/ghost_width=3 Fortran implementation, so keep the safe
|
|
// fallback for all symmetry-enabled cases.
|
|
if (symmetry != 0)
|
|
return 1;
|
|
|
|
auto idint_like = [](double x) -> int {
|
|
return static_cast<int>(std::trunc(x));
|
|
};
|
|
|
|
double base[3];
|
|
double CD[3];
|
|
double FD[3];
|
|
int lbc[3], ubc[3], lbf[3], ubf[3], lbp[3], ubp[3], lbpc[3], ubpc[3];
|
|
for (int d = 0; d < 3; ++d)
|
|
{
|
|
CD[d] = (uubc[d] - llbc[d]) / static_cast<double>(extc[d]);
|
|
FD[d] = (uubf[d] - llbf[d]) / static_cast<double>(extf[d]);
|
|
if (std::fabs(CD[d] - 2.0 * FD[d]) > 1.0e-10)
|
|
return 1;
|
|
|
|
if (llbc[d] <= llbf[d])
|
|
{
|
|
base[d] = llbc[d];
|
|
}
|
|
else
|
|
{
|
|
int j = idint_like((llbc[d] - llbf[d]) / FD[d] + 0.4);
|
|
base[d] = (j / 2 * 2 == j) ? llbf[d] : (llbf[d] - CD[d] / 2.0);
|
|
}
|
|
|
|
lbf[d] = idint_like((llbf[d] - base[d]) / FD[d] + 0.4) + 1;
|
|
ubf[d] = idint_like((uubf[d] - base[d]) / FD[d] + 0.4);
|
|
lbc[d] = idint_like((llbc[d] - base[d]) / CD[d] + 0.4) + 1;
|
|
ubc[d] = idint_like((uubc[d] - base[d]) / CD[d] + 0.4);
|
|
lbp[d] = idint_like((llbp[d] - base[d]) / FD[d] + 0.4) + 1;
|
|
lbpc[d] = idint_like((llbp[d] - base[d]) / CD[d] + 0.4) + 1;
|
|
ubp[d] = idint_like((uubp[d] - base[d]) / FD[d] + 0.4);
|
|
ubpc[d] = idint_like((uubp[d] - base[d]) / CD[d] + 0.4);
|
|
(void)ubc[d];
|
|
(void)ubf[d];
|
|
}
|
|
|
|
const int imino = lbp[0] - lbf[0] + 1;
|
|
const int imaxo = ubp[0] - lbf[0] + 1;
|
|
const int jmino = lbp[1] - lbf[1] + 1;
|
|
const int jmaxo = ubp[1] - lbf[1] + 1;
|
|
const int kmino = lbp[2] - lbf[2] + 1;
|
|
const int kmaxo = ubp[2] - lbf[2] + 1;
|
|
|
|
const int imini = lbpc[0] - lbc[0] + 1;
|
|
const int imaxi = ubpc[0] - lbc[0] + 1;
|
|
const int jmini = lbpc[1] - lbc[1] + 1;
|
|
const int jmaxi = ubpc[1] - lbc[1] + 1;
|
|
const int kmini = lbpc[2] - lbc[2] + 1;
|
|
const int kmaxi = ubpc[2] - lbc[2] + 1;
|
|
|
|
if (imino < 1 || jmino < 1 || kmino < 1 ||
|
|
imini < 1 || jmini < 1 || kmini < 1 ||
|
|
imaxo > extf[0] || jmaxo > extf[1] || kmaxo > extf[2] ||
|
|
imaxi > extc[0] - 2 || jmaxi > extc[1] - 2 || kmaxi > extc[2] - 2)
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int {
|
|
int ii = fine_idx + lbf_d - 1;
|
|
return ii / 2 - lbc_d + 1;
|
|
};
|
|
const int ic_min = coarse_center_index(imino, lbf[0], lbc[0]);
|
|
const int jc_min = coarse_center_index(jmino, lbf[1], lbc[1]);
|
|
const int kc_min = coarse_center_index(kmino, lbf[2], lbc[2]);
|
|
|
|
// Current CUDA prolong path only supports the same fast path as the
|
|
// optimized Fortran code: interior stencil access without symmetry_bd().
|
|
if (ic_min - 2 < 1 || jc_min - 2 < 1 || kc_min - 2 < 1)
|
|
return 1;
|
|
|
|
struct ProlongCache
|
|
{
|
|
CachedBuffer coarse;
|
|
CachedBuffer fine;
|
|
};
|
|
static thread_local ProlongCache cache;
|
|
|
|
int nxc = extc[0], nyc = extc[1], nzc = extc[2];
|
|
int nxf = extf[0], nyf = extf[1], nzf = extf[2];
|
|
int sxc = nxc + 3;
|
|
int syc = nyc + 3;
|
|
int szc = nzc + 3;
|
|
int coarse_points = sxc * syc * szc;
|
|
int fine_points = nxf * nyf * nzf;
|
|
const size_t coarse_bytes = static_cast<size_t>(coarse_points) * sizeof(double);
|
|
const size_t fine_bytes = static_cast<size_t>(fine_points) * sizeof(double);
|
|
|
|
std::vector<double> funcc_host(static_cast<size_t>(coarse_points), 0.0);
|
|
auto coarse_index = [sxc, syc](int fi, int fj, int fk) -> size_t {
|
|
return static_cast<size_t>(fi + 2) +
|
|
static_cast<size_t>(fj + 2) * static_cast<size_t>(sxc) +
|
|
static_cast<size_t>(fk + 2) * static_cast<size_t>(sxc) * static_cast<size_t>(syc);
|
|
};
|
|
auto func_index = [nxc, nyc](int i, int j, int k) -> size_t {
|
|
return static_cast<size_t>(i - 1) +
|
|
static_cast<size_t>(j - 1) * static_cast<size_t>(nxc) +
|
|
static_cast<size_t>(k - 1) * static_cast<size_t>(nxc) * static_cast<size_t>(nyc);
|
|
};
|
|
|
|
for (int k = 1; k <= nzc; ++k)
|
|
{
|
|
for (int j = 1; j <= nyc; ++j)
|
|
{
|
|
for (int i = 1; i <= nxc; ++i)
|
|
{
|
|
funcc_host[coarse_index(i, j, k)] = func[func_index(i, j, k)];
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int offset = 0; offset < 3; ++offset)
|
|
{
|
|
int target_i = -offset;
|
|
int source_i = offset + 2;
|
|
for (int k = 1; k <= nzc; ++k)
|
|
{
|
|
for (int j = 1; j <= nyc; ++j)
|
|
{
|
|
funcc_host[coarse_index(target_i, j, k)] = funcc_host[coarse_index(source_i, j, k)] * SoA[0];
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int offset = 0; offset < 3; ++offset)
|
|
{
|
|
int target_j = -offset;
|
|
int source_j = offset + 2;
|
|
for (int k = 1; k <= nzc; ++k)
|
|
{
|
|
for (int i = -2; i <= nxc; ++i)
|
|
{
|
|
funcc_host[coarse_index(i, target_j, k)] = funcc_host[coarse_index(i, source_j, k)] * SoA[1];
|
|
}
|
|
}
|
|
}
|
|
|
|
for (int offset = 0; offset < 3; ++offset)
|
|
{
|
|
int target_k = -offset;
|
|
int source_k = offset + 2;
|
|
for (int j = -2; j <= nyc; ++j)
|
|
{
|
|
for (int i = -2; i <= nxc; ++i)
|
|
{
|
|
funcc_host[coarse_index(i, j, target_k)] = funcc_host[coarse_index(i, j, source_k)] * SoA[2];
|
|
}
|
|
}
|
|
}
|
|
|
|
double *d_func = nullptr;
|
|
if (!copy_to_device(cache.coarse, funcc_host.data(), coarse_bytes))
|
|
return 1;
|
|
d_func = cache.coarse.ptr;
|
|
|
|
if (!ensure_capacity(cache.fine, fine_bytes))
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
dim3 block(256);
|
|
dim3 grid(div_up(fine_points, static_cast<int>(block.x)));
|
|
double *d_funf = cache.fine.ptr;
|
|
int ibegin = imino - 1, iend = imaxo - 1;
|
|
int jbegin = jmino - 1, jend = jmaxo - 1;
|
|
int kbegin = kmino - 1, kend = kmaxo - 1;
|
|
void *args[] = {&d_func, &d_funf,
|
|
&sxc, &syc,
|
|
&nxf, &nyf, &nzf,
|
|
&lbc[0], &lbc[1], &lbc[2],
|
|
&lbf[0], &lbf[1], &lbf[2],
|
|
&ibegin, &iend,
|
|
&jbegin, &jend,
|
|
&kbegin, &kend};
|
|
|
|
if (!launch_kernel(grid, block, (const void *)prolong3_cell_kernel, args))
|
|
return 1;
|
|
|
|
cudaError_t sync_err = cudaDeviceSynchronize();
|
|
if (sync_err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaDeviceSynchronize prolong3", sync_err);
|
|
return 1;
|
|
}
|
|
|
|
cudaError_t err = cudaMemcpy(funf, cache.fine.ptr, fine_bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(D2H) prolong3", err);
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|