Add optional CUDA surface interpolation
This commit is contained in:
@@ -5765,6 +5765,65 @@ __global__ void kern_interp_state_point3(const double * __restrict__ mem,
|
||||
out[f] = value;
|
||||
}
|
||||
|
||||
/*
 * Interpolate two fields at a list of points, one thread per point.
 *
 * Launch layout: 1-D grid, 1-D blocks; thread (blockIdx.x, threadIdx.x)
 * handles point p = blockIdx.x * blockDim.x + threadIdx.x. No shared memory.
 *
 * For its point, each thread:
 *   1. asks interp_axis_window() for the base stencil index (ib/jb/kb) and
 *      the fractional coordinate (tx/ty/tz) along each axis,
 *   2. fills ordn one-dimensional weights per axis via
 *      interp_lagrange_weight(),
 *   3. accumulates the ordn^3 tensor-product sum for both fields using
 *      load_interp_value(),
 *   4. stores the results interleaved: out[2p] = field0, out[2p+1] = field1.
 *
 * Parameters:
 *   field0, field1  input fields, read through load_interp_value()
 *   px, py, pz      coordinates of the points (npoints entries each)
 *   out             output buffer, 2 * npoints doubles, interleaved
 *   nx, ny, nz      grid extents forwarded to the helpers
 *   all             forwarded to load_interp_value(); the host wrapper
 *                   passes nx * ny * nz
 *   x0..z0, dx..dz  grid origin and spacing forwarded to interp_axis_window()
 *   npoints         number of points
 *   ordn            stencil width per axis; must be in [1, 8] because the
 *                   weight arrays are fixed at 8 entries
 *   symmetry        forwarded unchanged to interp_axis_window()
 *   soa00..soa02    three per-field values handed to load_interp_value() for
 *                   field0 (presumably per-axis symmetry factors -- confirm
 *                   against load_interp_value)
 *   soa10..soa12    same, for field1
 *
 * Threads whose point index is out of range, or that see an unsupported
 * ordn, return without writing anything.
 */
__global__ void kern_interp_host_two_fields(const double * __restrict__ field0,
                                            const double * __restrict__ field1,
                                            const double * __restrict__ px,
                                            const double * __restrict__ py,
                                            const double * __restrict__ pz,
                                            double * __restrict__ out,
                                            int nx,
                                            int ny,
                                            int nz,
                                            int all,
                                            double x0,
                                            double y0,
                                            double z0,
                                            double dx,
                                            double dy,
                                            double dz,
                                            int npoints,
                                            int ordn,
                                            int symmetry,
                                            double soa00, double soa01, double soa02,
                                            double soa10, double soa11, double soa12)
{
    // Build the flat point index in 64 bits. With an int index the tail
    // threads of a launch covering close to INT_MAX points wrap negative and
    // slip past the bounds test, and 2 * p overflows once npoints > 2^30.
    const size_t p = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (npoints <= 0 || p >= (size_t)npoints || ordn <= 0 || ordn > 8)
        return;

    // Base stencil index and fractional offset along each axis.
    int ib, jb, kb;
    double tx, ty, tz;
    interp_axis_window(px[p], x0, dx, nx, ordn, symmetry, 0, ib, tx);
    interp_axis_window(py[p], y0, dy, ny, ordn, symmetry, 1, jb, ty);
    interp_axis_window(pz[p], z0, dz, nz, ordn, symmetry, 2, kb, tz);

    // One-dimensional weights; sized for the maximum supported ordn (8).
    double wx[8], wy[8], wz[8];
    for (int m = 0; m < ordn; ++m) {
        wx[m] = interp_lagrange_weight(m, tx, ordn);
        wy[m] = interp_lagrange_weight(m, ty, ordn);
        wz[m] = interp_lagrange_weight(m, tz, ordn);
    }

    const double soa0[3] = {soa00, soa01, soa02};
    const double soa1[3] = {soa10, soa11, soa12};

    // Tensor-product accumulation. The loop nest and the association of the
    // weight product (wx * (wy * wz)) are kept as-is so that rounding, and
    // therefore the numerical result, is unchanged.
    double v0 = 0.0;
    double v1 = 0.0;
    for (int k = 0; k < ordn; ++k) {
        for (int j = 0; j < ordn; ++j) {
            const double wyz = wy[j] * wz[k];
            for (int i = 0; i < ordn; ++i) {
                const double coeff = wx[i] * wyz;
                // The literal 0 is passed through as in the original call;
                // its meaning is defined by load_interp_value (presumably a
                // variable/slot index -- confirm).
                v0 += coeff * load_interp_value(field0, nx, ny, nz, all, 0,
                                                ib + i, jb + j, kb + k, soa0);
                v1 += coeff * load_interp_value(field1, nx, ny, nz, all, 0,
                                                ib + i, jb + j, kb + k, soa1);
            }
        }
    }

    // Interleaved output: [f0(p0), f1(p0), f0(p1), f1(p1), ...].
    out[2 * p] = v0;
    out[2 * p + 1] = v1;
}
|
||||
|
||||
__global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
|
||||
double * __restrict__ dst,
|
||||
int nx, int ny,
|
||||
@@ -7076,6 +7135,82 @@ int bssn_cuda_interp_state_point3(void *block_tag,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Host entry point: interpolate two host-resident fields at a set of points
 * on the GPU and return the results interleaved.
 *
 * The call is self-contained: it allocates temporary device buffers, uploads
 * both fields and the point coordinates, launches
 * kern_interp_host_two_fields, downloads the result and frees the buffers.
 *
 * Parameters:
 *   block_tag        unused
 *   ex               grid extents {nx, ny, nz}; each must be >= ordn
 *   field0, field1   host fields, ex[0]*ex[1]*ex[2] doubles each
 *   x0, y0, z0       grid origin, forwarded to the kernel
 *   dx, dy, dz       grid spacing, forwarded to the kernel
 *   px, py, pz       host point coordinates, npoints doubles each
 *   npoints          number of points (> 0)
 *   ordn             stencil width per axis, 1..8
 *   symmetry         forwarded to the kernel
 *   soa6             six doubles: [0..2] are passed for field0,
 *                    [3..5] for field1
 *   out_interleaved  host output, 2*npoints doubles:
 *                    out[2p] = field0 at point p, out[2p+1] = field1
 *
 * Returns 0 on success and 1 when an argument is null/out of range or the
 * grid has more cells than an int can index. CUDA failures are handled by
 * CUDA_CHECK.
 *
 * NOTE(review): CUDA_CHECK is defined elsewhere. If it returns from the
 * function instead of aborting, device buffers allocated before a failing
 * call are not released -- confirm and add cleanup if so.
 */
extern "C"
int bssn_cuda_interp_host_two_fields(void *block_tag,
                                     int *ex,
                                     double *field0,
                                     double *field1,
                                     double x0,
                                     double y0,
                                     double z0,
                                     double dx,
                                     double dy,
                                     double dz,
                                     const double *px,
                                     const double *py,
                                     const double *pz,
                                     int npoints,
                                     int ordn,
                                     int symmetry,
                                     const double *soa6,
                                     double *out_interleaved)
{
    (void)block_tag;
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    if (!ex || !field0 || !field1 || !px || !py || !pz || !soa6 ||
        !out_interleaved || npoints <= 0)
        return 1;
    if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
        ordn <= 0 || ordn > 8 ||
        ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
        return 1;

    // The kernel receives the cell count as an int, so form the product in
    // 64 bits and reject grids that do not fit rather than overflowing
    // (signed overflow is undefined and would also corrupt field_bytes).
    // Each partial product of positive 32-bit ints fits in a long long.
    const long long int_max = 2147483647LL;
    const long long plane_cells = (long long)ex[0] * (long long)ex[1];
    if (plane_cells > int_max)
        return 1;
    const long long total_cells = plane_cells * (long long)ex[2];
    if (total_cells > int_max)
        return 1;
    const int all = (int)total_cells;

    const size_t field_bytes = (size_t)all * sizeof(double);
    const size_t point_bytes = (size_t)npoints * sizeof(double);
    const size_t out_bytes = (size_t)2 * npoints * sizeof(double);

    double *d_field0 = nullptr;
    double *d_field1 = nullptr;
    double *d_px = nullptr;
    double *d_py = nullptr;
    double *d_pz = nullptr;
    double *d_out = nullptr;
    CUDA_CHECK(cudaMalloc(&d_field0, field_bytes));
    CUDA_CHECK(cudaMalloc(&d_field1, field_bytes));
    CUDA_CHECK(cudaMalloc(&d_px, point_bytes));
    CUDA_CHECK(cudaMalloc(&d_py, point_bytes));
    CUDA_CHECK(cudaMalloc(&d_pz, point_bytes));
    CUDA_CHECK(cudaMalloc(&d_out, out_bytes));

    CUDA_CHECK(cudaMemcpy(d_field0, field0, field_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_field1, field1, field_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_px, px, point_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_py, py, point_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_pz, pz, point_bytes, cudaMemcpyHostToDevice));

    // One thread per point. Ceil-div written without "npoints + threads - 1"
    // so it cannot overflow int when npoints is close to INT_MAX.
    const int threads = 256;
    const int blocks = npoints / threads + ((npoints % threads) != 0 ? 1 : 0);
    kern_interp_host_two_fields<<<blocks, threads>>>(
        d_field0, d_field1, d_px, d_py, d_pz, d_out,
        ex[0], ex[1], ex[2], all,
        x0, y0, z0, dx, dy, dz,
        npoints, ordn, symmetry,
        soa6[0], soa6[1], soa6[2],
        soa6[3], soa6[4], soa6[5]);
    // Launch-configuration errors show up here; faults raised while the
    // kernel runs are reported by the blocking copy below.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(out_interleaved, d_out, out_bytes, cudaMemcpyDeviceToHost));

    // Release in reverse allocation order. Return codes are left unchecked,
    // as in the original (best-effort cleanup).
    cudaFree(d_out);
    cudaFree(d_pz);
    cudaFree(d_py);
    cudaFree(d_px);
    cudaFree(d_field1);
    cudaFree(d_field0);
    return 0;
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
|
||||
int state_index,
|
||||
|
||||
Reference in New Issue
Block a user