Add resident BSSN GPU point interpolation
This commit is contained in:
@@ -5618,6 +5618,153 @@ __global__ void kern_prepare_inter_time_level(const double * __restrict__ src1,
|
||||
}
|
||||
}
|
||||
|
||||
__device__ double interp_lagrange_weight(int idx, double x, int ordn)
|
||||
{
|
||||
double w = 1.0;
|
||||
const double xi = (double)idx;
|
||||
for (int j = 0; j < ordn; ++j) {
|
||||
if (j == idx) continue;
|
||||
w *= (x - (double)j) / (xi - (double)j);
|
||||
}
|
||||
return w;
|
||||
}
|
||||
|
||||
__device__ void interp_axis_window(double p,
|
||||
double x0,
|
||||
double dx,
|
||||
int n,
|
||||
int ordn,
|
||||
int symmetry,
|
||||
int axis,
|
||||
int &base,
|
||||
double &local_x)
|
||||
{
|
||||
int cx_i = (int)((p - x0) / dx + 0.4) + 1;
|
||||
int cx_b = cx_i - ordn / 2 + 1;
|
||||
int cx_t = cx_b + ordn - 1;
|
||||
int cmin = 1;
|
||||
if (symmetry == 2 && axis < 2 && fabs(x0) < dx)
|
||||
cmin = -ordn / 2 + 1;
|
||||
if (symmetry != 0 && axis == 2 && fabs(x0) < dx)
|
||||
cmin = -ordn / 2 + 1;
|
||||
|
||||
if (cx_b < cmin) {
|
||||
cx_b = cmin;
|
||||
cx_t = cx_b + ordn - 1;
|
||||
}
|
||||
if (cx_t > n) {
|
||||
cx_t = n;
|
||||
cx_b = cx_t + 1 - ordn;
|
||||
}
|
||||
|
||||
base = cx_b;
|
||||
if (cx_b > 0) {
|
||||
const double xb = x0 + (double)(cx_b - 1) * dx;
|
||||
local_x = (p - xb) / dx;
|
||||
} else {
|
||||
const int reflected = 1 - cx_b;
|
||||
const double xb = x0 + (double)(reflected - 1) * dx;
|
||||
local_x = (p + xb) / dx;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ double load_interp_value(const double * __restrict__ mem,
|
||||
int nx,
|
||||
int ny,
|
||||
int nz,
|
||||
int all,
|
||||
int state,
|
||||
int fi,
|
||||
int fj,
|
||||
int fk,
|
||||
const double * __restrict__ soa)
|
||||
{
|
||||
double sign = 1.0;
|
||||
int ii = fi;
|
||||
int jj = fj;
|
||||
int kk = fk;
|
||||
if (ii <= 0) {
|
||||
ii = 1 - ii;
|
||||
sign *= soa[0];
|
||||
}
|
||||
if (jj <= 0) {
|
||||
jj = 1 - jj;
|
||||
sign *= soa[1];
|
||||
}
|
||||
if (kk <= 0) {
|
||||
kk = 1 - kk;
|
||||
sign *= soa[2];
|
||||
}
|
||||
if (ii < 1 || ii > nx || jj < 1 || jj > ny || kk < 1 || kk > nz)
|
||||
return 0.0;
|
||||
const int idx = (ii - 1) + (jj - 1) * nx + (kk - 1) * nx * ny;
|
||||
return sign * mem[(size_t)state * (size_t)all + (size_t)idx];
|
||||
}
|
||||
|
||||
__global__ void kern_interp_state_point3(const double * __restrict__ mem,
|
||||
double * __restrict__ out,
|
||||
int nx,
|
||||
int ny,
|
||||
int nz,
|
||||
int all,
|
||||
int state0,
|
||||
int state1,
|
||||
int state2,
|
||||
double x0,
|
||||
double y0,
|
||||
double z0,
|
||||
double dx,
|
||||
double dy,
|
||||
double dz,
|
||||
double px,
|
||||
double py,
|
||||
double pz,
|
||||
int ordn,
|
||||
int symmetry,
|
||||
double soa00, double soa01, double soa02,
|
||||
double soa10, double soa11, double soa12,
|
||||
double soa20, double soa21, double soa22)
|
||||
{
|
||||
const int f = threadIdx.x;
|
||||
if (f >= 3 || ordn <= 0 || ordn > 8)
|
||||
return;
|
||||
|
||||
const int states[3] = {state0, state1, state2};
|
||||
const double soa_all[9] = {
|
||||
soa00, soa01, soa02,
|
||||
soa10, soa11, soa12,
|
||||
soa20, soa21, soa22
|
||||
};
|
||||
const double *soa = soa_all + 3 * f;
|
||||
|
||||
int ib, jb, kb;
|
||||
double tx, ty, tz;
|
||||
interp_axis_window(px, x0, dx, nx, ordn, symmetry, 0, ib, tx);
|
||||
interp_axis_window(py, y0, dy, ny, ordn, symmetry, 1, jb, ty);
|
||||
interp_axis_window(pz, z0, dz, nz, ordn, symmetry, 2, kb, tz);
|
||||
|
||||
double wx[8], wy[8], wz[8];
|
||||
for (int i = 0; i < ordn; ++i) {
|
||||
wx[i] = interp_lagrange_weight(i, tx, ordn);
|
||||
wy[i] = interp_lagrange_weight(i, ty, ordn);
|
||||
wz[i] = interp_lagrange_weight(i, tz, ordn);
|
||||
}
|
||||
|
||||
double value = 0.0;
|
||||
for (int k = 0; k < ordn; ++k) {
|
||||
for (int j = 0; j < ordn; ++j) {
|
||||
for (int i = 0; i < ordn; ++i) {
|
||||
const double coeff = wx[i] * wy[j] * wz[k];
|
||||
value += coeff * load_interp_value(mem, nx, ny, nz, all,
|
||||
states[f],
|
||||
ib + i, jb + j, kb + k,
|
||||
soa);
|
||||
}
|
||||
}
|
||||
}
|
||||
out[f] = value;
|
||||
}
|
||||
|
||||
__global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
|
||||
double * __restrict__ dst,
|
||||
int nx, int ny,
|
||||
@@ -6876,6 +7023,59 @@ int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_interp_state_point3(void *block_tag,
|
||||
int *ex,
|
||||
int state0,
|
||||
int state1,
|
||||
int state2,
|
||||
double x0,
|
||||
double y0,
|
||||
double z0,
|
||||
double dx,
|
||||
double dy,
|
||||
double dz,
|
||||
double px,
|
||||
double py,
|
||||
double pz,
|
||||
int ordn,
|
||||
int symmetry,
|
||||
const double *soa3,
|
||||
double *out3)
|
||||
{
|
||||
init_gpu_dispatch();
|
||||
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||
if (!block_tag || !ex || !out3 || !soa3)
|
||||
return 1;
|
||||
if (state0 < 0 || state0 >= BSSN_STATE_COUNT ||
|
||||
state1 < 0 || state1 >= BSSN_STATE_COUNT ||
|
||||
state2 < 0 || state2 >= BSSN_STATE_COUNT)
|
||||
return 1;
|
||||
if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
|
||||
ordn <= 0 || ordn > 8 ||
|
||||
ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
|
||||
return 1;
|
||||
|
||||
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
|
||||
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||
const int bank = active_or_keyed_bank(ctx, nullptr, all, false);
|
||||
if (bank < 0 || !ctx.resident_valid[bank])
|
||||
return 1;
|
||||
|
||||
double *d_out = ensure_step_comm_buffer(ctx, 3);
|
||||
kern_interp_state_point3<<<1, 3>>>(
|
||||
ctx.d_resident_mem[bank], d_out,
|
||||
ex[0], ex[1], ex[2], (int)all,
|
||||
state0, state1, state2,
|
||||
x0, y0, z0, dx, dy, dz,
|
||||
px, py, pz, ordn, symmetry,
|
||||
soa3[0], soa3[1], soa3[2],
|
||||
soa3[3], soa3[4], soa3[5],
|
||||
soa3[6], soa3[7], soa3[8]);
|
||||
CUDA_CHECK(cudaMemcpy(out3, d_out, 3 * sizeof(double), cudaMemcpyDeviceToHost));
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
|
||||
int state_index,
|
||||
|
||||
Reference in New Issue
Block a user