Add resident BSSN GPU point interpolation

This commit is contained in:
2026-04-30 11:39:15 +08:00
parent 18e9c9cc50
commit 8486532920
3 changed files with 268 additions and 28 deletions

View File

@@ -5618,6 +5618,153 @@ __global__ void kern_prepare_inter_time_level(const double * __restrict__ src1,
}
}
__device__ double interp_lagrange_weight(int idx, double x, int ordn)
{
double w = 1.0;
const double xi = (double)idx;
for (int j = 0; j < ordn; ++j) {
if (j == idx) continue;
w *= (x - (double)j) / (xi - (double)j);
}
return w;
}
__device__ void interp_axis_window(double p,
double x0,
double dx,
int n,
int ordn,
int symmetry,
int axis,
int &base,
double &local_x)
{
int cx_i = (int)((p - x0) / dx + 0.4) + 1;
int cx_b = cx_i - ordn / 2 + 1;
int cx_t = cx_b + ordn - 1;
int cmin = 1;
if (symmetry == 2 && axis < 2 && fabs(x0) < dx)
cmin = -ordn / 2 + 1;
if (symmetry != 0 && axis == 2 && fabs(x0) < dx)
cmin = -ordn / 2 + 1;
if (cx_b < cmin) {
cx_b = cmin;
cx_t = cx_b + ordn - 1;
}
if (cx_t > n) {
cx_t = n;
cx_b = cx_t + 1 - ordn;
}
base = cx_b;
if (cx_b > 0) {
const double xb = x0 + (double)(cx_b - 1) * dx;
local_x = (p - xb) / dx;
} else {
const int reflected = 1 - cx_b;
const double xb = x0 + (double)(reflected - 1) * dx;
local_x = (p + xb) / dx;
}
}
__device__ double load_interp_value(const double * __restrict__ mem,
int nx,
int ny,
int nz,
int all,
int state,
int fi,
int fj,
int fk,
const double * __restrict__ soa)
{
double sign = 1.0;
int ii = fi;
int jj = fj;
int kk = fk;
if (ii <= 0) {
ii = 1 - ii;
sign *= soa[0];
}
if (jj <= 0) {
jj = 1 - jj;
sign *= soa[1];
}
if (kk <= 0) {
kk = 1 - kk;
sign *= soa[2];
}
if (ii < 1 || ii > nx || jj < 1 || jj > ny || kk < 1 || kk > nz)
return 0.0;
const int idx = (ii - 1) + (jj - 1) * nx + (kk - 1) * nx * ny;
return sign * mem[(size_t)state * (size_t)all + (size_t)idx];
}
__global__ void kern_interp_state_point3(const double * __restrict__ mem,
double * __restrict__ out,
int nx,
int ny,
int nz,
int all,
int state0,
int state1,
int state2,
double x0,
double y0,
double z0,
double dx,
double dy,
double dz,
double px,
double py,
double pz,
int ordn,
int symmetry,
double soa00, double soa01, double soa02,
double soa10, double soa11, double soa12,
double soa20, double soa21, double soa22)
{
const int f = threadIdx.x;
if (f >= 3 || ordn <= 0 || ordn > 8)
return;
const int states[3] = {state0, state1, state2};
const double soa_all[9] = {
soa00, soa01, soa02,
soa10, soa11, soa12,
soa20, soa21, soa22
};
const double *soa = soa_all + 3 * f;
int ib, jb, kb;
double tx, ty, tz;
interp_axis_window(px, x0, dx, nx, ordn, symmetry, 0, ib, tx);
interp_axis_window(py, y0, dy, ny, ordn, symmetry, 1, jb, ty);
interp_axis_window(pz, z0, dz, nz, ordn, symmetry, 2, kb, tz);
double wx[8], wy[8], wz[8];
for (int i = 0; i < ordn; ++i) {
wx[i] = interp_lagrange_weight(i, tx, ordn);
wy[i] = interp_lagrange_weight(i, ty, ordn);
wz[i] = interp_lagrange_weight(i, tz, ordn);
}
double value = 0.0;
for (int k = 0; k < ordn; ++k) {
for (int j = 0; j < ordn; ++j) {
for (int i = 0; i < ordn; ++i) {
const double coeff = wx[i] * wy[j] * wz[k];
value += coeff * load_interp_value(mem, nx, ny, nz, all,
states[f],
ib + i, jb + j, kb + k,
soa);
}
}
}
out[f] = value;
}
__global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
double * __restrict__ dst,
int nx, int ny,
@@ -6876,6 +7023,59 @@ int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
return 0;
}
extern "C"
int bssn_cuda_interp_state_point3(void *block_tag,
int *ex,
int state0,
int state1,
int state2,
double x0,
double y0,
double z0,
double dx,
double dy,
double dz,
double px,
double py,
double pz,
int ordn,
int symmetry,
const double *soa3,
double *out3)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
if (!block_tag || !ex || !out3 || !soa3)
return 1;
if (state0 < 0 || state0 >= BSSN_STATE_COUNT ||
state1 < 0 || state1 >= BSSN_STATE_COUNT ||
state2 < 0 || state2 >= BSSN_STATE_COUNT)
return 1;
if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
ordn <= 0 || ordn > 8 ||
ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
return 1;
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, nullptr, all, false);
if (bank < 0 || !ctx.resident_valid[bank])
return 1;
double *d_out = ensure_step_comm_buffer(ctx, 3);
kern_interp_state_point3<<<1, 3>>>(
ctx.d_resident_mem[bank], d_out,
ex[0], ex[1], ex[2], (int)all,
state0, state1, state2,
x0, y0, z0, dx, dy, dz,
px, py, pz, ordn, symmetry,
soa3[0], soa3[1], soa3[2],
soa3[3], soa3[4], soa3[5],
soa3[6], soa3[7], soa3[8]);
CUDA_CHECK(cudaMemcpy(out3, d_out, 3 * sizeof(double), cudaMemcpyDeviceToHost));
return 0;
}
extern "C"
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
int state_index,