Add resident BSSN GPU point interpolation

2026-04-30 11:39:15 +08:00
parent 18e9c9cc50
commit 8486532920
3 changed files with 268 additions and 28 deletions
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -5618,6 +5618,153 @@ __global__ void kern_prepare_inter_time_level(const double * __restrict__ src1,
    }
 }

+// Evaluates, at position x, the Lagrange basis polynomial attached to node
+// `idx` on the uniform integer node set {0, 1, ..., ordn - 1}.
+//
+//   idx   node whose basis polynomial is evaluated
+//   x     evaluation position, expressed in node units
+//   ordn  number of nodes in the stencil
+//
+// Returns the product over all nodes j != idx of (x - j) / (idx - j); the
+// product is empty (1.0) when there is no other node.
+__device__ double interp_lagrange_weight(int idx, double x, int ordn)
+{
+    const double node = static_cast<double>(idx);
+    double weight = 1.0;
+    int other = 0;
+    while (other < ordn) {
+        // The node's own factor is skipped; it would divide by zero.
+        if (other != idx) {
+            const double xo = static_cast<double>(other);
+            weight *= (x - xo) / (node - xo);
+        }
+        ++other;
+    }
+    return weight;
+}
+
+// Chooses the 1-based start index of an ordn-point interpolation window along
+// one axis and the target's position relative to that window.
+//
+//   p         target coordinate on this axis
+//   x0, dx    coordinate of grid index 1 and the grid spacing
+//   n         number of grid points on this axis (largest valid index)
+//   ordn      number of points in the window
+//   symmetry  symmetry flag; together with `axis` it decides whether the
+//             window may start at an index <= 0
+//   axis      axis number (0, 1 or 2 as passed by the kernel in this file)
+//   base      [out] first index of the window; may be <= 0 when a lowered
+//             lower limit applies
+//   local_x   [out] offset of p from the window's first point, in units of dx
+__device__ void interp_axis_window(double p,
+                                   double x0,
+                                   double dx,
+                                   int n,
+                                   int ordn,
+                                   int symmetry,
+                                   int axis,
+                                   int &base,
+                                   double &local_x)
+{
+    // Grid index associated with p; the cast truncates toward zero after the
+    // 0.4 bias is added.
+    const int nearest = (int)((p - x0) / dx + 0.4) + 1;
+
+    // The lower limit drops below 1 only when the first grid point lies
+    // within one spacing of the origin and the symmetry flag covers this axis.
+    const bool near_origin = fabs(x0) < dx;
+    const bool mirrored = (symmetry == 2 && axis < 2) ||
+                          (symmetry != 0 && axis == 2);
+    const int lowest = (mirrored && near_origin) ? (-ordn / 2 + 1) : 1;
+
+    // Place the window around `nearest`, then clamp: lower limit first,
+    // upper limit second (the upper clamp wins if both apply).
+    int first = nearest - ordn / 2 + 1;
+    if (first < lowest)
+        first = lowest;
+    if (first + ordn - 1 > n)
+        first = n + 1 - ordn;
+
+    base = first;
+
+    // For a window starting at index <= 0 the reference point is taken at
+    // index (1 - first) and p is measured with the sign of that coordinate
+    // flipped.
+    const bool below_grid = (first <= 0);
+    const int anchor = below_grid ? (1 - first) : first;
+    const double xa = x0 + (double)(anchor - 1) * dx;
+    local_x = below_grid ? (p + xa) / dx : (p - xa) / dx;
+}
+
+// Reads one sample of field `state` at 1-based grid index (fi, fj, fk).
+//
+// An index <= 0 on an axis is mapped to (1 - index) and the result is
+// multiplied by that axis' factor from `soa` (soa[0], soa[1], soa[2] for the
+// first, second and third index). Indices that still fall outside
+// [1, nx] x [1, ny] x [1, nz] after this mapping yield 0.0.
+//
+//   mem         base of the field storage; field `state` starts at offset
+//               state * all
+//   nx, ny, nz  grid extents; the first index varies fastest
+//   all         number of values per field
+//   soa         three per-axis factors applied to mapped samples
+__device__ double load_interp_value(const double * __restrict__ mem,
+                                    int nx,
+                                    int ny,
+                                    int nz,
+                                    int all,
+                                    int state,
+                                    int fi,
+                                    int fj,
+                                    int fk,
+                                    const double * __restrict__ soa)
+{
+    const bool flip_i = (fi <= 0);
+    const bool flip_j = (fj <= 0);
+    const bool flip_k = (fk <= 0);
+
+    const int ci = flip_i ? (1 - fi) : fi;
+    const int cj = flip_j ? (1 - fj) : fj;
+    const int ck = flip_k ? (1 - fk) : fk;
+
+    // Accumulate the factors axis by axis, reading only the ones needed.
+    double parity = 1.0;
+    if (flip_i)
+        parity *= soa[0];
+    if (flip_j)
+        parity *= soa[1];
+    if (flip_k)
+        parity *= soa[2];
+
+    const bool inside = (ci >= 1 && ci <= nx) &&
+                        (cj >= 1 && cj <= ny) &&
+                        (ck >= 1 && ck <= nz);
+    if (!inside)
+        return 0.0;
+
+    // Linear offset within one field, first index fastest.
+    const int cell = ((ck - 1) * ny + (cj - 1)) * nx + (ci - 1);
+    const size_t field_offset = (size_t)state * (size_t)all;
+    return parity * mem[field_offset + (size_t)cell];
+}
+
+// Interpolates three fields at the single point (px, py, pz) with a
+// tensor-product Lagrange stencil of ordn points per axis.
+//
+// Launch layout: only threads with threadIdx.x in {0, 1, 2} do work. Thread f
+// reads field state{f}, uses the factors soa{f}0, soa{f}1, soa{f}2 for samples
+// whose index was mapped from <= 0, and stores its result in out[f]. The host
+// wrapper in this file launches the kernel as <<<1, 3>>>.
+//
+//   mem         field storage; field s starts at offset s * all
+//   out         receives three doubles, one per field
+//   nx, ny, nz  grid extents
+//   x0, y0, z0  coordinate of grid index 1 on each axis
+//   dx, dy, dz  grid spacing on each axis
+//   ordn        stencil points per axis; the kernel returns without writing
+//               anything unless 1 <= ordn <= 8 (the weight arrays hold 8)
+//   symmetry    symmetry flag forwarded to interp_axis_window
+__global__ void kern_interp_state_point3(const double * __restrict__ mem,
+                                         double * __restrict__ out,
+                                         int nx,
+                                         int ny,
+                                         int nz,
+                                         int all,
+                                         int state0,
+                                         int state1,
+                                         int state2,
+                                         double x0,
+                                         double y0,
+                                         double z0,
+                                         double dx,
+                                         double dy,
+                                         double dz,
+                                         double px,
+                                         double py,
+                                         double pz,
+                                         int ordn,
+                                         int symmetry,
+                                         double soa00, double soa01, double soa02,
+                                         double soa10, double soa11, double soa12,
+                                         double soa20, double soa21, double soa22)
+{
+    const int lane = threadIdx.x;
+    const bool has_work = (lane < 3) && (ordn >= 1) && (ordn <= 8);
+    if (!has_work)
+        return;
+
+    // Select this thread's field and its three per-axis factors.
+    const int state = (lane == 0) ? state0 : ((lane == 1) ? state1 : state2);
+    const double parity[3] = {
+        (lane == 0) ? soa00 : ((lane == 1) ? soa10 : soa20),
+        (lane == 0) ? soa01 : ((lane == 1) ? soa11 : soa21),
+        (lane == 0) ? soa02 : ((lane == 1) ? soa12 : soa22)
+    };
+
+    // Window start index and local coordinate per axis.
+    int base_i, base_j, base_k;
+    double sx, sy, sz;
+    interp_axis_window(px, x0, dx, nx, ordn, symmetry, 0, base_i, sx);
+    interp_axis_window(py, y0, dy, ny, ordn, symmetry, 1, base_j, sy);
+    interp_axis_window(pz, z0, dz, nz, ordn, symmetry, 2, base_k, sz);
+
+    // One-dimensional Lagrange weights for each axis.
+    double wgt_x[8], wgt_y[8], wgt_z[8];
+    for (int m = 0; m < ordn; ++m)
+        wgt_x[m] = interp_lagrange_weight(m, sx, ordn);
+    for (int m = 0; m < ordn; ++m)
+        wgt_y[m] = interp_lagrange_weight(m, sy, ordn);
+    for (int m = 0; m < ordn; ++m)
+        wgt_z[m] = interp_lagrange_weight(m, sz, ordn);
+
+    // Tensor-product sum; the first index runs innermost.
+    double acc = 0.0;
+    for (int kz = 0; kz < ordn; ++kz) {
+        const int gk = base_k + kz;
+        for (int jy = 0; jy < ordn; ++jy) {
+            const int gj = base_j + jy;
+            for (int ix = 0; ix < ordn; ++ix) {
+                const double coeff = wgt_x[ix] * wgt_y[jy] * wgt_z[kz];
+                acc += coeff * load_interp_value(mem, nx, ny, nz, all,
+                                                 state,
+                                                 base_i + ix, gj, gk,
+                                                 parity);
+            }
+        }
+    }
+    out[lane] = acc;
+}
+
 __global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
                                             double * __restrict__ dst,
                                             int nx, int ny,
@@ -6876,6 +7023,59 @@ int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
    return 0;
 }

+// Host entry point: interpolates three resident state fields of one block at
+// the point (px, py, pz) on the GPU and copies the three results to out3.
+//
+// Parameters:
+//   block_tag   key handed to ensure_step_ctx to obtain the block's context
+//   ex          grid extents {nx, ny, nz}; each must be > 0 and >= ordn, and
+//               their product must fit in an int (the kernel takes it as int)
+//   state0..2   state indices, each in [0, BSSN_STATE_COUNT)
+//   x0, y0, z0  coordinate of grid index 1 on each axis
+//   dx, dy, dz  grid spacing on each axis; must be non-zero
+//   px, py, pz  target point
+//   ordn        stencil points per axis, 1..8
+//   symmetry    symmetry flag forwarded to the kernel
+//   soa3        nine factors, three per field (field f uses soa3[3f..3f+2])
+//   out3        host array receiving three doubles
+//
+// Returns 0 on success and 1 when an argument is rejected, no valid resident
+// bank is found, or no device scratch buffer is available. CUDA failures are
+// reported through CUDA_CHECK.
+extern "C"
+int bssn_cuda_interp_state_point3(void *block_tag,
+                                  int *ex,
+                                  int state0,
+                                  int state1,
+                                  int state2,
+                                  double x0,
+                                  double y0,
+                                  double z0,
+                                  double dx,
+                                  double dy,
+                                  double dz,
+                                  double px,
+                                  double py,
+                                  double pz,
+                                  int ordn,
+                                  int symmetry,
+                                  const double *soa3,
+                                  double *out3)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    if (!block_tag || !ex || !out3 || !soa3)
+        return 1;
+    if (state0 < 0 || state0 >= BSSN_STATE_COUNT ||
+        state1 < 0 || state1 >= BSSN_STATE_COUNT ||
+        state2 < 0 || state2 >= BSSN_STATE_COUNT)
+        return 1;
+    if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
+        ordn <= 0 || ordn > 8 ||
+        ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
+        return 1;
+    // The kernel divides by the spacing and converts the quotient to int, so
+    // a zero spacing must not reach the device.
+    if (dx == 0.0 || dy == 0.0 || dz == 0.0)
+        return 1;
+
+    // The kernel receives the per-field point count as an int; reject grids
+    // whose size would be truncated by that conversion.
+    const size_t all = (size_t)ex[0] * (size_t)ex[1] * (size_t)ex[2];
+    const size_t max_int_points = (size_t)0x7fffffff;
+    if (all > max_int_points)
+        return 1;
+
+    StepContext &ctx = ensure_step_ctx(block_tag, all);
+    const int bank = active_or_keyed_bank(ctx, nullptr, all, false);
+    if (bank < 0 || !ctx.resident_valid[bank])
+        return 1;
+
+    double *d_out = ensure_step_comm_buffer(ctx, 3);
+    // NOTE(review): defensive check; confirm whether ensure_step_comm_buffer
+    // can actually return a null pointer.
+    if (!d_out)
+        return 1;
+
+    kern_interp_state_point3<<<1, 3>>>(
+        ctx.d_resident_mem[bank], d_out,
+        ex[0], ex[1], ex[2], (int)all,
+        state0, state1, state2,
+        x0, y0, z0, dx, dy, dz,
+        px, py, pz, ordn, symmetry,
+        soa3[0], soa3[1], soa3[2],
+        soa3[3], soa3[4], soa3[5],
+        soa3[6], soa3[7], soa3[8]);
+    // A kernel launch does not return a status; launch failures only show up
+    // through cudaGetLastError().
+    CUDA_CHECK(cudaGetLastError());
+    // Blocking copy issued after the kernel in the same (default) stream, so
+    // out3 holds the kernel's results once it returns.
+    CUDA_CHECK(cudaMemcpy(out3, d_out, 3 * sizeof(double), cudaMemcpyDeviceToHost));
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
                                                   int state_index,