From 8fe60ea703eca7b1c6252fea05363c7fb6382b4f Mon Sep 17 00:00:00 2001
From: ianchb <i@4t.pw>
Date: Wed, 15 Apr 2026 00:25:53 +0800
Subject: [PATCH] Add zero matter handling and interpolation for resident state
 in CUDA BSSN

---
 AMSS_NCKU_source/bssn_class.C     | 202 ++++++++++++++++++++++++++++--
 AMSS_NCKU_source/bssn_rhs_cuda.cu |  17 ++-
 AMSS_NCKU_source/bssn_rhs_cuda.h  |   1 +
 3 files changed, 209 insertions(+), 11 deletions(-)
diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index 326d8f3..c98bfd8 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -112,6 +112,182 @@ bool bssn_cuda_sync_bh_fields(Block *cg, var *forx, var *fory, var *forz, bool u
   return bssn_cuda_sync_subset(cg, 3, k_bssn_cuda_bh_state_indices, bh_fields, upload);
 }
 
+bool bssn_cuda_patch_contains_point(Patch *patch, const double *point)
+{
+  if (!patch)
+    return false;
+  for (int d = 0; d < dim; d++)
+  {
+    const double h = patch->getdX(d);
+    const double lo = patch->bbox[d] + patch->lli[d] * h;
+    const double hi = patch->bbox[dim + d] - patch->uui[d] * h;
+    if (point[d] < lo || point[d] > hi)
+      return false;
+  }
+  return true;
+}
+
+bool bssn_cuda_point_in_block(Patch *patch, Block *block,
+                              const double *point, const double *DH)
+{
+  if (!patch || !block)
+    return false;
+  for (int d = 0; d < dim; d++)
+  {
+    double llb;
+    double uub;
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+    llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
+              ? block->bbox[d] + patch->lli[d] * DH[d]
+              : block->bbox[d] + (ghost_width - 0.5) * DH[d];
+    uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
+              ? block->bbox[dim + d] - patch->uui[d] * DH[d]
+              : block->bbox[dim + d] - (ghost_width - 0.5) * DH[d];
+#else
+#ifdef Cell
+    llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
+              ? block->bbox[d] + patch->lli[d] * DH[d]
+              : block->bbox[d] + ghost_width * DH[d];
+    uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
+              ? block->bbox[dim + d] - patch->uui[d] * DH[d]
+              : block->bbox[dim + d] - ghost_width * DH[d];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+    if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2)
+      return false;
+  }
+  return true;
+}
+
+int bssn_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn)
+{
+  if (!coords || n <= ordn)
+    return 0;
+  int cxi = int((x - coords[0]) / dx + 0.4) + 1;
+  int start = cxi - ordn / 2;
+  if (start < 0)
+    start = 0;
+  const int max_start = n - ordn;
+  if (start > max_start)
+    start = max_start;
+  return start;
+}
+
+bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
+                                        int myrank,
+                                        const double *point,
+                                        var *forx, var *fory, var *forz,
+                                        int Symmetry,
+                                        double *shellf)
+{
+  const int ordn = 2 * ghost_width;
+  int owner_rank = -1;
+
+  shellf[0] = shellf[1] = shellf[2] = 0.0;
+
+  MyList<Patch> *PL = PatL;
+  while (PL)
+  {
+    Patch *patch = PL->data;
+    if (!bssn_cuda_patch_contains_point(patch, point))
+    {
+      PL = PL->next;
+      continue;
+    }
+
+    double DH[dim];
+    for (int d = 0; d < dim; d++)
+      DH[d] = patch->getdX(d);
+
+    MyList<Block> *BP = patch->blb;
+    while (BP)
+    {
+      Block *block = BP->data;
+      if (bssn_cuda_point_in_block(patch, block, point, DH))
+      {
+        owner_rank = block->rank;
+        if (myrank == owner_rank)
+        {
+          int interp_ordn = ordn;
+          int interp_sym = Symmetry;
+          double x = point[0];
+          double y = point[1];
+          double z = point[2];
+
+          if (bssn_cuda_has_resident_state(block) &&
+              block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
+          {
+            const int sx = ordn;
+            const int sy = ordn;
+            const int sz = ordn;
+            const int region_all = sx * sy * sz;
+            const int i0 = bssn_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
+            const int j0 = bssn_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
+            const int k0 = bssn_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
+            double packed_fields[3 * region_all];
+            var *vars[3] = {forx, fory, forz};
+            for (int f = 0; f < 3; f++)
+            {
+              if (bssn_cuda_pack_state_region_to_host_buffer(block,
+                                                             k_bssn_cuda_bh_state_indices[f],
+                                                             packed_fields + f * region_all,
+                                                             block->shape,
+                                                             i0, j0, k0,
+                                                             sx, sy, sz) != 0)
+              {
+                cout << "CUDA BH tile download failed" << endl;
+                MPI_Abort(MPI_COMM_WORLD, 1);
+              }
+              int tile_shape[3] = {sx, sy, sz};
+              f_global_interp(tile_shape,
+                              block->X[0] + i0,
+                              block->X[1] + j0,
+                              block->X[2] + k0,
+                              packed_fields + f * region_all,
+                              shellf[f],
+                              x, y, z,
+                              interp_ordn,
+                              vars[f]->SoA,
+                              interp_sym);
+            }
+          }
+          else
+          {
+            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
+                            block->fgfs[forx->sgfn], shellf[0],
+                            x, y, z, interp_ordn, forx->SoA, interp_sym);
+            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
+                            block->fgfs[fory->sgfn], shellf[1],
+                            x, y, z, interp_ordn, fory->SoA, interp_sym);
+            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
+                            block->fgfs[forz->sgfn], shellf[2],
+                            x, y, z, interp_ordn, forz->SoA, interp_sym);
+          }
+        }
+        break;
+      }
+      if (BP == patch->ble)
+        break;
+      BP = BP->next;
+    }
+
+    if (owner_rank >= 0)
+      break;
+    PL = PL->next;
+  }
+
+  if (owner_rank < 0)
+    return false;
+
+  MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD);
+  return true;
+}
+
 void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
 {
   MyList<Patch> *Pp = PatL;
@@ -3254,6 +3430,8 @@ void bssn_class::Step(int lev, int YN)
             MPI_Abort(MPI_COMM_WORLD, 1);
           }
           int apply_bam_bc = 0;
+          // bssn_class does not evolve matter source fields; they remain zero-initialized.
+          int use_zero_matter = 1;
           int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
           int apply_enforce_ga = 0;
 #if (AGM == 0)
@@ -3270,6 +3448,7 @@ void bssn_class::Step(int lev, int YN)
                                     propspeed, soa_flat, Pp->data->bbox,
                                     dT_lev, TRK4, iter_count, apply_bam_bc,
                                     Symmetry, lev, ndeps, pre,
+                                    use_zero_matter,
                                     keep_resident_state, apply_enforce_ga, chitiny))
           {
             cout << "CUDA predictor substep failed in domain: ("
@@ -3573,12 +3752,8 @@ void bssn_class::Step(int lev, int YN)
 #if USE_CUDA_BSSN
   const bool need_analysis_state_after_predictor =
       (lev == a_lev) && (LastAnas + dT_lev >= AnasTime);
-  const bool need_bh_state_after_predictor =
-      (BH_num > 0) && (lev == GH->levels - 1);
   if (use_cuda_resident_sync && need_analysis_state_after_predictor)
     bssn_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
-  else if (use_cuda_resident_sync && need_bh_state_after_predictor)
-    bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx, Sfy, Sfz);
 #endif
 
 #ifdef WithShell
@@ -3687,6 +3862,8 @@ void bssn_class::Step(int lev, int YN)
               MPI_Abort(MPI_COMM_WORLD, 1);
             }
             int apply_bam_bc = 0;
+            // bssn_class does not evolve matter source fields; they remain zero-initialized.
+            int use_zero_matter = 1;
             int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
             int apply_enforce_ga = 0;
 #if (AGM == 0)
@@ -3705,6 +3882,7 @@ void bssn_class::Step(int lev, int YN)
                                       propspeed, soa_flat, Pp->data->bbox,
                                       dT_lev, TRK4, iter_count, apply_bam_bc,
                                       Symmetry, lev, ndeps, cor,
+                                      use_zero_matter,
                                       keep_resident_state, apply_enforce_ga, chitiny))
             {
               cout << "CUDA corrector substep failed in domain: ("
@@ -4055,11 +4233,6 @@ void bssn_class::Step(int lev, int YN)
     }
 #endif
 
-#if USE_CUDA_BSSN
-    if (use_cuda_resident_sync && BH_num > 0 && lev == GH->levels - 1 && iter_count < 3)
-      bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx1, Sfy1, Sfz1);
-#endif
-
     // swap time level
     if (iter_count < 3)
     {
@@ -7145,6 +7318,17 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
 
     int lev = ilev;
 
+#if USE_CUDA_BSSN
+    if (bssn_cuda_use_resident_sync(lev) &&
+        bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf))
+    {
+      BH_RHS[n][0] = -shellf[0];
+      BH_RHS[n][1] = -shellf[1];
+      BH_RHS[n][2] = -shellf[2];
+      continue;
+    }
+#endif
+
 #if (PSTR == 0)
     while (!Parallel::PatList_Interp_Points(GH->PatL[lev], DG_List, 1, pox, shellf, Symmetry))
 #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu
index a6d70d9..310f014 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -4665,6 +4665,13 @@ static void upload_matter_cache(StepContext &ctx,
     ctx.matter_ready = true;
 }
 
+static void zero_matter_cache(StepContext &ctx, size_t all)
+{
+    CUDA_CHECK(cudaMemset(ctx.d_matter_mem, 0,
+                          (size_t)BSSN_MATTER_COUNT * all * sizeof(double)));
+    ctx.matter_ready = true;
+}
+
 static void bind_matter_slots(const StepContext &ctx)
 {
     for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
@@ -5630,6 +5637,7 @@ int bssn_cuda_rk4_substep(void *block_tag,
                           int &Lev,
                           double &eps,
                           int &co,
+                          int &use_zero_matter,
                           int &keep_resident_state,
                           int &apply_enforce_ga,
                           double &chitiny)
@@ -5686,12 +5694,17 @@ int bssn_cuda_rk4_substep(void *block_tag,
 
     t0 = profile ? cuda_profile_now_ms() : 0.0;
     if (RK4 == 0) {
-        upload_matter_cache(ctx, matter_host, all);
+        if (use_zero_matter) {
+            if (!ctx.matter_ready) zero_matter_cache(ctx, all);
+        } else {
+            upload_matter_cache(ctx, matter_host, all);
+        }
         CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
                               (size_t)BSSN_STATE_COUNT * bytes,
                               cudaMemcpyDeviceToDevice));
     } else if (!ctx.matter_ready) {
-        upload_matter_cache(ctx, matter_host, all);
+        if (use_zero_matter) zero_matter_cache(ctx, all);
+        else upload_matter_cache(ctx, matter_host, all);
     }
     bind_matter_slots(ctx);
     if (profile) {
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h
index 9f473e5..ce8a334 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -50,6 +50,7 @@ int bssn_cuda_rk4_substep(void *block_tag,
                           int &Lev,
                           double &eps,
                           int &co,
+                          int &use_zero_matter,
                           int &keep_resident_state,
                           int &apply_enforce_ga,
                           double &chitiny);