Batch GA/BH subset sync with indexed GPU pack/unpack buffers

2026-04-13 20:27:30 +08:00
parent c5d1268dd1
commit e952ee8e91
4 changed files with 495 additions and 34 deletions
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -53,6 +53,12 @@ using namespace std;
 #if USE_CUDA_BSSN
 namespace {

+static const int k_bssn_cuda_bh_state_indices[3] = {18, 19, 20}; // state slots paired, in order, with (forx, fory, forz) by bssn_cuda_sync_bh_fields
+static const int k_bssn_cuda_ga_state_indices[12] = { // state slots paired, in order, with the 12 list entries gathered by bssn_cuda_sync_ga_fields
+  2, 3, 4, 5, 6, 7,
+  8, 9, 10, 11, 12, 13 // NOTE(review): presumably positions in the BSSN_CUDA_STATE_COUNT ordering - confirm against the CUDA side
+};
+
 bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
                          double **host_views,
                          double *propspeeds = nullptr,
@@ -82,11 +88,48 @@ bool bssn_cuda_use_resident_sync(int lev)
  (void)lev;
  return false;
 #else
-  return lev == 0;
+  return true;
 #endif
 }

-void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank)
+bool bssn_cuda_sync_subset(Block *cg,
+                           int subset_count,
+                           const int *state_indices,
+                           double **host_views,
+                           bool upload)
+{
+  // Success without any transfer when there is no block, no requested field, or no resident state for the block.
+  const bool nothing_to_do = (cg == nullptr) || (subset_count < 1) || !bssn_cuda_has_resident_state(cg);
+  if (nothing_to_do)
+    return true;
+  if (!upload) // a zero status from either transfer routine is reported as success
+    return bssn_cuda_download_state_subset(cg, cg->shape, subset_count, state_indices, host_views) == 0;
+  return bssn_cuda_upload_state_subset(cg, cg->shape, subset_count, state_indices, host_views) == 0;
+}
+
+bool bssn_cuda_sync_ga_fields(Block *cg, MyList<var> *vars, bool upload)
+{
+  // Sync the first 12 fields of `vars` with the k_bssn_cuda_ga_state_indices slots; false on a short list or failed copy.
+  if (!cg) // null block is a no-op success, as in bssn_cuda_sync_subset, instead of a null dereference below
+    return true;
+  double *ga_fields[12];
+  int idx = 0;
+  for (; vars && idx < 12; vars = vars->next)
+    ga_fields[idx++] = cg->fgfs[vars->data->sgfn]; // host array of the next list entry, kept in list order
+  if (idx != 12)
+    return false;
+  return bssn_cuda_sync_subset(cg, 12, k_bssn_cuda_ga_state_indices, ga_fields, upload);
+}
+
+bool bssn_cuda_sync_bh_fields(Block *cg, var *forx, var *fory, var *forz, bool upload)
+{
+  if (!cg) // null block is a no-op success, as in bssn_cuda_sync_subset, instead of a null dereference below
+    return true;
+  double *bh_fields[3] = {cg->fgfs[forx->sgfn], cg->fgfs[fory->sgfn], cg->fgfs[forz->sgfn]}; // host arrays of forx/fory/forz, ordered to pair with k_bssn_cuda_bh_state_indices
+  return bssn_cuda_sync_subset(cg, 3, k_bssn_cuda_bh_state_indices, bh_fields, upload);
+}
+
+void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
@@ -108,7 +151,32 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
          cout << "CUDA resident state download failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
-        bssn_cuda_release_step_ctx(cg);
+        if (release_ctx)
+          bssn_cuda_release_step_ctx(cg);
+      }
+      if (BP == Pp->data->ble)
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+}
+
+void bssn_cuda_sync_level_bh_fields(MyList<Patch> *PatL,
+                                    int myrank,
+                                    var *forx, var *fory, var *forz)
+{
+  MyList<Patch> *Pp = PatL;
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb;
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank && !bssn_cuda_sync_bh_fields(cg, forx, fory, forz, false))
+      {
+        cout << "CUDA BH state subset download failed" << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      if (BP == Pp->data->ble)
        break;
@@ -3187,6 +3255,24 @@ void bssn_class::Step(int lev, int YN)
        bool used_gpu_substep = false;
        bool used_gpu_resident_state = false;
 #if USE_CUDA_BSSN
+        if (use_cuda_resident_sync)
+        {
+          if (!bssn_cuda_sync_ga_fields(cg, StateList->next->next, false))
+          {
+            cout << "CUDA predictor GA subset download failed" << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          f_enforce_ga(cg->shape,
+                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
+                       cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
+                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
+                       cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
+          if (!bssn_cuda_sync_ga_fields(cg, StateList->next->next, true))
+          {
+            cout << "CUDA predictor GA subset upload failed" << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+        }
        {
          double *state_in[BSSN_CUDA_STATE_COUNT];
          double *state_out[BSSN_CUDA_STATE_COUNT];
@@ -3206,7 +3292,7 @@ void bssn_class::Step(int lev, int YN)
          int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
          int apply_enforce_ga = 0;
 #if (AGM == 0)
-          apply_enforce_ga = 1;
+          apply_enforce_ga = use_cuda_resident_sync ? 0 : 1;
 #endif
 #if (SommerType == 0)
 #ifndef WithShell
@@ -3519,6 +3605,17 @@ void bssn_class::Step(int lev, int YN)
 #endif
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);

+#if USE_CUDA_BSSN
+  const bool need_analysis_state_after_predictor =
+      (lev == a_lev) && (LastAnas + dT_lev >= AnasTime);
+  const bool need_bh_state_after_predictor =
+      (BH_num > 0) && (lev == GH->levels - 1);
+  if (use_cuda_resident_sync && need_analysis_state_after_predictor)
+    bssn_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
+  else if (use_cuda_resident_sync && need_bh_state_after_predictor)
+    bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx, Sfy, Sfz);
+#endif
+
 #ifdef WithShell
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
@@ -3609,6 +3706,24 @@ void bssn_class::Step(int lev, int YN)
          bool used_gpu_substep = false;
          bool used_gpu_resident_state = false;
 #if USE_CUDA_BSSN
+          if (use_cuda_resident_sync)
+          {
+            if (!bssn_cuda_sync_ga_fields(cg, SynchList_pre->next->next, false))
+            {
+              cout << "CUDA corrector GA subset download failed" << endl;
+              MPI_Abort(MPI_COMM_WORLD, 1);
+            }
+            f_enforce_ga(cg->shape,
+                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
+                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
+                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
+            if (!bssn_cuda_sync_ga_fields(cg, SynchList_pre->next->next, true))
+            {
+              cout << "CUDA corrector GA subset upload failed" << endl;
+              MPI_Abort(MPI_COMM_WORLD, 1);
+            }
+          }
          {
            double *state_in[BSSN_CUDA_STATE_COUNT];
            double *state_out[BSSN_CUDA_STATE_COUNT];
@@ -3628,9 +3743,9 @@ void bssn_class::Step(int lev, int YN)
            int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
            int apply_enforce_ga = 0;
 #if (AGM == 0)
-            apply_enforce_ga = 1;
+            apply_enforce_ga = use_cuda_resident_sync ? 0 : 1;
 #elif (AGM == 1)
-            apply_enforce_ga = (iter_count == 3) ? 1 : 0;
+            apply_enforce_ga = (iter_count == 3 && !use_cuda_resident_sync) ? 1 : 0;
 #endif
 #if (SommerType == 0)
 #ifndef WithShell
@@ -3993,6 +4108,11 @@ void bssn_class::Step(int lev, int YN)
    }
 #endif

+#if USE_CUDA_BSSN
+    if (use_cuda_resident_sync && BH_num > 0 && lev == GH->levels - 1 && iter_count < 3)
+      bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx1, Sfy1, Sfz1);
+#endif
+
    // swap time level
    if (iter_count < 3)
    {
@@ -4046,7 +4166,7 @@ void bssn_class::Step(int lev, int YN)
  }
 #if USE_CUDA_BSSN
  if (use_cuda_resident_sync)
-    bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank);
+    bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
 #endif
 #if (RPS == 0)
  // mesh refinement boundary part