Add direct CUDA resident-state sync path and profiling hooks

2026-04-13 00:57:05 +08:00
parent 7f2a391dd2
commit 636e35bfd8
5 changed files with 1188 additions and 527 deletions
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -76,6 +76,48 @@ bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
  return idx == BSSN_CUDA_STATE_COUNT && vars == 0;
 }

+bool bssn_cuda_use_resident_sync(int lev) // Returns true when refinement level `lev` should take the CUDA resident-state path.
+{
+#ifdef WithShell // Builds with WithShell defined never use the resident-state path.
+  (void)lev; // `lev` is not consulted in this configuration; the cast marks it as intentionally unused.
+  return false;
+#else // Without WithShell, only level 0 qualifies.
+  return lev == 0;
+#endif
+}
+
+void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank) // For every block in PatL owned by rank `myrank`: download the resident CUDA state into the arrays named by `vars`, then release the block's step context.
+{
+  MyList<Patch> *Pp = PatL;
+  while (Pp) // Visit each patch in the list.
+  {
+    MyList<Block> *BP = Pp->data->blb; // Start at the first block of this patch.
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank) // Blocks assigned to other ranks are skipped.
+      {
+        double *state_out[BSSN_CUDA_STATE_COUNT]; // Per-variable destination pointers; presumably host arrays in cg->fgfs - confirm in fill_bssn_cuda_views.
+        if (!fill_bssn_cuda_views(cg, vars, state_out)) // False unless `vars` supplies exactly BSSN_CUDA_STATE_COUNT entries.
+        {
+          cout << "CUDA BSSN state list mismatch on resident state download" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1); // A mismatched variable list aborts the whole MPI job.
+        }
+        if (bssn_cuda_download_resident_state(cg, cg->shape, state_out)) // A nonzero return is treated as failure.
+        {
+          cout << "CUDA resident state download failed" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        bssn_cuda_release_step_ctx(cg); // NOTE(review): presumably frees this block's device-side step context after the download - confirm against the CUDA backend.
+      }
+      if (BP == Pp->data->ble) // `ble` is this patch's last block; stop here instead of relying on a null `next`.
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+}
+
 } // namespace
 #endif

@@ -3058,13 +3100,18 @@ void bssn_class::RecursiveStep(int lev, int num) // in all 2^(lev+1)-1 steps

 #if (PSTR == 0)
 #if 1
-void bssn_class::Step(int lev, int YN)
-{
-  setpbh(BH_num, Porg0, Mass, BH_num_input);
-
-  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
-
-// new code 2013-2-15, zjcao
+void bssn_class::Step(int lev, int YN)
+{
+  setpbh(BH_num, Porg0, Mass, BH_num_input);
+
+  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
+#if USE_CUDA_BSSN
+  const bool use_cuda_resident_sync = bssn_cuda_use_resident_sync(lev);
+#else
+  const bool use_cuda_resident_sync = false;
+#endif
+
+// new code 2013-2-15, zjcao
 #if (MAPBH == 1)
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -3128,15 +3175,17 @@ void bssn_class::Step(int lev, int YN)
      Block *cg = BP->data;
      if (myrank == cg->rank)
      {
-#if (AGM == 0)
-        f_enforce_ga(cg->shape,
-                     cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
-                     cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
-                     cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
-                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
-#endif
-
+#if (AGM == 0)
+        if (!use_cuda_resident_sync)
+          f_enforce_ga(cg->shape,
+                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+                       cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
+                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
+                       cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
+#endif
+
        bool used_gpu_substep = false;
+        bool used_gpu_resident_state = false;
 #if USE_CUDA_BSSN
        {
          double *state_in[BSSN_CUDA_STATE_COUNT];
@@ -3154,6 +3203,11 @@ void bssn_class::Step(int lev, int YN)
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          int apply_bam_bc = 0;
+          int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
+          int apply_enforce_ga = 0;
+#if (AGM == 0)
+          apply_enforce_ga = 1;
+#endif
 #if (SommerType == 0)
 #ifndef WithShell
          apply_bam_bc = (lev == 0) ? 1 : 0;
@@ -3164,7 +3218,8 @@ void bssn_class::Step(int lev, int YN)
                                    state_in, state_out, matter,
                                    propspeed, soa_flat, Pp->data->bbox,
                                    dT_lev, TRK4, iter_count, apply_bam_bc,
-                                    Symmetry, lev, ndeps, pre))
+                                    Symmetry, lev, ndeps, pre,
+                                    keep_resident_state, apply_enforce_ga, chitiny))
          {
            cout << "CUDA predictor substep failed in domain: ("
                 << cg->bbox[0] << ":" << cg->bbox[3] << ","
@@ -3173,6 +3228,7 @@ void bssn_class::Step(int lev, int YN)
            ERROR = 1;
          }
          used_gpu_substep = true;
+          used_gpu_resident_state = (keep_resident_state != 0);
        }
 #endif
        if (!used_gpu_substep)
@@ -3277,8 +3333,9 @@ void bssn_class::Step(int lev, int YN)
            varlrhs = varlrhs->next;
          }
        }
-        f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
-      }
+        if (!used_gpu_resident_state)
+          f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
+      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
@@ -3436,9 +3493,9 @@ void bssn_class::Step(int lev, int YN)
    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
  }
 #endif
-
-  Parallel::AsyncSyncState async_pre;
-  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
+
+  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);

 #ifdef WithShell
  if (lev == 0)
@@ -3455,9 +3512,9 @@ void bssn_class::Step(int lev, int YN)
           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
           << " seconds! " << endl;
    }
-  }
-#endif
-  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
+  }
+#endif
+  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);

 #ifdef WithShell
  // Complete non-blocking error reduction and check
@@ -3530,22 +3587,24 @@ void bssn_class::Step(int lev, int YN)
        Block *cg = BP->data;
        if (myrank == cg->rank)
        {
-#if (AGM == 0)
-          f_enforce_ga(cg->shape,
-                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
-                       cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                       cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
-                       cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
-#elif (AGM == 1)
-          if (iter_count == 3)
-            f_enforce_ga(cg->shape,
-                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
-                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
-                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
-#endif
-
+#if (AGM == 0)
+          if (!use_cuda_resident_sync)
+            f_enforce_ga(cg->shape,
+                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
+                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
+                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
+#elif (AGM == 1)
+          if (iter_count == 3 && !use_cuda_resident_sync)
+            f_enforce_ga(cg->shape,
+                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
+                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
+                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
+#endif
+
          bool used_gpu_substep = false;
+          bool used_gpu_resident_state = false;
 #if USE_CUDA_BSSN
          {
            double *state_in[BSSN_CUDA_STATE_COUNT];
@@ -3563,6 +3622,13 @@ void bssn_class::Step(int lev, int YN)
              MPI_Abort(MPI_COMM_WORLD, 1);
            }
            int apply_bam_bc = 0;
+            int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
+            int apply_enforce_ga = 0;
+#if (AGM == 0)
+            apply_enforce_ga = 1;
+#elif (AGM == 1)
+            apply_enforce_ga = (iter_count == 3) ? 1 : 0;
+#endif
 #if (SommerType == 0)
 #ifndef WithShell
            apply_bam_bc = (lev == 0) ? 1 : 0;
@@ -3573,7 +3639,8 @@ void bssn_class::Step(int lev, int YN)
                                      state_in, state_out, matter,
                                      propspeed, soa_flat, Pp->data->bbox,
                                      dT_lev, TRK4, iter_count, apply_bam_bc,
-                                      Symmetry, lev, ndeps, cor))
+                                      Symmetry, lev, ndeps, cor,
+                                      keep_resident_state, apply_enforce_ga, chitiny))
            {
              cout << "CUDA corrector substep failed in domain: ("
                   << cg->bbox[0] << ":" << cg->bbox[3] << ","
@@ -3582,6 +3649,7 @@ void bssn_class::Step(int lev, int YN)
              ERROR = 1;
            }
            used_gpu_substep = true;
+            used_gpu_resident_state = (keep_resident_state != 0);
          }
 #endif
          if (!used_gpu_substep)
@@ -3686,8 +3754,9 @@ void bssn_class::Step(int lev, int YN)
              varlrhs = varlrhs->next;
            }
          }
-          f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
-        }
+          if (!used_gpu_resident_state)
+            f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
+        }
        if (BP == Pp->data->ble)
          break;
        BP = BP->next;
@@ -3842,9 +3911,9 @@ void bssn_class::Step(int lev, int YN)
      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
    }
 #endif
-
-    Parallel::AsyncSyncState async_cor;
-    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
+
+    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);

 #ifdef WithShell
    if (lev == 0)
@@ -3862,8 +3931,8 @@ void bssn_class::Step(int lev, int YN)
             << " seconds! " << endl;
      }
    }
-#endif
-    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
+#endif
+    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);

 #ifdef WithShell
    // Complete non-blocking error reduction and check
@@ -3968,10 +4037,14 @@ void bssn_class::Step(int lev, int YN)
      }
 #endif
    }
-  }
-#if (RPS == 0)
-  // mesh refinement boundary part
-  RestrictProlong(lev, YN, BB);
+  }
+#if USE_CUDA_BSSN
+  if (use_cuda_resident_sync)
+    bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank);
+#endif
+#if (RPS == 0)
+  // mesh refinement boundary part
+  RestrictProlong(lev, YN, BB);

 #ifdef WithShell
  if (lev == 0)