Stabilize EScalar CUDA fallback path

2026-05-03 16:05:47 +08:00
parent 4430d04ee7
commit e4c10eca0f
5 changed files with 1542 additions and 127 deletions
--- a/AMSS_NCKU_source/bssnEScalar_class.C
+++ b/AMSS_NCKU_source/bssnEScalar_class.C
@@ -140,6 +140,88 @@ bool escalar_gpu_rk_enabled()
  return enabled != 0;
 }

+bool escalar_resident_enabled() // true only if AMSS_ESCALAR_RESIDENT and AMSS_ESCALAR_RESIDENT_EXPERIMENTAL are both set to a non-zero integer
+{
+  static int enabled = -1; // -1 = environment not inspected yet; afterwards caches 0 or 1 for the rest of the run
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_ESCALAR_RESIDENT");
+    const char *experimental = getenv("AMSS_ESCALAR_RESIDENT_EXPERIMENTAL");
+    enabled = (env && atoi(env) != 0 && // an unset variable, or one atoi() parses as 0, disables the feature
+               experimental && atoi(experimental) != 0) ? 1 : 0; // second, explicit opt-in is required as well
+  }
+  return enabled != 0;
+}
+
+bool escalar_step_profile_enabled() // true if AMSS_ESCALAR_STEP_PROFILE is set to a non-zero integer
+{
+  static int enabled = -1; // -1 = environment not inspected yet; afterwards caches 0 or 1
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_ESCALAR_STEP_PROFILE");
+    enabled = (env && atoi(env) != 0) ? 1 : 0; // unset, or a value atoi() parses as 0, means disabled
+  }
+  return enabled != 0;
+}
+
+int escalar_step_profile_every() // reporting interval read from AMSS_ESCALAR_STEP_PROFILE_EVERY; always >= 1
+{
+  static int every = -1; // -1 = environment not inspected yet; afterwards caches the interval
+  if (every < 0)
+  {
+    const char *env = getenv("AMSS_ESCALAR_STEP_PROFILE_EVERY");
+    every = (env && atoi(env) > 0) ? atoi(env) : 1; // unset, zero, negative or non-numeric values fall back to 1
+  }
+  return every;
+}
+
+struct EScalarStepProfile // per-call timing buckets for bssnEScalar_class::Step; all values come from MPI_Wtime() differences
+{
+  double start; // MPI_Wtime() stamp taken by escalar_profile_init
+  double predictor_rhs; // time spent in the predictor patch/block loop
+  double predictor_sync; // time spent in the predictor Parallel::Sync_cached calls
+  double analysis; // time spent in AnalysisStuff_EScalar, including the optional field download before it
+  double corrector_rhs; // time spent in the corrector patch/block loops, summed over the corrector iterations
+  double corrector_sync; // time spent in the corrector Parallel::Sync_cached calls, summed over the iterations
+  double restrict_prolong; // time spent in RestrictProlong, including the optional downloads before it
+  double other_sync; // NOTE(review): zeroed and printed, but no accumulation is visible in this diff - confirm where it is filled
+};
+
+void escalar_profile_init(EScalarStepProfile &p) // stamps the start time and clears every bucket of p
+{
+  p.start = MPI_Wtime(); // reference point for the total reported by escalar_profile_report
+  p.predictor_rhs = 0.0;
+  p.predictor_sync = 0.0;
+  p.analysis = 0.0;
+  p.corrector_rhs = 0.0;
+  p.corrector_sync = 0.0;
+  p.restrict_prolong = 0.0;
+  p.other_sync = 0.0;
+}
+
+void escalar_profile_add(double &bucket, double t0) // adds the time elapsed since t0 (an earlier MPI_Wtime() value) to bucket
+{
+  bucket += MPI_Wtime() - t0; // accumulates, so repeated sections of one step sum into the same bucket
+}
+
+void escalar_profile_report(const EScalarStepProfile &p, int lev, int myrank) // writes one timing line for p to stderr
+{
+  if (myrank != 0 || !escalar_step_profile_enabled()) // only rank 0 reports, and only when profiling is switched on
+    return;
+  static long long call_count = 0; // counts calls that reach this point; a single counter shared by all levels
+  ++call_count;
+  const int every = escalar_step_profile_every();
+  if (every > 1 && (call_count % every) != 0) // throttle: print only when call_count is a multiple of every
+    return;
+  const double total = MPI_Wtime() - p.start; // elapsed time since p.start was stamped
+  fprintf(stderr,
+          "[AMSS-ESCALAR-PROFILE] call=%lld lev=%d total=%.6f pred_rhs=%.6f pred_sync=%.6f analysis=%.6f corr_rhs=%.6f corr_sync=%.6f rp=%.6f other_sync=%.6f\n",
+          call_count, lev, total, p.predictor_rhs, p.predictor_sync,
+          p.analysis, p.corrector_rhs, p.corrector_sync,
+          p.restrict_prolong, p.other_sync);
+  fflush(stderr); // flush so the line is emitted right away
+}
+
 void clear_var_list(MyList<var> *&list)
 {
  if (list)
@@ -173,6 +255,34 @@ void download_bssn_cuda_prefix_if_present(MyList<Patch> *PatL,
  }
 }

+void download_escalar_cuda_pair_if_present(MyList<Patch> *PatL, // head of the patch list to walk
+                                           var *Sphi_var, // first variable of the pair; its sgfn indexes cg->fgfs
+                                           var *Spi_var, // second variable of the pair; its sgfn indexes cg->fgfs
+                                           int myrank) // only blocks whose rank equals this value are processed
+{
+  if (!Sphi_var || !Spi_var) // nothing to do unless both variables are given
+    return;
+  while (PatL) // visit every patch in the list
+  {
+    MyList<Block> *BP = PatL->data->blb; // first block of this patch
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank) // skip blocks that belong to other ranks
+      {
+        bssn_cuda_escalar_download_fields_if_present( // presumably copies the device-side pair back into the host arrays - callee not shown, confirm
+            cg, cg->shape,
+            cg->fgfs[Sphi_var->sgfn],
+            cg->fgfs[Spi_var->sgfn]);
+      }
+      if (BP == PatL->data->ble) // ble is the last block of this patch (inclusive), so stop after it
+        break;
+      BP = BP->next;
+    }
+    PatL = PatL->next;
+  }
+}
+
 int run_bssn_escalar_cuda_substep(Block *cg,
                                  MyList<var> *state_in_list,
                                  MyList<var> *state_out_list,
@@ -992,8 +1102,8 @@ void bssnEScalar_class::Read_Pablo()

 //================================================================================================

-void bssnEScalar_class::Step(int lev, int YN)
-{
+void bssnEScalar_class::Step(int lev, int YN)
+{
  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
 #ifdef With_AHF
  AH_Step_Find(lev, dT_lev);
@@ -1003,15 +1113,18 @@ void bssnEScalar_class::Step(int lev, int YN)
  if (lev < GH->movls)
    ndeps = numepsb;
  double TRK4 = PhysTime;
-  int iter_count = 0; // count RK4 substeps
-  int pre = 0, cor = 1;
-  int ERROR = 0;
-
-  MyList<ss_patch> *sPp;
-  // Predictor
-  MyList<Patch> *Pp = GH->PatL[lev];
-  while (Pp)
-  {
+  int iter_count = 0; // count RK4 substeps
+  int pre = 0, cor = 1;
+  int ERROR = 0;
+  EScalarStepProfile escalar_profile;
+  escalar_profile_init(escalar_profile);
+
+  MyList<ss_patch> *sPp;
+  // Predictor
+  const double escalar_profile_predictor_rhs_start = MPI_Wtime();
+  MyList<Patch> *Pp = GH->PatL[lev];
+  while (Pp)
+  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
      {
@@ -1101,6 +1214,8 @@ void bssnEScalar_class::Step(int lev, int YN)
          {
            if (scalar_gpu_rk_done)
            {
+              if (!escalar_resident_enabled())
+              {
 #ifndef WithShell
              if (lev > 0) // fix BD point
 #endif
@@ -1112,6 +1227,7 @@ void bssnEScalar_class::Step(int lev, int YN)
                                  cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn],
                                  varl0->data->SoA,
                                  Symmetry, cor);
+              }

              varl0 = varl0->next;
              varl = varl->next;
@@ -1157,11 +1273,12 @@ void bssnEScalar_class::Step(int lev, int YN)
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
-    }
-    Pp = Pp->next;
-  }
-  // check error information
-  {
+    }
+    Pp = Pp->next;
+  }
+  escalar_profile_add(escalar_profile.predictor_rhs, escalar_profile_predictor_rhs_start);
+  // check error information
+  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
@@ -1325,10 +1442,14 @@ void bssnEScalar_class::Step(int lev, int YN)
 #endif

 #if USE_CUDA_BSSN
+  const double escalar_profile_predictor_sync_start = MPI_Wtime();
  Parallel::Sync_cached(GH->PatL[lev], BSSNSynchList_pre, Symmetry, sync_cache_pre[lev]);
  Parallel::Sync_cached(GH->PatL[lev], ScalarSynchList_pre, Symmetry, sync_cache_scalar_pre[lev]);
+  escalar_profile_add(escalar_profile.predictor_sync, escalar_profile_predictor_sync_start);
 #else
+  const double escalar_profile_predictor_sync_start = MPI_Wtime();
  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
+  escalar_profile_add(escalar_profile.predictor_sync, escalar_profile_predictor_sync_start);
 #endif

 #ifdef WithShell
@@ -1381,21 +1502,28 @@ void bssnEScalar_class::Step(int lev, int YN)
      }
    }
  }
-  // data analysis part
-  // Warning NOTE: the variables1 are used as temp storege room
-  if (lev == a_lev)
-  {
-    AnalysisStuff_EScalar(lev, dT_lev);
-  }
-  // corrector
-  for (iter_count = 1; iter_count < 4; iter_count++)
-  {
+  // data analysis part
+  // Warning NOTE: the variables1 are used as temp storage room
+  if (lev == a_lev)
+  {
+    const double escalar_profile_analysis_start = MPI_Wtime();
+#if USE_CUDA_BSSN
+    if (escalar_resident_enabled())
+      download_escalar_cuda_pair_if_present(GH->PatL[lev], Sphi, Spi, myrank);
+#endif
+    AnalysisStuff_EScalar(lev, dT_lev);
+    escalar_profile_add(escalar_profile.analysis, escalar_profile_analysis_start);
+  }
+  // corrector
+  for (iter_count = 1; iter_count < 4; iter_count++)
+  {
    // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
-    if (iter_count == 1 || iter_count == 3)
-      TRK4 += dT_lev / 2;
-    Pp = GH->PatL[lev];
-    while (Pp)
-    {
+    if (iter_count == 1 || iter_count == 3)
+      TRK4 += dT_lev / 2;
+    const double escalar_profile_corrector_rhs_start = MPI_Wtime();
+    Pp = GH->PatL[lev];
+    while (Pp)
+    {
      MyList<Block> *BP = Pp->data->blb;
      while (BP)
      {
@@ -1494,6 +1622,8 @@ void bssnEScalar_class::Step(int lev, int YN)
            {
              if (scalar_gpu_rk_done)
              {
+                if (!escalar_resident_enabled())
+                {
 #ifndef WithShell
                if (lev > 0) // fix BD point
 #endif
@@ -1505,6 +1635,7 @@ void bssnEScalar_class::Step(int lev, int YN)
                                    cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn],
                                    varl0->data->SoA,
                                    Symmetry, cor);
+                }

                varl0 = varl0->next;
                varl = varl->next;
@@ -1552,11 +1683,12 @@ void bssnEScalar_class::Step(int lev, int YN)
        if (BP == Pp->data->ble)
          break;
        BP = BP->next;
-      }
-      Pp = Pp->next;
-    }
-
-    // check error information
+      }
+      Pp = Pp->next;
+    }
+    escalar_profile_add(escalar_profile.corrector_rhs, escalar_profile_corrector_rhs_start);
+
+    // check error information
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
@@ -1731,10 +1863,14 @@ void bssnEScalar_class::Step(int lev, int YN)
 #endif

 #if USE_CUDA_BSSN
+    const double escalar_profile_corrector_sync_start = MPI_Wtime();
    Parallel::Sync_cached(GH->PatL[lev], BSSNSynchList_cor, Symmetry, sync_cache_cor[lev]);
    Parallel::Sync_cached(GH->PatL[lev], ScalarSynchList_cor, Symmetry, sync_cache_scalar_cor[lev]);
+    escalar_profile_add(escalar_profile.corrector_sync, escalar_profile_corrector_sync_start);
 #else
+    const double escalar_profile_corrector_sync_start = MPI_Wtime();
    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
+    escalar_profile_add(escalar_profile.corrector_sync, escalar_profile_corrector_sync_start);
 #endif

 #ifdef WithShell
@@ -1837,17 +1973,21 @@ void bssnEScalar_class::Step(int lev, int YN)

 #if (RPS == 0)
  // mesh refinement boundary part
+  const double escalar_profile_rp_start = MPI_Wtime();
 #if USE_CUDA_BSSN
  {
    const char *mixed_env = getenv("AMSS_ESCALAR_MIXED_GPU_RP");
    const bool mixed_gpu_rp = (mixed_env && atoi(mixed_env) != 0);
    const char *split_env = getenv("AMSS_ESCALAR_SPLIT_RP");
    const bool split_rp = (split_env && atoi(split_env) != 0);
+    if (escalar_resident_enabled() && !split_rp)
+      download_escalar_cuda_pair_if_present(GH->PatL[lev], Sphi1, Spi1, myrank);
    if (!mixed_gpu_rp && !split_rp)
      download_bssn_cuda_prefix_if_present(GH->PatL[lev], SynchList_cor, myrank);
  }
 #endif
  RestrictProlong(lev, YN, BB);
+  escalar_profile_add(escalar_profile.restrict_prolong, escalar_profile_rp_start);

 #ifdef WithShell
  if (lev == 0)
@@ -1910,18 +2050,19 @@ void bssnEScalar_class::Step(int lev, int YN)
  }
 #endif
  // for black hole position
-  if (BH_num > 0 && lev == GH->levels - 1)
-  {
+  if (BH_num > 0 && lev == GH->levels - 1)
+  {
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
      Porg0[ithBH][0] = Porg1[ithBH][0];
      Porg0[ithBH][1] = Porg1[ithBH][1];
-      Porg0[ithBH][2] = Porg1[ithBH][2];
-    }
-  }
-}
-
-//================================================================================================
+      Porg0[ithBH][2] = Porg1[ithBH][2];
+    }
+  }
+  escalar_profile_report(escalar_profile, lev, myrank);
+}
+
+//================================================================================================