Optimize BSSN CUDA resident AMR prolong path

2026-04-30 10:58:15 +08:00
parent 1ee229a91f
commit 18e9c9cc50
3 changed files with 778 additions and 78 deletions
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -190,6 +190,25 @@ bool cuda_build_bssn_host_views(Block *block,
  }
  return v == 0;
 }
+
+// Copy the three SoA entries of every variable in `vars` into `soa_flat`,
+// laid out as soa_flat[3 * i + c] for variable i and component c.
+//
+//   vars        - head of the variable list to read from
+//   state_count - must equal BSSN_CUDA_STATE_COUNT, otherwise nothing is written
+//   soa_flat    - output array with room for 3 * BSSN_CUDA_STATE_COUNT doubles
+//
+// Returns true only when the list holds exactly BSSN_CUDA_STATE_COUNT usable
+// entries. On a false return `soa_flat` may be partially filled and must not
+// be used by the caller.
+bool cuda_build_bssn_soa(MyList<var> *vars,
+                         int state_count,
+                         double *soa_flat)
+{
+  if (!vars || !soa_flat || state_count != BSSN_CUDA_STATE_COUNT)
+    return false;
+  MyList<var> *node = vars;
+  for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i, node = node->next)
+  {
+    // A short list or a node without a payload cannot be described; report
+    // failure instead of dereferencing a null pointer.
+    if (!node || !node->data)
+      return false;
+    for (int c = 0; c < 3; ++c)
+      soa_flat[3 * i + c] = node->data->SoA[c];
+  }
+  // Any leftover node means the list is longer than the supported state set.
+  return node == 0;
+}
 #endif

 #if USE_CUDA_BSSN || USE_CUDA_Z4C
@@ -198,6 +217,9 @@ int fortran_idint(double x)
  return (int)x;
 }

+bool cuda_amr_restrict_device_enabled();
+bool cuda_amr_prolong_device_enabled();
+
 bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src,
                                   const Parallel::gridseg *dst,
                                   int first_fine[3])
@@ -226,7 +248,7 @@ bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src,
    const int lbc = fortran_idint((llbc - base) / CD + 0.4) + 1;
    const int lbf = fortran_idint((llbf - base) / FD + 0.4) + 1;
    first_fine[d] = 2 * lbc - lbf - 1;
-    if (first_fine[d] - 2 < 0)
+    if (first_fine[d] < 0)
      return false;
    if (first_fine[d] + 2 * (dst->shape[d] - 1) + 3 >= src->Bg->shape[d])
      return false;
@@ -271,7 +293,7 @@ bool cuda_cell_gw3_prolong_params(const Parallel::gridseg *src,
    const int first_coarse = first_fine_ii[d] / 2 - coarse_lb[d];
    const int last_fine_ii = first_fine_ii[d] + dst->shape[d] - 1;
    const int last_coarse = last_fine_ii / 2 - coarse_lb[d];
-    if (first_coarse - 2 < 0)
+    if (first_coarse < -1)
      return false;
    if (last_coarse + 3 >= src->Bg->shape[d])
      return false;
@@ -306,13 +328,21 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg
 #elif USE_CUDA_BSSN
  if (bssn_cuda_has_resident_state(src->Bg) == 0)
    return false;
-  if (type == 1)
-    return true;
-  int a[3], b[3];
-  if (type == 2)
-    return cuda_cell_gw3_restrict_params(src, dst, a);
-  if (type == 3)
-    return cuda_cell_gw3_prolong_params(src, dst, a, b);
+	  if (type == 1)
+	    return true;
+	  int a[3], b[3];
+	  if (type == 2)
+	  {
+	    if (!cuda_amr_restrict_device_enabled())
+	      return false;
+	    return cuda_cell_gw3_restrict_params(src, dst, a);
+	  }
+	  if (type == 3)
+	  {
+	    if (!cuda_amr_prolong_device_enabled())
+	      return false;
+	    return cuda_cell_gw3_prolong_params(src, dst, a, b);
+	  }
  return false;
 #else
  (void)type;
@@ -427,6 +457,28 @@ bool cuda_aware_mpi_enabled()
  return enabled != 0;
 }

+// Report whether the device-side AMR restrict path is switched on.
+// The decision comes from the AMSS_CUDA_AMR_RESTRICT_DEVICE environment
+// variable: the path is on only when the variable is set and atoi() of its
+// value is non-zero. The variable is read on the first call and the answer is
+// cached for all later calls.
+bool cuda_amr_restrict_device_enabled()
+{
+  static int cached = -1; // -1: not read yet, 0: off, 1: on
+  if (cached >= 0)
+    return cached != 0;
+
+  const char *value = getenv("AMSS_CUDA_AMR_RESTRICT_DEVICE");
+  const bool on = (value != 0) && (atoi(value) != 0);
+  cached = on ? 1 : 0;
+  return cached != 0;
+}
+
+// Report whether the device-side AMR prolong path is switched on.
+// Controlled by the AMSS_CUDA_AMR_PROLONG_DEVICE environment variable. The
+// path is on when the variable is unset, and off only when the variable is
+// set and atoi() of its value is zero. The variable is read on the first call
+// and the answer is cached for all later calls.
+bool cuda_amr_prolong_device_enabled()
+{
+  static int cached = -1; // -1: not read yet, 0: off, 1: on
+  if (cached >= 0)
+    return cached != 0;
+
+  const char *value = getenv("AMSS_CUDA_AMR_PROLONG_DEVICE");
+  const bool off = (value != 0) && (atoi(value) == 0);
+  cached = off ? 0 : 1;
+  return cached != 0;
+}
+
 bool cuda_mpi_diag_enabled()
 {
  static int enabled = -1;
@@ -438,6 +490,17 @@ bool cuda_mpi_diag_enabled()
  return enabled != 0 || sync_profile_enabled();
 }

+// Return the value callers use as the upper bound on how many diagnostic
+// reports they emit. Taken from the AMSS_CUDA_MPI_DIAG_LIMIT environment
+// variable when atoi() of its value is positive; otherwise 10. The variable
+// is read on the first call and the result is cached for all later calls.
+int cuda_mpi_diag_limit()
+{
+  static int cached = -1; // negative means "not read yet"
+  if (cached >= 0)
+    return cached;
+
+  const char *value = getenv("AMSS_CUDA_MPI_DIAG_LIMIT");
+  const int parsed = value ? atoi(value) : 0;
+  cached = (parsed > 0) ? parsed : 10;
+  return cached;
+}
+
 double *alloc_device_comm_buffer(int length)
 {
  if (length <= 0)
@@ -486,9 +549,11 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
  if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
    return false;
  const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
-  bool ok = false;
-  double *views[BSSN_CUDA_STATE_COUNT];
-  const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
+	  bool ok = false;
+	  double *views[BSSN_CUDA_STATE_COUNT];
+	  double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
+	  const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
+	  const bool have_soa = cuda_build_bssn_soa(VarLists, state_count, soa_flat);
  if (type == 1)
  {
    const int i0 = cuda_seg_begin(dst, src->Bg, 0);
@@ -509,14 +574,15 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
    int first_fine[3];
    if (!cuda_cell_gw3_restrict_params(src, dst, first_fine))
      return false;
-    ok = have_views
-             ? bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(
-                   src->Bg, views, state_count, buffer, src->Bg->shape,
-                   dst->shape[0], dst->shape[1], dst->shape[2],
-                   first_fine[0], first_fine[1], first_fine[2]) == 0
-             : bssn_cuda_restrict_state_batch_to_device_buffer(
-                   src->Bg, state_count, buffer, src->Bg->shape,
-                   dst->shape[0], dst->shape[1], dst->shape[2],
+	    ok = have_views
+	             ? bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(
+	                   src->Bg, views, state_count, buffer, src->Bg->shape,
+	                   dst->shape[0], dst->shape[1], dst->shape[2],
+	                   first_fine[0], first_fine[1], first_fine[2],
+	                   have_soa ? soa_flat : 0) == 0
+	             : bssn_cuda_restrict_state_batch_to_device_buffer(
+	                   src->Bg, state_count, buffer, src->Bg->shape,
+	                   dst->shape[0], dst->shape[1], dst->shape[2],
                   first_fine[0], first_fine[1], first_fine[2]) == 0;
  }
  else if (type == 3)
@@ -524,13 +590,14 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
    int first_fine_ii[3], coarse_lb[3];
    if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb))
      return false;
-    ok = have_views
-             ? bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(
-                   src->Bg, views, state_count, buffer, src->Bg->shape,
-                   dst->shape[0], dst->shape[1], dst->shape[2],
-                   first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
-                   coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0
-             : bssn_cuda_prolong_state_batch_to_device_buffer(
+	    ok = have_views
+	             ? bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(
+	                   src->Bg, views, state_count, buffer, src->Bg->shape,
+	                   dst->shape[0], dst->shape[1], dst->shape[2],
+	                   first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
+	                   coarse_lb[0], coarse_lb[1], coarse_lb[2],
+	                   have_soa ? soa_flat : 0) == 0
+	             : bssn_cuda_prolong_state_batch_to_device_buffer(
                   src->Bg, state_count, buffer, src->Bg->shape,
                   dst->shape[0], dst->shape[1], dst->shape[2],
                   first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
@@ -643,19 +710,39 @@ bool cuda_flush_device_segment_batch(Block *block,
                                     int state_count,
                                     const std::vector<int> &meta,
                                     int dir,
+                                     int type,
                                     MyList<var> *vars)
 {
  if (!block || meta.empty())
    return true;
-  const int segment_count = (int)(meta.size() / 8);
-  double *views[BSSN_CUDA_STATE_COUNT];
-  const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views);
-  if (dir == PACK)
+	  const int stride = (dir == PACK && type == 3) ? 11 : 8;
+	  const int segment_count = (int)(meta.size() / stride);
+	  double *views[BSSN_CUDA_STATE_COUNT];
+	  double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
+	  const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views);
+	  const bool have_soa = cuda_build_bssn_soa(vars, state_count, soa_flat);
+	  if (dir == PACK)
+	  {
+	    if (type == 2)
+	      return have_views
+	                 ? bssn_cuda_restrict_state_segments_to_device_buffer_for_host_views(
+	                       block, views, state_count, data, block->shape, segment_count,
+	                       meta.data(), have_soa ? soa_flat : 0) == 0
+	                 : bssn_cuda_restrict_state_segments_to_device_buffer(
+	                       block, state_count, data, block->shape, segment_count, meta.data()) == 0;
+	    if (type == 3)
+	      return have_views
+	                 ? bssn_cuda_prolong_state_segments_to_device_buffer_for_host_views(
+	                       block, views, state_count, data, block->shape, segment_count,
+	                       meta.data(), have_soa ? soa_flat : 0) == 0
+	                 : bssn_cuda_prolong_state_segments_to_device_buffer(
+	                       block, state_count, data, block->shape, segment_count, meta.data()) == 0;
    return have_views
               ? bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(
                     block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
               : bssn_cuda_pack_state_segments_to_device_buffer(
                     block, state_count, data, block->shape, segment_count, meta.data()) == 0;
+  }
  return have_views
             ? bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(
                   block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
@@ -685,6 +772,7 @@ int cuda_data_packer_device_batched(double *data,

  int size_out = 0;
  Block *batch_block = 0;
+  int batch_type = 0;
  std::vector<int> batch_meta;
  batch_meta.reserve(64);

@@ -702,42 +790,72 @@ int cuda_data_packer_device_batched(double *data,
        type = 2;
      else
        type = 3;
-      if (type != 1)
-        return -1;

      Block *block = (dir == PACK) ? src->data->Bg : dst->data->Bg;
      if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type)) ||
          (dir == UNPACK && !cuda_can_direct_unpack(dst->data, type)))
        return -1;

-      if (batch_block && batch_block != block)
+      if (batch_block && (batch_block != block || batch_type != type))
      {
        MyList<var> *batch_vars = (dir == PACK) ? VarLists : VarListd;
-        if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_vars))
+        if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_type, batch_vars))
          return -1;
        batch_meta.clear();
      }
      batch_block = block;
+      batch_type = type;

-      const int i0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 0)
-                                   : cuda_seg_begin(dst->data, block, 0);
-      const int j0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 1)
-                                   : cuda_seg_begin(dst->data, block, 1);
-      const int k0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 2)
-                                   : cuda_seg_begin(dst->data, block, 2);
      const int sx = dst->data->shape[0];
      const int sy = dst->data->shape[1];
      const int sz = dst->data->shape[2];
      const int region_all = sx * sy * sz;

-      batch_meta.push_back(i0);
-      batch_meta.push_back(j0);
-      batch_meta.push_back(k0);
-      batch_meta.push_back(sx);
-      batch_meta.push_back(sy);
-      batch_meta.push_back(sz);
-      batch_meta.push_back(region_all);
-      batch_meta.push_back(size_out);
+      if (dir == UNPACK || type == 1)
+      {
+        const int i0 = cuda_seg_begin(dst->data, block, 0);
+        const int j0 = cuda_seg_begin(dst->data, block, 1);
+        const int k0 = cuda_seg_begin(dst->data, block, 2);
+        batch_meta.push_back(i0);
+        batch_meta.push_back(j0);
+        batch_meta.push_back(k0);
+        batch_meta.push_back(sx);
+        batch_meta.push_back(sy);
+        batch_meta.push_back(sz);
+        batch_meta.push_back(region_all);
+        batch_meta.push_back(size_out);
+      }
+      else if (type == 2)
+      {
+        int first_fine[3];
+        if (!cuda_cell_gw3_restrict_params(src->data, dst->data, first_fine))
+          return -1;
+        batch_meta.push_back(sx);
+        batch_meta.push_back(sy);
+        batch_meta.push_back(sz);
+        batch_meta.push_back(region_all);
+        batch_meta.push_back(size_out);
+        batch_meta.push_back(first_fine[0]);
+        batch_meta.push_back(first_fine[1]);
+        batch_meta.push_back(first_fine[2]);
+      }
+      else
+      {
+        int first_fine_ii[3], coarse_lb[3];
+        if (!cuda_cell_gw3_prolong_params(src->data, dst->data, first_fine_ii, coarse_lb))
+          return -1;
+        batch_meta.push_back(sx);
+        batch_meta.push_back(sy);
+        batch_meta.push_back(sz);
+        batch_meta.push_back(region_all);
+        batch_meta.push_back(size_out);
+        batch_meta.push_back(first_fine_ii[0]);
+        batch_meta.push_back(first_fine_ii[1]);
+        batch_meta.push_back(first_fine_ii[2]);
+        batch_meta.push_back(coarse_lb[0]);
+        batch_meta.push_back(coarse_lb[1]);
+        batch_meta.push_back(coarse_lb[2]);
+      }

      size_out += state_count * region_all;
    }
@@ -748,7 +866,7 @@ int cuda_data_packer_device_batched(double *data,
  if (batch_block)
  {
    MyList<var> *batch_vars = (dir == PACK) ? VarLists : VarListd;
-    if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_vars))
+    if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_type, batch_vars))
      return -1;
  }
  return size_out;
@@ -796,6 +914,89 @@ bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
  return has_work;
 }

+// Plain counters filled in by cuda_collect_eligibility_stats() and printed by
+// the transfer diagnostics. Kept as an aggregate so `= {}` zeroes every field.
+struct CudaEligibilityStats
+{
+  // Segment pairs that passed the rank/direction filter.
+  int active;
+  // Pairs by transfer type: 1 = src and dst blocks on the same level,
+  // 2 = src level greater than dst level, 3 = everything else.
+  int type1, type2, type3;
+  // Pairs with a null segment or block pointer.
+  int null_seg;
+  // Pairs whose relevant block reported no resident device state.
+  int no_resident;
+  // Packing pairs whose restrict/prolong parameter check returned false.
+  int param_fail;
+  // Calls rejected because the state count is not supported on the device.
+  int unsupported_state;
+};
+
+// Accumulate diagnostic counters for the (src, dst) segment pairs exchanged
+// between this rank and rank_in. Nothing is packed or transferred here.
+//
+//   src, dst    - segment lists walked in lockstep; the walk stops as soon as
+//                 either list runs out
+//   rank_in     - peer rank used by the activity filter
+//   dir         - PACK: count pairs whose src block is on myrank and whose dst
+//                 block is on rank_in; UNPACK: the reverse
+//   myrank      - rank of the calling process
+//   state_count - when cuda_device_state_count_supported() rejects it, only
+//                 stats.unsupported_state is incremented and the lists are
+//                 not walked
+//   stats       - counters are only ever incremented, so a caller can
+//                 accumulate across several calls
+//
+// Pairs with a null segment or block pointer are counted in stats.null_seg
+// and skipped before any rank is read. Because their ranks are unknown they
+// are not counted in stats.active.
+void cuda_collect_eligibility_stats(MyList<Parallel::gridseg> *src,
+                                    MyList<Parallel::gridseg> *dst,
+                                    int rank_in,
+                                    int dir,
+                                    int myrank,
+                                    int state_count,
+                                    CudaEligibilityStats &stats)
+{
+  if (!cuda_device_state_count_supported(state_count))
+  {
+    stats.unsupported_state++;
+    return;
+  }
+  for (; src && dst; src = src->next, dst = dst->next)
+  {
+    // Validate the pointers first: the rank filter below dereferences both
+    // data and Bg, so checking afterwards would be too late.
+    if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
+    {
+      stats.null_seg++;
+      continue;
+    }
+
+    const bool active =
+        (dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
+        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank);
+    if (!active)
+      continue;
+    stats.active++;
+
+    // Same classification the packer uses: 1 = same level, 2 = src level
+    // greater than dst level, 3 = otherwise.
+    int type;
+    if (src->data->Bg->lev == dst->data->Bg->lev)
+      type = 1;
+    else if (src->data->Bg->lev > dst->data->Bg->lev)
+      type = 2;
+    else
+      type = 3;
+    if (type == 1) stats.type1++;
+    else if (type == 2) stats.type2++;
+    else stats.type3++;
+
+#if USE_CUDA_BSSN
+    if (dir == PACK)
+    {
+      // Packing reads from the source block, so that block needs resident
+      // state; only then are the restrict/prolong parameters checked.
+      if (bssn_cuda_has_resident_state(src->data->Bg) == 0)
+        stats.no_resident++;
+      else if (type == 2)
+      {
+        int first_fine[3];
+        if (!cuda_cell_gw3_restrict_params(src->data, dst->data, first_fine))
+          stats.param_fail++;
+      }
+      else if (type == 3)
+      {
+        int first_fine_ii[3], coarse_lb[3];
+        if (!cuda_cell_gw3_prolong_params(src->data, dst->data, first_fine_ii, coarse_lb))
+          stats.param_fail++;
+      }
+    }
+    else
+    {
+      // Unpacking writes into the destination block.
+      if (bssn_cuda_has_resident_state(dst->data->Bg) == 0)
+        stats.no_resident++;
+    }
+#else
+    (void)type;
+#endif
+  }
+}
+
 bool cuda_pack_to_device_eligible(MyList<Parallel::gridseg> *src,
                                  MyList<Parallel::gridseg> *dst,
                                  int rank_in,
@@ -5379,19 +5580,33 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
      cuda_device_sends += cache.send_buf_is_dev[n] ? 1 : 0;
      cuda_device_recvs += cache.recv_buf_is_dev[n] ? 1 : 0;
    }
-    if (cuda_mpi_diag_enabled())
-    {
-      static int diag_reported = 0;
-      int rep = diag_reported;
-      if (myrank == 0 && rep < 10)
-      {
-        if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1))
-          fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] transfer_cached: device_sends=%d "
-                          "device_recvs=%d cuda_aware_mpi=%d\n",
-                  myrank, cuda_device_sends, cuda_device_recvs,
-                  cuda_aware_mpi_enabled() ? 1 : 0);
-      }
-    }
+	    if (cuda_mpi_diag_enabled())
+	    {
+	      static int diag_reported = 0;
+	      int rep = diag_reported;
+	      if (myrank == 0 && rep < cuda_mpi_diag_limit())
+	      {
+	        if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1))
+	        {
+	          CudaEligibilityStats send_stats = {};
+	          CudaEligibilityStats recv_stats = {};
+	          for (int n = 0; n < cpusize; n++)
+	          {
+	            cuda_collect_eligibility_stats(src[myrank], dst[myrank], n, PACK, myrank, state_count, send_stats);
+	            cuda_collect_eligibility_stats(src[n], dst[n], n, UNPACK, myrank, state_count, recv_stats);
+	          }
+	          fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] transfer_cached: device_sends=%d "
+	                          "device_recvs=%d cuda_aware_mpi=%d send_active=%d type=[%d,%d,%d] "
+	                          "send_nores=%d send_param=%d recv_active=%d recv_type=[%d,%d,%d] recv_nores=%d\n",
+	                  myrank, cuda_device_sends, cuda_device_recvs,
+	                  cuda_aware_mpi_enabled() ? 1 : 0,
+	                  send_stats.active, send_stats.type1, send_stats.type2, send_stats.type3,
+	                  send_stats.no_resident, send_stats.param_fail,
+	                  recv_stats.active, recv_stats.type1, recv_stats.type2, recv_stats.type3,
+	                  recv_stats.no_resident);
+	        }
+	      }
+	    }
  }
  else
  {
@@ -5688,7 +5903,7 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
    {
      static int diag_reported = 0;
      int rep = diag_reported;
-      if (myrank == 0 && rep < 20)
+	      if (myrank == 0 && rep < cuda_mpi_diag_limit())
      {
        if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1))
          fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] Sync_start: device_sends=%d "