Cover Z4C CUDA AMR restrict prolong

2026-05-07 19:49:09 +08:00
parent 0076b3ca18
commit c4d8d41b25
6 changed files with 321 additions and 32 deletions
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -174,6 +174,27 @@ int cuda_state_var_count(MyList<var> *src_vars, MyList<var> *dst_vars)
  return (src_vars || dst_vars) ? -1 : count;
 }

+#if USE_CUDA_BSSN || USE_CUDA_Z4C
+bool cuda_build_state_soa(MyList<var> *vars, // Flattens each list entry's data->SoA[0..2] into soa_flat, three doubles per entry.
+                          int state_count,   // number of list entries expected; must be > 0
+                          double *soa_flat)  // output array; written at indices 0 .. 3 * state_count - 1
+{
+  if (!vars || !soa_flat || state_count <= 0) // reject null arguments and non-positive counts
+    return false;
+  MyList<var> *v = vars;
+  for (int i = 0; i < state_count; ++i)
+  {
+    if (!v) // list ended before state_count entries were visited
+      return false;
+    soa_flat[3 * i + 0] = v->data->SoA[0]; // entry i occupies soa_flat[3*i .. 3*i + 2]
+    soa_flat[3 * i + 1] = v->data->SoA[1];
+    soa_flat[3 * i + 2] = v->data->SoA[2];
+    v = v->next;
+  }
+  return v == 0; // true only when the list holds exactly state_count entries; soa_flat is already filled even when this is false
+}
+#endif
+
 #if USE_CUDA_BSSN
 bool cuda_build_bssn_host_views(Block *block,
                                MyList<var> *vars,
@@ -201,17 +222,7 @@ bool cuda_build_bssn_soa(MyList<var> *vars,
  if (!vars || !soa_flat ||
      state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
    return false;
-  MyList<var> *v = vars;
-  for (int i = 0; i < state_count; ++i)
-  {
-    if (!v)
-      return false;
-    soa_flat[3 * i + 0] = v->data->SoA[0];
-    soa_flat[3 * i + 1] = v->data->SoA[1];
-    soa_flat[3 * i + 2] = v->data->SoA[2];
-    v = v->next;
-  }
-  return v == 0;
+  return cuda_build_state_soa(vars, state_count, soa_flat);
 }
 #endif

@@ -234,7 +245,7 @@ bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src,
                                   const Parallel::gridseg *dst,
                                   int first_fine[3])
 {
-#if USE_CUDA_BSSN && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4))
+#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4))
 #if ghost_width == 4
  const int stencil_hi = 4;
 #else
@@ -280,7 +291,7 @@ bool cuda_cell_gw3_prolong_params(const Parallel::gridseg *src,
                                  int first_fine_ii[3],
                                  int coarse_lb[3])
 {
-#if USE_CUDA_BSSN && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4))
+#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4))
 #if ghost_width == 4
  const int stencil_hi = 4;
 #else
@@ -355,9 +366,24 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg
  if (!src || !dst || !src->Bg)
    return false;
 #if USE_CUDA_Z4C && (ABEtype == 2)
-  if (type != 1)
+  if (z4c_cuda_has_resident_state(src->Bg) == 0)
    return false;
-  return z4c_cuda_has_resident_state(src->Bg) != 0;
+  if (type == 1)
+    return true;
+  int a[3], b[3];
+  if (type == 2)
+  {
+    if (!cuda_amr_restrict_device_enabled())
+      return false;
+    return cuda_cell_gw3_restrict_params(src, dst, a);
+  }
+  if (type == 3)
+  {
+    if (!cuda_amr_prolong_device_enabled())
+      return false;
+    return cuda_cell_gw3_prolong_params(src, dst, a, b);
+  }
+  return false;
 #elif USE_CUDA_BSSN
  if (bssn_cuda_has_resident_state(src->Bg) == 0)
    return false;
@@ -388,8 +414,6 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type)
  if (type < 1 || type > 3 || !dst || !dst->Bg)
    return false;
 #if USE_CUDA_Z4C && (ABEtype == 2)
-  if (type != 1)
-    return false;
  return z4c_cuda_has_resident_state(dst->Bg) != 0;
 #elif USE_CUDA_BSSN
  return bssn_cuda_has_resident_state(dst->Bg) != 0;
@@ -431,7 +455,7 @@ bool cuda_direct_pack_segment(double *buffer,

  if (type == 2 || type == 3)
  {
-#if USE_CUDA_BSSN
+#if USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))
    if (!cuda_amr_host_staged_enabled())
      return false;
    const int region_all = dst->shape[0] * dst->shape[1] * dst->shape[2];
@@ -788,16 +812,43 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
 #if USE_CUDA_Z4C && (ABEtype == 2)
  if (state_count == Z4C_CUDA_STATE_COUNT)
  {
-    if (type != 1)
-      return false;
    const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
-    const int i0 = cuda_seg_begin(dst, src->Bg, 0);
-    const int j0 = cuda_seg_begin(dst, src->Bg, 1);
-    const int k0 = cuda_seg_begin(dst, src->Bg, 2);
-    const bool ok = z4c_cuda_pack_state_batch_to_device_buffer(
-        src->Bg, state_count, buffer, src->Bg->shape,
-        i0, j0, k0,
-        dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+    bool ok = false;
+    double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
+    const bool have_soa = cuda_build_state_soa(VarLists, state_count, soa_flat);
+    if (type == 1)
+    {
+      const int i0 = cuda_seg_begin(dst, src->Bg, 0);
+      const int j0 = cuda_seg_begin(dst, src->Bg, 1);
+      const int k0 = cuda_seg_begin(dst, src->Bg, 2);
+      ok = z4c_cuda_pack_state_batch_to_device_buffer(
+          src->Bg, state_count, buffer, src->Bg->shape,
+          i0, j0, k0,
+          dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+    }
+    else if (type == 2)
+    {
+      int first_fine[3];
+      if (!cuda_cell_gw3_restrict_params(src, dst, first_fine))
+        return false;
+      ok = z4c_cuda_restrict_state_batch_to_device_buffer(
+          src->Bg, state_count, buffer, src->Bg->shape,
+          dst->shape[0], dst->shape[1], dst->shape[2],
+          first_fine[0], first_fine[1], first_fine[2],
+          have_soa ? soa_flat : 0) == 0;
+    }
+    else if (type == 3)
+    {
+      int first_fine_ii[3], coarse_lb[3];
+      if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb))
+        return false;
+      ok = z4c_cuda_prolong_state_batch_to_device_buffer(
+          src->Bg, state_count, buffer, src->Bg->shape,
+          dst->shape[0], dst->shape[1], dst->shape[2],
+          first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
+          coarse_lb[0], coarse_lb[1], coarse_lb[2],
+          have_soa ? soa_flat : 0) == 0;
+    }
    if (sync_profile_enabled())
      sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
    return ok;