Checkpoint Z4C CUDA optimization progress

2026-05-02 08:55:25 +08:00
parent a5c8188305
commit fcd98649f6
4 changed files with 180 additions and 24 deletions
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -193,6 +193,28 @@ bool cuda_build_bssn_host_views(Block *block,
 }
 #endif

+#if USE_CUDA_Z4C && (ABEtype == 2)
+bool cuda_build_z4c_host_views(Block *block, // Fills views[i] with block->fgfs[sgfn] of the i-th list entry; returns true only on full success.
+                               MyList<var> *vars, // variable list, walked through ->next
+                               int state_count, // must equal Z4C_CUDA_STATE_COUNT, otherwise the call fails
+                               double **views) // output array; indices 0 .. Z4C_CUDA_STATE_COUNT-1 are written
+{
+  if (!block || !vars || !views || state_count != Z4C_CUDA_STATE_COUNT) // reject null arguments or an unexpected state count
+    return false;
+  MyList<var> *v = vars;
+  for (int i = 0; i < Z4C_CUDA_STATE_COUNT; ++i)
+  {
+    if (!v) // list ended before Z4C_CUDA_STATE_COUNT entries were seen
+      return false;
+    views[i] = block->fgfs[v->data->sgfn]; // pointer selected by this variable's sgfn index
+    if (!views[i]) // a null fgfs entry fails the build; earlier views[] slots stay written
+      return false;
+    v = v->next;
+  }
+  return v == 0; // true only when the list held exactly Z4C_CUDA_STATE_COUNT entries
+}
+#endif
+
 bool cuda_build_state_soa(MyList<var> *vars,
                          int state_count,
                          double *soa_flat)
@@ -220,6 +242,8 @@ int fortran_idint(double x)

 bool cuda_amr_restrict_device_enabled();
 bool cuda_amr_prolong_device_enabled();
+bool cuda_z4c_amr_prolong_device_enabled();
+bool cuda_z4c_amr_unpack_device_enabled();
 bool cuda_amr_restrict_compare_enabled();
 bool cuda_amr_restrict_batch_enabled();
 bool cuda_device_segment_batch_enabled();
@@ -324,7 +348,8 @@ bool cuda_state_count_direct_supported(int state_count)
 #endif
 }

-bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type)
+bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst,
+                          int type, MyList<var> *VarLists = 0)
 {
  if (!src || !dst || !src->Bg)
    return false;
@@ -342,7 +367,7 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg
  }
  if (type == 2 && !cuda_amr_restrict_device_enabled())
    return false;
-  if (type == 3 && !cuda_amr_prolong_device_enabled())
+  if (type == 3 && (!cuda_amr_prolong_device_enabled() || !cuda_z4c_amr_prolong_device_enabled()))
    return false;
  if (type == 2) {
    int a[3];
@@ -354,7 +379,17 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg
    if (!cuda_cell_gw3_prolong_params(src, dst, a, b))
      return false;
  }
-  return z4c_cuda_has_resident_state(src->Bg) != 0;
+  if (z4c_cuda_has_resident_state(src->Bg) == 0)
+    return false;
+  if (type != 1 && VarLists)
+  {
+    double *view_ptrs[Z4C_CUDA_STATE_COUNT];
+    if (!cuda_build_z4c_host_views(src->Bg, VarLists, Z4C_CUDA_STATE_COUNT, view_ptrs))
+      return false;
+    if (z4c_cuda_resident_state_matches(src->Bg, view_ptrs) == 0)
+      return false;
+  }
+  return true;
 #elif USE_CUDA_BSSN
  if (bssn_cuda_has_resident_state(src->Bg) == 0)
    return false;
@@ -380,12 +415,24 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg
 #endif
 }

-bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type)
+bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type, MyList<var> *VarListd = 0)
 {
  if (type < 1 || type > 3 || !dst || !dst->Bg)
    return false;
 #if USE_CUDA_Z4C && (ABEtype == 2)
-  return z4c_cuda_has_resident_state(dst->Bg) != 0;
+  if (type != 1 && !cuda_z4c_amr_unpack_device_enabled())
+    return false;
+  if (z4c_cuda_has_resident_state(dst->Bg) == 0)
+    return false;
+  if (type != 1 && VarListd)
+  {
+    double *view_ptrs[Z4C_CUDA_STATE_COUNT];
+    if (!cuda_build_z4c_host_views(dst->Bg, VarListd, Z4C_CUDA_STATE_COUNT, view_ptrs))
+      return false;
+    if (z4c_cuda_resident_state_matches(dst->Bg, view_ptrs) == 0)
+      return false;
+  }
+  return true;
 #elif USE_CUDA_BSSN
  return bssn_cuda_has_resident_state(dst->Bg) != 0;
 #else
@@ -507,6 +554,28 @@ bool cuda_amr_prolong_device_enabled()
  return enabled != 0;
 }

+bool cuda_z4c_amr_prolong_device_enabled() // Reports whether AMSS_CUDA_Z4C_AMR_PROLONG_DEVICE is set to a nonzero integer.
+{
+  static int enabled = -1; // -1 means "not read yet"; afterwards holds 0 or 1
+  if (enabled < 0) // the environment is consulted on the first call only; later calls reuse the cached value
+  {
+    const char *env = getenv("AMSS_CUDA_Z4C_AMR_PROLONG_DEVICE");
+    enabled = (env && atoi(env) != 0) ? 1 : 0; // unset, or a value atoi() parses as 0, means disabled
+  }
+  return enabled != 0;
+}
+
+bool cuda_z4c_amr_unpack_device_enabled() // Reports whether AMSS_CUDA_Z4C_AMR_UNPACK_DEVICE is set to a nonzero integer.
+{
+  static int enabled = -1; // -1 means "not read yet"; afterwards holds 0 or 1
+  if (enabled < 0) // the environment is consulted on the first call only; later calls reuse the cached value
+  {
+    const char *env = getenv("AMSS_CUDA_Z4C_AMR_UNPACK_DEVICE");
+    enabled = (env && atoi(env) != 0) ? 1 : 0; // unset, or a value atoi() parses as 0, means disabled
+  }
+  return enabled != 0;
+}
+
 bool cuda_amr_restrict_compare_enabled()
 {
  static int enabled = -1;
@@ -1109,8 +1178,8 @@ int cuda_data_packer_device_batched(double *data,
        return -1;

      Block *block = (dir == PACK) ? src->data->Bg : dst->data->Bg;
-      if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type)) ||
-          (dir == UNPACK && !cuda_can_direct_unpack(dst->data, type)))
+      if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type, VarLists)) ||
+          (dir == UNPACK && !cuda_can_direct_unpack(dst->data, type, VarListd)))
        return -1;

      if (batch_block && (batch_block != block || batch_type != type))
@@ -1195,7 +1264,9 @@ bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
                                   int rank_in,
                                   int dir,
                                   int myrank,
-                                   int state_count)
+                                   int state_count,
+                                   MyList<var> *VarLists,
+                                   MyList<var> *VarListd)
 {
  bool has_work = false;
  while (src && dst)
@@ -1215,12 +1286,12 @@ bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
        type = 3;
      if (dir == PACK)
      {
-        if (!cuda_can_direct_pack(src->data, dst->data, type))
+        if (!cuda_can_direct_pack(src->data, dst->data, type, VarLists))
          return false;
      }
      else
      {
-        if (!cuda_can_direct_unpack(dst->data, type))
+        if (!cuda_can_direct_unpack(dst->data, type, VarListd))
          return false;
      }
    }
@@ -1318,11 +1389,13 @@ bool cuda_pack_to_device_eligible(MyList<Parallel::gridseg> *src,
                                  MyList<Parallel::gridseg> *dst,
                                  int rank_in,
                                  int state_count,
-                                  int myrank)
+                                  int myrank,
+                                  MyList<var> *VarLists,
+                                  MyList<var> *VarListd)
 {
  if (!cuda_aware_mpi_enabled() || !cuda_device_state_count_supported(state_count))
    return false;
-  if (!cuda_segments_device_eligible(src, dst, rank_in, PACK, myrank, state_count))
+  if (!cuda_segments_device_eligible(src, dst, rank_in, PACK, myrank, state_count, VarLists, VarListd))
    return false;
  return true;
 }
@@ -1331,11 +1404,13 @@ bool cuda_recv_to_device_eligible(MyList<Parallel::gridseg> *src,
                                  MyList<Parallel::gridseg> *dst,
                                  int rank_in,
                                  int state_count,
-                                  int myrank)
+                                  int myrank,
+                                  MyList<var> *VarLists,
+                                  MyList<var> *VarListd)
 {
  if (!cuda_aware_mpi_enabled() || !cuda_device_state_count_supported(state_count))
    return false;
-  if (!cuda_segments_device_eligible(src, dst, rank_in, UNPACK, myrank, state_count))
+  if (!cuda_segments_device_eligible(src, dst, rank_in, UNPACK, myrank, state_count, VarLists, VarListd))
    return false;
  return true;
 }
@@ -5133,7 +5208,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
          bool handled_by_cuda = false;
          if (dir == PACK && (type == 1 || s_cuda_aware_pack_active) &&
              cuda_state_count_direct_supported(state_count) &&
-              cuda_can_direct_pack(src->data, dst->data, type))
+              cuda_can_direct_pack(src->data, dst->data, type, VarLists))
          {
            if (s_cuda_aware_pack_active) {
              handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry);
@@ -5148,7 +5223,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
          }
          else if (dir == UNPACK && (type == 1 || s_cuda_aware_pack_active) &&
                   cuda_state_count_direct_supported(state_count) &&
-                   cuda_can_direct_unpack(dst->data, type))
+                   cuda_can_direct_unpack(dst->data, type, VarListd))
          {
            if (s_cuda_aware_pack_active) {
              handled_by_cuda = cuda_direct_unpack_segment_from_device(data + size_out, dst->data, state_count, VarListd);
@@ -5229,6 +5304,24 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
 	            if (cuda_state_count_direct_supported(state_count) &&
 	                dst->data && dst->data->Bg)
 	            {
+#if USE_CUDA_Z4C && (ABEtype == 2)
+              const char *z4c_amr_env = getenv("AMSS_CUDA_Z4C_AMR_DEVICE");
+              if (state_count == Z4C_CUDA_STATE_COUNT && type != 1 &&
+                  z4c_amr_env && atoi(z4c_amr_env) != 0)
+              {
+                double *views[Z4C_CUDA_STATE_COUNT];
+                if (cuda_build_z4c_host_views(dst->data->Bg, VarListd, state_count, views) &&
+                    z4c_cuda_resident_state_matches(dst->data->Bg, views) != 0)
+                {
+                  if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data))
+                  {
+                    cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl;
+                    MPI_Abort(MPI_COMM_WORLD, 1);
+                  }
+                }
+              }
+              else
+#endif
              if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data))
              {
                cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl;
@@ -5906,8 +5999,8 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
  {
    for (int n = 0; n < cpusize; n++)
    {
-      cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank) ? 1 : 0;
-      cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank) ? 1 : 0;
+      cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank, VarList1, VarList2) ? 1 : 0;
+      cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank, VarList1, VarList2) ? 1 : 0;
    }
    cache.recv_buf_is_dev[myrank] = (cache.send_buf_is_dev[myrank] && cache.recv_buf_is_dev[myrank]) ? 1 : 0;
    for (int n = 0; n < cpusize; n++)
@@ -6225,8 +6318,8 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
  {
    for (int n = 0; n < cpusize; n++)
    {
-      cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank) ? 1 : 0;
-      cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank) ? 1 : 0;
+      cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank, VarList, VarList) ? 1 : 0;
+      cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank, VarList, VarList) ? 1 : 0;
    }
    cache.recv_buf_is_dev[myrank] = (cache.send_buf_is_dev[myrank] && cache.recv_buf_is_dev[myrank]) ? 1 : 0;
    for (int n = 0; n < cpusize; n++)