From 1064a68d1680fa13ff7790ae16c8716dd99b65ba Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Thu, 7 May 2026 21:38:16 +0800
Subject: [PATCH] Optimize BSSN-EM 8th-order AMR transfers

---
 AMSS_NCKU_source/Parallel.C | 235 ++++++++++++++++++++++++++++++++++++
 makefile_and_run.py         |   6 +
 2 files changed, 241 insertions(+)
diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C
index 2d550fa..332f9b5 100644
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -1426,6 +1426,231 @@ int cuda_data_packer_device_batched(double *data,
   }
   return size_out;
 }
+
+bool cuda_ensure_bssn_block_resident_for_pack(Block *block,
+                                              MyList<var> *vars,
+                                              int state_count,
+                                              std::vector<Block *> &uploaded)
+{
+  if (!block)
+    return false;
+  if (bssn_cuda_has_resident_state(block) != 0)
+    return true;
+  for (size_t i = 0; i < uploaded.size(); ++i)
+  {
+    if (uploaded[i] == block)
+      return bssn_cuda_has_resident_state(block) != 0;
+  }
+
+  double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
+  if (!cuda_build_bssn_host_views(block, vars, state_count, views))
+    return false;
+  if (bssn_cuda_upload_resident_state_count(block, block->shape, views, state_count) != 0)
+    return false;
+  uploaded.push_back(block);
+  return bssn_cuda_has_resident_state(block) != 0;
+}
+
+void cuda_host_batch_diag(const char *reason, int state_count, int type)
+{
+  static int reported = 0;
+  const char *env = getenv("AMSS_CUDA_HOST_BATCH_DIAG");
+  if (!env || atoi(env) == 0 || reported >= 32)
+    return;
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  fprintf(stderr,
+          "[AMSS-CUDA-HOST-BATCH][rank %d] disabled reason=%s state_count=%d type=%d\n",
+          rank, reason ? reason : "unknown", state_count, type);
+  fflush(stderr);
+  reported++;
+}
+
+int cuda_transfer_active_length_if_batched_eligible(MyList<Parallel::gridseg> *src,
+                                                    MyList<Parallel::gridseg> *dst,
+                                                    int rank_in,
+                                                    int dir,
+                                                    MyList<var> *VarLists,
+                                                    int state_count,
+                                                    int myrank,
+                                                    std::vector<Block *> &uploaded)
+{
+  if (dir != PACK && dir != UNPACK)
+  {
+    cuda_host_batch_diag("bad_dir", state_count, -1);
+    return -1;
+  }
+  if (!cuda_device_segment_batch_enabled())
+  {
+    cuda_host_batch_diag("segment_batch_off", state_count, -1);
+    return -1;
+  }
+  if (!cuda_device_state_count_supported(state_count))
+  {
+    cuda_host_batch_diag("unsupported_state_count", state_count, -1);
+    return -1;
+  }
+  if (cuda_amr_restrict_compare_enabled())
+  {
+    cuda_host_batch_diag("compare_enabled", state_count, -1);
+    return -1;
+  }
+
+  int total = 0;
+  bool has_work = false;
+  bool has_amr = false;
+  while (src && dst)
+  {
+    const bool active =
+        (dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
+        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank);
+    if (active)
+    {
+      has_work = true;
+      if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
+      {
+        cuda_host_batch_diag("null_segment", state_count, -1);
+        return -1;
+      }
+      int type;
+      if (src->data->Bg->lev == dst->data->Bg->lev)
+        type = 1;
+      else if (src->data->Bg->lev > dst->data->Bg->lev)
+        type = 2;
+      else
+        type = 3;
+
+      if (type == 2 || type == 3)
+        has_amr = true;
+      if (dir == PACK && type == 2 && !cuda_amr_restrict_batch_enabled())
+      {
+        cuda_host_batch_diag("restrict_batch_off", state_count, type);
+        return -1;
+      }
+      if (dir == PACK)
+      {
+        if ((type == 2 || type == 3) &&
+            !cuda_ensure_bssn_block_resident_for_pack(src->data->Bg, VarLists,
+                                                      state_count, uploaded))
+        {
+          cuda_host_batch_diag("resident_upload_failed", state_count, type);
+          return -1;
+        }
+        if (!cuda_can_direct_pack(src->data, dst->data, type))
+        {
+          cuda_host_batch_diag("direct_pack_ineligible", state_count, type);
+          return -1;
+        }
+      }
+      else
+      {
+        if (!cuda_can_direct_unpack(dst->data, type))
+        {
+          cuda_host_batch_diag("direct_unpack_ineligible", state_count, type);
+          return -1;
+        }
+      }
+
+      total += state_count * dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
+    }
+    src = src->next;
+    dst = dst->next;
+  }
+  if (!has_work)
+    return 0;
+  if (!has_amr)
+  {
+    cuda_host_batch_diag("no_amr_segment", state_count, -1);
+    return -1;
+  }
+  return total;
+}
+
+int cuda_data_packer_host_staged_batched(double *host_data,
+                                         MyList<Parallel::gridseg> *src,
+                                         MyList<Parallel::gridseg> *dst,
+                                         int rank_in,
+                                         int dir,
+                                         MyList<var> *VarLists,
+                                         MyList<var> *VarListd,
+                                         int Symmetry)
+{
+  if (!host_data || !cuda_amr_host_staged_enabled())
+  {
+    cuda_host_batch_diag(!host_data ? "null_host_data" : "host_staged_off", -1, -1);
+    return -1;
+  }
+
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  const int state_count = cuda_state_var_count(VarLists, VarListd);
+  if (state_count < 0)
+  {
+    cuda_host_batch_diag("var_list_mismatch", state_count, -1);
+    return -1;
+  }
+
+  std::vector<Block *> uploaded;
+  const int total = cuda_transfer_active_length_if_batched_eligible(src, dst, rank_in,
+                                                                    dir, VarLists,
+                                                                    state_count, myrank,
+                                                                    uploaded);
+  if (total <= 0)
+    return total;
+
+  static double *stage_dev = 0;
+  static int stage_cap = 0;
+  if (total > stage_cap)
+  {
+    free_device_comm_buffer(stage_dev);
+    stage_dev = alloc_device_comm_buffer(total);
+    stage_cap = total;
+  }
+
+  if (dir == UNPACK)
+  {
+    cudaError_t h2d = cudaMemcpy(stage_dev, host_data, (size_t)total * sizeof(double),
+                                 cudaMemcpyHostToDevice);
+    if (h2d != cudaSuccess)
+    {
+      fprintf(stderr, "Parallel: CUDA host-staged batched unpack cudaMemcpy failed, err=%d\n",
+              (int)h2d);
+      return -1;
+    }
+  }
+
+  const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
+  const int packed = cuda_data_packer_device_batched(stage_dev, src, dst, rank_in, dir,
+                                                     VarLists, VarListd, Symmetry);
+  if (packed != total)
+  {
+    cuda_host_batch_diag("device_batched_failed", state_count, -1);
+    return -1;
+  }
+
+  if (dir == PACK)
+  {
+    cudaError_t d2h = cudaMemcpy(host_data, stage_dev, (size_t)total * sizeof(double),
+                                 cudaMemcpyDeviceToHost);
+    if (d2h != cudaSuccess)
+    {
+      fprintf(stderr, "Parallel: CUDA host-staged batched pack cudaMemcpy failed, err=%d\n",
+              (int)d2h);
+      return -1;
+    }
+  }
+
+  if (sync_profile_enabled())
+  {
+    const double dt = MPI_Wtime() - t0;
+    if (dir == PACK)
+      sync_profile_stats().direct_pack_sec += dt;
+    else
+      sync_profile_stats().direct_unpack_sec += dt;
+  }
+  return total;
+}
 #endif
 
 bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
@@ -5347,6 +5572,16 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
     MPI_Abort(MPI_COMM_WORLD, 1);
   }
 
+#if USE_CUDA_BSSN
+  if (data && (dir == PACK || dir == UNPACK))
+  {
+    const int batched = cuda_data_packer_host_staged_batched(data, src, dst, rank_in, dir,
+                                                             VarLists, VarListd, Symmetry);
+    if (batched >= 0)
+      return batched;
+  }
+#endif
+
   int type; /* 1 copy, 2 restrict, 3 prolong */
   if (src->data->Bg->lev == dst->data->Bg->lev)
     type = 1;
diff --git a/makefile_and_run.py b/makefile_and_run.py
index 2edd4ef..6402c3e 100755
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -167,6 +167,12 @@ def _gpu_runtime_env():
             "AMSS_INTERP_GPU": "0",
             "AMSS_CUDA_AWARE_MPI": "0",
         })
+    if finite_difference == "8th-order" and getattr(input_data, "Equation_Class", "") == "BSSN-EM":
+        defaults.update({
+            "AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
+            "AMSS_CUDA_AMR_RESTRICT_BATCH": "1",
+            "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "1",
+        })
     if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"):
         defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1"
     if getattr(input_data, "Equation_Class", "") == "Z4C":