From 35b6ceff023fb0753423b0cabd8d1fe694cb12d6 Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Fri, 1 May 2026 18:03:04 +0800
Subject: [PATCH] Broaden cached CUDA sync paths

---
 AMSS_NCKU_source/Parallel.C          | 42 +++++++++++++++++++
 AMSS_NCKU_source/Z4c_class.C         |  4 +-
 AMSS_NCKU_source/bssnEM_class.C      |  4 +-
 AMSS_NCKU_source/bssnEScalar_class.C |  4 +-
 AMSS_NCKU_source/z4c_rhs_cuda.cu     | 60 ++++++++++++++++++++++++++++
 AMSS_NCKU_source/z4c_rhs_cuda.h      | 14 +++++++
 6 files changed, 122 insertions(+), 6 deletions(-)
diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C
index 3e941ff..3174cc2 100644
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -608,6 +608,24 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
                                         MyList<var> *VarLists,
                                         int Symmetry)
 {
+#if USE_CUDA_Z4C && (ABEtype == 2)
+  if (state_count == Z4C_CUDA_STATE_COUNT)
+  {
+    if (type != 1)
+      return false;
+    const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
+    const int i0 = cuda_seg_begin(dst, src->Bg, 0);
+    const int j0 = cuda_seg_begin(dst, src->Bg, 1);
+    const int k0 = cuda_seg_begin(dst, src->Bg, 2);
+    const bool ok = z4c_cuda_pack_state_batch_to_device_buffer(
+        src->Bg, state_count, buffer, src->Bg->shape,
+        i0, j0, k0,
+        dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+    if (sync_profile_enabled())
+      sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
+    return ok;
+  }
+#endif
 #if USE_CUDA_BSSN
   if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
     return false;
@@ -777,6 +795,22 @@ bool cuda_direct_unpack_segment_from_device(double *buffer,
                                             int state_count,
                                             MyList<var> *VarListd)
 {
+#if USE_CUDA_Z4C && (ABEtype == 2)
+  if (state_count == Z4C_CUDA_STATE_COUNT)
+  {
+    const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
+    const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
+    const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
+    const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
+    const bool ok = z4c_cuda_unpack_state_batch_from_device_buffer(
+        dst->Bg, state_count, buffer, dst->Bg->shape,
+        i0, j0, k0,
+        dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+    if (sync_profile_enabled())
+      sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
+    return ok;
+  }
+#endif
 #if USE_CUDA_BSSN
   if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
     return false;
@@ -856,6 +890,10 @@ bool cuda_unpack_host_region_to_resident(Block *block,
 
 bool cuda_device_state_count_supported(int state_count)
 {
+#if USE_CUDA_Z4C && (ABEtype == 2)
+  if (state_count == Z4C_CUDA_STATE_COUNT)
+    return true;
+#endif
 #if USE_CUDA_BSSN
   return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT;
 #else
@@ -933,6 +971,10 @@ int cuda_data_packer_device_batched(double *data,
   const int state_count = cuda_state_var_count(VarLists, VarListd);
   if (!cuda_device_state_count_supported(state_count))
     return -1;
+#if USE_CUDA_Z4C && (ABEtype == 2)
+  if (state_count == Z4C_CUDA_STATE_COUNT)
+    return -1;
+#endif
 
   int size_out = 0;
   Block *batch_block = 0;
diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C
index 741919f..41afb8a 100644
--- a/AMSS_NCKU_source/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4c_class.C
@@ -537,7 +537,7 @@ void Z4c_class::Step(int lev, int YN)
     MPI_Abort(MPI_COMM_WORLD, 1);
   }
 
-  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
+  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
 
   if (BH_num > 0 && lev == GH->levels - 1)
   {
@@ -635,7 +635,7 @@ void Z4c_class::Step(int lev, int YN)
       MPI_Abort(MPI_COMM_WORLD, 1);
     }
 
-    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
+    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
 
     if (BH_num > 0 && lev == GH->levels - 1)
     {
diff --git a/AMSS_NCKU_source/bssnEM_class.C b/AMSS_NCKU_source/bssnEM_class.C
index e06b701..0a9e199 100644
--- a/AMSS_NCKU_source/bssnEM_class.C
+++ b/AMSS_NCKU_source/bssnEM_class.C
@@ -1221,7 +1221,7 @@ void bssnEM_class::Step(int lev, int YN)
   }
 #endif
 
-  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
+  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
 
 #ifdef WithShell
   if (lev == 0)
@@ -1683,7 +1683,7 @@ void bssnEM_class::Step(int lev, int YN)
     }
 #endif
 
-    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
+    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
 
 #ifdef WithShell
     if (lev == 0)
diff --git a/AMSS_NCKU_source/bssnEScalar_class.C b/AMSS_NCKU_source/bssnEScalar_class.C
index c1e71cd..48c70bc 100644
--- a/AMSS_NCKU_source/bssnEScalar_class.C
+++ b/AMSS_NCKU_source/bssnEScalar_class.C
@@ -993,7 +993,7 @@ void bssnEScalar_class::Step(int lev, int YN)
   }
 #endif
 
-  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
+  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
 
 #ifdef WithShell
   if (lev == 0)
@@ -1349,7 +1349,7 @@ void bssnEScalar_class::Step(int lev, int YN)
     }
 #endif
 
-    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
+    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
 
 #ifdef WithShell
     if (lev == 0)
diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu
index 79a11c1..8addc3e 100644
--- a/AMSS_NCKU_source/z4c_rhs_cuda.cu
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu
@@ -5224,6 +5224,36 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
     }
 }
 
+static void copy_state_region_packed_batch_device_cuda(void *block_tag,
+                                                       int state_count,
+                                                       double *device_buffer,
+                                                       const int *ex,
+                                                       int i0, int j0, int k0,
+                                                       int sx, int sy, int sz,
+                                                       int pack_not_unpack)
+{
+    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
+    if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return;
+
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const int region_all = sx * sy * sz;
+    dim3 launch_grid((unsigned int)grid((size_t)region_all),
+                     (unsigned int)state_count);
+
+    if (pack_not_unpack) {
+        kern_pack_state_region_batch<<<launch_grid, BLK>>>(
+            ctx.d_state_curr_mem, device_buffer, ex[0], ex[1],
+            i0, j0, k0, sx, sy, sz, region_all, state_count,
+            ex[0] * ex[1] * ex[2]);
+    } else {
+        kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
+            ctx.d_state_curr_mem, device_buffer, ex[0], ex[1],
+            i0, j0, k0, sx, sy, sz, region_all, state_count,
+            ex[0] * ex[1] * ex[2]);
+        ctx.state_ready = true;
+    }
+}
+
 static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
 {
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
@@ -7451,6 +7481,36 @@ extern "C" int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
     return 0;
 }
 
+extern "C" int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
+                                                          int state_count,
+                                                          double *device_buffer,
+                                                          int *ex,
+                                                          int i0, int j0, int k0,
+                                                          int sx, int sy, int sz)
+{
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
+                                               i0, j0, k0, sx, sy, sz, 1);
+    return 0;
+}
+
+extern "C" int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
+                                                              int state_count,
+                                                              double *device_buffer,
+                                                              int *ex,
+                                                              int i0, int j0, int k0,
+                                                              int sx, int sy, int sz)
+{
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
+                                               i0, j0, k0, sx, sy, sz, 0);
+    return 0;
+}
+
 extern "C" int z4c_cuda_download_state_subset(void *block_tag,
                                               int *ex,
                                               int subset_count,
diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.h b/AMSS_NCKU_source/z4c_rhs_cuda.h
index a1be80b..212965a 100644
--- a/AMSS_NCKU_source/z4c_rhs_cuda.h
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.h
@@ -60,6 +60,20 @@ int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                  int i0, int j0, int k0,
                                                  int sx, int sy, int sz);
 
+int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
+                                               int state_count,
+                                               double *device_buffer,
+                                               int *ex,
+                                               int i0, int j0, int k0,
+                                               int sx, int sy, int sz);
+
+int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
+                                                   int state_count,
+                                                   double *device_buffer,
+                                                   int *ex,
+                                                   int i0, int j0, int k0,
+                                                   int sx, int sy, int sz);
+
 int z4c_cuda_download_state_subset(void *block_tag,
                                    int *ex,
                                    int subset_count,