From 4484635f0d9197547744c073e940cd1727d9b4ca Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Thu, 9 Apr 2026 18:49:11 +0800
Subject: [PATCH] Pack sync send buffers directly from GPU state

---
 AMSS_NCKU_source/Parallel.C       | 138 ++++++++++++++++++++++++------
 AMSS_NCKU_source/bssn_cuda_step.C |  25 +++++-
 AMSS_NCKU_source/bssn_gpu.cu      |  68 +++++++++++++++
 AMSS_NCKU_source/bssn_gpu.h       |   7 ++
 4 files changed, 210 insertions(+), 28 deletions(-)
diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C
index 3ecb14b..5ebb1b2 100644
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -3,6 +3,7 @@
 #include "fmisc.h"
 #include "prolongrestrict.h"
 #include "bssn_cuda_ops.h"
+#include "bssn_gpu.h"
 #include "misc.h"
 #include "parameters.h"
 #include <cstring>
@@ -43,6 +44,85 @@ struct ParallelTransferContextGuard
   }
 };
 
+bool parallel_can_gpu_pack_segments(MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst,
+                                    int rank_in, MyList<var> *VarLists, MyList<var> *VarListd)
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (!src || !dst)
+    return false;
+
+  if (src->data->Bg->lev != dst->data->Bg->lev)
+    return false;
+
+  while (src && dst)
+  {
+    if ((dst->data->Bg->rank == rank_in) && (src->data->Bg->rank == myrank))
+    {
+      MyList<var> *varls = VarLists;
+      MyList<var> *varld = VarListd;
+      while (varls && varld)
+      {
+        (void)varld;
+        if (!bssn_gpu_find_device_buffer(src->data->Bg->fgfs[varls->data->sgfn]))
+          return false;
+        varls = varls->next;
+        varld = varld->next;
+      }
+      if (varls || varld)
+        return false;
+    }
+    src = src->next;
+    dst = dst->next;
+  }
+  return true;
+}
+
+bool parallel_gpu_pack_segments(double *data,
+                                MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst,
+                                int rank_in, MyList<var> *VarLists, MyList<var> *VarListd)
+{
+  if (!data)
+    return false;
+
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (!src || !dst)
+    return false;
+  if (src->data->Bg->lev != dst->data->Bg->lev)
+    return false;
+
+  int size_out = 0;
+  while (src && dst)
+  {
+    if ((dst->data->Bg->rank == rank_in) && (src->data->Bg->rank == myrank))
+    {
+      MyList<var> *varls = VarLists;
+      MyList<var> *varld = VarListd;
+      while (varls && varld)
+      {
+        (void)varld;
+        if (bssn_gpu_stage_download_region_to_buffer(src->data->Bg->fgfs[varls->data->sgfn],
+                                                     src->data->Bg->shape,
+                                                     src->data->Bg->bbox,
+                                                     src->data->Bg->bbox + dim,
+                                                     dst->data->shape,
+                                                     dst->data->llb,
+                                                     data + size_out))
+          return false;
+        size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
+        varls = varls->next;
+        varld = varld->next;
+      }
+      if (varls || varld)
+        return false;
+    }
+    src = src->next;
+    dst = dst->next;
+  }
+  return true;
+}
+
 void parallel_report_mpi_error(const char *context, int errcode, int req_no)
 {
   char errstr[MPI_MAX_ERROR_STRING];
@@ -4843,27 +4923,30 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
 
   for (int node = 0; node < cpusize; node++)
   {
-    if (node == myrank)
-    {
-      int length;
+    if (node == myrank)
+    {
+      int length;
       if (!cache.lengths_valid) {
         length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
         cache.recv_lengths[node] = length;
       } else {
         length = cache.recv_lengths[node];
       }
-      if (length > 0)
-      {
-        if (length > cache.recv_buf_caps[node])
-        {
-          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
-          cache.recv_bufs[node] = new double[length];
-          cache.recv_buf_caps[node] = length;
-        }
-        data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
-      }
-    }
-    else
+      if (length > 0)
+      {
+        if (length > cache.recv_buf_caps[node])
+        {
+          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
+          cache.recv_bufs[node] = new double[length];
+          cache.recv_buf_caps[node] = length;
+        }
+        bssn_gpu_prepare_host_buffer(cache.recv_bufs[node], length);
+        if (!parallel_can_gpu_pack_segments(src[myrank], dst[myrank], node, VarList, VarList) ||
+            !parallel_gpu_pack_segments(cache.recv_bufs[node], src[myrank], dst[myrank], node, VarList, VarList))
+          data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+      }
+    }
+    else
     {
       int slength;
       if (!cache.lengths_valid) {
@@ -4872,17 +4955,20 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
       } else {
         slength = cache.send_lengths[node];
       }
-      if (slength > 0)
-      {
-        if (slength > cache.send_buf_caps[node])
-        {
-          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
-          cache.send_bufs[node] = new double[slength];
-          cache.send_buf_caps[node] = slength;
-        }
-        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
-        state.req_node[state.req_no] = node;
-        state.req_is_recv[state.req_no] = 0;
+      if (slength > 0)
+      {
+        if (slength > cache.send_buf_caps[node])
+        {
+          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
+          cache.send_bufs[node] = new double[slength];
+          cache.send_buf_caps[node] = slength;
+        }
+        bssn_gpu_prepare_host_buffer(cache.send_bufs[node], slength);
+        if (!parallel_can_gpu_pack_segments(src[myrank], dst[myrank], node, VarList, VarList) ||
+            !parallel_gpu_pack_segments(cache.send_bufs[node], src[myrank], dst[myrank], node, VarList, VarList))
+          data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
+        state.req_node[state.req_no] = node;
+        state.req_is_recv[state.req_no] = 0;
         MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, state.mpi_tag, MPI_COMM_WORLD, cache.reqs + state.req_no++);
       }
       int rlength;
diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C
index e4a759a..4be8d1e 100644
--- a/AMSS_NCKU_source/bssn_cuda_step.C
+++ b/AMSS_NCKU_source/bssn_cuda_step.C
@@ -306,6 +306,26 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
         return true;
       };
 
+  auto can_pack_sync_from_device =
+      [&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
+        if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
+          return false;
+
+        MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
+        while (seg)
+        {
+          MyList<var> *var_it = var_list;
+          while (var_it)
+          {
+            if (!bssn_gpu_find_device_buffer(seg->data->Bg->fgfs[var_it->data->sgfn]))
+              return false;
+            var_it = var_it->next;
+          }
+          seg = seg->next;
+        }
+        return true;
+      };
+
   MyList<Patch> *Pp = GH->PatL[lev];
   while (Pp)
   {
@@ -341,7 +361,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
     Pp = Pp->next;
   }
 
-  if (!ERROR && sync_cache_pre[lev].valid)
+  if (!ERROR && sync_cache_pre[lev].valid && !can_pack_sync_from_device(SynchList_pre, sync_cache_pre[lev]))
     refresh_stage_host_before_sync(SynchList_pre, sync_cache_pre[lev]);
 
   MPI_Request err_req_pre;
@@ -433,7 +453,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
       Pp = Pp->next;
     }
 
-    if (!ERROR && sync_cache_cor[lev].valid && iter_count < 3)
+    if (!ERROR && sync_cache_cor[lev].valid && iter_count < 3 &&
+        !can_pack_sync_from_device(SynchList_cor, sync_cache_cor[lev]))
       refresh_stage_host_before_sync(SynchList_cor, sync_cache_cor[lev]);
 
     MPI_Request err_req_cor;
diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu
index efcef1f..730bdec 100644
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -1099,6 +1099,74 @@ int bssn_gpu_stage_download_region(double *host_ptr,
 	return 0;
 }
 
+int bssn_gpu_stage_download_region_to_buffer(const double *host_src_ptr,
+                                             const int *full_shape,
+                                             const double *full_llb,
+                                             const double *full_uub,
+                                             const int *region_shape,
+                                             const double *region_llb,
+                                             double *host_dst_ptr)
+{
+	if (!host_src_ptr || !host_dst_ptr || !full_shape || !full_llb || !full_uub || !region_shape || !region_llb)
+		return 1;
+
+	const double *device_ptr = bssn_gpu_find_device_buffer(host_src_ptr);
+	if (!device_ptr)
+		return 1;
+
+	int start[3] = {0, 0, 0};
+	for (int i = 0; i < 3; ++i)
+	{
+		if (full_shape[i] <= 0 || region_shape[i] <= 0)
+			return 1;
+
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+		const double dx = (full_uub[i] - full_llb[i]) / static_cast<double>(full_shape[i] - 1);
+		start[i] = static_cast<int>((region_llb[i] - full_llb[i]) / dx + 0.4);
+#else
+#ifdef Cell
+		const double dx = (full_uub[i] - full_llb[i]) / static_cast<double>(full_shape[i]);
+		start[i] = static_cast<int>((region_llb[i] - full_llb[i]) / dx + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+
+		if (start[i] < 0 || start[i] + region_shape[i] > full_shape[i])
+			return 1;
+	}
+
+	cudaMemcpy3DParms parms = {};
+	parms.srcPtr = make_cudaPitchedPtr(const_cast<double *>(device_ptr),
+	                                   static_cast<size_t>(full_shape[0]) * sizeof(double),
+	                                   static_cast<size_t>(full_shape[0]),
+	                                   static_cast<size_t>(full_shape[1]));
+	parms.dstPtr = make_cudaPitchedPtr(host_dst_ptr,
+	                                   static_cast<size_t>(region_shape[0]) * sizeof(double),
+	                                   static_cast<size_t>(region_shape[0]),
+	                                   static_cast<size_t>(region_shape[1]));
+	parms.srcPos = make_cudaPos(static_cast<size_t>(start[0]) * sizeof(double),
+	                            static_cast<size_t>(start[1]),
+	                            static_cast<size_t>(start[2]));
+	parms.dstPos = make_cudaPos(0, 0, 0);
+	parms.extent = make_cudaExtent(static_cast<size_t>(region_shape[0]) * sizeof(double),
+	                               static_cast<size_t>(region_shape[1]),
+	                               static_cast<size_t>(region_shape[2]));
+	parms.kind = cudaMemcpyDeviceToHost;
+
+	cudaError_t err = cudaMemcpy3D(&parms);
+	if (err != cudaSuccess)
+	{
+		cerr << "cudaMemcpy3D(D2H region->buffer) failed: " << cudaGetErrorString(err) << endl;
+		return 1;
+	}
+
+	return 0;
+}
+
 __global__ void test_const_address(double * testd){
 	int _t = blockIdx.x*blockDim.x+threadIdx.x;
 	if(_t == 0)
diff --git a/AMSS_NCKU_source/bssn_gpu.h b/AMSS_NCKU_source/bssn_gpu.h
index d4561e8..1e89ca0 100644
--- a/AMSS_NCKU_source/bssn_gpu.h
+++ b/AMSS_NCKU_source/bssn_gpu.h
@@ -84,6 +84,13 @@ int bssn_gpu_stage_download_region(double *host_ptr,
                                    const double *full_uub,
                                    const int *region_shape,
                                    const double *region_llb);
+int bssn_gpu_stage_download_region_to_buffer(const double *host_src_ptr,
+                                             const int *full_shape,
+                                             const double *full_llb,
+                                             const double *full_uub,
+                                             const int *region_shape,
+                                             const double *region_llb,
+                                             double *host_dst_ptr);
 
 /** Init GPU side data in GPUMeta. */
 // void init_fluid_meta_gpu(GPUMeta *gpu_meta);