From 30b778daa3283a5bc8ae35a2e8c4b3f5db5451fd Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Fri, 1 May 2026 18:51:19 +0800
Subject: [PATCH] Z4C CUDA: add direct state upload/download paths; default to Z4C

---
 AMSS_NCKU_Input.py               |  2 +-
 AMSS_NCKU_source/macrodef.h      |  3 ++-
 AMSS_NCKU_source/z4c_rhs_cuda.cu | 25 +++++++++++++++++++++++++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/AMSS_NCKU_Input.py b/AMSS_NCKU_Input.py
index 73af547..40d252b 100755
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -31,7 +31,7 @@ GPU_Part         = 0.0
 ## Setting the physical system and numerical method
 
 Symmetry                 = "equatorial-symmetry"   ## Symmetry of System: choose equatorial-symmetry、no-symmetry、octant-symmetry
-Equation_Class           = "BSSN"                  ## Evolution Equation: choose "BSSN", "BSSN-EScalar", "BSSN-EM", "Z4C" 
+Equation_Class           = "Z4C"                  ## Evolution Equation: choose "BSSN", "BSSN-EScalar", "BSSN-EM", "Z4C" 
                                                    ## If "BSSN-EScalar" is chosen, it is necessary to set other parameters below
 Initial_Data_Method      = "Ansorg-TwoPuncture"    ## initial data method: choose "Ansorg-TwoPuncture", "Lousto-Analytical", "Cao-Analytical", "KerrSchild-Analytical"
 Time_Evolution_Method    = "runge-kutta-45"        ## time evolution method: choose "runge-kutta-45"
diff --git a/AMSS_NCKU_source/macrodef.h b/AMSS_NCKU_source/macrodef.h
index f55e4f3..7160d1c 100644
--- a/AMSS_NCKU_source/macrodef.h
+++ b/AMSS_NCKU_source/macrodef.h
@@ -10,7 +10,7 @@
 
 #define GaussInt
 
-#define ABEtype 0
+#define ABEtype 2
 
 //#define With_AHF
 #define Psi4type 0
@@ -167,3 +167,4 @@
 #define TINY 1e-10
 
 #endif   /* MICRODEF_H */
+
diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu
index 8addc3e..d6a46d8 100644
--- a/AMSS_NCKU_source/z4c_rhs_cuda.cu
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu
@@ -4676,6 +4676,18 @@ static void compute_patch_boundary_flags(int *ex,
 static void upload_state_inputs(double **state_host, size_t all)
 {
     const size_t bytes = all * sizeof(double);
+    static int direct_upload = -1;
+    if (direct_upload < 0) {
+        const char *env = getenv("AMSS_CUDA_DIRECT_STATE_UPLOAD");
+        direct_upload = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
+    }
+    if (direct_upload) {
+        for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+            CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_state_input_slots[i]], state_host[i],
+                                       bytes, cudaMemcpyHostToDevice));
+        }
+        return;
+    }
     for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
         std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
     }
@@ -5259,6 +5271,19 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
     const size_t bytes = all * sizeof(double);
     StepContext &ctx = ensure_step_ctx(block_tag, all);
+    static int direct_download = -1;
+    if (direct_download < 0) {
+        const char *env = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD");
+        direct_download = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
+    }
+    if (direct_download) {
+        for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+            CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_state_curr[i],
+                                       bytes, cudaMemcpyDeviceToHost));
+        }
+        CUDA_CHECK(cudaDeviceSynchronize());
+        return;
+    }
     CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem,
                           (size_t)BSSN_STATE_COUNT * bytes,
                           cudaMemcpyDeviceToHost));