Support 2nd and 8th order CUDA AMR paths

2026-05-07 20:31:26 +08:00
parent c4d8d41b25
commit dcc83bafcb
4 changed files with 150 additions and 18 deletions
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -245,11 +245,15 @@ bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src,
                                   const Parallel::gridseg *dst,
                                   int first_fine[3])
 {
-#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4))
+#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && (ghost_width >= 2) && (ghost_width <= 5)
-#if ghost_width == 4
+#if ghost_width == 5
  const int stencil_hi = 5;
 #elif ghost_width == 4
  const int stencil_hi = 4;
-#else
+#elif ghost_width == 3
  const int stencil_hi = 3;
 #else
  const int stencil_hi = 2;
 #endif
  if (!src || !dst || !src->Bg || !dst->Bg)
    return false;
@@ -291,11 +295,15 @@ bool cuda_cell_gw3_prolong_params(const Parallel::gridseg *src,
                                  int first_fine_ii[3],
                                  int coarse_lb[3])
 {
-#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4))
+#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && (ghost_width >= 2) && (ghost_width <= 5)
-#if ghost_width == 4
+#if ghost_width == 5
  const int stencil_hi = 5;
 #elif ghost_width == 4
  const int stencil_hi = 4;
-#else
+#elif ghost_width == 3
  const int stencil_hi = 3;
 #else
  const int stencil_hi = 2;
 #endif
  if (!src || !dst || !src->Bg || !dst->Bg)
    return false;
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -7622,7 +7622,16 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src
 {
    const int state_index = blockIdx.y;
    if (state_index >= state_count) return;
-#if ghost_width == 4
+#if ghost_width == 5
    const double c1 = 35.0 / 65536.0;
    const double c2 = -405.0 / 65536.0;
    const double c3 = 567.0 / 16384.0;
    const double c4 = -2205.0 / 16384.0;
    const double c5 = 19845.0 / 32768.0;
    const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
    const double w[10] = {c1, c2, c3, c4, c5, c5, c4, c3, c2, c1};
    const int nst = 10;
 #elif ghost_width == 4
    const double c1 = -5.0 / 2048.0;
    const double c2 = 49.0 / 2048.0;
    const double c3 = -245.0 / 2048.0;
@@ -7630,13 +7639,19 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src
    const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4};
    const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1};
    const int nst = 8;
-#else
+#elif ghost_width == 3
    const double c1 = 3.0 / 256.0;
    const double c2 = -25.0 / 256.0;
    const double c3 = 75.0 / 128.0;
    const int offs[6] = {-2, -1, 0, 1, 2, 3};
    const double w[6] = {c1, c2, c3, c3, c2, c1};
    const int nst = 6;
 #else
    const double c1 = -1.0 / 16.0;
    const double c2 = 9.0 / 16.0;
    const int offs[4] = {-1, 0, 1, 2};
    const double w[4] = {c1, c2, c2, c1};
    const int nst = 4;
 #endif
    for (int local = blockIdx.x * blockDim.x + threadIdx.x;
@@ -7708,7 +7723,16 @@ __global__ void kern_restrict_state_segments_batch(const double * __restrict__ s
    const int offset = m[4];
    const int fi0 = m[5], fj0 = m[6], fk0 = m[7];
    if (state_index >= state_count) return;
-#if ghost_width == 4
+#if ghost_width == 5
    const double c1 = 35.0 / 65536.0;
    const double c2 = -405.0 / 65536.0;
    const double c3 = 567.0 / 16384.0;
    const double c4 = -2205.0 / 16384.0;
    const double c5 = 19845.0 / 32768.0;
    const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
    const double w[10] = {c1, c2, c3, c4, c5, c5, c4, c3, c2, c1};
    const int nst = 10;
 #elif ghost_width == 4
    const double c1 = -5.0 / 2048.0;
    const double c2 = 49.0 / 2048.0;
    const double c3 = -245.0 / 2048.0;
@@ -7716,13 +7740,19 @@ __global__ void kern_restrict_state_segments_batch(const double * __restrict__ s
    const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4};
    const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1};
    const int nst = 8;
-#else
+#elif ghost_width == 3
    const double c1 = 3.0 / 256.0;
    const double c2 = -25.0 / 256.0;
    const double c3 = 75.0 / 128.0;
    const int offs[6] = {-2, -1, 0, 1, 2, 3};
    const double w[6] = {c1, c2, c3, c3, c2, c1};
    const int nst = 6;
 #else
    const double c1 = -1.0 / 16.0;
    const double c2 = 9.0 / 16.0;
    const int offs[4] = {-1, 0, 1, 2};
    const double w[4] = {c1, c2, c2, c1};
    const int nst = 4;
 #endif
    for (int local = blockIdx.x * blockDim.x + threadIdx.x;
@@ -7768,7 +7798,22 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_
 {
    const int state_index = blockIdx.y;
    if (state_index >= state_count) return;
-#if ghost_width == 4
+#if ghost_width == 5
    const double c1 = 13585.0 / 33554432.0;
    const double c2 = -159885.0 / 33554432.0;
    const double c3 = 230945.0 / 8388608.0;
    const double c4 = -969969.0 / 8388608.0;
    const double c5 = 14549535.0 / 16777216.0;
    const double c6 = 4849845.0 / 16777216.0;
    const double c7 = -692835.0 / 8388608.0;
    const double c8 = 188955.0 / 8388608.0;
    const double c9 = -138567.0 / 33554432.0;
    const double c10 = 12155.0 / 33554432.0;
    const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
    const double wl[10] = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10};
    const double wr[10] = {c10, c9, c8, c7, c6, c5, c4, c3, c2, c1};
    const int nst = 10;
 #elif ghost_width == 4
    const double c1 = -495.0 / 262144.0;
    const double c2 = 5005.0 / 262144.0;
    const double c3 = -27027.0 / 262144.0;
@@ -7781,7 +7826,7 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_
    const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8};
    const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1};
    const int nst = 8;
-#else
+#elif ghost_width == 3
    const double c1 = 77.0 / 8192.0;
    const double c2 = -693.0 / 8192.0;
    const double c3 = 3465.0 / 4096.0;
@@ -7792,6 +7837,15 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_
    const double wl[6] = {c1, c2, c3, c4, c5, c6};
    const double wr[6] = {c6, c5, c4, c3, c2, c1};
    const int nst = 6;
 #else
    const double c1 = -7.0 / 128.0;
    const double c2 = 105.0 / 128.0;
    const double c3 = 35.0 / 128.0;
    const double c4 = -5.0 / 128.0;
    const int offs[4] = {-1, 0, 1, 2};
    const double wl[4] = {c1, c2, c3, c4};
    const double wr[4] = {c4, c3, c2, c1};
    const int nst = 4;
 #endif
    for (int local = blockIdx.x * blockDim.x + threadIdx.x;
@@ -7847,7 +7901,22 @@ __global__ void kern_prolong_state_segments_batch(const double * __restrict__ sr
    const int ii0 = m[5], jj0 = m[6], kk0 = m[7];
    const int lbc_i = m[8], lbc_j = m[9], lbc_k = m[10];
    if (state_index >= state_count) return;
-#if ghost_width == 4
+#if ghost_width == 5
    const double c1 = 13585.0 / 33554432.0;
    const double c2 = -159885.0 / 33554432.0;
    const double c3 = 230945.0 / 8388608.0;
    const double c4 = -969969.0 / 8388608.0;
    const double c5 = 14549535.0 / 16777216.0;
    const double c6 = 4849845.0 / 16777216.0;
    const double c7 = -692835.0 / 8388608.0;
    const double c8 = 188955.0 / 8388608.0;
    const double c9 = -138567.0 / 33554432.0;
    const double c10 = 12155.0 / 33554432.0;
    const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
    const double wl[10] = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10};
    const double wr[10] = {c10, c9, c8, c7, c6, c5, c4, c3, c2, c1};
    const int nst = 10;
 #elif ghost_width == 4
    const double c1 = -495.0 / 262144.0;
    const double c2 = 5005.0 / 262144.0;
    const double c3 = -27027.0 / 262144.0;
@@ -7860,7 +7929,7 @@ __global__ void kern_prolong_state_segments_batch(const double * __restrict__ sr
    const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8};
    const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1};
    const int nst = 8;
-#else
+#elif ghost_width == 3
    const double c1 = 77.0 / 8192.0;
    const double c2 = -693.0 / 8192.0;
    const double c3 = 3465.0 / 4096.0;
@@ -7871,6 +7940,15 @@ __global__ void kern_prolong_state_segments_batch(const double * __restrict__ sr
    const double wl[6] = {c1, c2, c3, c4, c5, c6};
    const double wr[6] = {c6, c5, c4, c3, c2, c1};
    const int nst = 6;
 #else
    const double c1 = -7.0 / 128.0;
    const double c2 = 105.0 / 128.0;
    const double c3 = 35.0 / 128.0;
    const double c4 = -5.0 / 128.0;
    const int offs[4] = {-1, 0, 1, 2};
    const double wl[4] = {c1, c2, c3, c4};
    const double wr[4] = {c4, c3, c2, c1};
    const int nst = 4;
 #endif
    for (int local = blockIdx.x * blockDim.x + threadIdx.x;
--- a/AMSS_NCKU_source/z4c_rhs_cuda.cu
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu
@@ -5232,7 +5232,16 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src
 {
    const int state_index = blockIdx.y;
    if (state_index >= state_count) return;
-#if ghost_width == 4
+#if ghost_width == 5
    const double c1 = 35.0 / 65536.0;
    const double c2 = -405.0 / 65536.0;
    const double c3 = 567.0 / 16384.0;
    const double c4 = -2205.0 / 16384.0;
    const double c5 = 19845.0 / 32768.0;
    const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
    const double w[10] = {c1, c2, c3, c4, c5, c5, c4, c3, c2, c1};
    const int nst = 10;
 #elif ghost_width == 4
    const double c1 = -5.0 / 2048.0;
    const double c2 = 49.0 / 2048.0;
    const double c3 = -245.0 / 2048.0;
@@ -5240,13 +5249,19 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src
    const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4};
    const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1};
    const int nst = 8;
-#else
+#elif ghost_width == 3
    const double c1 = 3.0 / 256.0;
    const double c2 = -25.0 / 256.0;
    const double c3 = 75.0 / 128.0;
    const int offs[6] = {-2, -1, 0, 1, 2, 3};
    const double w[6] = {c1, c2, c3, c3, c2, c1};
    const int nst = 6;
 #else
    const double c1 = -1.0 / 16.0;
    const double c2 = 9.0 / 16.0;
    const int offs[4] = {-1, 0, 1, 2};
    const double w[4] = {c1, c2, c2, c1};
    const int nst = 4;
 #endif
    for (int local = blockIdx.x * blockDim.x + threadIdx.x;
@@ -5289,7 +5304,22 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_
 {
    const int state_index = blockIdx.y;
    if (state_index >= state_count) return;
-#if ghost_width == 4
+#if ghost_width == 5
    const double c1 = 13585.0 / 33554432.0;
    const double c2 = -159885.0 / 33554432.0;
    const double c3 = 230945.0 / 8388608.0;
    const double c4 = -969969.0 / 8388608.0;
    const double c5 = 14549535.0 / 16777216.0;
    const double c6 = 4849845.0 / 16777216.0;
    const double c7 = -692835.0 / 8388608.0;
    const double c8 = 188955.0 / 8388608.0;
    const double c9 = -138567.0 / 33554432.0;
    const double c10 = 12155.0 / 33554432.0;
    const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
    const double wl[10] = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10};
    const double wr[10] = {c10, c9, c8, c7, c6, c5, c4, c3, c2, c1};
    const int nst = 10;
 #elif ghost_width == 4
    const double c1 = -495.0 / 262144.0;
    const double c2 = 5005.0 / 262144.0;
    const double c3 = -27027.0 / 262144.0;
@@ -5302,7 +5332,7 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_
    const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8};
    const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1};
    const int nst = 8;
-#else
+#elif ghost_width == 3
    const double c1 = 77.0 / 8192.0;
    const double c2 = -693.0 / 8192.0;
    const double c3 = 3465.0 / 4096.0;
@@ -5313,6 +5343,15 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_
    const double wl[6] = {c1, c2, c3, c4, c5, c6};
    const double wr[6] = {c6, c5, c4, c3, c2, c1};
    const int nst = 6;
 #else
    const double c1 = -7.0 / 128.0;
    const double c2 = 105.0 / 128.0;
    const double c3 = 35.0 / 128.0;
    const double c4 = -5.0 / 128.0;
    const int offs[4] = {-1, 0, 1, 2};
    const double wl[4] = {c1, c2, c3, c4};
    const double wr[4] = {c4, c3, c2, c1};
    const int nst = 4;
 #endif
    for (int local = blockIdx.x * blockDim.x + threadIdx.x;
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -138,6 +138,7 @@ def _stop_cuda_mps(runtime_env):
 def _gpu_runtime_env():
    runtime_env = os.environ.copy()
    finite_difference = str(getattr(input_data, "Finite_Diffenence_Method", "4th-order")).strip()
    defaults = {
        "AMSS_EVOLVE_TIMING": "1",
@@ -160,6 +161,12 @@ def _gpu_runtime_env():
        "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
        "AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1",
    }
    if finite_difference in ("2nd-order", "8th-order"):
        defaults.update({
            "AMSS_INTERP_FAST": "0",
            "AMSS_INTERP_GPU": "0",
            "AMSS_CUDA_AWARE_MPI": "0",
        })
    if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"):
        defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1"
    if getattr(input_data, "Equation_Class", "") == "Z4C":