Support 2nd and 8th order CUDA AMR paths

This commit is contained in:
2026-05-07 20:31:26 +08:00
parent c4d8d41b25
commit dcc83bafcb
4 changed files with 150 additions and 18 deletions

View File

@@ -245,11 +245,15 @@ bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src,
const Parallel::gridseg *dst, const Parallel::gridseg *dst,
int first_fine[3]) int first_fine[3])
{ {
#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4)) #if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && (ghost_width >= 2) && (ghost_width <= 5)
#if ghost_width == 4 #if ghost_width == 5
const int stencil_hi = 5;
#elif ghost_width == 4
const int stencil_hi = 4; const int stencil_hi = 4;
#else #elif ghost_width == 3
const int stencil_hi = 3; const int stencil_hi = 3;
#else
const int stencil_hi = 2;
#endif #endif
if (!src || !dst || !src->Bg || !dst->Bg) if (!src || !dst || !src->Bg || !dst->Bg)
return false; return false;
@@ -291,11 +295,15 @@ bool cuda_cell_gw3_prolong_params(const Parallel::gridseg *src,
int first_fine_ii[3], int first_fine_ii[3],
int coarse_lb[3]) int coarse_lb[3])
{ {
#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4)) #if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && (ghost_width >= 2) && (ghost_width <= 5)
#if ghost_width == 4 #if ghost_width == 5
const int stencil_hi = 5;
#elif ghost_width == 4
const int stencil_hi = 4; const int stencil_hi = 4;
#else #elif ghost_width == 3
const int stencil_hi = 3; const int stencil_hi = 3;
#else
const int stencil_hi = 2;
#endif #endif
if (!src || !dst || !src->Bg || !dst->Bg) if (!src || !dst || !src->Bg || !dst->Bg)
return false; return false;

View File

@@ -7622,7 +7622,16 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src
{ {
const int state_index = blockIdx.y; const int state_index = blockIdx.y;
if (state_index >= state_count) return; if (state_index >= state_count) return;
#if ghost_width == 4 #if ghost_width == 5
const double c1 = 35.0 / 65536.0;
const double c2 = -405.0 / 65536.0;
const double c3 = 567.0 / 16384.0;
const double c4 = -2205.0 / 16384.0;
const double c5 = 19845.0 / 32768.0;
const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
const double w[10] = {c1, c2, c3, c4, c5, c5, c4, c3, c2, c1};
const int nst = 10;
#elif ghost_width == 4
const double c1 = -5.0 / 2048.0; const double c1 = -5.0 / 2048.0;
const double c2 = 49.0 / 2048.0; const double c2 = 49.0 / 2048.0;
const double c3 = -245.0 / 2048.0; const double c3 = -245.0 / 2048.0;
@@ -7630,13 +7639,19 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src
const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4}; const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4};
const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1}; const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1};
const int nst = 8; const int nst = 8;
#else #elif ghost_width == 3
const double c1 = 3.0 / 256.0; const double c1 = 3.0 / 256.0;
const double c2 = -25.0 / 256.0; const double c2 = -25.0 / 256.0;
const double c3 = 75.0 / 128.0; const double c3 = 75.0 / 128.0;
const int offs[6] = {-2, -1, 0, 1, 2, 3}; const int offs[6] = {-2, -1, 0, 1, 2, 3};
const double w[6] = {c1, c2, c3, c3, c2, c1}; const double w[6] = {c1, c2, c3, c3, c2, c1};
const int nst = 6; const int nst = 6;
#else
const double c1 = -1.0 / 16.0;
const double c2 = 9.0 / 16.0;
const int offs[4] = {-1, 0, 1, 2};
const double w[4] = {c1, c2, c2, c1};
const int nst = 4;
#endif #endif
for (int local = blockIdx.x * blockDim.x + threadIdx.x; for (int local = blockIdx.x * blockDim.x + threadIdx.x;
@@ -7708,7 +7723,16 @@ __global__ void kern_restrict_state_segments_batch(const double * __restrict__ s
const int offset = m[4]; const int offset = m[4];
const int fi0 = m[5], fj0 = m[6], fk0 = m[7]; const int fi0 = m[5], fj0 = m[6], fk0 = m[7];
if (state_index >= state_count) return; if (state_index >= state_count) return;
#if ghost_width == 4 #if ghost_width == 5
const double c1 = 35.0 / 65536.0;
const double c2 = -405.0 / 65536.0;
const double c3 = 567.0 / 16384.0;
const double c4 = -2205.0 / 16384.0;
const double c5 = 19845.0 / 32768.0;
const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
const double w[10] = {c1, c2, c3, c4, c5, c5, c4, c3, c2, c1};
const int nst = 10;
#elif ghost_width == 4
const double c1 = -5.0 / 2048.0; const double c1 = -5.0 / 2048.0;
const double c2 = 49.0 / 2048.0; const double c2 = 49.0 / 2048.0;
const double c3 = -245.0 / 2048.0; const double c3 = -245.0 / 2048.0;
@@ -7716,13 +7740,19 @@ __global__ void kern_restrict_state_segments_batch(const double * __restrict__ s
const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4}; const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4};
const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1}; const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1};
const int nst = 8; const int nst = 8;
#else #elif ghost_width == 3
const double c1 = 3.0 / 256.0; const double c1 = 3.0 / 256.0;
const double c2 = -25.0 / 256.0; const double c2 = -25.0 / 256.0;
const double c3 = 75.0 / 128.0; const double c3 = 75.0 / 128.0;
const int offs[6] = {-2, -1, 0, 1, 2, 3}; const int offs[6] = {-2, -1, 0, 1, 2, 3};
const double w[6] = {c1, c2, c3, c3, c2, c1}; const double w[6] = {c1, c2, c3, c3, c2, c1};
const int nst = 6; const int nst = 6;
#else
const double c1 = -1.0 / 16.0;
const double c2 = 9.0 / 16.0;
const int offs[4] = {-1, 0, 1, 2};
const double w[4] = {c1, c2, c2, c1};
const int nst = 4;
#endif #endif
for (int local = blockIdx.x * blockDim.x + threadIdx.x; for (int local = blockIdx.x * blockDim.x + threadIdx.x;
@@ -7768,7 +7798,22 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_
{ {
const int state_index = blockIdx.y; const int state_index = blockIdx.y;
if (state_index >= state_count) return; if (state_index >= state_count) return;
#if ghost_width == 4 #if ghost_width == 5
const double c1 = 13585.0 / 33554432.0;
const double c2 = -159885.0 / 33554432.0;
const double c3 = 230945.0 / 8388608.0;
const double c4 = -969969.0 / 8388608.0;
const double c5 = 14549535.0 / 16777216.0;
const double c6 = 4849845.0 / 16777216.0;
const double c7 = -692835.0 / 8388608.0;
const double c8 = 188955.0 / 8388608.0;
const double c9 = -138567.0 / 33554432.0;
const double c10 = 12155.0 / 33554432.0;
const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
const double wl[10] = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10};
const double wr[10] = {c10, c9, c8, c7, c6, c5, c4, c3, c2, c1};
const int nst = 10;
#elif ghost_width == 4
const double c1 = -495.0 / 262144.0; const double c1 = -495.0 / 262144.0;
const double c2 = 5005.0 / 262144.0; const double c2 = 5005.0 / 262144.0;
const double c3 = -27027.0 / 262144.0; const double c3 = -27027.0 / 262144.0;
@@ -7781,7 +7826,7 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_
const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8}; const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8};
const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1}; const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1};
const int nst = 8; const int nst = 8;
#else #elif ghost_width == 3
const double c1 = 77.0 / 8192.0; const double c1 = 77.0 / 8192.0;
const double c2 = -693.0 / 8192.0; const double c2 = -693.0 / 8192.0;
const double c3 = 3465.0 / 4096.0; const double c3 = 3465.0 / 4096.0;
@@ -7792,6 +7837,15 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_
const double wl[6] = {c1, c2, c3, c4, c5, c6}; const double wl[6] = {c1, c2, c3, c4, c5, c6};
const double wr[6] = {c6, c5, c4, c3, c2, c1}; const double wr[6] = {c6, c5, c4, c3, c2, c1};
const int nst = 6; const int nst = 6;
#else
const double c1 = -7.0 / 128.0;
const double c2 = 105.0 / 128.0;
const double c3 = 35.0 / 128.0;
const double c4 = -5.0 / 128.0;
const int offs[4] = {-1, 0, 1, 2};
const double wl[4] = {c1, c2, c3, c4};
const double wr[4] = {c4, c3, c2, c1};
const int nst = 4;
#endif #endif
for (int local = blockIdx.x * blockDim.x + threadIdx.x; for (int local = blockIdx.x * blockDim.x + threadIdx.x;
@@ -7847,7 +7901,22 @@ __global__ void kern_prolong_state_segments_batch(const double * __restrict__ sr
const int ii0 = m[5], jj0 = m[6], kk0 = m[7]; const int ii0 = m[5], jj0 = m[6], kk0 = m[7];
const int lbc_i = m[8], lbc_j = m[9], lbc_k = m[10]; const int lbc_i = m[8], lbc_j = m[9], lbc_k = m[10];
if (state_index >= state_count) return; if (state_index >= state_count) return;
#if ghost_width == 4 #if ghost_width == 5
const double c1 = 13585.0 / 33554432.0;
const double c2 = -159885.0 / 33554432.0;
const double c3 = 230945.0 / 8388608.0;
const double c4 = -969969.0 / 8388608.0;
const double c5 = 14549535.0 / 16777216.0;
const double c6 = 4849845.0 / 16777216.0;
const double c7 = -692835.0 / 8388608.0;
const double c8 = 188955.0 / 8388608.0;
const double c9 = -138567.0 / 33554432.0;
const double c10 = 12155.0 / 33554432.0;
const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
const double wl[10] = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10};
const double wr[10] = {c10, c9, c8, c7, c6, c5, c4, c3, c2, c1};
const int nst = 10;
#elif ghost_width == 4
const double c1 = -495.0 / 262144.0; const double c1 = -495.0 / 262144.0;
const double c2 = 5005.0 / 262144.0; const double c2 = 5005.0 / 262144.0;
const double c3 = -27027.0 / 262144.0; const double c3 = -27027.0 / 262144.0;
@@ -7860,7 +7929,7 @@ __global__ void kern_prolong_state_segments_batch(const double * __restrict__ sr
const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8}; const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8};
const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1}; const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1};
const int nst = 8; const int nst = 8;
#else #elif ghost_width == 3
const double c1 = 77.0 / 8192.0; const double c1 = 77.0 / 8192.0;
const double c2 = -693.0 / 8192.0; const double c2 = -693.0 / 8192.0;
const double c3 = 3465.0 / 4096.0; const double c3 = 3465.0 / 4096.0;
@@ -7871,6 +7940,15 @@ __global__ void kern_prolong_state_segments_batch(const double * __restrict__ sr
const double wl[6] = {c1, c2, c3, c4, c5, c6}; const double wl[6] = {c1, c2, c3, c4, c5, c6};
const double wr[6] = {c6, c5, c4, c3, c2, c1}; const double wr[6] = {c6, c5, c4, c3, c2, c1};
const int nst = 6; const int nst = 6;
#else
const double c1 = -7.0 / 128.0;
const double c2 = 105.0 / 128.0;
const double c3 = 35.0 / 128.0;
const double c4 = -5.0 / 128.0;
const int offs[4] = {-1, 0, 1, 2};
const double wl[4] = {c1, c2, c3, c4};
const double wr[4] = {c4, c3, c2, c1};
const int nst = 4;
#endif #endif
for (int local = blockIdx.x * blockDim.x + threadIdx.x; for (int local = blockIdx.x * blockDim.x + threadIdx.x;

View File

@@ -5232,7 +5232,16 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src
{ {
const int state_index = blockIdx.y; const int state_index = blockIdx.y;
if (state_index >= state_count) return; if (state_index >= state_count) return;
#if ghost_width == 4 #if ghost_width == 5
const double c1 = 35.0 / 65536.0;
const double c2 = -405.0 / 65536.0;
const double c3 = 567.0 / 16384.0;
const double c4 = -2205.0 / 16384.0;
const double c5 = 19845.0 / 32768.0;
const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
const double w[10] = {c1, c2, c3, c4, c5, c5, c4, c3, c2, c1};
const int nst = 10;
#elif ghost_width == 4
const double c1 = -5.0 / 2048.0; const double c1 = -5.0 / 2048.0;
const double c2 = 49.0 / 2048.0; const double c2 = 49.0 / 2048.0;
const double c3 = -245.0 / 2048.0; const double c3 = -245.0 / 2048.0;
@@ -5240,13 +5249,19 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src
const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4}; const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4};
const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1}; const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1};
const int nst = 8; const int nst = 8;
#else #elif ghost_width == 3
const double c1 = 3.0 / 256.0; const double c1 = 3.0 / 256.0;
const double c2 = -25.0 / 256.0; const double c2 = -25.0 / 256.0;
const double c3 = 75.0 / 128.0; const double c3 = 75.0 / 128.0;
const int offs[6] = {-2, -1, 0, 1, 2, 3}; const int offs[6] = {-2, -1, 0, 1, 2, 3};
const double w[6] = {c1, c2, c3, c3, c2, c1}; const double w[6] = {c1, c2, c3, c3, c2, c1};
const int nst = 6; const int nst = 6;
#else
const double c1 = -1.0 / 16.0;
const double c2 = 9.0 / 16.0;
const int offs[4] = {-1, 0, 1, 2};
const double w[4] = {c1, c2, c2, c1};
const int nst = 4;
#endif #endif
for (int local = blockIdx.x * blockDim.x + threadIdx.x; for (int local = blockIdx.x * blockDim.x + threadIdx.x;
@@ -5289,7 +5304,22 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_
{ {
const int state_index = blockIdx.y; const int state_index = blockIdx.y;
if (state_index >= state_count) return; if (state_index >= state_count) return;
#if ghost_width == 4 #if ghost_width == 5
const double c1 = 13585.0 / 33554432.0;
const double c2 = -159885.0 / 33554432.0;
const double c3 = 230945.0 / 8388608.0;
const double c4 = -969969.0 / 8388608.0;
const double c5 = 14549535.0 / 16777216.0;
const double c6 = 4849845.0 / 16777216.0;
const double c7 = -692835.0 / 8388608.0;
const double c8 = 188955.0 / 8388608.0;
const double c9 = -138567.0 / 33554432.0;
const double c10 = 12155.0 / 33554432.0;
const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5};
const double wl[10] = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10};
const double wr[10] = {c10, c9, c8, c7, c6, c5, c4, c3, c2, c1};
const int nst = 10;
#elif ghost_width == 4
const double c1 = -495.0 / 262144.0; const double c1 = -495.0 / 262144.0;
const double c2 = 5005.0 / 262144.0; const double c2 = 5005.0 / 262144.0;
const double c3 = -27027.0 / 262144.0; const double c3 = -27027.0 / 262144.0;
@@ -5302,7 +5332,7 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_
const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8}; const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8};
const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1}; const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1};
const int nst = 8; const int nst = 8;
#else #elif ghost_width == 3
const double c1 = 77.0 / 8192.0; const double c1 = 77.0 / 8192.0;
const double c2 = -693.0 / 8192.0; const double c2 = -693.0 / 8192.0;
const double c3 = 3465.0 / 4096.0; const double c3 = 3465.0 / 4096.0;
@@ -5313,6 +5343,15 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_
const double wl[6] = {c1, c2, c3, c4, c5, c6}; const double wl[6] = {c1, c2, c3, c4, c5, c6};
const double wr[6] = {c6, c5, c4, c3, c2, c1}; const double wr[6] = {c6, c5, c4, c3, c2, c1};
const int nst = 6; const int nst = 6;
#else
const double c1 = -7.0 / 128.0;
const double c2 = 105.0 / 128.0;
const double c3 = 35.0 / 128.0;
const double c4 = -5.0 / 128.0;
const int offs[4] = {-1, 0, 1, 2};
const double wl[4] = {c1, c2, c3, c4};
const double wr[4] = {c4, c3, c2, c1};
const int nst = 4;
#endif #endif
for (int local = blockIdx.x * blockDim.x + threadIdx.x; for (int local = blockIdx.x * blockDim.x + threadIdx.x;

View File

@@ -138,6 +138,7 @@ def _stop_cuda_mps(runtime_env):
def _gpu_runtime_env(): def _gpu_runtime_env():
runtime_env = os.environ.copy() runtime_env = os.environ.copy()
finite_difference = str(getattr(input_data, "Finite_Diffenence_Method", "4th-order")).strip()
defaults = { defaults = {
"AMSS_EVOLVE_TIMING": "1", "AMSS_EVOLVE_TIMING": "1",
@@ -160,6 +161,12 @@ def _gpu_runtime_env():
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0", "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
"AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1", "AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1",
} }
if finite_difference in ("2nd-order", "8th-order"):
defaults.update({
"AMSS_INTERP_FAST": "0",
"AMSS_INTERP_GPU": "0",
"AMSS_CUDA_AWARE_MPI": "0",
})
if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"): if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"):
defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1" defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1"
if getattr(input_data, "Equation_Class", "") == "Z4C": if getattr(input_data, "Equation_Class", "") == "Z4C":