diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 1a94d3c..2d550fa 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -245,11 +245,15 @@ bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src, const Parallel::gridseg *dst, int first_fine[3]) { -#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4)) -#if ghost_width == 4 +#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && (ghost_width >= 2) && (ghost_width <= 5) +#if ghost_width == 5 + const int stencil_hi = 5; +#elif ghost_width == 4 const int stencil_hi = 4; -#else +#elif ghost_width == 3 const int stencil_hi = 3; +#else + const int stencil_hi = 2; #endif if (!src || !dst || !src->Bg || !dst->Bg) return false; @@ -291,11 +295,15 @@ bool cuda_cell_gw3_prolong_params(const Parallel::gridseg *src, int first_fine_ii[3], int coarse_lb[3]) { -#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4)) -#if ghost_width == 4 +#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && (ghost_width >= 2) && (ghost_width <= 5) +#if ghost_width == 5 + const int stencil_hi = 5; +#elif ghost_width == 4 const int stencil_hi = 4; -#else +#elif ghost_width == 3 const int stencil_hi = 3; +#else + const int stencil_hi = 2; #endif if (!src || !dst || !src->Bg || !dst->Bg) return false; diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index ed63fae..52b4303 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -7622,7 +7622,16 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src { const int state_index = blockIdx.y; if (state_index >= state_count) return; -#if ghost_width == 4 +#if ghost_width == 5 + const double c1 = 35.0 / 65536.0; + const double c2 = -405.0 / 65536.0; + const double c3 = 567.0 / 16384.0; + const double c4 = -2205.0 / 16384.0; + const double c5 = 19845.0 / 32768.0; + const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5}; + const double w[10] = {c1, c2, c3, c4, c5, c5, c4, c3, c2, c1}; + const int nst = 10; +#elif ghost_width == 4 const double c1 = -5.0 / 2048.0; const double c2 = 49.0 / 2048.0; const double c3 = -245.0 / 2048.0; @@ -7630,13 +7639,19 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4}; const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1}; const int nst = 8; -#else +#elif ghost_width == 3 const double c1 = 3.0 / 256.0; const double c2 = -25.0 / 256.0; const double c3 = 75.0 / 128.0; const int offs[6] = {-2, -1, 0, 1, 2, 3}; const double w[6] = {c1, c2, c3, c3, c2, c1}; const int nst = 6; +#else + const double c1 = -1.0 / 16.0; + const double c2 = 9.0 / 16.0; + const int offs[4] = {-1, 0, 1, 2}; + const double w[4] = {c1, c2, c2, c1}; + const int nst = 4; #endif for (int local = blockIdx.x * blockDim.x + threadIdx.x; @@ -7708,7 +7723,16 @@ __global__ void kern_restrict_state_segments_batch(const double * __restrict__ s const int offset = m[4]; const int fi0 = m[5], fj0 = m[6], fk0 = m[7]; if (state_index >= state_count) return; -#if ghost_width == 4 +#if ghost_width == 5 + const double c1 = 35.0 / 65536.0; + const double c2 = -405.0 / 65536.0; + const double c3 = 567.0 / 16384.0; + const double c4 = -2205.0 / 16384.0; + const double c5 = 19845.0 / 32768.0; + const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5}; + const double w[10] = {c1, c2, c3, c4, c5, c5, c4, c3, c2, c1}; + const int nst = 10; +#elif ghost_width == 4 const double c1 = -5.0 / 2048.0; const double c2 = 49.0 / 2048.0; const double c3 = -245.0 / 2048.0; @@ -7716,13 +7740,19 @@ __global__ void kern_restrict_state_segments_batch(const double * __restrict__ s const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4}; const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1}; const int nst = 8; -#else +#elif ghost_width == 3 const double c1 = 3.0 / 256.0; const double c2 = -25.0 / 256.0; const double c3 = 75.0 / 128.0; const int offs[6] = {-2, -1, 0, 1, 2, 3}; const double w[6] = {c1, c2, c3, c3, c2, c1}; const int nst = 6; +#else + const double c1 = -1.0 / 16.0; + const double c2 = 9.0 / 16.0; + const int offs[4] = {-1, 0, 1, 2}; + const double w[4] = {c1, c2, c2, c1}; + const int nst = 4; #endif for (int local = blockIdx.x * blockDim.x + threadIdx.x; @@ -7768,7 +7798,22 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_ { const int state_index = blockIdx.y; if (state_index >= state_count) return; -#if ghost_width == 4 +#if ghost_width == 5 + const double c1 = 13585.0 / 33554432.0; + const double c2 = -159885.0 / 33554432.0; + const double c3 = 230945.0 / 8388608.0; + const double c4 = -969969.0 / 8388608.0; + const double c5 = 14549535.0 / 16777216.0; + const double c6 = 4849845.0 / 16777216.0; + const double c7 = -692835.0 / 8388608.0; + const double c8 = 188955.0 / 8388608.0; + const double c9 = -138567.0 / 33554432.0; + const double c10 = 12155.0 / 33554432.0; + const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5}; + const double wl[10] = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10}; + const double wr[10] = {c10, c9, c8, c7, c6, c5, c4, c3, c2, c1}; + const int nst = 10; +#elif ghost_width == 4 const double c1 = -495.0 / 262144.0; const double c2 = 5005.0 / 262144.0; const double c3 = -27027.0 / 262144.0; @@ -7781,7 +7826,7 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_ const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8}; const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1}; const int nst = 8; -#else +#elif ghost_width == 3 const double c1 = 77.0 / 8192.0; const double c2 = -693.0 / 8192.0; const double c3 = 3465.0 / 4096.0; @@ -7792,6 +7837,15 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_ const double wl[6] = {c1, c2, c3, c4, c5, c6}; const double wr[6] = {c6, c5, c4, c3, c2, c1}; const int nst = 6; +#else + const double c1 = -7.0 / 128.0; + const double c2 = 105.0 / 128.0; + const double c3 = 35.0 / 128.0; + const double c4 = -5.0 / 128.0; + const int offs[4] = {-1, 0, 1, 2}; + const double wl[4] = {c1, c2, c3, c4}; + const double wr[4] = {c4, c3, c2, c1}; + const int nst = 4; #endif for (int local = blockIdx.x * blockDim.x + threadIdx.x; @@ -7847,7 +7901,22 @@ __global__ void kern_prolong_state_segments_batch(const double * __restrict__ sr const int ii0 = m[5], jj0 = m[6], kk0 = m[7]; const int lbc_i = m[8], lbc_j = m[9], lbc_k = m[10]; if (state_index >= state_count) return; -#if ghost_width == 4 +#if ghost_width == 5 + const double c1 = 13585.0 / 33554432.0; + const double c2 = -159885.0 / 33554432.0; + const double c3 = 230945.0 / 8388608.0; + const double c4 = -969969.0 / 8388608.0; + const double c5 = 14549535.0 / 16777216.0; + const double c6 = 4849845.0 / 16777216.0; + const double c7 = -692835.0 / 8388608.0; + const double c8 = 188955.0 / 8388608.0; + const double c9 = -138567.0 / 33554432.0; + const double c10 = 12155.0 / 33554432.0; + const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5}; + const double wl[10] = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10}; + const double wr[10] = {c10, c9, c8, c7, c6, c5, c4, c3, c2, c1}; + const int nst = 10; +#elif ghost_width == 4 const double c1 = -495.0 / 262144.0; const double c2 = 5005.0 / 262144.0; const double c3 = -27027.0 / 262144.0; @@ -7860,7 +7929,7 @@ __global__ void kern_prolong_state_segments_batch(const double * __restrict__ sr const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8}; const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1}; const int nst = 8; -#else +#elif ghost_width == 3 const double c1 = 77.0 / 8192.0; const double c2 = -693.0 / 8192.0; const double c3 = 3465.0 / 4096.0; @@ -7871,6 +7940,15 @@ __global__ void kern_prolong_state_segments_batch(const double * __restrict__ sr const double wl[6] = {c1, c2, c3, c4, c5, c6}; const double wr[6] = {c6, c5, c4, c3, c2, c1}; const int nst = 6; +#else + const double c1 = -7.0 / 128.0; + const double c2 = 105.0 / 128.0; + const double c3 = 35.0 / 128.0; + const double c4 = -5.0 / 128.0; + const int offs[4] = {-1, 0, 1, 2}; + const double wl[4] = {c1, c2, c3, c4}; + const double wr[4] = {c4, c3, c2, c1}; + const int nst = 4; #endif for (int local = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu index 36de461..4e5b3c9 100644 --- a/AMSS_NCKU_source/z4c_rhs_cuda.cu +++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu @@ -5232,7 +5232,16 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src { const int state_index = blockIdx.y; if (state_index >= state_count) return; -#if ghost_width == 4 +#if ghost_width == 5 + const double c1 = 35.0 / 65536.0; + const double c2 = -405.0 / 65536.0; + const double c3 = 567.0 / 16384.0; + const double c4 = -2205.0 / 16384.0; + const double c5 = 19845.0 / 32768.0; + const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5}; + const double w[10] = {c1, c2, c3, c4, c5, c5, c4, c3, c2, c1}; + const int nst = 10; +#elif ghost_width == 4 const double c1 = -5.0 / 2048.0; const double c2 = 49.0 / 2048.0; const double c3 = -245.0 / 2048.0; @@ -5240,13 +5249,19 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4}; const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1}; const int nst = 8; -#else +#elif ghost_width == 3 const double c1 = 3.0 / 256.0; const double c2 = -25.0 / 256.0; const double c3 = 75.0 / 128.0; const int offs[6] = {-2, -1, 0, 1, 2, 3}; const double w[6] = {c1, c2, c3, c3, c2, c1}; const int nst = 6; +#else + const double c1 = -1.0 / 16.0; + const double c2 = 9.0 / 16.0; + const int offs[4] = {-1, 0, 1, 2}; + const double w[4] = {c1, c2, c2, c1}; + const int nst = 4; #endif for (int local = blockIdx.x * blockDim.x + threadIdx.x; @@ -5289,7 +5304,22 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_ { const int state_index = blockIdx.y; if (state_index >= state_count) return; -#if ghost_width == 4 +#if ghost_width == 5 + const double c1 = 13585.0 / 33554432.0; + const double c2 = -159885.0 / 33554432.0; + const double c3 = 230945.0 / 8388608.0; + const double c4 = -969969.0 / 8388608.0; + const double c5 = 14549535.0 / 16777216.0; + const double c6 = 4849845.0 / 16777216.0; + const double c7 = -692835.0 / 8388608.0; + const double c8 = 188955.0 / 8388608.0; + const double c9 = -138567.0 / 33554432.0; + const double c10 = 12155.0 / 33554432.0; + const int offs[10] = {-4, -3, -2, -1, 0, 1, 2, 3, 4, 5}; + const double wl[10] = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10}; + const double wr[10] = {c10, c9, c8, c7, c6, c5, c4, c3, c2, c1}; + const int nst = 10; +#elif ghost_width == 4 const double c1 = -495.0 / 262144.0; const double c2 = 5005.0 / 262144.0; const double c3 = -27027.0 / 262144.0; @@ -5302,7 +5332,7 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_ const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8}; const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1}; const int nst = 8; -#else +#elif ghost_width == 3 const double c1 = 77.0 / 8192.0; const double c2 = -693.0 / 8192.0; const double c3 = 3465.0 / 4096.0; @@ -5313,6 +5343,15 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_ const double wl[6] = {c1, c2, c3, c4, c5, c6}; const double wr[6] = {c6, c5, c4, c3, c2, c1}; const int nst = 6; +#else + const double c1 = -7.0 / 128.0; + const double c2 = 105.0 / 128.0; + const double c3 = 35.0 / 128.0; + const double c4 = -5.0 / 128.0; + const int offs[4] = {-1, 0, 1, 2}; + const double wl[4] = {c1, c2, c3, c4}; + const double wr[4] = {c4, c3, c2, c1}; + const int nst = 4; #endif for (int local = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/makefile_and_run.py b/makefile_and_run.py index b8971d8..2edd4ef 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -138,6 +138,7 @@ def _stop_cuda_mps(runtime_env): def _gpu_runtime_env(): runtime_env = os.environ.copy() + finite_difference = str(getattr(input_data, "Finite_Diffenence_Method", "4th-order")).strip() defaults = { "AMSS_EVOLVE_TIMING": "1", @@ -160,6 +161,12 @@ def _gpu_runtime_env(): "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0", "AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1", } + if finite_difference in ("2nd-order", "8th-order"): + defaults.update({ + "AMSS_INTERP_FAST": "0", + "AMSS_INTERP_GPU": "0", + "AMSS_CUDA_AWARE_MPI": "0", + }) if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"): defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1" if getattr(input_data, "Equation_Class", "") == "Z4C":