diff --git a/AMSS_NCKU_source/share_func.h b/AMSS_NCKU_source/share_func.h index 5051448..2b509b8 100644 --- a/AMSS_NCKU_source/share_func.h +++ b/AMSS_NCKU_source/share_func.h @@ -5,6 +5,7 @@ #include #include #include +#include /* 主网格:0-based -> 1D */ static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) { const int ex1 = ex[0], ex2 = ex[1]; @@ -87,60 +88,159 @@ static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc * funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3) * enddo */ +static inline void symmetry_bd_impl(int ord, + int shift, + const int extc[3], + const double *__restrict func, + double *__restrict funcc, + const double SoA[3]) +{ + const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2]; + const int nx = extc1 + ord; + const int ny = extc2 + ord; + + const size_t snx = (size_t)nx; + const size_t splane = (size_t)nx * (size_t)ny; + const size_t interior_i = (size_t)shift + 1u; /* iF = 1 */ + const size_t interior_j = ((size_t)shift + 1u) * snx; /* jF = 1 */ + const size_t interior_k = ((size_t)shift + 1u) * splane; /* kF = 1 */ + const size_t interior0 = interior_k + interior_j + interior_i; + + /* 1) funcc(1:extc1,1:extc2,1:extc3) = func */ + for (int k0 = 0; k0 < extc3; ++k0) { + const double *src_k = func + (size_t)k0 * (size_t)extc2 * (size_t)extc1; + const size_t dst_k0 = interior0 + (size_t)k0 * splane; + for (int j0 = 0; j0 < extc2; ++j0) { + const double *src = src_k + (size_t)j0 * (size_t)extc1; + double *dst = funcc + dst_k0 + (size_t)j0 * snx; + memcpy(dst, src, (size_t)extc1 * sizeof(double)); + } + } + + /* 2) funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1) */ + const double s1 = SoA[0]; + if (s1 == 1.0) { + for (int ii = 0; ii < ord; ++ii) { + const size_t dst_i = (size_t)(shift - ii); + const size_t src_i = (size_t)(shift + ii + 1); + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = interior_k + (size_t)k0 * splane + interior_j; + for (int j0 = 0; j0 < extc2; ++j0) { + const size_t off = kbase + (size_t)j0 * snx; + funcc[off + dst_i] = funcc[off + src_i]; + } + } + } + } else if (s1 == -1.0) { + for (int ii = 0; ii < ord; ++ii) { + const size_t dst_i = (size_t)(shift - ii); + const size_t src_i = (size_t)(shift + ii + 1); + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = interior_k + (size_t)k0 * splane + interior_j; + for (int j0 = 0; j0 < extc2; ++j0) { + const size_t off = kbase + (size_t)j0 * snx; + funcc[off + dst_i] = -funcc[off + src_i]; + } + } + } + } else { + for (int ii = 0; ii < ord; ++ii) { + const size_t dst_i = (size_t)(shift - ii); + const size_t src_i = (size_t)(shift + ii + 1); + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = interior_k + (size_t)k0 * splane + interior_j; + for (int j0 = 0; j0 < extc2; ++j0) { + const size_t off = kbase + (size_t)j0 * snx; + funcc[off + dst_i] = funcc[off + src_i] * s1; + } + } + } + } + + /* 3) funcc(:,-j,1:extc3) = funcc(:,j+1,1:extc3)*SoA(2) */ + const double s2 = SoA[1]; + if (s2 == 1.0) { + for (int jj = 0; jj < ord; ++jj) { + const size_t dst_j = (size_t)(shift - jj) * snx; + const size_t src_j = (size_t)(shift + jj + 1) * snx; + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = interior_k + (size_t)k0 * splane; + double *dst = funcc + kbase + dst_j; + const double *src = funcc + kbase + src_j; + for (int i = 0; i < nx; ++i) dst[i] = src[i]; + } + } + } else if (s2 == -1.0) { + for (int jj = 0; jj < ord; ++jj) { + const size_t dst_j = (size_t)(shift - jj) * snx; + const size_t src_j = (size_t)(shift + jj + 1) * snx; + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = interior_k + (size_t)k0 * splane; + double *dst = funcc + kbase + dst_j; + const double *src = funcc + kbase + src_j; + for (int i = 0; i < nx; ++i) dst[i] = -src[i]; + } + } + } else { + for (int jj = 0; jj < ord; ++jj) { + const size_t dst_j = (size_t)(shift - jj) * snx; + const size_t src_j = (size_t)(shift + jj + 1) * snx; + for (int k0 = 0; k0 < extc3; ++k0) { + const size_t kbase = interior_k + (size_t)k0 * splane; + double *dst = funcc + kbase + dst_j; + const double *src = funcc + kbase + src_j; + for (int i = 0; i < nx; ++i) dst[i] = src[i] * s2; + } + } + } + + /* 4) funcc(:,:,-k) = funcc(:,:,k+1)*SoA(3) */ + const double s3 = SoA[2]; + if (s3 == 1.0) { + for (int kk = 0; kk < ord; ++kk) { + const size_t dst_k = (size_t)(shift - kk) * splane; + const size_t src_k = (size_t)(shift + kk + 1) * splane; + double *dst = funcc + dst_k; + const double *src = funcc + src_k; + for (size_t p = 0; p < splane; ++p) dst[p] = src[p]; + } + } else if (s3 == -1.0) { + for (int kk = 0; kk < ord; ++kk) { + const size_t dst_k = (size_t)(shift - kk) * splane; + const size_t src_k = (size_t)(shift + kk + 1) * splane; + double *dst = funcc + dst_k; + const double *src = funcc + src_k; + for (size_t p = 0; p < splane; ++p) dst[p] = -src[p]; + } + } else { + for (int kk = 0; kk < ord; ++kk) { + const size_t dst_k = (size_t)(shift - kk) * splane; + const size_t src_k = (size_t)(shift + kk + 1) * splane; + double *dst = funcc + dst_k; + const double *src = funcc + src_k; + for (size_t p = 0; p < splane; ++p) dst[p] = src[p] * s3; + } + } +} + static inline void symmetry_bd(int ord, const int extc[3], const double *func, double *funcc, const double SoA[3]) { - const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2]; + if (ord <= 0) return; - // 1) funcc(1:extc1,1:extc2,1:extc3) = func - // Fortran 的 (iF=1..extc1) 对应 C 的 func(i0=0..extc1-1) - for (int k0 = 0; k0 < extc3; ++k0) { - for (int j0 = 0; j0 < extc2; ++j0) { - for (int i0 = 0; i0 < extc1; ++i0) { - const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; - funcc[idx_funcc_F(iF, jF, kF, ord, extc)] = func[idx_func0(i0, j0, k0, extc)]; - } - } + /* Fast paths used by current C kernels: ord=2 (derivs), ord=3 (lopsided/KO). */ + if (ord == 2) { + symmetry_bd_impl(2, 1, extc, func, funcc, SoA); + return; + } + if (ord == 3) { + symmetry_bd_impl(3, 2, extc, func, funcc, SoA); + return; } - // 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1) - for (int ii = 0; ii <= ord - 1; ++ii) { - const int iF_dst = -ii; // 0, -1, -2, ... - const int iF_src = ii + 1; // 1, 2, 3, ... - for (int kF = 1; kF <= extc3; ++kF) { - for (int jF = 1; jF <= extc2; ++jF) { - funcc[idx_funcc_F(iF_dst, jF, kF, ord, extc)] = - funcc[idx_funcc_F(iF_src, jF, kF, ord, extc)] * SoA[0]; - } - } - } - - // 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2) - // 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖 - for (int jj = 0; jj <= ord - 1; ++jj) { - const int jF_dst = -jj; - const int jF_src = jj + 1; - for (int kF = 1; kF <= extc3; ++kF) { - for (int iF = -ord + 1; iF <= extc1; ++iF) { - funcc[idx_funcc_F(iF, jF_dst, kF, ord, extc)] = - funcc[idx_funcc_F(iF, jF_src, kF, ord, extc)] * SoA[1]; - } - } - } - - // 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3) - for (int kk = 0; kk <= ord - 1; ++kk) { - const int kF_dst = -kk; - const int kF_src = kk + 1; - for (int jF = -ord + 1; jF <= extc2; ++jF) { - for (int iF = -ord + 1; iF <= extc1; ++iF) { - funcc[idx_funcc_F(iF, jF, kF_dst, ord, extc)] = - funcc[idx_funcc_F(iF, jF, kF_src, ord, extc)] * SoA[2]; - } - } - } + symmetry_bd_impl(ord, ord - 1, extc, func, funcc, SoA); } #endif