#ifndef AMSS_NCKU_FD_CUDA_HELPERS_CUH #define AMSS_NCKU_FD_CUDA_HELPERS_CUH #ifndef ghost_width #error "ghost_width must be defined before including fd_cuda_helpers.cuh" #endif #if ghost_width < 2 || ghost_width > 5 #error "CUDA finite-difference helpers support ghost_width 2..5" #endif #define AMSS_FD_CENTER_RADIUS (ghost_width - 1) #define AMSS_FD_LK_RADIUS (ghost_width) __device__ __forceinline__ int fd_axis_radius(int qF, int qminF, int qmaxF) { #if AMSS_FD_CENTER_RADIUS >= 4 if (qF - 4 >= qminF && qF + 4 <= qmaxF) return 4; #endif #if AMSS_FD_CENTER_RADIUS >= 3 if (qF - 3 >= qminF && qF + 3 <= qmaxF) return 3; #endif #if AMSS_FD_CENTER_RADIUS >= 2 if (qF - 2 >= qminF && qF + 2 <= qmaxF) return 2; #endif if (qF - 1 >= qminF && qF + 1 <= qmaxF) return 1; return 0; } __device__ __forceinline__ int fd_common_radius(int iF, int jF, int kF, int iminF, int jminF, int kminF, int imaxF, int jmaxF, int kmaxF) { int r = fd_axis_radius(iF, iminF, imaxF); const int ry = fd_axis_radius(jF, jminF, jmaxF); const int rz = fd_axis_radius(kF, kminF, kmaxF); if (ry < r) r = ry; if (rz < r) r = rz; return r; } __device__ __forceinline__ double fd_first_coef(int r, int off) { switch (r) { case 1: if (off == -1) return -1.0; if (off == 1) return 1.0; return 0.0; case 2: if (off == -2) return 1.0; if (off == -1) return -8.0; if (off == 1) return 8.0; if (off == 2) return -1.0; return 0.0; case 3: if (off == -3) return -1.0; if (off == -2) return 9.0; if (off == -1) return -45.0; if (off == 1) return 45.0; if (off == 2) return -9.0; if (off == 3) return 1.0; return 0.0; case 4: if (off == -4) return 3.0; if (off == -3) return -32.0; if (off == -2) return 168.0; if (off == -1) return -672.0; if (off == 1) return 672.0; if (off == 2) return -168.0; if (off == 3) return 32.0; if (off == 4) return -3.0; return 0.0; default: return 0.0; } } __device__ __forceinline__ double fd_second_coef(int r, int off) { switch (r) { case 1: if (off == -1) return 1.0; if (off == 0) return -2.0; if (off == 1) return 1.0; return 0.0; case 2: if (off == -2) return -1.0; if (off == -1) return 16.0; if (off == 0) return -30.0; if (off == 1) return 16.0; if (off == 2) return -1.0; return 0.0; case 3: if (off == -3) return 2.0; if (off == -2) return -27.0; if (off == -1) return 270.0; if (off == 0) return -490.0; if (off == 1) return 270.0; if (off == 2) return -27.0; if (off == 3) return 2.0; return 0.0; case 4: if (off == -4) return -9.0; if (off == -3) return 128.0; if (off == -2) return -1008.0; if (off == -1) return 8064.0; if (off == 0) return -14350.0; if (off == 1) return 8064.0; if (off == 2) return -1008.0; if (off == 3) return 128.0; if (off == 4) return -9.0; return 0.0; default: return 0.0; } } __device__ __forceinline__ double fd_first_denom(int r) { return (r == 4) ? 840.0 : ((r == 3) ? 60.0 : ((r == 2) ? 12.0 : 2.0)); } __device__ __forceinline__ double fd_second_denom(int r) { return (r == 4) ? 5040.0 : ((r == 3) ? 180.0 : ((r == 2) ? 12.0 : 1.0)); } __device__ __forceinline__ double fd_fetch_axis(const double *src, int iF, int jF, int kF, int axis, int off, int SoA0, int SoA1, int SoA2) { if (axis == 0) iF += off; else if (axis == 1) jF += off; else kF += off; return fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2); } __device__ __forceinline__ double fd_fetch_axis2(const double *src, int iF, int jF, int kF, int axis_a, int off_a, int axis_b, int off_b, int SoA0, int SoA1, int SoA2) { if (axis_a == 0) iF += off_a; else if (axis_a == 1) jF += off_a; else kF += off_a; if (axis_b == 0) iF += off_b; else if (axis_b == 1) jF += off_b; else kF += off_b; return fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2); } __device__ __forceinline__ double fd_first_axis_radius(const double *src, int iF, int jF, int kF, int axis, int r, double h, int SoA0, int SoA1, int SoA2) { if (r <= 0) return 0.0; double s = 0.0; #pragma unroll for (int off = -4; off <= 4; ++off) { const double c = fd_first_coef(r, off); if (c != 0.0) { s += c * fd_fetch_axis(src, iF, jF, kF, axis, off, SoA0, SoA1, SoA2); } } return s / (fd_first_denom(r) * h); } __device__ __forceinline__ double fd_second_axis_radius(const double *src, int iF, int jF, int kF, int axis, int r, double h, int SoA0, int SoA1, int SoA2) { if (r <= 0) return 0.0; double s = 0.0; #pragma unroll for (int off = -4; off <= 4; ++off) { const double c = fd_second_coef(r, off); if (c != 0.0) { s += c * fd_fetch_axis(src, iF, jF, kF, axis, off, SoA0, SoA1, SoA2); } } return s / (fd_second_denom(r) * h * h); } __device__ __forceinline__ double fd_mixed_axis_radius(const double *src, int iF, int jF, int kF, int axis_a, int r_a, double h_a, int axis_b, int r_b, double h_b, int SoA0, int SoA1, int SoA2) { if (r_a <= 0 || r_b <= 0) return 0.0; double s = 0.0; #pragma unroll for (int off_a = -4; off_a <= 4; ++off_a) { const double ca = fd_first_coef(r_a, off_a); if (ca == 0.0) continue; #pragma unroll for (int off_b = -4; off_b <= 4; ++off_b) { const double cb = fd_first_coef(r_b, off_b); if (cb != 0.0) { s += ca * cb * fd_fetch_axis2(src, iF, jF, kF, axis_a, off_a, axis_b, off_b, SoA0, SoA1, SoA2); } } } return s / (fd_first_denom(r_a) * fd_first_denom(r_b) * h_a * h_b); } __device__ __forceinline__ void fd_compute_first3(const double *src, int iF, int jF, int kF, int iminF, int jminF, int kminF, int imaxF, int jmaxF, int kmaxF, int SoA0, int SoA1, int SoA2, double &fx, double &fy, double &fz) { #if ghost_width == 3 const int r = fd_common_radius(iF, jF, kF, iminF, jminF, kminF, imaxF, jmaxF, kmaxF); fx = fd_first_axis_radius(src, iF, jF, kF, 0, r, d_gp.dX, SoA0, SoA1, SoA2); fy = fd_first_axis_radius(src, iF, jF, kF, 1, r, d_gp.dY, SoA0, SoA1, SoA2); fz = fd_first_axis_radius(src, iF, jF, kF, 2, r, d_gp.dZ, SoA0, SoA1, SoA2); #else fx = fd_first_axis_radius(src, iF, jF, kF, 0, fd_axis_radius(iF, iminF, imaxF), d_gp.dX, SoA0, SoA1, SoA2); fy = fd_first_axis_radius(src, iF, jF, kF, 1, fd_axis_radius(jF, jminF, jmaxF), d_gp.dY, SoA0, SoA1, SoA2); fz = fd_first_axis_radius(src, iF, jF, kF, 2, fd_axis_radius(kF, kminF, kmaxF), d_gp.dZ, SoA0, SoA1, SoA2); #endif } __device__ __forceinline__ void fd_compute_second6(const double *src, int iF, int jF, int kF, int iminF, int jminF, int kminF, int imaxF, int jmaxF, int kmaxF, int SoA0, int SoA1, int SoA2, double &fxx, double &fxy, double &fxz, double &fyy, double &fyz, double &fzz) { #if ghost_width == 3 const int r = fd_common_radius(iF, jF, kF, iminF, jminF, kminF, imaxF, jmaxF, kmaxF); const int rx = r, ry = r, rz = r; #else const int rx = fd_axis_radius(iF, iminF, imaxF); const int ry = fd_axis_radius(jF, jminF, jmaxF); const int rz = fd_axis_radius(kF, kminF, kmaxF); #endif fxx = fd_second_axis_radius(src, iF, jF, kF, 0, rx, d_gp.dX, SoA0, SoA1, SoA2); fyy = fd_second_axis_radius(src, iF, jF, kF, 1, ry, d_gp.dY, SoA0, SoA1, SoA2); fzz = fd_second_axis_radius(src, iF, jF, kF, 2, rz, d_gp.dZ, SoA0, SoA1, SoA2); fxy = fd_mixed_axis_radius(src, iF, jF, kF, 0, rx, d_gp.dX, 1, ry, d_gp.dY, SoA0, SoA1, SoA2); fxz = fd_mixed_axis_radius(src, iF, jF, kF, 0, rx, d_gp.dX, 2, rz, d_gp.dZ, SoA0, SoA1, SoA2); fyz = fd_mixed_axis_radius(src, iF, jF, kF, 1, ry, d_gp.dY, 2, rz, d_gp.dZ, SoA0, SoA1, SoA2); } __device__ __forceinline__ bool fd_lop_fits(int qF, int qminF, int qmaxF, int dir, int lo, int hi) { for (int off = lo; off <= hi; ++off) { const int q = qF + dir * off; if (q < qminF || q > qmaxF) return false; } return true; } __device__ __forceinline__ double fd_lop_fetch_sum(const double *src, int iF, int jF, int kF, int axis, int dir, const double *coef, int lo, int hi, int SoA0, int SoA1, int SoA2) { double s = 0.0; for (int off = lo; off <= hi; ++off) { const double c = coef[off - lo]; if (c != 0.0) { s += c * fd_fetch_axis(src, iF, jF, kF, axis, dir * off, SoA0, SoA1, SoA2); } } return s; } __device__ __forceinline__ double fd_lopsided_axis(const double *src, int iF, int jF, int kF, int axis, double speed, int qF, int qminF, int qmaxF, double h, int SoA0, int SoA1, int SoA2) { if (speed == 0.0) return 0.0; const int dir = (speed > 0.0) ? 1 : -1; const double mag = (speed > 0.0) ? speed : -speed; #if ghost_width == 2 if (fd_lop_fits(qF, qminF, qmaxF, dir, 0, 2)) { const double c[] = {-3.0, 4.0, -1.0}; return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, 0, 2, SoA0, SoA1, SoA2) / (2.0 * h); } if (fd_lop_fits(qF, qminF, qmaxF, dir, 0, 1)) { const double c[] = {-1.0, 1.0}; return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, 0, 1, SoA0, SoA1, SoA2) / (2.0 * h); } return 0.0; #elif ghost_width == 3 if (fd_lop_fits(qF, qminF, qmaxF, dir, -1, 3)) { const double c[] = {-3.0, -10.0, 18.0, -6.0, 1.0}; return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, -1, 3, SoA0, SoA1, SoA2) / (12.0 * h); } const int r = fd_axis_radius(qF, qminF, qmaxF); return speed * fd_first_axis_radius(src, iF, jF, kF, axis, r, h, SoA0, SoA1, SoA2); #elif ghost_width == 4 if (fd_lop_fits(qF, qminF, qmaxF, dir, -2, 4)) { const double c[] = {2.0, -24.0, -35.0, 80.0, -30.0, 8.0, -1.0}; return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, -2, 4, SoA0, SoA1, SoA2) / (60.0 * h); } if (fd_lop_fits(qF, qminF, qmaxF, dir, -1, 5)) { const double c[] = {-10.0, -77.0, 150.0, -100.0, 50.0, -15.0, 2.0}; return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, -1, 5, SoA0, SoA1, SoA2) / (60.0 * h); } const int r = fd_axis_radius(qF, qminF, qmaxF); return speed * fd_first_axis_radius(src, iF, jF, kF, axis, r, h, SoA0, SoA1, SoA2); #else if (fd_lop_fits(qF, qminF, qmaxF, dir, -3, 5)) { const double c[] = {-5.0, 60.0, -420.0, -378.0, 1050.0, -420.0, 140.0, -30.0, 3.0}; return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, -3, 5, SoA0, SoA1, SoA2) / (840.0 * h); } const int r = fd_axis_radius(qF, qminF, qmaxF); return speed * fd_first_axis_radius(src, iF, jF, kF, axis, r, h, SoA0, SoA1, SoA2); #endif } __device__ __forceinline__ double fd_ko_coef(int r, int off) { const int a = off < 0 ? -off : off; if (r == 2) { if (a == 0) return 6.0; if (a == 1) return -4.0; if (a == 2) return 1.0; } else if (r == 3) { if (a == 0) return -20.0; if (a == 1) return 15.0; if (a == 2) return -6.0; if (a == 3) return 1.0; } else if (r == 4) { if (a == 0) return 70.0; if (a == 1) return -56.0; if (a == 2) return 28.0; if (a == 3) return -8.0; if (a == 4) return 1.0; } else if (r == 5) { if (a == 0) return -252.0; if (a == 1) return 210.0; if (a == 2) return -120.0; if (a == 3) return 45.0; if (a == 4) return -10.0; if (a == 5) return 1.0; } return 0.0; } __device__ __forceinline__ double fd_ko_axis(const double *src, int iF, int jF, int kF, int axis, int r, int SoA0, int SoA1, int SoA2) { double s = 0.0; #pragma unroll for (int off = -5; off <= 5; ++off) { if (off < -r || off > r) continue; const double c = fd_ko_coef(r, off); if (c != 0.0) { s += c * fd_fetch_axis(src, iF, jF, kF, axis, off, SoA0, SoA1, SoA2); } } return s; } __device__ __forceinline__ double fd_ko_term(const double *src, int iF, int jF, int kF, int iminF, int jminF, int kminF, int imaxF, int jmaxF, int kmaxF, double eps_val, int SoA0, int SoA1, int SoA2) { const int r = AMSS_FD_LK_RADIUS; if (eps_val <= 0.0) return 0.0; #if ghost_width >= 4 if (iF - r <= iminF || iF + r >= imaxF || jF - r <= jminF || jF + r >= jmaxF || kF - r <= kminF || kF + r >= kmaxF) { return 0.0; } #else if (iF - r < iminF || iF + r > imaxF || jF - r < jminF || jF + r > jmaxF || kF - r < kminF || kF + r > kmaxF) { return 0.0; } #endif double cof = 1.0; #pragma unroll for (int n = 0; n < 2 * r; ++n) cof *= 2.0; const double sign = (r & 1) ? 1.0 : -1.0; const double dx = fd_ko_axis(src, iF, jF, kF, 0, r, SoA0, SoA1, SoA2); const double dy = fd_ko_axis(src, iF, jF, kF, 1, r, SoA0, SoA1, SoA2); const double dz = fd_ko_axis(src, iF, jF, kF, 2, r, SoA0, SoA1, SoA2); return sign * eps_val * (dx / d_gp.dX + dy / d_gp.dY + dz / d_gp.dZ) / cof; } #endif