Support CUDA finite-difference order selection
This commit is contained in:
@@ -401,6 +401,8 @@ __device__ __forceinline__ double fetch_sym_ord3_direct(const double *src,
|
||||
+ (skF - 1) * d_gp.ex[0] * d_gp.ex[1]];
|
||||
}
|
||||
|
||||
#include "fd_cuda_helpers.cuh"
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* GPU buffer management */
|
||||
/* ------------------------------------------------------------------ */
|
||||
@@ -1729,45 +1731,10 @@ void kern_fderivs_batched(FDerivTables tables, int field_count)
|
||||
const int jF = j0 + 1;
|
||||
const int kF = k0 + 1;
|
||||
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
{
|
||||
fx[tid] = d_gp.d12dx * (
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
|
||||
fy[tid] = d_gp.d12dy * (
|
||||
fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
|
||||
fz[tid] = d_gp.d12dz * (
|
||||
fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
|
||||
}
|
||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
||||
{
|
||||
fx[tid] = d_gp.d2dx * (
|
||||
-fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
||||
+fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
|
||||
fy[tid] = d_gp.d2dy * (
|
||||
-fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
+fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
|
||||
fz[tid] = d_gp.d2dz * (
|
||||
-fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
+fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
|
||||
}
|
||||
else {
|
||||
fx[tid] = 0.0;
|
||||
fy[tid] = 0.0;
|
||||
fz[tid] = 0.0;
|
||||
}
|
||||
fd_compute_first3(src, iF, jF, kF,
|
||||
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||
SoA0, SoA1, SoA2,
|
||||
fx[tid], fy[tid], fz[tid]);
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(128, 4)
|
||||
@@ -1807,6 +1774,12 @@ void kern_fdderivs_batched(FDDerivTables tables, int field_count)
|
||||
const int jF = j0 + 1;
|
||||
const int kF = k0 + 1;
|
||||
|
||||
#if ghost_width != 3
|
||||
fd_compute_second6(src, iF, jF, kF,
|
||||
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||
SoA0, SoA1, SoA2,
|
||||
fxx[tid], fxy[tid], fxz[tid], fyy[tid], fyz[tid], fzz[tid]);
|
||||
#else
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
@@ -1934,12 +1907,43 @@ void kern_fdderivs_batched(FDDerivTables tables, int field_count)
|
||||
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
|
||||
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void gpu_fderivs_batch(int field_count,
|
||||
double *const *src_fields,
|
||||
double *const *fx_fields,
|
||||
double *const *fy_fields,
|
||||
double *const *fz_fields,
|
||||
const int *soa_signs,
|
||||
int all);
|
||||
static void gpu_fdderivs_batch(int field_count,
|
||||
double *const *src_fields,
|
||||
double *const *fxx_fields,
|
||||
double *const *fxy_fields,
|
||||
double *const *fxz_fields,
|
||||
double *const *fyy_fields,
|
||||
double *const *fyz_fields,
|
||||
double *const *fzz_fields,
|
||||
const int *soa_signs,
|
||||
int all);
|
||||
static void gpu_lopsided_kodis_single_batch(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
|
||||
double *d_Sfx, double *d_Sfy, double *d_Sfz,
|
||||
double SoA0, double SoA1, double SoA2,
|
||||
double eps_val, int all);
|
||||
|
||||
/* symmetry_bd on GPU for ord=2, then launch fderivs kernel */
|
||||
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
|
||||
double SoA0, double SoA1, double SoA2, int all)
|
||||
{
|
||||
#if ghost_width != 3
|
||||
double *src_fields[1] = {d_f};
|
||||
double *fx_fields[1] = {d_fx};
|
||||
double *fy_fields[1] = {d_fy};
|
||||
double *fz_fields[1] = {d_fz};
|
||||
const int soa_signs[3] = {(int)SoA0, (int)SoA1, (int)SoA2};
|
||||
gpu_fderivs_batch(1, src_fields, fx_fields, fy_fields, fz_fields, soa_signs, all);
|
||||
#else
|
||||
double *fh = g_buf.d_fh2;
|
||||
const size_t nx = (size_t)g_buf.prev_nx;
|
||||
const size_t ny = (size_t)g_buf.prev_ny;
|
||||
@@ -1948,6 +1952,7 @@ static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
|
||||
|
||||
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||
kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* symmetry_bd on GPU for ord=2, then launch fdderivs kernel */
|
||||
@@ -1956,6 +1961,18 @@ static void gpu_fdderivs(double *d_f,
|
||||
double *d_fyy, double *d_fyz, double *d_fzz,
|
||||
double SoA0, double SoA1, double SoA2, int all)
|
||||
{
|
||||
#if ghost_width != 3
|
||||
double *src_fields[1] = {d_f};
|
||||
double *fxx_fields[1] = {d_fxx};
|
||||
double *fxy_fields[1] = {d_fxy};
|
||||
double *fxz_fields[1] = {d_fxz};
|
||||
double *fyy_fields[1] = {d_fyy};
|
||||
double *fyz_fields[1] = {d_fyz};
|
||||
double *fzz_fields[1] = {d_fzz};
|
||||
const int soa_signs[3] = {(int)SoA0, (int)SoA1, (int)SoA2};
|
||||
gpu_fdderivs_batch(1, src_fields, fxx_fields, fxy_fields, fxz_fields,
|
||||
fyy_fields, fyz_fields, fzz_fields, soa_signs, all);
|
||||
#else
|
||||
double *fh = g_buf.d_fh2;
|
||||
const size_t nx = (size_t)g_buf.prev_nx;
|
||||
const size_t ny = (size_t)g_buf.prev_ny;
|
||||
@@ -1964,6 +1981,7 @@ static void gpu_fdderivs(double *d_f,
|
||||
|
||||
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||
kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void gpu_fderivs_batch(int field_count,
|
||||
@@ -2053,6 +2071,12 @@ void kern_phase10_ricci_batched(const double * __restrict__ gupxx,
|
||||
const int jF = j0 + 1;
|
||||
const int kF = k0 + 1;
|
||||
|
||||
#if ghost_width != 3
|
||||
fd_compute_second6(src, iF, jF, kF,
|
||||
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||
SoA0, SoA1, SoA2,
|
||||
fxx, fxy, fxz, fyy, fyz, fzz);
|
||||
#else
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
@@ -2176,6 +2200,7 @@ void kern_phase10_ricci_batched(const double * __restrict__ gupxx,
|
||||
- fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
|
||||
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
dst[tid] = gupxx[tid] * fxx + gupyy[tid] * fyy + gupzz[tid] * fzz
|
||||
@@ -2240,6 +2265,16 @@ void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap,
|
||||
const int jF = j0 + 1;
|
||||
const int kF = k0 + 1;
|
||||
|
||||
#if ghost_width != 3
|
||||
fd_compute_second6(Lap, iF, jF, kF,
|
||||
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||
1, 1, 1,
|
||||
fxx[tid], fxy[tid], fxz[tid], fyy[tid], fyz[tid], fzz[tid]);
|
||||
fd_compute_first3(chi, iF, jF, kF,
|
||||
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||
1, 1, 1,
|
||||
chix_out[tid], chiy_out[tid], chiz_out[tid]);
|
||||
#else
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
@@ -2393,6 +2428,7 @@ void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap,
|
||||
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
||||
chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Combined ord=3 advection + KO dissipation.
|
||||
@@ -2404,6 +2440,11 @@ static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
|
||||
double SoA0, double SoA1, double SoA2,
|
||||
double eps_val, int all)
|
||||
{
|
||||
#if ghost_width != 3
|
||||
gpu_lopsided_kodis_single_batch(d_f_adv, d_f_ko, d_f_rhs,
|
||||
d_Sfx, d_Sfy, d_Sfz,
|
||||
SoA0, SoA1, SoA2, eps_val, all);
|
||||
#else
|
||||
double *fh = g_buf.d_fh3;
|
||||
const size_t nx = (size_t)g_buf.prev_nx;
|
||||
const size_t ny = (size_t)g_buf.prev_ny;
|
||||
@@ -2419,6 +2460,7 @@ static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
|
||||
}
|
||||
kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(128, 4)
|
||||
@@ -2449,6 +2491,22 @@ void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
|
||||
const int jF = j0 + 1;
|
||||
const int kF = k0 + 1;
|
||||
|
||||
#if ghost_width != 3
|
||||
if (i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
|
||||
const double val =
|
||||
fd_lopsided_axis(adv_src, iF, jF, kF, 0, Sfx[tid], iF, iminF, imaxF,
|
||||
d_gp.dX, SoA0, SoA1, SoA2)
|
||||
+ fd_lopsided_axis(adv_src, iF, jF, kF, 1, Sfy[tid], jF, jminF, jmaxF,
|
||||
d_gp.dY, SoA0, SoA1, SoA2)
|
||||
+ fd_lopsided_axis(adv_src, iF, jF, kF, 2, Sfz[tid], kF, kminF, kmaxF,
|
||||
d_gp.dZ, SoA0, SoA1, SoA2);
|
||||
rhs[tid] += val;
|
||||
}
|
||||
|
||||
rhs[tid] += fd_ko_term(ko_src, iF, jF, kF,
|
||||
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||
eps_val, SoA0, SoA1, SoA2);
|
||||
#else
|
||||
if (i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
|
||||
double val = 0.0;
|
||||
|
||||
@@ -2631,6 +2689,25 @@ void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
|
||||
|
||||
rhs[tid] += (eps_val / cof) * (Dx / d_gp.dX + Dy / d_gp.dY + Dz / d_gp.dZ);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static void gpu_lopsided_kodis_single_batch(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
|
||||
double *d_Sfx, double *d_Sfy, double *d_Sfz,
|
||||
double SoA0, double SoA1, double SoA2,
|
||||
double eps_val, int all)
|
||||
{
|
||||
LopsidedKodisTables tables = {};
|
||||
tables.adv_fields[0] = d_f_adv;
|
||||
tables.ko_fields[0] = d_f_ko;
|
||||
tables.rhs_fields[0] = d_f_rhs;
|
||||
tables.soa_signs[0] = (int)SoA0;
|
||||
tables.soa_signs[1] = (int)SoA1;
|
||||
tables.soa_signs[2] = (int)SoA2;
|
||||
|
||||
dim3 launch_grid((unsigned int)grid((size_t)all), 1u);
|
||||
kern_lopsided_kodis_batched<<<launch_grid, BLK>>>(
|
||||
d_Sfx, d_Sfy, d_Sfz, tables, eps_val);
|
||||
}
|
||||
|
||||
static void gpu_lopsided_kodis_state_batch(double eps_val, int all, bool include_escalar = false)
|
||||
@@ -4624,6 +4701,12 @@ void kern_phase12_13_chi_correction_fused(
|
||||
const int jF = j0 + 1;
|
||||
const int kF = k0 + 1;
|
||||
|
||||
#if ghost_width != 3
|
||||
fd_compute_second6(chi, iF, jF, kF,
|
||||
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||
1, 1, 1,
|
||||
cxx, cxy, cxz, cyy, cyz, czz);
|
||||
#else
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
@@ -4747,6 +4830,7 @@ void kern_phase12_13_chi_correction_fused(
|
||||
- fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||
+ fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
const double cx = chix[tid];
|
||||
@@ -4917,6 +5001,12 @@ void kern_phase15_trK_Aij_gauge(
|
||||
double fyy_v = 0.0, fyz_v = 0.0, fzz_v = 0.0;
|
||||
|
||||
if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
|
||||
#if ghost_width != 3
|
||||
fd_compute_second6(alpn1, iF, jF, kF,
|
||||
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||
1, 1, 1,
|
||||
fxx_v, fxy_v, fxz_v, fyy_v, fyz_v, fzz_v);
|
||||
#else
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
@@ -5040,6 +5130,7 @@ void kern_phase15_trK_Aij_gauge(
|
||||
- fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||
+ fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* raised chi/chi */
|
||||
@@ -5443,15 +5534,15 @@ static void setup_grid_params(int *ex,
|
||||
gp.imaxF = nx;
|
||||
gp.jmaxF = ny;
|
||||
gp.kmaxF = nz;
|
||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = -1;
|
||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = -1;
|
||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = -1;
|
||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = 2 - ghost_width;
|
||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = 2 - ghost_width;
|
||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = 2 - ghost_width;
|
||||
gp.iminF3 = 1;
|
||||
gp.jminF3 = 1;
|
||||
gp.kminF3 = 1;
|
||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = -2;
|
||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = -2;
|
||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = -2;
|
||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = 1 - ghost_width;
|
||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = 1 - ghost_width;
|
||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = 1 - ghost_width;
|
||||
gp.Symmetry = Symmetry;
|
||||
gp.eps = eps;
|
||||
gp.co = co;
|
||||
|
||||
Reference in New Issue
Block a user