Support CUDA finite-difference order selection
This commit is contained in:
@@ -401,6 +401,8 @@ __device__ __forceinline__ double fetch_sym_ord3_direct(const double *src,
|
|||||||
+ (skF - 1) * d_gp.ex[0] * d_gp.ex[1]];
|
+ (skF - 1) * d_gp.ex[0] * d_gp.ex[1]];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#include "fd_cuda_helpers.cuh"
|
||||||
|
|
||||||
/* ------------------------------------------------------------------ */
|
/* ------------------------------------------------------------------ */
|
||||||
/* GPU buffer management */
|
/* GPU buffer management */
|
||||||
/* ------------------------------------------------------------------ */
|
/* ------------------------------------------------------------------ */
|
||||||
@@ -1729,45 +1731,10 @@ void kern_fderivs_batched(FDerivTables tables, int field_count)
|
|||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
fd_compute_first3(src, iF, jF, kF,
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
SoA0, SoA1, SoA2,
|
||||||
{
|
fx[tid], fy[tid], fz[tid]);
|
||||||
fx[tid] = d_gp.d12dx * (
|
|
||||||
fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
|
|
||||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
|
||||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
|
|
||||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
|
|
||||||
fy[tid] = d_gp.d12dy * (
|
|
||||||
fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
|
|
||||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
|
||||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
|
|
||||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
|
|
||||||
fz[tid] = d_gp.d12dz * (
|
|
||||||
fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
|
|
||||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
|
||||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
|
|
||||||
- fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
|
|
||||||
}
|
|
||||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
|
||||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
|
||||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
|
||||||
{
|
|
||||||
fx[tid] = d_gp.d2dx * (
|
|
||||||
-fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
|
||||||
+fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
|
|
||||||
fy[tid] = d_gp.d2dy * (
|
|
||||||
-fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
|
||||||
+fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
|
|
||||||
fz[tid] = d_gp.d2dz * (
|
|
||||||
-fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
|
||||||
+fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
fx[tid] = 0.0;
|
|
||||||
fy[tid] = 0.0;
|
|
||||||
fz[tid] = 0.0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ __launch_bounds__(128, 4)
|
__global__ __launch_bounds__(128, 4)
|
||||||
@@ -1807,6 +1774,12 @@ void kern_fdderivs_batched(FDDerivTables tables, int field_count)
|
|||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
|
#if ghost_width != 3
|
||||||
|
fd_compute_second6(src, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
SoA0, SoA1, SoA2,
|
||||||
|
fxx[tid], fxy[tid], fxz[tid], fyy[tid], fyz[tid], fzz[tid]);
|
||||||
|
#else
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||||
@@ -1934,12 +1907,43 @@ void kern_fdderivs_batched(FDDerivTables tables, int field_count)
|
|||||||
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
|
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
|
||||||
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void gpu_fderivs_batch(int field_count,
|
||||||
|
double *const *src_fields,
|
||||||
|
double *const *fx_fields,
|
||||||
|
double *const *fy_fields,
|
||||||
|
double *const *fz_fields,
|
||||||
|
const int *soa_signs,
|
||||||
|
int all);
|
||||||
|
static void gpu_fdderivs_batch(int field_count,
|
||||||
|
double *const *src_fields,
|
||||||
|
double *const *fxx_fields,
|
||||||
|
double *const *fxy_fields,
|
||||||
|
double *const *fxz_fields,
|
||||||
|
double *const *fyy_fields,
|
||||||
|
double *const *fyz_fields,
|
||||||
|
double *const *fzz_fields,
|
||||||
|
const int *soa_signs,
|
||||||
|
int all);
|
||||||
|
static void gpu_lopsided_kodis_single_batch(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
|
||||||
|
double *d_Sfx, double *d_Sfy, double *d_Sfz,
|
||||||
|
double SoA0, double SoA1, double SoA2,
|
||||||
|
double eps_val, int all);
|
||||||
|
|
||||||
/* symmetry_bd on GPU for ord=2, then launch fderivs kernel */
|
/* symmetry_bd on GPU for ord=2, then launch fderivs kernel */
|
||||||
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
|
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
|
||||||
double SoA0, double SoA1, double SoA2, int all)
|
double SoA0, double SoA1, double SoA2, int all)
|
||||||
{
|
{
|
||||||
|
#if ghost_width != 3
|
||||||
|
double *src_fields[1] = {d_f};
|
||||||
|
double *fx_fields[1] = {d_fx};
|
||||||
|
double *fy_fields[1] = {d_fy};
|
||||||
|
double *fz_fields[1] = {d_fz};
|
||||||
|
const int soa_signs[3] = {(int)SoA0, (int)SoA1, (int)SoA2};
|
||||||
|
gpu_fderivs_batch(1, src_fields, fx_fields, fy_fields, fz_fields, soa_signs, all);
|
||||||
|
#else
|
||||||
double *fh = g_buf.d_fh2;
|
double *fh = g_buf.d_fh2;
|
||||||
const size_t nx = (size_t)g_buf.prev_nx;
|
const size_t nx = (size_t)g_buf.prev_nx;
|
||||||
const size_t ny = (size_t)g_buf.prev_ny;
|
const size_t ny = (size_t)g_buf.prev_ny;
|
||||||
@@ -1948,6 +1952,7 @@ static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
|
|||||||
|
|
||||||
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||||
kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
|
kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/* symmetry_bd on GPU for ord=2, then launch fdderivs kernel */
|
/* symmetry_bd on GPU for ord=2, then launch fdderivs kernel */
|
||||||
@@ -1956,6 +1961,18 @@ static void gpu_fdderivs(double *d_f,
|
|||||||
double *d_fyy, double *d_fyz, double *d_fzz,
|
double *d_fyy, double *d_fyz, double *d_fzz,
|
||||||
double SoA0, double SoA1, double SoA2, int all)
|
double SoA0, double SoA1, double SoA2, int all)
|
||||||
{
|
{
|
||||||
|
#if ghost_width != 3
|
||||||
|
double *src_fields[1] = {d_f};
|
||||||
|
double *fxx_fields[1] = {d_fxx};
|
||||||
|
double *fxy_fields[1] = {d_fxy};
|
||||||
|
double *fxz_fields[1] = {d_fxz};
|
||||||
|
double *fyy_fields[1] = {d_fyy};
|
||||||
|
double *fyz_fields[1] = {d_fyz};
|
||||||
|
double *fzz_fields[1] = {d_fzz};
|
||||||
|
const int soa_signs[3] = {(int)SoA0, (int)SoA1, (int)SoA2};
|
||||||
|
gpu_fdderivs_batch(1, src_fields, fxx_fields, fxy_fields, fxz_fields,
|
||||||
|
fyy_fields, fyz_fields, fzz_fields, soa_signs, all);
|
||||||
|
#else
|
||||||
double *fh = g_buf.d_fh2;
|
double *fh = g_buf.d_fh2;
|
||||||
const size_t nx = (size_t)g_buf.prev_nx;
|
const size_t nx = (size_t)g_buf.prev_nx;
|
||||||
const size_t ny = (size_t)g_buf.prev_ny;
|
const size_t ny = (size_t)g_buf.prev_ny;
|
||||||
@@ -1964,6 +1981,7 @@ static void gpu_fdderivs(double *d_f,
|
|||||||
|
|
||||||
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||||
kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
|
kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static void gpu_fderivs_batch(int field_count,
|
static void gpu_fderivs_batch(int field_count,
|
||||||
@@ -2053,6 +2071,12 @@ void kern_phase10_ricci_batched(const double * __restrict__ gupxx,
|
|||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
|
#if ghost_width != 3
|
||||||
|
fd_compute_second6(src, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
SoA0, SoA1, SoA2,
|
||||||
|
fxx, fxy, fxz, fyy, fyz, fzz);
|
||||||
|
#else
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||||
@@ -2176,6 +2200,7 @@ void kern_phase10_ricci_batched(const double * __restrict__ gupxx,
|
|||||||
- fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
|
- fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
|
||||||
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
|
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
dst[tid] = gupxx[tid] * fxx + gupyy[tid] * fyy + gupzz[tid] * fzz
|
dst[tid] = gupxx[tid] * fxx + gupyy[tid] * fyy + gupzz[tid] * fzz
|
||||||
@@ -2240,6 +2265,16 @@ void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap,
|
|||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
|
#if ghost_width != 3
|
||||||
|
fd_compute_second6(Lap, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
1, 1, 1,
|
||||||
|
fxx[tid], fxy[tid], fxz[tid], fyy[tid], fyz[tid], fzz[tid]);
|
||||||
|
fd_compute_first3(chi, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
1, 1, 1,
|
||||||
|
chix_out[tid], chiy_out[tid], chiz_out[tid]);
|
||||||
|
#else
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||||
@@ -2393,6 +2428,7 @@ void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap,
|
|||||||
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
||||||
chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
|
chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Combined ord=3 advection + KO dissipation.
|
/* Combined ord=3 advection + KO dissipation.
|
||||||
@@ -2404,6 +2440,11 @@ static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
|
|||||||
double SoA0, double SoA1, double SoA2,
|
double SoA0, double SoA1, double SoA2,
|
||||||
double eps_val, int all)
|
double eps_val, int all)
|
||||||
{
|
{
|
||||||
|
#if ghost_width != 3
|
||||||
|
gpu_lopsided_kodis_single_batch(d_f_adv, d_f_ko, d_f_rhs,
|
||||||
|
d_Sfx, d_Sfy, d_Sfz,
|
||||||
|
SoA0, SoA1, SoA2, eps_val, all);
|
||||||
|
#else
|
||||||
double *fh = g_buf.d_fh3;
|
double *fh = g_buf.d_fh3;
|
||||||
const size_t nx = (size_t)g_buf.prev_nx;
|
const size_t nx = (size_t)g_buf.prev_nx;
|
||||||
const size_t ny = (size_t)g_buf.prev_ny;
|
const size_t ny = (size_t)g_buf.prev_ny;
|
||||||
@@ -2419,6 +2460,7 @@ static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
|
|||||||
}
|
}
|
||||||
kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
|
kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ __launch_bounds__(128, 4)
|
__global__ __launch_bounds__(128, 4)
|
||||||
@@ -2449,6 +2491,22 @@ void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
|
|||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
|
#if ghost_width != 3
|
||||||
|
if (i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
|
||||||
|
const double val =
|
||||||
|
fd_lopsided_axis(adv_src, iF, jF, kF, 0, Sfx[tid], iF, iminF, imaxF,
|
||||||
|
d_gp.dX, SoA0, SoA1, SoA2)
|
||||||
|
+ fd_lopsided_axis(adv_src, iF, jF, kF, 1, Sfy[tid], jF, jminF, jmaxF,
|
||||||
|
d_gp.dY, SoA0, SoA1, SoA2)
|
||||||
|
+ fd_lopsided_axis(adv_src, iF, jF, kF, 2, Sfz[tid], kF, kminF, kmaxF,
|
||||||
|
d_gp.dZ, SoA0, SoA1, SoA2);
|
||||||
|
rhs[tid] += val;
|
||||||
|
}
|
||||||
|
|
||||||
|
rhs[tid] += fd_ko_term(ko_src, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
eps_val, SoA0, SoA1, SoA2);
|
||||||
|
#else
|
||||||
if (i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
|
if (i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
|
||||||
double val = 0.0;
|
double val = 0.0;
|
||||||
|
|
||||||
@@ -2631,6 +2689,25 @@ void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
|
|||||||
|
|
||||||
rhs[tid] += (eps_val / cof) * (Dx / d_gp.dX + Dy / d_gp.dY + Dz / d_gp.dZ);
|
rhs[tid] += (eps_val / cof) * (Dx / d_gp.dX + Dy / d_gp.dY + Dz / d_gp.dZ);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void gpu_lopsided_kodis_single_batch(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
|
||||||
|
double *d_Sfx, double *d_Sfy, double *d_Sfz,
|
||||||
|
double SoA0, double SoA1, double SoA2,
|
||||||
|
double eps_val, int all)
|
||||||
|
{
|
||||||
|
LopsidedKodisTables tables = {};
|
||||||
|
tables.adv_fields[0] = d_f_adv;
|
||||||
|
tables.ko_fields[0] = d_f_ko;
|
||||||
|
tables.rhs_fields[0] = d_f_rhs;
|
||||||
|
tables.soa_signs[0] = (int)SoA0;
|
||||||
|
tables.soa_signs[1] = (int)SoA1;
|
||||||
|
tables.soa_signs[2] = (int)SoA2;
|
||||||
|
|
||||||
|
dim3 launch_grid((unsigned int)grid((size_t)all), 1u);
|
||||||
|
kern_lopsided_kodis_batched<<<launch_grid, BLK>>>(
|
||||||
|
d_Sfx, d_Sfy, d_Sfz, tables, eps_val);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void gpu_lopsided_kodis_state_batch(double eps_val, int all, bool include_escalar = false)
|
static void gpu_lopsided_kodis_state_batch(double eps_val, int all, bool include_escalar = false)
|
||||||
@@ -4624,6 +4701,12 @@ void kern_phase12_13_chi_correction_fused(
|
|||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
|
#if ghost_width != 3
|
||||||
|
fd_compute_second6(chi, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
1, 1, 1,
|
||||||
|
cxx, cxy, cxz, cyy, cyz, czz);
|
||||||
|
#else
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||||
@@ -4747,6 +4830,7 @@ void kern_phase12_13_chi_correction_fused(
|
|||||||
- fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1)
|
- fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||||
+ fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1));
|
+ fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1));
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
const double cx = chix[tid];
|
const double cx = chix[tid];
|
||||||
@@ -4917,6 +5001,12 @@ void kern_phase15_trK_Aij_gauge(
|
|||||||
double fyy_v = 0.0, fyz_v = 0.0, fzz_v = 0.0;
|
double fyy_v = 0.0, fyz_v = 0.0, fzz_v = 0.0;
|
||||||
|
|
||||||
if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
|
if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
|
||||||
|
#if ghost_width != 3
|
||||||
|
fd_compute_second6(alpn1, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
1, 1, 1,
|
||||||
|
fxx_v, fxy_v, fxz_v, fyy_v, fyz_v, fzz_v);
|
||||||
|
#else
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||||
@@ -5040,6 +5130,7 @@ void kern_phase15_trK_Aij_gauge(
|
|||||||
- fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1)
|
- fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||||
+ fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1));
|
+ fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1));
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/* raised chi/chi */
|
/* raised chi/chi */
|
||||||
@@ -5443,15 +5534,15 @@ static void setup_grid_params(int *ex,
|
|||||||
gp.imaxF = nx;
|
gp.imaxF = nx;
|
||||||
gp.jmaxF = ny;
|
gp.jmaxF = ny;
|
||||||
gp.kmaxF = nz;
|
gp.kmaxF = nz;
|
||||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = -1;
|
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = 2 - ghost_width;
|
||||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = -1;
|
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = 2 - ghost_width;
|
||||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = -1;
|
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = 2 - ghost_width;
|
||||||
gp.iminF3 = 1;
|
gp.iminF3 = 1;
|
||||||
gp.jminF3 = 1;
|
gp.jminF3 = 1;
|
||||||
gp.kminF3 = 1;
|
gp.kminF3 = 1;
|
||||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = -2;
|
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = 1 - ghost_width;
|
||||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = -2;
|
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = 1 - ghost_width;
|
||||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = -2;
|
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = 1 - ghost_width;
|
||||||
gp.Symmetry = Symmetry;
|
gp.Symmetry = Symmetry;
|
||||||
gp.eps = eps;
|
gp.eps = eps;
|
||||||
gp.co = co;
|
gp.co = co;
|
||||||
|
|||||||
412
AMSS_NCKU_source/fd_cuda_helpers.cuh
Normal file
412
AMSS_NCKU_source/fd_cuda_helpers.cuh
Normal file
@@ -0,0 +1,412 @@
|
|||||||
|
#ifndef AMSS_NCKU_FD_CUDA_HELPERS_CUH
|
||||||
|
#define AMSS_NCKU_FD_CUDA_HELPERS_CUH
|
||||||
|
|
||||||
|
#ifndef ghost_width
|
||||||
|
#error "ghost_width must be defined before including fd_cuda_helpers.cuh"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if ghost_width < 2 || ghost_width > 5
|
||||||
|
#error "CUDA finite-difference helpers support ghost_width 2..5"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define AMSS_FD_CENTER_RADIUS (ghost_width - 1)
|
||||||
|
#define AMSS_FD_LK_RADIUS (ghost_width)
|
||||||
|
|
||||||
|
__device__ __forceinline__ int fd_axis_radius(int qF, int qminF, int qmaxF)
|
||||||
|
{
|
||||||
|
#if AMSS_FD_CENTER_RADIUS >= 4
|
||||||
|
if (qF - 4 >= qminF && qF + 4 <= qmaxF) return 4;
|
||||||
|
#endif
|
||||||
|
#if AMSS_FD_CENTER_RADIUS >= 3
|
||||||
|
if (qF - 3 >= qminF && qF + 3 <= qmaxF) return 3;
|
||||||
|
#endif
|
||||||
|
#if AMSS_FD_CENTER_RADIUS >= 2
|
||||||
|
if (qF - 2 >= qminF && qF + 2 <= qmaxF) return 2;
|
||||||
|
#endif
|
||||||
|
if (qF - 1 >= qminF && qF + 1 <= qmaxF) return 1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ int fd_common_radius(int iF, int jF, int kF,
|
||||||
|
int iminF, int jminF, int kminF,
|
||||||
|
int imaxF, int jmaxF, int kmaxF)
|
||||||
|
{
|
||||||
|
int r = fd_axis_radius(iF, iminF, imaxF);
|
||||||
|
const int ry = fd_axis_radius(jF, jminF, jmaxF);
|
||||||
|
const int rz = fd_axis_radius(kF, kminF, kmaxF);
|
||||||
|
if (ry < r) r = ry;
|
||||||
|
if (rz < r) r = rz;
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_first_coef(int r, int off)
|
||||||
|
{
|
||||||
|
switch (r) {
|
||||||
|
case 1:
|
||||||
|
if (off == -1) return -1.0;
|
||||||
|
if (off == 1) return 1.0;
|
||||||
|
return 0.0;
|
||||||
|
case 2:
|
||||||
|
if (off == -2) return 1.0;
|
||||||
|
if (off == -1) return -8.0;
|
||||||
|
if (off == 1) return 8.0;
|
||||||
|
if (off == 2) return -1.0;
|
||||||
|
return 0.0;
|
||||||
|
case 3:
|
||||||
|
if (off == -3) return -1.0;
|
||||||
|
if (off == -2) return 9.0;
|
||||||
|
if (off == -1) return -45.0;
|
||||||
|
if (off == 1) return 45.0;
|
||||||
|
if (off == 2) return -9.0;
|
||||||
|
if (off == 3) return 1.0;
|
||||||
|
return 0.0;
|
||||||
|
case 4:
|
||||||
|
if (off == -4) return 3.0;
|
||||||
|
if (off == -3) return -32.0;
|
||||||
|
if (off == -2) return 168.0;
|
||||||
|
if (off == -1) return -672.0;
|
||||||
|
if (off == 1) return 672.0;
|
||||||
|
if (off == 2) return -168.0;
|
||||||
|
if (off == 3) return 32.0;
|
||||||
|
if (off == 4) return -3.0;
|
||||||
|
return 0.0;
|
||||||
|
default:
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_second_coef(int r, int off)
|
||||||
|
{
|
||||||
|
switch (r) {
|
||||||
|
case 1:
|
||||||
|
if (off == -1) return 1.0;
|
||||||
|
if (off == 0) return -2.0;
|
||||||
|
if (off == 1) return 1.0;
|
||||||
|
return 0.0;
|
||||||
|
case 2:
|
||||||
|
if (off == -2) return -1.0;
|
||||||
|
if (off == -1) return 16.0;
|
||||||
|
if (off == 0) return -30.0;
|
||||||
|
if (off == 1) return 16.0;
|
||||||
|
if (off == 2) return -1.0;
|
||||||
|
return 0.0;
|
||||||
|
case 3:
|
||||||
|
if (off == -3) return 2.0;
|
||||||
|
if (off == -2) return -27.0;
|
||||||
|
if (off == -1) return 270.0;
|
||||||
|
if (off == 0) return -490.0;
|
||||||
|
if (off == 1) return 270.0;
|
||||||
|
if (off == 2) return -27.0;
|
||||||
|
if (off == 3) return 2.0;
|
||||||
|
return 0.0;
|
||||||
|
case 4:
|
||||||
|
if (off == -4) return -9.0;
|
||||||
|
if (off == -3) return 128.0;
|
||||||
|
if (off == -2) return -1008.0;
|
||||||
|
if (off == -1) return 8064.0;
|
||||||
|
if (off == 0) return -14350.0;
|
||||||
|
if (off == 1) return 8064.0;
|
||||||
|
if (off == 2) return -1008.0;
|
||||||
|
if (off == 3) return 128.0;
|
||||||
|
if (off == 4) return -9.0;
|
||||||
|
return 0.0;
|
||||||
|
default:
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_first_denom(int r)
|
||||||
|
{
|
||||||
|
return (r == 4) ? 840.0 : ((r == 3) ? 60.0 : ((r == 2) ? 12.0 : 2.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_second_denom(int r)
|
||||||
|
{
|
||||||
|
return (r == 4) ? 5040.0 : ((r == 3) ? 180.0 : ((r == 2) ? 12.0 : 1.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_fetch_axis(const double *src,
|
||||||
|
int iF, int jF, int kF,
|
||||||
|
int axis, int off,
|
||||||
|
int SoA0, int SoA1, int SoA2)
|
||||||
|
{
|
||||||
|
if (axis == 0) iF += off;
|
||||||
|
else if (axis == 1) jF += off;
|
||||||
|
else kF += off;
|
||||||
|
return fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_fetch_axis2(const double *src,
|
||||||
|
int iF, int jF, int kF,
|
||||||
|
int axis_a, int off_a,
|
||||||
|
int axis_b, int off_b,
|
||||||
|
int SoA0, int SoA1, int SoA2)
|
||||||
|
{
|
||||||
|
if (axis_a == 0) iF += off_a;
|
||||||
|
else if (axis_a == 1) jF += off_a;
|
||||||
|
else kF += off_a;
|
||||||
|
if (axis_b == 0) iF += off_b;
|
||||||
|
else if (axis_b == 1) jF += off_b;
|
||||||
|
else kF += off_b;
|
||||||
|
return fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_first_axis_radius(const double *src,
|
||||||
|
int iF, int jF, int kF,
|
||||||
|
int axis, int r, double h,
|
||||||
|
int SoA0, int SoA1, int SoA2)
|
||||||
|
{
|
||||||
|
if (r <= 0) return 0.0;
|
||||||
|
double s = 0.0;
|
||||||
|
#pragma unroll
|
||||||
|
for (int off = -4; off <= 4; ++off) {
|
||||||
|
const double c = fd_first_coef(r, off);
|
||||||
|
if (c != 0.0) {
|
||||||
|
s += c * fd_fetch_axis(src, iF, jF, kF, axis, off, SoA0, SoA1, SoA2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return s / (fd_first_denom(r) * h);
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_second_axis_radius(const double *src,
|
||||||
|
int iF, int jF, int kF,
|
||||||
|
int axis, int r, double h,
|
||||||
|
int SoA0, int SoA1, int SoA2)
|
||||||
|
{
|
||||||
|
if (r <= 0) return 0.0;
|
||||||
|
double s = 0.0;
|
||||||
|
#pragma unroll
|
||||||
|
for (int off = -4; off <= 4; ++off) {
|
||||||
|
const double c = fd_second_coef(r, off);
|
||||||
|
if (c != 0.0) {
|
||||||
|
s += c * fd_fetch_axis(src, iF, jF, kF, axis, off, SoA0, SoA1, SoA2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return s / (fd_second_denom(r) * h * h);
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_mixed_axis_radius(const double *src,
|
||||||
|
int iF, int jF, int kF,
|
||||||
|
int axis_a, int r_a, double h_a,
|
||||||
|
int axis_b, int r_b, double h_b,
|
||||||
|
int SoA0, int SoA1, int SoA2)
|
||||||
|
{
|
||||||
|
if (r_a <= 0 || r_b <= 0) return 0.0;
|
||||||
|
double s = 0.0;
|
||||||
|
#pragma unroll
|
||||||
|
for (int off_a = -4; off_a <= 4; ++off_a) {
|
||||||
|
const double ca = fd_first_coef(r_a, off_a);
|
||||||
|
if (ca == 0.0) continue;
|
||||||
|
#pragma unroll
|
||||||
|
for (int off_b = -4; off_b <= 4; ++off_b) {
|
||||||
|
const double cb = fd_first_coef(r_b, off_b);
|
||||||
|
if (cb != 0.0) {
|
||||||
|
s += ca * cb * fd_fetch_axis2(src, iF, jF, kF, axis_a, off_a,
|
||||||
|
axis_b, off_b, SoA0, SoA1, SoA2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return s / (fd_first_denom(r_a) * fd_first_denom(r_b) * h_a * h_b);
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void fd_compute_first3(const double *src,
|
||||||
|
int iF, int jF, int kF,
|
||||||
|
int iminF, int jminF, int kminF,
|
||||||
|
int imaxF, int jmaxF, int kmaxF,
|
||||||
|
int SoA0, int SoA1, int SoA2,
|
||||||
|
double &fx, double &fy, double &fz)
|
||||||
|
{
|
||||||
|
#if ghost_width == 3
|
||||||
|
const int r = fd_common_radius(iF, jF, kF, iminF, jminF, kminF, imaxF, jmaxF, kmaxF);
|
||||||
|
fx = fd_first_axis_radius(src, iF, jF, kF, 0, r, d_gp.dX, SoA0, SoA1, SoA2);
|
||||||
|
fy = fd_first_axis_radius(src, iF, jF, kF, 1, r, d_gp.dY, SoA0, SoA1, SoA2);
|
||||||
|
fz = fd_first_axis_radius(src, iF, jF, kF, 2, r, d_gp.dZ, SoA0, SoA1, SoA2);
|
||||||
|
#else
|
||||||
|
fx = fd_first_axis_radius(src, iF, jF, kF, 0, fd_axis_radius(iF, iminF, imaxF),
|
||||||
|
d_gp.dX, SoA0, SoA1, SoA2);
|
||||||
|
fy = fd_first_axis_radius(src, iF, jF, kF, 1, fd_axis_radius(jF, jminF, jmaxF),
|
||||||
|
d_gp.dY, SoA0, SoA1, SoA2);
|
||||||
|
fz = fd_first_axis_radius(src, iF, jF, kF, 2, fd_axis_radius(kF, kminF, kmaxF),
|
||||||
|
d_gp.dZ, SoA0, SoA1, SoA2);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void fd_compute_second6(const double *src,
|
||||||
|
int iF, int jF, int kF,
|
||||||
|
int iminF, int jminF, int kminF,
|
||||||
|
int imaxF, int jmaxF, int kmaxF,
|
||||||
|
int SoA0, int SoA1, int SoA2,
|
||||||
|
double &fxx, double &fxy, double &fxz,
|
||||||
|
double &fyy, double &fyz, double &fzz)
|
||||||
|
{
|
||||||
|
#if ghost_width == 3
|
||||||
|
const int r = fd_common_radius(iF, jF, kF, iminF, jminF, kminF, imaxF, jmaxF, kmaxF);
|
||||||
|
const int rx = r, ry = r, rz = r;
|
||||||
|
#else
|
||||||
|
const int rx = fd_axis_radius(iF, iminF, imaxF);
|
||||||
|
const int ry = fd_axis_radius(jF, jminF, jmaxF);
|
||||||
|
const int rz = fd_axis_radius(kF, kminF, kmaxF);
|
||||||
|
#endif
|
||||||
|
fxx = fd_second_axis_radius(src, iF, jF, kF, 0, rx, d_gp.dX, SoA0, SoA1, SoA2);
|
||||||
|
fyy = fd_second_axis_radius(src, iF, jF, kF, 1, ry, d_gp.dY, SoA0, SoA1, SoA2);
|
||||||
|
fzz = fd_second_axis_radius(src, iF, jF, kF, 2, rz, d_gp.dZ, SoA0, SoA1, SoA2);
|
||||||
|
fxy = fd_mixed_axis_radius(src, iF, jF, kF, 0, rx, d_gp.dX, 1, ry, d_gp.dY, SoA0, SoA1, SoA2);
|
||||||
|
fxz = fd_mixed_axis_radius(src, iF, jF, kF, 0, rx, d_gp.dX, 2, rz, d_gp.dZ, SoA0, SoA1, SoA2);
|
||||||
|
fyz = fd_mixed_axis_radius(src, iF, jF, kF, 1, ry, d_gp.dY, 2, rz, d_gp.dZ, SoA0, SoA1, SoA2);
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ bool fd_lop_fits(int qF, int qminF, int qmaxF,
|
||||||
|
int dir, int lo, int hi)
|
||||||
|
{
|
||||||
|
for (int off = lo; off <= hi; ++off) {
|
||||||
|
const int q = qF + dir * off;
|
||||||
|
if (q < qminF || q > qmaxF) return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_lop_fetch_sum(const double *src,
|
||||||
|
int iF, int jF, int kF,
|
||||||
|
int axis, int dir,
|
||||||
|
const double *coef,
|
||||||
|
int lo, int hi,
|
||||||
|
int SoA0, int SoA1, int SoA2)
|
||||||
|
{
|
||||||
|
double s = 0.0;
|
||||||
|
for (int off = lo; off <= hi; ++off) {
|
||||||
|
const double c = coef[off - lo];
|
||||||
|
if (c != 0.0) {
|
||||||
|
s += c * fd_fetch_axis(src, iF, jF, kF, axis, dir * off, SoA0, SoA1, SoA2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_lopsided_axis(const double *src,
|
||||||
|
int iF, int jF, int kF,
|
||||||
|
int axis, double speed,
|
||||||
|
int qF, int qminF, int qmaxF,
|
||||||
|
double h,
|
||||||
|
int SoA0, int SoA1, int SoA2)
|
||||||
|
{
|
||||||
|
if (speed == 0.0) return 0.0;
|
||||||
|
const int dir = (speed > 0.0) ? 1 : -1;
|
||||||
|
const double mag = (speed > 0.0) ? speed : -speed;
|
||||||
|
|
||||||
|
#if ghost_width == 2
|
||||||
|
if (fd_lop_fits(qF, qminF, qmaxF, dir, 0, 2)) {
|
||||||
|
const double c[] = {-3.0, 4.0, -1.0};
|
||||||
|
return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, 0, 2, SoA0, SoA1, SoA2) / (2.0 * h);
|
||||||
|
}
|
||||||
|
if (fd_lop_fits(qF, qminF, qmaxF, dir, 0, 1)) {
|
||||||
|
const double c[] = {-1.0, 1.0};
|
||||||
|
return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, 0, 1, SoA0, SoA1, SoA2) / (2.0 * h);
|
||||||
|
}
|
||||||
|
return 0.0;
|
||||||
|
#elif ghost_width == 3
|
||||||
|
if (fd_lop_fits(qF, qminF, qmaxF, dir, -1, 3)) {
|
||||||
|
const double c[] = {-3.0, -10.0, 18.0, -6.0, 1.0};
|
||||||
|
return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, -1, 3, SoA0, SoA1, SoA2) / (12.0 * h);
|
||||||
|
}
|
||||||
|
const int r = fd_axis_radius(qF, qminF, qmaxF);
|
||||||
|
return speed * fd_first_axis_radius(src, iF, jF, kF, axis, r, h, SoA0, SoA1, SoA2);
|
||||||
|
#elif ghost_width == 4
|
||||||
|
if (fd_lop_fits(qF, qminF, qmaxF, dir, -2, 4)) {
|
||||||
|
const double c[] = {2.0, -24.0, -35.0, 80.0, -30.0, 8.0, -1.0};
|
||||||
|
return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, -2, 4, SoA0, SoA1, SoA2) / (60.0 * h);
|
||||||
|
}
|
||||||
|
if (fd_lop_fits(qF, qminF, qmaxF, dir, -1, 5)) {
|
||||||
|
const double c[] = {-10.0, -77.0, 150.0, -100.0, 50.0, -15.0, 2.0};
|
||||||
|
return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, -1, 5, SoA0, SoA1, SoA2) / (60.0 * h);
|
||||||
|
}
|
||||||
|
const int r = fd_axis_radius(qF, qminF, qmaxF);
|
||||||
|
return speed * fd_first_axis_radius(src, iF, jF, kF, axis, r, h, SoA0, SoA1, SoA2);
|
||||||
|
#else
|
||||||
|
if (fd_lop_fits(qF, qminF, qmaxF, dir, -3, 5)) {
|
||||||
|
const double c[] = {-5.0, 60.0, -420.0, -378.0, 1050.0, -420.0, 140.0, -30.0, 3.0};
|
||||||
|
return mag * fd_lop_fetch_sum(src, iF, jF, kF, axis, dir, c, -3, 5, SoA0, SoA1, SoA2) / (840.0 * h);
|
||||||
|
}
|
||||||
|
const int r = fd_axis_radius(qF, qminF, qmaxF);
|
||||||
|
return speed * fd_first_axis_radius(src, iF, jF, kF, axis, r, h, SoA0, SoA1, SoA2);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_ko_coef(int r, int off)
|
||||||
|
{
|
||||||
|
const int a = off < 0 ? -off : off;
|
||||||
|
if (r == 2) {
|
||||||
|
if (a == 0) return 6.0;
|
||||||
|
if (a == 1) return -4.0;
|
||||||
|
if (a == 2) return 1.0;
|
||||||
|
} else if (r == 3) {
|
||||||
|
if (a == 0) return -20.0;
|
||||||
|
if (a == 1) return 15.0;
|
||||||
|
if (a == 2) return -6.0;
|
||||||
|
if (a == 3) return 1.0;
|
||||||
|
} else if (r == 4) {
|
||||||
|
if (a == 0) return 70.0;
|
||||||
|
if (a == 1) return -56.0;
|
||||||
|
if (a == 2) return 28.0;
|
||||||
|
if (a == 3) return -8.0;
|
||||||
|
if (a == 4) return 1.0;
|
||||||
|
} else if (r == 5) {
|
||||||
|
if (a == 0) return -252.0;
|
||||||
|
if (a == 1) return 210.0;
|
||||||
|
if (a == 2) return -120.0;
|
||||||
|
if (a == 3) return 45.0;
|
||||||
|
if (a == 4) return -10.0;
|
||||||
|
if (a == 5) return 1.0;
|
||||||
|
}
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_ko_axis(const double *src,
|
||||||
|
int iF, int jF, int kF,
|
||||||
|
int axis, int r,
|
||||||
|
int SoA0, int SoA1, int SoA2)
|
||||||
|
{
|
||||||
|
double s = 0.0;
|
||||||
|
#pragma unroll
|
||||||
|
for (int off = -5; off <= 5; ++off) {
|
||||||
|
if (off < -r || off > r) continue;
|
||||||
|
const double c = fd_ko_coef(r, off);
|
||||||
|
if (c != 0.0) {
|
||||||
|
s += c * fd_fetch_axis(src, iF, jF, kF, axis, off, SoA0, SoA1, SoA2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ double fd_ko_term(const double *src,
|
||||||
|
int iF, int jF, int kF,
|
||||||
|
int iminF, int jminF, int kminF,
|
||||||
|
int imaxF, int jmaxF, int kmaxF,
|
||||||
|
double eps_val,
|
||||||
|
int SoA0, int SoA1, int SoA2)
|
||||||
|
{
|
||||||
|
const int r = AMSS_FD_LK_RADIUS;
|
||||||
|
if (eps_val <= 0.0) return 0.0;
|
||||||
|
#if ghost_width >= 4
|
||||||
|
if (iF - r <= iminF || iF + r >= imaxF ||
|
||||||
|
jF - r <= jminF || jF + r >= jmaxF ||
|
||||||
|
kF - r <= kminF || kF + r >= kmaxF) {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if (iF - r < iminF || iF + r > imaxF ||
|
||||||
|
jF - r < jminF || jF + r > jmaxF ||
|
||||||
|
kF - r < kminF || kF + r > kmaxF) {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
double cof = 1.0;
|
||||||
|
#pragma unroll
|
||||||
|
for (int n = 0; n < 2 * r; ++n) cof *= 2.0;
|
||||||
|
const double sign = (r & 1) ? 1.0 : -1.0;
|
||||||
|
const double dx = fd_ko_axis(src, iF, jF, kF, 0, r, SoA0, SoA1, SoA2);
|
||||||
|
const double dy = fd_ko_axis(src, iF, jF, kF, 1, r, SoA0, SoA1, SoA2);
|
||||||
|
const double dz = fd_ko_axis(src, iF, jF, kF, 2, r, SoA0, SoA1, SoA2);
|
||||||
|
return sign * eps_val * (dx / d_gp.dX + dy / d_gp.dY + dz / d_gp.dZ) / cof;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -63,11 +63,11 @@ endif
|
|||||||
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
||||||
|
|
||||||
# CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
|
# CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
|
||||||
bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h
|
bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h fd_cuda_helpers.cuh
|
||||||
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
||||||
|
|
||||||
# CUDA rewrite of Z4C Cartesian RHS
|
# CUDA rewrite of Z4C Cartesian RHS
|
||||||
z4c_rhs_cuda.o: z4c_rhs_cuda.cu z4c_rhs_cuda.h bssn_rhs.h macrodef.h ricci_gamma.h
|
z4c_rhs_cuda.o: z4c_rhs_cuda.cu z4c_rhs_cuda.h bssn_rhs.h macrodef.h ricci_gamma.h fd_cuda_helpers.cuh
|
||||||
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
||||||
|
|
||||||
# C rewrite of BSSN RHS kernel and helpers
|
# C rewrite of BSSN RHS kernel and helpers
|
||||||
|
|||||||
@@ -266,6 +266,8 @@ __device__ __forceinline__ double fetch_sym_ord3_direct(const double *src,
|
|||||||
+ (skF - 1) * d_gp.ex[0] * d_gp.ex[1]];
|
+ (skF - 1) * d_gp.ex[0] * d_gp.ex[1]];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#include "fd_cuda_helpers.cuh"
|
||||||
|
|
||||||
/* ------------------------------------------------------------------ */
|
/* ------------------------------------------------------------------ */
|
||||||
/* GPU buffer management */
|
/* GPU buffer management */
|
||||||
/* ------------------------------------------------------------------ */
|
/* ------------------------------------------------------------------ */
|
||||||
@@ -1419,45 +1421,10 @@ void kern_fderivs_batched(FDerivTables tables, int field_count)
|
|||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
fd_compute_first3(src, iF, jF, kF,
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
SoA0, SoA1, SoA2,
|
||||||
{
|
fx[tid], fy[tid], fz[tid]);
|
||||||
fx[tid] = d_gp.d12dx * (
|
|
||||||
fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
|
|
||||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
|
||||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
|
|
||||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
|
|
||||||
fy[tid] = d_gp.d12dy * (
|
|
||||||
fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
|
|
||||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
|
||||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
|
|
||||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
|
|
||||||
fz[tid] = d_gp.d12dz * (
|
|
||||||
fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
|
|
||||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
|
||||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
|
|
||||||
- fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
|
|
||||||
}
|
|
||||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
|
||||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
|
||||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
|
||||||
{
|
|
||||||
fx[tid] = d_gp.d2dx * (
|
|
||||||
-fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
|
||||||
+fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
|
|
||||||
fy[tid] = d_gp.d2dy * (
|
|
||||||
-fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
|
||||||
+fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
|
|
||||||
fz[tid] = d_gp.d2dz * (
|
|
||||||
-fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
|
||||||
+fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
fx[tid] = 0.0;
|
|
||||||
fy[tid] = 0.0;
|
|
||||||
fz[tid] = 0.0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ __launch_bounds__(128, 4)
|
__global__ __launch_bounds__(128, 4)
|
||||||
@@ -1497,6 +1464,12 @@ void kern_fdderivs_batched(FDDerivTables tables, int field_count)
|
|||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
|
#if ghost_width != 3
|
||||||
|
fd_compute_second6(src, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
SoA0, SoA1, SoA2,
|
||||||
|
fxx[tid], fxy[tid], fxz[tid], fyy[tid], fyz[tid], fzz[tid]);
|
||||||
|
#else
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||||
@@ -1624,12 +1597,43 @@ void kern_fdderivs_batched(FDDerivTables tables, int field_count)
|
|||||||
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
|
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
|
||||||
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void gpu_fderivs_batch(int field_count,
|
||||||
|
double *const *src_fields,
|
||||||
|
double *const *fx_fields,
|
||||||
|
double *const *fy_fields,
|
||||||
|
double *const *fz_fields,
|
||||||
|
const int *soa_signs,
|
||||||
|
int all);
|
||||||
|
static void gpu_fdderivs_batch(int field_count,
|
||||||
|
double *const *src_fields,
|
||||||
|
double *const *fxx_fields,
|
||||||
|
double *const *fxy_fields,
|
||||||
|
double *const *fxz_fields,
|
||||||
|
double *const *fyy_fields,
|
||||||
|
double *const *fyz_fields,
|
||||||
|
double *const *fzz_fields,
|
||||||
|
const int *soa_signs,
|
||||||
|
int all);
|
||||||
|
static void gpu_lopsided_kodis_single_batch(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
|
||||||
|
double *d_Sfx, double *d_Sfy, double *d_Sfz,
|
||||||
|
double SoA0, double SoA1, double SoA2,
|
||||||
|
double eps_val, int all);
|
||||||
|
|
||||||
/* symmetry_bd on GPU for ord=2, then launch fderivs kernel */
|
/* symmetry_bd on GPU for ord=2, then launch fderivs kernel */
|
||||||
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
|
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
|
||||||
double SoA0, double SoA1, double SoA2, int all)
|
double SoA0, double SoA1, double SoA2, int all)
|
||||||
{
|
{
|
||||||
|
#if ghost_width != 3
|
||||||
|
double *src_fields[1] = {d_f};
|
||||||
|
double *fx_fields[1] = {d_fx};
|
||||||
|
double *fy_fields[1] = {d_fy};
|
||||||
|
double *fz_fields[1] = {d_fz};
|
||||||
|
const int soa_signs[3] = {(int)SoA0, (int)SoA1, (int)SoA2};
|
||||||
|
gpu_fderivs_batch(1, src_fields, fx_fields, fy_fields, fz_fields, soa_signs, all);
|
||||||
|
#else
|
||||||
double *fh = g_buf.d_fh2;
|
double *fh = g_buf.d_fh2;
|
||||||
const size_t nx = (size_t)g_buf.prev_nx;
|
const size_t nx = (size_t)g_buf.prev_nx;
|
||||||
const size_t ny = (size_t)g_buf.prev_ny;
|
const size_t ny = (size_t)g_buf.prev_ny;
|
||||||
@@ -1638,6 +1642,7 @@ static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
|
|||||||
|
|
||||||
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||||
kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
|
kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/* symmetry_bd on GPU for ord=2, then launch fdderivs kernel */
|
/* symmetry_bd on GPU for ord=2, then launch fdderivs kernel */
|
||||||
@@ -1646,6 +1651,18 @@ static void gpu_fdderivs(double *d_f,
|
|||||||
double *d_fyy, double *d_fyz, double *d_fzz,
|
double *d_fyy, double *d_fyz, double *d_fzz,
|
||||||
double SoA0, double SoA1, double SoA2, int all)
|
double SoA0, double SoA1, double SoA2, int all)
|
||||||
{
|
{
|
||||||
|
#if ghost_width != 3
|
||||||
|
double *src_fields[1] = {d_f};
|
||||||
|
double *fxx_fields[1] = {d_fxx};
|
||||||
|
double *fxy_fields[1] = {d_fxy};
|
||||||
|
double *fxz_fields[1] = {d_fxz};
|
||||||
|
double *fyy_fields[1] = {d_fyy};
|
||||||
|
double *fyz_fields[1] = {d_fyz};
|
||||||
|
double *fzz_fields[1] = {d_fzz};
|
||||||
|
const int soa_signs[3] = {(int)SoA0, (int)SoA1, (int)SoA2};
|
||||||
|
gpu_fdderivs_batch(1, src_fields, fxx_fields, fxy_fields, fxz_fields,
|
||||||
|
fyy_fields, fyz_fields, fzz_fields, soa_signs, all);
|
||||||
|
#else
|
||||||
double *fh = g_buf.d_fh2;
|
double *fh = g_buf.d_fh2;
|
||||||
const size_t nx = (size_t)g_buf.prev_nx;
|
const size_t nx = (size_t)g_buf.prev_nx;
|
||||||
const size_t ny = (size_t)g_buf.prev_ny;
|
const size_t ny = (size_t)g_buf.prev_ny;
|
||||||
@@ -1654,6 +1671,7 @@ static void gpu_fdderivs(double *d_f,
|
|||||||
|
|
||||||
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||||
kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
|
kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static void gpu_fderivs_batch(int field_count,
|
static void gpu_fderivs_batch(int field_count,
|
||||||
@@ -1743,6 +1761,12 @@ void kern_phase10_ricci_batched(const double * __restrict__ gupxx,
|
|||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
|
#if ghost_width != 3
|
||||||
|
fd_compute_second6(src, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
SoA0, SoA1, SoA2,
|
||||||
|
fxx, fxy, fxz, fyy, fyz, fzz);
|
||||||
|
#else
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||||
@@ -1866,6 +1890,7 @@ void kern_phase10_ricci_batched(const double * __restrict__ gupxx,
|
|||||||
- fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
|
- fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
|
||||||
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
|
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
dst[tid] = gupxx[tid] * fxx + gupyy[tid] * fyy + gupzz[tid] * fzz
|
dst[tid] = gupxx[tid] * fxx + gupyy[tid] * fyy + gupzz[tid] * fzz
|
||||||
@@ -1930,6 +1955,16 @@ void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap,
|
|||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
|
#if ghost_width != 3
|
||||||
|
fd_compute_second6(Lap, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
1, 1, 1,
|
||||||
|
fxx[tid], fxy[tid], fxz[tid], fyy[tid], fyz[tid], fzz[tid]);
|
||||||
|
fd_compute_first3(chi, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
1, 1, 1,
|
||||||
|
chix_out[tid], chiy_out[tid], chiz_out[tid]);
|
||||||
|
#else
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||||
@@ -2083,6 +2118,7 @@ void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap,
|
|||||||
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
||||||
chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
|
chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Combined ord=3 advection + KO dissipation.
|
/* Combined ord=3 advection + KO dissipation.
|
||||||
@@ -2094,6 +2130,11 @@ static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
|
|||||||
double SoA0, double SoA1, double SoA2,
|
double SoA0, double SoA1, double SoA2,
|
||||||
double eps_val, int all)
|
double eps_val, int all)
|
||||||
{
|
{
|
||||||
|
#if ghost_width != 3
|
||||||
|
gpu_lopsided_kodis_single_batch(d_f_adv, d_f_ko, d_f_rhs,
|
||||||
|
d_Sfx, d_Sfy, d_Sfz,
|
||||||
|
SoA0, SoA1, SoA2, eps_val, all);
|
||||||
|
#else
|
||||||
double *fh = g_buf.d_fh3;
|
double *fh = g_buf.d_fh3;
|
||||||
const size_t nx = (size_t)g_buf.prev_nx;
|
const size_t nx = (size_t)g_buf.prev_nx;
|
||||||
const size_t ny = (size_t)g_buf.prev_ny;
|
const size_t ny = (size_t)g_buf.prev_ny;
|
||||||
@@ -2109,6 +2150,7 @@ static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
|
|||||||
}
|
}
|
||||||
kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
|
kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ __launch_bounds__(128, 4)
|
__global__ __launch_bounds__(128, 4)
|
||||||
@@ -2141,6 +2183,24 @@ void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
|
|||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
|
#if ghost_width != 3
|
||||||
|
if (do_lopsided && i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
|
||||||
|
const double val =
|
||||||
|
fd_lopsided_axis(adv_src, iF, jF, kF, 0, Sfx[tid], iF, iminF, imaxF,
|
||||||
|
d_gp.dX, SoA0, SoA1, SoA2)
|
||||||
|
+ fd_lopsided_axis(adv_src, iF, jF, kF, 1, Sfy[tid], jF, jminF, jmaxF,
|
||||||
|
d_gp.dY, SoA0, SoA1, SoA2)
|
||||||
|
+ fd_lopsided_axis(adv_src, iF, jF, kF, 2, Sfz[tid], kF, kminF, kmaxF,
|
||||||
|
d_gp.dZ, SoA0, SoA1, SoA2);
|
||||||
|
rhs[tid] += val;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (do_kodis) {
|
||||||
|
rhs[tid] += fd_ko_term(ko_src, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
eps_val, SoA0, SoA1, SoA2);
|
||||||
|
}
|
||||||
|
#else
|
||||||
if (do_lopsided && i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
|
if (do_lopsided && i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
|
||||||
double val = 0.0;
|
double val = 0.0;
|
||||||
|
|
||||||
@@ -2323,6 +2383,25 @@ void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
|
|||||||
|
|
||||||
rhs[tid] += (eps_val / cof) * (Dx / d_gp.dX + Dy / d_gp.dY + Dz / d_gp.dZ);
|
rhs[tid] += (eps_val / cof) * (Dx / d_gp.dX + Dy / d_gp.dY + Dz / d_gp.dZ);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void gpu_lopsided_kodis_single_batch(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
|
||||||
|
double *d_Sfx, double *d_Sfy, double *d_Sfz,
|
||||||
|
double SoA0, double SoA1, double SoA2,
|
||||||
|
double eps_val, int all)
|
||||||
|
{
|
||||||
|
LopsidedKodisTables tables = {};
|
||||||
|
tables.adv_fields[0] = d_f_adv;
|
||||||
|
tables.ko_fields[0] = d_f_ko;
|
||||||
|
tables.rhs_fields[0] = d_f_rhs;
|
||||||
|
tables.soa_signs[0] = (int)SoA0;
|
||||||
|
tables.soa_signs[1] = (int)SoA1;
|
||||||
|
tables.soa_signs[2] = (int)SoA2;
|
||||||
|
|
||||||
|
dim3 launch_grid((unsigned int)grid((size_t)all), 1u);
|
||||||
|
kern_lopsided_kodis_batched<<<launch_grid, BLK>>>(
|
||||||
|
d_Sfx, d_Sfy, d_Sfz, tables, eps_val, 1, eps_val > 0.0 ? 1 : 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void gpu_lopsided_kodis_state_batch(double eps_val, int all)
|
static void gpu_lopsided_kodis_state_batch(double eps_val, int all)
|
||||||
@@ -3873,6 +3952,12 @@ void kern_phase12_13_chi_correction_fused(
|
|||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
|
#if ghost_width != 3
|
||||||
|
fd_compute_second6(chi, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
1, 1, 1,
|
||||||
|
cxx, cxy, cxz, cyy, cyz, czz);
|
||||||
|
#else
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||||
@@ -3996,6 +4081,7 @@ void kern_phase12_13_chi_correction_fused(
|
|||||||
- fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1)
|
- fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||||
+ fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1));
|
+ fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1));
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
const double cx = chix[tid];
|
const double cx = chix[tid];
|
||||||
@@ -4166,6 +4252,12 @@ void kern_phase15_trK_Aij_gauge(
|
|||||||
double fyy_v = 0.0, fyz_v = 0.0, fzz_v = 0.0;
|
double fyy_v = 0.0, fyz_v = 0.0, fzz_v = 0.0;
|
||||||
|
|
||||||
if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
|
if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
|
||||||
|
#if ghost_width != 3
|
||||||
|
fd_compute_second6(alpn1, iF, jF, kF,
|
||||||
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
||||||
|
1, 1, 1,
|
||||||
|
fxx_v, fxy_v, fxz_v, fyy_v, fyz_v, fzz_v);
|
||||||
|
#else
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||||
@@ -4289,6 +4381,7 @@ void kern_phase15_trK_Aij_gauge(
|
|||||||
- fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1)
|
- fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||||
+ fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1));
|
+ fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1));
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/* raised chi/chi */
|
/* raised chi/chi */
|
||||||
@@ -4626,15 +4719,15 @@ static void setup_grid_params(int *ex,
|
|||||||
gp.imaxF = nx;
|
gp.imaxF = nx;
|
||||||
gp.jmaxF = ny;
|
gp.jmaxF = ny;
|
||||||
gp.kmaxF = nz;
|
gp.kmaxF = nz;
|
||||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = -1;
|
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = 2 - ghost_width;
|
||||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = -1;
|
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = 2 - ghost_width;
|
||||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = -1;
|
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = 2 - ghost_width;
|
||||||
gp.iminF3 = 1;
|
gp.iminF3 = 1;
|
||||||
gp.jminF3 = 1;
|
gp.jminF3 = 1;
|
||||||
gp.kminF3 = 1;
|
gp.kminF3 = 1;
|
||||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = -2;
|
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = 1 - ghost_width;
|
||||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = -2;
|
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = 1 - ghost_width;
|
||||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = -2;
|
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = 1 - ghost_width;
|
||||||
gp.Symmetry = Symmetry;
|
gp.Symmetry = Symmetry;
|
||||||
gp.eps = eps;
|
gp.eps = eps;
|
||||||
gp.co = co;
|
gp.co = co;
|
||||||
|
|||||||
Reference in New Issue
Block a user