Add Phase-10 Ricci kernels and batch launch flow
This commit is contained in:
@@ -1327,6 +1327,14 @@ struct FDDerivTables {
|
|||||||
int soa_signs[3 * BSSN_STATE_COUNT];
|
int soa_signs[3 * BSSN_STATE_COUNT];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static constexpr int PHASE10_METRIC_FIELD_COUNT = 6;
|
||||||
|
|
||||||
|
struct Phase10RicciTables {
|
||||||
|
const double *src_fields[PHASE10_METRIC_FIELD_COUNT];
|
||||||
|
double *dst_fields[PHASE10_METRIC_FIELD_COUNT];
|
||||||
|
int soa_signs[3 * PHASE10_METRIC_FIELD_COUNT];
|
||||||
|
};
|
||||||
|
|
||||||
struct Rk4FinalizeTables {
|
struct Rk4FinalizeTables {
|
||||||
const double *f0_fields[BSSN_STATE_COUNT];
|
const double *f0_fields[BSSN_STATE_COUNT];
|
||||||
double *rhs_fields[BSSN_STATE_COUNT];
|
double *rhs_fields[BSSN_STATE_COUNT];
|
||||||
@@ -1670,6 +1678,383 @@ static void gpu_fdderivs_batch(int field_count,
|
|||||||
kern_fdderivs_batched<<<launch_grid, BLK>>>(tables, field_count);
|
kern_fdderivs_batched<<<launch_grid, BLK>>>(tables, field_count);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__ __launch_bounds__(128, 4)
|
||||||
|
void kern_phase10_ricci_batched(const double * __restrict__ gupxx,
|
||||||
|
const double * __restrict__ gupxy,
|
||||||
|
const double * __restrict__ gupxz,
|
||||||
|
const double * __restrict__ gupyy,
|
||||||
|
const double * __restrict__ gupyz,
|
||||||
|
const double * __restrict__ gupzz,
|
||||||
|
Phase10RicciTables tables)
|
||||||
|
{
|
||||||
|
const int field = blockIdx.y;
|
||||||
|
if (field >= PHASE10_METRIC_FIELD_COUNT) return;
|
||||||
|
|
||||||
|
const double *src = tables.src_fields[field];
|
||||||
|
double *dst = tables.dst_fields[field];
|
||||||
|
const int SoA0 = tables.soa_signs[3 * field + 0];
|
||||||
|
const int SoA1 = tables.soa_signs[3 * field + 1];
|
||||||
|
const int SoA2 = tables.soa_signs[3 * field + 2];
|
||||||
|
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||||
|
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
|
||||||
|
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
|
||||||
|
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (tid >= d_gp.all) return;
|
||||||
|
|
||||||
|
const int i0 = tid % nx;
|
||||||
|
const int j0 = (tid / nx) % ny;
|
||||||
|
const int k0 = tid / (nx * ny);
|
||||||
|
|
||||||
|
double fxx = 0.0, fxy = 0.0, fxz = 0.0;
|
||||||
|
double fyy = 0.0, fyz = 0.0, fzz = 0.0;
|
||||||
|
|
||||||
|
if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
|
||||||
|
const int iF = i0 + 1;
|
||||||
|
const int jF = j0 + 1;
|
||||||
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||||
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||||
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||||
|
{
|
||||||
|
const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
|
||||||
|
fxx = d_gp.Fdxdx * (
|
||||||
|
-fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
|
||||||
|
+16.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
||||||
|
-30.0 * c
|
||||||
|
+16.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
|
||||||
|
fyy = d_gp.Fdydy * (
|
||||||
|
-fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||||
|
+16.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||||
|
-30.0 * c
|
||||||
|
+16.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
|
||||||
|
fzz = d_gp.Fdzdz * (
|
||||||
|
-fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||||
|
+16.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||||
|
-30.0 * c
|
||||||
|
+16.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
|
||||||
|
|
||||||
|
const double t_jm2 =
|
||||||
|
fetch_sym_ord2_direct(src, iF - 2, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF + 2, jF - 2, kF, SoA0, SoA1, SoA2);
|
||||||
|
const double t_jm1 =
|
||||||
|
fetch_sym_ord2_direct(src, iF - 2, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF + 2, jF - 1, kF, SoA0, SoA1, SoA2);
|
||||||
|
const double t_jp1 =
|
||||||
|
fetch_sym_ord2_direct(src, iF - 2, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF + 2, jF + 1, kF, SoA0, SoA1, SoA2);
|
||||||
|
const double t_jp2 =
|
||||||
|
fetch_sym_ord2_direct(src, iF - 2, jF + 2, kF, SoA0, SoA1, SoA2)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 2, kF, SoA0, SoA1, SoA2)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 2, kF, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF + 2, jF + 2, kF, SoA0, SoA1, SoA2);
|
||||||
|
fxy = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);
|
||||||
|
|
||||||
|
const double t_km2_x =
|
||||||
|
fetch_sym_ord2_direct(src, iF - 2, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF + 2, jF, kF - 2, SoA0, SoA1, SoA2);
|
||||||
|
const double t_km1_x =
|
||||||
|
fetch_sym_ord2_direct(src, iF - 2, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF + 2, jF, kF - 1, SoA0, SoA1, SoA2);
|
||||||
|
const double t_kp1_x =
|
||||||
|
fetch_sym_ord2_direct(src, iF - 2, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF + 2, jF, kF + 1, SoA0, SoA1, SoA2);
|
||||||
|
const double t_kp2_x =
|
||||||
|
fetch_sym_ord2_direct(src, iF - 2, jF, kF + 2, SoA0, SoA1, SoA2)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 2, SoA0, SoA1, SoA2)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 2, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF + 2, jF, kF + 2, SoA0, SoA1, SoA2);
|
||||||
|
fxz = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);
|
||||||
|
|
||||||
|
const double t_km2_y =
|
||||||
|
fetch_sym_ord2_direct(src, iF, jF - 2, kF - 2, SoA0, SoA1, SoA2)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 2, SoA0, SoA1, SoA2)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 2, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF, jF + 2, kF - 2, SoA0, SoA1, SoA2);
|
||||||
|
const double t_km1_y =
|
||||||
|
fetch_sym_ord2_direct(src, iF, jF - 2, kF - 1, SoA0, SoA1, SoA2)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF, jF + 2, kF - 1, SoA0, SoA1, SoA2);
|
||||||
|
const double t_kp1_y =
|
||||||
|
fetch_sym_ord2_direct(src, iF, jF - 2, kF + 1, SoA0, SoA1, SoA2)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF, jF + 2, kF + 1, SoA0, SoA1, SoA2);
|
||||||
|
const double t_kp2_y =
|
||||||
|
fetch_sym_ord2_direct(src, iF, jF - 2, kF + 2, SoA0, SoA1, SoA2)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 2, SoA0, SoA1, SoA2)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 2, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF, jF + 2, kF + 2, SoA0, SoA1, SoA2);
|
||||||
|
fyz = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
|
||||||
|
}
|
||||||
|
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
||||||
|
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
||||||
|
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
||||||
|
{
|
||||||
|
const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
|
||||||
|
fxx = d_gp.Sdxdx * (
|
||||||
|
fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
||||||
|
- 2.0 * c
|
||||||
|
+ fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
|
||||||
|
fyy = d_gp.Sdydy * (
|
||||||
|
fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||||
|
- 2.0 * c
|
||||||
|
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
|
||||||
|
fzz = d_gp.Sdzdz * (
|
||||||
|
fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||||
|
- 2.0 * c
|
||||||
|
+ fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
|
||||||
|
fxy = d_gp.Sdxdy * (
|
||||||
|
fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||||
|
+ fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2));
|
||||||
|
fxz = d_gp.Sdxdz * (
|
||||||
|
fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||||
|
+ fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2));
|
||||||
|
fyz = d_gp.Sdydz * (
|
||||||
|
fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
|
||||||
|
- fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
|
||||||
|
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dst[tid] = gupxx[tid] * fxx + gupyy[tid] * fyy + gupzz[tid] * fzz
|
||||||
|
+ 2.0 * (gupxy[tid] * fxy + gupxz[tid] * fxz + gupyz[tid] * fyz);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void gpu_phase10_ricci_batch(const double *gupxx,
|
||||||
|
const double *gupxy,
|
||||||
|
const double *gupxz,
|
||||||
|
const double *gupyy,
|
||||||
|
const double *gupyz,
|
||||||
|
const double *gupzz,
|
||||||
|
double *const *src_fields,
|
||||||
|
double *const *dst_fields,
|
||||||
|
const int *soa_signs,
|
||||||
|
int all)
|
||||||
|
{
|
||||||
|
Phase10RicciTables tables = {};
|
||||||
|
for (int i = 0; i < PHASE10_METRIC_FIELD_COUNT; ++i) {
|
||||||
|
tables.src_fields[i] = src_fields[i];
|
||||||
|
tables.dst_fields[i] = dst_fields[i];
|
||||||
|
tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0];
|
||||||
|
tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1];
|
||||||
|
tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2];
|
||||||
|
}
|
||||||
|
dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)PHASE10_METRIC_FIELD_COUNT);
|
||||||
|
kern_phase10_ricci_batched<<<launch_grid, BLK>>>(
|
||||||
|
gupxx, gupxy, gupxz, gupyy, gupyz, gupzz, tables);
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ __launch_bounds__(128, 4)
|
||||||
|
void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap,
|
||||||
|
const double * __restrict__ chi,
|
||||||
|
double * __restrict__ fxx,
|
||||||
|
double * __restrict__ fxy,
|
||||||
|
double * __restrict__ fxz,
|
||||||
|
double * __restrict__ fyy,
|
||||||
|
double * __restrict__ fyz,
|
||||||
|
double * __restrict__ fzz,
|
||||||
|
double * __restrict__ chix_out,
|
||||||
|
double * __restrict__ chiy_out,
|
||||||
|
double * __restrict__ chiz_out)
|
||||||
|
{
|
||||||
|
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||||
|
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
|
||||||
|
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
|
||||||
|
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (tid >= d_gp.all) return;
|
||||||
|
|
||||||
|
const int i0 = tid % nx;
|
||||||
|
const int j0 = (tid / nx) % ny;
|
||||||
|
const int k0 = tid / (nx * ny);
|
||||||
|
|
||||||
|
if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
|
||||||
|
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
|
||||||
|
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
||||||
|
chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int iF = i0 + 1;
|
||||||
|
const int jF = j0 + 1;
|
||||||
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||||
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||||
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||||
|
{
|
||||||
|
const double lap_c = fetch_sym_ord2_direct(Lap, iF, jF, kF, 1, 1, 1);
|
||||||
|
fxx[tid] = d_gp.Fdxdx * (
|
||||||
|
-fetch_sym_ord2_direct(Lap, iF - 2, jF, kF, 1, 1, 1)
|
||||||
|
+16.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF, 1, 1, 1)
|
||||||
|
-30.0 * lap_c
|
||||||
|
+16.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF, 1, 1, 1));
|
||||||
|
fyy[tid] = d_gp.Fdydy * (
|
||||||
|
-fetch_sym_ord2_direct(Lap, iF, jF - 2, kF, 1, 1, 1)
|
||||||
|
+16.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF, 1, 1, 1)
|
||||||
|
-30.0 * lap_c
|
||||||
|
+16.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF, 1, 1, 1));
|
||||||
|
fzz[tid] = d_gp.Fdzdz * (
|
||||||
|
-fetch_sym_ord2_direct(Lap, iF, jF, kF - 2, 1, 1, 1)
|
||||||
|
+16.0 * fetch_sym_ord2_direct(Lap, iF, jF, kF - 1, 1, 1, 1)
|
||||||
|
-30.0 * lap_c
|
||||||
|
+16.0 * fetch_sym_ord2_direct(Lap, iF, jF, kF + 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF, jF, kF + 2, 1, 1, 1));
|
||||||
|
|
||||||
|
const double t_jm2 =
|
||||||
|
fetch_sym_ord2_direct(Lap, iF - 2, jF - 2, kF, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF - 2, kF, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF - 2, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF + 2, jF - 2, kF, 1, 1, 1);
|
||||||
|
const double t_jm1 =
|
||||||
|
fetch_sym_ord2_direct(Lap, iF - 2, jF - 1, kF, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF - 1, kF, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF - 1, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF + 2, jF - 1, kF, 1, 1, 1);
|
||||||
|
const double t_jp1 =
|
||||||
|
fetch_sym_ord2_direct(Lap, iF - 2, jF + 1, kF, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF + 1, kF, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF + 1, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF + 2, jF + 1, kF, 1, 1, 1);
|
||||||
|
const double t_jp2 =
|
||||||
|
fetch_sym_ord2_direct(Lap, iF - 2, jF + 2, kF, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF + 2, kF, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF + 2, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF + 2, jF + 2, kF, 1, 1, 1);
|
||||||
|
fxy[tid] = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);
|
||||||
|
|
||||||
|
const double t_km2_x =
|
||||||
|
fetch_sym_ord2_direct(Lap, iF - 2, jF, kF - 2, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 2, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 2, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF - 2, 1, 1, 1);
|
||||||
|
const double t_km1_x =
|
||||||
|
fetch_sym_ord2_direct(Lap, iF - 2, jF, kF - 1, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 1, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF - 1, 1, 1, 1);
|
||||||
|
const double t_kp1_x =
|
||||||
|
fetch_sym_ord2_direct(Lap, iF - 2, jF, kF + 1, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 1, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF + 1, 1, 1, 1);
|
||||||
|
const double t_kp2_x =
|
||||||
|
fetch_sym_ord2_direct(Lap, iF - 2, jF, kF + 2, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 2, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 2, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF + 2, 1, 1, 1);
|
||||||
|
fxz[tid] = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);
|
||||||
|
|
||||||
|
const double t_km2_y =
|
||||||
|
fetch_sym_ord2_direct(Lap, iF, jF - 2, kF - 2, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 2, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 2, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF - 2, 1, 1, 1);
|
||||||
|
const double t_km1_y =
|
||||||
|
fetch_sym_ord2_direct(Lap, iF, jF - 2, kF - 1, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 1, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF - 1, 1, 1, 1);
|
||||||
|
const double t_kp1_y =
|
||||||
|
fetch_sym_ord2_direct(Lap, iF, jF - 2, kF + 1, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF + 1, 1, 1, 1);
|
||||||
|
const double t_kp2_y =
|
||||||
|
fetch_sym_ord2_direct(Lap, iF, jF - 2, kF + 2, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 2, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 2, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF + 2, 1, 1, 1);
|
||||||
|
fyz[tid] = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
|
||||||
|
|
||||||
|
chix_out[tid] = d_gp.d12dx * (
|
||||||
|
fetch_sym_ord2_direct(chi, iF - 2, jF, kF, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF, 1, 1, 1));
|
||||||
|
chiy_out[tid] = d_gp.d12dy * (
|
||||||
|
fetch_sym_ord2_direct(chi, iF, jF - 2, kF, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF, 1, 1, 1));
|
||||||
|
chiz_out[tid] = d_gp.d12dz * (
|
||||||
|
fetch_sym_ord2_direct(chi, iF, jF, kF - 2, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF, jF, kF + 2, 1, 1, 1));
|
||||||
|
}
|
||||||
|
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
||||||
|
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
||||||
|
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
||||||
|
{
|
||||||
|
const double lap_c = fetch_sym_ord2_direct(Lap, iF, jF, kF, 1, 1, 1);
|
||||||
|
fxx[tid] = d_gp.Sdxdx * (
|
||||||
|
fetch_sym_ord2_direct(Lap, iF - 1, jF, kF, 1, 1, 1)
|
||||||
|
- 2.0 * lap_c
|
||||||
|
+ fetch_sym_ord2_direct(Lap, iF + 1, jF, kF, 1, 1, 1));
|
||||||
|
fyy[tid] = d_gp.Sdydy * (
|
||||||
|
fetch_sym_ord2_direct(Lap, iF, jF - 1, kF, 1, 1, 1)
|
||||||
|
- 2.0 * lap_c
|
||||||
|
+ fetch_sym_ord2_direct(Lap, iF, jF + 1, kF, 1, 1, 1));
|
||||||
|
fzz[tid] = d_gp.Sdzdz * (
|
||||||
|
fetch_sym_ord2_direct(Lap, iF, jF, kF - 1, 1, 1, 1)
|
||||||
|
- 2.0 * lap_c
|
||||||
|
+ fetch_sym_ord2_direct(Lap, iF, jF, kF + 1, 1, 1, 1));
|
||||||
|
fxy[tid] = d_gp.Sdxdy * (
|
||||||
|
fetch_sym_ord2_direct(Lap, iF - 1, jF - 1, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF + 1, jF - 1, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF - 1, jF + 1, kF, 1, 1, 1)
|
||||||
|
+ fetch_sym_ord2_direct(Lap, iF + 1, jF + 1, kF, 1, 1, 1));
|
||||||
|
fxz[tid] = d_gp.Sdxdz * (
|
||||||
|
fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 1, 1, 1, 1)
|
||||||
|
+ fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 1, 1, 1, 1));
|
||||||
|
fyz[tid] = d_gp.Sdydz * (
|
||||||
|
fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||||
|
+ fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 1, 1, 1, 1));
|
||||||
|
chix_out[tid] = d_gp.d2dx * (
|
||||||
|
-fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
|
||||||
|
+fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1));
|
||||||
|
chiy_out[tid] = d_gp.d2dy * (
|
||||||
|
-fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
|
||||||
|
+fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1));
|
||||||
|
chiz_out[tid] = d_gp.d2dz * (
|
||||||
|
-fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
|
||||||
|
+fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
|
||||||
|
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
||||||
|
chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Combined ord=3 advection + KO dissipation.
|
/* Combined ord=3 advection + KO dissipation.
|
||||||
* When advection and KO use the same source field, symmetry packing is shared.
|
* When advection and KO use the same source field, symmetry packing is shared.
|
||||||
* If they differ (e.g. gxx advection + dxx KO), only KO repacks.
|
* If they differ (e.g. gxx advection + dxx KO), only KO repacks.
|
||||||
@@ -2987,6 +3372,206 @@ void kern_phase11_ricci_offdiag(
|
|||||||
* After fdderivs(chi), subtract Christoffel*chi_deriv, compute conformal factor f,
|
* After fdderivs(chi), subtract Christoffel*chi_deriv, compute conformal factor f,
|
||||||
* then add chi contribution to Rxx..Rzz.
|
* then add chi contribution to Rxx..Rzz.
|
||||||
*/
|
*/
|
||||||
|
__global__ __launch_bounds__(128, 4)
|
||||||
|
void kern_phase12_13_chi_correction_fused(
|
||||||
|
const double* __restrict__ chi,
|
||||||
|
const double* __restrict__ chin1,
|
||||||
|
const double* __restrict__ chix, const double* __restrict__ chiy,
|
||||||
|
const double* __restrict__ chiz,
|
||||||
|
const double* __restrict__ gxx, const double* __restrict__ gxy,
|
||||||
|
const double* __restrict__ gxz, const double* __restrict__ gyy,
|
||||||
|
const double* __restrict__ gyz, const double* __restrict__ gzz,
|
||||||
|
const double* __restrict__ gupxx, const double* __restrict__ gupxy,
|
||||||
|
const double* __restrict__ gupxz, const double* __restrict__ gupyy,
|
||||||
|
const double* __restrict__ gupyz, const double* __restrict__ gupzz,
|
||||||
|
const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
|
||||||
|
const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
|
||||||
|
const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
|
||||||
|
const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
|
||||||
|
const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
|
||||||
|
const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
|
||||||
|
const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
|
||||||
|
const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
|
||||||
|
const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
|
||||||
|
double* __restrict__ Rxx, double* __restrict__ Rxy,
|
||||||
|
double* __restrict__ Rxz, double* __restrict__ Ryy,
|
||||||
|
double* __restrict__ Ryz, double* __restrict__ Rzz)
|
||||||
|
{
|
||||||
|
const double TWO = 2.0;
|
||||||
|
const double F3o2 = 1.5;
|
||||||
|
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||||
|
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
|
||||||
|
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
|
||||||
|
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (tid >= d_gp.all) return;
|
||||||
|
|
||||||
|
const int i0 = tid % nx;
|
||||||
|
const int j0 = (tid / nx) % ny;
|
||||||
|
const int k0 = tid / (nx * ny);
|
||||||
|
|
||||||
|
double cxx = 0.0, cxy = 0.0, cxz = 0.0;
|
||||||
|
double cyy = 0.0, cyz = 0.0, czz = 0.0;
|
||||||
|
|
||||||
|
if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
|
||||||
|
const int iF = i0 + 1;
|
||||||
|
const int jF = j0 + 1;
|
||||||
|
const int kF = k0 + 1;
|
||||||
|
|
||||||
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||||
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||||
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||||
|
{
|
||||||
|
const double c = fetch_sym_ord2_direct(chi, iF, jF, kF, 1, 1, 1);
|
||||||
|
cxx = d_gp.Fdxdx * (
|
||||||
|
-fetch_sym_ord2_direct(chi, iF - 2, jF, kF, 1, 1, 1)
|
||||||
|
+16.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
|
||||||
|
-30.0 * c
|
||||||
|
+16.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF, 1, 1, 1));
|
||||||
|
cyy = d_gp.Fdydy * (
|
||||||
|
-fetch_sym_ord2_direct(chi, iF, jF - 2, kF, 1, 1, 1)
|
||||||
|
+16.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
|
||||||
|
-30.0 * c
|
||||||
|
+16.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF, 1, 1, 1));
|
||||||
|
czz = d_gp.Fdzdz * (
|
||||||
|
-fetch_sym_ord2_direct(chi, iF, jF, kF - 2, 1, 1, 1)
|
||||||
|
+16.0 * fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
|
||||||
|
-30.0 * c
|
||||||
|
+16.0 * fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF, jF, kF + 2, 1, 1, 1));
|
||||||
|
|
||||||
|
const double t_jm2 =
|
||||||
|
fetch_sym_ord2_direct(chi, iF - 2, jF - 2, kF, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF - 2, kF, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF - 2, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF + 2, jF - 2, kF, 1, 1, 1);
|
||||||
|
const double t_jm1 =
|
||||||
|
fetch_sym_ord2_direct(chi, iF - 2, jF - 1, kF, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF - 1, kF, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF - 1, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF + 2, jF - 1, kF, 1, 1, 1);
|
||||||
|
const double t_jp1 =
|
||||||
|
fetch_sym_ord2_direct(chi, iF - 2, jF + 1, kF, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF + 1, kF, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF + 1, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF + 2, jF + 1, kF, 1, 1, 1);
|
||||||
|
const double t_jp2 =
|
||||||
|
fetch_sym_ord2_direct(chi, iF - 2, jF + 2, kF, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF + 2, kF, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF + 2, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF + 2, jF + 2, kF, 1, 1, 1);
|
||||||
|
cxy = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);
|
||||||
|
|
||||||
|
const double t_km2_x =
|
||||||
|
fetch_sym_ord2_direct(chi, iF - 2, jF, kF - 2, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 2, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 2, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF - 2, 1, 1, 1);
|
||||||
|
const double t_km1_x =
|
||||||
|
fetch_sym_ord2_direct(chi, iF - 2, jF, kF - 1, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 1, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF - 1, 1, 1, 1);
|
||||||
|
const double t_kp1_x =
|
||||||
|
fetch_sym_ord2_direct(chi, iF - 2, jF, kF + 1, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 1, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF + 1, 1, 1, 1);
|
||||||
|
const double t_kp2_x =
|
||||||
|
fetch_sym_ord2_direct(chi, iF - 2, jF, kF + 2, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 2, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 2, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF + 2, 1, 1, 1);
|
||||||
|
cxz = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);
|
||||||
|
|
||||||
|
const double t_km2_y =
|
||||||
|
fetch_sym_ord2_direct(chi, iF, jF - 2, kF - 2, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 2, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 2, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF - 2, 1, 1, 1);
|
||||||
|
const double t_km1_y =
|
||||||
|
fetch_sym_ord2_direct(chi, iF, jF - 2, kF - 1, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 1, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF - 1, 1, 1, 1);
|
||||||
|
const double t_kp1_y =
|
||||||
|
fetch_sym_ord2_direct(chi, iF, jF - 2, kF + 1, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF + 1, 1, 1, 1);
|
||||||
|
const double t_kp2_y =
|
||||||
|
fetch_sym_ord2_direct(chi, iF, jF - 2, kF + 2, 1, 1, 1)
|
||||||
|
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 2, 1, 1, 1)
|
||||||
|
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 2, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF + 2, 1, 1, 1);
|
||||||
|
cyz = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
|
||||||
|
}
|
||||||
|
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
||||||
|
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
||||||
|
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
||||||
|
{
|
||||||
|
const double c = fetch_sym_ord2_direct(chi, iF, jF, kF, 1, 1, 1);
|
||||||
|
cxx = d_gp.Sdxdx * (
|
||||||
|
fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
|
||||||
|
- 2.0 * c
|
||||||
|
+ fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1));
|
||||||
|
cyy = d_gp.Sdydy * (
|
||||||
|
fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
|
||||||
|
- 2.0 * c
|
||||||
|
+ fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1));
|
||||||
|
czz = d_gp.Sdzdz * (
|
||||||
|
fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
|
||||||
|
- 2.0 * c
|
||||||
|
+ fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1));
|
||||||
|
cxy = d_gp.Sdxdy * (
|
||||||
|
fetch_sym_ord2_direct(chi, iF - 1, jF - 1, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF + 1, jF - 1, kF, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF - 1, jF + 1, kF, 1, 1, 1)
|
||||||
|
+ fetch_sym_ord2_direct(chi, iF + 1, jF + 1, kF, 1, 1, 1));
|
||||||
|
cxz = d_gp.Sdxdz * (
|
||||||
|
fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 1, 1, 1, 1)
|
||||||
|
+ fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 1, 1, 1, 1));
|
||||||
|
cyz = d_gp.Sdydz * (
|
||||||
|
fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 1, 1, 1, 1)
|
||||||
|
- fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||||
|
+ fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const double cx = chix[tid];
|
||||||
|
const double cy = chiy[tid];
|
||||||
|
const double cz = chiz[tid];
|
||||||
|
const double c1 = chin1[tid];
|
||||||
|
|
||||||
|
cxx -= Gxxx[tid] * cx + Gyxx[tid] * cy + Gzxx[tid] * cz;
|
||||||
|
cxy -= Gxxy[tid] * cx + Gyxy[tid] * cy + Gzxy[tid] * cz;
|
||||||
|
cxz -= Gxxz[tid] * cx + Gyxz[tid] * cy + Gzxz[tid] * cz;
|
||||||
|
cyy -= Gxyy[tid] * cx + Gyyy[tid] * cy + Gzyy[tid] * cz;
|
||||||
|
cyz -= Gxyz[tid] * cx + Gyyz[tid] * cy + Gzyz[tid] * cz;
|
||||||
|
czz -= Gxzz[tid] * cx + Gyzz[tid] * cy + Gzzz[tid] * cz;
|
||||||
|
|
||||||
|
const double uxx = gupxx[tid], uxy = gupxy[tid], uxz = gupxz[tid];
|
||||||
|
const double uyy = gupyy[tid], uyz = gupyz[tid], uzz = gupzz[tid];
|
||||||
|
const double f_val = uxx * (cxx - F3o2 / c1 * cx * cx)
|
||||||
|
+ uyy * (cyy - F3o2 / c1 * cy * cy)
|
||||||
|
+ uzz * (czz - F3o2 / c1 * cz * cz)
|
||||||
|
+ TWO * uxy * (cxy - F3o2 / c1 * cx * cy)
|
||||||
|
+ TWO * uxz * (cxz - F3o2 / c1 * cx * cz)
|
||||||
|
+ TWO * uyz * (cyz - F3o2 / c1 * cy * cz);
|
||||||
|
|
||||||
|
const double inv2c = 1.0 / (c1 * TWO);
|
||||||
|
Rxx[tid] += (cxx - cx * cx * inv2c + gxx[tid] * f_val) * inv2c;
|
||||||
|
Ryy[tid] += (cyy - cy * cy * inv2c + gyy[tid] * f_val) * inv2c;
|
||||||
|
Rzz[tid] += (czz - cz * cz * inv2c + gzz[tid] * f_val) * inv2c;
|
||||||
|
Rxy[tid] += (cxy - cx * cy * inv2c + gxy[tid] * f_val) * inv2c;
|
||||||
|
Rxz[tid] += (cxz - cx * cz * inv2c + gxz[tid] * f_val) * inv2c;
|
||||||
|
Ryz[tid] += (cyz - cy * cz * inv2c + gyz[tid] * f_val) * inv2c;
|
||||||
|
}
|
||||||
|
|
||||||
__global__ __launch_bounds__(128, 4)
|
__global__ __launch_bounds__(128, 4)
|
||||||
void kern_phase13_chi_correction(
|
void kern_phase13_chi_correction(
|
||||||
const double* __restrict__ chin1,
|
const double* __restrict__ chin1,
|
||||||
@@ -3726,41 +4311,21 @@ static void launch_rhs_pipeline(int all, double eps, int co)
|
|||||||
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
|
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
|
||||||
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
|
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
|
||||||
|
|
||||||
gpu_fdderivs(D(S_dxx), D(S_fxx),D(S_fxy),D(S_fxz),
|
{
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)};
|
||||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
double *dst_fields[] = {D(S_Rxx), D(S_Ryy), D(S_Rzz), D(S_Rxy), D(S_Rxz), D(S_Ryz)};
|
||||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
const int soa_signs[] = {
|
||||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxx));
|
(int)SYM, (int)SYM, (int)SYM,
|
||||||
|
(int)SYM, (int)SYM, (int)SYM,
|
||||||
gpu_fdderivs(D(S_dyy), D(S_fxx),D(S_fxy),D(S_fxz),
|
(int)SYM, (int)SYM, (int)SYM,
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
(int)ANTI, (int)ANTI, (int)SYM,
|
||||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
(int)ANTI, (int)SYM, (int)ANTI,
|
||||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
(int)SYM, (int)ANTI, (int)ANTI
|
||||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryy));
|
};
|
||||||
|
gpu_phase10_ricci_batch(D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
||||||
gpu_fdderivs(D(S_dzz), D(S_fxx),D(S_fxy),D(S_fxz),
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
src_fields, dst_fields, soa_signs, all);
|
||||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
}
|
||||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
||||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rzz));
|
|
||||||
|
|
||||||
gpu_fdderivs(D(S_gxy), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,ANTI,SYM, all);
|
|
||||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
|
||||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
||||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxy));
|
|
||||||
|
|
||||||
gpu_fdderivs(D(S_gxz), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,SYM,ANTI, all);
|
|
||||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
|
||||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
||||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxz));
|
|
||||||
|
|
||||||
gpu_fdderivs(D(S_gyz), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,ANTI,ANTI, all);
|
|
||||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
|
||||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
||||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryz));
|
|
||||||
|
|
||||||
kern_phase11_ricci_diag<<<grid(all),BLK>>>(
|
kern_phase11_ricci_diag<<<grid(all),BLK>>>(
|
||||||
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
|
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
|
||||||
@@ -3798,11 +4363,8 @@ static void launch_rhs_pipeline(int all, double eps, int co)
|
|||||||
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
|
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
|
||||||
D(S_Rxy),D(S_Rxz),D(S_Ryz));
|
D(S_Rxy),D(S_Rxz),D(S_Ryz));
|
||||||
|
|
||||||
gpu_fdderivs(D(S_chi), D(S_fxx),D(S_fxy),D(S_fxz),
|
kern_phase12_13_chi_correction_fused<<<grid((size_t)all),BLK>>>(
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
D(S_chi), D(S_chin1),
|
||||||
|
|
||||||
kern_phase13_chi_correction<<<grid(all),BLK>>>(
|
|
||||||
D(S_chin1),
|
|
||||||
D(S_chix), D(S_chiy), D(S_chiz),
|
D(S_chix), D(S_chiy), D(S_chiz),
|
||||||
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
||||||
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
||||||
@@ -3813,15 +4375,14 @@ static void launch_rhs_pipeline(int all, double eps, int co)
|
|||||||
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
||||||
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
||||||
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
||||||
D(S_fxx), D(S_fxy), D(S_fxz),
|
|
||||||
D(S_fyy), D(S_fyz), D(S_fzz),
|
|
||||||
D(S_Rxx), D(S_Rxy), D(S_Rxz),
|
D(S_Rxx), D(S_Rxy), D(S_Rxz),
|
||||||
D(S_Ryy), D(S_Ryz), D(S_Rzz));
|
D(S_Ryy), D(S_Ryz), D(S_Rzz));
|
||||||
|
|
||||||
gpu_fdderivs(D(S_Lap), D(S_fxx),D(S_fxy),D(S_fxz),
|
kern_phase14_lap_chi_derivs<<<grid((size_t)all),BLK>>>(
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
D(S_Lap), D(S_chi),
|
||||||
gpu_fderivs(D(S_chi), D(S_dtSfx_rhs),D(S_dtSfy_rhs),D(S_dtSfz_rhs),
|
D(S_fxx), D(S_fxy), D(S_fxz),
|
||||||
SYM,SYM,SYM, all);
|
D(S_fyy), D(S_fyz), D(S_fzz),
|
||||||
|
D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs));
|
||||||
|
|
||||||
kern_phase15_trK_Aij_gauge<<<grid(all),BLK>>>(
|
kern_phase15_trK_Aij_gauge<<<grid(all),BLK>>>(
|
||||||
D(S_alpn1), D(S_chin1),
|
D(S_alpn1), D(S_chin1),
|
||||||
@@ -3926,18 +4487,18 @@ __global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem
|
|||||||
int state_count,
|
int state_count,
|
||||||
int all)
|
int all)
|
||||||
{
|
{
|
||||||
const size_t total = (size_t)region_all * (size_t)state_count;
|
const int state_index = blockIdx.y;
|
||||||
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
if (state_index >= state_count) return;
|
||||||
tid < total;
|
for (int local = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
tid += (size_t)blockDim.x * gridDim.x)
|
local < region_all;
|
||||||
|
local += blockDim.x * gridDim.x)
|
||||||
{
|
{
|
||||||
const int state_index = (int)(tid / (size_t)region_all);
|
|
||||||
const int local = (int)(tid - (size_t)state_index * region_all);
|
|
||||||
const int ii = local % sx;
|
const int ii = local % sx;
|
||||||
const int jj = (local / sx) % sy;
|
const int jj = (local / sx) % sy;
|
||||||
const int kk = local / (sx * sy);
|
const int kk = local / (sx * sy);
|
||||||
const int src = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
|
const int src = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
|
||||||
dst[tid] = src_mem[(size_t)state_index * all + src];
|
dst[(size_t)state_index * region_all + local] =
|
||||||
|
src_mem[(size_t)state_index * all + src];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3950,18 +4511,18 @@ __global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem,
|
|||||||
int state_count,
|
int state_count,
|
||||||
int all)
|
int all)
|
||||||
{
|
{
|
||||||
const size_t total = (size_t)region_all * (size_t)state_count;
|
const int state_index = blockIdx.y;
|
||||||
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
if (state_index >= state_count) return;
|
||||||
tid < total;
|
for (int local = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
tid += (size_t)blockDim.x * gridDim.x)
|
local < region_all;
|
||||||
|
local += blockDim.x * gridDim.x)
|
||||||
{
|
{
|
||||||
const int state_index = (int)(tid / (size_t)region_all);
|
|
||||||
const int local = (int)(tid - (size_t)state_index * region_all);
|
|
||||||
const int ii = local % sx;
|
const int ii = local % sx;
|
||||||
const int jj = (local / sx) % sy;
|
const int jj = (local / sx) % sy;
|
||||||
const int kk = local / (sx * sy);
|
const int kk = local / (sx * sy);
|
||||||
const int dst = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
|
const int dst = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
|
||||||
dst_mem[(size_t)state_index * all + dst] = src[tid];
|
dst_mem[(size_t)state_index * all + dst] =
|
||||||
|
src[(size_t)state_index * region_all + local];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3970,15 +4531,15 @@ __global__ void kern_pack_state_subset(const double * __restrict__ src_mem,
|
|||||||
int subset_count,
|
int subset_count,
|
||||||
int all)
|
int all)
|
||||||
{
|
{
|
||||||
const size_t total = (size_t)subset_count * (size_t)all;
|
const int subset_slot = blockIdx.y;
|
||||||
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
if (subset_slot >= subset_count) return;
|
||||||
tid < total;
|
const int state_index = d_subset_state_indices[subset_slot];
|
||||||
tid += (size_t)blockDim.x * gridDim.x)
|
for (int src = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
src < all;
|
||||||
|
src += blockDim.x * gridDim.x)
|
||||||
{
|
{
|
||||||
const int subset_slot = (int)(tid / (size_t)all);
|
dst[(size_t)subset_slot * all + src] =
|
||||||
const int state_index = d_subset_state_indices[subset_slot];
|
src_mem[(size_t)state_index * all + src];
|
||||||
const int src = (int)(tid - (size_t)subset_slot * all);
|
|
||||||
dst[tid] = src_mem[(size_t)state_index * all + src];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3987,15 +4548,15 @@ __global__ void kern_unpack_state_subset(double * __restrict__ dst_mem,
|
|||||||
int subset_count,
|
int subset_count,
|
||||||
int all)
|
int all)
|
||||||
{
|
{
|
||||||
const size_t total = (size_t)subset_count * (size_t)all;
|
const int subset_slot = blockIdx.y;
|
||||||
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
if (subset_slot >= subset_count) return;
|
||||||
tid < total;
|
const int state_index = d_subset_state_indices[subset_slot];
|
||||||
tid += (size_t)blockDim.x * gridDim.x)
|
for (int dst = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
dst < all;
|
||||||
|
dst += blockDim.x * gridDim.x)
|
||||||
{
|
{
|
||||||
const int subset_slot = (int)(tid / (size_t)all);
|
dst_mem[(size_t)state_index * all + dst] =
|
||||||
const int state_index = d_subset_state_indices[subset_slot];
|
src[(size_t)subset_slot * all + dst];
|
||||||
const int dst = (int)(tid - (size_t)subset_slot * all);
|
|
||||||
dst_mem[(size_t)state_index * all + dst] = src[tid];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4078,7 +4639,9 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
|
|||||||
double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
|
double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
|
||||||
|
|
||||||
if (kind == cudaMemcpyDeviceToHost) {
|
if (kind == cudaMemcpyDeviceToHost) {
|
||||||
kern_pack_state_region_batch<<<grid(total_doubles), BLK>>>(
|
dim3 launch_grid((unsigned int)grid((size_t)region_all),
|
||||||
|
(unsigned int)state_count);
|
||||||
|
kern_pack_state_region_batch<<<launch_grid, BLK>>>(
|
||||||
ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
|
ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
|
||||||
i0, j0, k0, sx, sy, sz, region_all, state_count,
|
i0, j0, k0, sx, sy, sz, region_all, state_count,
|
||||||
ex[0] * ex[1] * ex[2]);
|
ex[0] * ex[1] * ex[2]);
|
||||||
@@ -4089,7 +4652,9 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
|
|||||||
CUDA_CHECK(cudaMemcpy(d_comm, host_buffer,
|
CUDA_CHECK(cudaMemcpy(d_comm, host_buffer,
|
||||||
total_doubles * sizeof(double),
|
total_doubles * sizeof(double),
|
||||||
cudaMemcpyHostToDevice));
|
cudaMemcpyHostToDevice));
|
||||||
kern_unpack_state_region_batch<<<grid(total_doubles), BLK>>>(
|
dim3 launch_grid((unsigned int)grid((size_t)region_all),
|
||||||
|
(unsigned int)state_count);
|
||||||
|
kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
|
||||||
ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
|
ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
|
||||||
i0, j0, k0, sx, sy, sz, region_all, state_count,
|
i0, j0, k0, sx, sy, sz, region_all, state_count,
|
||||||
ex[0] * ex[1] * ex[2]);
|
ex[0] * ex[1] * ex[2]);
|
||||||
@@ -4142,7 +4707,8 @@ static void copy_state_subset(void *block_tag,
|
|||||||
(size_t)active_count * sizeof(int),
|
(size_t)active_count * sizeof(int),
|
||||||
0, cudaMemcpyHostToDevice));
|
0, cudaMemcpyHostToDevice));
|
||||||
if (kind == cudaMemcpyDeviceToHost) {
|
if (kind == cudaMemcpyDeviceToHost) {
|
||||||
kern_pack_state_subset<<<grid(total_doubles), BLK>>>(
|
dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count);
|
||||||
|
kern_pack_state_subset<<<launch_grid, BLK>>>(
|
||||||
ctx.d_state_curr_mem, d_comm, active_count, (int)all);
|
ctx.d_state_curr_mem, d_comm, active_count, (int)all);
|
||||||
CUDA_CHECK(cudaMemcpy(h_comm, d_comm,
|
CUDA_CHECK(cudaMemcpy(h_comm, d_comm,
|
||||||
total_doubles * sizeof(double),
|
total_doubles * sizeof(double),
|
||||||
@@ -4161,7 +4727,8 @@ static void copy_state_subset(void *block_tag,
|
|||||||
CUDA_CHECK(cudaMemcpy(d_comm, h_comm,
|
CUDA_CHECK(cudaMemcpy(d_comm, h_comm,
|
||||||
total_doubles * sizeof(double),
|
total_doubles * sizeof(double),
|
||||||
cudaMemcpyHostToDevice));
|
cudaMemcpyHostToDevice));
|
||||||
kern_unpack_state_subset<<<grid(total_doubles), BLK>>>(
|
dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count);
|
||||||
|
kern_unpack_state_subset<<<launch_grid, BLK>>>(
|
||||||
ctx.d_state_curr_mem, d_comm, active_count, (int)all);
|
ctx.d_state_curr_mem, d_comm, active_count, (int)all);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -4378,41 +4945,21 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
|
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
|
||||||
|
|
||||||
/* Phase 10: 6x fdderivs(metric) + Ricci contract */
|
/* Phase 10: 6x fdderivs(metric) + Ricci contract */
|
||||||
gpu_fdderivs(D(S_dxx), D(S_fxx),D(S_fxy),D(S_fxz),
|
{
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)};
|
||||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
double *dst_fields[] = {D(S_Rxx), D(S_Ryy), D(S_Rzz), D(S_Rxy), D(S_Rxz), D(S_Ryz)};
|
||||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
const int soa_signs[] = {
|
||||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxx));
|
(int)SYM, (int)SYM, (int)SYM,
|
||||||
|
(int)SYM, (int)SYM, (int)SYM,
|
||||||
gpu_fdderivs(D(S_dyy), D(S_fxx),D(S_fxy),D(S_fxz),
|
(int)SYM, (int)SYM, (int)SYM,
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
(int)ANTI, (int)ANTI, (int)SYM,
|
||||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
(int)ANTI, (int)SYM, (int)ANTI,
|
||||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
(int)SYM, (int)ANTI, (int)ANTI
|
||||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryy));
|
};
|
||||||
|
gpu_phase10_ricci_batch(D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
||||||
gpu_fdderivs(D(S_dzz), D(S_fxx),D(S_fxy),D(S_fxz),
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
src_fields, dst_fields, soa_signs, all);
|
||||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
}
|
||||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
||||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rzz));
|
|
||||||
|
|
||||||
gpu_fdderivs(D(S_gxy), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,ANTI,SYM, all);
|
|
||||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
|
||||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
||||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxy));
|
|
||||||
|
|
||||||
gpu_fdderivs(D(S_gxz), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,SYM,ANTI, all);
|
|
||||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
|
||||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
||||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxz));
|
|
||||||
|
|
||||||
gpu_fdderivs(D(S_gyz), D(S_fxx),D(S_fxy),D(S_fxz),
|
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,ANTI,ANTI, all);
|
|
||||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
|
||||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
||||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryz));
|
|
||||||
|
|
||||||
/* Phase 11: Ricci assembly (diagonal + off-diagonal) */
|
/* Phase 11: Ricci assembly (diagonal + off-diagonal) */
|
||||||
kern_phase11_ricci_diag<<<grid(all),BLK>>>(
|
kern_phase11_ricci_diag<<<grid(all),BLK>>>(
|
||||||
@@ -4452,16 +4999,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
D(S_Rxy),D(S_Rxz),D(S_Ryz));
|
D(S_Rxy),D(S_Rxz),D(S_Ryz));
|
||||||
|
|
||||||
/* ============================================================ */
|
/* ============================================================ */
|
||||||
/* Phase 12: fdderivs(chi) */
|
/* Phase 12/13: chi fdderivs + chi correction */
|
||||||
/* ============================================================ */
|
/* ============================================================ */
|
||||||
gpu_fdderivs(D(S_chi), D(S_fxx),D(S_fxy),D(S_fxz),
|
kern_phase12_13_chi_correction_fused<<<grid((size_t)all),BLK>>>(
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
D(S_chi), D(S_chin1),
|
||||||
|
|
||||||
/* ============================================================ */
|
|
||||||
/* Phase 13: chi correction to Ricci */
|
|
||||||
/* ============================================================ */
|
|
||||||
kern_phase13_chi_correction<<<grid(all),BLK>>>(
|
|
||||||
D(S_chin1),
|
|
||||||
D(S_chix), D(S_chiy), D(S_chiz),
|
D(S_chix), D(S_chiy), D(S_chiz),
|
||||||
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
||||||
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
||||||
@@ -4472,18 +5013,17 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
||||||
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
||||||
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
||||||
D(S_fxx), D(S_fxy), D(S_fxz),
|
|
||||||
D(S_fyy), D(S_fyz), D(S_fzz),
|
|
||||||
D(S_Rxx), D(S_Rxy), D(S_Rxz),
|
D(S_Rxx), D(S_Rxy), D(S_Rxz),
|
||||||
D(S_Ryy), D(S_Ryz), D(S_Rzz));
|
D(S_Ryy), D(S_Ryz), D(S_Rzz));
|
||||||
|
|
||||||
/* ============================================================ */
|
/* ============================================================ */
|
||||||
/* Phase 14: fdderivs(Lap) + fderivs(chi) */
|
/* Phase 14: fdderivs(Lap) + fderivs(chi) */
|
||||||
/* ============================================================ */
|
/* ============================================================ */
|
||||||
gpu_fdderivs(D(S_Lap), D(S_fxx),D(S_fxy),D(S_fxz),
|
kern_phase14_lap_chi_derivs<<<grid((size_t)all),BLK>>>(
|
||||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
D(S_Lap), D(S_chi),
|
||||||
gpu_fderivs(D(S_chi), D(S_dtSfx_rhs),D(S_dtSfy_rhs),D(S_dtSfz_rhs),
|
D(S_fxx), D(S_fxy), D(S_fxz),
|
||||||
SYM,SYM,SYM, all);
|
D(S_fyy), D(S_fyz), D(S_fzz),
|
||||||
|
D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs));
|
||||||
|
|
||||||
/* ============================================================ */
|
/* ============================================================ */
|
||||||
/* Phase 15: trK_rhs, Aij_rhs, gauge */
|
/* Phase 15: trK_rhs, Aij_rhs, gauge */
|
||||||
|
|||||||
Reference in New Issue
Block a user