Add Phase-10 Ricci kernels and batch launch flow
This commit is contained in:
@@ -1327,6 +1327,14 @@ struct FDDerivTables {
|
||||
int soa_signs[3 * BSSN_STATE_COUNT];
|
||||
};
|
||||
|
||||
static constexpr int PHASE10_METRIC_FIELD_COUNT = 6;
|
||||
|
||||
struct Phase10RicciTables {
|
||||
const double *src_fields[PHASE10_METRIC_FIELD_COUNT];
|
||||
double *dst_fields[PHASE10_METRIC_FIELD_COUNT];
|
||||
int soa_signs[3 * PHASE10_METRIC_FIELD_COUNT];
|
||||
};
|
||||
|
||||
struct Rk4FinalizeTables {
|
||||
const double *f0_fields[BSSN_STATE_COUNT];
|
||||
double *rhs_fields[BSSN_STATE_COUNT];
|
||||
@@ -1670,6 +1678,383 @@ static void gpu_fdderivs_batch(int field_count,
|
||||
kern_fdderivs_batched<<<launch_grid, BLK>>>(tables, field_count);
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(128, 4)
|
||||
void kern_phase10_ricci_batched(const double * __restrict__ gupxx,
|
||||
const double * __restrict__ gupxy,
|
||||
const double * __restrict__ gupxz,
|
||||
const double * __restrict__ gupyy,
|
||||
const double * __restrict__ gupyz,
|
||||
const double * __restrict__ gupzz,
|
||||
Phase10RicciTables tables)
|
||||
{
|
||||
const int field = blockIdx.y;
|
||||
if (field >= PHASE10_METRIC_FIELD_COUNT) return;
|
||||
|
||||
const double *src = tables.src_fields[field];
|
||||
double *dst = tables.dst_fields[field];
|
||||
const int SoA0 = tables.soa_signs[3 * field + 0];
|
||||
const int SoA1 = tables.soa_signs[3 * field + 1];
|
||||
const int SoA2 = tables.soa_signs[3 * field + 2];
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
|
||||
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid >= d_gp.all) return;
|
||||
|
||||
const int i0 = tid % nx;
|
||||
const int j0 = (tid / nx) % ny;
|
||||
const int k0 = tid / (nx * ny);
|
||||
|
||||
double fxx = 0.0, fxy = 0.0, fxz = 0.0;
|
||||
double fyy = 0.0, fyz = 0.0, fzz = 0.0;
|
||||
|
||||
if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
|
||||
const int iF = i0 + 1;
|
||||
const int jF = j0 + 1;
|
||||
const int kF = k0 + 1;
|
||||
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
{
|
||||
const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
|
||||
fxx = d_gp.Fdxdx * (
|
||||
-fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
|
||||
+16.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
||||
-30.0 * c
|
||||
+16.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
|
||||
fyy = d_gp.Fdydy * (
|
||||
-fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||
+16.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
-30.0 * c
|
||||
+16.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
|
||||
fzz = d_gp.Fdzdz * (
|
||||
-fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||
+16.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
-30.0 * c
|
||||
+16.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
|
||||
|
||||
const double t_jm2 =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF - 2, kF, SoA0, SoA1, SoA2);
|
||||
const double t_jm1 =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF - 1, kF, SoA0, SoA1, SoA2);
|
||||
const double t_jp1 =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF + 1, kF, SoA0, SoA1, SoA2);
|
||||
const double t_jp2 =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF + 2, kF, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 2, kF, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 2, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF + 2, kF, SoA0, SoA1, SoA2);
|
||||
fxy = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);
|
||||
|
||||
const double t_km2_x =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF - 2, SoA0, SoA1, SoA2);
|
||||
const double t_km1_x =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF - 1, SoA0, SoA1, SoA2);
|
||||
const double t_kp1_x =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF + 1, SoA0, SoA1, SoA2);
|
||||
const double t_kp2_x =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF, kF + 2, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 2, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 2, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF + 2, SoA0, SoA1, SoA2);
|
||||
fxz = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);
|
||||
|
||||
const double t_km2_y =
|
||||
fetch_sym_ord2_direct(src, iF, jF - 2, kF - 2, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 2, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 2, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF - 2, SoA0, SoA1, SoA2);
|
||||
const double t_km1_y =
|
||||
fetch_sym_ord2_direct(src, iF, jF - 2, kF - 1, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF - 1, SoA0, SoA1, SoA2);
|
||||
const double t_kp1_y =
|
||||
fetch_sym_ord2_direct(src, iF, jF - 2, kF + 1, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF + 1, SoA0, SoA1, SoA2);
|
||||
const double t_kp2_y =
|
||||
fetch_sym_ord2_direct(src, iF, jF - 2, kF + 2, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 2, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 2, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF + 2, SoA0, SoA1, SoA2);
|
||||
fyz = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
|
||||
}
|
||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
||||
{
|
||||
const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
|
||||
fxx = d_gp.Sdxdx * (
|
||||
fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
||||
- 2.0 * c
|
||||
+ fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
|
||||
fyy = d_gp.Sdydy * (
|
||||
fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
- 2.0 * c
|
||||
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
|
||||
fzz = d_gp.Sdzdz * (
|
||||
fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
- 2.0 * c
|
||||
+ fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
|
||||
fxy = d_gp.Sdxdy * (
|
||||
fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||
+ fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2));
|
||||
fxz = d_gp.Sdxdz * (
|
||||
fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||
+ fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2));
|
||||
fyz = d_gp.Sdydz * (
|
||||
fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
|
||||
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
|
||||
}
|
||||
}
|
||||
|
||||
dst[tid] = gupxx[tid] * fxx + gupyy[tid] * fyy + gupzz[tid] * fzz
|
||||
+ 2.0 * (gupxy[tid] * fxy + gupxz[tid] * fxz + gupyz[tid] * fyz);
|
||||
}
|
||||
|
||||
static void gpu_phase10_ricci_batch(const double *gupxx,
|
||||
const double *gupxy,
|
||||
const double *gupxz,
|
||||
const double *gupyy,
|
||||
const double *gupyz,
|
||||
const double *gupzz,
|
||||
double *const *src_fields,
|
||||
double *const *dst_fields,
|
||||
const int *soa_signs,
|
||||
int all)
|
||||
{
|
||||
Phase10RicciTables tables = {};
|
||||
for (int i = 0; i < PHASE10_METRIC_FIELD_COUNT; ++i) {
|
||||
tables.src_fields[i] = src_fields[i];
|
||||
tables.dst_fields[i] = dst_fields[i];
|
||||
tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0];
|
||||
tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1];
|
||||
tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2];
|
||||
}
|
||||
dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)PHASE10_METRIC_FIELD_COUNT);
|
||||
kern_phase10_ricci_batched<<<launch_grid, BLK>>>(
|
||||
gupxx, gupxy, gupxz, gupyy, gupyz, gupzz, tables);
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(128, 4)
|
||||
void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap,
|
||||
const double * __restrict__ chi,
|
||||
double * __restrict__ fxx,
|
||||
double * __restrict__ fxy,
|
||||
double * __restrict__ fxz,
|
||||
double * __restrict__ fyy,
|
||||
double * __restrict__ fyz,
|
||||
double * __restrict__ fzz,
|
||||
double * __restrict__ chix_out,
|
||||
double * __restrict__ chiy_out,
|
||||
double * __restrict__ chiz_out)
|
||||
{
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
|
||||
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid >= d_gp.all) return;
|
||||
|
||||
const int i0 = tid % nx;
|
||||
const int j0 = (tid / nx) % ny;
|
||||
const int k0 = tid / (nx * ny);
|
||||
|
||||
if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
|
||||
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
|
||||
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
||||
chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
|
||||
return;
|
||||
}
|
||||
|
||||
const int iF = i0 + 1;
|
||||
const int jF = j0 + 1;
|
||||
const int kF = k0 + 1;
|
||||
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
{
|
||||
const double lap_c = fetch_sym_ord2_direct(Lap, iF, jF, kF, 1, 1, 1);
|
||||
fxx[tid] = d_gp.Fdxdx * (
|
||||
-fetch_sym_ord2_direct(Lap, iF - 2, jF, kF, 1, 1, 1)
|
||||
+16.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF, 1, 1, 1)
|
||||
-30.0 * lap_c
|
||||
+16.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF, 1, 1, 1));
|
||||
fyy[tid] = d_gp.Fdydy * (
|
||||
-fetch_sym_ord2_direct(Lap, iF, jF - 2, kF, 1, 1, 1)
|
||||
+16.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF, 1, 1, 1)
|
||||
-30.0 * lap_c
|
||||
+16.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF, 1, 1, 1));
|
||||
fzz[tid] = d_gp.Fdzdz * (
|
||||
-fetch_sym_ord2_direct(Lap, iF, jF, kF - 2, 1, 1, 1)
|
||||
+16.0 * fetch_sym_ord2_direct(Lap, iF, jF, kF - 1, 1, 1, 1)
|
||||
-30.0 * lap_c
|
||||
+16.0 * fetch_sym_ord2_direct(Lap, iF, jF, kF + 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF, jF, kF + 2, 1, 1, 1));
|
||||
|
||||
const double t_jm2 =
|
||||
fetch_sym_ord2_direct(Lap, iF - 2, jF - 2, kF, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF - 2, kF, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF - 2, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF + 2, jF - 2, kF, 1, 1, 1);
|
||||
const double t_jm1 =
|
||||
fetch_sym_ord2_direct(Lap, iF - 2, jF - 1, kF, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF - 1, kF, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF - 1, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF + 2, jF - 1, kF, 1, 1, 1);
|
||||
const double t_jp1 =
|
||||
fetch_sym_ord2_direct(Lap, iF - 2, jF + 1, kF, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF + 1, kF, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF + 1, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF + 2, jF + 1, kF, 1, 1, 1);
|
||||
const double t_jp2 =
|
||||
fetch_sym_ord2_direct(Lap, iF - 2, jF + 2, kF, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF + 2, kF, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF + 2, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF + 2, jF + 2, kF, 1, 1, 1);
|
||||
fxy[tid] = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);
|
||||
|
||||
const double t_km2_x =
|
||||
fetch_sym_ord2_direct(Lap, iF - 2, jF, kF - 2, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 2, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 2, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF - 2, 1, 1, 1);
|
||||
const double t_km1_x =
|
||||
fetch_sym_ord2_direct(Lap, iF - 2, jF, kF - 1, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 1, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF - 1, 1, 1, 1);
|
||||
const double t_kp1_x =
|
||||
fetch_sym_ord2_direct(Lap, iF - 2, jF, kF + 1, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 1, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF + 1, 1, 1, 1);
|
||||
const double t_kp2_x =
|
||||
fetch_sym_ord2_direct(Lap, iF - 2, jF, kF + 2, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 2, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 2, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF + 2, 1, 1, 1);
|
||||
fxz[tid] = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);
|
||||
|
||||
const double t_km2_y =
|
||||
fetch_sym_ord2_direct(Lap, iF, jF - 2, kF - 2, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 2, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 2, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF - 2, 1, 1, 1);
|
||||
const double t_km1_y =
|
||||
fetch_sym_ord2_direct(Lap, iF, jF - 2, kF - 1, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 1, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF - 1, 1, 1, 1);
|
||||
const double t_kp1_y =
|
||||
fetch_sym_ord2_direct(Lap, iF, jF - 2, kF + 1, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF + 1, 1, 1, 1);
|
||||
const double t_kp2_y =
|
||||
fetch_sym_ord2_direct(Lap, iF, jF - 2, kF + 2, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 2, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 2, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF + 2, 1, 1, 1);
|
||||
fyz[tid] = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
|
||||
|
||||
chix_out[tid] = d_gp.d12dx * (
|
||||
fetch_sym_ord2_direct(chi, iF - 2, jF, kF, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF, 1, 1, 1));
|
||||
chiy_out[tid] = d_gp.d12dy * (
|
||||
fetch_sym_ord2_direct(chi, iF, jF - 2, kF, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF, 1, 1, 1));
|
||||
chiz_out[tid] = d_gp.d12dz * (
|
||||
fetch_sym_ord2_direct(chi, iF, jF, kF - 2, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF, jF, kF + 2, 1, 1, 1));
|
||||
}
|
||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
||||
{
|
||||
const double lap_c = fetch_sym_ord2_direct(Lap, iF, jF, kF, 1, 1, 1);
|
||||
fxx[tid] = d_gp.Sdxdx * (
|
||||
fetch_sym_ord2_direct(Lap, iF - 1, jF, kF, 1, 1, 1)
|
||||
- 2.0 * lap_c
|
||||
+ fetch_sym_ord2_direct(Lap, iF + 1, jF, kF, 1, 1, 1));
|
||||
fyy[tid] = d_gp.Sdydy * (
|
||||
fetch_sym_ord2_direct(Lap, iF, jF - 1, kF, 1, 1, 1)
|
||||
- 2.0 * lap_c
|
||||
+ fetch_sym_ord2_direct(Lap, iF, jF + 1, kF, 1, 1, 1));
|
||||
fzz[tid] = d_gp.Sdzdz * (
|
||||
fetch_sym_ord2_direct(Lap, iF, jF, kF - 1, 1, 1, 1)
|
||||
- 2.0 * lap_c
|
||||
+ fetch_sym_ord2_direct(Lap, iF, jF, kF + 1, 1, 1, 1));
|
||||
fxy[tid] = d_gp.Sdxdy * (
|
||||
fetch_sym_ord2_direct(Lap, iF - 1, jF - 1, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF + 1, jF - 1, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF - 1, jF + 1, kF, 1, 1, 1)
|
||||
+ fetch_sym_ord2_direct(Lap, iF + 1, jF + 1, kF, 1, 1, 1));
|
||||
fxz[tid] = d_gp.Sdxdz * (
|
||||
fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 1, 1, 1, 1)
|
||||
+ fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 1, 1, 1, 1));
|
||||
fyz[tid] = d_gp.Sdydz * (
|
||||
fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||
+ fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 1, 1, 1, 1));
|
||||
chix_out[tid] = d_gp.d2dx * (
|
||||
-fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
|
||||
+fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1));
|
||||
chiy_out[tid] = d_gp.d2dy * (
|
||||
-fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
|
||||
+fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1));
|
||||
chiz_out[tid] = d_gp.d2dz * (
|
||||
-fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
|
||||
+fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1));
|
||||
}
|
||||
else {
|
||||
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
|
||||
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
||||
chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
/* Combined ord=3 advection + KO dissipation.
|
||||
* When advection and KO use the same source field, symmetry packing is shared.
|
||||
* If they differ (e.g. gxx advection + dxx KO), only KO repacks.
|
||||
@@ -2987,6 +3372,206 @@ void kern_phase11_ricci_offdiag(
|
||||
* After fdderivs(chi), subtract Christoffel*chi_deriv, compute conformal factor f,
|
||||
* then add chi contribution to Rxx..Rzz.
|
||||
*/
|
||||
__global__ __launch_bounds__(128, 4)
|
||||
void kern_phase12_13_chi_correction_fused(
|
||||
const double* __restrict__ chi,
|
||||
const double* __restrict__ chin1,
|
||||
const double* __restrict__ chix, const double* __restrict__ chiy,
|
||||
const double* __restrict__ chiz,
|
||||
const double* __restrict__ gxx, const double* __restrict__ gxy,
|
||||
const double* __restrict__ gxz, const double* __restrict__ gyy,
|
||||
const double* __restrict__ gyz, const double* __restrict__ gzz,
|
||||
const double* __restrict__ gupxx, const double* __restrict__ gupxy,
|
||||
const double* __restrict__ gupxz, const double* __restrict__ gupyy,
|
||||
const double* __restrict__ gupyz, const double* __restrict__ gupzz,
|
||||
const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
|
||||
const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
|
||||
const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
|
||||
const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
|
||||
const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
|
||||
const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
|
||||
const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
|
||||
const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
|
||||
const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
|
||||
double* __restrict__ Rxx, double* __restrict__ Rxy,
|
||||
double* __restrict__ Rxz, double* __restrict__ Ryy,
|
||||
double* __restrict__ Ryz, double* __restrict__ Rzz)
|
||||
{
|
||||
const double TWO = 2.0;
|
||||
const double F3o2 = 1.5;
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
|
||||
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid >= d_gp.all) return;
|
||||
|
||||
const int i0 = tid % nx;
|
||||
const int j0 = (tid / nx) % ny;
|
||||
const int k0 = tid / (nx * ny);
|
||||
|
||||
double cxx = 0.0, cxy = 0.0, cxz = 0.0;
|
||||
double cyy = 0.0, cyz = 0.0, czz = 0.0;
|
||||
|
||||
if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
|
||||
const int iF = i0 + 1;
|
||||
const int jF = j0 + 1;
|
||||
const int kF = k0 + 1;
|
||||
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
{
|
||||
const double c = fetch_sym_ord2_direct(chi, iF, jF, kF, 1, 1, 1);
|
||||
cxx = d_gp.Fdxdx * (
|
||||
-fetch_sym_ord2_direct(chi, iF - 2, jF, kF, 1, 1, 1)
|
||||
+16.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
|
||||
-30.0 * c
|
||||
+16.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF, 1, 1, 1));
|
||||
cyy = d_gp.Fdydy * (
|
||||
-fetch_sym_ord2_direct(chi, iF, jF - 2, kF, 1, 1, 1)
|
||||
+16.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
|
||||
-30.0 * c
|
||||
+16.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF, 1, 1, 1));
|
||||
czz = d_gp.Fdzdz * (
|
||||
-fetch_sym_ord2_direct(chi, iF, jF, kF - 2, 1, 1, 1)
|
||||
+16.0 * fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
|
||||
-30.0 * c
|
||||
+16.0 * fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF, jF, kF + 2, 1, 1, 1));
|
||||
|
||||
const double t_jm2 =
|
||||
fetch_sym_ord2_direct(chi, iF - 2, jF - 2, kF, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF - 2, kF, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF - 2, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF + 2, jF - 2, kF, 1, 1, 1);
|
||||
const double t_jm1 =
|
||||
fetch_sym_ord2_direct(chi, iF - 2, jF - 1, kF, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF - 1, kF, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF - 1, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF + 2, jF - 1, kF, 1, 1, 1);
|
||||
const double t_jp1 =
|
||||
fetch_sym_ord2_direct(chi, iF - 2, jF + 1, kF, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF + 1, kF, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF + 1, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF + 2, jF + 1, kF, 1, 1, 1);
|
||||
const double t_jp2 =
|
||||
fetch_sym_ord2_direct(chi, iF - 2, jF + 2, kF, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF + 2, kF, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF + 2, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF + 2, jF + 2, kF, 1, 1, 1);
|
||||
cxy = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);
|
||||
|
||||
const double t_km2_x =
|
||||
fetch_sym_ord2_direct(chi, iF - 2, jF, kF - 2, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 2, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 2, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF - 2, 1, 1, 1);
|
||||
const double t_km1_x =
|
||||
fetch_sym_ord2_direct(chi, iF - 2, jF, kF - 1, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 1, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF - 1, 1, 1, 1);
|
||||
const double t_kp1_x =
|
||||
fetch_sym_ord2_direct(chi, iF - 2, jF, kF + 1, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 1, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF + 1, 1, 1, 1);
|
||||
const double t_kp2_x =
|
||||
fetch_sym_ord2_direct(chi, iF - 2, jF, kF + 2, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 2, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 2, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF + 2, 1, 1, 1);
|
||||
cxz = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);
|
||||
|
||||
const double t_km2_y =
|
||||
fetch_sym_ord2_direct(chi, iF, jF - 2, kF - 2, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 2, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 2, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF - 2, 1, 1, 1);
|
||||
const double t_km1_y =
|
||||
fetch_sym_ord2_direct(chi, iF, jF - 2, kF - 1, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 1, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF - 1, 1, 1, 1);
|
||||
const double t_kp1_y =
|
||||
fetch_sym_ord2_direct(chi, iF, jF - 2, kF + 1, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF + 1, 1, 1, 1);
|
||||
const double t_kp2_y =
|
||||
fetch_sym_ord2_direct(chi, iF, jF - 2, kF + 2, 1, 1, 1)
|
||||
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 2, 1, 1, 1)
|
||||
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 2, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF + 2, 1, 1, 1);
|
||||
cyz = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
|
||||
}
|
||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
||||
{
|
||||
const double c = fetch_sym_ord2_direct(chi, iF, jF, kF, 1, 1, 1);
|
||||
cxx = d_gp.Sdxdx * (
|
||||
fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
|
||||
- 2.0 * c
|
||||
+ fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1));
|
||||
cyy = d_gp.Sdydy * (
|
||||
fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
|
||||
- 2.0 * c
|
||||
+ fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1));
|
||||
czz = d_gp.Sdzdz * (
|
||||
fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
|
||||
- 2.0 * c
|
||||
+ fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1));
|
||||
cxy = d_gp.Sdxdy * (
|
||||
fetch_sym_ord2_direct(chi, iF - 1, jF - 1, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF + 1, jF - 1, kF, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF - 1, jF + 1, kF, 1, 1, 1)
|
||||
+ fetch_sym_ord2_direct(chi, iF + 1, jF + 1, kF, 1, 1, 1));
|
||||
cxz = d_gp.Sdxdz * (
|
||||
fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 1, 1, 1, 1)
|
||||
+ fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 1, 1, 1, 1));
|
||||
cyz = d_gp.Sdydz * (
|
||||
fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 1, 1, 1, 1)
|
||||
- fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1)
|
||||
+ fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1));
|
||||
}
|
||||
}
|
||||
|
||||
const double cx = chix[tid];
|
||||
const double cy = chiy[tid];
|
||||
const double cz = chiz[tid];
|
||||
const double c1 = chin1[tid];
|
||||
|
||||
cxx -= Gxxx[tid] * cx + Gyxx[tid] * cy + Gzxx[tid] * cz;
|
||||
cxy -= Gxxy[tid] * cx + Gyxy[tid] * cy + Gzxy[tid] * cz;
|
||||
cxz -= Gxxz[tid] * cx + Gyxz[tid] * cy + Gzxz[tid] * cz;
|
||||
cyy -= Gxyy[tid] * cx + Gyyy[tid] * cy + Gzyy[tid] * cz;
|
||||
cyz -= Gxyz[tid] * cx + Gyyz[tid] * cy + Gzyz[tid] * cz;
|
||||
czz -= Gxzz[tid] * cx + Gyzz[tid] * cy + Gzzz[tid] * cz;
|
||||
|
||||
const double uxx = gupxx[tid], uxy = gupxy[tid], uxz = gupxz[tid];
|
||||
const double uyy = gupyy[tid], uyz = gupyz[tid], uzz = gupzz[tid];
|
||||
const double f_val = uxx * (cxx - F3o2 / c1 * cx * cx)
|
||||
+ uyy * (cyy - F3o2 / c1 * cy * cy)
|
||||
+ uzz * (czz - F3o2 / c1 * cz * cz)
|
||||
+ TWO * uxy * (cxy - F3o2 / c1 * cx * cy)
|
||||
+ TWO * uxz * (cxz - F3o2 / c1 * cx * cz)
|
||||
+ TWO * uyz * (cyz - F3o2 / c1 * cy * cz);
|
||||
|
||||
const double inv2c = 1.0 / (c1 * TWO);
|
||||
Rxx[tid] += (cxx - cx * cx * inv2c + gxx[tid] * f_val) * inv2c;
|
||||
Ryy[tid] += (cyy - cy * cy * inv2c + gyy[tid] * f_val) * inv2c;
|
||||
Rzz[tid] += (czz - cz * cz * inv2c + gzz[tid] * f_val) * inv2c;
|
||||
Rxy[tid] += (cxy - cx * cy * inv2c + gxy[tid] * f_val) * inv2c;
|
||||
Rxz[tid] += (cxz - cx * cz * inv2c + gxz[tid] * f_val) * inv2c;
|
||||
Ryz[tid] += (cyz - cy * cz * inv2c + gyz[tid] * f_val) * inv2c;
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(128, 4)
|
||||
void kern_phase13_chi_correction(
|
||||
const double* __restrict__ chin1,
|
||||
@@ -3726,41 +4311,21 @@ static void launch_rhs_pipeline(int all, double eps, int co)
|
||||
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
|
||||
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
|
||||
|
||||
gpu_fdderivs(D(S_dxx), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxx));
|
||||
|
||||
gpu_fdderivs(D(S_dyy), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryy));
|
||||
|
||||
gpu_fdderivs(D(S_dzz), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rzz));
|
||||
|
||||
gpu_fdderivs(D(S_gxy), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,ANTI,SYM, all);
|
||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxy));
|
||||
|
||||
gpu_fdderivs(D(S_gxz), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,SYM,ANTI, all);
|
||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxz));
|
||||
|
||||
gpu_fdderivs(D(S_gyz), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,ANTI,ANTI, all);
|
||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryz));
|
||||
{
|
||||
double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)};
|
||||
double *dst_fields[] = {D(S_Rxx), D(S_Ryy), D(S_Rzz), D(S_Rxy), D(S_Rxz), D(S_Ryz)};
|
||||
const int soa_signs[] = {
|
||||
(int)SYM, (int)SYM, (int)SYM,
|
||||
(int)SYM, (int)SYM, (int)SYM,
|
||||
(int)SYM, (int)SYM, (int)SYM,
|
||||
(int)ANTI, (int)ANTI, (int)SYM,
|
||||
(int)ANTI, (int)SYM, (int)ANTI,
|
||||
(int)SYM, (int)ANTI, (int)ANTI
|
||||
};
|
||||
gpu_phase10_ricci_batch(D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
||||
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
||||
src_fields, dst_fields, soa_signs, all);
|
||||
}
|
||||
|
||||
kern_phase11_ricci_diag<<<grid(all),BLK>>>(
|
||||
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
|
||||
@@ -3798,11 +4363,8 @@ static void launch_rhs_pipeline(int all, double eps, int co)
|
||||
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
|
||||
D(S_Rxy),D(S_Rxz),D(S_Ryz));
|
||||
|
||||
gpu_fdderivs(D(S_chi), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
||||
|
||||
kern_phase13_chi_correction<<<grid(all),BLK>>>(
|
||||
D(S_chin1),
|
||||
kern_phase12_13_chi_correction_fused<<<grid((size_t)all),BLK>>>(
|
||||
D(S_chi), D(S_chin1),
|
||||
D(S_chix), D(S_chiy), D(S_chiz),
|
||||
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
||||
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
||||
@@ -3813,15 +4375,14 @@ static void launch_rhs_pipeline(int all, double eps, int co)
|
||||
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
||||
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
||||
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
||||
D(S_fxx), D(S_fxy), D(S_fxz),
|
||||
D(S_fyy), D(S_fyz), D(S_fzz),
|
||||
D(S_Rxx), D(S_Rxy), D(S_Rxz),
|
||||
D(S_Ryy), D(S_Ryz), D(S_Rzz));
|
||||
|
||||
gpu_fdderivs(D(S_Lap), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
||||
gpu_fderivs(D(S_chi), D(S_dtSfx_rhs),D(S_dtSfy_rhs),D(S_dtSfz_rhs),
|
||||
SYM,SYM,SYM, all);
|
||||
kern_phase14_lap_chi_derivs<<<grid((size_t)all),BLK>>>(
|
||||
D(S_Lap), D(S_chi),
|
||||
D(S_fxx), D(S_fxy), D(S_fxz),
|
||||
D(S_fyy), D(S_fyz), D(S_fzz),
|
||||
D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs));
|
||||
|
||||
kern_phase15_trK_Aij_gauge<<<grid(all),BLK>>>(
|
||||
D(S_alpn1), D(S_chin1),
|
||||
@@ -3926,18 +4487,18 @@ __global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem
|
||||
int state_count,
|
||||
int all)
|
||||
{
|
||||
const size_t total = (size_t)region_all * (size_t)state_count;
|
||||
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||
tid < total;
|
||||
tid += (size_t)blockDim.x * gridDim.x)
|
||||
const int state_index = blockIdx.y;
|
||||
if (state_index >= state_count) return;
|
||||
for (int local = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
local < region_all;
|
||||
local += blockDim.x * gridDim.x)
|
||||
{
|
||||
const int state_index = (int)(tid / (size_t)region_all);
|
||||
const int local = (int)(tid - (size_t)state_index * region_all);
|
||||
const int ii = local % sx;
|
||||
const int jj = (local / sx) % sy;
|
||||
const int kk = local / (sx * sy);
|
||||
const int src = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
|
||||
dst[tid] = src_mem[(size_t)state_index * all + src];
|
||||
dst[(size_t)state_index * region_all + local] =
|
||||
src_mem[(size_t)state_index * all + src];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3950,18 +4511,18 @@ __global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem,
|
||||
int state_count,
|
||||
int all)
|
||||
{
|
||||
const size_t total = (size_t)region_all * (size_t)state_count;
|
||||
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||
tid < total;
|
||||
tid += (size_t)blockDim.x * gridDim.x)
|
||||
const int state_index = blockIdx.y;
|
||||
if (state_index >= state_count) return;
|
||||
for (int local = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
local < region_all;
|
||||
local += blockDim.x * gridDim.x)
|
||||
{
|
||||
const int state_index = (int)(tid / (size_t)region_all);
|
||||
const int local = (int)(tid - (size_t)state_index * region_all);
|
||||
const int ii = local % sx;
|
||||
const int jj = (local / sx) % sy;
|
||||
const int kk = local / (sx * sy);
|
||||
const int dst = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
|
||||
dst_mem[(size_t)state_index * all + dst] = src[tid];
|
||||
dst_mem[(size_t)state_index * all + dst] =
|
||||
src[(size_t)state_index * region_all + local];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3970,15 +4531,15 @@ __global__ void kern_pack_state_subset(const double * __restrict__ src_mem,
|
||||
int subset_count,
|
||||
int all)
|
||||
{
|
||||
const size_t total = (size_t)subset_count * (size_t)all;
|
||||
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||
tid < total;
|
||||
tid += (size_t)blockDim.x * gridDim.x)
|
||||
const int subset_slot = blockIdx.y;
|
||||
if (subset_slot >= subset_count) return;
|
||||
const int state_index = d_subset_state_indices[subset_slot];
|
||||
for (int src = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
src < all;
|
||||
src += blockDim.x * gridDim.x)
|
||||
{
|
||||
const int subset_slot = (int)(tid / (size_t)all);
|
||||
const int state_index = d_subset_state_indices[subset_slot];
|
||||
const int src = (int)(tid - (size_t)subset_slot * all);
|
||||
dst[tid] = src_mem[(size_t)state_index * all + src];
|
||||
dst[(size_t)subset_slot * all + src] =
|
||||
src_mem[(size_t)state_index * all + src];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3987,15 +4548,15 @@ __global__ void kern_unpack_state_subset(double * __restrict__ dst_mem,
|
||||
int subset_count,
|
||||
int all)
|
||||
{
|
||||
const size_t total = (size_t)subset_count * (size_t)all;
|
||||
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||
tid < total;
|
||||
tid += (size_t)blockDim.x * gridDim.x)
|
||||
const int subset_slot = blockIdx.y;
|
||||
if (subset_slot >= subset_count) return;
|
||||
const int state_index = d_subset_state_indices[subset_slot];
|
||||
for (int dst = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
dst < all;
|
||||
dst += blockDim.x * gridDim.x)
|
||||
{
|
||||
const int subset_slot = (int)(tid / (size_t)all);
|
||||
const int state_index = d_subset_state_indices[subset_slot];
|
||||
const int dst = (int)(tid - (size_t)subset_slot * all);
|
||||
dst_mem[(size_t)state_index * all + dst] = src[tid];
|
||||
dst_mem[(size_t)state_index * all + dst] =
|
||||
src[(size_t)subset_slot * all + dst];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4078,7 +4639,9 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
|
||||
double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
|
||||
|
||||
if (kind == cudaMemcpyDeviceToHost) {
|
||||
kern_pack_state_region_batch<<<grid(total_doubles), BLK>>>(
|
||||
dim3 launch_grid((unsigned int)grid((size_t)region_all),
|
||||
(unsigned int)state_count);
|
||||
kern_pack_state_region_batch<<<launch_grid, BLK>>>(
|
||||
ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
|
||||
i0, j0, k0, sx, sy, sz, region_all, state_count,
|
||||
ex[0] * ex[1] * ex[2]);
|
||||
@@ -4089,7 +4652,9 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
|
||||
CUDA_CHECK(cudaMemcpy(d_comm, host_buffer,
|
||||
total_doubles * sizeof(double),
|
||||
cudaMemcpyHostToDevice));
|
||||
kern_unpack_state_region_batch<<<grid(total_doubles), BLK>>>(
|
||||
dim3 launch_grid((unsigned int)grid((size_t)region_all),
|
||||
(unsigned int)state_count);
|
||||
kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
|
||||
ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
|
||||
i0, j0, k0, sx, sy, sz, region_all, state_count,
|
||||
ex[0] * ex[1] * ex[2]);
|
||||
@@ -4142,7 +4707,8 @@ static void copy_state_subset(void *block_tag,
|
||||
(size_t)active_count * sizeof(int),
|
||||
0, cudaMemcpyHostToDevice));
|
||||
if (kind == cudaMemcpyDeviceToHost) {
|
||||
kern_pack_state_subset<<<grid(total_doubles), BLK>>>(
|
||||
dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count);
|
||||
kern_pack_state_subset<<<launch_grid, BLK>>>(
|
||||
ctx.d_state_curr_mem, d_comm, active_count, (int)all);
|
||||
CUDA_CHECK(cudaMemcpy(h_comm, d_comm,
|
||||
total_doubles * sizeof(double),
|
||||
@@ -4161,7 +4727,8 @@ static void copy_state_subset(void *block_tag,
|
||||
CUDA_CHECK(cudaMemcpy(d_comm, h_comm,
|
||||
total_doubles * sizeof(double),
|
||||
cudaMemcpyHostToDevice));
|
||||
kern_unpack_state_subset<<<grid(total_doubles), BLK>>>(
|
||||
dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count);
|
||||
kern_unpack_state_subset<<<launch_grid, BLK>>>(
|
||||
ctx.d_state_curr_mem, d_comm, active_count, (int)all);
|
||||
}
|
||||
}
|
||||
@@ -4378,41 +4945,21 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
|
||||
|
||||
/* Phase 10: 6x fdderivs(metric) + Ricci contract */
|
||||
gpu_fdderivs(D(S_dxx), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxx));
|
||||
|
||||
gpu_fdderivs(D(S_dyy), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryy));
|
||||
|
||||
gpu_fdderivs(D(S_dzz), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rzz));
|
||||
|
||||
gpu_fdderivs(D(S_gxy), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,ANTI,SYM, all);
|
||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxy));
|
||||
|
||||
gpu_fdderivs(D(S_gxz), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,SYM,ANTI, all);
|
||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxz));
|
||||
|
||||
gpu_fdderivs(D(S_gyz), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,ANTI,ANTI, all);
|
||||
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
|
||||
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
||||
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryz));
|
||||
{
|
||||
double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)};
|
||||
double *dst_fields[] = {D(S_Rxx), D(S_Ryy), D(S_Rzz), D(S_Rxy), D(S_Rxz), D(S_Ryz)};
|
||||
const int soa_signs[] = {
|
||||
(int)SYM, (int)SYM, (int)SYM,
|
||||
(int)SYM, (int)SYM, (int)SYM,
|
||||
(int)SYM, (int)SYM, (int)SYM,
|
||||
(int)ANTI, (int)ANTI, (int)SYM,
|
||||
(int)ANTI, (int)SYM, (int)ANTI,
|
||||
(int)SYM, (int)ANTI, (int)ANTI
|
||||
};
|
||||
gpu_phase10_ricci_batch(D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
||||
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
||||
src_fields, dst_fields, soa_signs, all);
|
||||
}
|
||||
|
||||
/* Phase 11: Ricci assembly (diagonal + off-diagonal) */
|
||||
kern_phase11_ricci_diag<<<grid(all),BLK>>>(
|
||||
@@ -4452,16 +4999,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
D(S_Rxy),D(S_Rxz),D(S_Ryz));
|
||||
|
||||
/* ============================================================ */
|
||||
/* Phase 12: fdderivs(chi) */
|
||||
/* Phase 12/13: chi fdderivs + chi correction */
|
||||
/* ============================================================ */
|
||||
gpu_fdderivs(D(S_chi), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
||||
|
||||
/* ============================================================ */
|
||||
/* Phase 13: chi correction to Ricci */
|
||||
/* ============================================================ */
|
||||
kern_phase13_chi_correction<<<grid(all),BLK>>>(
|
||||
D(S_chin1),
|
||||
kern_phase12_13_chi_correction_fused<<<grid((size_t)all),BLK>>>(
|
||||
D(S_chi), D(S_chin1),
|
||||
D(S_chix), D(S_chiy), D(S_chiz),
|
||||
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
||||
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
||||
@@ -4472,18 +5013,17 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
||||
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
||||
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
||||
D(S_fxx), D(S_fxy), D(S_fxz),
|
||||
D(S_fyy), D(S_fyz), D(S_fzz),
|
||||
D(S_Rxx), D(S_Rxy), D(S_Rxz),
|
||||
D(S_Ryy), D(S_Ryz), D(S_Rzz));
|
||||
|
||||
/* ============================================================ */
|
||||
/* Phase 14: fdderivs(Lap) + fderivs(chi) */
|
||||
/* ============================================================ */
|
||||
gpu_fdderivs(D(S_Lap), D(S_fxx),D(S_fxy),D(S_fxz),
|
||||
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
|
||||
gpu_fderivs(D(S_chi), D(S_dtSfx_rhs),D(S_dtSfy_rhs),D(S_dtSfz_rhs),
|
||||
SYM,SYM,SYM, all);
|
||||
kern_phase14_lap_chi_derivs<<<grid((size_t)all),BLK>>>(
|
||||
D(S_Lap), D(S_chi),
|
||||
D(S_fxx), D(S_fxy), D(S_fxz),
|
||||
D(S_fyy), D(S_fyz), D(S_fzz),
|
||||
D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs));
|
||||
|
||||
/* ============================================================ */
|
||||
/* Phase 15: trK_rhs, Aij_rhs, gauge */
|
||||
|
||||
Reference in New Issue
Block a user