Add Phase-10 Ricci kernels and batch launch flow

This commit is contained in:
2026-04-14 19:00:22 +08:00
parent 7191fc0b96
commit af344bf1e5

View File

@@ -1327,6 +1327,14 @@ struct FDDerivTables {
int soa_signs[3 * BSSN_STATE_COUNT];
};
static constexpr int PHASE10_METRIC_FIELD_COUNT = 6;
struct Phase10RicciTables {
const double *src_fields[PHASE10_METRIC_FIELD_COUNT];
double *dst_fields[PHASE10_METRIC_FIELD_COUNT];
int soa_signs[3 * PHASE10_METRIC_FIELD_COUNT];
};
struct Rk4FinalizeTables {
const double *f0_fields[BSSN_STATE_COUNT];
double *rhs_fields[BSSN_STATE_COUNT];
@@ -1670,6 +1678,383 @@ static void gpu_fdderivs_batch(int field_count,
kern_fdderivs_batched<<<launch_grid, BLK>>>(tables, field_count);
}
__global__ __launch_bounds__(128, 4)
void kern_phase10_ricci_batched(const double * __restrict__ gupxx,
const double * __restrict__ gupxy,
const double * __restrict__ gupxz,
const double * __restrict__ gupyy,
const double * __restrict__ gupyz,
const double * __restrict__ gupzz,
Phase10RicciTables tables)
{
const int field = blockIdx.y;
if (field >= PHASE10_METRIC_FIELD_COUNT) return;
const double *src = tables.src_fields[field];
double *dst = tables.dst_fields[field];
const int SoA0 = tables.soa_signs[3 * field + 0];
const int SoA1 = tables.soa_signs[3 * field + 1];
const int SoA2 = tables.soa_signs[3 * field + 2];
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= d_gp.all) return;
const int i0 = tid % nx;
const int j0 = (tid / nx) % ny;
const int k0 = tid / (nx * ny);
double fxx = 0.0, fxy = 0.0, fxz = 0.0;
double fyy = 0.0, fyz = 0.0, fzz = 0.0;
if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
const int iF = i0 + 1;
const int jF = j0 + 1;
const int kF = k0 + 1;
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
{
const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
fxx = d_gp.Fdxdx * (
-fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
+16.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
-30.0 * c
+16.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
fyy = d_gp.Fdydy * (
-fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
+16.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
-30.0 * c
+16.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
fzz = d_gp.Fdzdz * (
-fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
+16.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
-30.0 * c
+16.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
const double t_jm2 =
fetch_sym_ord2_direct(src, iF - 2, jF - 2, kF, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 2, kF, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 2, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF - 2, kF, SoA0, SoA1, SoA2);
const double t_jm1 =
fetch_sym_ord2_direct(src, iF - 2, jF - 1, kF, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF - 1, kF, SoA0, SoA1, SoA2);
const double t_jp1 =
fetch_sym_ord2_direct(src, iF - 2, jF + 1, kF, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF + 1, kF, SoA0, SoA1, SoA2);
const double t_jp2 =
fetch_sym_ord2_direct(src, iF - 2, jF + 2, kF, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 2, kF, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 2, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF + 2, kF, SoA0, SoA1, SoA2);
fxy = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);
const double t_km2_x =
fetch_sym_ord2_direct(src, iF - 2, jF, kF - 2, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 2, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 2, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF, kF - 2, SoA0, SoA1, SoA2);
const double t_km1_x =
fetch_sym_ord2_direct(src, iF - 2, jF, kF - 1, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF, kF - 1, SoA0, SoA1, SoA2);
const double t_kp1_x =
fetch_sym_ord2_direct(src, iF - 2, jF, kF + 1, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF, kF + 1, SoA0, SoA1, SoA2);
const double t_kp2_x =
fetch_sym_ord2_direct(src, iF - 2, jF, kF + 2, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 2, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 2, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF, kF + 2, SoA0, SoA1, SoA2);
fxz = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);
const double t_km2_y =
fetch_sym_ord2_direct(src, iF, jF - 2, kF - 2, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 2, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 2, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF + 2, kF - 2, SoA0, SoA1, SoA2);
const double t_km1_y =
fetch_sym_ord2_direct(src, iF, jF - 2, kF - 1, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF + 2, kF - 1, SoA0, SoA1, SoA2);
const double t_kp1_y =
fetch_sym_ord2_direct(src, iF, jF - 2, kF + 1, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF + 2, kF + 1, SoA0, SoA1, SoA2);
const double t_kp2_y =
fetch_sym_ord2_direct(src, iF, jF - 2, kF + 2, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 2, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 2, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF + 2, kF + 2, SoA0, SoA1, SoA2);
fyz = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
}
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
{
const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
fxx = d_gp.Sdxdx * (
fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
- 2.0 * c
+ fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
fyy = d_gp.Sdydy * (
fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
- 2.0 * c
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
fzz = d_gp.Sdzdz * (
fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
- 2.0 * c
+ fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
fxy = d_gp.Sdxdy * (
fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
+ fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2));
fxz = d_gp.Sdxdz * (
fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
+ fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2));
fyz = d_gp.Sdydz * (
fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
}
}
dst[tid] = gupxx[tid] * fxx + gupyy[tid] * fyy + gupzz[tid] * fzz
+ 2.0 * (gupxy[tid] * fxy + gupxz[tid] * fxz + gupyz[tid] * fyz);
}
static void gpu_phase10_ricci_batch(const double *gupxx,
const double *gupxy,
const double *gupxz,
const double *gupyy,
const double *gupyz,
const double *gupzz,
double *const *src_fields,
double *const *dst_fields,
const int *soa_signs,
int all)
{
Phase10RicciTables tables = {};
for (int i = 0; i < PHASE10_METRIC_FIELD_COUNT; ++i) {
tables.src_fields[i] = src_fields[i];
tables.dst_fields[i] = dst_fields[i];
tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0];
tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1];
tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2];
}
dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)PHASE10_METRIC_FIELD_COUNT);
kern_phase10_ricci_batched<<<launch_grid, BLK>>>(
gupxx, gupxy, gupxz, gupyy, gupyz, gupzz, tables);
}
__global__ __launch_bounds__(128, 4)
void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap,
const double * __restrict__ chi,
double * __restrict__ fxx,
double * __restrict__ fxy,
double * __restrict__ fxz,
double * __restrict__ fyy,
double * __restrict__ fyz,
double * __restrict__ fzz,
double * __restrict__ chix_out,
double * __restrict__ chiy_out,
double * __restrict__ chiz_out)
{
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= d_gp.all) return;
const int i0 = tid % nx;
const int j0 = (tid / nx) % ny;
const int k0 = tid / (nx * ny);
if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
return;
}
const int iF = i0 + 1;
const int jF = j0 + 1;
const int kF = k0 + 1;
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
{
const double lap_c = fetch_sym_ord2_direct(Lap, iF, jF, kF, 1, 1, 1);
fxx[tid] = d_gp.Fdxdx * (
-fetch_sym_ord2_direct(Lap, iF - 2, jF, kF, 1, 1, 1)
+16.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF, 1, 1, 1)
-30.0 * lap_c
+16.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF, 1, 1, 1));
fyy[tid] = d_gp.Fdydy * (
-fetch_sym_ord2_direct(Lap, iF, jF - 2, kF, 1, 1, 1)
+16.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF, 1, 1, 1)
-30.0 * lap_c
+16.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF, 1, 1, 1));
fzz[tid] = d_gp.Fdzdz * (
-fetch_sym_ord2_direct(Lap, iF, jF, kF - 2, 1, 1, 1)
+16.0 * fetch_sym_ord2_direct(Lap, iF, jF, kF - 1, 1, 1, 1)
-30.0 * lap_c
+16.0 * fetch_sym_ord2_direct(Lap, iF, jF, kF + 1, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF, jF, kF + 2, 1, 1, 1));
const double t_jm2 =
fetch_sym_ord2_direct(Lap, iF - 2, jF - 2, kF, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF - 2, kF, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF - 2, kF, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF + 2, jF - 2, kF, 1, 1, 1);
const double t_jm1 =
fetch_sym_ord2_direct(Lap, iF - 2, jF - 1, kF, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF - 1, kF, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF - 1, kF, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF + 2, jF - 1, kF, 1, 1, 1);
const double t_jp1 =
fetch_sym_ord2_direct(Lap, iF - 2, jF + 1, kF, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF + 1, kF, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF + 1, kF, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF + 2, jF + 1, kF, 1, 1, 1);
const double t_jp2 =
fetch_sym_ord2_direct(Lap, iF - 2, jF + 2, kF, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF + 2, kF, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF + 2, kF, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF + 2, jF + 2, kF, 1, 1, 1);
fxy[tid] = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);
const double t_km2_x =
fetch_sym_ord2_direct(Lap, iF - 2, jF, kF - 2, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 2, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 2, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF - 2, 1, 1, 1);
const double t_km1_x =
fetch_sym_ord2_direct(Lap, iF - 2, jF, kF - 1, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 1, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 1, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF - 1, 1, 1, 1);
const double t_kp1_x =
fetch_sym_ord2_direct(Lap, iF - 2, jF, kF + 1, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 1, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 1, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF + 1, 1, 1, 1);
const double t_kp2_x =
fetch_sym_ord2_direct(Lap, iF - 2, jF, kF + 2, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 2, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 2, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF + 2, jF, kF + 2, 1, 1, 1);
fxz[tid] = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);
const double t_km2_y =
fetch_sym_ord2_direct(Lap, iF, jF - 2, kF - 2, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 2, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 2, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF - 2, 1, 1, 1);
const double t_km1_y =
fetch_sym_ord2_direct(Lap, iF, jF - 2, kF - 1, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 1, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 1, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF - 1, 1, 1, 1);
const double t_kp1_y =
fetch_sym_ord2_direct(Lap, iF, jF - 2, kF + 1, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 1, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 1, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF + 1, 1, 1, 1);
const double t_kp2_y =
fetch_sym_ord2_direct(Lap, iF, jF - 2, kF + 2, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 2, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 2, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF, jF + 2, kF + 2, 1, 1, 1);
fyz[tid] = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
chix_out[tid] = d_gp.d12dx * (
fetch_sym_ord2_direct(chi, iF - 2, jF, kF, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF, 1, 1, 1));
chiy_out[tid] = d_gp.d12dy * (
fetch_sym_ord2_direct(chi, iF, jF - 2, kF, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF, 1, 1, 1));
chiz_out[tid] = d_gp.d12dz * (
fetch_sym_ord2_direct(chi, iF, jF, kF - 2, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF, jF, kF + 2, 1, 1, 1));
}
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
{
const double lap_c = fetch_sym_ord2_direct(Lap, iF, jF, kF, 1, 1, 1);
fxx[tid] = d_gp.Sdxdx * (
fetch_sym_ord2_direct(Lap, iF - 1, jF, kF, 1, 1, 1)
- 2.0 * lap_c
+ fetch_sym_ord2_direct(Lap, iF + 1, jF, kF, 1, 1, 1));
fyy[tid] = d_gp.Sdydy * (
fetch_sym_ord2_direct(Lap, iF, jF - 1, kF, 1, 1, 1)
- 2.0 * lap_c
+ fetch_sym_ord2_direct(Lap, iF, jF + 1, kF, 1, 1, 1));
fzz[tid] = d_gp.Sdzdz * (
fetch_sym_ord2_direct(Lap, iF, jF, kF - 1, 1, 1, 1)
- 2.0 * lap_c
+ fetch_sym_ord2_direct(Lap, iF, jF, kF + 1, 1, 1, 1));
fxy[tid] = d_gp.Sdxdy * (
fetch_sym_ord2_direct(Lap, iF - 1, jF - 1, kF, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF + 1, jF - 1, kF, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF - 1, jF + 1, kF, 1, 1, 1)
+ fetch_sym_ord2_direct(Lap, iF + 1, jF + 1, kF, 1, 1, 1));
fxz[tid] = d_gp.Sdxdz * (
fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 1, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 1, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 1, 1, 1, 1)
+ fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 1, 1, 1, 1));
fyz[tid] = d_gp.Sdydz * (
fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 1, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 1, 1, 1, 1)
- fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 1, 1, 1, 1)
+ fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 1, 1, 1, 1));
chix_out[tid] = d_gp.d2dx * (
-fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
+fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1));
chiy_out[tid] = d_gp.d2dy * (
-fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
+fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1));
chiz_out[tid] = d_gp.d2dz * (
-fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
+fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1));
}
else {
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
}
}
/* Combined ord=3 advection + KO dissipation.
* When advection and KO use the same source field, symmetry packing is shared.
* If they differ (e.g. gxx advection + dxx KO), only KO repacks.
@@ -2987,6 +3372,206 @@ void kern_phase11_ricci_offdiag(
* After fdderivs(chi), subtract Christoffel*chi_deriv, compute conformal factor f,
* then add chi contribution to Rxx..Rzz.
*/
__global__ __launch_bounds__(128, 4)
void kern_phase12_13_chi_correction_fused(
const double* __restrict__ chi,
const double* __restrict__ chin1,
const double* __restrict__ chix, const double* __restrict__ chiy,
const double* __restrict__ chiz,
const double* __restrict__ gxx, const double* __restrict__ gxy,
const double* __restrict__ gxz, const double* __restrict__ gyy,
const double* __restrict__ gyz, const double* __restrict__ gzz,
const double* __restrict__ gupxx, const double* __restrict__ gupxy,
const double* __restrict__ gupxz, const double* __restrict__ gupyy,
const double* __restrict__ gupyz, const double* __restrict__ gupzz,
const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
double* __restrict__ Rxx, double* __restrict__ Rxy,
double* __restrict__ Rxz, double* __restrict__ Ryy,
double* __restrict__ Ryz, double* __restrict__ Rzz)
{
const double TWO = 2.0;
const double F3o2 = 1.5;
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= d_gp.all) return;
const int i0 = tid % nx;
const int j0 = (tid / nx) % ny;
const int k0 = tid / (nx * ny);
double cxx = 0.0, cxy = 0.0, cxz = 0.0;
double cyy = 0.0, cyz = 0.0, czz = 0.0;
if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
const int iF = i0 + 1;
const int jF = j0 + 1;
const int kF = k0 + 1;
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
{
const double c = fetch_sym_ord2_direct(chi, iF, jF, kF, 1, 1, 1);
cxx = d_gp.Fdxdx * (
-fetch_sym_ord2_direct(chi, iF - 2, jF, kF, 1, 1, 1)
+16.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
-30.0 * c
+16.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF, 1, 1, 1));
cyy = d_gp.Fdydy * (
-fetch_sym_ord2_direct(chi, iF, jF - 2, kF, 1, 1, 1)
+16.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
-30.0 * c
+16.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF, 1, 1, 1));
czz = d_gp.Fdzdz * (
-fetch_sym_ord2_direct(chi, iF, jF, kF - 2, 1, 1, 1)
+16.0 * fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
-30.0 * c
+16.0 * fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF, jF, kF + 2, 1, 1, 1));
const double t_jm2 =
fetch_sym_ord2_direct(chi, iF - 2, jF - 2, kF, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF - 2, kF, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF - 2, kF, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF + 2, jF - 2, kF, 1, 1, 1);
const double t_jm1 =
fetch_sym_ord2_direct(chi, iF - 2, jF - 1, kF, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF - 1, kF, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF - 1, kF, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF + 2, jF - 1, kF, 1, 1, 1);
const double t_jp1 =
fetch_sym_ord2_direct(chi, iF - 2, jF + 1, kF, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF + 1, kF, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF + 1, kF, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF + 2, jF + 1, kF, 1, 1, 1);
const double t_jp2 =
fetch_sym_ord2_direct(chi, iF - 2, jF + 2, kF, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF + 2, kF, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF + 2, kF, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF + 2, jF + 2, kF, 1, 1, 1);
cxy = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);
const double t_km2_x =
fetch_sym_ord2_direct(chi, iF - 2, jF, kF - 2, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 2, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 2, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF - 2, 1, 1, 1);
const double t_km1_x =
fetch_sym_ord2_direct(chi, iF - 2, jF, kF - 1, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 1, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 1, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF - 1, 1, 1, 1);
const double t_kp1_x =
fetch_sym_ord2_direct(chi, iF - 2, jF, kF + 1, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 1, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 1, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF + 1, 1, 1, 1);
const double t_kp2_x =
fetch_sym_ord2_direct(chi, iF - 2, jF, kF + 2, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 2, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 2, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF + 2, jF, kF + 2, 1, 1, 1);
cxz = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);
const double t_km2_y =
fetch_sym_ord2_direct(chi, iF, jF - 2, kF - 2, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 2, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 2, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF - 2, 1, 1, 1);
const double t_km1_y =
fetch_sym_ord2_direct(chi, iF, jF - 2, kF - 1, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 1, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 1, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF - 1, 1, 1, 1);
const double t_kp1_y =
fetch_sym_ord2_direct(chi, iF, jF - 2, kF + 1, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF + 1, 1, 1, 1);
const double t_kp2_y =
fetch_sym_ord2_direct(chi, iF, jF - 2, kF + 2, 1, 1, 1)
- 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 2, 1, 1, 1)
+ 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 2, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF, jF + 2, kF + 2, 1, 1, 1);
cyz = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
}
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
{
const double c = fetch_sym_ord2_direct(chi, iF, jF, kF, 1, 1, 1);
cxx = d_gp.Sdxdx * (
fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
- 2.0 * c
+ fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1));
cyy = d_gp.Sdydy * (
fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
- 2.0 * c
+ fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1));
czz = d_gp.Sdzdz * (
fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
- 2.0 * c
+ fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1));
cxy = d_gp.Sdxdy * (
fetch_sym_ord2_direct(chi, iF - 1, jF - 1, kF, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF + 1, jF - 1, kF, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF - 1, jF + 1, kF, 1, 1, 1)
+ fetch_sym_ord2_direct(chi, iF + 1, jF + 1, kF, 1, 1, 1));
cxz = d_gp.Sdxdz * (
fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 1, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 1, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 1, 1, 1, 1)
+ fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 1, 1, 1, 1));
cyz = d_gp.Sdydz * (
fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 1, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 1, 1, 1, 1)
- fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1)
+ fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1));
}
}
const double cx = chix[tid];
const double cy = chiy[tid];
const double cz = chiz[tid];
const double c1 = chin1[tid];
cxx -= Gxxx[tid] * cx + Gyxx[tid] * cy + Gzxx[tid] * cz;
cxy -= Gxxy[tid] * cx + Gyxy[tid] * cy + Gzxy[tid] * cz;
cxz -= Gxxz[tid] * cx + Gyxz[tid] * cy + Gzxz[tid] * cz;
cyy -= Gxyy[tid] * cx + Gyyy[tid] * cy + Gzyy[tid] * cz;
cyz -= Gxyz[tid] * cx + Gyyz[tid] * cy + Gzyz[tid] * cz;
czz -= Gxzz[tid] * cx + Gyzz[tid] * cy + Gzzz[tid] * cz;
const double uxx = gupxx[tid], uxy = gupxy[tid], uxz = gupxz[tid];
const double uyy = gupyy[tid], uyz = gupyz[tid], uzz = gupzz[tid];
const double f_val = uxx * (cxx - F3o2 / c1 * cx * cx)
+ uyy * (cyy - F3o2 / c1 * cy * cy)
+ uzz * (czz - F3o2 / c1 * cz * cz)
+ TWO * uxy * (cxy - F3o2 / c1 * cx * cy)
+ TWO * uxz * (cxz - F3o2 / c1 * cx * cz)
+ TWO * uyz * (cyz - F3o2 / c1 * cy * cz);
const double inv2c = 1.0 / (c1 * TWO);
Rxx[tid] += (cxx - cx * cx * inv2c + gxx[tid] * f_val) * inv2c;
Ryy[tid] += (cyy - cy * cy * inv2c + gyy[tid] * f_val) * inv2c;
Rzz[tid] += (czz - cz * cz * inv2c + gzz[tid] * f_val) * inv2c;
Rxy[tid] += (cxy - cx * cy * inv2c + gxy[tid] * f_val) * inv2c;
Rxz[tid] += (cxz - cx * cz * inv2c + gxz[tid] * f_val) * inv2c;
Ryz[tid] += (cyz - cy * cz * inv2c + gyz[tid] * f_val) * inv2c;
}
__global__ __launch_bounds__(128, 4)
void kern_phase13_chi_correction(
const double* __restrict__ chin1,
@@ -3726,41 +4311,21 @@ static void launch_rhs_pipeline(int all, double eps, int co)
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
gpu_fdderivs(D(S_dxx), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxx));
gpu_fdderivs(D(S_dyy), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryy));
gpu_fdderivs(D(S_dzz), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rzz));
gpu_fdderivs(D(S_gxy), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,ANTI,SYM, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxy));
gpu_fdderivs(D(S_gxz), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,SYM,ANTI, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxz));
gpu_fdderivs(D(S_gyz), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,ANTI,ANTI, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryz));
{
double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)};
double *dst_fields[] = {D(S_Rxx), D(S_Ryy), D(S_Rzz), D(S_Rxy), D(S_Rxz), D(S_Ryz)};
const int soa_signs[] = {
(int)SYM, (int)SYM, (int)SYM,
(int)SYM, (int)SYM, (int)SYM,
(int)SYM, (int)SYM, (int)SYM,
(int)ANTI, (int)ANTI, (int)SYM,
(int)ANTI, (int)SYM, (int)ANTI,
(int)SYM, (int)ANTI, (int)ANTI
};
gpu_phase10_ricci_batch(D(S_gupxx), D(S_gupxy), D(S_gupxz),
D(S_gupyy), D(S_gupyz), D(S_gupzz),
src_fields, dst_fields, soa_signs, all);
}
kern_phase11_ricci_diag<<<grid(all),BLK>>>(
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
@@ -3798,11 +4363,8 @@ static void launch_rhs_pipeline(int all, double eps, int co)
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
D(S_Rxy),D(S_Rxz),D(S_Ryz));
gpu_fdderivs(D(S_chi), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
kern_phase13_chi_correction<<<grid(all),BLK>>>(
D(S_chin1),
kern_phase12_13_chi_correction_fused<<<grid((size_t)all),BLK>>>(
D(S_chi), D(S_chin1),
D(S_chix), D(S_chiy), D(S_chiz),
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
D(S_gupxx), D(S_gupxy), D(S_gupxz),
@@ -3813,15 +4375,14 @@ static void launch_rhs_pipeline(int all, double eps, int co)
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
D(S_fxx), D(S_fxy), D(S_fxz),
D(S_fyy), D(S_fyz), D(S_fzz),
D(S_Rxx), D(S_Rxy), D(S_Rxz),
D(S_Ryy), D(S_Ryz), D(S_Rzz));
gpu_fdderivs(D(S_Lap), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_chi), D(S_dtSfx_rhs),D(S_dtSfy_rhs),D(S_dtSfz_rhs),
SYM,SYM,SYM, all);
kern_phase14_lap_chi_derivs<<<grid((size_t)all),BLK>>>(
D(S_Lap), D(S_chi),
D(S_fxx), D(S_fxy), D(S_fxz),
D(S_fyy), D(S_fyz), D(S_fzz),
D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs));
kern_phase15_trK_Aij_gauge<<<grid(all),BLK>>>(
D(S_alpn1), D(S_chin1),
@@ -3926,18 +4487,18 @@ __global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem
int state_count,
int all)
{
const size_t total = (size_t)region_all * (size_t)state_count;
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
tid < total;
tid += (size_t)blockDim.x * gridDim.x)
const int state_index = blockIdx.y;
if (state_index >= state_count) return;
for (int local = blockIdx.x * blockDim.x + threadIdx.x;
local < region_all;
local += blockDim.x * gridDim.x)
{
const int state_index = (int)(tid / (size_t)region_all);
const int local = (int)(tid - (size_t)state_index * region_all);
const int ii = local % sx;
const int jj = (local / sx) % sy;
const int kk = local / (sx * sy);
const int src = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
dst[tid] = src_mem[(size_t)state_index * all + src];
dst[(size_t)state_index * region_all + local] =
src_mem[(size_t)state_index * all + src];
}
}
@@ -3950,18 +4511,18 @@ __global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem,
int state_count,
int all)
{
const size_t total = (size_t)region_all * (size_t)state_count;
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
tid < total;
tid += (size_t)blockDim.x * gridDim.x)
const int state_index = blockIdx.y;
if (state_index >= state_count) return;
for (int local = blockIdx.x * blockDim.x + threadIdx.x;
local < region_all;
local += blockDim.x * gridDim.x)
{
const int state_index = (int)(tid / (size_t)region_all);
const int local = (int)(tid - (size_t)state_index * region_all);
const int ii = local % sx;
const int jj = (local / sx) % sy;
const int kk = local / (sx * sy);
const int dst = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
dst_mem[(size_t)state_index * all + dst] = src[tid];
dst_mem[(size_t)state_index * all + dst] =
src[(size_t)state_index * region_all + local];
}
}
@@ -3970,15 +4531,15 @@ __global__ void kern_pack_state_subset(const double * __restrict__ src_mem,
int subset_count,
int all)
{
const size_t total = (size_t)subset_count * (size_t)all;
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
tid < total;
tid += (size_t)blockDim.x * gridDim.x)
const int subset_slot = blockIdx.y;
if (subset_slot >= subset_count) return;
const int state_index = d_subset_state_indices[subset_slot];
for (int src = blockIdx.x * blockDim.x + threadIdx.x;
src < all;
src += blockDim.x * gridDim.x)
{
const int subset_slot = (int)(tid / (size_t)all);
const int state_index = d_subset_state_indices[subset_slot];
const int src = (int)(tid - (size_t)subset_slot * all);
dst[tid] = src_mem[(size_t)state_index * all + src];
dst[(size_t)subset_slot * all + src] =
src_mem[(size_t)state_index * all + src];
}
}
@@ -3987,15 +4548,15 @@ __global__ void kern_unpack_state_subset(double * __restrict__ dst_mem,
int subset_count,
int all)
{
const size_t total = (size_t)subset_count * (size_t)all;
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
tid < total;
tid += (size_t)blockDim.x * gridDim.x)
const int subset_slot = blockIdx.y;
if (subset_slot >= subset_count) return;
const int state_index = d_subset_state_indices[subset_slot];
for (int dst = blockIdx.x * blockDim.x + threadIdx.x;
dst < all;
dst += blockDim.x * gridDim.x)
{
const int subset_slot = (int)(tid / (size_t)all);
const int state_index = d_subset_state_indices[subset_slot];
const int dst = (int)(tid - (size_t)subset_slot * all);
dst_mem[(size_t)state_index * all + dst] = src[tid];
dst_mem[(size_t)state_index * all + dst] =
src[(size_t)subset_slot * all + dst];
}
}
@@ -4078,7 +4639,9 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
if (kind == cudaMemcpyDeviceToHost) {
kern_pack_state_region_batch<<<grid(total_doubles), BLK>>>(
dim3 launch_grid((unsigned int)grid((size_t)region_all),
(unsigned int)state_count);
kern_pack_state_region_batch<<<launch_grid, BLK>>>(
ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
i0, j0, k0, sx, sy, sz, region_all, state_count,
ex[0] * ex[1] * ex[2]);
@@ -4089,7 +4652,9 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
CUDA_CHECK(cudaMemcpy(d_comm, host_buffer,
total_doubles * sizeof(double),
cudaMemcpyHostToDevice));
kern_unpack_state_region_batch<<<grid(total_doubles), BLK>>>(
dim3 launch_grid((unsigned int)grid((size_t)region_all),
(unsigned int)state_count);
kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
i0, j0, k0, sx, sy, sz, region_all, state_count,
ex[0] * ex[1] * ex[2]);
@@ -4142,7 +4707,8 @@ static void copy_state_subset(void *block_tag,
(size_t)active_count * sizeof(int),
0, cudaMemcpyHostToDevice));
if (kind == cudaMemcpyDeviceToHost) {
kern_pack_state_subset<<<grid(total_doubles), BLK>>>(
dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count);
kern_pack_state_subset<<<launch_grid, BLK>>>(
ctx.d_state_curr_mem, d_comm, active_count, (int)all);
CUDA_CHECK(cudaMemcpy(h_comm, d_comm,
total_doubles * sizeof(double),
@@ -4161,7 +4727,8 @@ static void copy_state_subset(void *block_tag,
CUDA_CHECK(cudaMemcpy(d_comm, h_comm,
total_doubles * sizeof(double),
cudaMemcpyHostToDevice));
kern_unpack_state_subset<<<grid(total_doubles), BLK>>>(
dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count);
kern_unpack_state_subset<<<launch_grid, BLK>>>(
ctx.d_state_curr_mem, d_comm, active_count, (int)all);
}
}
@@ -4378,41 +4945,21 @@ int f_compute_rhs_bssn(int *ex, double &T,
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
/* Phase 10: 6x fdderivs(metric) + Ricci contract */
gpu_fdderivs(D(S_dxx), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxx));
gpu_fdderivs(D(S_dyy), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryy));
gpu_fdderivs(D(S_dzz), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rzz));
gpu_fdderivs(D(S_gxy), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,ANTI,SYM, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxy));
gpu_fdderivs(D(S_gxz), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,SYM,ANTI, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxz));
gpu_fdderivs(D(S_gyz), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,ANTI,ANTI, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryz));
{
double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)};
double *dst_fields[] = {D(S_Rxx), D(S_Ryy), D(S_Rzz), D(S_Rxy), D(S_Rxz), D(S_Ryz)};
const int soa_signs[] = {
(int)SYM, (int)SYM, (int)SYM,
(int)SYM, (int)SYM, (int)SYM,
(int)SYM, (int)SYM, (int)SYM,
(int)ANTI, (int)ANTI, (int)SYM,
(int)ANTI, (int)SYM, (int)ANTI,
(int)SYM, (int)ANTI, (int)ANTI
};
gpu_phase10_ricci_batch(D(S_gupxx), D(S_gupxy), D(S_gupxz),
D(S_gupyy), D(S_gupyz), D(S_gupzz),
src_fields, dst_fields, soa_signs, all);
}
/* Phase 11: Ricci assembly (diagonal + off-diagonal) */
kern_phase11_ricci_diag<<<grid(all),BLK>>>(
@@ -4452,16 +4999,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
D(S_Rxy),D(S_Rxz),D(S_Ryz));
/* ============================================================ */
/* Phase 12: fdderivs(chi) */
/* Phase 12/13: chi fdderivs + chi correction */
/* ============================================================ */
gpu_fdderivs(D(S_chi), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
/* ============================================================ */
/* Phase 13: chi correction to Ricci */
/* ============================================================ */
kern_phase13_chi_correction<<<grid(all),BLK>>>(
D(S_chin1),
kern_phase12_13_chi_correction_fused<<<grid((size_t)all),BLK>>>(
D(S_chi), D(S_chin1),
D(S_chix), D(S_chiy), D(S_chiz),
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
D(S_gupxx), D(S_gupxy), D(S_gupxz),
@@ -4472,18 +5013,17 @@ int f_compute_rhs_bssn(int *ex, double &T,
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
D(S_fxx), D(S_fxy), D(S_fxz),
D(S_fyy), D(S_fyz), D(S_fzz),
D(S_Rxx), D(S_Rxy), D(S_Rxz),
D(S_Ryy), D(S_Ryz), D(S_Rzz));
/* ============================================================ */
/* Phase 14: fdderivs(Lap) + fderivs(chi) */
/* ============================================================ */
gpu_fdderivs(D(S_Lap), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_chi), D(S_dtSfx_rhs),D(S_dtSfy_rhs),D(S_dtSfz_rhs),
SYM,SYM,SYM, all);
kern_phase14_lap_chi_derivs<<<grid((size_t)all),BLK>>>(
D(S_Lap), D(S_chi),
D(S_fxx), D(S_fxy), D(S_fxz),
D(S_fyy), D(S_fyz), D(S_fzz),
D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs));
/* ============================================================ */
/* Phase 15: trK_rhs, Aij_rhs, gauge */