diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 6aae645..5cd91cb 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -1327,6 +1327,14 @@ struct FDDerivTables { int soa_signs[3 * BSSN_STATE_COUNT]; }; +static constexpr int PHASE10_METRIC_FIELD_COUNT = 6; + +struct Phase10RicciTables { + const double *src_fields[PHASE10_METRIC_FIELD_COUNT]; + double *dst_fields[PHASE10_METRIC_FIELD_COUNT]; + int soa_signs[3 * PHASE10_METRIC_FIELD_COUNT]; +}; + struct Rk4FinalizeTables { const double *f0_fields[BSSN_STATE_COUNT]; double *rhs_fields[BSSN_STATE_COUNT]; @@ -1670,6 +1678,383 @@ static void gpu_fdderivs_batch(int field_count, kern_fdderivs_batched<<>>(tables, field_count); } +__global__ __launch_bounds__(128, 4) +void kern_phase10_ricci_batched(const double * __restrict__ gupxx, + const double * __restrict__ gupxy, + const double * __restrict__ gupxz, + const double * __restrict__ gupyy, + const double * __restrict__ gupyz, + const double * __restrict__ gupzz, + Phase10RicciTables tables) +{ + const int field = blockIdx.y; + if (field >= PHASE10_METRIC_FIELD_COUNT) return; + + const double *src = tables.src_fields[field]; + double *dst = tables.dst_fields[field]; + const int SoA0 = tables.soa_signs[3 * field + 0]; + const int SoA1 = tables.soa_signs[3 * field + 1]; + const int SoA2 = tables.soa_signs[3 * field + 2]; + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + + double fxx = 0.0, fxy = 0.0, fxz = 0.0; + double fyy = 0.0, fyz = 0.0, fzz = 0.0; + + if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) { + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2); + fxx = d_gp.Fdxdx * ( + -fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + +16.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2)); + fyy = d_gp.Fdydy * ( + -fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + +16.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2)); + fzz = d_gp.Fdzdz * ( + -fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + +16.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2)); + + const double t_jm2 = + fetch_sym_ord2_direct(src, iF - 2, jF - 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 2, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 2, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF - 2, kF, SoA0, SoA1, SoA2); + const double t_jm1 = + fetch_sym_ord2_direct(src, iF - 2, jF - 1, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF - 1, kF, SoA0, SoA1, SoA2); + const double t_jp1 = + fetch_sym_ord2_direct(src, iF - 2, jF + 1, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF + 1, kF, SoA0, SoA1, SoA2); + const double t_jp2 = + fetch_sym_ord2_direct(src, iF - 2, jF + 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 2, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 2, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF + 2, kF, SoA0, SoA1, SoA2); + fxy = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2); + + const double t_km2_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF - 2, SoA0, SoA1, SoA2); + const double t_km1_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF - 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF - 1, SoA0, SoA1, SoA2); + const double t_kp1_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF + 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF + 1, SoA0, SoA1, SoA2); + const double t_kp2_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF + 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF + 2, SoA0, SoA1, SoA2); + fxz = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x); + + const double t_km2_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF - 2, SoA0, SoA1, SoA2); + const double t_km1_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF - 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF - 1, SoA0, SoA1, SoA2); + const double t_kp1_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF + 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF + 1, SoA0, SoA1, SoA2); + const double t_kp2_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF + 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF + 2, SoA0, SoA1, SoA2); + fyz = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y); + } + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2); + fxx = d_gp.Sdxdx * ( + fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + - 2.0 * c + + fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)); + fyy = d_gp.Sdydy * ( + fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + - 2.0 * c + + fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)); + fzz = d_gp.Sdzdz * ( + fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + - 2.0 * c + + fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)); + fxy = d_gp.Sdxdy * ( + fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2)); + fxz = d_gp.Sdxdz * ( + fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2) + + fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2)); + fyz = d_gp.Sdydz * ( + fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2) + + fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2)); + } + } + + dst[tid] = gupxx[tid] * fxx + gupyy[tid] * fyy + gupzz[tid] * fzz + + 2.0 * (gupxy[tid] * fxy + gupxz[tid] * fxz + gupyz[tid] * fyz); +} + +static void gpu_phase10_ricci_batch(const double *gupxx, + const double *gupxy, + const double *gupxz, + const double *gupyy, + const double *gupyz, + const double *gupzz, + double *const *src_fields, + double *const *dst_fields, + const int *soa_signs, + int all) +{ + Phase10RicciTables tables = {}; + for (int i = 0; i < PHASE10_METRIC_FIELD_COUNT; ++i) { + tables.src_fields[i] = src_fields[i]; + tables.dst_fields[i] = dst_fields[i]; + tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0]; + tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1]; + tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2]; + } + dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)PHASE10_METRIC_FIELD_COUNT); + kern_phase10_ricci_batched<<>>( + gupxx, gupxy, gupxz, gupyy, gupyz, gupzz, tables); +} + +__global__ __launch_bounds__(128, 4) +void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap, + const double * __restrict__ chi, + double * __restrict__ fxx, + double * __restrict__ fxy, + double * __restrict__ fxz, + double * __restrict__ fyy, + double * __restrict__ fyz, + double * __restrict__ fzz, + double * __restrict__ chix_out, + double * __restrict__ chiy_out, + double * __restrict__ chiz_out) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + + if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) { + fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0; + fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0; + chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0; + return; + } + + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + const double lap_c = fetch_sym_ord2_direct(Lap, iF, jF, kF, 1, 1, 1); + fxx[tid] = d_gp.Fdxdx * ( + -fetch_sym_ord2_direct(Lap, iF - 2, jF, kF, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF, 1, 1, 1) + -30.0 * lap_c + +16.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF, kF, 1, 1, 1)); + fyy[tid] = d_gp.Fdydy * ( + -fetch_sym_ord2_direct(Lap, iF, jF - 2, kF, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF, 1, 1, 1) + -30.0 * lap_c + +16.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF + 2, kF, 1, 1, 1)); + fzz[tid] = d_gp.Fdzdz * ( + -fetch_sym_ord2_direct(Lap, iF, jF, kF - 2, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(Lap, iF, jF, kF - 1, 1, 1, 1) + -30.0 * lap_c + +16.0 * fetch_sym_ord2_direct(Lap, iF, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF, kF + 2, 1, 1, 1)); + + const double t_jm2 = + fetch_sym_ord2_direct(Lap, iF - 2, jF - 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF - 2, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF - 2, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF - 2, kF, 1, 1, 1); + const double t_jm1 = + fetch_sym_ord2_direct(Lap, iF - 2, jF - 1, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF - 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF - 1, kF, 1, 1, 1); + const double t_jp1 = + fetch_sym_ord2_direct(Lap, iF - 2, jF + 1, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF + 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF + 1, kF, 1, 1, 1); + const double t_jp2 = + fetch_sym_ord2_direct(Lap, iF - 2, jF + 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF + 2, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF + 2, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF + 2, kF, 1, 1, 1); + fxy[tid] = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2); + + const double t_km2_x = + fetch_sym_ord2_direct(Lap, iF - 2, jF, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 2, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF, kF - 2, 1, 1, 1); + const double t_km1_x = + fetch_sym_ord2_direct(Lap, iF - 2, jF, kF - 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF, kF - 1, 1, 1, 1); + const double t_kp1_x = + fetch_sym_ord2_direct(Lap, iF - 2, jF, kF + 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF, kF + 1, 1, 1, 1); + const double t_kp2_x = + fetch_sym_ord2_direct(Lap, iF - 2, jF, kF + 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 2, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF, kF + 2, 1, 1, 1); + fxz[tid] = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x); + + const double t_km2_y = + fetch_sym_ord2_direct(Lap, iF, jF - 2, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 2, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF + 2, kF - 2, 1, 1, 1); + const double t_km1_y = + fetch_sym_ord2_direct(Lap, iF, jF - 2, kF - 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF + 2, kF - 1, 1, 1, 1); + const double t_kp1_y = + fetch_sym_ord2_direct(Lap, iF, jF - 2, kF + 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF + 2, kF + 1, 1, 1, 1); + const double t_kp2_y = + fetch_sym_ord2_direct(Lap, iF, jF - 2, kF + 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 2, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF + 2, kF + 2, 1, 1, 1); + fyz[tid] = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y); + + chix_out[tid] = d_gp.d12dx * ( + fetch_sym_ord2_direct(chi, iF - 2, jF, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF, kF, 1, 1, 1)); + chiy_out[tid] = d_gp.d12dy * ( + fetch_sym_ord2_direct(chi, iF, jF - 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 2, kF, 1, 1, 1)); + chiz_out[tid] = d_gp.d12dz * ( + fetch_sym_ord2_direct(chi, iF, jF, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF, kF + 2, 1, 1, 1)); + } + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + const double lap_c = fetch_sym_ord2_direct(Lap, iF, jF, kF, 1, 1, 1); + fxx[tid] = d_gp.Sdxdx * ( + fetch_sym_ord2_direct(Lap, iF - 1, jF, kF, 1, 1, 1) + - 2.0 * lap_c + + fetch_sym_ord2_direct(Lap, iF + 1, jF, kF, 1, 1, 1)); + fyy[tid] = d_gp.Sdydy * ( + fetch_sym_ord2_direct(Lap, iF, jF - 1, kF, 1, 1, 1) + - 2.0 * lap_c + + fetch_sym_ord2_direct(Lap, iF, jF + 1, kF, 1, 1, 1)); + fzz[tid] = d_gp.Sdzdz * ( + fetch_sym_ord2_direct(Lap, iF, jF, kF - 1, 1, 1, 1) + - 2.0 * lap_c + + fetch_sym_ord2_direct(Lap, iF, jF, kF + 1, 1, 1, 1)); + fxy[tid] = d_gp.Sdxdy * ( + fetch_sym_ord2_direct(Lap, iF - 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF - 1, jF + 1, kF, 1, 1, 1) + + fetch_sym_ord2_direct(Lap, iF + 1, jF + 1, kF, 1, 1, 1)); + fxz[tid] = d_gp.Sdxdz * ( + fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 1, 1, 1, 1) + + fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 1, 1, 1, 1)); + fyz[tid] = d_gp.Sdydz * ( + fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 1, 1, 1, 1) + + fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 1, 1, 1, 1)); + chix_out[tid] = d_gp.d2dx * ( + -fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1) + +fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1)); + chiy_out[tid] = d_gp.d2dy * ( + -fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1) + +fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1)); + chiz_out[tid] = d_gp.d2dz * ( + -fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1) + +fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1)); + } + else { + fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0; + fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0; + chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0; + } +} + /* Combined ord=3 advection + KO dissipation. * When advection and KO use the same source field, symmetry packing is shared. * If they differ (e.g. gxx advection + dxx KO), only KO repacks. @@ -2987,6 +3372,206 @@ void kern_phase11_ricci_offdiag( * After fdderivs(chi), subtract Christoffel*chi_deriv, compute conformal factor f, * then add chi contribution to Rxx..Rzz. */ +__global__ __launch_bounds__(128, 4) +void kern_phase12_13_chi_correction_fused( + const double* __restrict__ chi, + const double* __restrict__ chin1, + const double* __restrict__ chix, const double* __restrict__ chiy, + const double* __restrict__ chiz, + const double* __restrict__ gxx, const double* __restrict__ gxy, + const double* __restrict__ gxz, const double* __restrict__ gyy, + const double* __restrict__ gyz, const double* __restrict__ gzz, + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ Gxxx, const double* __restrict__ Gxxy, + const double* __restrict__ Gxxz, const double* __restrict__ Gxyy, + const double* __restrict__ Gxyz, const double* __restrict__ Gxzz, + const double* __restrict__ Gyxx, const double* __restrict__ Gyxy, + const double* __restrict__ Gyxz, const double* __restrict__ Gyyy, + const double* __restrict__ Gyyz, const double* __restrict__ Gyzz, + const double* __restrict__ Gzxx, const double* __restrict__ Gzxy, + const double* __restrict__ Gzxz, const double* __restrict__ Gzyy, + const double* __restrict__ Gzyz, const double* __restrict__ Gzzz, + double* __restrict__ Rxx, double* __restrict__ Rxy, + double* __restrict__ Rxz, double* __restrict__ Ryy, + double* __restrict__ Ryz, double* __restrict__ Rzz) +{ + const double TWO = 2.0; + const double F3o2 = 1.5; + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + + double cxx = 0.0, cxy = 0.0, cxz = 0.0; + double cyy = 0.0, cyz = 0.0, czz = 0.0; + + if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) { + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + const double c = fetch_sym_ord2_direct(chi, iF, jF, kF, 1, 1, 1); + cxx = d_gp.Fdxdx * ( + -fetch_sym_ord2_direct(chi, iF - 2, jF, kF, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF, kF, 1, 1, 1)); + cyy = d_gp.Fdydy * ( + -fetch_sym_ord2_direct(chi, iF, jF - 2, kF, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 2, kF, 1, 1, 1)); + czz = d_gp.Fdzdz * ( + -fetch_sym_ord2_direct(chi, iF, jF, kF - 2, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF, kF + 2, 1, 1, 1)); + + const double t_jm2 = + fetch_sym_ord2_direct(chi, iF - 2, jF - 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF - 2, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF - 2, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF - 2, kF, 1, 1, 1); + const double t_jm1 = + fetch_sym_ord2_direct(chi, iF - 2, jF - 1, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF - 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF - 1, kF, 1, 1, 1); + const double t_jp1 = + fetch_sym_ord2_direct(chi, iF - 2, jF + 1, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF + 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF + 1, kF, 1, 1, 1); + const double t_jp2 = + fetch_sym_ord2_direct(chi, iF - 2, jF + 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF + 2, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF + 2, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF + 2, kF, 1, 1, 1); + cxy = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2); + + const double t_km2_x = + fetch_sym_ord2_direct(chi, iF - 2, jF, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 2, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF, kF - 2, 1, 1, 1); + const double t_km1_x = + fetch_sym_ord2_direct(chi, iF - 2, jF, kF - 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF, kF - 1, 1, 1, 1); + const double t_kp1_x = + fetch_sym_ord2_direct(chi, iF - 2, jF, kF + 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF, kF + 1, 1, 1, 1); + const double t_kp2_x = + fetch_sym_ord2_direct(chi, iF - 2, jF, kF + 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 2, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF, kF + 2, 1, 1, 1); + cxz = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x); + + const double t_km2_y = + fetch_sym_ord2_direct(chi, iF, jF - 2, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 2, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 2, kF - 2, 1, 1, 1); + const double t_km1_y = + fetch_sym_ord2_direct(chi, iF, jF - 2, kF - 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 2, kF - 1, 1, 1, 1); + const double t_kp1_y = + fetch_sym_ord2_direct(chi, iF, jF - 2, kF + 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 2, kF + 1, 1, 1, 1); + const double t_kp2_y = + fetch_sym_ord2_direct(chi, iF, jF - 2, kF + 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 2, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 2, kF + 2, 1, 1, 1); + cyz = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y); + } + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + const double c = fetch_sym_ord2_direct(chi, iF, jF, kF, 1, 1, 1); + cxx = d_gp.Sdxdx * ( + fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1) + - 2.0 * c + + fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1)); + cyy = d_gp.Sdydy * ( + fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1) + - 2.0 * c + + fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1)); + czz = d_gp.Sdzdz * ( + fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1) + - 2.0 * c + + fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1)); + cxy = d_gp.Sdxdy * ( + fetch_sym_ord2_direct(chi, iF - 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF - 1, jF + 1, kF, 1, 1, 1) + + fetch_sym_ord2_direct(chi, iF + 1, jF + 1, kF, 1, 1, 1)); + cxz = d_gp.Sdxdz * ( + fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 1, 1, 1, 1) + + fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 1, 1, 1, 1)); + cyz = d_gp.Sdydz * ( + fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1) + + fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1)); + } + } + + const double cx = chix[tid]; + const double cy = chiy[tid]; + const double cz = chiz[tid]; + const double c1 = chin1[tid]; + + cxx -= Gxxx[tid] * cx + Gyxx[tid] * cy + Gzxx[tid] * cz; + cxy -= Gxxy[tid] * cx + Gyxy[tid] * cy + Gzxy[tid] * cz; + cxz -= Gxxz[tid] * cx + Gyxz[tid] * cy + Gzxz[tid] * cz; + cyy -= Gxyy[tid] * cx + Gyyy[tid] * cy + Gzyy[tid] * cz; + cyz -= Gxyz[tid] * cx + Gyyz[tid] * cy + Gzyz[tid] * cz; + czz -= Gxzz[tid] * cx + Gyzz[tid] * cy + Gzzz[tid] * cz; + + const double uxx = gupxx[tid], uxy = gupxy[tid], uxz = gupxz[tid]; + const double uyy = gupyy[tid], uyz = gupyz[tid], uzz = gupzz[tid]; + const double f_val = uxx * (cxx - F3o2 / c1 * cx * cx) + + uyy * (cyy - F3o2 / c1 * cy * cy) + + uzz * (czz - F3o2 / c1 * cz * cz) + + TWO * uxy * (cxy - F3o2 / c1 * cx * cy) + + TWO * uxz * (cxz - F3o2 / c1 * cx * cz) + + TWO * uyz * (cyz - F3o2 / c1 * cy * cz); + + const double inv2c = 1.0 / (c1 * TWO); + Rxx[tid] += (cxx - cx * cx * inv2c + gxx[tid] * f_val) * inv2c; + Ryy[tid] += (cyy - cy * cy * inv2c + gyy[tid] * f_val) * inv2c; + Rzz[tid] += (czz - cz * cz * inv2c + gzz[tid] * f_val) * inv2c; + Rxy[tid] += (cxy - cx * cy * inv2c + gxy[tid] * f_val) * inv2c; + Rxz[tid] += (cxz - cx * cz * inv2c + gxz[tid] * f_val) * inv2c; + Ryz[tid] += (cyz - cy * cz * inv2c + gyz[tid] * f_val) * inv2c; +} + __global__ __launch_bounds__(128, 4) void kern_phase13_chi_correction( const double* __restrict__ chin1, @@ -3726,41 +4311,21 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz)); - gpu_fdderivs(D(S_dxx), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); - kern_phase10_ricci_contract<<>>( - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxx)); - - gpu_fdderivs(D(S_dyy), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); - kern_phase10_ricci_contract<<>>( - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryy)); - - gpu_fdderivs(D(S_dzz), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); - kern_phase10_ricci_contract<<>>( - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rzz)); - - gpu_fdderivs(D(S_gxy), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), ANTI,ANTI,SYM, all); - kern_phase10_ricci_contract<<>>( - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxy)); - - gpu_fdderivs(D(S_gxz), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), ANTI,SYM,ANTI, all); - kern_phase10_ricci_contract<<>>( - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxz)); - - gpu_fdderivs(D(S_gyz), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), SYM,ANTI,ANTI, all); - kern_phase10_ricci_contract<<>>( - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryz)); + { + double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)}; + double *dst_fields[] = {D(S_Rxx), D(S_Ryy), D(S_Rzz), D(S_Rxy), D(S_Rxz), D(S_Ryz)}; + const int soa_signs[] = { + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)SYM, (int)SYM, + (int)ANTI, (int)ANTI, (int)SYM, + (int)ANTI, (int)SYM, (int)ANTI, + (int)SYM, (int)ANTI, (int)ANTI + }; + gpu_phase10_ricci_batch(D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + src_fields, dst_fields, soa_signs, all); + } kern_phase11_ricci_diag<<>>( D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz), @@ -3798,11 +4363,8 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz), D(S_Rxy),D(S_Rxz),D(S_Ryz)); - gpu_fdderivs(D(S_chi), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); - - kern_phase13_chi_correction<<>>( - D(S_chin1), + kern_phase12_13_chi_correction_fused<<>>( + D(S_chi), D(S_chin1), D(S_chix), D(S_chiy), D(S_chiz), D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), D(S_gupxx), D(S_gupxy), D(S_gupxz), @@ -3813,15 +4375,14 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), - D(S_fxx), D(S_fxy), D(S_fxz), - D(S_fyy), D(S_fyz), D(S_fzz), D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz)); - gpu_fdderivs(D(S_Lap), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); - gpu_fderivs(D(S_chi), D(S_dtSfx_rhs),D(S_dtSfy_rhs),D(S_dtSfz_rhs), - SYM,SYM,SYM, all); + kern_phase14_lap_chi_derivs<<>>( + D(S_Lap), D(S_chi), + D(S_fxx), D(S_fxy), D(S_fxz), + D(S_fyy), D(S_fyz), D(S_fzz), + D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs)); kern_phase15_trK_Aij_gauge<<>>( D(S_alpn1), D(S_chin1), @@ -3926,18 +4487,18 @@ __global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem int state_count, int all) { - const size_t total = (size_t)region_all * (size_t)state_count; - for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; - tid < total; - tid += (size_t)blockDim.x * gridDim.x) + const int state_index = blockIdx.y; + if (state_index >= state_count) return; + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) { - const int state_index = (int)(tid / (size_t)region_all); - const int local = (int)(tid - (size_t)state_index * region_all); const int ii = local % sx; const int jj = (local / sx) % sy; const int kk = local / (sx * sy); const int src = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny; - dst[tid] = src_mem[(size_t)state_index * all + src]; + dst[(size_t)state_index * region_all + local] = + src_mem[(size_t)state_index * all + src]; } } @@ -3950,18 +4511,18 @@ __global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem, int state_count, int all) { - const size_t total = (size_t)region_all * (size_t)state_count; - for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; - tid < total; - tid += (size_t)blockDim.x * gridDim.x) + const int state_index = blockIdx.y; + if (state_index >= state_count) return; + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) { - const int state_index = (int)(tid / (size_t)region_all); - const int local = (int)(tid - (size_t)state_index * region_all); const int ii = local % sx; const int jj = (local / sx) % sy; const int kk = local / (sx * sy); const int dst = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny; - dst_mem[(size_t)state_index * all + dst] = src[tid]; + dst_mem[(size_t)state_index * all + dst] = + src[(size_t)state_index * region_all + local]; } } @@ -3970,15 +4531,15 @@ __global__ void kern_pack_state_subset(const double * __restrict__ src_mem, int subset_count, int all) { - const size_t total = (size_t)subset_count * (size_t)all; - for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; - tid < total; - tid += (size_t)blockDim.x * gridDim.x) + const int subset_slot = blockIdx.y; + if (subset_slot >= subset_count) return; + const int state_index = d_subset_state_indices[subset_slot]; + for (int src = blockIdx.x * blockDim.x + threadIdx.x; + src < all; + src += blockDim.x * gridDim.x) { - const int subset_slot = (int)(tid / (size_t)all); - const int state_index = d_subset_state_indices[subset_slot]; - const int src = (int)(tid - (size_t)subset_slot * all); - dst[tid] = src_mem[(size_t)state_index * all + src]; + dst[(size_t)subset_slot * all + src] = + src_mem[(size_t)state_index * all + src]; } } @@ -3987,15 +4548,15 @@ __global__ void kern_unpack_state_subset(double * __restrict__ dst_mem, int subset_count, int all) { - const size_t total = (size_t)subset_count * (size_t)all; - for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; - tid < total; - tid += (size_t)blockDim.x * gridDim.x) + const int subset_slot = blockIdx.y; + if (subset_slot >= subset_count) return; + const int state_index = d_subset_state_indices[subset_slot]; + for (int dst = blockIdx.x * blockDim.x + threadIdx.x; + dst < all; + dst += blockDim.x * gridDim.x) { - const int subset_slot = (int)(tid / (size_t)all); - const int state_index = d_subset_state_indices[subset_slot]; - const int dst = (int)(tid - (size_t)subset_slot * all); - dst_mem[(size_t)state_index * all + dst] = src[tid]; + dst_mem[(size_t)state_index * all + dst] = + src[(size_t)subset_slot * all + dst]; } } @@ -4078,7 +4639,9 @@ static void copy_state_region_packed_batch_cuda(void *block_tag, double *d_comm = ensure_step_comm_buffer(ctx, total_doubles); if (kind == cudaMemcpyDeviceToHost) { - kern_pack_state_region_batch<<>>( + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_pack_state_region_batch<<>>( ctx.d_state_curr_mem, d_comm, ex[0], ex[1], i0, j0, k0, sx, sy, sz, region_all, state_count, ex[0] * ex[1] * ex[2]); @@ -4089,7 +4652,9 @@ static void copy_state_region_packed_batch_cuda(void *block_tag, CUDA_CHECK(cudaMemcpy(d_comm, host_buffer, total_doubles * sizeof(double), cudaMemcpyHostToDevice)); - kern_unpack_state_region_batch<<>>( + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_unpack_state_region_batch<<>>( ctx.d_state_curr_mem, d_comm, ex[0], ex[1], i0, j0, k0, sx, sy, sz, region_all, state_count, ex[0] * ex[1] * ex[2]); @@ -4142,7 +4707,8 @@ static void copy_state_subset(void *block_tag, (size_t)active_count * sizeof(int), 0, cudaMemcpyHostToDevice)); if (kind == cudaMemcpyDeviceToHost) { - kern_pack_state_subset<<>>( + dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count); + kern_pack_state_subset<<>>( ctx.d_state_curr_mem, d_comm, active_count, (int)all); CUDA_CHECK(cudaMemcpy(h_comm, d_comm, total_doubles * sizeof(double), @@ -4161,7 +4727,8 @@ static void copy_state_subset(void *block_tag, CUDA_CHECK(cudaMemcpy(d_comm, h_comm, total_doubles * sizeof(double), cudaMemcpyHostToDevice)); - kern_unpack_state_subset<<>>( + dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count); + kern_unpack_state_subset<<>>( ctx.d_state_curr_mem, d_comm, active_count, (int)all); } } @@ -4378,41 +4945,21 @@ int f_compute_rhs_bssn(int *ex, double &T, D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz)); /* Phase 10: 6x fdderivs(metric) + Ricci contract */ - gpu_fdderivs(D(S_dxx), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); - kern_phase10_ricci_contract<<>>( - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxx)); - - gpu_fdderivs(D(S_dyy), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); - kern_phase10_ricci_contract<<>>( - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryy)); - - gpu_fdderivs(D(S_dzz), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); - kern_phase10_ricci_contract<<>>( - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rzz)); - - gpu_fdderivs(D(S_gxy), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), ANTI,ANTI,SYM, all); - kern_phase10_ricci_contract<<>>( - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxy)); - - gpu_fdderivs(D(S_gxz), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), ANTI,SYM,ANTI, all); - kern_phase10_ricci_contract<<>>( - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxz)); - - gpu_fdderivs(D(S_gyz), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), SYM,ANTI,ANTI, all); - kern_phase10_ricci_contract<<>>( - D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), - D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryz)); + { + double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)}; + double *dst_fields[] = {D(S_Rxx), D(S_Ryy), D(S_Rzz), D(S_Rxy), D(S_Rxz), D(S_Ryz)}; + const int soa_signs[] = { + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)SYM, (int)SYM, + (int)ANTI, (int)ANTI, (int)SYM, + (int)ANTI, (int)SYM, (int)ANTI, + (int)SYM, (int)ANTI, (int)ANTI + }; + gpu_phase10_ricci_batch(D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + src_fields, dst_fields, soa_signs, all); + } /* Phase 11: Ricci assembly (diagonal + off-diagonal) */ kern_phase11_ricci_diag<<>>( @@ -4452,16 +4999,10 @@ int f_compute_rhs_bssn(int *ex, double &T, D(S_Rxy),D(S_Rxz),D(S_Ryz)); /* ============================================================ */ - /* Phase 12: fdderivs(chi) */ + /* Phase 12/13: chi fdderivs + chi correction */ /* ============================================================ */ - gpu_fdderivs(D(S_chi), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); - - /* ============================================================ */ - /* Phase 13: chi correction to Ricci */ - /* ============================================================ */ - kern_phase13_chi_correction<<>>( - D(S_chin1), + kern_phase12_13_chi_correction_fused<<>>( + D(S_chi), D(S_chin1), D(S_chix), D(S_chiy), D(S_chiz), D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), D(S_gupxx), D(S_gupxy), D(S_gupxz), @@ -4472,18 +5013,17 @@ int f_compute_rhs_bssn(int *ex, double &T, D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), - D(S_fxx), D(S_fxy), D(S_fxz), - D(S_fyy), D(S_fyz), D(S_fzz), D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz)); /* ============================================================ */ /* Phase 14: fdderivs(Lap) + fderivs(chi) */ /* ============================================================ */ - gpu_fdderivs(D(S_Lap), D(S_fxx),D(S_fxy),D(S_fxz), - D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); - gpu_fderivs(D(S_chi), D(S_dtSfx_rhs),D(S_dtSfy_rhs),D(S_dtSfz_rhs), - SYM,SYM,SYM, all); + kern_phase14_lap_chi_derivs<<>>( + D(S_Lap), D(S_chi), + D(S_fxx), D(S_fxy), D(S_fxz), + D(S_fyy), D(S_fyz), D(S_fzz), + D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs)); /* ============================================================ */ /* Phase 15: trK_rhs, Aij_rhs, gauge */