From b3ec244cf90da362fac54f5de65260161ab4fd2d Mon Sep 17 00:00:00 2001 From: ianchb Date: Mon, 13 Apr 2026 20:51:08 +0800 Subject: [PATCH] Add batched first/second derivative kernels for CUDA RHS --- AMSS_NCKU_source/bssn_rhs_cuda.cu | 456 ++++++++++++++++++++++++++++-- 1 file changed, 428 insertions(+), 28 deletions(-) diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index c8efa0a..04f15a4 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -189,6 +189,33 @@ __device__ __forceinline__ int idx_ex_d(int i0, int j0, int k0) { return i0 + j0 * d_gp.ex[0] + k0 * d_gp.ex[0] * d_gp.ex[1]; } +__device__ __forceinline__ double fetch_sym_ord2_direct(const double *src, + int iF, int jF, int kF, + int SoA0, int SoA1, int SoA2) +{ + int siF = iF; + int sjF = jF; + int skF = kF; + double sign = 1.0; + + if (iF <= 0) { + siF = 1 - iF; + sign *= (double)SoA0; + } + if (jF <= 0) { + sjF = 1 - jF; + sign *= (double)SoA1; + } + if (kF <= 0) { + skF = 1 - kF; + sign *= (double)SoA2; + } + + return sign * src[(siF - 1) + + (sjF - 1) * d_gp.ex[0] + + (skF - 1) * d_gp.ex[0] * d_gp.ex[1]]; +} + /* ord=2 ghost-padded: Fortran index iF -> flat index */ __device__ __forceinline__ int idx_fh2(int iF, int jF, int kF) { return (iF + 1) + (jF + 1) * d_gp.fh2_nx + (kF + 1) * d_gp.fh2_nx * d_gp.fh2_ny; @@ -1272,6 +1299,25 @@ struct LopsidedKodisTables { int soa_signs[3 * BSSN_LK_FIELD_COUNT]; }; +struct FDerivTables { + const double *src_fields[BSSN_STATE_COUNT]; + double *fx_fields[BSSN_STATE_COUNT]; + double *fy_fields[BSSN_STATE_COUNT]; + double *fz_fields[BSSN_STATE_COUNT]; + int soa_signs[3 * BSSN_STATE_COUNT]; +}; + +struct FDDerivTables { + const double *src_fields[BSSN_STATE_COUNT]; + double *fxx_fields[BSSN_STATE_COUNT]; + double *fxy_fields[BSSN_STATE_COUNT]; + double *fxz_fields[BSSN_STATE_COUNT]; + double *fyy_fields[BSSN_STATE_COUNT]; + double *fyz_fields[BSSN_STATE_COUNT]; + double *fzz_fields[BSSN_STATE_COUNT]; + int soa_signs[3 * BSSN_STATE_COUNT]; +}; + struct Rk4FinalizeTables { const double *f0_fields[BSSN_STATE_COUNT]; double *rhs_fields[BSSN_STATE_COUNT]; @@ -1291,6 +1337,248 @@ static inline int grid(size_t n) { return (int)g; } +__global__ __launch_bounds__(128, 4) +void kern_fderivs_batched(FDerivTables tables, int field_count) +{ + const int field = blockIdx.y; + if (field >= field_count) return; + + const double *src = tables.src_fields[field]; + double *fx = tables.fx_fields[field]; + double *fy = tables.fy_fields[field]; + double *fz = tables.fz_fields[field]; + const int SoA0 = tables.soa_signs[3 * field + 0]; + const int SoA1 = tables.soa_signs[3 * field + 1]; + const int SoA2 = tables.soa_signs[3 * field + 2]; + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + + if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) { + fx[tid] = 0.0; + fy[tid] = 0.0; + fz[tid] = 0.0; + return; + } + + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + fx[tid] = d_gp.d12dx * ( + fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2)); + fy[tid] = d_gp.d12dy * ( + fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2)); + fz[tid] = d_gp.d12dz * ( + fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2)); + } + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + fx[tid] = d_gp.d2dx * ( + -fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + +fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)); + fy[tid] = d_gp.d2dy * ( + -fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + +fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)); + fz[tid] = d_gp.d2dz * ( + -fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + +fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)); + } + else { + fx[tid] = 0.0; + fy[tid] = 0.0; + fz[tid] = 0.0; + } +} + +__global__ __launch_bounds__(128, 4) +void kern_fdderivs_batched(FDDerivTables tables, int field_count) +{ + const int field = blockIdx.y; + if (field >= field_count) return; + + const double *src = tables.src_fields[field]; + double *fxx = tables.fxx_fields[field]; + double *fxy = tables.fxy_fields[field]; + double *fxz = tables.fxz_fields[field]; + double *fyy = tables.fyy_fields[field]; + double *fyz = tables.fyz_fields[field]; + double *fzz = tables.fzz_fields[field]; + const int SoA0 = tables.soa_signs[3 * field + 0]; + const int SoA1 = tables.soa_signs[3 * field + 1]; + const int SoA2 = tables.soa_signs[3 * field + 2]; + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + + if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) { + fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0; + fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0; + return; + } + + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2); + fxx[tid] = d_gp.Fdxdx * ( + -fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + +16.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2)); + fyy[tid] = d_gp.Fdydy * ( + -fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + +16.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2)); + fzz[tid] = d_gp.Fdzdz * ( + -fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + +16.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2)); + + const double t_jm2 = + fetch_sym_ord2_direct(src, iF - 2, jF - 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 2, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 2, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF - 2, kF, SoA0, SoA1, SoA2); + const double t_jm1 = + fetch_sym_ord2_direct(src, iF - 2, jF - 1, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF - 1, kF, SoA0, SoA1, SoA2); + const double t_jp1 = + fetch_sym_ord2_direct(src, iF - 2, jF + 1, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF + 1, kF, SoA0, SoA1, SoA2); + const double t_jp2 = + fetch_sym_ord2_direct(src, iF - 2, jF + 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 2, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 2, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF + 2, kF, SoA0, SoA1, SoA2); + fxy[tid] = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2); + + const double t_km2_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF - 2, SoA0, SoA1, SoA2); + const double t_km1_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF - 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF - 1, SoA0, SoA1, SoA2); + const double t_kp1_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF + 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF + 1, SoA0, SoA1, SoA2); + const double t_kp2_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF + 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF + 2, SoA0, SoA1, SoA2); + fxz[tid] = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x); + + const double t_km2_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF - 2, SoA0, SoA1, SoA2); + const double t_km1_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF - 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF - 1, SoA0, SoA1, SoA2); + const double t_kp1_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF + 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF + 1, SoA0, SoA1, SoA2); + const double t_kp2_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF + 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF + 2, SoA0, SoA1, SoA2); + fyz[tid] = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y); + } + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2); + fxx[tid] = d_gp.Sdxdx * ( + fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + - 2.0 * c + + fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)); + fyy[tid] = d_gp.Sdydy * ( + fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + - 2.0 * c + + fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)); + fzz[tid] = d_gp.Sdzdz * ( + fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + - 2.0 * c + + fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)); + fxy[tid] = d_gp.Sdxdy * ( + fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2)); + fxz[tid] = d_gp.Sdxdz * ( + fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2) + + fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2)); + fyz[tid] = d_gp.Sdydz * ( + fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2) + + fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2)); + } + else { + fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0; + fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0; + } +} + /* symmetry_bd on GPU for ord=2, then launch fderivs kernel */ static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz, double SoA0, double SoA1, double SoA2, int all) @@ -1321,6 +1609,58 @@ static void gpu_fdderivs(double *d_f, kern_fdderivs<<>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz); } +static void gpu_fderivs_batch(int field_count, + double *const *src_fields, + double *const *fx_fields, + double *const *fy_fields, + double *const *fz_fields, + const int *soa_signs, + int all) +{ + if (field_count <= 0) return; + FDerivTables tables = {}; + for (int i = 0; i < field_count; ++i) { + tables.src_fields[i] = src_fields[i]; + tables.fx_fields[i] = fx_fields[i]; + tables.fy_fields[i] = fy_fields[i]; + tables.fz_fields[i] = fz_fields[i]; + tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0]; + tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1]; + tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2]; + } + dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)field_count); + kern_fderivs_batched<<>>(tables, field_count); +} + +static void gpu_fdderivs_batch(int field_count, + double *const *src_fields, + double *const *fxx_fields, + double *const *fxy_fields, + double *const *fxz_fields, + double *const *fyy_fields, + double *const *fyz_fields, + double *const *fzz_fields, + const int *soa_signs, + int all) +{ + if (field_count <= 0) return; + FDDerivTables tables = {}; + for (int i = 0; i < field_count; ++i) { + tables.src_fields[i] = src_fields[i]; + tables.fxx_fields[i] = fxx_fields[i]; + tables.fxy_fields[i] = fxy_fields[i]; + tables.fxz_fields[i] = fxz_fields[i]; + tables.fyy_fields[i] = fyy_fields[i]; + tables.fyz_fields[i] = fyz_fields[i]; + tables.fzz_fields[i] = fzz_fields[i]; + tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0]; + tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1]; + tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2]; + } + dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)field_count); + kern_fdderivs_batched<<>>(tables, field_count); +} + /* Combined ord=3 advection + KO dissipation. * When advection and KO use the same source field, symmetry packing is shared. * If they differ (e.g. gxx advection + dxx KO), only KO repacks. @@ -3203,23 +3543,49 @@ static void launch_rhs_pipeline(int all, double eps, int co) const double ANTI = -1.0; #define D(s) g_buf.slot[s] - kern_phase1_prep<<>>( D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz), D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz)); - gpu_fderivs(D(S_betax), D(S_betaxx),D(S_betaxy),D(S_betaxz), ANTI,SYM,SYM, all); - gpu_fderivs(D(S_betay), D(S_betayx),D(S_betayy),D(S_betayz), SYM,ANTI,SYM, all); - gpu_fderivs(D(S_betaz), D(S_betazx),D(S_betazy),D(S_betazz), SYM,SYM,ANTI, all); - gpu_fderivs(D(S_chi), D(S_chix),D(S_chiy),D(S_chiz), SYM,SYM,SYM, all); - gpu_fderivs(D(S_dxx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all); - gpu_fderivs(D(S_gxy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all); - gpu_fderivs(D(S_gxz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all); - gpu_fderivs(D(S_dyy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all); - gpu_fderivs(D(S_gyz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all); - gpu_fderivs(D(S_dzz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all); - gpu_fderivs(D(S_Lap), D(S_Lapx),D(S_Lapy),D(S_Lapz), SYM,SYM,SYM, all); - gpu_fderivs(D(S_trK), D(S_Kx),D(S_Ky),D(S_Kz), SYM,SYM,SYM, all); + { + double *src_fields[] = { + D(S_betax), D(S_betay), D(S_betaz), D(S_chi), + D(S_dxx), D(S_gxy), D(S_gxz), D(S_dyy), + D(S_gyz), D(S_dzz), D(S_Lap), D(S_trK) + }; + double *fx_fields[] = { + D(S_betaxx), D(S_betayx), D(S_betazx), D(S_chix), + D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), + D(S_gyzx), D(S_gzzx), D(S_Lapx), D(S_Kx) + }; + double *fy_fields[] = { + D(S_betaxy), D(S_betayy), D(S_betazy), D(S_chiy), + D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), + D(S_gyzy), D(S_gzzy), D(S_Lapy), D(S_Ky) + }; + double *fz_fields[] = { + D(S_betaxz), D(S_betayz), D(S_betazz), D(S_chiz), + D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), + D(S_gyzz), D(S_gzzz), D(S_Lapz), D(S_Kz) + }; + const int soa_signs[] = { + (int)ANTI, (int)SYM, (int)SYM, + (int)SYM, (int)ANTI, (int)SYM, + (int)SYM, (int)SYM, (int)ANTI, + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)SYM, (int)SYM, + (int)ANTI, (int)ANTI, (int)SYM, + (int)ANTI, (int)SYM, (int)ANTI, + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)ANTI, (int)ANTI, + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)SYM, (int)SYM + }; + gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])), + src_fields, fx_fields, fy_fields, fz_fields, + soa_signs, all); + } kern_phase2_metric_rhs<<>>( D(S_alpn1), D(S_chin1), @@ -3285,15 +3651,38 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs)); - gpu_fdderivs(D(S_betax), D(S_gxxx),D(S_gxyx),D(S_gxzx), - D(S_gyyx),D(S_gyzx),D(S_gzzx), ANTI,SYM,SYM, all); - gpu_fdderivs(D(S_betay), D(S_gxxy),D(S_gxyy),D(S_gxzy), - D(S_gyyy),D(S_gyzy),D(S_gzzy), SYM,ANTI,SYM, all); - gpu_fdderivs(D(S_betaz), D(S_gxxz),D(S_gxyz),D(S_gxzz), - D(S_gyyz),D(S_gyzz),D(S_gzzz), SYM,SYM,ANTI, all); - gpu_fderivs(D(S_Gamx), D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), ANTI,SYM,SYM, all); - gpu_fderivs(D(S_Gamy), D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), SYM,ANTI,SYM, all); - gpu_fderivs(D(S_Gamz), D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), SYM,SYM,ANTI, all); + { + double *src_fields[] = {D(S_betax), D(S_betay), D(S_betaz)}; + double *fxx_fields[] = {D(S_gxxx), D(S_gxxy), D(S_gxxz)}; + double *fxy_fields[] = {D(S_gxyx), D(S_gxyy), D(S_gxyz)}; + double *fxz_fields[] = {D(S_gxzx), D(S_gxzy), D(S_gxzz)}; + double *fyy_fields[] = {D(S_gyyx), D(S_gyyy), D(S_gyyz)}; + double *fyz_fields[] = {D(S_gyzx), D(S_gyzy), D(S_gyzz)}; + double *fzz_fields[] = {D(S_gzzx), D(S_gzzy), D(S_gzzz)}; + const int soa_signs[] = { + (int)ANTI, (int)SYM, (int)SYM, + (int)SYM, (int)ANTI, (int)SYM, + (int)SYM, (int)SYM, (int)ANTI + }; + gpu_fdderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])), + src_fields, fxx_fields, fxy_fields, fxz_fields, + fyy_fields, fyz_fields, fzz_fields, + soa_signs, all); + } + { + double *src_fields[] = {D(S_Gamx), D(S_Gamy), D(S_Gamz)}; + double *fx_fields[] = {D(S_Gamxx), D(S_Gamyx), D(S_Gamzx)}; + double *fy_fields[] = {D(S_Gamxy), D(S_Gamyy_t), D(S_Gamzy)}; + double *fz_fields[] = {D(S_Gamxz), D(S_Gamyz_t), D(S_Gamzz_t)}; + const int soa_signs[] = { + (int)ANTI, (int)SYM, (int)SYM, + (int)SYM, (int)ANTI, (int)SYM, + (int)SYM, (int)SYM, (int)ANTI + }; + gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])), + src_fields, fx_fields, fy_fields, fz_fields, + soa_signs, all); + } kern_phase8_gamma_rhs_part2<<>>( D(S_gupxx), D(S_gupxy), D(S_gupxz), @@ -3462,12 +3851,23 @@ static void launch_rhs_pipeline(int all, double eps, int co) gpu_lopsided_kodis_state_batch(eps, all); if (co == 0) { - gpu_fderivs(D(S_Axx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all); - gpu_fderivs(D(S_Axy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all); - gpu_fderivs(D(S_Axz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all); - gpu_fderivs(D(S_Ayy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all); - gpu_fderivs(D(S_Ayz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all); - gpu_fderivs(D(S_Azz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all); + { + double *src_fields[] = {D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz)}; + double *fx_fields[] = {D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx)}; + double *fy_fields[] = {D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy)}; + double *fz_fields[] = {D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz)}; + const int soa_signs[] = { + (int)SYM, (int)SYM, (int)SYM, + (int)ANTI, (int)ANTI, (int)SYM, + (int)ANTI, (int)SYM, (int)ANTI, + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)ANTI, (int)ANTI, + (int)SYM, (int)SYM, (int)SYM + }; + gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])), + src_fields, fx_fields, fy_fields, fz_fields, + soa_signs, all); + } kern_phase18_constraints<<>>( D(S_chin1),