diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index b6983c4..4df55a8 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -6,10 +6,20 @@ #include "parameters.h" #include #include -#if USE_CUDA_BSSN + +#ifndef USE_CUDA_Z4C +#define USE_CUDA_Z4C 0 +#endif + +#if USE_CUDA_BSSN || USE_CUDA_Z4C #include +#endif +#if USE_CUDA_BSSN #include "bssn_rhs_cuda.h" #endif +#if USE_CUDA_Z4C +#include "z4c_rhs_cuda.h" +#endif namespace { @@ -80,7 +90,7 @@ bool cuda_sync_pinned_enabled() if (enabled < 0) { const char *env = getenv("AMSS_CUDA_PINNED_SYNC"); -#if USE_CUDA_BSSN +#if USE_CUDA_BSSN || USE_CUDA_Z4C enabled = (!env || atoi(env) != 0) ? 1 : 0; #else enabled = 0; @@ -93,7 +103,7 @@ void free_comm_buffer(double *&ptr, unsigned char &is_pinned) { if (!ptr) return; -#if USE_CUDA_BSSN +#if USE_CUDA_BSSN || USE_CUDA_Z4C if (is_pinned) cudaFreeHost(ptr); else @@ -110,7 +120,7 @@ double *alloc_comm_buffer(int length, unsigned char &is_pinned) is_pinned = 0; if (length <= 0) return 0; -#if USE_CUDA_BSSN +#if USE_CUDA_BSSN || USE_CUDA_Z4C if (cuda_sync_pinned_enabled()) { double *ptr = 0; @@ -157,19 +167,43 @@ int cuda_state_var_count(MyList *src_vars, MyList *dst_vars) return (src_vars || dst_vars) ? -1 : count; } -#if USE_CUDA_BSSN +#if USE_CUDA_BSSN || USE_CUDA_Z4C +bool cuda_state_count_direct_supported(int state_count) +{ +#if USE_CUDA_Z4C && (ABEtype == 2) + return state_count == Z4C_CUDA_STATE_COUNT; +#elif USE_CUDA_BSSN + return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT; +#else + (void)state_count; + return false; +#endif +} + bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type) { if (type != 1 || !src || !dst || !src->Bg) return false; +#if USE_CUDA_Z4C && (ABEtype == 2) + return z4c_cuda_has_resident_state(src->Bg) != 0; +#elif USE_CUDA_BSSN return bssn_cuda_has_resident_state(src->Bg) != 0; +#else + return false; +#endif } bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type) { if (type != 1 || !dst || !dst->Bg) return false; +#if USE_CUDA_Z4C && (ABEtype == 2) + return z4c_cuda_has_resident_state(dst->Bg) != 0; +#elif USE_CUDA_BSSN return bssn_cuda_has_resident_state(dst->Bg) != 0; +#else + return false; +#endif } bool cuda_direct_pack_segment(double *buffer, @@ -177,15 +211,28 @@ bool cuda_direct_pack_segment(double *buffer, const Parallel::gridseg *dst, int state_count) { +#if USE_CUDA_Z4C && (ABEtype == 2) + if (state_count != Z4C_CUDA_STATE_COUNT) + return false; +#elif USE_CUDA_BSSN if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) return false; +#else + return false; +#endif const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; const int i0 = cuda_seg_begin(dst, src->Bg, 0); const int j0 = cuda_seg_begin(dst, src->Bg, 1); const int k0 = cuda_seg_begin(dst, src->Bg, 2); +#if USE_CUDA_Z4C && (ABEtype == 2) + const bool ok = z4c_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; +#else const bool ok = bssn_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape, i0, j0, k0, dst->shape[0], dst->shape[1], dst->shape[2]) == 0; +#endif if (sync_profile_enabled()) sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; return ok; @@ -195,15 +242,28 @@ bool cuda_direct_unpack_segment(double *buffer, const Parallel::gridseg *dst, int state_count) { +#if USE_CUDA_Z4C && (ABEtype == 2) + if (state_count != Z4C_CUDA_STATE_COUNT) + return false; +#elif USE_CUDA_BSSN if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) return false; +#else + return false; +#endif const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; const int i0 = cuda_seg_begin(dst, dst->Bg, 0); const int j0 = cuda_seg_begin(dst, dst->Bg, 1); const int k0 = cuda_seg_begin(dst, dst->Bg, 2); +#if USE_CUDA_Z4C && (ABEtype == 2) + const bool ok = z4c_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; +#else const bool ok = bssn_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape, i0, j0, k0, dst->shape[0], dst->shape[1], dst->shape[2]) == 0; +#endif if (sync_profile_enabled()) sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0; return ok; @@ -3966,9 +4026,10 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, dst->data, type)) + if (dir == PACK && cuda_state_count_direct_supported(state_count) && + cuda_can_direct_pack(src->data, dst->data, type)) { handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count); if (!handled_by_cuda) @@ -3977,7 +4038,8 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, type)) + else if (dir == UNPACK && cuda_state_count_direct_supported(state_count) && + cuda_can_direct_unpack(dst->data, type)) { handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_count); if (!handled_by_cuda) @@ -4012,7 +4074,7 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn], dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, dst->data->llb, dst->data->uub); -#if USE_CUDA_BSSN +#if USE_CUDA_BSSN || USE_CUDA_Z4C } else { @@ -4593,7 +4655,7 @@ void Parallel::SyncCache::destroy() { if (send_bufs && send_bufs[i]) { -#if USE_CUDA_BSSN +#if USE_CUDA_BSSN || USE_CUDA_Z4C free_comm_buffer(send_bufs[i], send_buf_pinned[i]); #else delete[] send_bufs[i]; @@ -4601,7 +4663,7 @@ void Parallel::SyncCache::destroy() } if (recv_bufs && recv_bufs[i]) { -#if USE_CUDA_BSSN +#if USE_CUDA_BSSN || USE_CUDA_Z4C free_comm_buffer(recv_bufs[i], recv_buf_pinned[i]); #else delete[] recv_bufs[i]; diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C index 6f4cd27..741919f 100644 --- a/AMSS_NCKU_source/Z4c_class.C +++ b/AMSS_NCKU_source/Z4c_class.C @@ -26,12 +26,20 @@ using namespace std; #include "shellfunctions.h" #include "cpbc.h" #include "kodiss.h" -#include "parameters.h" - -#ifdef With_AHF -#include "derivatives.h" -#include "myglobal.h" -#endif +#include "parameters.h" + +#ifndef USE_CUDA_Z4C +#define USE_CUDA_Z4C 0 +#endif + +#if USE_CUDA_Z4C && (ABEtype == 2) +#include "z4c_rhs_cuda.h" +#endif + +#ifdef With_AHF +#include "derivatives.h" +#include "myglobal.h" +#endif //================================================================================================ @@ -167,12 +175,554 @@ Z4c_class::~Z4c_class() #define MRBD 0 // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them -#ifndef CPBC -// for sommerfeld boundary - -void Z4c_class::Step(int lev, int YN) -{ - double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); +#ifndef CPBC +// for sommerfeld boundary + +#if USE_CUDA_Z4C && (ABEtype == 2) +#ifdef WithShell +#error "USE_CUDA_Z4C resident path currently supports Cartesian non-shell Z4C only" +#endif +#if (MRBD == 2) +#error "USE_CUDA_Z4C resident path does not support MRBD == 2" +#endif + +namespace { + +static const int k_z4c_cuda_bh_state_indices[3] = {18, 19, 20}; + +bool fill_z4c_cuda_views(Block *cg, MyList *vars, + double **host_views, + double *propspeeds = 0, + double *soa_flat = 0) +{ + int idx = 0; + while (vars && idx < Z4C_CUDA_STATE_COUNT) + { + host_views[idx] = cg->fgfs[vars->data->sgfn]; + if (propspeeds) + propspeeds[idx] = vars->data->propspeed; + if (soa_flat) + { + soa_flat[3 * idx + 0] = vars->data->SoA[0]; + soa_flat[3 * idx + 1] = vars->data->SoA[1]; + soa_flat[3 * idx + 2] = vars->data->SoA[2]; + } + vars = vars->next; + ++idx; + } + return idx == Z4C_CUDA_STATE_COUNT && vars == 0; +} + +void z4c_cuda_download_level_state(MyList *PatL, MyList *vars, int myrank, bool release_ctx) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && z4c_cuda_has_resident_state(cg)) + { + double *state_out[Z4C_CUDA_STATE_COUNT]; + if (!fill_z4c_cuda_views(cg, vars, state_out)) + { + cout << "CUDA Z4C state list mismatch on resident state download" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (z4c_cuda_download_resident_state(cg, cg->shape, state_out)) + { + cout << "CUDA Z4C resident state download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (release_ctx) + z4c_cuda_release_step_ctx(cg); + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} + +bool z4c_cuda_patch_contains_point(Patch *patch, const double *point) +{ + if (!patch) + return false; + for (int d = 0; d < dim; d++) + { + const double h = patch->getdX(d); + const double lo = patch->bbox[d] + patch->lli[d] * h; + const double hi = patch->bbox[dim + d] - patch->uui[d] * h; + if (point[d] < lo || point[d] > hi) + return false; + } + return true; +} + +bool z4c_cuda_point_in_block(Patch *patch, Block *block, + const double *point, const double *DH) +{ + if (!patch || !block) + return false; + for (int d = 0; d < dim; d++) + { + double llb; + double uub; +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2)) + ? block->bbox[d] + patch->lli[d] * DH[d] + : block->bbox[d] + (ghost_width - 0.5) * DH[d]; + uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2)) + ? block->bbox[dim + d] - patch->uui[d] * DH[d] + : block->bbox[dim + d] - (ghost_width - 0.5) * DH[d]; +#else +#ifdef Cell + llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2)) + ? block->bbox[d] + patch->lli[d] * DH[d] + : block->bbox[d] + ghost_width * DH[d]; + uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2)) + ? block->bbox[dim + d] - patch->uui[d] * DH[d] + : block->bbox[dim + d] - ghost_width * DH[d]; +#else +#error Not define Vertex nor Cell +#endif +#endif + if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2) + return false; + } + return true; +} + +int z4c_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn) +{ + if (!coords || n <= ordn) + return 0; + int cxi = int((x - coords[0]) / dx + 0.4) + 1; + int start = cxi - ordn / 2; + if (start < 0) + start = 0; + const int max_start = n - ordn; + if (start > max_start) + start = max_start; + return start; +} + +bool z4c_cuda_interp_bh_point_resident(MyList *PatL, + int myrank, + const double *point, + var *forx, var *fory, var *forz, + int Symmetry, + double *shellf) +{ + const int ordn = 2 * ghost_width; + int owner_rank = -1; + + shellf[0] = shellf[1] = shellf[2] = 0.0; + + MyList *PL = PatL; + while (PL) + { + Patch *patch = PL->data; + if (!z4c_cuda_patch_contains_point(patch, point)) + { + PL = PL->next; + continue; + } + + double DH[dim]; + for (int d = 0; d < dim; d++) + DH[d] = patch->getdX(d); + + MyList *BP = patch->blb; + while (BP) + { + Block *block = BP->data; + if (z4c_cuda_point_in_block(patch, block, point, DH)) + { + owner_rank = block->rank; + if (myrank == owner_rank) + { + int interp_ordn = ordn; + int interp_sym = Symmetry; + double x = point[0]; + double y = point[1]; + double z = point[2]; + + if (z4c_cuda_has_resident_state(block) && + block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn) + { + const int sx = ordn; + const int sy = ordn; + const int sz = ordn; + const int region_all = sx * sy * sz; + const int i0 = z4c_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn); + const int j0 = z4c_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn); + const int k0 = z4c_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn); + double *packed_fields = new double[3 * region_all]; + var *vars[3] = {forx, fory, forz}; + for (int f = 0; f < 3; f++) + { + if (z4c_cuda_pack_state_region_to_host_buffer(block, + k_z4c_cuda_bh_state_indices[f], + packed_fields + f * region_all, + block->shape, + i0, j0, k0, + sx, sy, sz) != 0) + { + delete[] packed_fields; + cout << "CUDA Z4C BH tile download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int tile_shape[3] = {sx, sy, sz}; + f_global_interp(tile_shape, + block->X[0] + i0, + block->X[1] + j0, + block->X[2] + k0, + packed_fields + f * region_all, + shellf[f], + x, y, z, + interp_ordn, + vars[f]->SoA, + interp_sym); + } + delete[] packed_fields; + } + else + { + f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], + block->fgfs[forx->sgfn], shellf[0], + x, y, z, interp_ordn, forx->SoA, interp_sym); + f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], + block->fgfs[fory->sgfn], shellf[1], + x, y, z, interp_ordn, fory->SoA, interp_sym); + f_global_interp(block->shape, block->X[0], block->X[1], block->X[2], + block->fgfs[forz->sgfn], shellf[2], + x, y, z, interp_ordn, forz->SoA, interp_sym); + } + } + break; + } + if (BP == patch->ble) + break; + BP = BP->next; + } + + if (owner_rank >= 0) + break; + PL = PL->next; + } + + if (owner_rank < 0) + return false; + + MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD); + return true; +} + +bool z4c_cuda_compute_porg_rhs_resident(cgh *GH, + int ilev, + int myrank, + int BH_num, + double **BH_PS, + double **BH_RHS, + var *forx, var *fory, var *forz, + int Symmetry) +{ + for (int n = 0; n < BH_num; n++) + { + double shellf[3] = {0.0, 0.0, 0.0}; + int lev = ilev; + while (lev >= 0 && + !z4c_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], + forx, fory, forz, Symmetry, shellf)) + { + --lev; + } + if (lev < 0) + return false; + BH_RHS[n][0] = -shellf[0]; + BH_RHS[n][1] = -shellf[1]; + BH_RHS[n][2] = -shellf[2]; + } + return true; +} + +} // namespace +#endif + +void Z4c_class::Step(int lev, int YN) +{ +#if USE_CUDA_Z4C && (ABEtype == 2) + double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); +#ifdef With_AHF + AH_Step_Find(lev, dT_lev); +#endif + bool BB = fgt(PhysTime, StartTime, dT_lev / 2); + double ndeps = numepss; + if (lev < GH->movls) + ndeps = numepsb; + double TRK4 = PhysTime; + int iter_count = 0; + int pre = 0, cor = 1; + int ERROR = 0; + + MyList *Pp = GH->PatL[lev]; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { + double *state_in[Z4C_CUDA_STATE_COUNT]; + double *state_out[Z4C_CUDA_STATE_COUNT]; + double propspeed[Z4C_CUDA_STATE_COUNT]; + double soa_flat[3 * Z4C_CUDA_STATE_COUNT]; + if (!fill_z4c_cuda_views(cg, StateList, state_in, propspeed, soa_flat) || + !fill_z4c_cuda_views(cg, SynchList_pre, state_out)) + { + cout << "CUDA Z4C state list mismatch on predictor step" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int apply_bam_bc = 0; +#if (MRBD == 0) +#if (SommerType == 0) + apply_bam_bc = (lev == 0) ? 1 : 0; +#endif +#elif (MRBD == 1) + apply_bam_bc = 1; +#endif + int keep_resident_state = 1; + int apply_enforce_ga = 0; +#if (AGM == 0) + apply_enforce_ga = 1; +#endif + if (z4c_cuda_rk4_substep(cg, + cg->shape, cg->X[0], cg->X[1], cg->X[2], + state_in, state_out, + propspeed, soa_flat, Pp->data->bbox, + dT_lev, TRK4, iter_count, apply_bam_bc, + Symmetry, lev, ndeps, pre, + keep_resident_state, apply_enforce_ga, chitiny)) + { + cout << "CUDA Z4C predictor substep failed in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + + { + int erh = ERROR; + MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + } + if (ERROR) + { + if (myrank == 0 && ErrorMonitor->outfile) + ErrorMonitor->outfile << "CUDA Z4C failed in predictor at t = " << PhysTime + << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); + + if (BH_num > 0 && lev == GH->levels - 1) + { + compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count); + if (Symmetry > 0) + Porg[ithBH][2] = fabs(Porg[ithBH][2]); + if (Symmetry == 2) + { + Porg[ithBH][0] = fabs(Porg[ithBH][0]); + Porg[ithBH][1] = fabs(Porg[ithBH][1]); + } + } + } + + if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime)) + z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false); + if (lev == a_lev) + AnalysisStuff(lev, dT_lev); + + for (iter_count = 1; iter_count < 4; iter_count++) + { + if (iter_count == 1 || iter_count == 3) + TRK4 += dT_lev / 2; + Pp = GH->PatL[lev]; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { + double *state_in[Z4C_CUDA_STATE_COUNT]; + double *state_out[Z4C_CUDA_STATE_COUNT]; + double propspeed[Z4C_CUDA_STATE_COUNT]; + double soa_flat[3 * Z4C_CUDA_STATE_COUNT]; + if (!fill_z4c_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) || + !fill_z4c_cuda_views(cg, SynchList_cor, state_out)) + { + cout << "CUDA Z4C state list mismatch on corrector step" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int apply_bam_bc = 0; +#if (MRBD == 0) +#if (SommerType == 0) + apply_bam_bc = (lev == 0) ? 1 : 0; +#endif +#elif (MRBD == 1) + apply_bam_bc = 1; +#endif + int keep_resident_state = 1; + int apply_enforce_ga = 0; +#if (AGM == 0) + apply_enforce_ga = 1; +#elif (AGM == 1) + apply_enforce_ga = (iter_count == 3) ? 1 : 0; +#endif + if (z4c_cuda_rk4_substep(cg, + cg->shape, cg->X[0], cg->X[1], cg->X[2], + state_in, state_out, + propspeed, soa_flat, Pp->data->bbox, + dT_lev, TRK4, iter_count, apply_bam_bc, + Symmetry, lev, ndeps, cor, + keep_resident_state, apply_enforce_ga, chitiny)) + { + cout << "CUDA Z4C corrector substep failed in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + + { + int erh = ERROR; + MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + } + if (ERROR) + { + if (myrank == 0 && ErrorMonitor->outfile) + ErrorMonitor->outfile << "CUDA Z4C failed in RK4 substep#" << iter_count + << " at t = " << PhysTime + << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); + + if (BH_num > 0 && lev == GH->levels - 1) + { + if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num, + Porg, Porg1, + Sfx, Sfy, Sfz, Symmetry)) + { + if (myrank == 0 && ErrorMonitor->outfile) + ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = " + << PhysTime << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count); + if (Symmetry > 0) + Porg1[ithBH][2] = fabs(Porg1[ithBH][2]); + if (Symmetry == 2) + { + Porg1[ithBH][0] = fabs(Porg1[ithBH][0]); + Porg1[ithBH][1] = fabs(Porg1[ithBH][1]); + } + } + } + + if (iter_count < 3) + { + Pp = GH->PatL[lev]; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + cg->swapList(SynchList_pre, SynchList_cor, myrank); + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + if (BH_num > 0 && lev == GH->levels - 1) + { + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + Porg[ithBH][0] = Porg1[ithBH][0]; + Porg[ithBH][1] = Porg1[ithBH][1]; + Porg[ithBH][2] = Porg1[ithBH][2]; + } + } + } + } + + z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true); + +#if (RPS == 0) + RestrictProlong(lev, YN, BB); +#endif + + Pp = GH->PatL[lev]; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + cg->swapList(StateList, SynchList_cor, myrank); + cg->swapList(OldStateList, SynchList_cor, myrank); + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + if (BH_num > 0 && lev == GH->levels - 1) + { + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + Porg0[ithBH][0] = Porg1[ithBH][0]; + Porg0[ithBH][1] = Porg1[ithBH][1]; + Porg0[ithBH][2] = Porg1[ithBH][2]; + } + } +#else + double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); #ifdef With_AHF AH_Step_Find(lev, dT_lev); #endif @@ -1039,15 +1589,19 @@ void Z4c_class::Step(int lev, int YN) { Porg0[ithBH][0] = Porg1[ithBH][0]; Porg0[ithBH][1] = Porg1[ithBH][1]; - Porg0[ithBH][2] = Porg1[ithBH][2]; - } - } -} -#else -// for constraint preserving boundary (CPBC) -#ifndef WithShell -#error "CPBC only supports Shell" -#endif + Porg0[ithBH][2] = Porg1[ithBH][2]; + } + } +#endif +} +#else +// for constraint preserving boundary (CPBC) +#if USE_CUDA_Z4C && (ABEtype == 2) +#error "USE_CUDA_Z4C resident path does not support CPBC" +#endif +#ifndef WithShell +#error "CPBC only supports Shell" +#endif // 0: extroplate rhs, 1: extroplate variable // 2: extroplate variable but before RHS calculation diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index eb3c19c..97de0c8 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -49,6 +49,10 @@ endif bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) +# CUDA rewrite of Z4C Cartesian RHS +z4c_rhs_cuda.o: z4c_rhs_cuda.cu z4c_rhs_cuda.h bssn_rhs.h macrodef.h ricci_gamma.h + $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) + # C rewrite of BSSN RHS kernel and helpers bssn_rhs_c.o: bssn_rhs_c.C ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ @@ -65,14 +69,14 @@ kodiss_c.o: kodiss_c.C lopsided_c.o: lopsided_c.C ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ -lopsided_kodis_c.o: lopsided_kodis_c.C - ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ - -z4c_rhs_c.o: z4c_rhs_c.C - ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ - -#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h -# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ +lopsided_kodis_c.o: lopsided_kodis_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +z4c_rhs_c.o: z4c_rhs_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h +# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata @@ -88,13 +92,16 @@ TwoPunctureABE.o: TwoPunctureABE.C # Input files -## CUDA BSSN RHS switch -## 1 : use the rewritten CUDA bssn_rhs backend -## 0 : keep the normal CPU/Fortran selection below -USE_CUDA_BSSN ?= 0 +## CUDA BSSN RHS switch +## 1 : use the rewritten CUDA bssn_rhs backend +## 0 : keep the normal CPU/Fortran selection below +USE_CUDA_BSSN ?= 0 +USE_CUDA_Z4C ?= 0 CXXAPPFLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN) CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN) +CXXAPPFLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C) +CUDA_APP_FLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C) ## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran) ifeq ($(USE_CXX_KERNELS),0) @@ -107,22 +114,25 @@ endif CFILES_CUDA_BSSN = bssn_rhs_cuda.o -ifeq ($(USE_CUDA_BSSN),1) -CFILES = $(CFILES_CUDA_BSSN) -else -CFILES = $(CFILES_CPU) -endif - -ifeq ($(USE_CXX_Z4C_KERNELS),1) -CFILES += z4c_rhs_c.o -Z4C_F90_OBJ = -else -Z4C_F90_OBJ = Z4c_rhs.o -endif - -## RK4 kernel switch (independent from USE_CXX_KERNELS) -ifeq ($(USE_CXX_RK4),1) -RK4_C_OBJ = rungekutta4_rout_c.o +ifeq ($(USE_CUDA_BSSN),1) +CFILES = $(CFILES_CUDA_BSSN) +else +CFILES = $(CFILES_CPU) +endif + +ifeq ($(USE_CUDA_Z4C),1) +CFILES += z4c_rhs_cuda.o +Z4C_F90_OBJ = +else ifeq ($(USE_CXX_Z4C_KERNELS),1) +CFILES += z4c_rhs_c.o +Z4C_F90_OBJ = +else +Z4C_F90_OBJ = Z4c_rhs.o +endif + +## RK4 kernel switch (independent from USE_CXX_KERNELS) +ifeq ($(USE_CXX_RK4),1) +RK4_C_OBJ = rungekutta4_rout_c.o RK4_F90_OBJ = else RK4_C_OBJ = @@ -130,12 +140,15 @@ RK4_F90_OBJ = rungekutta4_rout.o endif CFILES += $(RK4_C_OBJ) -ABE_CUDA_CFILES = $(CFILES_CUDA_BSSN) $(RK4_C_OBJ) +ABE_CUDA_CFILES = $(CFILES_CUDA_BSSN) z4c_rhs_cuda.o $(RK4_C_OBJ) ABE_LDLIBS = $(LDLIBS) ifeq ($(USE_CUDA_BSSN),1) ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH) endif +ifeq ($(USE_CUDA_Z4C),1) +ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH) +endif C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\ @@ -155,13 +168,13 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\ $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\ - lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\ - shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\ - getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\ - fadmquantites_bssn.o $(Z4C_F90_OBJ) Z4c_rhs_ss.o point_diff_new_sh.o\ - cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\ - getnpem2.o empart.o NullNews.o fourdcurvature.o\ - bssn2adm.o adm_constraint.o adm_ricci_gamma.o\ + lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\ + shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\ + getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\ + fadmquantites_bssn.o $(Z4C_F90_OBJ) Z4c_rhs_ss.o point_diff_new_sh.o\ + cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\ + getnpem2.o empart.o NullNews.o fourdcurvature.o\ + bssn2adm.o adm_constraint.o adm_ricci_gamma.o\ scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\ NullNews2.o tool_f.o @@ -220,9 +233,10 @@ misc.o : zbesh.o ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(ABE_LDLIBS) -ABE_CUDA: USE_CUDA_BSSN=1 -ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH) +ABE_CUDA: USE_CUDA_BSSN=1 +ABE_CUDA: USE_CUDA_Z4C=1 +ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) + $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH) #ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) # $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu new file mode 100644 index 0000000..79a11c1 --- /dev/null +++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu @@ -0,0 +1,7496 @@ +/* + * z4c_rhs_cuda.cu - CUDA implementation of the main Cartesian Z4C RHS. + * + * This is a drop-in replacement for z4c_rhs_c.C when USE_CUDA_Z4C=1. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "macrodef.h" +#include "bssn_rhs.h" +#include "ricci_gamma.h" +#include "z4c_rhs_cuda.h" + +#ifdef fortran1 +#define f_constraint_bssn constraint_bssn +#endif +#ifdef fortran2 +#define f_constraint_bssn CONSTRAINT_BSSN +#endif +#ifdef fortran3 +#define f_constraint_bssn constraint_bssn_ +#endif + +namespace z4c_cuda { + +/* ------------------------------------------------------------------ */ +/* Multi-GPU dispatch: distribute ranks across available GPUs */ +/* ------------------------------------------------------------------ */ +static struct { + int num_gpus; + int my_rank; + int my_local_rank; + int my_device; + bool inited; +} g_dispatch = {0, -1, -1, -1, false}; + +static int env_to_int(const char *name, int fallback = -1) { + const char *v = getenv(name); + if (!v || !*v) return fallback; + return atoi(v); +} + +static void init_gpu_dispatch() { + if (g_dispatch.inited) return; + cudaError_t err = cudaGetDeviceCount(&g_dispatch.num_gpus); + if (err != cudaSuccess) g_dispatch.num_gpus = 1; + if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1; + + /* Get MPI rank from environment (set by mpirun/mpiexec). */ + g_dispatch.my_rank = env_to_int("PMI_RANK", + env_to_int("OMPI_COMM_WORLD_RANK", + env_to_int("MV2_COMM_WORLD_RANK", + env_to_int("SLURM_PROCID", 0)))); + + /* Prefer local rank for per-node GPU mapping (avoids cross-node skew). */ + g_dispatch.my_local_rank = env_to_int("OMPI_COMM_WORLD_LOCAL_RANK", + env_to_int("MV2_COMM_WORLD_LOCAL_RANK", + env_to_int("MPI_LOCALRANKID", + env_to_int("SLURM_LOCALID", -1)))); + + const int rank_for_map = (g_dispatch.my_local_rank >= 0) + ? g_dispatch.my_local_rank : g_dispatch.my_rank; + g_dispatch.my_device = rank_for_map % g_dispatch.num_gpus; + cudaSetDevice(g_dispatch.my_device); + + if (g_dispatch.my_rank == 0) { + printf("[AMSS-GPU] %d GPU(s) detected, device map uses %s rank\n", + g_dispatch.num_gpus, + (g_dispatch.my_local_rank >= 0) ? "local" : "global"); + } + g_dispatch.inited = true; +} + +struct CudaProfileStats { + long long calls; + double total_ms; + double state_ms; + double matter_ms; + double rhs_ms; + double bc_ms; + double finalize_ms; + double output_ms; +}; + +static CudaProfileStats &cuda_profile_stats() { + static CudaProfileStats stats = {0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + return stats; +} + +static bool cuda_profile_enabled() { + static int enabled = -1; + if (enabled < 0) { + const char *env = getenv("AMSS_PROFILE_CUDA"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +static int cuda_profile_every() { + static int every = -1; + if (every < 0) { + const char *env = getenv("AMSS_PROFILE_CUDA_EVERY"); + every = (env && atoi(env) > 0) ? atoi(env) : 100; + } + return every; +} + +static double cuda_profile_now_ms() { + using clock = std::chrono::steady_clock; + return std::chrono::duration( + clock::now().time_since_epoch()).count(); +} + +static void cuda_profile_sync() { + cudaError_t err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + fprintf(stderr, "CUDA error %s:%d: %s\n", + __FILE__, __LINE__, cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +static void cuda_profile_maybe_log() { + if (!cuda_profile_enabled()) return; + CudaProfileStats &stats = cuda_profile_stats(); + if (stats.calls <= 0 || stats.calls % cuda_profile_every() != 0) return; + fprintf(stderr, + "[AMSS-CUDA][rank %d][dev %d] calls=%lld avg_total=%.3f ms avg_state=%.3f ms avg_matter=%.3f ms avg_rhs=%.3f ms avg_bc=%.3f ms avg_finalize=%.3f ms avg_output=%.3f ms\n", + g_dispatch.my_rank, g_dispatch.my_device, stats.calls, + stats.total_ms / (double)stats.calls, + stats.state_ms / (double)stats.calls, + stats.matter_ms / (double)stats.calls, + stats.rhs_ms / (double)stats.calls, + stats.bc_ms / (double)stats.calls, + stats.finalize_ms / (double)stats.calls, + stats.output_ms / (double)stats.calls); + fflush(stderr); +} + +/* ------------------------------------------------------------------ */ +/* Error checking */ +/* ------------------------------------------------------------------ */ +#define CUDA_CHECK(call) do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +/* ------------------------------------------------------------------ */ +/* Physical / gauge constants (matching bssn_rhs_c.C) */ +/* ------------------------------------------------------------------ */ +static const double PI_VAL = 3.14159265358979323846; +static const double FF_VAL = 0.75; +static const double ETA_VAL = 2.0; + +/* ------------------------------------------------------------------ */ +/* Constant memory for grid parameters and stencil coefficients */ +/* ------------------------------------------------------------------ */ +struct GridParams { + int ex[3]; /* nx, ny, nz */ + int all; /* nx*ny*nz */ + double dX, dY, dZ; + /* fderivs coefficients */ + double d12dx, d12dy, d12dz; /* 1/(12*dX) etc */ + double d2dx, d2dy, d2dz; /* 1/(2*dX) etc */ + /* fdderivs coefficients */ + double Fdxdx, Fdydy, Fdzdz; /* 1/(12*dX^2) etc */ + double Sdxdx, Sdydy, Sdzdz; /* 1/(dX^2) etc */ + double Fdxdy, Fdxdz, Fdydz; /* 1/(144*dX*dY) etc */ + double Sdxdy, Sdxdz, Sdydz; /* 1/(4*dX*dY) etc */ + /* symmetry bounds (Fortran 1-based) */ + int iminF, jminF, kminF; + int imaxF, jmaxF, kmaxF; + /* symmetry bounds for ord=3 (lopsided/kodis) */ + int iminF3, jminF3, kminF3; + int Symmetry; + double eps; + int co; + /* padded sizes */ + int fh2_nx, fh2_ny, fh2_nz; /* (nx+2), (ny+2), (nz+2) for ord=2 */ + int fh3_nx, fh3_ny, fh3_nz; /* (nx+3), (ny+3), (nz+3) for ord=3 */ +}; + +__constant__ GridParams d_gp; +static GridParams g_gp_host_cache = {}; +static bool g_gp_host_cache_valid = false; + +/* ------------------------------------------------------------------ */ +/* Device indexing helpers */ +/* ------------------------------------------------------------------ */ +__device__ __forceinline__ int idx_ex_d(int i0, int j0, int k0) { + return i0 + j0 * d_gp.ex[0] + k0 * d_gp.ex[0] * d_gp.ex[1]; +} + +__device__ __forceinline__ double fetch_sym_ord2_direct(const double *src, + int iF, int jF, int kF, + int SoA0, int SoA1, int SoA2) +{ + int siF = iF; + int sjF = jF; + int skF = kF; + double sign = 1.0; + + if (iF <= 0) { + siF = 1 - iF; + sign *= (double)SoA0; + } + if (jF <= 0) { + sjF = 1 - jF; + sign *= (double)SoA1; + } + if (kF <= 0) { + skF = 1 - kF; + sign *= (double)SoA2; + } + + return sign * src[(siF - 1) + + (sjF - 1) * d_gp.ex[0] + + (skF - 1) * d_gp.ex[0] * d_gp.ex[1]]; +} + +/* ord=2 ghost-padded: Fortran index iF -> flat index */ +__device__ __forceinline__ int idx_fh2(int iF, int jF, int kF) { + return (iF + 1) + (jF + 1) * d_gp.fh2_nx + (kF + 1) * d_gp.fh2_nx * d_gp.fh2_ny; +} + +/* ord=3 ghost-padded: Fortran index iF -> flat index */ +__device__ __forceinline__ int idx_fh3(int iF, int jF, int kF) { + return (iF + 2) + (jF + 2) * d_gp.fh3_nx + (kF + 2) * d_gp.fh3_nx * d_gp.fh3_ny; +} + +__device__ __forceinline__ double fetch_sym_ord3_direct(const double *src, + int iF, int jF, int kF, + int SoA0, int SoA1, int SoA2) +{ + int siF = iF; + int sjF = jF; + int skF = kF; + double sign = 1.0; + + if (iF <= 0) { + siF = 1 - iF; + sign *= (double)SoA0; + } + if (jF <= 0) { + sjF = 1 - jF; + sign *= (double)SoA1; + } + if (kF <= 0) { + skF = 1 - kF; + sign *= (double)SoA2; + } + + return sign * src[(siF - 1) + + (sjF - 1) * d_gp.ex[0] + + (skF - 1) * d_gp.ex[0] * d_gp.ex[1]]; +} + +/* ------------------------------------------------------------------ */ +/* GPU buffer management */ +/* ------------------------------------------------------------------ */ +/* + * Array slot indices — all arrays live in one big cudaMalloc block. + * INPUT arrays (H2D): 39 slots + * OUTPUT arrays (D2H): 52 slots + * TEMPORARY arrays (GPU-only): ~65 slots + * Plus 2 extended arrays for ghost-padded stencils (fh_ord2, fh_ord3) + */ + +/* Total number of "all"-sized slots */ +#define NUM_SLOTS 320 + +struct GpuBuffers { + double *d_mem; /* single big allocation */ + double *d_fh2; /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */ + double *d_fh3; /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */ + double *h_stage; /* host staging buffer for bulk H2D/D2H */ + bool h_stage_pinned; /* true if allocated by cudaMallocHost */ + double *slot[NUM_SLOTS]; /* pointers into d_mem */ + size_t cap_all; + size_t cap_fh2_size; + size_t cap_fh3_size; + int prev_nx, prev_ny, prev_nz; + bool initialized; +}; + +static GpuBuffers g_buf = { + nullptr, nullptr, nullptr, nullptr, false, {}, + 0, 0, 0, 0, 0, 0, false +}; + +/* Slot assignments — INPUT (H2D) */ +enum { + S_chi=0, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz, + S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz, + S_Gamx, S_Gamy, S_Gamz, + S_Lap, S_betax, S_betay, S_betaz, + S_dtSfx, S_dtSfy, S_dtSfz, S_TZ, + S_rho, S_Sx, S_Sy, S_Sz, + S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz, + S_X, S_Y, S_Z, /* coordinate arrays — only nx/ny/nz long */ + /* 37 input slots so far; X/Y/Z are special-sized */ + + /* OUTPUT (D2H) */ + S_chi_rhs, S_trK_rhs, + S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs, + S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs, + S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs, + S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs, + S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs, S_TZ_rhs, + S_Gamxxx, S_Gamxxy, S_Gamxxz, S_Gamxyy, S_Gamxyz, S_Gamxzz, + S_Gamyxx, S_Gamyxy, S_Gamyxz, S_Gamyyy, S_Gamyyz, S_Gamyzz, + S_Gamzxx, S_Gamzxy, S_Gamzxz, S_Gamzyy, S_Gamzyz, S_Gamzzz, + S_Rxx, S_Rxy, S_Rxz, S_Ryy, S_Ryz, S_Rzz, + S_ham_Res, S_movx_Res, S_movy_Res, S_movz_Res, + S_Gmx_Res, S_Gmy_Res, S_Gmz_Res, + + /* TEMPORARY (GPU-only) */ + S_gxx, S_gyy, S_gzz, /* physical metric = dxx+1 etc */ + S_alpn1, S_chin1, + S_chix, S_chiy, S_chiz, + S_gxxx, S_gxyx, S_gxzx, S_gyyx, S_gyzx, S_gzzx, + S_gxxy, S_gxyy, S_gxzy, S_gyyy, S_gyzy, S_gzzy, + S_gxxz, S_gxyz, S_gxzz, S_gyyz, S_gyzz, S_gzzz, + S_Lapx, S_Lapy, S_Lapz, + S_betaxx, S_betaxy, S_betaxz, + S_betayx, S_betayy, S_betayz, + S_betazx, S_betazy, S_betazz, + S_Gamxx, S_Gamxy, S_Gamxz, + S_Gamyx, S_Gamyy_t, S_Gamyz_t, + S_Gamzx, S_Gamzy, S_Gamzz_t, + S_Kx, S_Ky, S_Kz, + S_S_arr, S_f_arr, + S_fxx, S_fxy, S_fxz, S_fyy, S_fyz, S_fzz, + S_Gamxa, S_Gamya, S_Gamza, + S_gupxx, S_gupxy, S_gupxz, + S_gupyy, S_gupyz, S_gupzz, + + S_dBxx, S_dBxy, S_dBxz, S_dByx, S_dByy, S_dByz, S_dBzx, S_dBzy, S_dBzz, + S_TZx, S_TZy, S_TZz, + S_Axxx, S_Axxy, S_Axxz, S_Axyx, S_Axyy, S_Axyz, + S_Axzx, S_Axzy, S_Axzz, S_Ayyx, S_Ayyy, S_Ayyz, + S_Ayzx, S_Ayzy, S_Ayzz, S_Azzx, S_Azzy, S_Azzz, + S_chixx, S_chixy, S_chixz, S_chiyy, S_chiyz, S_chizz, + S_Lapxx, S_Lapxy, S_Lapxz, S_Lapyy, S_Lapyz, S_Lapzz, + S_gxxxx, S_gxxxy, S_gxxxz, S_gxxyy, S_gxxyz, S_gxxzz, + S_gxyxx, S_gxyxy, S_gxyxz, S_gxyyy, S_gxyyz, S_gxyzz, + S_gxzxx, S_gxzxy, S_gxzxz, S_gxzyy, S_gxzyz, S_gxzzz, + S_gyyxx, S_gyyxy, S_gyyxz, S_gyyyy, S_gyyyz, S_gyyzz, + S_gyzxx, S_gyzxy, S_gyzxz, S_gyzyy, S_gyzyz, S_gyzzz, + S_gzzxx, S_gzzxy, S_gzzxz, S_gzzyy, S_gzzyz, S_gzzzz, + S_sfxxx, S_sfxxy, S_sfxxz, S_sfxyy, S_sfxyz, S_sfxzz, + S_sfyxx, S_sfyxy, S_sfyxz, S_sfyyy, S_sfyyz, S_sfyzz, + S_sfzxx, S_sfzxy, S_sfzxz, S_sfzyy, S_sfzyz, S_sfzzz, + NUM_USED_SLOTS +}; + +static_assert(NUM_USED_SLOTS <= NUM_SLOTS, "Increase NUM_SLOTS"); + +static const int H2D_INPUT_SLOT_COUNT = (S_Szz - S_chi + 1); +static const int D2H_BASE_SLOT_COUNT = (S_Rzz - S_chi_rhs + 1); +static const int D2H_CONSTRAINT_SLOT_COUNT = (S_Gmz_Res - S_ham_Res + 1); +static const int STAGE_SLOT_COUNT = + (H2D_INPUT_SLOT_COUNT > (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT)) + ? H2D_INPUT_SLOT_COUNT + : (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT); + +static constexpr int BSSN_STATE_COUNT = Z4C_CUDA_STATE_COUNT; +static constexpr int BSSN_MATTER_COUNT = 10; +static constexpr int BSSN_LK_FIELD_COUNT = 24; + +static const int k_state_input_slots[BSSN_STATE_COUNT] = { + S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz, + S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz, + S_Gamx, S_Gamy, S_Gamz, + S_Lap, S_betax, S_betay, S_betaz, + S_dtSfx, S_dtSfy, S_dtSfz, S_TZ +}; + +static const int k_state_rhs_slots[BSSN_STATE_COUNT] = { + S_chi_rhs, S_trK_rhs, + S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs, + S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs, + S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs, + S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs, + S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs, S_TZ_rhs +}; + +static const int k_matter_slots[BSSN_MATTER_COUNT] = { + S_rho, S_Sx, S_Sy, S_Sz, S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz +}; + +static const int k_lk_adv_slots[BSSN_LK_FIELD_COUNT] = { + S_gxx, S_Gamz, S_gxy, S_Lap, S_gxz, S_betax, S_gyy, S_betay, + S_gyz, S_betaz, S_gzz, S_dtSfx, S_Axx, S_dtSfy, S_Axy, S_dtSfz, + S_Axz, S_Ayy, S_Ayz, S_Azz, S_chi, S_trK, S_Gamx, S_Gamy +}; + +static const int k_lk_ko_slots[BSSN_LK_FIELD_COUNT] = { + S_dxx, S_Gamz, S_gxy, S_Lap, S_gxz, S_betax, S_dyy, S_betay, + S_gyz, S_betaz, S_dzz, S_dtSfx, S_Axx, S_dtSfy, S_Axy, S_dtSfz, + S_Axz, S_Ayy, S_Ayz, S_Azz, S_chi, S_trK, S_Gamx, S_Gamy +}; + +static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = { + S_gxx_rhs, S_Gamz_rhs, S_gxy_rhs, S_Lap_rhs, S_gxz_rhs, S_betax_rhs, + S_gyy_rhs, S_betay_rhs, S_gyz_rhs, S_betaz_rhs, S_gzz_rhs, S_dtSfx_rhs, + S_Axx_rhs, S_dtSfy_rhs, S_Axy_rhs, S_dtSfz_rhs, S_Axz_rhs, S_Ayy_rhs, + S_Ayz_rhs, S_Azz_rhs, S_chi_rhs, S_trK_rhs, S_Gamx_rhs, S_Gamy_rhs +}; + +__constant__ int d_subset_state_indices[BSSN_STATE_COUNT]; + +static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = { + 1, 1, 1, + 1, 1, -1, + -1, -1, 1, + 1, 1, 1, + -1, 1, -1, + -1, 1, 1, + 1, 1, 1, + 1, -1, 1, + 1, -1, -1, + 1, 1, -1, + 1, 1, 1, + -1, 1, 1, + 1, 1, 1, + 1, -1, 1, + -1, -1, 1, + 1, 1, -1, + -1, 1, -1, + 1, 1, 1, + 1, -1, -1, + 1, 1, 1, + 1, 1, 1, + 1, 1, 1, + -1, 1, 1, + 1, -1, 1 +}; + +struct StepContext { + double *d_state0_mem; + double *d_accum_mem; + double *d_state_curr_mem; + double *d_state_next_mem; + double *d_matter_mem; + double *d_comm_mem; + double *h_comm_mem; + std::array d_state0; + std::array d_accum; + std::array d_state_curr; + std::array d_state_next; + std::array d_matter; + size_t cap_all; + size_t cap_comm; + bool h_comm_pinned; + size_t cap_h_comm; + bool matter_ready; + bool state_ready; + + StepContext() + : d_state0_mem(nullptr), d_accum_mem(nullptr), + d_state_curr_mem(nullptr), d_state_next_mem(nullptr), + d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr), + cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0), + matter_ready(false), state_ready(false) + { + d_state0.fill(nullptr); + d_accum.fill(nullptr); + d_state_curr.fill(nullptr); + d_state_next.fill(nullptr); + d_matter.fill(nullptr); + } +}; + +struct StepAllocation { + double *d_state0_mem; + double *d_accum_mem; + double *d_state_curr_mem; + double *d_state_next_mem; + double *d_matter_mem; + double *d_comm_mem; + double *h_comm_mem; + size_t cap_all; + size_t cap_comm; + bool h_comm_pinned; + size_t cap_h_comm; +}; + +static std::unordered_map g_step_ctx; +static std::vector g_step_pool; + +static StepAllocation empty_step_allocation() +{ + StepAllocation alloc = { + nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, + 0, 0, false, 0 + }; + return alloc; +} + +static bool has_step_allocation(const StepAllocation &alloc) +{ + return alloc.cap_all != 0; +} + +static StepAllocation detach_step_allocation(StepContext &ctx) +{ + StepAllocation alloc = { + ctx.d_state0_mem, ctx.d_accum_mem, ctx.d_state_curr_mem, + ctx.d_state_next_mem, ctx.d_matter_mem, + ctx.d_comm_mem, ctx.h_comm_mem, + ctx.cap_all, ctx.cap_comm, ctx.h_comm_pinned, ctx.cap_h_comm + }; + ctx.d_state0_mem = nullptr; + ctx.d_accum_mem = nullptr; + ctx.d_state_curr_mem = nullptr; + ctx.d_state_next_mem = nullptr; + ctx.d_matter_mem = nullptr; + ctx.d_comm_mem = nullptr; + ctx.h_comm_mem = nullptr; + ctx.cap_all = 0; + ctx.cap_comm = 0; + ctx.h_comm_pinned = false; + ctx.cap_h_comm = 0; + ctx.matter_ready = false; + ctx.state_ready = false; + ctx.d_state0.fill(nullptr); + ctx.d_accum.fill(nullptr); + ctx.d_state_curr.fill(nullptr); + ctx.d_state_next.fill(nullptr); + ctx.d_matter.fill(nullptr); + return alloc; +} + +static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc) +{ + ctx.d_state0_mem = alloc.d_state0_mem; + ctx.d_accum_mem = alloc.d_accum_mem; + ctx.d_state_curr_mem = alloc.d_state_curr_mem; + ctx.d_state_next_mem = alloc.d_state_next_mem; + ctx.d_matter_mem = alloc.d_matter_mem; + ctx.d_comm_mem = alloc.d_comm_mem; + ctx.h_comm_mem = alloc.h_comm_mem; + ctx.cap_all = alloc.cap_all; + ctx.cap_comm = alloc.cap_comm; + ctx.h_comm_pinned = alloc.h_comm_pinned; + ctx.cap_h_comm = alloc.cap_h_comm; + ctx.matter_ready = false; + ctx.state_ready = false; +} + +static void recycle_step_allocation(StepAllocation &alloc) +{ + if (!has_step_allocation(alloc)) return; + g_step_pool.push_back(alloc); + alloc = empty_step_allocation(); +} + +static StepAllocation acquire_step_allocation(size_t all) +{ + size_t best = g_step_pool.size(); + for (size_t i = 0; i < g_step_pool.size(); ++i) { + if (g_step_pool[i].cap_all < all) continue; + if (best == g_step_pool.size() || g_step_pool[i].cap_all < g_step_pool[best].cap_all) + best = i; + } + if (best == g_step_pool.size()) + return empty_step_allocation(); + + StepAllocation alloc = g_step_pool[best]; + g_step_pool[best] = g_step_pool.back(); + g_step_pool.pop_back(); + return alloc; +} + +static void ensure_gpu_buffers(int nx, int ny, int nz) { + size_t all = (size_t)nx * ny * nz; + size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2); + size_t fh3_size = (size_t)(nx+3) * (ny+3) * (nz+3); + const bool need_grow = (!g_buf.initialized) + || (all > g_buf.cap_all) + || (fh2_size > g_buf.cap_fh2_size) + || (fh3_size > g_buf.cap_fh3_size); + + if (need_grow) { + if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; } + if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; } + if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; } + if (g_buf.h_stage) { + if (g_buf.h_stage_pinned) cudaFreeHost(g_buf.h_stage); + else free(g_buf.h_stage); + g_buf.h_stage = nullptr; + g_buf.h_stage_pinned = false; + } + + CUDA_CHECK(cudaMalloc(&g_buf.d_mem, NUM_USED_SLOTS * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double))); + CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double))); + + const size_t stage_bytes = (size_t)STAGE_SLOT_COUNT * all * sizeof(double); + cudaError_t stage_err = cudaMallocHost((void**)&g_buf.h_stage, stage_bytes); + if (stage_err == cudaSuccess) { + g_buf.h_stage_pinned = true; + } else { + g_buf.h_stage = (double *)malloc(stage_bytes); + g_buf.h_stage_pinned = false; + if (!g_buf.h_stage) { + fprintf(stderr, "Host stage allocation failed (%zu bytes)\n", stage_bytes); + exit(EXIT_FAILURE); + } + } + + g_buf.cap_all = all; + g_buf.cap_fh2_size = fh2_size; + g_buf.cap_fh3_size = fh3_size; + g_buf.initialized = true; + } + + for (int s = 0; s < NUM_USED_SLOTS; ++s) + g_buf.slot[s] = g_buf.d_mem + s * all; + + g_buf.prev_nx = nx; + g_buf.prev_ny = ny; + g_buf.prev_nz = nz; +} + +static StepContext &ensure_step_ctx(void *block_tag, size_t all) +{ + StepContext &ctx = g_step_ctx[block_tag]; + if (ctx.cap_all < all) { + StepAllocation old_alloc = detach_step_allocation(ctx); + recycle_step_allocation(old_alloc); + + StepAllocation alloc = acquire_step_allocation(all); + if (!has_step_allocation(alloc)) { + CUDA_CHECK(cudaMalloc(&alloc.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&alloc.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&alloc.d_state_curr_mem, BSSN_STATE_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&alloc.d_state_next_mem, BSSN_STATE_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&alloc.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double))); + alloc.cap_all = all; + } + attach_step_allocation(ctx, alloc); + } + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all; + ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all; + ctx.d_state_curr[i] = ctx.d_state_curr_mem + (size_t)i * all; + ctx.d_state_next[i] = ctx.d_state_next_mem + (size_t)i * all; + } + for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { + ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all; + } + return ctx; +} + +static void release_step_ctx(void *block_tag) +{ + auto it = g_step_ctx.find(block_tag); + if (it == g_step_ctx.end()) return; + StepAllocation alloc = detach_step_allocation(it->second); + recycle_step_allocation(alloc); + g_step_ctx.erase(it); +} + +static double *ensure_step_comm_buffer(StepContext &ctx, size_t needed_doubles) +{ + if (needed_doubles == 0) return nullptr; + if (ctx.cap_comm < needed_doubles) { + if (ctx.d_comm_mem) { + CUDA_CHECK(cudaFree(ctx.d_comm_mem)); + ctx.d_comm_mem = nullptr; + } + CUDA_CHECK(cudaMalloc(&ctx.d_comm_mem, needed_doubles * sizeof(double))); + ctx.cap_comm = needed_doubles; + } + return ctx.d_comm_mem; +} + +static double *ensure_step_host_comm_buffer(StepContext &ctx, size_t needed_doubles) +{ + if (needed_doubles == 0) return nullptr; + if (ctx.cap_h_comm < needed_doubles) { + if (ctx.h_comm_mem) { + if (ctx.h_comm_pinned) cudaFreeHost(ctx.h_comm_mem); + else free(ctx.h_comm_mem); + ctx.h_comm_mem = nullptr; + ctx.h_comm_pinned = false; + } + + const size_t bytes = needed_doubles * sizeof(double); + cudaError_t err = cudaMallocHost((void **)&ctx.h_comm_mem, bytes); + if (err == cudaSuccess) { + ctx.h_comm_pinned = true; + } else { + ctx.h_comm_mem = (double *)malloc(bytes); + ctx.h_comm_pinned = false; + if (!ctx.h_comm_mem) { + fprintf(stderr, "Host comm allocation failed (%zu bytes)\n", bytes); + exit(EXIT_FAILURE); + } + } + ctx.cap_h_comm = needed_doubles; + } + return ctx.h_comm_mem; +} + +static void upload_grid_params_if_needed(const GridParams &gp) +{ + if (!g_gp_host_cache_valid || + std::memcmp(&g_gp_host_cache, &gp, sizeof(GridParams)) != 0) { + CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams))); + g_gp_host_cache = gp; + g_gp_host_cache_valid = true; + } +} + +/* ================================================================== */ +/* A. Symmetry boundary kernels (ord=2 and ord=3) */ +/* ================================================================== */ + +/* Step 1: Copy interior into ghost-padded array */ +__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func, + double * __restrict__ fh, + double SoA0, double SoA1, double SoA2) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny; + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < d_gp.all; + tid += blockDim.x * gridDim.x) + { + int i0 = tid % nx; + int j0 = (tid / nx) % ny; + int k0 = tid / (nx * ny); + int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; + fh[(iF+1) + (jF+1)*fnx + (kF+1)*fnx*fny] = func[tid]; + } +} + +/* Fused symmetry pack (ord=2): fill full fh from interior func in one pass. */ +__global__ void kern_symbd_pack_ord2(const double * __restrict__ func, + double * __restrict__ fh, + double SoA0, double SoA1, double SoA2) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1]; + const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny, fnz = d_gp.fh2_nz; + const int total = fnx * fny * fnz; + + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < total; + tid += blockDim.x * gridDim.x) + { + int ii = tid % fnx; + int jj = (tid / fnx) % fny; + int kk = tid / (fnx * fny); + + int iF = ii - 1; /* -1 .. nx */ + int jF = jj - 1; /* -1 .. ny */ + int kF = kk - 1; /* -1 .. nz */ + + int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */ + int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */ + int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */ + + double sign = 1.0; + if (iF <= 0) sign *= SoA0; + if (jF <= 0) sign *= SoA1; + if (kF <= 0) sign *= SoA2; + + int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny; + fh[tid] = sign * func[src]; + } +} + +/* Step 2: Fill i-ghosts (x-direction symmetry) */ +__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0) +{ + const int ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny; + /* ord=2: fill iF=0 and iF=-1, i.e. ghost layers ii=0 from ii=2, ii=1 from ii=1 */ + /* Fortran: do ii=0,ord-1: funcc(-ii,jF,kF) = funcc(ii+1,jF,kF)*SoA[0] */ + int total = ny * nz; /* jF=1..ny, kF=1..nz */ + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < total * 2; /* 2 ghost layers */ + tid += blockDim.x * gridDim.x) + { + int ii = tid / total; /* 0 or 1 */ + int rem = tid % total; + int j0 = rem % ny; + int k0 = rem / ny; + int jF = j0 + 1, kF = k0 + 1; + int iF_dst = -ii; /* 0, -1 */ + int iF_src = ii + 1; /* 1, 2 */ + fh[(iF_dst+1) + (jF+1)*fnx + (kF+1)*fnx*fny] = + fh[(iF_src+1) + (jF+1)*fnx + (kF+1)*fnx*fny] * SoA0; + } +} + +/* Step 3: Fill j-ghosts (y-direction symmetry) */ +__global__ void kern_symbd_jghost_ord2(double * __restrict__ fh, double SoA1) +{ + const int nx = d_gp.ex[0], nz = d_gp.ex[2]; + const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny; + /* iF ranges from -1 to nx (i.e. -ord+1 to ex1), total = nx+2 */ + int irange = nx + 2; + int total = irange * nz; + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < total * 2; + tid += blockDim.x * gridDim.x) + { + int jj = tid / total; + int rem = tid % total; + int ii = rem % irange; + int k0 = rem / irange; + int iF = ii - 1; /* -1 .. nx */ + int kF = k0 + 1; + int jF_dst = -jj; + int jF_src = jj + 1; + fh[(iF+1) + (jF_dst+1)*fnx + (kF+1)*fnx*fny] = + fh[(iF+1) + (jF_src+1)*fnx + (kF+1)*fnx*fny] * SoA1; + } +} + +/* Step 4: Fill k-ghosts (z-direction symmetry) */ +__global__ void kern_symbd_kghost_ord2(double * __restrict__ fh, double SoA2) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1]; + const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny; + int irange = nx + 2; + int jrange = ny + 2; + int total = irange * jrange; + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < total * 2; + tid += blockDim.x * gridDim.x) + { + int kk = tid / total; + int rem = tid % total; + int ii = rem % irange; + int jj = rem / irange; + int iF = ii - 1; + int jF = jj - 1; + int kF_dst = -kk; + int kF_src = kk + 1; + fh[(iF+1) + (jF+1)*fnx + (kF_dst+1)*fnx*fny] = + fh[(iF+1) + (jF+1)*fnx + (kF_src+1)*fnx*fny] * SoA2; + } +} + +/* ---- ord=3 variants (for lopsided / kodis) ---- */ + +__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func, + double * __restrict__ fh) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny; + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < d_gp.all; + tid += blockDim.x * gridDim.x) + { + int i0 = tid % nx; + int j0 = (tid / nx) % ny; + int k0 = tid / (nx * ny); + int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; + fh[(iF+2) + (jF+2)*fnx + (kF+2)*fnx*fny] = func[tid]; + } +} + +/* Fused symmetry pack (ord=3): fill full fh from interior func in one pass. */ +__global__ void kern_symbd_pack_ord3(const double * __restrict__ func, + double * __restrict__ fh, + double SoA0, double SoA1, double SoA2) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1]; + const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny, fnz = d_gp.fh3_nz; + const int total = fnx * fny * fnz; + + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < total; + tid += blockDim.x * gridDim.x) + { + int ii = tid % fnx; + int jj = (tid / fnx) % fny; + int kk = tid / (fnx * fny); + + int iF = ii - 2; /* -2 .. nx */ + int jF = jj - 2; /* -2 .. ny */ + int kF = kk - 2; /* -2 .. nz */ + + int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */ + int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */ + int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */ + + double sign = 1.0; + if (iF <= 0) sign *= SoA0; + if (jF <= 0) sign *= SoA1; + if (kF <= 0) sign *= SoA2; + + int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny; + fh[tid] = sign * func[src]; + } +} + +__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0) +{ + const int ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny; + int total = ny * nz; + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < total * 3; + tid += blockDim.x * gridDim.x) + { + int ii = tid / total; + int rem = tid % total; + int j0 = rem % ny; + int k0 = rem / ny; + int jF = j0 + 1, kF = k0 + 1; + int iF_dst = -ii; + int iF_src = ii + 1; + fh[(iF_dst+2) + (jF+2)*fnx + (kF+2)*fnx*fny] = + fh[(iF_src+2) + (jF+2)*fnx + (kF+2)*fnx*fny] * SoA0; + } +} + +__global__ void kern_symbd_jghost_ord3(double * __restrict__ fh, double SoA1) +{ + const int nx = d_gp.ex[0], nz = d_gp.ex[2]; + const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny; + int irange = nx + 3; + int total = irange * nz; + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < total * 3; + tid += blockDim.x * gridDim.x) + { + int jj = tid / total; + int rem = tid % total; + int ii = rem % irange; + int k0 = rem / irange; + int iF = ii - 2; + int kF = k0 + 1; + int jF_dst = -jj; + int jF_src = jj + 1; + fh[(iF+2) + (jF_dst+2)*fnx + (kF+2)*fnx*fny] = + fh[(iF+2) + (jF_src+2)*fnx + (kF+2)*fnx*fny] * SoA1; + } +} + +__global__ void kern_symbd_kghost_ord3(double * __restrict__ fh, double SoA2) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1]; + const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny; + int irange = nx + 3; + int jrange = ny + 3; + int total = irange * jrange; + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < total * 3; + tid += blockDim.x * gridDim.x) + { + int kk = tid / total; + int rem = tid % total; + int ii = rem % irange; + int jj = rem / irange; + int iF = ii - 2; + int jF = jj - 2; + int kF_dst = -kk; + int kF_src = kk + 1; + fh[(iF+2) + (jF+2)*fnx + (kF_dst+2)*fnx*fny] = + fh[(iF+2) + (jF+2)*fnx + (kF_src+2)*fnx*fny] * SoA2; + } +} + +/* ================================================================== */ +/* B. Stencil kernels */ +/* ================================================================== */ + +/* ---- First derivatives (ord=2, 4th/2nd order) ---- */ +__global__ __launch_bounds__(128, 4) +void kern_fderivs(const double * __restrict__ fh, + double * __restrict__ fx, + double * __restrict__ fy, + double * __restrict__ fz) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; + + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < d_gp.all; + tid += blockDim.x * gridDim.x) + { + int i0 = tid % nx; + int j0 = (tid / nx) % ny; + int k0 = tid / (nx * ny); + + /* boundary points: leave as zero */ + if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) { + fx[tid] = 0.0; fy[tid] = 0.0; fz[tid] = 0.0; + continue; + } + + int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; + + if ((iF+2) <= imaxF && (iF-2) >= iminF && + (jF+2) <= jmaxF && (jF-2) >= jminF && + (kF+2) <= kmaxF && (kF-2) >= kminF) + { + fx[tid] = d_gp.d12dx * ( + fh[idx_fh2(iF-2,jF,kF)] - 8.0*fh[idx_fh2(iF-1,jF,kF)] + + 8.0*fh[idx_fh2(iF+1,jF,kF)] - fh[idx_fh2(iF+2,jF,kF)]); + fy[tid] = d_gp.d12dy * ( + fh[idx_fh2(iF,jF-2,kF)] - 8.0*fh[idx_fh2(iF,jF-1,kF)] + + 8.0*fh[idx_fh2(iF,jF+1,kF)] - fh[idx_fh2(iF,jF+2,kF)]); + fz[tid] = d_gp.d12dz * ( + fh[idx_fh2(iF,jF,kF-2)] - 8.0*fh[idx_fh2(iF,jF,kF-1)] + + 8.0*fh[idx_fh2(iF,jF,kF+1)] - fh[idx_fh2(iF,jF,kF+2)]); + } + else if ((iF+1) <= imaxF && (iF-1) >= iminF && + (jF+1) <= jmaxF && (jF-1) >= jminF && + (kF+1) <= kmaxF && (kF-1) >= kminF) + { + fx[tid] = d_gp.d2dx * ( + -fh[idx_fh2(iF-1,jF,kF)] + fh[idx_fh2(iF+1,jF,kF)]); + fy[tid] = d_gp.d2dy * ( + -fh[idx_fh2(iF,jF-1,kF)] + fh[idx_fh2(iF,jF+1,kF)]); + fz[tid] = d_gp.d2dz * ( + -fh[idx_fh2(iF,jF,kF-1)] + fh[idx_fh2(iF,jF,kF+1)]); + } + else { + fx[tid] = 0.0; fy[tid] = 0.0; fz[tid] = 0.0; + } + } +} + +/* ---- Second derivatives (ord=2, 4th/2nd order) ---- */ +__global__ __launch_bounds__(128, 4) +void kern_fdderivs(const double * __restrict__ fh, + double * __restrict__ fxx, double * __restrict__ fxy, + double * __restrict__ fxz, double * __restrict__ fyy, + double * __restrict__ fyz, double * __restrict__ fzz) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; + + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < d_gp.all; + tid += blockDim.x * gridDim.x) + { + int i0 = tid % nx; + int j0 = (tid / nx) % ny; + int k0 = tid / (nx * ny); + + if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) { + fxx[tid]=0; fxy[tid]=0; fxz[tid]=0; + fyy[tid]=0; fyz[tid]=0; fzz[tid]=0; + continue; + } + + int iF = i0+1, jF = j0+1, kF = k0+1; + + if ((iF+2)<=imaxF && (iF-2)>=iminF && + (jF+2)<=jmaxF && (jF-2)>=jminF && + (kF+2)<=kmaxF && (kF-2)>=kminF) + { + /* 4th-order diagonal */ + double c = fh[idx_fh2(iF,jF,kF)]; + fxx[tid] = d_gp.Fdxdx*( + -fh[idx_fh2(iF-2,jF,kF)] + 16.0*fh[idx_fh2(iF-1,jF,kF)] + -30.0*c + 16.0*fh[idx_fh2(iF+1,jF,kF)] - fh[idx_fh2(iF+2,jF,kF)]); + fyy[tid] = d_gp.Fdydy*( + -fh[idx_fh2(iF,jF-2,kF)] + 16.0*fh[idx_fh2(iF,jF-1,kF)] + -30.0*c + 16.0*fh[idx_fh2(iF,jF+1,kF)] - fh[idx_fh2(iF,jF+2,kF)]); + fzz[tid] = d_gp.Fdzdz*( + -fh[idx_fh2(iF,jF,kF-2)] + 16.0*fh[idx_fh2(iF,jF,kF-1)] + -30.0*c + 16.0*fh[idx_fh2(iF,jF,kF+1)] - fh[idx_fh2(iF,jF,kF+2)]); + + /* 4th-order cross: fxy */ + { + double t_jm2 = fh[idx_fh2(iF-2,jF-2,kF)] - 8.0*fh[idx_fh2(iF-1,jF-2,kF)] + + 8.0*fh[idx_fh2(iF+1,jF-2,kF)] - fh[idx_fh2(iF+2,jF-2,kF)]; + double t_jm1 = fh[idx_fh2(iF-2,jF-1,kF)] - 8.0*fh[idx_fh2(iF-1,jF-1,kF)] + + 8.0*fh[idx_fh2(iF+1,jF-1,kF)] - fh[idx_fh2(iF+2,jF-1,kF)]; + double t_jp1 = fh[idx_fh2(iF-2,jF+1,kF)] - 8.0*fh[idx_fh2(iF-1,jF+1,kF)] + + 8.0*fh[idx_fh2(iF+1,jF+1,kF)] - fh[idx_fh2(iF+2,jF+1,kF)]; + double t_jp2 = fh[idx_fh2(iF-2,jF+2,kF)] - 8.0*fh[idx_fh2(iF-1,jF+2,kF)] + + 8.0*fh[idx_fh2(iF+1,jF+2,kF)] - fh[idx_fh2(iF+2,jF+2,kF)]; + fxy[tid] = d_gp.Fdxdy*(t_jm2 - 8.0*t_jm1 + 8.0*t_jp1 - t_jp2); + } + /* 4th-order cross: fxz */ + { + double t_km2 = fh[idx_fh2(iF-2,jF,kF-2)] - 8.0*fh[idx_fh2(iF-1,jF,kF-2)] + + 8.0*fh[idx_fh2(iF+1,jF,kF-2)] - fh[idx_fh2(iF+2,jF,kF-2)]; + double t_km1 = fh[idx_fh2(iF-2,jF,kF-1)] - 8.0*fh[idx_fh2(iF-1,jF,kF-1)] + + 8.0*fh[idx_fh2(iF+1,jF,kF-1)] - fh[idx_fh2(iF+2,jF,kF-1)]; + double t_kp1 = fh[idx_fh2(iF-2,jF,kF+1)] - 8.0*fh[idx_fh2(iF-1,jF,kF+1)] + + 8.0*fh[idx_fh2(iF+1,jF,kF+1)] - fh[idx_fh2(iF+2,jF,kF+1)]; + double t_kp2 = fh[idx_fh2(iF-2,jF,kF+2)] - 8.0*fh[idx_fh2(iF-1,jF,kF+2)] + + 8.0*fh[idx_fh2(iF+1,jF,kF+2)] - fh[idx_fh2(iF+2,jF,kF+2)]; + fxz[tid] = d_gp.Fdxdz*(t_km2 - 8.0*t_km1 + 8.0*t_kp1 - t_kp2); + } + /* 4th-order cross: fyz */ + { + double t_km2 = fh[idx_fh2(iF,jF-2,kF-2)] - 8.0*fh[idx_fh2(iF,jF-1,kF-2)] + + 8.0*fh[idx_fh2(iF,jF+1,kF-2)] - fh[idx_fh2(iF,jF+2,kF-2)]; + double t_km1 = fh[idx_fh2(iF,jF-2,kF-1)] - 8.0*fh[idx_fh2(iF,jF-1,kF-1)] + + 8.0*fh[idx_fh2(iF,jF+1,kF-1)] - fh[idx_fh2(iF,jF+2,kF-1)]; + double t_kp1 = fh[idx_fh2(iF,jF-2,kF+1)] - 8.0*fh[idx_fh2(iF,jF-1,kF+1)] + + 8.0*fh[idx_fh2(iF,jF+1,kF+1)] - fh[idx_fh2(iF,jF+2,kF+1)]; + double t_kp2 = fh[idx_fh2(iF,jF-2,kF+2)] - 8.0*fh[idx_fh2(iF,jF-1,kF+2)] + + 8.0*fh[idx_fh2(iF,jF+1,kF+2)] - fh[idx_fh2(iF,jF+2,kF+2)]; + fyz[tid] = d_gp.Fdydz*(t_km2 - 8.0*t_km1 + 8.0*t_kp1 - t_kp2); + } + } + else if ((iF+1)<=imaxF && (iF-1)>=iminF && + (jF+1)<=jmaxF && (jF-1)>=jminF && + (kF+1)<=kmaxF && (kF-1)>=kminF) + { + double c = fh[idx_fh2(iF,jF,kF)]; + fxx[tid] = d_gp.Sdxdx*(fh[idx_fh2(iF-1,jF,kF)] - 2.0*c + fh[idx_fh2(iF+1,jF,kF)]); + fyy[tid] = d_gp.Sdydy*(fh[idx_fh2(iF,jF-1,kF)] - 2.0*c + fh[idx_fh2(iF,jF+1,kF)]); + fzz[tid] = d_gp.Sdzdz*(fh[idx_fh2(iF,jF,kF-1)] - 2.0*c + fh[idx_fh2(iF,jF,kF+1)]); + fxy[tid] = d_gp.Sdxdy*(fh[idx_fh2(iF-1,jF-1,kF)] - fh[idx_fh2(iF+1,jF-1,kF)] + -fh[idx_fh2(iF-1,jF+1,kF)] + fh[idx_fh2(iF+1,jF+1,kF)]); + fxz[tid] = d_gp.Sdxdz*(fh[idx_fh2(iF-1,jF,kF-1)] - fh[idx_fh2(iF+1,jF,kF-1)] + -fh[idx_fh2(iF-1,jF,kF+1)] + fh[idx_fh2(iF+1,jF,kF+1)]); + fyz[tid] = d_gp.Sdydz*(fh[idx_fh2(iF,jF-1,kF-1)] - fh[idx_fh2(iF,jF+1,kF-1)] + -fh[idx_fh2(iF,jF-1,kF+1)] + fh[idx_fh2(iF,jF+1,kF+1)]); + } + else { + fxx[tid]=0; fxy[tid]=0; fxz[tid]=0; + fyy[tid]=0; fyz[tid]=0; fzz[tid]=0; + } + } +} + +/* ---- Lopsided (upwind advection) kernel ---- */ +__global__ __launch_bounds__(128, 4) +void kern_lopsided(const double * __restrict__ fh, + double * __restrict__ f_rhs, + const double * __restrict__ Sfx, + const double * __restrict__ Sfy, + const double * __restrict__ Sfz) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3; + + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < d_gp.all; + tid += blockDim.x * gridDim.x) + { + int i0 = tid % nx; + int j0 = (tid / nx) % ny; + int k0 = tid / (nx * ny); + + if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) continue; + + int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; + double val = 0.0; + + /* --- x direction --- */ + double sfx = Sfx[tid]; + if (sfx > 0.0) { + if (i0 <= nx - 4) { + val += sfx * d_gp.d12dx * ( + -3.0*fh[idx_fh3(iF-1,jF,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)] + +18.0*fh[idx_fh3(iF+1,jF,kF)] - 6.0*fh[idx_fh3(iF+2,jF,kF)] + + fh[idx_fh3(iF+3,jF,kF)]); + } else if (i0 <= nx - 3) { + val += sfx * d_gp.d12dx * ( + fh[idx_fh3(iF-2,jF,kF)] - 8.0*fh[idx_fh3(iF-1,jF,kF)] + +8.0*fh[idx_fh3(iF+1,jF,kF)] - fh[idx_fh3(iF+2,jF,kF)]); + } else if (i0 <= nx - 2) { + val -= sfx * d_gp.d12dx * ( + -3.0*fh[idx_fh3(iF+1,jF,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)] + +18.0*fh[idx_fh3(iF-1,jF,kF)] - 6.0*fh[idx_fh3(iF-2,jF,kF)] + + fh[idx_fh3(iF-3,jF,kF)]); + } + } else if (sfx < 0.0) { + if ((i0 - 2) >= iminF) { + val -= sfx * d_gp.d12dx * ( + -3.0*fh[idx_fh3(iF+1,jF,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)] + +18.0*fh[idx_fh3(iF-1,jF,kF)] - 6.0*fh[idx_fh3(iF-2,jF,kF)] + + fh[idx_fh3(iF-3,jF,kF)]); + } else if ((i0 - 1) >= iminF) { + val += sfx * d_gp.d12dx * ( + fh[idx_fh3(iF-2,jF,kF)] - 8.0*fh[idx_fh3(iF-1,jF,kF)] + +8.0*fh[idx_fh3(iF+1,jF,kF)] - fh[idx_fh3(iF+2,jF,kF)]); + } else if (i0 >= iminF) { + val += sfx * d_gp.d12dx * ( + -3.0*fh[idx_fh3(iF-1,jF,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)] + +18.0*fh[idx_fh3(iF+1,jF,kF)] - 6.0*fh[idx_fh3(iF+2,jF,kF)] + + fh[idx_fh3(iF+3,jF,kF)]); + } + } + + /* --- y direction --- */ + double sfy = Sfy[tid]; + if (sfy > 0.0) { + if (j0 <= ny - 4) { + val += sfy * d_gp.d12dy * ( + -3.0*fh[idx_fh3(iF,jF-1,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)] + +18.0*fh[idx_fh3(iF,jF+1,kF)] - 6.0*fh[idx_fh3(iF,jF+2,kF)] + + fh[idx_fh3(iF,jF+3,kF)]); + } else if (j0 <= ny - 3) { + val += sfy * d_gp.d12dy * ( + fh[idx_fh3(iF,jF-2,kF)] - 8.0*fh[idx_fh3(iF,jF-1,kF)] + +8.0*fh[idx_fh3(iF,jF+1,kF)] - fh[idx_fh3(iF,jF+2,kF)]); + } else if (j0 <= ny - 2) { + val -= sfy * d_gp.d12dy * ( + -3.0*fh[idx_fh3(iF,jF+1,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)] + +18.0*fh[idx_fh3(iF,jF-1,kF)] - 6.0*fh[idx_fh3(iF,jF-2,kF)] + + fh[idx_fh3(iF,jF-3,kF)]); + } + } else if (sfy < 0.0) { + if ((j0 - 2) >= jminF) { + val -= sfy * d_gp.d12dy * ( + -3.0*fh[idx_fh3(iF,jF+1,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)] + +18.0*fh[idx_fh3(iF,jF-1,kF)] - 6.0*fh[idx_fh3(iF,jF-2,kF)] + + fh[idx_fh3(iF,jF-3,kF)]); + } else if ((j0 - 1) >= jminF) { + val += sfy * d_gp.d12dy * ( + fh[idx_fh3(iF,jF-2,kF)] - 8.0*fh[idx_fh3(iF,jF-1,kF)] + +8.0*fh[idx_fh3(iF,jF+1,kF)] - fh[idx_fh3(iF,jF+2,kF)]); + } else if (j0 >= jminF) { + val += sfy * d_gp.d12dy * ( + -3.0*fh[idx_fh3(iF,jF-1,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)] + +18.0*fh[idx_fh3(iF,jF+1,kF)] - 6.0*fh[idx_fh3(iF,jF+2,kF)] + + fh[idx_fh3(iF,jF+3,kF)]); + } + } + + /* --- z direction --- */ + double sfz = Sfz[tid]; + if (sfz > 0.0) { + if (k0 <= nz - 4) { + val += sfz * d_gp.d12dz * ( + -3.0*fh[idx_fh3(iF,jF,kF-1)] - 10.0*fh[idx_fh3(iF,jF,kF)] + +18.0*fh[idx_fh3(iF,jF,kF+1)] - 6.0*fh[idx_fh3(iF,jF,kF+2)] + + fh[idx_fh3(iF,jF,kF+3)]); + } else if (k0 <= nz - 3) { + val += sfz * d_gp.d12dz * ( + fh[idx_fh3(iF,jF,kF-2)] - 8.0*fh[idx_fh3(iF,jF,kF-1)] + +8.0*fh[idx_fh3(iF,jF,kF+1)] - fh[idx_fh3(iF,jF,kF+2)]); + } else if (k0 <= nz - 2) { + val -= sfz * d_gp.d12dz * ( + -3.0*fh[idx_fh3(iF,jF,kF+1)] - 10.0*fh[idx_fh3(iF,jF,kF)] + +18.0*fh[idx_fh3(iF,jF,kF-1)] - 6.0*fh[idx_fh3(iF,jF,kF-2)] + + fh[idx_fh3(iF,jF,kF-3)]); + } + } else if (sfz < 0.0) { + if ((k0 - 2) >= kminF) { + val -= sfz * d_gp.d12dz * ( + -3.0*fh[idx_fh3(iF,jF,kF+1)] - 10.0*fh[idx_fh3(iF,jF,kF)] + +18.0*fh[idx_fh3(iF,jF,kF-1)] - 6.0*fh[idx_fh3(iF,jF,kF-2)] + + fh[idx_fh3(iF,jF,kF-3)]); + } else if ((k0 - 1) >= kminF) { + val += sfz * d_gp.d12dz * ( + fh[idx_fh3(iF,jF,kF-2)] - 8.0*fh[idx_fh3(iF,jF,kF-1)] + +8.0*fh[idx_fh3(iF,jF,kF+1)] - fh[idx_fh3(iF,jF,kF+2)]); + } else if (k0 >= kminF) { + val += sfz * d_gp.d12dz * ( + -3.0*fh[idx_fh3(iF,jF,kF-1)] - 10.0*fh[idx_fh3(iF,jF,kF)] + +18.0*fh[idx_fh3(iF,jF,kF+1)] - 6.0*fh[idx_fh3(iF,jF,kF+2)] + + fh[idx_fh3(iF,jF,kF+3)]); + } + } + + f_rhs[tid] += val; + } +} + +/* ---- KO dissipation kernel (ord=3, 6th-order) ---- */ +__global__ __launch_bounds__(128, 4) +void kern_kodis(const double * __restrict__ fh, + double * __restrict__ f_rhs, + double eps_val) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const double cof = 64.0; + + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < d_gp.all; + tid += blockDim.x * gridDim.x) + { + int i0 = tid % nx; + int j0 = (tid / nx) % ny; + int k0 = tid / (nx * ny); + int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; + + if ((iF-3) >= iminF && (iF+3) <= imaxF && + (jF-3) >= jminF && (jF+3) <= jmaxF && + (kF-3) >= kminF && (kF+3) <= kmaxF) + { + double Dx = (fh[idx_fh3(iF-3,jF,kF)] + fh[idx_fh3(iF+3,jF,kF)]) + - 6.0*(fh[idx_fh3(iF-2,jF,kF)] + fh[idx_fh3(iF+2,jF,kF)]) + +15.0*(fh[idx_fh3(iF-1,jF,kF)] + fh[idx_fh3(iF+1,jF,kF)]) + -20.0* fh[idx_fh3(iF,jF,kF)]; + Dx /= d_gp.dX; + + double Dy = (fh[idx_fh3(iF,jF-3,kF)] + fh[idx_fh3(iF,jF+3,kF)]) + - 6.0*(fh[idx_fh3(iF,jF-2,kF)] + fh[idx_fh3(iF,jF+2,kF)]) + +15.0*(fh[idx_fh3(iF,jF-1,kF)] + fh[idx_fh3(iF,jF+1,kF)]) + -20.0* fh[idx_fh3(iF,jF,kF)]; + Dy /= d_gp.dY; + + double Dz = (fh[idx_fh3(iF,jF,kF-3)] + fh[idx_fh3(iF,jF,kF+3)]) + - 6.0*(fh[idx_fh3(iF,jF,kF-2)] + fh[idx_fh3(iF,jF,kF+2)]) + +15.0*(fh[idx_fh3(iF,jF,kF-1)] + fh[idx_fh3(iF,jF,kF+1)]) + -20.0* fh[idx_fh3(iF,jF,kF)]; + Dz /= d_gp.dZ; + + f_rhs[tid] += (eps_val / cof) * (Dx + Dy + Dz); + } + } +} + +/* ================================================================== */ +/* Host wrapper helpers */ +/* ================================================================== */ +struct LopsidedKodisTables { + const double *adv_fields[32]; + const double *ko_fields[32]; + double *rhs_fields[32]; + int soa_signs[3 * 32]; +}; + +struct FDerivTables { + const double *src_fields[32]; + double *fx_fields[32]; + double *fy_fields[32]; + double *fz_fields[32]; + int soa_signs[3 * 32]; +}; + +struct FDDerivTables { + const double *src_fields[32]; + double *fxx_fields[32]; + double *fxy_fields[32]; + double *fxz_fields[32]; + double *fyy_fields[32]; + double *fyz_fields[32]; + double *fzz_fields[32]; + int soa_signs[3 * 32]; +}; + +static constexpr int PHASE10_METRIC_FIELD_COUNT = 6; + +struct Phase10RicciTables { + const double *src_fields[PHASE10_METRIC_FIELD_COUNT]; + double *dst_fields[PHASE10_METRIC_FIELD_COUNT]; + int soa_signs[3 * PHASE10_METRIC_FIELD_COUNT]; +}; + +struct Rk4FinalizeTables { + const double *f0_fields[BSSN_STATE_COUNT]; + double *rhs_fields[BSSN_STATE_COUNT]; + double *accum_fields[BSSN_STATE_COUNT]; +}; + +struct PatchBoundaryTables { + const double *src_fields[32]; + double *dst_fields[BSSN_STATE_COUNT]; +}; + +static const int BLK = 128; +static inline int grid(size_t n) { + if (n == 0) return 1; + size_t g = (n + BLK - 1) / BLK; + if (g > 2147483647u) g = 2147483647u; + return (int)g; +} + +__global__ __launch_bounds__(128, 4) +void kern_fderivs_batched(FDerivTables tables, int field_count) +{ + const int field = blockIdx.y; + if (field >= field_count) return; + + const double *src = tables.src_fields[field]; + double *fx = tables.fx_fields[field]; + double *fy = tables.fy_fields[field]; + double *fz = tables.fz_fields[field]; + const int SoA0 = tables.soa_signs[3 * field + 0]; + const int SoA1 = tables.soa_signs[3 * field + 1]; + const int SoA2 = tables.soa_signs[3 * field + 2]; + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + + if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) { + fx[tid] = 0.0; + fy[tid] = 0.0; + fz[tid] = 0.0; + return; + } + + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + fx[tid] = d_gp.d12dx * ( + fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2)); + fy[tid] = d_gp.d12dy * ( + fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2)); + fz[tid] = d_gp.d12dz * ( + fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2)); + } + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + fx[tid] = d_gp.d2dx * ( + -fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + +fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)); + fy[tid] = d_gp.d2dy * ( + -fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + +fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)); + fz[tid] = d_gp.d2dz * ( + -fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + +fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)); + } + else { + fx[tid] = 0.0; + fy[tid] = 0.0; + fz[tid] = 0.0; + } +} + +__global__ __launch_bounds__(128, 4) +void kern_fdderivs_batched(FDDerivTables tables, int field_count) +{ + const int field = blockIdx.y; + if (field >= field_count) return; + + const double *src = tables.src_fields[field]; + double *fxx = tables.fxx_fields[field]; + double *fxy = tables.fxy_fields[field]; + double *fxz = tables.fxz_fields[field]; + double *fyy = tables.fyy_fields[field]; + double *fyz = tables.fyz_fields[field]; + double *fzz = tables.fzz_fields[field]; + const int SoA0 = tables.soa_signs[3 * field + 0]; + const int SoA1 = tables.soa_signs[3 * field + 1]; + const int SoA2 = tables.soa_signs[3 * field + 2]; + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + + if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) { + fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0; + fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0; + return; + } + + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2); + fxx[tid] = d_gp.Fdxdx * ( + -fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + +16.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2)); + fyy[tid] = d_gp.Fdydy * ( + -fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + +16.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2)); + fzz[tid] = d_gp.Fdzdz * ( + -fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + +16.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2)); + + const double t_jm2 = + fetch_sym_ord2_direct(src, iF - 2, jF - 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 2, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 2, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF - 2, kF, SoA0, SoA1, SoA2); + const double t_jm1 = + fetch_sym_ord2_direct(src, iF - 2, jF - 1, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF - 1, kF, SoA0, SoA1, SoA2); + const double t_jp1 = + fetch_sym_ord2_direct(src, iF - 2, jF + 1, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF + 1, kF, SoA0, SoA1, SoA2); + const double t_jp2 = + fetch_sym_ord2_direct(src, iF - 2, jF + 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 2, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 2, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF + 2, kF, SoA0, SoA1, SoA2); + fxy[tid] = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2); + + const double t_km2_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF - 2, SoA0, SoA1, SoA2); + const double t_km1_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF - 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF - 1, SoA0, SoA1, SoA2); + const double t_kp1_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF + 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF + 1, SoA0, SoA1, SoA2); + const double t_kp2_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF + 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF + 2, SoA0, SoA1, SoA2); + fxz[tid] = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x); + + const double t_km2_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF - 2, SoA0, SoA1, SoA2); + const double t_km1_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF - 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF - 1, SoA0, SoA1, SoA2); + const double t_kp1_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF + 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF + 1, SoA0, SoA1, SoA2); + const double t_kp2_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF + 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF + 2, SoA0, SoA1, SoA2); + fyz[tid] = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y); + } + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2); + fxx[tid] = d_gp.Sdxdx * ( + fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + - 2.0 * c + + fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)); + fyy[tid] = d_gp.Sdydy * ( + fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + - 2.0 * c + + fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)); + fzz[tid] = d_gp.Sdzdz * ( + fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + - 2.0 * c + + fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)); + fxy[tid] = d_gp.Sdxdy * ( + fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2)); + fxz[tid] = d_gp.Sdxdz * ( + fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2) + + fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2)); + fyz[tid] = d_gp.Sdydz * ( + fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2) + + fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2)); + } + else { + fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0; + fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0; + } +} + +/* symmetry_bd on GPU for ord=2, then launch fderivs kernel */ +static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz, + double SoA0, double SoA1, double SoA2, int all) +{ + double *fh = g_buf.d_fh2; + const size_t nx = (size_t)g_buf.prev_nx; + const size_t ny = (size_t)g_buf.prev_ny; + const size_t nz = (size_t)g_buf.prev_nz; + const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull); + + kern_symbd_pack_ord2<<>>(d_f, fh, SoA0, SoA1, SoA2); + kern_fderivs<<>>(fh, d_fx, d_fy, d_fz); +} + +/* symmetry_bd on GPU for ord=2, then launch fdderivs kernel */ +static void gpu_fdderivs(double *d_f, + double *d_fxx, double *d_fxy, double *d_fxz, + double *d_fyy, double *d_fyz, double *d_fzz, + double SoA0, double SoA1, double SoA2, int all) +{ + double *fh = g_buf.d_fh2; + const size_t nx = (size_t)g_buf.prev_nx; + const size_t ny = (size_t)g_buf.prev_ny; + const size_t nz = (size_t)g_buf.prev_nz; + const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull); + + kern_symbd_pack_ord2<<>>(d_f, fh, SoA0, SoA1, SoA2); + kern_fdderivs<<>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz); +} + +static void gpu_fderivs_batch(int field_count, + double *const *src_fields, + double *const *fx_fields, + double *const *fy_fields, + double *const *fz_fields, + const int *soa_signs, + int all) +{ + if (field_count <= 0) return; + FDerivTables tables = {}; + for (int i = 0; i < field_count; ++i) { + tables.src_fields[i] = src_fields[i]; + tables.fx_fields[i] = fx_fields[i]; + tables.fy_fields[i] = fy_fields[i]; + tables.fz_fields[i] = fz_fields[i]; + tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0]; + tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1]; + tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2]; + } + dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)field_count); + kern_fderivs_batched<<>>(tables, field_count); +} + +static void gpu_fdderivs_batch(int field_count, + double *const *src_fields, + double *const *fxx_fields, + double *const *fxy_fields, + double *const *fxz_fields, + double *const *fyy_fields, + double *const *fyz_fields, + double *const *fzz_fields, + const int *soa_signs, + int all) +{ + if (field_count <= 0) return; + FDDerivTables tables = {}; + for (int i = 0; i < field_count; ++i) { + tables.src_fields[i] = src_fields[i]; + tables.fxx_fields[i] = fxx_fields[i]; + tables.fxy_fields[i] = fxy_fields[i]; + tables.fxz_fields[i] = fxz_fields[i]; + tables.fyy_fields[i] = fyy_fields[i]; + tables.fyz_fields[i] = fyz_fields[i]; + tables.fzz_fields[i] = fzz_fields[i]; + tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0]; + tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1]; + tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2]; + } + dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)field_count); + kern_fdderivs_batched<<>>(tables, field_count); +} + +__global__ __launch_bounds__(128, 4) +void kern_phase10_ricci_batched(const double * __restrict__ gupxx, + const double * __restrict__ gupxy, + const double * __restrict__ gupxz, + const double * __restrict__ gupyy, + const double * __restrict__ gupyz, + const double * __restrict__ gupzz, + Phase10RicciTables tables) +{ + const int field = blockIdx.y; + if (field >= PHASE10_METRIC_FIELD_COUNT) return; + + const double *src = tables.src_fields[field]; + double *dst = tables.dst_fields[field]; + const int SoA0 = tables.soa_signs[3 * field + 0]; + const int SoA1 = tables.soa_signs[3 * field + 1]; + const int SoA2 = tables.soa_signs[3 * field + 2]; + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + + double fxx = 0.0, fxy = 0.0, fxz = 0.0; + double fyy = 0.0, fyz = 0.0, fzz = 0.0; + + if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) { + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2); + fxx = d_gp.Fdxdx * ( + -fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + +16.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2)); + fyy = d_gp.Fdydy * ( + -fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + +16.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2)); + fzz = d_gp.Fdzdz * ( + -fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + +16.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2)); + + const double t_jm2 = + fetch_sym_ord2_direct(src, iF - 2, jF - 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 2, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 2, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF - 2, kF, SoA0, SoA1, SoA2); + const double t_jm1 = + fetch_sym_ord2_direct(src, iF - 2, jF - 1, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF - 1, kF, SoA0, SoA1, SoA2); + const double t_jp1 = + fetch_sym_ord2_direct(src, iF - 2, jF + 1, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF + 1, kF, SoA0, SoA1, SoA2); + const double t_jp2 = + fetch_sym_ord2_direct(src, iF - 2, jF + 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 2, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 2, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF + 2, kF, SoA0, SoA1, SoA2); + fxy = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2); + + const double t_km2_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF - 2, SoA0, SoA1, SoA2); + const double t_km1_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF - 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF - 1, SoA0, SoA1, SoA2); + const double t_kp1_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF + 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF + 1, SoA0, SoA1, SoA2); + const double t_kp2_x = + fetch_sym_ord2_direct(src, iF - 2, jF, kF + 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 2, jF, kF + 2, SoA0, SoA1, SoA2); + fxz = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x); + + const double t_km2_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF - 2, SoA0, SoA1, SoA2); + const double t_km1_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF - 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF - 1, SoA0, SoA1, SoA2); + const double t_kp1_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF + 1, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF + 1, SoA0, SoA1, SoA2); + const double t_kp2_y = + fetch_sym_ord2_direct(src, iF, jF - 2, kF + 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 2, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 2, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 2, kF + 2, SoA0, SoA1, SoA2); + fyz = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y); + } + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2); + fxx = d_gp.Sdxdx * ( + fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + - 2.0 * c + + fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)); + fyy = d_gp.Sdydy * ( + fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + - 2.0 * c + + fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)); + fzz = d_gp.Sdzdz * ( + fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + - 2.0 * c + + fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)); + fxy = d_gp.Sdxdy * ( + fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2)); + fxz = d_gp.Sdxdz * ( + fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2) + + fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2)); + fyz = d_gp.Sdydz * ( + fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2) + - fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2) + + fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2)); + } + } + + dst[tid] = gupxx[tid] * fxx + gupyy[tid] * fyy + gupzz[tid] * fzz + + 2.0 * (gupxy[tid] * fxy + gupxz[tid] * fxz + gupyz[tid] * fyz); +} + +static void gpu_phase10_ricci_batch(const double *gupxx, + const double *gupxy, + const double *gupxz, + const double *gupyy, + const double *gupyz, + const double *gupzz, + double *const *src_fields, + double *const *dst_fields, + const int *soa_signs, + int all) +{ + Phase10RicciTables tables = {}; + for (int i = 0; i < PHASE10_METRIC_FIELD_COUNT; ++i) { + tables.src_fields[i] = src_fields[i]; + tables.dst_fields[i] = dst_fields[i]; + tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0]; + tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1]; + tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2]; + } + dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)PHASE10_METRIC_FIELD_COUNT); + kern_phase10_ricci_batched<<>>( + gupxx, gupxy, gupxz, gupyy, gupyz, gupzz, tables); +} + +__global__ __launch_bounds__(128, 4) +void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap, + const double * __restrict__ chi, + double * __restrict__ fxx, + double * __restrict__ fxy, + double * __restrict__ fxz, + double * __restrict__ fyy, + double * __restrict__ fyz, + double * __restrict__ fzz, + double * __restrict__ chix_out, + double * __restrict__ chiy_out, + double * __restrict__ chiz_out) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + + if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) { + fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0; + fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0; + chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0; + return; + } + + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + const double lap_c = fetch_sym_ord2_direct(Lap, iF, jF, kF, 1, 1, 1); + fxx[tid] = d_gp.Fdxdx * ( + -fetch_sym_ord2_direct(Lap, iF - 2, jF, kF, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF, 1, 1, 1) + -30.0 * lap_c + +16.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF, kF, 1, 1, 1)); + fyy[tid] = d_gp.Fdydy * ( + -fetch_sym_ord2_direct(Lap, iF, jF - 2, kF, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF, 1, 1, 1) + -30.0 * lap_c + +16.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF + 2, kF, 1, 1, 1)); + fzz[tid] = d_gp.Fdzdz * ( + -fetch_sym_ord2_direct(Lap, iF, jF, kF - 2, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(Lap, iF, jF, kF - 1, 1, 1, 1) + -30.0 * lap_c + +16.0 * fetch_sym_ord2_direct(Lap, iF, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF, kF + 2, 1, 1, 1)); + + const double t_jm2 = + fetch_sym_ord2_direct(Lap, iF - 2, jF - 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF - 2, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF - 2, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF - 2, kF, 1, 1, 1); + const double t_jm1 = + fetch_sym_ord2_direct(Lap, iF - 2, jF - 1, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF - 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF - 1, kF, 1, 1, 1); + const double t_jp1 = + fetch_sym_ord2_direct(Lap, iF - 2, jF + 1, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF + 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF + 1, kF, 1, 1, 1); + const double t_jp2 = + fetch_sym_ord2_direct(Lap, iF - 2, jF + 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF + 2, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF + 2, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF + 2, kF, 1, 1, 1); + fxy[tid] = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2); + + const double t_km2_x = + fetch_sym_ord2_direct(Lap, iF - 2, jF, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 2, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF, kF - 2, 1, 1, 1); + const double t_km1_x = + fetch_sym_ord2_direct(Lap, iF - 2, jF, kF - 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF, kF - 1, 1, 1, 1); + const double t_kp1_x = + fetch_sym_ord2_direct(Lap, iF - 2, jF, kF + 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF, kF + 1, 1, 1, 1); + const double t_kp2_x = + fetch_sym_ord2_direct(Lap, iF - 2, jF, kF + 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 2, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 2, jF, kF + 2, 1, 1, 1); + fxz[tid] = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x); + + const double t_km2_y = + fetch_sym_ord2_direct(Lap, iF, jF - 2, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 2, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF + 2, kF - 2, 1, 1, 1); + const double t_km1_y = + fetch_sym_ord2_direct(Lap, iF, jF - 2, kF - 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF + 2, kF - 1, 1, 1, 1); + const double t_kp1_y = + fetch_sym_ord2_direct(Lap, iF, jF - 2, kF + 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF + 2, kF + 1, 1, 1, 1); + const double t_kp2_y = + fetch_sym_ord2_direct(Lap, iF, jF - 2, kF + 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 2, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF + 2, kF + 2, 1, 1, 1); + fyz[tid] = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y); + + chix_out[tid] = d_gp.d12dx * ( + fetch_sym_ord2_direct(chi, iF - 2, jF, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF, kF, 1, 1, 1)); + chiy_out[tid] = d_gp.d12dy * ( + fetch_sym_ord2_direct(chi, iF, jF - 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 2, kF, 1, 1, 1)); + chiz_out[tid] = d_gp.d12dz * ( + fetch_sym_ord2_direct(chi, iF, jF, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF, kF + 2, 1, 1, 1)); + } + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + const double lap_c = fetch_sym_ord2_direct(Lap, iF, jF, kF, 1, 1, 1); + fxx[tid] = d_gp.Sdxdx * ( + fetch_sym_ord2_direct(Lap, iF - 1, jF, kF, 1, 1, 1) + - 2.0 * lap_c + + fetch_sym_ord2_direct(Lap, iF + 1, jF, kF, 1, 1, 1)); + fyy[tid] = d_gp.Sdydy * ( + fetch_sym_ord2_direct(Lap, iF, jF - 1, kF, 1, 1, 1) + - 2.0 * lap_c + + fetch_sym_ord2_direct(Lap, iF, jF + 1, kF, 1, 1, 1)); + fzz[tid] = d_gp.Sdzdz * ( + fetch_sym_ord2_direct(Lap, iF, jF, kF - 1, 1, 1, 1) + - 2.0 * lap_c + + fetch_sym_ord2_direct(Lap, iF, jF, kF + 1, 1, 1, 1)); + fxy[tid] = d_gp.Sdxdy * ( + fetch_sym_ord2_direct(Lap, iF - 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF - 1, jF + 1, kF, 1, 1, 1) + + fetch_sym_ord2_direct(Lap, iF + 1, jF + 1, kF, 1, 1, 1)); + fxz[tid] = d_gp.Sdxdz * ( + fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 1, 1, 1, 1) + + fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 1, 1, 1, 1)); + fyz[tid] = d_gp.Sdydz * ( + fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 1, 1, 1, 1) + + fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 1, 1, 1, 1)); + chix_out[tid] = d_gp.d2dx * ( + -fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1) + +fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1)); + chiy_out[tid] = d_gp.d2dy * ( + -fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1) + +fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1)); + chiz_out[tid] = d_gp.d2dz * ( + -fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1) + +fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1)); + } + else { + fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0; + fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0; + chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0; + } +} + +/* Combined ord=3 advection + KO dissipation. + * When advection and KO use the same source field, symmetry packing is shared. + * If they differ (e.g. gxx advection + dxx KO), only KO repacks. + */ +static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs, + double *d_Sfx, double *d_Sfy, double *d_Sfz, + double SoA0, double SoA1, double SoA2, + double eps_val, int all) +{ + double *fh = g_buf.d_fh3; + const size_t nx = (size_t)g_buf.prev_nx; + const size_t ny = (size_t)g_buf.prev_ny; + const size_t nz = (size_t)g_buf.prev_nz; + const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull); + + kern_symbd_pack_ord3<<>>(d_f_adv, fh, SoA0, SoA1, SoA2); + kern_lopsided<<>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz); + + if (eps_val > 0.0) { + if (d_f_ko != d_f_adv) { + kern_symbd_pack_ord3<<>>(d_f_ko, fh, SoA0, SoA1, SoA2); + } + kern_kodis<<>>(fh, d_f_rhs, eps_val); + } +} + +__global__ __launch_bounds__(128, 4) +void kern_lopsided_kodis_batched(const double * __restrict__ Sfx, + const double * __restrict__ Sfy, + const double * __restrict__ Sfz, + LopsidedKodisTables tables, + double eps_val, + int do_lopsided, + int do_kodis) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int field = blockIdx.y; + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int SoA0 = tables.soa_signs[3 * field + 0]; + const int SoA1 = tables.soa_signs[3 * field + 1]; + const int SoA2 = tables.soa_signs[3 * field + 2]; + const double *adv_src = tables.adv_fields[field]; + const double *ko_src = tables.ko_fields[field]; + double *rhs = tables.rhs_fields[field]; + + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + + if (do_lopsided && i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) { + double val = 0.0; + + const double sfx = Sfx[tid]; + if (sfx > 0.0) { + if (i0 <= nx - 4) { + val += sfx * d_gp.d12dx * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF + 3, jF, kF, SoA0, SoA1, SoA2)); + } else if (i0 <= nx - 3) { + val += sfx * d_gp.d12dx * ( + fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2)); + } else { + val -= sfx * d_gp.d12dx * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF - 3, jF, kF, SoA0, SoA1, SoA2)); + } + } else if (sfx < 0.0) { + if ((i0 - 2) >= iminF) { + val -= sfx * d_gp.d12dx * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF - 3, jF, kF, SoA0, SoA1, SoA2)); + } else if ((i0 - 1) >= iminF) { + val += sfx * d_gp.d12dx * ( + fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2)); + } else if (i0 >= iminF) { + val += sfx * d_gp.d12dx * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF + 3, jF, kF, SoA0, SoA1, SoA2)); + } + } + + const double sfy = Sfy[tid]; + if (sfy > 0.0) { + if (j0 <= ny - 4) { + val += sfy * d_gp.d12dy * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF + 3, kF, SoA0, SoA1, SoA2)); + } else if (j0 <= ny - 3) { + val += sfy * d_gp.d12dy * ( + fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2)); + } else { + val -= sfy * d_gp.d12dy * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF - 3, kF, SoA0, SoA1, SoA2)); + } + } else if (sfy < 0.0) { + if ((j0 - 2) >= jminF) { + val -= sfy * d_gp.d12dy * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF - 3, kF, SoA0, SoA1, SoA2)); + } else if ((j0 - 1) >= jminF) { + val += sfy * d_gp.d12dy * ( + fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2)); + } else if (j0 >= jminF) { + val += sfy * d_gp.d12dy * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF + 3, kF, SoA0, SoA1, SoA2)); + } + } + + const double sfz = Sfz[tid]; + if (sfz > 0.0) { + if (k0 <= nz - 4) { + val += sfz * d_gp.d12dz * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF, kF + 3, SoA0, SoA1, SoA2)); + } else if (k0 <= nz - 3) { + val += sfz * d_gp.d12dz * ( + fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2)); + } else { + val -= sfz * d_gp.d12dz * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF, kF - 3, SoA0, SoA1, SoA2)); + } + } else if (sfz < 0.0) { + if ((k0 - 2) >= kminF) { + val -= sfz * d_gp.d12dz * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF, kF - 3, SoA0, SoA1, SoA2)); + } else if ((k0 - 1) >= kminF) { + val += sfz * d_gp.d12dz * ( + fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2)); + } else if (k0 >= kminF) { + val += sfz * d_gp.d12dz * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF, kF + 3, SoA0, SoA1, SoA2)); + } + } + + rhs[tid] += val; + } + + if (do_kodis && eps_val > 0.0 && + (iF - 3) >= iminF && (iF + 3) <= imaxF && + (jF - 3) >= jminF && (jF + 3) <= jmaxF && + (kF - 3) >= kminF && (kF + 3) <= kmaxF) + { + const double cof = 64.0; + const double Dx = + (fetch_sym_ord3_direct(ko_src, iF - 3, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF + 3, jF, kF, SoA0, SoA1, SoA2)) + - 6.0 * (fetch_sym_ord3_direct(ko_src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF + 2, jF, kF, SoA0, SoA1, SoA2)) + + 15.0 * (fetch_sym_ord3_direct(ko_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)) + - 20.0 * fetch_sym_ord3_direct(ko_src, iF, jF, kF, SoA0, SoA1, SoA2); + + const double Dy = + (fetch_sym_ord3_direct(ko_src, iF, jF - 3, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF, jF + 3, kF, SoA0, SoA1, SoA2)) + - 6.0 * (fetch_sym_ord3_direct(ko_src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF, jF + 2, kF, SoA0, SoA1, SoA2)) + + 15.0 * (fetch_sym_ord3_direct(ko_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)) + - 20.0 * fetch_sym_ord3_direct(ko_src, iF, jF, kF, SoA0, SoA1, SoA2); + + const double Dz = + (fetch_sym_ord3_direct(ko_src, iF, jF, kF - 3, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF, jF, kF + 3, SoA0, SoA1, SoA2)) + - 6.0 * (fetch_sym_ord3_direct(ko_src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF, jF, kF + 2, SoA0, SoA1, SoA2)) + + 15.0 * (fetch_sym_ord3_direct(ko_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)) + - 20.0 * fetch_sym_ord3_direct(ko_src, iF, jF, kF, SoA0, SoA1, SoA2); + + rhs[tid] += (eps_val / cof) * (Dx / d_gp.dX + Dy / d_gp.dY + Dz / d_gp.dZ); + } +} + +static void gpu_lopsided_kodis_state_batch(double eps_val, int all) +{ + LopsidedKodisTables tables = {}; + for (int i = 0; i < BSSN_LK_FIELD_COUNT; ++i) { + tables.adv_fields[i] = g_buf.slot[k_lk_adv_slots[i]]; + tables.ko_fields[i] = g_buf.slot[k_lk_ko_slots[i]]; + tables.rhs_fields[i] = g_buf.slot[k_lk_rhs_slots[i]]; + } + std::memcpy(tables.soa_signs, k_lk_soa_signs, sizeof(k_lk_soa_signs)); + + dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)BSSN_LK_FIELD_COUNT); + kern_lopsided_kodis_batched<<>>( + g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], tables, eps_val, 1, 1); +} + +__global__ void kern_rk4_finalize(const double * __restrict__ f0, + double * __restrict__ frhs, + double * __restrict__ accum, + double dT, + int rk4_stage) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < d_gp.all; + i += blockDim.x * gridDim.x) + { + const double rhs = frhs[i]; + switch (rk4_stage) { + case 0: + accum[i] = rhs; + frhs[i] = f0[i] + 0.5 * dT * rhs; + break; + case 1: + accum[i] += 2.0 * rhs; + frhs[i] = f0[i] + 0.5 * dT * rhs; + break; + case 2: + accum[i] += 2.0 * rhs; + frhs[i] = f0[i] + dT * rhs; + break; + default: + frhs[i] = f0[i] + (dT / 6.0) * (accum[i] + rhs); + break; + } + } +} + +__global__ __launch_bounds__(128, 4) +void kern_rk4_finalize_batched(Rk4FinalizeTables tables, + double dT, + int rk4_stage, + double chitiny) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int field = blockIdx.y; + const double *f0 = tables.f0_fields[field]; + double *frhs = tables.rhs_fields[field]; + double *accum = tables.accum_fields[field]; + + const double rhs = frhs[tid]; + switch (rk4_stage) { + case 0: + accum[tid] = rhs; + frhs[tid] = f0[tid] + 0.5 * dT * rhs; + break; + case 1: + accum[tid] += 2.0 * rhs; + frhs[tid] = f0[tid] + 0.5 * dT * rhs; + break; + case 2: + accum[tid] += 2.0 * rhs; + frhs[tid] = f0[tid] + dT * rhs; + break; + default: + frhs[tid] = f0[tid] + (dT / 6.0) * (accum[tid] + rhs); + break; + } + + if (field == 0 && frhs[tid] < chitiny) frhs[tid] = chitiny; +} + +static void gpu_rk4_finalize_batch(const StepContext &ctx, + size_t all, + double dT, + int rk4_stage, + double chitiny) +{ + Rk4FinalizeTables tables = {}; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + tables.f0_fields[i] = ctx.d_state0[i]; + tables.rhs_fields[i] = g_buf.slot[k_state_rhs_slots[i]]; + tables.accum_fields[i] = ctx.d_accum[i]; + } + + dim3 launch_grid((unsigned int)grid(all), (unsigned int)BSSN_STATE_COUNT); + kern_rk4_finalize_batched<<>>(tables, dT, rk4_stage, chitiny); +} + +__global__ __launch_bounds__(128, 4) +void kern_copy_patch_boundary_batched(PatchBoundaryTables tables, + int touch_xmin, int touch_xmax, + int touch_ymin, int touch_ymax, + int touch_zmin, int touch_zmax) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int nx = d_gp.ex[0]; + const int ny = d_gp.ex[1]; + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + + const bool on_boundary = + (touch_xmin && i0 == 0) || + (touch_xmax && i0 == nx - 1) || + (touch_ymin && j0 == 0) || + (touch_ymax && j0 == ny - 1) || + (touch_zmin && k0 == 0) || + (touch_zmax && k0 == d_gp.ex[2] - 1); + if (!on_boundary) return; + + const int field = blockIdx.y; + tables.dst_fields[field][tid] = tables.src_fields[field][tid]; +} + +static void gpu_copy_patch_boundary_batch(const StepContext &ctx, + int all, + int touch_xmin, int touch_xmax, + int touch_ymin, int touch_ymax, + int touch_zmin, int touch_zmax) +{ + if (!(touch_xmin || touch_xmax || touch_ymin || touch_ymax || touch_zmin || touch_zmax)) + return; + + PatchBoundaryTables tables = {}; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + tables.src_fields[i] = ctx.d_state0[i]; + tables.dst_fields[i] = g_buf.slot[k_state_rhs_slots[i]]; + } + + dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)BSSN_STATE_COUNT); + kern_copy_patch_boundary_batched<<>>( + tables, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); +} + +__global__ void kern_enforce_ga_cuda(double * __restrict__ dxx, + double * __restrict__ gxy, + double * __restrict__ gxz, + double * __restrict__ dyy, + double * __restrict__ gyz, + double * __restrict__ dzz, + double * __restrict__ Axx, + double * __restrict__ Axy, + double * __restrict__ Axz, + double * __restrict__ Ayy, + double * __restrict__ Ayz, + double * __restrict__ Azz) +{ + constexpr double F1O3 = 1.0 / 3.0; + constexpr double ONE = 1.0; + constexpr double TWO = 2.0; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < d_gp.all; + i += blockDim.x * gridDim.x) + { + double lgxx = dxx[i] + ONE; + double lgyy = dyy[i] + ONE; + double lgzz = dzz[i] + ONE; + double lgxy = gxy[i]; + double lgxz = gxz[i]; + double lgyz = gyz[i]; + + double lscale = lgxx * lgyy * lgzz + + lgxy * lgyz * lgxz + + lgxz * lgxy * lgyz + - lgxz * lgyy * lgxz + - lgxy * lgxy * lgzz + - lgxx * lgyz * lgyz; + + lscale = ONE / cbrt(lscale); + + lgxx *= lscale; + lgxy *= lscale; + lgxz *= lscale; + lgyy *= lscale; + lgyz *= lscale; + lgzz *= lscale; + + dxx[i] = lgxx - ONE; + gxy[i] = lgxy; + gxz[i] = lgxz; + dyy[i] = lgyy - ONE; + gyz[i] = lgyz; + dzz[i] = lgzz - ONE; + + const double lgupxx = (lgyy * lgzz - lgyz * lgyz); + const double lgupxy = - (lgxy * lgzz - lgyz * lgxz); + const double lgupxz = (lgxy * lgyz - lgyy * lgxz); + const double lgupyy = (lgxx * lgzz - lgxz * lgxz); + const double lgupyz = - (lgxx * lgyz - lgxy * lgxz); + const double lgupzz = (lgxx * lgyy - lgxy * lgxy); + + const double ltrA = lgupxx * Axx[i] + lgupyy * Ayy[i] + lgupzz * Azz[i] + + TWO * (lgupxy * Axy[i] + lgupxz * Axz[i] + lgupyz * Ayz[i]); + + Axx[i] -= F1O3 * lgxx * ltrA; + Axy[i] -= F1O3 * lgxy * ltrA; + Axz[i] -= F1O3 * lgxz * ltrA; + Ayy[i] -= F1O3 * lgyy * ltrA; + Ayz[i] -= F1O3 * lgyz * ltrA; + Azz[i] -= F1O3 * lgzz * ltrA; + } +} + +__global__ void kern_lowerboundset_cuda(double * __restrict__ chi, double tinny) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < d_gp.all; + i += blockDim.x * gridDim.x) + { + if (chi[i] < tinny) chi[i] = tinny; + } +} + +enum SommerFace { + FACE_XMAX = 0, + FACE_YMAX = 1, + FACE_ZMAX = 2, + FACE_XMIN = 3, + FACE_YMIN = 4, + FACE_ZMIN = 5 +}; + +__global__ void kern_sommerfeld_face_bam(const double * __restrict__ fh, + double * __restrict__ f_rhs, + int face, + double velocity, + double x0, + double y0, + double z0) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imin = (d_gp.Symmetry > 1 && fabs(x0) < d_gp.dX) ? 0 : 1; + const int jmin = (d_gp.Symmetry > 1 && fabs(y0) < d_gp.dY) ? 0 : 1; + const int kmin = (d_gp.Symmetry > 0 && fabs(z0) < d_gp.dZ) ? 0 : 1; + const int imax = nx, jmax = ny, kmax = nz; + const int plane_count = + (face == FACE_XMAX || face == FACE_XMIN) ? ny * nz : + (face == FACE_YMAX || face == FACE_YMIN) ? nx * nz : + nx * ny; + + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < plane_count; + tid += blockDim.x * gridDim.x) + { + int i0 = 0, j0 = 0, k0 = 0; + if (face == FACE_XMAX || face == FACE_XMIN) { + j0 = tid % ny; + k0 = tid / ny; + i0 = (face == FACE_XMAX) ? (nx - 1) : 0; + } else if (face == FACE_YMAX || face == FACE_YMIN) { + i0 = tid % nx; + k0 = tid / nx; + j0 = (face == FACE_YMAX) ? (ny - 1) : 0; + } else { + i0 = tid % nx; + j0 = tid / nx; + k0 = (face == FACE_ZMAX) ? (nz - 1) : 0; + } + + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + const int p = idx_ex_d(i0, j0, k0); + + const double x = x0 + i0 * d_gp.dX; + const double y = y0 + j0 * d_gp.dY; + const double z = z0 + k0 * d_gp.dZ; + const double r = sqrt(x * x + y * y + z * z); + if (r == 0.0) continue; + + const double wx = velocity * x / r; + const double wy = velocity * y / r; + const double wz = velocity * z / r; + + double fx = 0.0, fy = 0.0, fz = 0.0; + if (wx > 0.0) { + if (iF - 2 >= imin) fx = d_gp.d2dx * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF - 2, jF, kF)]); + else if (iF - 1 >= imin) fx = d_gp.d2dx * (-fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF + 1, jF, kF)]); + else fx = d_gp.d2dx * (-fh[idx_fh2(iF + 2, jF, kF)] + 4.0 * fh[idx_fh2(iF + 1, jF, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]); + } else if (wx < 0.0) { + if (iF + 2 <= imax) fx = d_gp.d2dx * (-fh[idx_fh2(iF + 2, jF, kF)] + 4.0 * fh[idx_fh2(iF + 1, jF, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]); + else if (iF + 1 <= imax) fx = d_gp.d2dx * (-fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF + 1, jF, kF)]); + else fx = d_gp.d2dx * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF - 2, jF, kF)]); + } + + if (wy > 0.0) { + if (jF - 2 >= jmin) fy = d_gp.d2dy * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF - 2, kF)]); + else if (jF - 1 >= jmin) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF + 1, kF)]); + else fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF + 2, kF)] + 4.0 * fh[idx_fh2(iF, jF + 1, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]); + } else if (wy < 0.0) { + if (jF + 2 <= jmax) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF + 2, kF)] + 4.0 * fh[idx_fh2(iF, jF + 1, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]); + else if (jF + 1 <= jmax) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF + 1, kF)]); + else fy = d_gp.d2dy * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF - 2, kF)]); + } + + if (wz > 0.0) { + if (kF - 2 >= kmin) fz = d_gp.d2dz * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF - 2)]); + else if (kF - 1 >= kmin) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF + 1)]); + else fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF + 2)] + 4.0 * fh[idx_fh2(iF, jF, kF + 1)] - 3.0 * fh[idx_fh2(iF, jF, kF)]); + } else if (wz < 0.0) { + if (kF + 2 <= kmax) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF + 2)] + 4.0 * fh[idx_fh2(iF, jF, kF + 1)] - 3.0 * fh[idx_fh2(iF, jF, kF)]); + else if (kF + 1 <= kmax) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF + 1)]); + else fz = d_gp.d2dz * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF - 2)]); + } + + f_rhs[p] = -velocity * (fx * x + fy * y + fz * z + fh[idx_fh2(iF, jF, kF)]) / r; + } +} + +static void gpu_sommerfeld_routbam(double *d_f0, double *d_f_rhs, + double velocity, + double SoA0, double SoA1, double SoA2, + double *X, double *Y, double *Z, + const double *bbox, + int Symmetry) +{ + if (velocity == 0.0) return; + const int nx = g_buf.prev_nx; + const int ny = g_buf.prev_ny; + const int nz = g_buf.prev_nz; + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + const bool touch_xmax = fabs(X[nx - 1] - bbox[3]) < dX; + const bool touch_ymax = fabs(Y[ny - 1] - bbox[4]) < dY; + const bool touch_zmax = fabs(Z[nz - 1] - bbox[5]) < dZ; + const bool touch_xmin = fabs(X[0] - bbox[0]) < dX && + !(Symmetry == 2 && fabs(bbox[0]) < dX * 0.5); + const bool touch_ymin = fabs(Y[0] - bbox[1]) < dY && + !(Symmetry == 2 && fabs(bbox[1]) < dY * 0.5); + const bool touch_zmin = fabs(Z[0] - bbox[2]) < dZ && + !(Symmetry > 0 && fabs(bbox[2]) < dZ * 0.5); + + const size_t w_pack = (size_t)(nx + 2) * (ny + 2) * (nz + 2); + kern_symbd_pack_ord2<<>>(d_f0, g_buf.d_fh2, SoA0, SoA1, SoA2); + if (touch_xmax) kern_sommerfeld_face_bam<<>>(g_buf.d_fh2, d_f_rhs, FACE_XMAX, velocity, X[0], Y[0], Z[0]); + if (touch_ymax) kern_sommerfeld_face_bam<<>>(g_buf.d_fh2, d_f_rhs, FACE_YMAX, velocity, X[0], Y[0], Z[0]); + if (touch_zmax) kern_sommerfeld_face_bam<<>>(g_buf.d_fh2, d_f_rhs, FACE_ZMAX, velocity, X[0], Y[0], Z[0]); + if (touch_xmin) kern_sommerfeld_face_bam<<>>(g_buf.d_fh2, d_f_rhs, FACE_XMIN, velocity, X[0], Y[0], Z[0]); + if (touch_ymin) kern_sommerfeld_face_bam<<>>(g_buf.d_fh2, d_f_rhs, FACE_YMIN, velocity, X[0], Y[0], Z[0]); + if (touch_zmin) kern_sommerfeld_face_bam<<>>(g_buf.d_fh2, d_f_rhs, FACE_ZMIN, velocity, X[0], Y[0], Z[0]); +} + +/* ================================================================== */ +/* C. Point-wise computation kernels */ +/* ================================================================== */ + +/* Phase 1: alpn1, chin1, gxx=dxx+1, gyy=dyy+1, gzz=dzz+1 */ +__global__ void kern_phase1_prep( + const double* __restrict__ Lap, const double* __restrict__ chi, + const double* __restrict__ dxx, const double* __restrict__ dyy, + const double* __restrict__ dzz, + double* __restrict__ alpn1, double* __restrict__ chin1, + double* __restrict__ gxx, double* __restrict__ gyy, double* __restrict__ gzz) +{ + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + alpn1[i] = Lap[i] + 1.0; + chin1[i] = chi[i] + 1.0; + gxx[i] = dxx[i] + 1.0; + gyy[i] = dyy[i] + 1.0; + gzz[i] = dzz[i] + 1.0; + } +} + +/* Phase 2a: chi_rhs, gij_rhs */ +__global__ void kern_phase2_metric_rhs( + const double* __restrict__ alpn1, const double* __restrict__ chin1, + const double* __restrict__ gxx, const double* __restrict__ gxy, + const double* __restrict__ gxz, const double* __restrict__ gyy, + const double* __restrict__ gyz, const double* __restrict__ gzz, + const double* __restrict__ trK, + const double* __restrict__ Axx, const double* __restrict__ Axy, + const double* __restrict__ Axz, const double* __restrict__ Ayy, + const double* __restrict__ Ayz, const double* __restrict__ Azz, + const double* __restrict__ betaxx, const double* __restrict__ betaxy, + const double* __restrict__ betaxz, const double* __restrict__ betayx, + const double* __restrict__ betayy, const double* __restrict__ betayz, + const double* __restrict__ betazx, const double* __restrict__ betazy, + const double* __restrict__ betazz, + double* __restrict__ chi_rhs, double* __restrict__ gxx_rhs, + double* __restrict__ gyy_rhs, double* __restrict__ gzz_rhs, + double* __restrict__ gxy_rhs, double* __restrict__ gyz_rhs, + double* __restrict__ gxz_rhs) +{ + const double F2o3 = 2.0/3.0, F1o3 = 1.0/3.0, TWO = 2.0; + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + double db = betaxx[i] + betayy[i] + betazz[i]; + chi_rhs[i] = F2o3 * chin1[i] * (alpn1[i] * trK[i] - db); + gxx_rhs[i] = -TWO*alpn1[i]*Axx[i] - F2o3*gxx[i]*db + + TWO*(gxx[i]*betaxx[i] + gxy[i]*betayx[i] + gxz[i]*betazx[i]); + gyy_rhs[i] = -TWO*alpn1[i]*Ayy[i] - F2o3*gyy[i]*db + + TWO*(gxy[i]*betaxy[i] + gyy[i]*betayy[i] + gyz[i]*betazy[i]); + gzz_rhs[i] = -TWO*alpn1[i]*Azz[i] - F2o3*gzz[i]*db + + TWO*(gxz[i]*betaxz[i] + gyz[i]*betayz[i] + gzz[i]*betazz[i]); + gxy_rhs[i] = -TWO*alpn1[i]*Axy[i] + F1o3*gxy[i]*db + + gxx[i]*betaxy[i] + gxz[i]*betazy[i] + gyy[i]*betayx[i] + + gyz[i]*betazx[i] - gxy[i]*betazz[i]; + gyz_rhs[i] = -TWO*alpn1[i]*Ayz[i] + F1o3*gyz[i]*db + + gxy[i]*betaxz[i] + gyy[i]*betayz[i] + gxz[i]*betaxy[i] + + gzz[i]*betazy[i] - gyz[i]*betaxx[i]; + gxz_rhs[i] = -TWO*alpn1[i]*Axz[i] + F1o3*gxz[i]*db + + gxx[i]*betaxz[i] + gxy[i]*betayz[i] + gyz[i]*betayx[i] + + gzz[i]*betazx[i] - gxz[i]*betayy[i]; + } +} + +/* Phase 2b: metric inverse */ +__global__ void kern_phase2_inverse( + const double* __restrict__ gxx, const double* __restrict__ gxy, + const double* __restrict__ gxz, const double* __restrict__ gyy, + const double* __restrict__ gyz, const double* __restrict__ gzz, + double* __restrict__ gupxx, double* __restrict__ gupxy, + double* __restrict__ gupxz, double* __restrict__ gupyy, + double* __restrict__ gupyz, double* __restrict__ gupzz) +{ + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + double det = gxx[i]*gyy[i]*gzz[i] + gxy[i]*gyz[i]*gxz[i] + gxz[i]*gxy[i]*gyz[i] + - gxz[i]*gyy[i]*gxz[i] - gxy[i]*gxy[i]*gzz[i] - gxx[i]*gyz[i]*gyz[i]; + double inv = 1.0 / det; + gupxx[i] = (gyy[i]*gzz[i] - gyz[i]*gyz[i]) * inv; + gupxy[i] = -(gxy[i]*gzz[i] - gyz[i]*gxz[i]) * inv; + gupxz[i] = (gxy[i]*gyz[i] - gyy[i]*gxz[i]) * inv; + gupyy[i] = (gxx[i]*gzz[i] - gxz[i]*gxz[i]) * inv; + gupyz[i] = -(gxx[i]*gyz[i] - gxy[i]*gxz[i]) * inv; + gupzz[i] = (gxx[i]*gyy[i] - gxy[i]*gxy[i]) * inv; + } +} + +/* Phase 3: Gamma constraint residuals (co==0 only) */ +__global__ void kern_phase3_gamma_constraint( + const double* __restrict__ Gamx, const double* __restrict__ Gamy, + const double* __restrict__ Gamz, + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ gxxx, const double* __restrict__ gxyx, + const double* __restrict__ gxzx, const double* __restrict__ gyyx, + const double* __restrict__ gyzx, const double* __restrict__ gzzx, + const double* __restrict__ gxxy, const double* __restrict__ gxyy, + const double* __restrict__ gxzy, const double* __restrict__ gyyy, + const double* __restrict__ gyzy, const double* __restrict__ gzzy, + const double* __restrict__ gxxz, const double* __restrict__ gxyz, + const double* __restrict__ gxzz, const double* __restrict__ gyyz, + const double* __restrict__ gyzz, const double* __restrict__ gzzz, + double* __restrict__ Gmx_Res, double* __restrict__ Gmy_Res, + double* __restrict__ Gmz_Res) +{ + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + double uxx=gupxx[i], uxy=gupxy[i], uxz=gupxz[i]; + double uyy=gupyy[i], uyz=gupyz[i], uzz=gupzz[i]; + + Gmx_Res[i] = Gamx[i] - ( + uxx*(uxx*gxxx[i]+uxy*gxyx[i]+uxz*gxzx[i]) + + uxy*(uxx*gxyx[i]+uxy*gyyx[i]+uxz*gyzx[i]) + + uxz*(uxx*gxzx[i]+uxy*gyzx[i]+uxz*gzzx[i]) + + uxx*(uxy*gxxy[i]+uyy*gxyy[i]+uyz*gxzy[i]) + + uxy*(uxy*gxyy[i]+uyy*gyyy[i]+uyz*gyzy[i]) + + uxz*(uxy*gxzy[i]+uyy*gyzy[i]+uyz*gzzy[i]) + + uxx*(uxz*gxxz[i]+uyz*gxyz[i]+uzz*gxzz[i]) + + uxy*(uxz*gxyz[i]+uyz*gyyz[i]+uzz*gyzz[i]) + + uxz*(uxz*gxzz[i]+uyz*gyzz[i]+uzz*gzzz[i])); + + Gmy_Res[i] = Gamy[i] - ( + uxx*(uxy*gxxx[i]+uyy*gxyx[i]+uyz*gxzx[i]) + + uxy*(uxy*gxyx[i]+uyy*gyyx[i]+uyz*gyzx[i]) + + uxz*(uxy*gxzx[i]+uyy*gyzx[i]+uyz*gzzx[i]) + + uxy*(uxy*gxxy[i]+uyy*gxyy[i]+uyz*gxzy[i]) + + uyy*(uxy*gxyy[i]+uyy*gyyy[i]+uyz*gyzy[i]) + + uyz*(uxy*gxzy[i]+uyy*gyzy[i]+uyz*gzzy[i]) + + uxy*(uxz*gxxz[i]+uyz*gxyz[i]+uzz*gxzz[i]) + + uyy*(uxz*gxyz[i]+uyz*gyyz[i]+uzz*gyzz[i]) + + uyz*(uxz*gxzz[i]+uyz*gyzz[i]+uzz*gzzz[i])); + + Gmz_Res[i] = Gamz[i] - ( + uxx*(uxz*gxxx[i]+uyz*gxyx[i]+uzz*gxzx[i]) + + uxy*(uxz*gxyx[i]+uyz*gyyx[i]+uzz*gyzx[i]) + + uxz*(uxz*gxzx[i]+uyz*gyzx[i]+uzz*gzzx[i]) + + uxy*(uxz*gxxy[i]+uyz*gxyy[i]+uzz*gxzy[i]) + + uyy*(uxz*gxyy[i]+uyz*gyyy[i]+uzz*gyzy[i]) + + uyz*(uxz*gxzy[i]+uyz*gyzy[i]+uzz*gzzy[i]) + + uxz*(uxz*gxxz[i]+uyz*gxyz[i]+uzz*gxzz[i]) + + uyz*(uxz*gxyz[i]+uyz*gyyz[i]+uzz*gyzz[i]) + + uzz*(uxz*gxzz[i]+uyz*gyzz[i]+uzz*gzzz[i])); + } +} + +/* Phase 4: 18 Christoffel symbols */ +__global__ __launch_bounds__(128, 4) +void kern_phase4_christoffel( + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ gxxx, const double* __restrict__ gxyx, + const double* __restrict__ gxzx, const double* __restrict__ gyyx, + const double* __restrict__ gyzx, const double* __restrict__ gzzx, + const double* __restrict__ gxxy, const double* __restrict__ gxyy, + const double* __restrict__ gxzy, const double* __restrict__ gyyy, + const double* __restrict__ gyzy, const double* __restrict__ gzzy, + const double* __restrict__ gxxz, const double* __restrict__ gxyz, + const double* __restrict__ gxzz, const double* __restrict__ gyyz, + const double* __restrict__ gyzz, const double* __restrict__ gzzz, + double* __restrict__ Gxxx, double* __restrict__ Gxxy, double* __restrict__ Gxxz, + double* __restrict__ Gxyy, double* __restrict__ Gxyz, double* __restrict__ Gxzz, + double* __restrict__ Gyxx, double* __restrict__ Gyxy, double* __restrict__ Gyxz, + double* __restrict__ Gyyy, double* __restrict__ Gyyz, double* __restrict__ Gyzz, + double* __restrict__ Gzxx, double* __restrict__ Gzxy, double* __restrict__ Gzxz, + double* __restrict__ Gzyy, double* __restrict__ Gzyz, double* __restrict__ Gzzz_o) +{ + const double H = 0.5, TWO = 2.0; + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i]; + double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i]; + /* Gamma^x_{xx} */ + Gxxx[i]=H*(uxx*gxxx[i]+uxy*(TWO*gxyx[i]-gxxy[i])+uxz*(TWO*gxzx[i]-gxxz[i])); + Gyxx[i]=H*(uxy*gxxx[i]+uyy*(TWO*gxyx[i]-gxxy[i])+uyz*(TWO*gxzx[i]-gxxz[i])); + Gzxx[i]=H*(uxz*gxxx[i]+uyz*(TWO*gxyx[i]-gxxy[i])+uzz*(TWO*gxzx[i]-gxxz[i])); + /* yy */ + Gxyy[i]=H*(uxx*(TWO*gxyy[i]-gyyx[i])+uxy*gyyy[i]+uxz*(TWO*gyzy[i]-gyyz[i])); + Gyyy[i]=H*(uxy*(TWO*gxyy[i]-gyyx[i])+uyy*gyyy[i]+uyz*(TWO*gyzy[i]-gyyz[i])); + Gzyy[i]=H*(uxz*(TWO*gxyy[i]-gyyx[i])+uyz*gyyy[i]+uzz*(TWO*gyzy[i]-gyyz[i])); + /* zz */ + Gxzz[i]=H*(uxx*(TWO*gxzz[i]-gzzx[i])+uxy*(TWO*gyzz[i]-gzzy[i])+uxz*gzzz[i]); + Gyzz[i]=H*(uxy*(TWO*gxzz[i]-gzzx[i])+uyy*(TWO*gyzz[i]-gzzy[i])+uyz*gzzz[i]); + Gzzz_o[i]=H*(uxz*(TWO*gxzz[i]-gzzx[i])+uyz*(TWO*gyzz[i]-gzzy[i])+uzz*gzzz[i]); + /* xy */ + Gxxy[i]=H*(uxx*gxxy[i]+uxy*gyyx[i]+uxz*(gxzy[i]+gyzx[i]-gxyz[i])); + Gyxy[i]=H*(uxy*gxxy[i]+uyy*gyyx[i]+uyz*(gxzy[i]+gyzx[i]-gxyz[i])); + Gzxy[i]=H*(uxz*gxxy[i]+uyz*gyyx[i]+uzz*(gxzy[i]+gyzx[i]-gxyz[i])); + /* xz */ + Gxxz[i]=H*(uxx*gxxz[i]+uxy*(gxyz[i]+gyzx[i]-gxzy[i])+uxz*gzzx[i]); + Gyxz[i]=H*(uxy*gxxz[i]+uyy*(gxyz[i]+gyzx[i]-gxzy[i])+uyz*gzzx[i]); + Gzxz[i]=H*(uxz*gxxz[i]+uyz*(gxyz[i]+gyzx[i]-gxzy[i])+uzz*gzzx[i]); + /* yz */ + Gxyz[i]=H*(uxx*(gxyz[i]+gxzy[i]-gyzx[i])+uxy*gyyz[i]+uxz*gzzy[i]); + Gyyz[i]=H*(uxy*(gxyz[i]+gxzy[i]-gyzx[i])+uyy*gyyz[i]+uyz*gzzy[i]); + Gzyz[i]=H*(uxz*(gxyz[i]+gxzy[i]-gyzx[i])+uyz*gyyz[i]+uzz*gzzy[i]); + } +} + +/* Phase 5: A^ij = gup^ia gup^jb A_ab (stored temporarily in Rxx..Rzz) */ +__global__ void kern_phase5_raise_A( + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ Axx, const double* __restrict__ Axy, + const double* __restrict__ Axz, const double* __restrict__ Ayy, + const double* __restrict__ Ayz, const double* __restrict__ Azz, + double* __restrict__ Rxx, double* __restrict__ Rxy, double* __restrict__ Rxz, + double* __restrict__ Ryy, double* __restrict__ Ryz, double* __restrict__ Rzz) +{ + const double TWO = 2.0; + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i]; + double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i]; + Rxx[i]=uxx*uxx*Axx[i]+uxy*uxy*Ayy[i]+uxz*uxz*Azz[i] + +TWO*(uxx*uxy*Axy[i]+uxx*uxz*Axz[i]+uxy*uxz*Ayz[i]); + Ryy[i]=uxy*uxy*Axx[i]+uyy*uyy*Ayy[i]+uyz*uyz*Azz[i] + +TWO*(uxy*uyy*Axy[i]+uxy*uyz*Axz[i]+uyy*uyz*Ayz[i]); + Rzz[i]=uxz*uxz*Axx[i]+uyz*uyz*Ayy[i]+uzz*uzz*Azz[i] + +TWO*(uxz*uyz*Axy[i]+uxz*uzz*Axz[i]+uyz*uzz*Ayz[i]); + Rxy[i]=uxx*uxy*Axx[i]+uxy*uyy*Ayy[i]+uxz*uyz*Azz[i] + +(uxx*uyy+uxy*uxy)*Axy[i]+(uxx*uyz+uxz*uxy)*Axz[i]+(uxy*uyz+uxz*uyy)*Ayz[i]; + Rxz[i]=uxx*uxz*Axx[i]+uxy*uyz*Ayy[i]+uxz*uzz*Azz[i] + +(uxx*uyz+uxy*uxz)*Axy[i]+(uxx*uzz+uxz*uxz)*Axz[i]+(uxy*uzz+uxz*uyz)*Ayz[i]; + Ryz[i]=uxy*uxz*Axx[i]+uyy*uyz*Ayy[i]+uyz*uzz*Azz[i] + +(uxy*uyz+uyy*uxz)*Axy[i]+(uxy*uzz+uyz*uxz)*Axz[i]+(uyy*uzz+uyz*uyz)*Ayz[i]; + } +} + +/* Phase 6: Gamma_rhs part 1 (before fdderivs(beta) and fderivs(Gamma)) */ +__global__ __launch_bounds__(128, 4) +void kern_phase6_gamma_rhs_part1( + const double* __restrict__ Lapx, const double* __restrict__ Lapy, + const double* __restrict__ Lapz, + const double* __restrict__ alpn1, const double* __restrict__ chin1, + const double* __restrict__ chix, const double* __restrict__ chiy, + const double* __restrict__ chiz, + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ Kx, const double* __restrict__ Ky, + const double* __restrict__ Kz, + const double* __restrict__ Sx, const double* __restrict__ Sy, + const double* __restrict__ Sz, + const double* __restrict__ Rxx, const double* __restrict__ Rxy, + const double* __restrict__ Rxz, const double* __restrict__ Ryy, + const double* __restrict__ Ryz, const double* __restrict__ Rzz, + const double* __restrict__ Gxxx, const double* __restrict__ Gxxy, + const double* __restrict__ Gxxz, const double* __restrict__ Gxyy, + const double* __restrict__ Gxyz, const double* __restrict__ Gxzz, + const double* __restrict__ Gyxx, const double* __restrict__ Gyxy, + const double* __restrict__ Gyxz, const double* __restrict__ Gyyy, + const double* __restrict__ Gyyz, const double* __restrict__ Gyzz, + const double* __restrict__ Gzxx, const double* __restrict__ Gzxy, + const double* __restrict__ Gzxz, const double* __restrict__ Gzyy, + const double* __restrict__ Gzyz, const double* __restrict__ Gzzz, + double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs, + double* __restrict__ Gamz_rhs) +{ + const double TWO=2.0, F3o2=1.5, F2o3=2.0/3.0, EIGHT=8.0; + const double PI_V = 3.14159265358979323846; + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i]; + double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i]; + double lx=Lapx[i],ly=Lapy[i],lz=Lapz[i]; + double a=alpn1[i], c1=chin1[i]; + double cx=chix[i],cy=chiy[i],cz=chiz[i]; + + Gamx_rhs[i] = -TWO*(lx*Rxx[i]+ly*Rxy[i]+lz*Rxz[i]) + + TWO*a*( + -F3o2/c1*(cx*Rxx[i]+cy*Rxy[i]+cz*Rxz[i]) + -uxx*(F2o3*Kx[i]+EIGHT*PI_V*Sx[i]) + -uxy*(F2o3*Ky[i]+EIGHT*PI_V*Sy[i]) + -uxz*(F2o3*Kz[i]+EIGHT*PI_V*Sz[i]) + +Gxxx[i]*Rxx[i]+Gxyy[i]*Ryy[i]+Gxzz[i]*Rzz[i] + +TWO*(Gxxy[i]*Rxy[i]+Gxxz[i]*Rxz[i]+Gxyz[i]*Ryz[i])); + + Gamy_rhs[i] = -TWO*(lx*Rxy[i]+ly*Ryy[i]+lz*Ryz[i]) + + TWO*a*( + -F3o2/c1*(cx*Rxy[i]+cy*Ryy[i]+cz*Ryz[i]) + -uxy*(F2o3*Kx[i]+EIGHT*PI_V*Sx[i]) + -uyy*(F2o3*Ky[i]+EIGHT*PI_V*Sy[i]) + -uyz*(F2o3*Kz[i]+EIGHT*PI_V*Sz[i]) + +Gyxx[i]*Rxx[i]+Gyyy[i]*Ryy[i]+Gyzz[i]*Rzz[i] + +TWO*(Gyxy[i]*Rxy[i]+Gyxz[i]*Rxz[i]+Gyyz[i]*Ryz[i])); + + Gamz_rhs[i] = -TWO*(lx*Rxz[i]+ly*Ryz[i]+lz*Rzz[i]) + + TWO*a*( + -F3o2/c1*(cx*Rxz[i]+cy*Ryz[i]+cz*Rzz[i]) + -uxz*(F2o3*Kx[i]+EIGHT*PI_V*Sx[i]) + -uyz*(F2o3*Ky[i]+EIGHT*PI_V*Sy[i]) + -uzz*(F2o3*Kz[i]+EIGHT*PI_V*Sz[i]) + +Gzxx[i]*Rxx[i]+Gzyy[i]*Ryy[i]+Gzzz[i]*Rzz[i] + +TWO*(Gzxy[i]*Rxy[i]+Gzxz[i]*Rxz[i]+Gzyz[i]*Ryz[i])); + } +} + +/* Phase 5+6 fused: raise A^ij in registers, then consume immediately in Gamma_rhs. */ +__global__ __launch_bounds__(128, 4) +void kern_phase5_6_gamma_rhs_part1_fused( + const double* __restrict__ Lapx, const double* __restrict__ Lapy, + const double* __restrict__ Lapz, + const double* __restrict__ alpn1, const double* __restrict__ chin1, + const double* __restrict__ chix, const double* __restrict__ chiy, + const double* __restrict__ chiz, + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ Axx, const double* __restrict__ Axy, + const double* __restrict__ Axz, const double* __restrict__ Ayy, + const double* __restrict__ Ayz, const double* __restrict__ Azz, + const double* __restrict__ Kx, const double* __restrict__ Ky, + const double* __restrict__ Kz, + const double* __restrict__ Sx, const double* __restrict__ Sy, + const double* __restrict__ Sz, + const double* __restrict__ Gxxx, const double* __restrict__ Gxxy, + const double* __restrict__ Gxxz, const double* __restrict__ Gxyy, + const double* __restrict__ Gxyz, const double* __restrict__ Gxzz, + const double* __restrict__ Gyxx, const double* __restrict__ Gyxy, + const double* __restrict__ Gyxz, const double* __restrict__ Gyyy, + const double* __restrict__ Gyyz, const double* __restrict__ Gyzz, + const double* __restrict__ Gzxx, const double* __restrict__ Gzxy, + const double* __restrict__ Gzxz, const double* __restrict__ Gzyy, + const double* __restrict__ Gzyz, const double* __restrict__ Gzzz, + double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs, + double* __restrict__ Gamz_rhs) +{ + const double TWO = 2.0, F3o2 = 1.5, F2o3 = 2.0 / 3.0, EIGHT = 8.0; + const double PI_V = 3.14159265358979323846; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += blockDim.x * gridDim.x) { + const double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i]; + const double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i]; + const double Axx_v = Axx[i], Axy_v = Axy[i], Axz_v = Axz[i]; + const double Ayy_v = Ayy[i], Ayz_v = Ayz[i], Azz_v = Azz[i]; + + const double Rxx_v = uxx * uxx * Axx_v + uxy * uxy * Ayy_v + uxz * uxz * Azz_v + + TWO * (uxx * uxy * Axy_v + uxx * uxz * Axz_v + uxy * uxz * Ayz_v); + const double Ryy_v = uxy * uxy * Axx_v + uyy * uyy * Ayy_v + uyz * uyz * Azz_v + + TWO * (uxy * uyy * Axy_v + uxy * uyz * Axz_v + uyy * uyz * Ayz_v); + const double Rzz_v = uxz * uxz * Axx_v + uyz * uyz * Ayy_v + uzz * uzz * Azz_v + + TWO * (uxz * uyz * Axy_v + uxz * uzz * Axz_v + uyz * uzz * Ayz_v); + const double Rxy_v = uxx * uxy * Axx_v + uxy * uyy * Ayy_v + uxz * uyz * Azz_v + + (uxx * uyy + uxy * uxy) * Axy_v + + (uxx * uyz + uxz * uxy) * Axz_v + + (uxy * uyz + uxz * uyy) * Ayz_v; + const double Rxz_v = uxx * uxz * Axx_v + uxy * uyz * Ayy_v + uxz * uzz * Azz_v + + (uxx * uyz + uxy * uxz) * Axy_v + + (uxx * uzz + uxz * uxz) * Axz_v + + (uxy * uzz + uxz * uyz) * Ayz_v; + const double Ryz_v = uxy * uxz * Axx_v + uyy * uyz * Ayy_v + uyz * uzz * Azz_v + + (uxy * uyz + uyy * uxz) * Axy_v + + (uxy * uzz + uyz * uxz) * Axz_v + + (uyy * uzz + uyz * uyz) * Ayz_v; + + const double lx = Lapx[i], ly = Lapy[i], lz = Lapz[i]; + const double a = alpn1[i], c1 = chin1[i]; + const double cx = chix[i], cy = chiy[i], cz = chiz[i]; + + Gamx_rhs[i] = -TWO * (lx * Rxx_v + ly * Rxy_v + lz * Rxz_v) + + TWO * a * ( + -F3o2 / c1 * (cx * Rxx_v + cy * Rxy_v + cz * Rxz_v) + -uxx * (F2o3 * Kx[i] + EIGHT * PI_V * Sx[i]) + -uxy * (F2o3 * Ky[i] + EIGHT * PI_V * Sy[i]) + -uxz * (F2o3 * Kz[i] + EIGHT * PI_V * Sz[i]) + + Gxxx[i] * Rxx_v + Gxyy[i] * Ryy_v + Gxzz[i] * Rzz_v + + TWO * (Gxxy[i] * Rxy_v + Gxxz[i] * Rxz_v + Gxyz[i] * Ryz_v)); + + Gamy_rhs[i] = -TWO * (lx * Rxy_v + ly * Ryy_v + lz * Ryz_v) + + TWO * a * ( + -F3o2 / c1 * (cx * Rxy_v + cy * Ryy_v + cz * Ryz_v) + -uxy * (F2o3 * Kx[i] + EIGHT * PI_V * Sx[i]) + -uyy * (F2o3 * Ky[i] + EIGHT * PI_V * Sy[i]) + -uyz * (F2o3 * Kz[i] + EIGHT * PI_V * Sz[i]) + + Gyxx[i] * Rxx_v + Gyyy[i] * Ryy_v + Gyzz[i] * Rzz_v + + TWO * (Gyxy[i] * Rxy_v + Gyxz[i] * Rxz_v + Gyyz[i] * Ryz_v)); + + Gamz_rhs[i] = -TWO * (lx * Rxz_v + ly * Ryz_v + lz * Rzz_v) + + TWO * a * ( + -F3o2 / c1 * (cx * Rxz_v + cy * Ryz_v + cz * Rzz_v) + -uxz * (F2o3 * Kx[i] + EIGHT * PI_V * Sx[i]) + -uyz * (F2o3 * Ky[i] + EIGHT * PI_V * Sy[i]) + -uzz * (F2o3 * Kz[i] + EIGHT * PI_V * Sz[i]) + + Gzxx[i] * Rxx_v + Gzyy[i] * Ryy_v + Gzzz[i] * Rzz_v + + TWO * (Gzxy[i] * Rxy_v + Gzxz[i] * Rxz_v + Gzyz[i] * Ryz_v)); + } +} + +/* Phase 8: Gamma_rhs part 2 — after fdderivs(beta) and fderivs(Gamma) + * Computes: fxx=div(beta_xx), Gamxa, then updates Gamx_rhs etc. + * Input arrays gxxx..gzzz here hold fdderivs(beta) results, + * Gamxx..Gamzz hold fderivs(Gamma) results. + */ +__global__ __launch_bounds__(128, 4) +void kern_phase8_gamma_rhs_part2( + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + /* fdderivs(betax) -> gxxx,gxyx,gxzx,gyyx,gyzx,gzzx */ + const double* __restrict__ bxx_xx, const double* __restrict__ bxx_xy, + const double* __restrict__ bxx_xz, const double* __restrict__ bxx_yy, + const double* __restrict__ bxx_yz, const double* __restrict__ bxx_zz, + /* fdderivs(betay) -> gxxy,gxyy,gxzy,gyyy,gyzy,gzzy */ + const double* __restrict__ bxy_xx, const double* __restrict__ bxy_xy, + const double* __restrict__ bxy_xz, const double* __restrict__ bxy_yy, + const double* __restrict__ bxy_yz, const double* __restrict__ bxy_zz, + /* fdderivs(betaz) -> gxxz,gxyz,gxzz,gyyz,gyzz,gzzz */ + const double* __restrict__ bxz_xx, const double* __restrict__ bxz_xy, + const double* __restrict__ bxz_xz, const double* __restrict__ bxz_yy, + const double* __restrict__ bxz_yz, const double* __restrict__ bxz_zz, + /* fderivs(Gamx) -> Gamxx,Gamxy,Gamxz */ + const double* __restrict__ Gamxx, const double* __restrict__ Gamxy, + const double* __restrict__ Gamxz, + /* fderivs(Gamy) -> Gamyx,Gamyy,Gamyz */ + const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d, + const double* __restrict__ Gamyz_d, + /* fderivs(Gamz) -> Gamzx,Gamzy,Gamzz */ + const double* __restrict__ Gamzx, const double* __restrict__ Gamzy, + const double* __restrict__ Gamzz_d, + /* Christoffel symbols */ + const double* __restrict__ Gxxx, const double* __restrict__ Gxxy, + const double* __restrict__ Gxxz, const double* __restrict__ Gxyy, + const double* __restrict__ Gxyz, const double* __restrict__ Gxzz, + const double* __restrict__ Gyxx, const double* __restrict__ Gyxy, + const double* __restrict__ Gyxz, const double* __restrict__ Gyyy, + const double* __restrict__ Gyyz, const double* __restrict__ Gyzz, + const double* __restrict__ Gzxx, const double* __restrict__ Gzxy, + const double* __restrict__ Gzxz, const double* __restrict__ Gzyy, + const double* __restrict__ Gzyz, const double* __restrict__ Gzzz, + /* betaij first derivs */ + const double* __restrict__ betaxx, const double* __restrict__ betaxy, + const double* __restrict__ betaxz, const double* __restrict__ betayx, + const double* __restrict__ betayy, const double* __restrict__ betayz, + const double* __restrict__ betazx, const double* __restrict__ betazy, + const double* __restrict__ betazz, + double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs, + double* __restrict__ Gamz_rhs, + double* __restrict__ Gamxa_out, double* __restrict__ Gamya_out, + double* __restrict__ Gamza_out) +{ + const double TWO=2.0, F2o3=2.0/3.0, F1o3=1.0/3.0; + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i]; + double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i]; + /* div(beta_second_derivs) */ + double fxx_v = bxx_xx[i]+bxy_xy[i]+bxz_xz[i]; + double fxy_v = bxx_xy[i]+bxy_yy[i]+bxz_yz[i]; + double fxz_v = bxx_xz[i]+bxy_yz[i]+bxz_zz[i]; + /* Gamma^a contracted */ + double Ga_x = uxx*Gxxx[i]+uyy*Gxyy[i]+uzz*Gxzz[i] + +TWO*(uxy*Gxxy[i]+uxz*Gxxz[i]+uyz*Gxyz[i]); + double Ga_y = uxx*Gyxx[i]+uyy*Gyyy[i]+uzz*Gyzz[i] + +TWO*(uxy*Gyxy[i]+uxz*Gyxz[i]+uyz*Gyyz[i]); + double Ga_z = uxx*Gzxx[i]+uyy*Gzyy[i]+uzz*Gzzz[i] + +TWO*(uxy*Gzxy[i]+uxz*Gzxz[i]+uyz*Gzyz[i]); + Gamxa_out[i]=Ga_x; Gamya_out[i]=Ga_y; Gamza_out[i]=Ga_z; + double db = betaxx[i] + betayy[i] + betazz[i]; + Gamx_rhs[i] += F2o3*Ga_x*db + - Ga_x*betaxx[i] - Ga_y*betaxy[i] - Ga_z*betaxz[i] + + F1o3*(uxx*fxx_v+uxy*fxy_v+uxz*fxz_v) + + uxx*bxx_xx[i]+uyy*bxx_yy[i]+uzz*bxx_zz[i] + + TWO*(uxy*bxx_xy[i]+uxz*bxx_xz[i]+uyz*bxx_yz[i]); + Gamy_rhs[i] += F2o3*Ga_y*db + - Ga_x*betayx[i] - Ga_y*betayy[i] - Ga_z*betayz[i] + + F1o3*(uxy*fxx_v+uyy*fxy_v+uyz*fxz_v) + + uxx*bxy_xx[i]+uyy*bxy_yy[i]+uzz*bxy_zz[i] + + TWO*(uxy*bxy_xy[i]+uxz*bxy_xz[i]+uyz*bxy_yz[i]); + Gamz_rhs[i] += F2o3*Ga_z*db + - Ga_x*betazx[i] - Ga_y*betazy[i] - Ga_z*betazz[i] + + F1o3*(uxz*fxx_v+uyz*fxy_v+uzz*fxz_v) + + uxx*bxz_xx[i]+uyy*bxz_yy[i]+uzz*bxz_zz[i] + + TWO*(uxy*bxz_xy[i]+uxz*bxz_xz[i]+uyz*bxz_yz[i]); + } +} + +/* Phase 9: Christoffel contract — compute g_{ia} Gamma^a_{bc} products + * Overwrites gxxx..gzzz with lowered Christoffel products needed for Ricci. + */ +__global__ __launch_bounds__(128, 4) +void kern_phase9_christoffel_contract( + const double* __restrict__ gxx, const double* __restrict__ gxy, + const double* __restrict__ gxz, const double* __restrict__ gyy, + const double* __restrict__ gyz, const double* __restrict__ gzz, + const double* __restrict__ Gxxx, const double* __restrict__ Gxxy, + const double* __restrict__ Gxxz, const double* __restrict__ Gxyy, + const double* __restrict__ Gxyz, const double* __restrict__ Gxzz, + const double* __restrict__ Gyxx, const double* __restrict__ Gyxy, + const double* __restrict__ Gyxz, const double* __restrict__ Gyyy, + const double* __restrict__ Gyyz, const double* __restrict__ Gyzz, + const double* __restrict__ Gzxx, const double* __restrict__ Gzxy, + const double* __restrict__ Gzxz, const double* __restrict__ Gzyy, + const double* __restrict__ Gzyz, const double* __restrict__ Gzzz, + /* output: lowered products g_{ia} Gamma^a_{bc} */ + double* __restrict__ o_gxxx, double* __restrict__ o_gxyx, + double* __restrict__ o_gxzx, double* __restrict__ o_gyyx, + double* __restrict__ o_gyzx, double* __restrict__ o_gzzx, + double* __restrict__ o_gxxy, double* __restrict__ o_gxyy, + double* __restrict__ o_gxzy, double* __restrict__ o_gyyy, + double* __restrict__ o_gyzy, double* __restrict__ o_gzzy, + double* __restrict__ o_gxxz, double* __restrict__ o_gxyz, + double* __restrict__ o_gxzz, double* __restrict__ o_gyyz, + double* __restrict__ o_gyzz, double* __restrict__ o_gzzz) +{ + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + double g11=gxx[i],g12=gxy[i],g13=gxz[i]; + double g22=gyy[i],g23=gyz[i],g33=gzz[i]; + /* row x: g_{x,a} Gamma^a_{bc} */ + o_gxxx[i]=g11*Gxxx[i]+g12*Gyxx[i]+g13*Gzxx[i]; + o_gxyx[i]=g11*Gxxy[i]+g12*Gyxy[i]+g13*Gzxy[i]; + o_gxzx[i]=g11*Gxxz[i]+g12*Gyxz[i]+g13*Gzxz[i]; + o_gyyx[i]=g11*Gxyy[i]+g12*Gyyy[i]+g13*Gzyy[i]; + o_gyzx[i]=g11*Gxyz[i]+g12*Gyyz[i]+g13*Gzyz[i]; + o_gzzx[i]=g11*Gxzz[i]+g12*Gyzz[i]+g13*Gzzz[i]; + /* row y: g_{y,a} Gamma^a_{bc} */ + o_gxxy[i]=g12*Gxxx[i]+g22*Gyxx[i]+g23*Gzxx[i]; + o_gxyy[i]=g12*Gxxy[i]+g22*Gyxy[i]+g23*Gzxy[i]; + o_gxzy[i]=g12*Gxxz[i]+g22*Gyxz[i]+g23*Gzxz[i]; + o_gyyy[i]=g12*Gxyy[i]+g22*Gyyy[i]+g23*Gzyy[i]; + o_gyzy[i]=g12*Gxyz[i]+g22*Gyyz[i]+g23*Gzyz[i]; + o_gzzy[i]=g12*Gxzz[i]+g22*Gyzz[i]+g23*Gzzz[i]; + /* row z: g_{z,a} Gamma^a_{bc} */ + o_gxxz[i]=g13*Gxxx[i]+g23*Gyxx[i]+g33*Gzxx[i]; + o_gxyz[i]=g13*Gxxy[i]+g23*Gyxy[i]+g33*Gzxy[i]; + o_gxzz[i]=g13*Gxxz[i]+g23*Gyxz[i]+g33*Gzxz[i]; + o_gyyz[i]=g13*Gxyy[i]+g23*Gyyy[i]+g33*Gzyy[i]; + o_gyzz[i]=g13*Gxyz[i]+g23*Gyyz[i]+g33*Gzyz[i]; + o_gzzz[i]=g13*Gxzz[i]+g23*Gyzz[i]+g33*Gzzz[i]; + } +} + +/* Phase 8+9 fused: update Gamma rhs, contract Gamma^a, and lower Christoffels in one pass. */ +__global__ __launch_bounds__(128, 2) +void kern_phase8_9_gamma_rhs_contract_fused( + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ bxx_xx, const double* __restrict__ bxx_xy, + const double* __restrict__ bxx_xz, const double* __restrict__ bxx_yy, + const double* __restrict__ bxx_yz, const double* __restrict__ bxx_zz, + const double* __restrict__ bxy_xx, const double* __restrict__ bxy_xy, + const double* __restrict__ bxy_xz, const double* __restrict__ bxy_yy, + const double* __restrict__ bxy_yz, const double* __restrict__ bxy_zz, + const double* __restrict__ bxz_xx, const double* __restrict__ bxz_xy, + const double* __restrict__ bxz_xz, const double* __restrict__ bxz_yy, + const double* __restrict__ bxz_yz, const double* __restrict__ bxz_zz, + const double* __restrict__ Gxxx, const double* __restrict__ Gxxy, + const double* __restrict__ Gxxz, const double* __restrict__ Gxyy, + const double* __restrict__ Gxyz, const double* __restrict__ Gxzz, + const double* __restrict__ Gyxx, const double* __restrict__ Gyxy, + const double* __restrict__ Gyxz, const double* __restrict__ Gyyy, + const double* __restrict__ Gyyz, const double* __restrict__ Gyzz, + const double* __restrict__ Gzxx, const double* __restrict__ Gzxy, + const double* __restrict__ Gzxz, const double* __restrict__ Gzyy, + const double* __restrict__ Gzyz, const double* __restrict__ Gzzz, + const double* __restrict__ betaxx, const double* __restrict__ betaxy, + const double* __restrict__ betaxz, const double* __restrict__ betayx, + const double* __restrict__ betayy, const double* __restrict__ betayz, + const double* __restrict__ betazx, const double* __restrict__ betazy, + const double* __restrict__ betazz, + const double* __restrict__ gxx, const double* __restrict__ gxy, + const double* __restrict__ gxz, const double* __restrict__ gyy, + const double* __restrict__ gyz, const double* __restrict__ gzz, + double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs, + double* __restrict__ Gamz_rhs, + double* __restrict__ Gamxa_out, double* __restrict__ Gamya_out, + double* __restrict__ Gamza_out, + double* __restrict__ o_gxxx, double* __restrict__ o_gxyx, + double* __restrict__ o_gxzx, double* __restrict__ o_gyyx, + double* __restrict__ o_gyzx, double* __restrict__ o_gzzx, + double* __restrict__ o_gxxy, double* __restrict__ o_gxyy, + double* __restrict__ o_gxzy, double* __restrict__ o_gyyy, + double* __restrict__ o_gyzy, double* __restrict__ o_gzzy, + double* __restrict__ o_gxxz, double* __restrict__ o_gxyz, + double* __restrict__ o_gxzz, double* __restrict__ o_gyyz, + double* __restrict__ o_gyzz, double* __restrict__ o_gzzz) +{ + const double TWO = 2.0, F2o3 = 2.0 / 3.0, F1o3 = 1.0 / 3.0; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += blockDim.x * gridDim.x) { + const double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i]; + const double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i]; + + const double Gxxx_v = Gxxx[i], Gxxy_v = Gxxy[i], Gxxz_v = Gxxz[i]; + const double Gxyy_v = Gxyy[i], Gxyz_v = Gxyz[i], Gxzz_v = Gxzz[i]; + const double Gyxx_v = Gyxx[i], Gyxy_v = Gyxy[i], Gyxz_v = Gyxz[i]; + const double Gyyy_v = Gyyy[i], Gyyz_v = Gyyz[i], Gyzz_v = Gyzz[i]; + const double Gzxx_v = Gzxx[i], Gzxy_v = Gzxy[i], Gzxz_v = Gzxz[i]; + const double Gzyy_v = Gzyy[i], Gzyz_v = Gzyz[i], Gzzz_v = Gzzz[i]; + + const double fxx_v = bxx_xx[i] + bxy_xy[i] + bxz_xz[i]; + const double fxy_v = bxx_xy[i] + bxy_yy[i] + bxz_yz[i]; + const double fxz_v = bxx_xz[i] + bxy_yz[i] + bxz_zz[i]; + + const double Ga_x = uxx * Gxxx_v + uyy * Gxyy_v + uzz * Gxzz_v + + TWO * (uxy * Gxxy_v + uxz * Gxxz_v + uyz * Gxyz_v); + const double Ga_y = uxx * Gyxx_v + uyy * Gyyy_v + uzz * Gyzz_v + + TWO * (uxy * Gyxy_v + uxz * Gyxz_v + uyz * Gyyz_v); + const double Ga_z = uxx * Gzxx_v + uyy * Gzyy_v + uzz * Gzzz_v + + TWO * (uxy * Gzxy_v + uxz * Gzxz_v + uyz * Gzyz_v); + + Gamxa_out[i] = Ga_x; + Gamya_out[i] = Ga_y; + Gamza_out[i] = Ga_z; + + const double betaxx_v = betaxx[i], betaxy_v = betaxy[i], betaxz_v = betaxz[i]; + const double betayx_v = betayx[i], betayy_v = betayy[i], betayz_v = betayz[i]; + const double betazx_v = betazx[i], betazy_v = betazy[i], betazz_v = betazz[i]; + const double db = betaxx_v + betayy_v + betazz_v; + + Gamx_rhs[i] += F2o3 * Ga_x * db + - Ga_x * betaxx_v - Ga_y * betaxy_v - Ga_z * betaxz_v + + F1o3 * (uxx * fxx_v + uxy * fxy_v + uxz * fxz_v) + + uxx * bxx_xx[i] + uyy * bxx_yy[i] + uzz * bxx_zz[i] + + TWO * (uxy * bxx_xy[i] + uxz * bxx_xz[i] + uyz * bxx_yz[i]); + Gamy_rhs[i] += F2o3 * Ga_y * db + - Ga_x * betayx_v - Ga_y * betayy_v - Ga_z * betayz_v + + F1o3 * (uxy * fxx_v + uyy * fxy_v + uyz * fxz_v) + + uxx * bxy_xx[i] + uyy * bxy_yy[i] + uzz * bxy_zz[i] + + TWO * (uxy * bxy_xy[i] + uxz * bxy_xz[i] + uyz * bxy_yz[i]); + Gamz_rhs[i] += F2o3 * Ga_z * db + - Ga_x * betazx_v - Ga_y * betazy_v - Ga_z * betazz_v + + F1o3 * (uxz * fxx_v + uyz * fxy_v + uzz * fxz_v) + + uxx * bxz_xx[i] + uyy * bxz_yy[i] + uzz * bxz_zz[i] + + TWO * (uxy * bxz_xy[i] + uxz * bxz_xz[i] + uyz * bxz_yz[i]); + + const double g11 = gxx[i], g12 = gxy[i], g13 = gxz[i]; + const double g22 = gyy[i], g23 = gyz[i], g33 = gzz[i]; + + o_gxxx[i] = g11 * Gxxx_v + g12 * Gyxx_v + g13 * Gzxx_v; + o_gxyx[i] = g11 * Gxxy_v + g12 * Gyxy_v + g13 * Gzxy_v; + o_gxzx[i] = g11 * Gxxz_v + g12 * Gyxz_v + g13 * Gzxz_v; + o_gyyx[i] = g11 * Gxyy_v + g12 * Gyyy_v + g13 * Gzyy_v; + o_gyzx[i] = g11 * Gxyz_v + g12 * Gyyz_v + g13 * Gzyz_v; + o_gzzx[i] = g11 * Gxzz_v + g12 * Gyzz_v + g13 * Gzzz_v; + + o_gxxy[i] = g12 * Gxxx_v + g22 * Gyxx_v + g23 * Gzxx_v; + o_gxyy[i] = g12 * Gxxy_v + g22 * Gyxy_v + g23 * Gzxy_v; + o_gxzy[i] = g12 * Gxxz_v + g22 * Gyxz_v + g23 * Gzxz_v; + o_gyyy[i] = g12 * Gxyy_v + g22 * Gyyy_v + g23 * Gzyy_v; + o_gyzy[i] = g12 * Gxyz_v + g22 * Gyyz_v + g23 * Gzyz_v; + o_gzzy[i] = g12 * Gxzz_v + g22 * Gyzz_v + g23 * Gzzz_v; + + o_gxxz[i] = g13 * Gxxx_v + g23 * Gyxx_v + g33 * Gzxx_v; + o_gxyz[i] = g13 * Gxxy_v + g23 * Gyxy_v + g33 * Gzxy_v; + o_gxzz[i] = g13 * Gxxz_v + g23 * Gyxz_v + g33 * Gzxz_v; + o_gyyz[i] = g13 * Gxyy_v + g23 * Gyyy_v + g33 * Gzyy_v; + o_gyzz[i] = g13 * Gxyz_v + g23 * Gyyz_v + g33 * Gzyz_v; + o_gzzz[i] = g13 * Gxzz_v + g23 * Gyzz_v + g33 * Gzzz_v; + } +} + +/* Phase 10: After fdderivs of a metric component, contract with gup^{ij} + * R_comp = gup^xx*fxx + gup^yy*fyy + gup^zz*fzz + 2*(gup^xy*fxy + gup^xz*fxz + gup^yz*fyz) + */ +__global__ void kern_phase10_ricci_contract( + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ fxx, const double* __restrict__ fxy, + const double* __restrict__ fxz, const double* __restrict__ fyy, + const double* __restrict__ fyz, const double* __restrict__ fzz, + double* __restrict__ R_comp) +{ + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + R_comp[i] = gupxx[i]*fxx[i] + gupyy[i]*fyy[i] + gupzz[i]*fzz[i] + + 2.0*(gupxy[i]*fxy[i] + gupxz[i]*fxz[i] + gupyz[i]*fyz[i]); + } +} + +/* Phase 11a: Ricci diagonal assembly (Rxx, Ryy, Rzz) */ +__global__ __launch_bounds__(128, 4) +void kern_phase11_ricci_diag( + const double* __restrict__ gxx, const double* __restrict__ gxy, + const double* __restrict__ gxz, const double* __restrict__ gyy, + const double* __restrict__ gyz, const double* __restrict__ gzz, + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ Gamxa, const double* __restrict__ Gamya, + const double* __restrict__ Gamza, + const double* __restrict__ Gamxx, const double* __restrict__ Gamxy, + const double* __restrict__ Gamxz, + const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d, + const double* __restrict__ Gamyz_d, + const double* __restrict__ Gamzx, const double* __restrict__ Gamzy, + const double* __restrict__ Gamzz_d, + const double* __restrict__ Gxxx, const double* __restrict__ Gxxy, + const double* __restrict__ Gxxz, const double* __restrict__ Gxyy, + const double* __restrict__ Gxyz, const double* __restrict__ Gxzz, + const double* __restrict__ Gyxx, const double* __restrict__ Gyxy, + const double* __restrict__ Gyxz, const double* __restrict__ Gyyy, + const double* __restrict__ Gyyz, const double* __restrict__ Gyzz, + const double* __restrict__ Gzxx, const double* __restrict__ Gzxy, + const double* __restrict__ Gzxz, const double* __restrict__ Gzyy, + const double* __restrict__ Gzyz, const double* __restrict__ Gzzz, + /* lowered Christoffel products */ + const double* __restrict__ lxxx, const double* __restrict__ lxyx, + const double* __restrict__ lxzx, const double* __restrict__ lyyx, + const double* __restrict__ lyzx, const double* __restrict__ lzzx, + const double* __restrict__ lxxy, const double* __restrict__ lxyy, + const double* __restrict__ lxzy, const double* __restrict__ lyyy, + const double* __restrict__ lyzy, const double* __restrict__ lzzy, + const double* __restrict__ lxxz, const double* __restrict__ lxyz, + const double* __restrict__ lxzz, const double* __restrict__ lyyz, + const double* __restrict__ lyzz, const double* __restrict__ lzzz, + double* __restrict__ Rxx, double* __restrict__ Ryy, double* __restrict__ Rzz) +{ + const double H = 0.5, TWO = 2.0; + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i]; + double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i]; + /* Rxx */ + Rxx[i] = -H*Rxx[i] + + gxx[i]*Gamxx[i]+gxy[i]*Gamyx[i]+gxz[i]*Gamzx[i] + + Gamxa[i]*lxxx[i]+Gamya[i]*lxyx[i]+Gamza[i]*lxzx[i] + + uxx*(TWO*(Gxxx[i]*lxxx[i]+Gyxx[i]*lxyx[i]+Gzxx[i]*lxzx[i]) + +(Gxxx[i]*lxxx[i]+Gyxx[i]*lxxy[i]+Gzxx[i]*lxxz[i])) + + uxy*(TWO*(Gxxx[i]*lxyx[i]+Gyxx[i]*lyyx[i]+Gzxx[i]*lyzx[i] + +Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i]) + +(Gxxy[i]*lxxx[i]+Gyxy[i]*lxxy[i]+Gzxy[i]*lxxz[i]) + +(Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i])) + + uxz*(TWO*(Gxxx[i]*lxzx[i]+Gyxx[i]*lyzx[i]+Gzxx[i]*lzzx[i] + +Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i]) + +(Gxxz[i]*lxxx[i]+Gyxz[i]*lxxy[i]+Gzxz[i]*lxxz[i]) + +(Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i])) + + uyy*(TWO*(Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i]) + +(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i])) + + uyz*(TWO*(Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i] + +Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i]) + +(Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i]) + +(Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i])) + + uzz*(TWO*(Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i]) + +(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i])); + + /* Ryy */ + Ryy[i] = -H*Ryy[i] + + gxy[i]*Gamxy[i]+gyy[i]*Gamyy_d[i]+gyz[i]*Gamzy[i] + + Gamxa[i]*lxyy[i]+Gamya[i]*lyyy[i]+Gamza[i]*lyzy[i] + + uxx*(TWO*(Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i]) + +(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i])) + + uxy*(TWO*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i] + +Gxyy[i]*lxxy[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxzy[i]) + +(Gxyy[i]*lxyx[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxyz[i]) + +(Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i])) + + uxz*(TWO*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i] + +Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i]) + +(Gxyz[i]*lxyx[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxyz[i]) + +(Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])) + + uyy*(TWO*(Gxyy[i]*lxyy[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyzy[i]) + +(Gxyy[i]*lyyx[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyyz[i])) + + uyz*(TWO*(Gxyy[i]*lxzy[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lzzy[i] + +Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i]) + +(Gxyz[i]*lyyx[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyyz[i]) + +(Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i])) + + uzz*(TWO*(Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i]) + +(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i])); + + /* Rzz */ + Rzz[i] = -H*Rzz[i] + + gxz[i]*Gamxz[i]+gyz[i]*Gamyz_d[i]+gzz[i]*Gamzz_d[i] + + Gamxa[i]*lxzz[i]+Gamya[i]*lyzz[i]+Gamza[i]*lzzz[i] + + uxx*(TWO*(Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i]) + +(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i])) + + uxy*(TWO*(Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i] + +Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i]) + +(Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i]) + +(Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i])) + + uxz*(TWO*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i] + +Gxzz[i]*lxxz[i]+Gyzz[i]*lxyz[i]+Gzzz[i]*lxzz[i]) + +(Gxzz[i]*lxzx[i]+Gyzz[i]*lxzy[i]+Gzzz[i]*lxzz[i]) + +(Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i])) + + uyy*(TWO*(Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i]) + +(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i])) + + uyz*(TWO*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i] + +Gxzz[i]*lxyz[i]+Gyzz[i]*lyyz[i]+Gzzz[i]*lyzz[i]) + +(Gxzz[i]*lyzx[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lyzz[i]) + +(Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i])) + + uzz*(TWO*(Gxzz[i]*lxzz[i]+Gyzz[i]*lyzz[i]+Gzzz[i]*lzzz[i]) + +(Gxzz[i]*lzzx[i]+Gyzz[i]*lzzy[i]+Gzzz[i]*lzzz[i])); + } +} + +/* Phase 11b: Ricci off-diagonal assembly (Rxy, Rxz, Ryz) */ +__global__ __launch_bounds__(128, 4) +void kern_phase11_ricci_offdiag( + const double* __restrict__ gxx, const double* __restrict__ gxy, + const double* __restrict__ gxz, const double* __restrict__ gyy, + const double* __restrict__ gyz, const double* __restrict__ gzz, + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ Gamxa, const double* __restrict__ Gamya, + const double* __restrict__ Gamza, + const double* __restrict__ Gamxx, const double* __restrict__ Gamxy, + const double* __restrict__ Gamxz, + const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d, + const double* __restrict__ Gamyz_d, + const double* __restrict__ Gamzx, const double* __restrict__ Gamzy, + const double* __restrict__ Gamzz_d, + const double* __restrict__ Gxxx, const double* __restrict__ Gxxy, + const double* __restrict__ Gxxz, const double* __restrict__ Gxyy, + const double* __restrict__ Gxyz, const double* __restrict__ Gxzz, + const double* __restrict__ Gyxx, const double* __restrict__ Gyxy, + const double* __restrict__ Gyxz, const double* __restrict__ Gyyy, + const double* __restrict__ Gyyz, const double* __restrict__ Gyzz, + const double* __restrict__ Gzxx, const double* __restrict__ Gzxy, + const double* __restrict__ Gzxz, const double* __restrict__ Gzyy, + const double* __restrict__ Gzyz, const double* __restrict__ Gzzz, + const double* __restrict__ lxxx, const double* __restrict__ lxyx, + const double* __restrict__ lxzx, const double* __restrict__ lyyx, + const double* __restrict__ lyzx, const double* __restrict__ lzzx, + const double* __restrict__ lxxy, const double* __restrict__ lxyy, + const double* __restrict__ lxzy, const double* __restrict__ lyyy, + const double* __restrict__ lyzy, const double* __restrict__ lzzy, + const double* __restrict__ lxxz, const double* __restrict__ lxyz, + const double* __restrict__ lxzz, const double* __restrict__ lyyz, + const double* __restrict__ lyzz, const double* __restrict__ lzzz, + double* __restrict__ Rxy, double* __restrict__ Rxz, double* __restrict__ Ryz) +{ + const double H = 0.5, TWO = 2.0; + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i]; + double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i]; + + /* Rxy */ + Rxy[i] = H*( + -Rxy[i] + +gxx[i]*Gamxy[i]+gxy[i]*Gamyy_d[i]+gxz[i]*Gamzy[i] + +gxy[i]*Gamxx[i]+gyy[i]*Gamyx[i]+gyz[i]*Gamzx[i] + +Gamxa[i]*lxyx[i]+Gamya[i]*lyyx[i]+Gamza[i]*lyzx[i] + +Gamxa[i]*lxxy[i]+Gamya[i]*lxyy[i]+Gamza[i]*lxzy[i]) + +uxx*(Gxxx[i]*lxxy[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxzy[i] + +Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i] + +Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i]) + +uxy*(Gxxx[i]*lxyy[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyzy[i] + +Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i] + +Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i] + +Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i] + +Gxyy[i]*lxxx[i]+Gyyy[i]*lxyx[i]+Gzyy[i]*lxzx[i] + +Gxxx[i]*lyyx[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyyz[i]) + +uxz*(Gxxx[i]*lxzy[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lzzy[i] + +Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i] + +Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i] + +Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i] + +Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i] + +Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i]) + +uyy*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i] + +Gxyy[i]*lxyx[i]+Gyyy[i]*lyyx[i]+Gzyy[i]*lyzx[i] + +Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i]) + +uyz*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i] + +Gxyy[i]*lxzx[i]+Gyyy[i]*lyzx[i]+Gzyy[i]*lzzx[i] + +Gxxz[i]*lyyx[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyyz[i] + +Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i] + +Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i] + +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i]) + +uzz*(Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i] + +Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i] + +Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]); + + /* Rxz */ + Rxz[i] = H*( + -Rxz[i] + +gxx[i]*Gamxz[i]+gxy[i]*Gamyz_d[i]+gxz[i]*Gamzz_d[i] + +gxz[i]*Gamxx[i]+gyz[i]*Gamyx[i]+gzz[i]*Gamzx[i] + +Gamxa[i]*lxzx[i]+Gamya[i]*lyzx[i]+Gamza[i]*lzzx[i] + +Gamxa[i]*lxxz[i]+Gamya[i]*lxyz[i]+Gamza[i]*lxzz[i]) + +uxx*(Gxxx[i]*lxxz[i]+Gyxx[i]*lxyz[i]+Gzxx[i]*lxzz[i] + +Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i] + +Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i]) + +uxy*(Gxxx[i]*lxyz[i]+Gyxx[i]*lyyz[i]+Gzxx[i]*lyzz[i] + +Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i] + +Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i] + +Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i] + +Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i] + +Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i]) + +uxz*(Gxxx[i]*lxzz[i]+Gyxx[i]*lyzz[i]+Gzxx[i]*lzzz[i] + +Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i] + +Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i] + +Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i] + +Gxzz[i]*lxxx[i]+Gyzz[i]*lxyx[i]+Gzzz[i]*lxzx[i] + +Gxxx[i]*lzzx[i]+Gyxx[i]*lzzy[i]+Gzxx[i]*lzzz[i]) + +uyy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i] + +Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i] + +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i]) + +uyz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i] + +Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i] + +Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i] + +Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i] + +Gxzz[i]*lxyx[i]+Gyzz[i]*lyyx[i]+Gzzz[i]*lyzx[i] + +Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i]) + +uzz*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i] + +Gxzz[i]*lxzx[i]+Gyzz[i]*lyzx[i]+Gzzz[i]*lzzx[i] + +Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i]); + + /* Ryz */ + Ryz[i] = H*( + -Ryz[i] + +gxy[i]*Gamxz[i]+gyy[i]*Gamyz_d[i]+gyz[i]*Gamzz_d[i] + +gxz[i]*Gamxy[i]+gyz[i]*Gamyy_d[i]+gzz[i]*Gamzy[i] + +Gamxa[i]*lxzy[i]+Gamya[i]*lyzy[i]+Gamza[i]*lzzy[i] + +Gamxa[i]*lxyz[i]+Gamya[i]*lyyz[i]+Gamza[i]*lyzz[i]) + +uxx*(Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i] + +Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i] + +Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i]) + +uxy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i] + +Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i] + +Gxyy[i]*lxzx[i]+Gyyy[i]*lxzy[i]+Gzyy[i]*lxzz[i] + +Gxyy[i]*lxxz[i]+Gyyy[i]*lxyz[i]+Gzyy[i]*lxzz[i] + +Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i] + +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i]) + +uxz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i] + +Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i] + +Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i] + +Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i] + +Gxzz[i]*lxxy[i]+Gyzz[i]*lxyy[i]+Gzzz[i]*lxzy[i] + +Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i]) + +uyy*(Gxyy[i]*lxyz[i]+Gyyy[i]*lyyz[i]+Gzyy[i]*lyzz[i] + +Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i] + +Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i]) + +uyz*(Gxyy[i]*lxzz[i]+Gyyy[i]*lyzz[i]+Gzyy[i]*lzzz[i] + +Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i] + +Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i] + +Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i] + +Gxzz[i]*lxyy[i]+Gyzz[i]*lyyy[i]+Gzzz[i]*lyzy[i] + +Gxyy[i]*lzzx[i]+Gyyy[i]*lzzy[i]+Gzyy[i]*lzzz[i]) + +uzz*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i] + +Gxzz[i]*lxzy[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lzzy[i] + +Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i]); + } +} + +/* Phase 11: fused Ricci assembly (diag + off-diag) */ +__global__ __launch_bounds__(128, 2) +void kern_phase11_ricci_fused( + const double* __restrict__ gxx, const double* __restrict__ gxy, + const double* __restrict__ gxz, const double* __restrict__ gyy, + const double* __restrict__ gyz, const double* __restrict__ gzz, + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ Gamxa, const double* __restrict__ Gamya, + const double* __restrict__ Gamza, + const double* __restrict__ Gamxx, const double* __restrict__ Gamxy, + const double* __restrict__ Gamxz, + const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d, + const double* __restrict__ Gamyz_d, + const double* __restrict__ Gamzx, const double* __restrict__ Gamzy, + const double* __restrict__ Gamzz_d, + const double* __restrict__ Gxxx, const double* __restrict__ Gxxy, + const double* __restrict__ Gxxz, const double* __restrict__ Gxyy, + const double* __restrict__ Gxyz, const double* __restrict__ Gxzz, + const double* __restrict__ Gyxx, const double* __restrict__ Gyxy, + const double* __restrict__ Gyxz, const double* __restrict__ Gyyy, + const double* __restrict__ Gyyz, const double* __restrict__ Gyzz, + const double* __restrict__ Gzxx, const double* __restrict__ Gzxy, + const double* __restrict__ Gzxz, const double* __restrict__ Gzyy, + const double* __restrict__ Gzyz, const double* __restrict__ Gzzz, + const double* __restrict__ lxxx, const double* __restrict__ lxyx, + const double* __restrict__ lxzx, const double* __restrict__ lyyx, + const double* __restrict__ lyzx, const double* __restrict__ lzzx, + const double* __restrict__ lxxy, const double* __restrict__ lxyy, + const double* __restrict__ lxzy, const double* __restrict__ lyyy, + const double* __restrict__ lyzy, const double* __restrict__ lzzy, + const double* __restrict__ lxxz, const double* __restrict__ lxyz, + const double* __restrict__ lxzz, const double* __restrict__ lyyz, + const double* __restrict__ lyzz, const double* __restrict__ lzzz, + double* __restrict__ Rxx, double* __restrict__ Rxy, double* __restrict__ Rxz, + double* __restrict__ Ryy, double* __restrict__ Ryz, double* __restrict__ Rzz) +{ + const double H = 0.5, TWO = 2.0; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += blockDim.x * gridDim.x) { + double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i]; + double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i]; + + Rxx[i] = -H*Rxx[i] + + gxx[i]*Gamxx[i]+gxy[i]*Gamyx[i]+gxz[i]*Gamzx[i] + + Gamxa[i]*lxxx[i]+Gamya[i]*lxyx[i]+Gamza[i]*lxzx[i] + + uxx*(TWO*(Gxxx[i]*lxxx[i]+Gyxx[i]*lxyx[i]+Gzxx[i]*lxzx[i]) + +(Gxxx[i]*lxxx[i]+Gyxx[i]*lxxy[i]+Gzxx[i]*lxxz[i])) + + uxy*(TWO*(Gxxx[i]*lxyx[i]+Gyxx[i]*lyyx[i]+Gzxx[i]*lyzx[i] + +Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i]) + +(Gxxy[i]*lxxx[i]+Gyxy[i]*lxxy[i]+Gzxy[i]*lxxz[i]) + +(Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i])) + + uxz*(TWO*(Gxxx[i]*lxzx[i]+Gyxx[i]*lyzx[i]+Gzxx[i]*lzzx[i] + +Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i]) + +(Gxxz[i]*lxxx[i]+Gyxz[i]*lxxy[i]+Gzxz[i]*lxxz[i]) + +(Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i])) + + uyy*(TWO*(Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i]) + +(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i])) + + uyz*(TWO*(Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i] + +Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i]) + +(Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i]) + +(Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i])) + + uzz*(TWO*(Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i]) + +(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i])); + + Ryy[i] = -H*Ryy[i] + + gxy[i]*Gamxy[i]+gyy[i]*Gamyy_d[i]+gyz[i]*Gamzy[i] + + Gamxa[i]*lxyy[i]+Gamya[i]*lyyy[i]+Gamza[i]*lyzy[i] + + uxx*(TWO*(Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i]) + +(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i])) + + uxy*(TWO*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i] + +Gxyy[i]*lxxy[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxzy[i]) + +(Gxyy[i]*lxyx[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxyz[i]) + +(Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i])) + + uxz*(TWO*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i] + +Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i]) + +(Gxyz[i]*lxyx[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxyz[i]) + +(Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])) + + uyy*(TWO*(Gxyy[i]*lxyy[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyzy[i]) + +(Gxyy[i]*lyyx[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyyz[i])) + + uyz*(TWO*(Gxyy[i]*lxzy[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lzzy[i] + +Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i]) + +(Gxyz[i]*lyyx[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyyz[i]) + +(Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i])) + + uzz*(TWO*(Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i]) + +(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i])); + + Rzz[i] = -H*Rzz[i] + + gxz[i]*Gamxz[i]+gyz[i]*Gamyz_d[i]+gzz[i]*Gamzz_d[i] + + Gamxa[i]*lxzz[i]+Gamya[i]*lyzz[i]+Gamza[i]*lzzz[i] + + uxx*(TWO*(Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i]) + +(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i])) + + uxy*(TWO*(Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i] + +Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i]) + +(Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i]) + +(Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i])) + + uxz*(TWO*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i] + +Gxzz[i]*lxxz[i]+Gyzz[i]*lxyz[i]+Gzzz[i]*lxzz[i]) + +(Gxzz[i]*lxzx[i]+Gyzz[i]*lxzy[i]+Gzzz[i]*lxzz[i]) + +(Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i])) + + uyy*(TWO*(Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i]) + +(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i])) + + uyz*(TWO*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i] + +Gxzz[i]*lxyz[i]+Gyzz[i]*lyyz[i]+Gzzz[i]*lyzz[i]) + +(Gxzz[i]*lyzx[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lyzz[i]) + +(Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i])) + + uzz*(TWO*(Gxzz[i]*lxzz[i]+Gyzz[i]*lyzz[i]+Gzzz[i]*lzzz[i]) + +(Gxzz[i]*lzzx[i]+Gyzz[i]*lzzy[i]+Gzzz[i]*lzzz[i])); + + Rxy[i] = H*( + -Rxy[i] + +gxx[i]*Gamxy[i]+gxy[i]*Gamyy_d[i]+gxz[i]*Gamzy[i] + +gxy[i]*Gamxx[i]+gyy[i]*Gamyx[i]+gyz[i]*Gamzx[i] + +Gamxa[i]*lxyx[i]+Gamya[i]*lyyx[i]+Gamza[i]*lyzx[i] + +Gamxa[i]*lxxy[i]+Gamya[i]*lxyy[i]+Gamza[i]*lxzy[i]) + +uxx*(Gxxx[i]*lxxy[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxzy[i] + +Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i] + +Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i]) + +uxy*(Gxxx[i]*lxyy[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyzy[i] + +Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i] + +Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i] + +Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i] + +Gxyy[i]*lxxx[i]+Gyyy[i]*lxyx[i]+Gzyy[i]*lxzx[i] + +Gxxx[i]*lyyx[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyyz[i]) + +uxz*(Gxxx[i]*lxzy[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lzzy[i] + +Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i] + +Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i] + +Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i] + +Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i] + +Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i]) + +uyy*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i] + +Gxyy[i]*lxyx[i]+Gyyy[i]*lyyx[i]+Gzyy[i]*lyzx[i] + +Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i]) + +uyz*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i] + +Gxyy[i]*lxzx[i]+Gyyy[i]*lyzx[i]+Gzyy[i]*lzzx[i] + +Gxxz[i]*lyyx[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyyz[i] + +Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i] + +Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i] + +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i]) + +uzz*(Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i] + +Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i] + +Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]); + + Rxz[i] = H*( + -Rxz[i] + +gxx[i]*Gamxz[i]+gxy[i]*Gamyz_d[i]+gxz[i]*Gamzz_d[i] + +gxz[i]*Gamxx[i]+gyz[i]*Gamyx[i]+gzz[i]*Gamzx[i] + +Gamxa[i]*lxzx[i]+Gamya[i]*lyzx[i]+Gamza[i]*lzzx[i] + +Gamxa[i]*lxxz[i]+Gamya[i]*lxyz[i]+Gamza[i]*lxzz[i]) + +uxx*(Gxxx[i]*lxxz[i]+Gyxx[i]*lxyz[i]+Gzxx[i]*lxzz[i] + +Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i] + +Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i]) + +uxy*(Gxxx[i]*lxyz[i]+Gyxx[i]*lyyz[i]+Gzxx[i]*lyzz[i] + +Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i] + +Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i] + +Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i] + +Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i] + +Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i]) + +uxz*(Gxxx[i]*lxzz[i]+Gyxx[i]*lyzz[i]+Gzxx[i]*lzzz[i] + +Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i] + +Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i] + +Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i] + +Gxzz[i]*lxxx[i]+Gyzz[i]*lxyx[i]+Gzzz[i]*lxzx[i] + +Gxxx[i]*lzzx[i]+Gyxx[i]*lzzy[i]+Gzxx[i]*lzzz[i]) + +uyy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i] + +Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i] + +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i]) + +uyz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i] + +Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i] + +Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i] + +Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i] + +Gxzz[i]*lxyx[i]+Gyzz[i]*lyyx[i]+Gzzz[i]*lyzx[i] + +Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i]) + +uzz*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i] + +Gxzz[i]*lxzx[i]+Gyzz[i]*lyzx[i]+Gzzz[i]*lzzx[i] + +Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i]); + + Ryz[i] = H*( + -Ryz[i] + +gxy[i]*Gamxz[i]+gyy[i]*Gamyz_d[i]+gyz[i]*Gamzz_d[i] + +gxz[i]*Gamxy[i]+gyz[i]*Gamyy_d[i]+gzz[i]*Gamzy[i] + +Gamxa[i]*lxzy[i]+Gamya[i]*lyzy[i]+Gamza[i]*lzzy[i] + +Gamxa[i]*lxyz[i]+Gamya[i]*lyyz[i]+Gamza[i]*lyzz[i]) + +uxx*(Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i] + +Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i] + +Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i]) + +uxy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i] + +Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i] + +Gxyy[i]*lxzx[i]+Gyyy[i]*lxzy[i]+Gzyy[i]*lxzz[i] + +Gxyy[i]*lxxz[i]+Gyyy[i]*lxyz[i]+Gzyy[i]*lxzz[i] + +Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i] + +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i]) + +uxz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i] + +Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i] + +Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i] + +Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i] + +Gxzz[i]*lxxy[i]+Gyzz[i]*lxyy[i]+Gzzz[i]*lxzy[i] + +Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i]) + +uyy*(Gxyy[i]*lxyz[i]+Gyyy[i]*lyyz[i]+Gzyy[i]*lyzz[i] + +Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i] + +Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i]) + +uyz*(Gxyy[i]*lxzz[i]+Gyyy[i]*lyzz[i]+Gzyy[i]*lzzz[i] + +Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i] + +Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i] + +Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i] + +Gxzz[i]*lxyy[i]+Gyzz[i]*lyyy[i]+Gzzz[i]*lyzy[i] + +Gxyy[i]*lzzx[i]+Gyyy[i]*lzzy[i]+Gzyy[i]*lzzz[i]) + +uzz*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i] + +Gxzz[i]*lxzy[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lzzy[i] + +Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i]); + } +} + +/* Phase 13: chi correction to Ricci tensor + * After fdderivs(chi), subtract Christoffel*chi_deriv, compute conformal factor f, + * then add chi contribution to Rxx..Rzz. + */ +__global__ __launch_bounds__(128, 4) +void kern_phase12_13_chi_correction_fused( + const double* __restrict__ chi, + const double* __restrict__ chin1, + const double* __restrict__ chix, const double* __restrict__ chiy, + const double* __restrict__ chiz, + const double* __restrict__ gxx, const double* __restrict__ gxy, + const double* __restrict__ gxz, const double* __restrict__ gyy, + const double* __restrict__ gyz, const double* __restrict__ gzz, + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ Gxxx, const double* __restrict__ Gxxy, + const double* __restrict__ Gxxz, const double* __restrict__ Gxyy, + const double* __restrict__ Gxyz, const double* __restrict__ Gxzz, + const double* __restrict__ Gyxx, const double* __restrict__ Gyxy, + const double* __restrict__ Gyxz, const double* __restrict__ Gyyy, + const double* __restrict__ Gyyz, const double* __restrict__ Gyzz, + const double* __restrict__ Gzxx, const double* __restrict__ Gzxy, + const double* __restrict__ Gzxz, const double* __restrict__ Gzyy, + const double* __restrict__ Gzyz, const double* __restrict__ Gzzz, + double* __restrict__ Rxx, double* __restrict__ Rxy, + double* __restrict__ Rxz, double* __restrict__ Ryy, + double* __restrict__ Ryz, double* __restrict__ Rzz) +{ + const double TWO = 2.0; + const double F3o2 = 1.5; + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + + double cxx = 0.0, cxy = 0.0, cxz = 0.0; + double cyy = 0.0, cyz = 0.0, czz = 0.0; + + if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) { + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + const double c = fetch_sym_ord2_direct(chi, iF, jF, kF, 1, 1, 1); + cxx = d_gp.Fdxdx * ( + -fetch_sym_ord2_direct(chi, iF - 2, jF, kF, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF, kF, 1, 1, 1)); + cyy = d_gp.Fdydy * ( + -fetch_sym_ord2_direct(chi, iF, jF - 2, kF, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 2, kF, 1, 1, 1)); + czz = d_gp.Fdzdz * ( + -fetch_sym_ord2_direct(chi, iF, jF, kF - 2, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF, kF + 2, 1, 1, 1)); + + const double t_jm2 = + fetch_sym_ord2_direct(chi, iF - 2, jF - 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF - 2, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF - 2, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF - 2, kF, 1, 1, 1); + const double t_jm1 = + fetch_sym_ord2_direct(chi, iF - 2, jF - 1, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF - 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF - 1, kF, 1, 1, 1); + const double t_jp1 = + fetch_sym_ord2_direct(chi, iF - 2, jF + 1, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF + 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF + 1, kF, 1, 1, 1); + const double t_jp2 = + fetch_sym_ord2_direct(chi, iF - 2, jF + 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF + 2, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF + 2, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF + 2, kF, 1, 1, 1); + cxy = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2); + + const double t_km2_x = + fetch_sym_ord2_direct(chi, iF - 2, jF, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 2, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF, kF - 2, 1, 1, 1); + const double t_km1_x = + fetch_sym_ord2_direct(chi, iF - 2, jF, kF - 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF, kF - 1, 1, 1, 1); + const double t_kp1_x = + fetch_sym_ord2_direct(chi, iF - 2, jF, kF + 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF, kF + 1, 1, 1, 1); + const double t_kp2_x = + fetch_sym_ord2_direct(chi, iF - 2, jF, kF + 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 2, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 2, jF, kF + 2, 1, 1, 1); + cxz = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x); + + const double t_km2_y = + fetch_sym_ord2_direct(chi, iF, jF - 2, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 2, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 2, kF - 2, 1, 1, 1); + const double t_km1_y = + fetch_sym_ord2_direct(chi, iF, jF - 2, kF - 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 2, kF - 1, 1, 1, 1); + const double t_kp1_y = + fetch_sym_ord2_direct(chi, iF, jF - 2, kF + 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 2, kF + 1, 1, 1, 1); + const double t_kp2_y = + fetch_sym_ord2_direct(chi, iF, jF - 2, kF + 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 2, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 2, kF + 2, 1, 1, 1); + cyz = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y); + } + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + const double c = fetch_sym_ord2_direct(chi, iF, jF, kF, 1, 1, 1); + cxx = d_gp.Sdxdx * ( + fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1) + - 2.0 * c + + fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1)); + cyy = d_gp.Sdydy * ( + fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1) + - 2.0 * c + + fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1)); + czz = d_gp.Sdzdz * ( + fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1) + - 2.0 * c + + fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1)); + cxy = d_gp.Sdxdy * ( + fetch_sym_ord2_direct(chi, iF - 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF - 1, jF + 1, kF, 1, 1, 1) + + fetch_sym_ord2_direct(chi, iF + 1, jF + 1, kF, 1, 1, 1)); + cxz = d_gp.Sdxdz * ( + fetch_sym_ord2_direct(chi, iF - 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF + 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF - 1, jF, kF + 1, 1, 1, 1) + + fetch_sym_ord2_direct(chi, iF + 1, jF, kF + 1, 1, 1, 1)); + cyz = d_gp.Sdydz * ( + fetch_sym_ord2_direct(chi, iF, jF - 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF + 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(chi, iF, jF - 1, kF + 1, 1, 1, 1) + + fetch_sym_ord2_direct(chi, iF, jF + 1, kF + 1, 1, 1, 1)); + } + } + + const double cx = chix[tid]; + const double cy = chiy[tid]; + const double cz = chiz[tid]; + const double c1 = chin1[tid]; + + cxx -= Gxxx[tid] * cx + Gyxx[tid] * cy + Gzxx[tid] * cz; + cxy -= Gxxy[tid] * cx + Gyxy[tid] * cy + Gzxy[tid] * cz; + cxz -= Gxxz[tid] * cx + Gyxz[tid] * cy + Gzxz[tid] * cz; + cyy -= Gxyy[tid] * cx + Gyyy[tid] * cy + Gzyy[tid] * cz; + cyz -= Gxyz[tid] * cx + Gyyz[tid] * cy + Gzyz[tid] * cz; + czz -= Gxzz[tid] * cx + Gyzz[tid] * cy + Gzzz[tid] * cz; + + const double uxx = gupxx[tid], uxy = gupxy[tid], uxz = gupxz[tid]; + const double uyy = gupyy[tid], uyz = gupyz[tid], uzz = gupzz[tid]; + const double f_val = uxx * (cxx - F3o2 / c1 * cx * cx) + + uyy * (cyy - F3o2 / c1 * cy * cy) + + uzz * (czz - F3o2 / c1 * cz * cz) + + TWO * uxy * (cxy - F3o2 / c1 * cx * cy) + + TWO * uxz * (cxz - F3o2 / c1 * cx * cz) + + TWO * uyz * (cyz - F3o2 / c1 * cy * cz); + + const double inv2c = 1.0 / (c1 * TWO); + Rxx[tid] += (cxx - cx * cx * inv2c + gxx[tid] * f_val) * inv2c; + Ryy[tid] += (cyy - cy * cy * inv2c + gyy[tid] * f_val) * inv2c; + Rzz[tid] += (czz - cz * cz * inv2c + gzz[tid] * f_val) * inv2c; + Rxy[tid] += (cxy - cx * cy * inv2c + gxy[tid] * f_val) * inv2c; + Rxz[tid] += (cxz - cx * cz * inv2c + gxz[tid] * f_val) * inv2c; + Ryz[tid] += (cyz - cy * cz * inv2c + gyz[tid] * f_val) * inv2c; +} + +__global__ __launch_bounds__(128, 4) +void kern_phase13_chi_correction( + const double* __restrict__ chin1, + const double* __restrict__ chix, const double* __restrict__ chiy, + const double* __restrict__ chiz, + const double* __restrict__ gxx, const double* __restrict__ gxy, + const double* __restrict__ gxz, const double* __restrict__ gyy, + const double* __restrict__ gyz, const double* __restrict__ gzz, + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ Gxxx, const double* __restrict__ Gxxy, + const double* __restrict__ Gxxz, const double* __restrict__ Gxyy, + const double* __restrict__ Gxyz, const double* __restrict__ Gxzz, + const double* __restrict__ Gyxx, const double* __restrict__ Gyxy, + const double* __restrict__ Gyxz, const double* __restrict__ Gyyy, + const double* __restrict__ Gyyz, const double* __restrict__ Gyzz, + const double* __restrict__ Gzxx, const double* __restrict__ Gzxy, + const double* __restrict__ Gzxz, const double* __restrict__ Gzyy, + const double* __restrict__ Gzyz, const double* __restrict__ Gzzz, + double* __restrict__ fxx, double* __restrict__ fxy, + double* __restrict__ fxz, double* __restrict__ fyy, + double* __restrict__ fyz, double* __restrict__ fzz, + double* __restrict__ Rxx, double* __restrict__ Rxy, + double* __restrict__ Rxz, double* __restrict__ Ryy, + double* __restrict__ Ryz, double* __restrict__ Rzz) +{ + const double H=0.5, TWO=2.0, F3o2=1.5; + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + double cx=chix[i],cy=chiy[i],cz=chiz[i],c1=chin1[i]; + /* subtract Christoffel * chi_deriv */ + fxx[i] -= Gxxx[i]*cx+Gyxx[i]*cy+Gzxx[i]*cz; + fxy[i] -= Gxxy[i]*cx+Gyxy[i]*cy+Gzxy[i]*cz; + fxz[i] -= Gxxz[i]*cx+Gyxz[i]*cy+Gzxz[i]*cz; + fyy[i] -= Gxyy[i]*cx+Gyyy[i]*cy+Gzyy[i]*cz; + fyz[i] -= Gxyz[i]*cx+Gyyz[i]*cy+Gzyz[i]*cz; + fzz[i] -= Gxzz[i]*cx+Gyzz[i]*cy+Gzzz[i]*cz; + + double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i]; + double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i]; + double f_val = uxx*(fxx[i]-F3o2/c1*cx*cx) + + uyy*(fyy[i]-F3o2/c1*cy*cy) + + uzz*(fzz[i]-F3o2/c1*cz*cz) + + TWO*uxy*(fxy[i]-F3o2/c1*cx*cy) + + TWO*uxz*(fxz[i]-F3o2/c1*cx*cz) + + TWO*uyz*(fyz[i]-F3o2/c1*cy*cz); + + double inv2c = 1.0/(c1*TWO); + Rxx[i] += (fxx[i]-cx*cx*inv2c+gxx[i]*f_val)*inv2c; + Ryy[i] += (fyy[i]-cy*cy*inv2c+gyy[i]*f_val)*inv2c; + Rzz[i] += (fzz[i]-cz*cz*inv2c+gzz[i]*f_val)*inv2c; + Rxy[i] += (fxy[i]-cx*cy*inv2c+gxy[i]*f_val)*inv2c; + Rxz[i] += (fxz[i]-cx*cz*inv2c+gxz[i]*f_val)*inv2c; + Ryz[i] += (fyz[i]-cy*cz*inv2c+gyz[i]*f_val)*inv2c; + } +} + +/* Phase 15: trK_rhs, Aij_rhs, gauge. + * Also updates Christoffel with physical chi correction and computes Lap second derivatives on the fly. + */ +__global__ __launch_bounds__(128, 4) +void kern_phase15_trK_Aij_gauge( + const double* __restrict__ alpn1, const double* __restrict__ chin1, + const double* __restrict__ chix, const double* __restrict__ chiy, + const double* __restrict__ chiz, + const double* __restrict__ gxx, const double* __restrict__ gxy, + const double* __restrict__ gxz, const double* __restrict__ gyy, + const double* __restrict__ gyz, const double* __restrict__ gzz, + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ trK, + const double* __restrict__ Axx, const double* __restrict__ Axy, + const double* __restrict__ Axz, const double* __restrict__ Ayy, + const double* __restrict__ Ayz, const double* __restrict__ Azz, + const double* __restrict__ Lapx, const double* __restrict__ Lapy, + const double* __restrict__ Lapz, + const double* __restrict__ betaxx, const double* __restrict__ betaxy, + const double* __restrict__ betaxz, const double* __restrict__ betayx, + const double* __restrict__ betayy, const double* __restrict__ betayz, + const double* __restrict__ betazx, const double* __restrict__ betazy, + const double* __restrict__ betazz, + const double* __restrict__ rho, + const double* __restrict__ Sx_m, const double* __restrict__ Sy_m, + const double* __restrict__ Sz_m, + const double* __restrict__ Sxx_m, const double* __restrict__ Sxy_m, + const double* __restrict__ Sxz_m, const double* __restrict__ Syy_m, + const double* __restrict__ Syz_m, const double* __restrict__ Szz_m, + const double* __restrict__ dtSfx, const double* __restrict__ dtSfy, + const double* __restrict__ dtSfz, + const double* __restrict__ Rxx, const double* __restrict__ Rxy, + const double* __restrict__ Rxz, const double* __restrict__ Ryy, + const double* __restrict__ Ryz, const double* __restrict__ Rzz, + double* __restrict__ Gxxx, double* __restrict__ Gxxy, + double* __restrict__ Gxxz, double* __restrict__ Gxyy, + double* __restrict__ Gxyz_o, double* __restrict__ Gxzz, + double* __restrict__ Gyxx, double* __restrict__ Gyxy, + double* __restrict__ Gyxz, double* __restrict__ Gyyy, + double* __restrict__ Gyyz, double* __restrict__ Gyzz, + double* __restrict__ Gzxx, double* __restrict__ Gzxy, + double* __restrict__ Gzxz, double* __restrict__ Gzyy, + double* __restrict__ Gzyz, double* __restrict__ Gzzz, + double* __restrict__ dtSfx_rhs, double* __restrict__ dtSfy_rhs, + double* __restrict__ dtSfz_rhs, + double* __restrict__ trK_rhs, + double* __restrict__ Axx_rhs, double* __restrict__ Axy_rhs, + double* __restrict__ Axz_rhs, double* __restrict__ Ayy_rhs, + double* __restrict__ Ayz_rhs, double* __restrict__ Azz_rhs, + double* __restrict__ Lap_rhs, + double* __restrict__ betax_rhs, double* __restrict__ betay_rhs, + double* __restrict__ betaz_rhs, + double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs, + double* __restrict__ Gamz_rhs, + double* __restrict__ f_arr, double* __restrict__ S_arr) +{ + const double TWO=2.0, FOUR=4.0, EIGHT=8.0, H=0.5; + const double F1o3=1.0/3.0, F2o3=2.0/3.0, F3o2=1.5; + const double PI_V=3.14159265358979323846; + const double F16=16.0, F8=8.0; + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int i0 = i % nx; + const int j0 = (i / nx) % ny; + const int k0 = i / (nx * ny); + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF; + double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i]; + double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i]; + double a=alpn1[i], c1=chin1[i]; + double cx=chix[i],cy=chiy[i],cz=chiz[i]; + double lx=Lapx[i],ly=Lapy[i],lz=Lapz[i]; + double fxx_v = 0.0, fxy_v = 0.0, fxz_v = 0.0; + double fyy_v = 0.0, fyz_v = 0.0, fzz_v = 0.0; + + if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) { + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + const double c = fetch_sym_ord2_direct(alpn1, iF, jF, kF, 1, 1, 1); + fxx_v = d_gp.Fdxdx * ( + -fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF, 1, 1, 1) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF, 1, 1, 1)); + fyy_v = d_gp.Fdydy * ( + -fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF, 1, 1, 1) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF, 1, 1, 1)); + fzz_v = d_gp.Fdzdz * ( + -fetch_sym_ord2_direct(alpn1, iF, jF, kF - 2, 1, 1, 1) + +16.0 * fetch_sym_ord2_direct(alpn1, iF, jF, kF - 1, 1, 1, 1) + -30.0 * c + +16.0 * fetch_sym_ord2_direct(alpn1, iF, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF, kF + 2, 1, 1, 1)); + + const double t_jm2 = + fetch_sym_ord2_direct(alpn1, iF - 2, jF - 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF - 2, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF - 2, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF - 2, kF, 1, 1, 1); + const double t_jm1 = + fetch_sym_ord2_direct(alpn1, iF - 2, jF - 1, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF - 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF - 1, kF, 1, 1, 1); + const double t_jp1 = + fetch_sym_ord2_direct(alpn1, iF - 2, jF + 1, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF + 1, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF + 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF + 1, kF, 1, 1, 1); + const double t_jp2 = + fetch_sym_ord2_direct(alpn1, iF - 2, jF + 2, kF, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF + 2, kF, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF + 2, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF + 2, kF, 1, 1, 1); + fxy_v = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2); + + const double t_km2_x = + fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF - 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF - 2, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF - 2, 1, 1, 1); + const double t_km1_x = + fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF - 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF - 1, 1, 1, 1); + const double t_kp1_x = + fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF + 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF + 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF + 1, 1, 1, 1); + const double t_kp2_x = + fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF + 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF + 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF + 2, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF + 2, 1, 1, 1); + fxz_v = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x); + + const double t_km2_y = + fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF - 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF - 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF - 2, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF - 2, 1, 1, 1); + const double t_km1_y = + fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF - 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF - 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF - 1, 1, 1, 1); + const double t_kp1_y = + fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF + 1, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF + 1, 1, 1, 1); + const double t_kp2_y = + fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF + 2, 1, 1, 1) + - 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 2, 1, 1, 1) + + 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 2, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF + 2, 1, 1, 1); + fyz_v = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y); + } + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + const double c = fetch_sym_ord2_direct(alpn1, iF, jF, kF, 1, 1, 1); + fxx_v = d_gp.Sdxdx * ( + fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF, 1, 1, 1) + - 2.0 * c + + fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF, 1, 1, 1)); + fyy_v = d_gp.Sdydy * ( + fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF, 1, 1, 1) + - 2.0 * c + + fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF, 1, 1, 1)); + fzz_v = d_gp.Sdzdz * ( + fetch_sym_ord2_direct(alpn1, iF, jF, kF - 1, 1, 1, 1) + - 2.0 * c + + fetch_sym_ord2_direct(alpn1, iF, jF, kF + 1, 1, 1, 1)); + fxy_v = d_gp.Sdxdy * ( + fetch_sym_ord2_direct(alpn1, iF - 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 1, jF - 1, kF, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF - 1, jF + 1, kF, 1, 1, 1) + + fetch_sym_ord2_direct(alpn1, iF + 1, jF + 1, kF, 1, 1, 1)); + fxz_v = d_gp.Sdxdz * ( + fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF + 1, 1, 1, 1) + + fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF + 1, 1, 1, 1)); + fyz_v = d_gp.Sdydz * ( + fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF - 1, 1, 1, 1) + - fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1) + + fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1)); + } + } + + /* raised chi/chi */ + double gx=(uxx*cx+uxy*cy+uxz*cz)/c1; + double gy=(uxy*cx+uyy*cy+uyz*cz)/c1; + double gz=(uxz*cx+uyz*cy+uzz*cz)/c1; + + /* Christoffel physical correction */ + Gxxx[i]-=((cx+cx)/c1-gxx[i]*gx)*H; + Gyxx[i]-=(0.0-gxx[i]*gy)*H; + Gzxx[i]-=(0.0-gxx[i]*gz)*H; + Gxyy[i]-=(0.0-gyy[i]*gx)*H; + Gyyy[i]-=((cy+cy)/c1-gyy[i]*gy)*H; + Gzyy[i]-=(0.0-gyy[i]*gz)*H; + Gxzz[i]-=(0.0-gzz[i]*gx)*H; + Gyzz[i]-=(0.0-gzz[i]*gy)*H; + Gzzz[i]-=((cz+cz)/c1-gzz[i]*gz)*H; + Gxxy[i]-=(cy/c1-gxy[i]*gx)*H; + Gyxy[i]-=(cx/c1-gxy[i]*gy)*H; + Gzxy[i]-=(0.0-gxy[i]*gz)*H; + Gxxz[i]-=(cz/c1-gxz[i]*gx)*H; + Gyxz[i]-=(0.0-gxz[i]*gy)*H; + Gzxz[i]-=(cx/c1-gxz[i]*gz)*H; + Gxyz_o[i]-=(0.0-gyz[i]*gx)*H; + Gyyz[i]-=(cz/c1-gyz[i]*gy)*H; + Gzyz[i]-=(cy/c1-gyz[i]*gz)*H; + + /* Lap second-derivative correction: subtract Gamma*Lap_deriv */ + fxx_v -= Gxxx[i]*lx+Gyxx[i]*ly+Gzxx[i]*lz; + fyy_v -= Gxyy[i]*lx+Gyyy[i]*ly+Gzyy[i]*lz; + fzz_v -= Gxzz[i]*lx+Gyzz[i]*ly+Gzzz[i]*lz; + fxy_v -= Gxxy[i]*lx+Gyxy[i]*ly+Gzxy[i]*lz; + fxz_v -= Gxxz[i]*lx+Gyxz[i]*ly+Gzxz[i]*lz; + fyz_v -= Gxyz_o[i]*lx+Gyyz[i]*ly+Gzyz[i]*lz; + + /* D^i D_i alpha */ + double DDA = uxx*fxx_v+uyy*fyy_v+uzz*fzz_v + +TWO*(uxy*fxy_v+uxz*fxz_v+uyz*fyz_v); + + /* trace of S_ij (physical) */ + double S_v = c1*(uxx*Sxx_m[i]+uyy*Syy_m[i]+uzz*Szz_m[i] + +TWO*(uxy*Sxy_m[i]+uxz*Sxz_m[i]+uyz*Syz_m[i])); + + /* A^ij A_ij */ + double AijAij = + uxx*(uxx*Axx[i]*Axx[i]+uyy*Axy[i]*Axy[i]+uzz*Axz[i]*Axz[i] + +TWO*(uxy*Axx[i]*Axy[i]+uxz*Axx[i]*Axz[i]+uyz*Axy[i]*Axz[i])) + +uyy*(uxx*Axy[i]*Axy[i]+uyy*Ayy[i]*Ayy[i]+uzz*Ayz[i]*Ayz[i] + +TWO*(uxy*Axy[i]*Ayy[i]+uxz*Axy[i]*Ayz[i]+uyz*Ayy[i]*Ayz[i])) + +uzz*(uxx*Axz[i]*Axz[i]+uyy*Ayz[i]*Ayz[i]+uzz*Azz[i]*Azz[i] + +TWO*(uxy*Axz[i]*Ayz[i]+uxz*Axz[i]*Azz[i]+uyz*Ayz[i]*Azz[i])) + +TWO*( + uxy*(uxx*Axx[i]*Axy[i]+uyy*Axy[i]*Ayy[i]+uzz*Axz[i]*Ayz[i] + +uxy*(Axx[i]*Ayy[i]+Axy[i]*Axy[i]) + +uxz*(Axx[i]*Ayz[i]+Axz[i]*Axy[i]) + +uyz*(Axy[i]*Ayz[i]+Axz[i]*Ayy[i])) + +uxz*(uxx*Axx[i]*Axz[i]+uyy*Axy[i]*Ayz[i]+uzz*Axz[i]*Azz[i] + +uxy*(Axx[i]*Ayz[i]+Axy[i]*Axz[i]) + +uxz*(Axx[i]*Azz[i]+Axz[i]*Axz[i]) + +uyz*(Axy[i]*Azz[i]+Axz[i]*Ayz[i])) + +uyz*(uxx*Axy[i]*Axz[i]+uyy*Ayy[i]*Ayz[i]+uzz*Ayz[i]*Azz[i] + +uxy*(Axy[i]*Ayz[i]+Ayy[i]*Axz[i]) + +uxz*(Axy[i]*Azz[i]+Ayz[i]*Axz[i]) + +uyz*(Ayy[i]*Azz[i]+Ayz[i]*Ayz[i]))); + + double trK_v = trK[i]; + double db = betaxx[i] + betayy[i] + betazz[i]; + + /* trK_rhs step 1: store D^iD_i alpha * chin1 */ + trK_rhs[i] = c1 * DDA; + + /* f_arr = -(1/3) * (DDA + alpha/chi * (2/3*K^2 - AijAij - 16pi*rho + 8pi*S)) */ + double f_v = F2o3*trK_v*trK_v - AijAij - F16*PI_V*rho[i] + EIGHT*PI_V*S_v; + f_arr[i] = -F1o3*(uxx*fxx_v+uyy*fyy_v+uzz*fzz_v + +TWO*(uxy*fxy_v+uxz*fxz_v+uyz*fyz_v) + +(a/c1)*f_v); + + /* fij = alpha*(Rij - 8pi*Sij) - D_iD_j alpha */ + double fij_xx=a*(Rxx[i]-EIGHT*PI_V*Sxx_m[i])-fxx_v; + double fij_xy=a*(Rxy[i]-EIGHT*PI_V*Sxy_m[i])-fxy_v; + double fij_xz=a*(Rxz[i]-EIGHT*PI_V*Sxz_m[i])-fxz_v; + double fij_yy=a*(Ryy[i]-EIGHT*PI_V*Syy_m[i])-fyy_v; + double fij_yz=a*(Ryz[i]-EIGHT*PI_V*Syz_m[i])-fyz_v; + double fij_zz=a*(Rzz[i]-EIGHT*PI_V*Szz_m[i])-fzz_v; + + /* Aij_rhs = chi*(fij - gij*f) */ + Axx_rhs[i]=fij_xx-gxx[i]*f_arr[i]; + Ayy_rhs[i]=fij_yy-gyy[i]*f_arr[i]; + Azz_rhs[i]=fij_zz-gzz[i]*f_arr[i]; + Axy_rhs[i]=fij_xy-gxy[i]*f_arr[i]; + Axz_rhs[i]=fij_xz-gxz[i]*f_arr[i]; + Ayz_rhs[i]=fij_yz-gyz[i]*f_arr[i]; + + /* A_il A^l_j */ + double AA_xx=uxx*Axx[i]*Axx[i]+uyy*Axy[i]*Axy[i]+uzz*Axz[i]*Axz[i] + +TWO*(uxy*Axx[i]*Axy[i]+uxz*Axx[i]*Axz[i]+uyz*Axy[i]*Axz[i]); + double AA_yy=uxx*Axy[i]*Axy[i]+uyy*Ayy[i]*Ayy[i]+uzz*Ayz[i]*Ayz[i] + +TWO*(uxy*Axy[i]*Ayy[i]+uxz*Axy[i]*Ayz[i]+uyz*Ayy[i]*Ayz[i]); + double AA_zz=uxx*Axz[i]*Axz[i]+uyy*Ayz[i]*Ayz[i]+uzz*Azz[i]*Azz[i] + +TWO*(uxy*Axz[i]*Ayz[i]+uxz*Axz[i]*Azz[i]+uyz*Ayz[i]*Azz[i]); + double AA_xy=uxx*Axx[i]*Axy[i]+uyy*Axy[i]*Ayy[i]+uzz*Axz[i]*Ayz[i] + +uxy*(Axx[i]*Ayy[i]+Axy[i]*Axy[i]) + +uxz*(Axx[i]*Ayz[i]+Axz[i]*Axy[i]) + +uyz*(Axy[i]*Ayz[i]+Axz[i]*Ayy[i]); + double AA_xz=uxx*Axx[i]*Axz[i]+uyy*Axy[i]*Ayz[i]+uzz*Axz[i]*Azz[i] + +uxy*(Axx[i]*Ayz[i]+Axy[i]*Axz[i]) + +uxz*(Axx[i]*Azz[i]+Axz[i]*Axz[i]) + +uyz*(Axy[i]*Azz[i]+Axz[i]*Ayz[i]); + double AA_yz=uxx*Axy[i]*Axz[i]+uyy*Ayy[i]*Ayz[i]+uzz*Ayz[i]*Azz[i] + +uxy*(Axy[i]*Ayz[i]+Ayy[i]*Axz[i]) + +uxz*(Axy[i]*Azz[i]+Ayz[i]*Axz[i]) + +uyz*(Ayy[i]*Azz[i]+Ayz[i]*Ayz[i]); + + /* trK_rhs final */ + trK_rhs[i] = -trK_rhs[i] + + a*(F1o3*trK_v*trK_v + +uxx*AA_xx+uyy*AA_yy+uzz*AA_zz + +TWO*(uxy*AA_xy+uxz*AA_xz+uyz*AA_yz) + +FOUR*PI_V*(rho[i]+S_v)); + + /* Aij_rhs final */ + Axx_rhs[i]=c1*Axx_rhs[i]+a*(trK_v*Axx[i]-TWO*AA_xx) + +TWO*(Axx[i]*betaxx[i]+Axy[i]*betayx[i]+Axz[i]*betazx[i])-F2o3*Axx[i]*db; + Ayy_rhs[i]=c1*Ayy_rhs[i]+a*(trK_v*Ayy[i]-TWO*AA_yy) + +TWO*(Axy[i]*betaxy[i]+Ayy[i]*betayy[i]+Ayz[i]*betazy[i])-F2o3*Ayy[i]*db; + Azz_rhs[i]=c1*Azz_rhs[i]+a*(trK_v*Azz[i]-TWO*AA_zz) + +TWO*(Axz[i]*betaxz[i]+Ayz[i]*betayz[i]+Azz[i]*betazz[i])-F2o3*Azz[i]*db; + Axy_rhs[i]=c1*Axy_rhs[i]+a*(trK_v*Axy[i]-TWO*AA_xy) + +Axx[i]*betaxy[i]+Axz[i]*betazy[i]+Ayy[i]*betayx[i] + +Ayz[i]*betazx[i]+F1o3*Axy[i]*db-Axy[i]*betazz[i]; + Ayz_rhs[i]=c1*Ayz_rhs[i]+a*(trK_v*Ayz[i]-TWO*AA_yz) + +Axy[i]*betaxz[i]+Ayy[i]*betayz[i]+Axz[i]*betaxy[i] + +Azz[i]*betazy[i]+F1o3*Ayz[i]*db-Ayz[i]*betaxx[i]; + Axz_rhs[i]=c1*Axz_rhs[i]+a*(trK_v*Axz[i]-TWO*AA_xz) + +Axx[i]*betaxz[i]+Axy[i]*betayz[i]+Ayz[i]*betayx[i] + +Azz[i]*betazx[i]+F1o3*Axz[i]*db-Axz[i]*betayy[i]; + + /* gauge */ + Lap_rhs[i] = -TWO*a*trK_v; + betax_rhs[i] = 0.75*dtSfx[i]; + betay_rhs[i] = 0.75*dtSfy[i]; + betaz_rhs[i] = 0.75*dtSfz[i]; +#if (GAUGE == 0) + dtSfx_rhs[i] = Gamx_rhs[i] - 2.0*dtSfx[i]; + dtSfy_rhs[i] = Gamy_rhs[i] - 2.0*dtSfy[i]; + dtSfz_rhs[i] = Gamz_rhs[i] - 2.0*dtSfz[i]; +#endif + } +} + +/* Phase 18: Hamilton & momentum constraints (co==0 only) */ +__global__ __launch_bounds__(128, 4) +void kern_phase18_constraints( + const double* __restrict__ chin1, + const double* __restrict__ chix, const double* __restrict__ chiy, + const double* __restrict__ chiz, + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ trK, + const double* __restrict__ Axx, const double* __restrict__ Axy, + const double* __restrict__ Axz, const double* __restrict__ Ayy, + const double* __restrict__ Ayz, const double* __restrict__ Azz, + const double* __restrict__ Rxx, const double* __restrict__ Rxy, + const double* __restrict__ Rxz, const double* __restrict__ Ryy, + const double* __restrict__ Ryz, const double* __restrict__ Rzz, + const double* __restrict__ rho, + const double* __restrict__ Sx_m, const double* __restrict__ Sy_m, + const double* __restrict__ Sz_m, + const double* __restrict__ Kx, const double* __restrict__ Ky, + const double* __restrict__ Kz, + const double* __restrict__ Gxxx, const double* __restrict__ Gxxy, + const double* __restrict__ Gxxz, const double* __restrict__ Gxyy, + const double* __restrict__ Gxyz, const double* __restrict__ Gxzz, + const double* __restrict__ Gyxx, const double* __restrict__ Gyxy, + const double* __restrict__ Gyxz, const double* __restrict__ Gyyy, + const double* __restrict__ Gyyz, const double* __restrict__ Gyzz, + const double* __restrict__ Gzxx, const double* __restrict__ Gzxy, + const double* __restrict__ Gzxz, const double* __restrict__ Gzyy, + const double* __restrict__ Gzyz, const double* __restrict__ Gzzz, + /* dA/dx arrays (fderivs of Aij) */ + const double* __restrict__ dAxx_x, const double* __restrict__ dAxx_y, + const double* __restrict__ dAxx_z, + const double* __restrict__ dAxy_x, const double* __restrict__ dAxy_y, + const double* __restrict__ dAxy_z, + const double* __restrict__ dAxz_x, const double* __restrict__ dAxz_y, + const double* __restrict__ dAxz_z, + const double* __restrict__ dAyy_x, const double* __restrict__ dAyy_y, + const double* __restrict__ dAyy_z, + const double* __restrict__ dAyz_x, const double* __restrict__ dAyz_y, + const double* __restrict__ dAyz_z, + const double* __restrict__ dAzz_x, const double* __restrict__ dAzz_y, + const double* __restrict__ dAzz_z, + double* __restrict__ ham_Res, + double* __restrict__ movx_Res, double* __restrict__ movy_Res, + double* __restrict__ movz_Res) +{ + const double TWO=2.0, F2o3=2.0/3.0, F8=8.0, F16=16.0; + const double PI_V=3.14159265358979323846; + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; + i += blockDim.x*gridDim.x) + { + double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i]; + double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i]; + double c1=chin1[i]; + + /* Hamiltonian constraint */ + double R_sc = uxx*Rxx[i]+uyy*Ryy[i]+uzz*Rzz[i] + +TWO*(uxy*Rxy[i]+uxz*Rxz[i]+uyz*Ryz[i]); + /* AijAij (same as in phase15) */ + double AijAij = + uxx*(uxx*Axx[i]*Axx[i]+uyy*Axy[i]*Axy[i]+uzz*Axz[i]*Axz[i] + +TWO*(uxy*Axx[i]*Axy[i]+uxz*Axx[i]*Axz[i]+uyz*Axy[i]*Axz[i])) + +uyy*(uxx*Axy[i]*Axy[i]+uyy*Ayy[i]*Ayy[i]+uzz*Ayz[i]*Ayz[i] + +TWO*(uxy*Axy[i]*Ayy[i]+uxz*Axy[i]*Ayz[i]+uyz*Ayy[i]*Ayz[i])) + +uzz*(uxx*Axz[i]*Axz[i]+uyy*Ayz[i]*Ayz[i]+uzz*Azz[i]*Azz[i] + +TWO*(uxy*Axz[i]*Ayz[i]+uxz*Axz[i]*Azz[i]+uyz*Ayz[i]*Azz[i])) + +TWO*(uxy*(uxx*Axx[i]*Axy[i]+uyy*Axy[i]*Ayy[i]+uzz*Axz[i]*Ayz[i] + +uxy*(Axx[i]*Ayy[i]+Axy[i]*Axy[i]) + +uxz*(Axx[i]*Ayz[i]+Axz[i]*Axy[i]) + +uyz*(Axy[i]*Ayz[i]+Axz[i]*Ayy[i])) + +uxz*(uxx*Axx[i]*Axz[i]+uyy*Axy[i]*Ayz[i]+uzz*Axz[i]*Azz[i] + +uxy*(Axx[i]*Ayz[i]+Axy[i]*Axz[i]) + +uxz*(Axx[i]*Azz[i]+Axz[i]*Axz[i]) + +uyz*(Axy[i]*Azz[i]+Axz[i]*Ayz[i])) + +uyz*(uxx*Axy[i]*Axz[i]+uyy*Ayy[i]*Ayz[i]+uzz*Ayz[i]*Azz[i] + +uxy*(Axy[i]*Ayz[i]+Ayy[i]*Axz[i]) + +uxz*(Axy[i]*Azz[i]+Ayz[i]*Axz[i]) + +uyz*(Ayy[i]*Azz[i]+Ayz[i]*Ayz[i]))); + + ham_Res[i] = c1*R_sc + F2o3*trK[i]*trK[i] - AijAij - F16*PI_V*rho[i]; + + /* Momentum constraints: need covariant derivative of A */ + double cx=chix[i],cy=chiy[i],cz=chiz[i]; + /* D_j A^j_x etc — subtract Christoffel and chi terms */ + /* gxxx = dAxx_x - 2*Gxxx*Axx - ... - chix*Axx/chin1 etc */ + double mx_xx = dAxx_x[i]-(Gxxx[i]*Axx[i]+Gyxx[i]*Axy[i]+Gzxx[i]*Axz[i] + +Gxxx[i]*Axx[i]+Gyxx[i]*Axy[i]+Gzxx[i]*Axz[i])-cx*Axx[i]/c1; + double mx_xy = dAxy_x[i]-(Gxxy[i]*Axx[i]+Gyxy[i]*Axy[i]+Gzxy[i]*Axz[i] + +Gxxx[i]*Axy[i]+Gyxx[i]*Ayy[i]+Gzxx[i]*Ayz[i])-cx*Axy[i]/c1; + double mx_xz = dAxz_x[i]-(Gxxz[i]*Axx[i]+Gyxz[i]*Axy[i]+Gzxz[i]*Axz[i] + +Gxxx[i]*Axz[i]+Gyxx[i]*Ayz[i]+Gzxx[i]*Azz[i])-cx*Axz[i]/c1; + double mx_yy = dAyy_x[i]-(Gxxy[i]*Axy[i]+Gyxy[i]*Ayy[i]+Gzxy[i]*Ayz[i] + +Gxxy[i]*Axy[i]+Gyxy[i]*Ayy[i]+Gzxy[i]*Ayz[i])-cx*Ayy[i]/c1; + double mx_yz = dAyz_x[i]-(Gxxz[i]*Axy[i]+Gyxz[i]*Ayy[i]+Gzxz[i]*Ayz[i] + +Gxxy[i]*Axz[i]+Gyxy[i]*Ayz[i]+Gzxy[i]*Azz[i])-cx*Ayz[i]/c1; + double mx_zz = dAzz_x[i]-(Gxxz[i]*Axz[i]+Gyxz[i]*Ayz[i]+Gzxz[i]*Azz[i] + +Gxxz[i]*Axz[i]+Gyxz[i]*Ayz[i]+Gzxz[i]*Azz[i])-cx*Azz[i]/c1; + + double my_xx = dAxx_y[i]-(Gxxy[i]*Axx[i]+Gyxy[i]*Axy[i]+Gzxy[i]*Axz[i] + +Gxxy[i]*Axx[i]+Gyxy[i]*Axy[i]+Gzxy[i]*Axz[i])-cy*Axx[i]/c1; + double my_xy = dAxy_y[i]-(Gxyy[i]*Axx[i]+Gyyy[i]*Axy[i]+Gzyy[i]*Axz[i] + +Gxxy[i]*Axy[i]+Gyxy[i]*Ayy[i]+Gzxy[i]*Ayz[i])-cy*Axy[i]/c1; + double my_xz = dAxz_y[i]-(Gxyz[i]*Axx[i]+Gyyz[i]*Axy[i]+Gzyz[i]*Axz[i] + +Gxxy[i]*Axz[i]+Gyxy[i]*Ayz[i]+Gzxy[i]*Azz[i])-cy*Axz[i]/c1; + double my_yy = dAyy_y[i]-(Gxyy[i]*Axy[i]+Gyyy[i]*Ayy[i]+Gzyy[i]*Ayz[i] + +Gxyy[i]*Axy[i]+Gyyy[i]*Ayy[i]+Gzyy[i]*Ayz[i])-cy*Ayy[i]/c1; + double my_yz = dAyz_y[i]-(Gxyz[i]*Axy[i]+Gyyz[i]*Ayy[i]+Gzyz[i]*Ayz[i] + +Gxyy[i]*Axz[i]+Gyyy[i]*Ayz[i]+Gzyy[i]*Azz[i])-cy*Ayz[i]/c1; + double my_zz = dAzz_y[i]-(Gxyz[i]*Axz[i]+Gyyz[i]*Ayz[i]+Gzyz[i]*Azz[i] + +Gxyz[i]*Axz[i]+Gyyz[i]*Ayz[i]+Gzyz[i]*Azz[i])-cy*Azz[i]/c1; + + double mz_xx = dAxx_z[i]-(Gxxz[i]*Axx[i]+Gyxz[i]*Axy[i]+Gzxz[i]*Axz[i] + +Gxxz[i]*Axx[i]+Gyxz[i]*Axy[i]+Gzxz[i]*Axz[i])-cz*Axx[i]/c1; + double mz_xy = dAxy_z[i]-(Gxyz[i]*Axx[i]+Gyyz[i]*Axy[i]+Gzyz[i]*Axz[i] + +Gxxz[i]*Axy[i]+Gyxz[i]*Ayy[i]+Gzxz[i]*Ayz[i])-cz*Axy[i]/c1; + double mz_xz = dAxz_z[i]-(Gxzz[i]*Axx[i]+Gyzz[i]*Axy[i]+Gzzz[i]*Axz[i] + +Gxxz[i]*Axz[i]+Gyxz[i]*Ayz[i]+Gzxz[i]*Azz[i])-cz*Axz[i]/c1; + double mz_yy = dAyy_z[i]-(Gxyz[i]*Axy[i]+Gyyz[i]*Ayy[i]+Gzyz[i]*Ayz[i] + +Gxyz[i]*Axy[i]+Gyyz[i]*Ayy[i]+Gzyz[i]*Ayz[i])-cz*Ayy[i]/c1; + double mz_yz = dAyz_z[i]-(Gxzz[i]*Axy[i]+Gyzz[i]*Ayy[i]+Gzzz[i]*Ayz[i] + +Gxyz[i]*Axz[i]+Gyyz[i]*Ayz[i]+Gzyz[i]*Azz[i])-cz*Ayz[i]/c1; + double mz_zz = dAzz_z[i]-(Gxzz[i]*Axz[i]+Gyzz[i]*Ayz[i]+Gzzz[i]*Azz[i] + +Gxzz[i]*Axz[i]+Gyzz[i]*Ayz[i]+Gzzz[i]*Azz[i])-cz*Azz[i]/c1; + + movx_Res[i] = uxx*mx_xx+uyy*my_xy+uzz*mz_xz + +uxy*mx_xy+uxz*mx_xz+uyz*my_xz + +uxy*my_xx+uxz*mz_xx+uyz*mz_xy + - F2o3*Kx[i] - F8*PI_V*Sx_m[i]; + movy_Res[i] = uxx*mx_xy+uyy*my_yy+uzz*mz_yz + +uxy*mx_yy+uxz*mx_yz+uyz*my_yz + +uxy*my_xy+uxz*mz_xy+uyz*mz_yy + - F2o3*Ky[i] - F8*PI_V*Sy_m[i]; + movz_Res[i] = uxx*mx_xz+uyy*my_yz+uzz*mz_zz + +uxy*mx_yz+uxz*mx_zz+uyz*my_zz + +uxy*my_xz+uxz*mz_xz+uyz*mz_yz + - F2o3*Kz[i] - F8*PI_V*Sz_m[i]; + } +} + +static void setup_grid_params(int *ex, + double *X, double *Y, double *Z, + int Symmetry, double eps, int co) +{ + const int nx = ex[0]; + const int ny = ex[1]; + const int nz = ex[2]; + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + const int NO_SYMM = 0; + const int EQ_SYMM = 1; + + ensure_gpu_buffers(nx, ny, nz); + + GridParams gp = {}; + gp.ex[0] = nx; + gp.ex[1] = ny; + gp.ex[2] = nz; + gp.all = nx * ny * nz; + gp.dX = dX; + gp.dY = dY; + gp.dZ = dZ; + gp.d12dx = 1.0 / (12.0 * dX); + gp.d12dy = 1.0 / (12.0 * dY); + gp.d12dz = 1.0 / (12.0 * dZ); + gp.d2dx = 1.0 / (2.0 * dX); + gp.d2dy = 1.0 / (2.0 * dY); + gp.d2dz = 1.0 / (2.0 * dZ); + gp.Fdxdx = 1.0 / (12.0 * dX * dX); + gp.Fdydy = 1.0 / (12.0 * dY * dY); + gp.Fdzdz = 1.0 / (12.0 * dZ * dZ); + gp.Sdxdx = 1.0 / (dX * dX); + gp.Sdydy = 1.0 / (dY * dY); + gp.Sdzdz = 1.0 / (dZ * dZ); + gp.Fdxdy = 1.0 / (144.0 * dX * dY); + gp.Fdxdz = 1.0 / (144.0 * dX * dZ); + gp.Fdydz = 1.0 / (144.0 * dY * dZ); + gp.Sdxdy = 0.25 / (dX * dY); + gp.Sdxdz = 0.25 / (dX * dZ); + gp.Sdydz = 0.25 / (dY * dZ); + gp.iminF = 1; + gp.jminF = 1; + gp.kminF = 1; + gp.imaxF = nx; + gp.jmaxF = ny; + gp.kmaxF = nz; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = -1; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = -1; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = -1; + gp.iminF3 = 1; + gp.jminF3 = 1; + gp.kminF3 = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = -2; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = -2; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = -2; + gp.Symmetry = Symmetry; + gp.eps = eps; + gp.co = co; + gp.fh2_nx = nx + 2; + gp.fh2_ny = ny + 2; + gp.fh2_nz = nz + 2; + gp.fh3_nx = nx + 3; + gp.fh3_ny = ny + 3; + gp.fh3_nz = nz + 3; + upload_grid_params_if_needed(gp); +} + +static void compute_patch_boundary_flags(int *ex, + double *X, double *Y, double *Z, + const double *bbox, + int Symmetry, + int &touch_xmin, int &touch_xmax, + int &touch_ymin, int &touch_ymax, + int &touch_zmin, int &touch_zmax) +{ + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + const int NO_SYMM = 0; + const int OCTANT = 2; + + touch_xmax = (std::fabs(X[ex[0] - 1] - bbox[3]) < dX) ? 1 : 0; + touch_ymax = (std::fabs(Y[ex[1] - 1] - bbox[4]) < dY) ? 1 : 0; + touch_zmax = (std::fabs(Z[ex[2] - 1] - bbox[5]) < dZ) ? 1 : 0; + + touch_xmin = (std::fabs(X[0] - bbox[0]) < dX && + !(Symmetry == OCTANT && std::fabs(bbox[0]) < dX / 2.0)) ? 1 : 0; + touch_ymin = (std::fabs(Y[0] - bbox[1]) < dY && + !(Symmetry == OCTANT && std::fabs(bbox[1]) < dY / 2.0)) ? 1 : 0; + touch_zmin = (std::fabs(Z[0] - bbox[2]) < dZ && + !(Symmetry > NO_SYMM && std::fabs(bbox[2]) < dZ / 2.0)) ? 1 : 0; +} + +static void upload_state_inputs(double **state_host, size_t all) +{ + const size_t bytes = all * sizeof(double); + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes); + } + CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage, + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyHostToDevice)); +} + +static void upload_matter_cache(StepContext &ctx, + double **matter_host, + size_t all) +{ + const size_t bytes = all * sizeof(double); + for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { + std::memcpy(g_buf.h_stage + (size_t)i * all, matter_host[i], bytes); + } + CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage, + (size_t)BSSN_MATTER_COUNT * bytes, + cudaMemcpyHostToDevice)); + ctx.matter_ready = true; +} + +static void zero_matter_cache(StepContext &ctx, size_t all) +{ + CUDA_CHECK(cudaMemset(ctx.d_matter_mem, 0, + (size_t)BSSN_MATTER_COUNT * all * sizeof(double))); + ctx.matter_ready = true; +} + +static void bind_matter_slots(const StepContext &ctx) +{ + for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { + g_buf.slot[k_matter_slots[i]] = ctx.d_matter[i]; + } +} + +static void bind_state_input_slots(const std::array &state) +{ + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + g_buf.slot[k_state_input_slots[i]] = state[i]; + } +} + +static void bind_state_output_slots(const std::array &state) +{ + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + g_buf.slot[k_state_rhs_slots[i]] = state[i]; + } +} + +static void launch_rhs_pipeline(int all, double eps, int co) +{ + const double SYM = 1.0; + const double ANTI = -1.0; + + #define D(s) g_buf.slot[s] + kern_phase1_prep<<>>( + D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz), + D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz)); + + { + double *src_fields[] = { + D(S_betax), D(S_betay), D(S_betaz), D(S_chi), + D(S_dxx), D(S_gxy), D(S_gxz), D(S_dyy), + D(S_gyz), D(S_dzz), D(S_Lap), D(S_trK) + }; + double *fx_fields[] = { + D(S_betaxx), D(S_betayx), D(S_betazx), D(S_chix), + D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), + D(S_gyzx), D(S_gzzx), D(S_Lapx), D(S_Kx) + }; + double *fy_fields[] = { + D(S_betaxy), D(S_betayy), D(S_betazy), D(S_chiy), + D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), + D(S_gyzy), D(S_gzzy), D(S_Lapy), D(S_Ky) + }; + double *fz_fields[] = { + D(S_betaxz), D(S_betayz), D(S_betazz), D(S_chiz), + D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), + D(S_gyzz), D(S_gzzz), D(S_Lapz), D(S_Kz) + }; + const int soa_signs[] = { + (int)ANTI, (int)SYM, (int)SYM, + (int)SYM, (int)ANTI, (int)SYM, + (int)SYM, (int)SYM, (int)ANTI, + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)SYM, (int)SYM, + (int)ANTI, (int)ANTI, (int)SYM, + (int)ANTI, (int)SYM, (int)ANTI, + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)ANTI, (int)ANTI, + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)SYM, (int)SYM + }; + gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])), + src_fields, fx_fields, fy_fields, fz_fields, + soa_signs, all); + } + + kern_phase2_metric_rhs<<>>( + D(S_alpn1), D(S_chin1), + D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), + D(S_trK), + D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), + D(S_betaxx), D(S_betaxy), D(S_betaxz), + D(S_betayx), D(S_betayy), D(S_betayz), + D(S_betazx), D(S_betazy), D(S_betazz), + D(S_chi_rhs), D(S_gxx_rhs), D(S_gyy_rhs), D(S_gzz_rhs), + D(S_gxy_rhs), D(S_gyz_rhs), D(S_gxz_rhs)); + + kern_phase2_inverse<<>>( + D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz)); + + if (co == 0) { + kern_phase3_gamma_constraint<<>>( + D(S_Gamx), D(S_Gamy), D(S_Gamz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx), + D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy), + D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz), + D(S_Gmx_Res), D(S_Gmy_Res), D(S_Gmz_Res)); + } + + kern_phase4_christoffel<<>>( + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx), + D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy), + D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz), + D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz), + D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz), + D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz), + D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), + D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), + D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz)); + + kern_phase5_6_gamma_rhs_part1_fused<<>>( + D(S_Lapx), D(S_Lapy), D(S_Lapz), + D(S_alpn1), D(S_chin1), + D(S_chix), D(S_chiy), D(S_chiz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), + D(S_Kx), D(S_Ky), D(S_Kz), + D(S_Sx), D(S_Sy), D(S_Sz), + D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz), + D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz), + D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz), + D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), + D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), + D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), + D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs)); + + { + double *src_fields[] = {D(S_betax), D(S_betay), D(S_betaz)}; + double *fxx_fields[] = {D(S_gxxx), D(S_gxxy), D(S_gxxz)}; + double *fxy_fields[] = {D(S_gxyx), D(S_gxyy), D(S_gxyz)}; + double *fxz_fields[] = {D(S_gxzx), D(S_gxzy), D(S_gxzz)}; + double *fyy_fields[] = {D(S_gyyx), D(S_gyyy), D(S_gyyz)}; + double *fyz_fields[] = {D(S_gyzx), D(S_gyzy), D(S_gyzz)}; + double *fzz_fields[] = {D(S_gzzx), D(S_gzzy), D(S_gzzz)}; + const int soa_signs[] = { + (int)ANTI, (int)SYM, (int)SYM, + (int)SYM, (int)ANTI, (int)SYM, + (int)SYM, (int)SYM, (int)ANTI + }; + gpu_fdderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])), + src_fields, fxx_fields, fxy_fields, fxz_fields, + fyy_fields, fyz_fields, fzz_fields, + soa_signs, all); + } + { + double *src_fields[] = {D(S_Gamx), D(S_Gamy), D(S_Gamz)}; + double *fx_fields[] = {D(S_Gamxx), D(S_Gamyx), D(S_Gamzx)}; + double *fy_fields[] = {D(S_Gamxy), D(S_Gamyy_t), D(S_Gamzy)}; + double *fz_fields[] = {D(S_Gamxz), D(S_Gamyz_t), D(S_Gamzz_t)}; + const int soa_signs[] = { + (int)ANTI, (int)SYM, (int)SYM, + (int)SYM, (int)ANTI, (int)SYM, + (int)SYM, (int)SYM, (int)ANTI + }; + gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])), + src_fields, fx_fields, fy_fields, fz_fields, + soa_signs, all); + } + + kern_phase8_9_gamma_rhs_contract_fused<<>>( + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), + D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), + D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz), + D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz), + D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz), + D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz), + D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz), + D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz), + D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz), + D(S_betaxx),D(S_betaxy),D(S_betaxz), + D(S_betayx),D(S_betayy),D(S_betayz), + D(S_betazx),D(S_betazy),D(S_betazz), + D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz), + D(S_Gamx_rhs),D(S_Gamy_rhs),D(S_Gamz_rhs), + D(S_Gamxa),D(S_Gamya),D(S_Gamza), + D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), + D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), + D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz)); + + { + double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)}; + double *dst_fields[] = {D(S_Rxx), D(S_Ryy), D(S_Rzz), D(S_Rxy), D(S_Rxz), D(S_Ryz)}; + const int soa_signs[] = { + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)SYM, (int)SYM, + (int)ANTI, (int)ANTI, (int)SYM, + (int)ANTI, (int)SYM, (int)ANTI, + (int)SYM, (int)ANTI, (int)ANTI + }; + gpu_phase10_ricci_batch(D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + src_fields, dst_fields, soa_signs, all); + } + + kern_phase11_ricci_fused<<>>( + D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz), + D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), + D(S_Gamxa),D(S_Gamya),D(S_Gamza), + D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), + D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), + D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), + D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz), + D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz), + D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz), + D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz), + D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz), + D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz), + D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), + D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), + D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz), + D(S_Rxx),D(S_Rxy),D(S_Rxz), + D(S_Ryy),D(S_Ryz),D(S_Rzz)); + + kern_phase12_13_chi_correction_fused<<>>( + D(S_chi), D(S_chin1), + D(S_chix), D(S_chiy), D(S_chiz), + D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz), + D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz), + D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz), + D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), + D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), + D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), + D(S_Rxx), D(S_Rxy), D(S_Rxz), + D(S_Ryy), D(S_Ryz), D(S_Rzz)); + + kern_phase15_trK_Aij_gauge<<>>( + D(S_alpn1), D(S_chin1), + D(S_chix), D(S_chiy), D(S_chiz), + D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_trK), + D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), + D(S_Lapx), D(S_Lapy), D(S_Lapz), + D(S_betaxx), D(S_betaxy), D(S_betaxz), + D(S_betayx), D(S_betayy), D(S_betayz), + D(S_betazx), D(S_betazy), D(S_betazz), + D(S_rho), + D(S_Sx), D(S_Sy), D(S_Sz), + D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz), + D(S_dtSfx), D(S_dtSfy), D(S_dtSfz), + D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz), + D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz), + D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz), + D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz), + D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), + D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), + D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), + D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs), + D(S_trK_rhs), + D(S_Axx_rhs), D(S_Axy_rhs), D(S_Axz_rhs), + D(S_Ayy_rhs), D(S_Ayz_rhs), D(S_Azz_rhs), + D(S_Lap_rhs), + D(S_betax_rhs), D(S_betay_rhs), D(S_betaz_rhs), + D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs), + D(S_f_arr), D(S_S_arr)); + + gpu_lopsided_kodis_state_batch(eps, all); + + if (co == 0) { + { + double *src_fields[] = {D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz)}; + double *fx_fields[] = {D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx)}; + double *fy_fields[] = {D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy)}; + double *fz_fields[] = {D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz)}; + const int soa_signs[] = { + (int)SYM, (int)SYM, (int)SYM, + (int)ANTI, (int)ANTI, (int)SYM, + (int)ANTI, (int)SYM, (int)ANTI, + (int)SYM, (int)SYM, (int)SYM, + (int)SYM, (int)ANTI, (int)ANTI, + (int)SYM, (int)SYM, (int)SYM + }; + gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])), + src_fields, fx_fields, fy_fields, fz_fields, + soa_signs, all); + } + + kern_phase18_constraints<<>>( + D(S_chin1), + D(S_chix), D(S_chiy), D(S_chiz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_trK), + D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), + D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz), + D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz), + D(S_Kx), D(S_Ky), D(S_Kz), + D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz), + D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz), + D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz), + D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), + D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), + D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), + D(S_gxxx), D(S_gxxy), D(S_gxxz), + D(S_gxyx), D(S_gxyy), D(S_gxyz), + D(S_gxzx), D(S_gxzy), D(S_gxzz), + D(S_gyyx), D(S_gyyy), D(S_gyyz), + D(S_gyzx), D(S_gyzy), D(S_gyzz), + D(S_gzzx), D(S_gzzy), D(S_gzzz), + D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res)); + } + + #undef D +} + +static void download_state_outputs(double **state_host_out, size_t all) +{ + const size_t bytes = all * sizeof(double); + CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_chi_rhs], + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyDeviceToHost)); + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); + } +} + +static void download_constraint_outputs(double **constraint_host_out, size_t all) +{ + const size_t bytes = all * sizeof(double); + CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_ham_Res], + (size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes, + cudaMemcpyDeviceToHost)); + for (int i = 0; i < D2H_CONSTRAINT_SLOT_COUNT; ++i) { + std::memcpy(constraint_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); + } +} + +__global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + int i0, int j0, int k0, + int sx, int sy, int sz, + int region_all, + int state_count, + int all) +{ + const int state_index = blockIdx.y; + if (state_index >= state_count) return; + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int src = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny; + dst[(size_t)state_index * region_all + local] = + src_mem[(size_t)state_index * all + src]; + } +} + +__global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem, + const double * __restrict__ src, + int nx, int ny, + int i0, int j0, int k0, + int sx, int sy, int sz, + int region_all, + int state_count, + int all) +{ + const int state_index = blockIdx.y; + if (state_index >= state_count) return; + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int dst = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny; + dst_mem[(size_t)state_index * all + dst] = + src[(size_t)state_index * region_all + local]; + } +} + +__global__ void kern_pack_state_subset(const double * __restrict__ src_mem, + double * __restrict__ dst, + int subset_count, + int all) +{ + const int subset_slot = blockIdx.y; + if (subset_slot >= subset_count) return; + const int state_index = d_subset_state_indices[subset_slot]; + for (int src = blockIdx.x * blockDim.x + threadIdx.x; + src < all; + src += blockDim.x * gridDim.x) + { + dst[(size_t)subset_slot * all + src] = + src_mem[(size_t)state_index * all + src]; + } +} + +__global__ void kern_unpack_state_subset(double * __restrict__ dst_mem, + const double * __restrict__ src, + int subset_count, + int all) +{ + const int subset_slot = blockIdx.y; + if (subset_slot >= subset_count) return; + const int state_index = d_subset_state_indices[subset_slot]; + for (int dst = blockIdx.x * blockDim.x + threadIdx.x; + dst < all; + dst += blockDim.x * gridDim.x) + { + dst_mem[(size_t)state_index * all + dst] = + src[(size_t)subset_slot * all + dst]; + } +} + +static void copy_state_region_cuda(void *block_tag, + int state_index, + double *host_state, + const int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz, + cudaMemcpyKind kind) +{ + if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return; + if (sx <= 0 || sy <= 0 || sz <= 0) return; + + const size_t pitch = (size_t)ex[0] * sizeof(double); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + + cudaMemcpy3DParms p = {}; + p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz); + p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0); + p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0); + + if (kind == cudaMemcpyDeviceToHost) { + p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]); + p.dstPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]); + } else { + p.srcPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]); + p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]); + } + CUDA_CHECK(cudaMemcpy3D(&p)); +} + +static void copy_state_region_packed_cuda(void *block_tag, + int state_index, + double *host_buffer, + const int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz, + cudaMemcpyKind kind) +{ + if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return; + if (sx <= 0 || sy <= 0 || sz <= 0) return; + + const size_t src_pitch = (size_t)ex[0] * sizeof(double); + const size_t dst_pitch = (size_t)sx * sizeof(double); + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + + cudaMemcpy3DParms p = {}; + p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz); + + if (kind == cudaMemcpyDeviceToHost) { + p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]); + p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0); + p.dstPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy); + p.dstPos = make_cudaPos(0, 0, 0); + } else { + p.srcPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy); + p.srcPos = make_cudaPos(0, 0, 0); + p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]); + p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0); + } + + CUDA_CHECK(cudaMemcpy3D(&p)); +} + +static void copy_state_region_packed_batch_cuda(void *block_tag, + int state_count, + double *host_buffer, + const int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz, + cudaMemcpyKind kind) +{ + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (sx <= 0 || sy <= 0 || sz <= 0) return; + + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int region_all = sx * sy * sz; + const size_t total_doubles = (size_t)state_count * (size_t)region_all; + double *d_comm = ensure_step_comm_buffer(ctx, total_doubles); + + if (kind == cudaMemcpyDeviceToHost) { + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_pack_state_region_batch<<>>( + ctx.d_state_curr_mem, d_comm, ex[0], ex[1], + i0, j0, k0, sx, sy, sz, region_all, state_count, + ex[0] * ex[1] * ex[2]); + CUDA_CHECK(cudaMemcpy(host_buffer, d_comm, + total_doubles * sizeof(double), + cudaMemcpyDeviceToHost)); + } else { + CUDA_CHECK(cudaMemcpy(d_comm, host_buffer, + total_doubles * sizeof(double), + cudaMemcpyHostToDevice)); + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_unpack_state_region_batch<<>>( + ctx.d_state_curr_mem, d_comm, ex[0], ex[1], + i0, j0, k0, sx, sy, sz, region_all, state_count, + ex[0] * ex[1] * ex[2]); + } +} + +static void download_resident_state(void *block_tag, int *ex, double **state_host_out) +{ + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const size_t bytes = all * sizeof(double); + StepContext &ctx = ensure_step_ctx(block_tag, all); + CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem, + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyDeviceToHost)); + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); + } +} + +static void copy_state_subset(void *block_tag, + int *ex, + int subset_count, + const int *state_indices, + double **state_host, + cudaMemcpyKind kind) +{ + if (subset_count <= 0) return; + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const size_t bytes = all * sizeof(double); + StepContext &ctx = ensure_step_ctx(block_tag, all); + int active_state_indices[BSSN_STATE_COUNT]; + double *active_state_host[BSSN_STATE_COUNT]; + int active_count = 0; + + for (int i = 0; i < subset_count; ++i) { + const int state_index = state_indices[i]; + if (state_index < 0 || state_index >= BSSN_STATE_COUNT) continue; + if (!state_host[i]) continue; + active_state_indices[active_count] = state_index; + active_state_host[active_count] = state_host[i]; + ++active_count; + } + if (active_count <= 0) return; + + const size_t total_doubles = (size_t)active_count * all; + double *d_comm = ensure_step_comm_buffer(ctx, total_doubles); + double *h_comm = ensure_step_host_comm_buffer(ctx, total_doubles); + + CUDA_CHECK(cudaMemcpyToSymbol(d_subset_state_indices, active_state_indices, + (size_t)active_count * sizeof(int), + 0, cudaMemcpyHostToDevice)); + if (kind == cudaMemcpyDeviceToHost) { + dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count); + kern_pack_state_subset<<>>( + ctx.d_state_curr_mem, d_comm, active_count, (int)all); + CUDA_CHECK(cudaMemcpy(h_comm, d_comm, + total_doubles * sizeof(double), + cudaMemcpyDeviceToHost)); + for (int i = 0; i < active_count; ++i) { + std::memcpy(active_state_host[i], + h_comm + (size_t)i * all, + bytes); + } + } else { + for (int i = 0; i < active_count; ++i) { + std::memcpy(h_comm + (size_t)i * all, + active_state_host[i], + bytes); + } + CUDA_CHECK(cudaMemcpy(d_comm, h_comm, + total_doubles * sizeof(double), + cudaMemcpyHostToDevice)); + dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count); + kern_unpack_state_subset<<>>( + ctx.d_state_curr_mem, d_comm, active_count, (int)all); + } +} + +static bool has_resident_state(void *block_tag) +{ + auto it = g_step_ctx.find(block_tag); + return it != g_step_ctx.end() && it->second.state_ready; +} + + + +#define pow2(x) ((x) * (x)) +#define pow3(x) ((x) * (x) * (x)) +#define pow4(x) ((x) * (x) * (x) * (x)) + +__device__ __forceinline__ void z4c_rhs_point_device( + double A11, + double A12, + double A13, + double A22, + double A23, + double A33, + double alpha, + double B1, + double B2, + double B3, + double beta1, + double beta2, + double beta3, + double chi, + double chiDivFloor, + double da1, + double dA111, + double dA112, + double dA113, + double dA122, + double dA123, + double dA133, + double da2, + double dA211, + double dA212, + double dA213, + double dA222, + double dA223, + double dA233, + double da3, + double dA311, + double dA312, + double dA313, + double dA322, + double dA323, + double dA333, + double db11, + double dB11, + double db12, + double dB12, + double db13, + double dB13, + double db21, + double dB21, + double db22, + double dB22, + double db23, + double dB23, + double db31, + double dB31, + double db32, + double dB32, + double db33, + double dB33, + double dchi1, + double dchi2, + double dchi3, + double dda11, + double dda12, + double dda13, + double dda22, + double dda23, + double dda33, + double ddb111, + double ddb112, + double ddb113, + double ddb121, + double ddb122, + double ddb123, + double ddb131, + double ddb132, + double ddb133, + double ddb221, + double ddb222, + double ddb223, + double ddb231, + double ddb232, + double ddb233, + double ddb331, + double ddb332, + double ddb333, + double ddchi11, + double ddchi12, + double ddchi13, + double ddchi22, + double ddchi23, + double ddchi33, + double deldelg1111, + double deldelg1112, + double deldelg1113, + double deldelg1122, + double deldelg1123, + double deldelg1133, + double deldelg1211, + double deldelg1212, + double deldelg1213, + double deldelg1222, + double deldelg1223, + double deldelg1233, + double deldelg1311, + double deldelg1312, + double deldelg1313, + double deldelg1322, + double deldelg1323, + double deldelg1333, + double deldelg2211, + double deldelg2212, + double deldelg2213, + double deldelg2222, + double deldelg2223, + double deldelg2233, + double deldelg2311, + double deldelg2312, + double deldelg2313, + double deldelg2322, + double deldelg2323, + double deldelg2333, + double deldelg3311, + double deldelg3312, + double deldelg3313, + double deldelg3322, + double deldelg3323, + double deldelg3333, + double delG11, + double delg111, + double delg112, + double delg113, + double delG12, + double delg122, + double delg123, + double delG13, + double delg133, + double delG21, + double delg211, + double delg212, + double delg213, + double delG22, + double delg222, + double delg223, + double delG23, + double delg233, + double delG31, + double delg311, + double delg312, + double delg313, + double delG32, + double delg322, + double delg323, + double delG33, + double delg333, + double dKhat1, + double dKhat2, + double dKhat3, + double dTheta1, + double dTheta2, + double dTheta3, + double G1, + double g11, + double g12, + double g13, + double G2, + double g22, + double g23, + double G3, + double g33, + double kappa1, + double kappa2, + double Khat, + double &rA11, + double &rA12, + double &rA13, + double &rA22, + double &rA23, + double &rA33, + double &rchi, + double &rG1, + double &rg11, + double &rg12, + double &rg13, + double &rG2, + double &rg22, + double &rg23, + double &rG3, + double &rg33, + double &rKhat, + double &rTheta, + double Theta) +{ + double AA11; + double AA12; + double AA13; + double AA22; + double AA23; + double AA33; + double Ainv11; + double Ainv12; + double Ainv13; + double Ainv22; + double Ainv23; + double Ainv33; + double cAA; + double cdda11; + double cdda12; + double cdda13; + double cdda22; + double cdda23; + double cdda33; + double cddf11; + double cddf12; + double cddf13; + double cddf22; + double cddf23; + double cddf33; + double chiguard; + double chiguarded; + double chipsipower; + double ddf11; + double ddf12; + double ddf13; + double ddf22; + double ddf23; + double ddf33; + double detginv; + double df1; + double df2; + double df3; + double dGd11; + double dGd12; + double dGd13; + double dGd21; + double dGd22; + double dGd23; + double dGd31; + double dGd32; + double dGd33; + double dginv111; + double dginv112; + double dginv113; + double dginv122; + double dginv123; + double dginv133; + double dginv211; + double dginv212; + double dginv213; + double dginv222; + double dginv223; + double dginv233; + double dginv311; + double dginv312; + double dginv313; + double dginv322; + double dginv323; + double dginv333; + double divAinv1; + double divAinv2; + double divAinv3; + double divbeta; + double dK1; + double dK2; + double dK3; + double dphi1; + double dphi2; + double dphi3; + double dZ11; + double DZ11; + double dZ12; + double DZ12; + double dZ13; + double DZ13; + double dZ21; + double DZ21; + double dZ22; + double DZ22; + double dZ23; + double DZ23; + double dZ31; + double DZ31; + double dZ32; + double DZ32; + double dZ33; + double DZ33; + double dZinv11; + double DZinv11; + double dZinv12; + double DZinv12; + double dZinv13; + double DZinv13; + double dZinv21; + double DZinv21; + double dZinv22; + double DZinv22; + double dZinv23; + double DZinv23; + double dZinv31; + double DZinv31; + double dZinv32; + double DZinv32; + double dZinv33; + double DZinv33; + double DZsym11; + double DZsym12; + double DZsym13; + double DZsym21; + double DZsym22; + double DZsym23; + double DZsym31; + double DZsym32; + double DZsym33; + double f; + double ff; + double gamma111; + double gamma112; + double gamma113; + double gamma122; + double gamma123; + double gamma133; + double gamma211; + double gamma212; + double gamma213; + double gamma222; + double gamma223; + double gamma233; + double gamma311; + double gamma312; + double gamma313; + double gamma322; + double gamma323; + double gamma333; + double gammado111; + double gammado112; + double gammado113; + double gammado122; + double gammado123; + double gammado133; + double gammado211; + double gammado212; + double gammado213; + double gammado222; + double gammado223; + double gammado233; + double gammado311; + double gammado312; + double gammado313; + double gammado322; + double gammado323; + double gammado333; + double gammaF111; + double gammaF112; + double gammaF113; + double gammaF121; + double gammaF122; + double gammaF123; + double gammaF131; + double gammaF132; + double gammaF133; + double gammaF211; + double gammaF212; + double gammaF213; + double gammaF221; + double gammaF222; + double gammaF223; + double gammaF231; + double gammaF232; + double gammaF233; + double gammaF311; + double gammaF312; + double gammaF313; + double gammaF321; + double gammaF322; + double gammaF323; + double gammaF331; + double gammaF332; + double gammaF333; + double Gd1; + double Gd2; + double Gd3; + double Gfromg1; + double Gfromg2; + double Gfromg3; + double ginv11; + double ginv12; + double ginv13; + double ginv22; + double ginv23; + double ginv33; + double Hhat; + double K; + double lieA11; + double lieA12; + double lieA13; + double lieA22; + double lieA23; + double lieA33; + double liechi; + double lieg11; + double lieg12; + double lieg13; + double lieg22; + double lieg23; + double lieg33; + double oochipsipower; + double ootddivbeta1; + double ootddivbeta2; + double ootddivbeta3; + double pseudolieG1; + double pseudolieG2; + double pseudolieG3; + double psim4; + double R11; + double R12; + double R13; + double R22; + double R23; + double R33; + double Rhat; + double Rphi11; + double Rphi12; + double Rphi13; + double Rphi22; + double Rphi23; + double Rphi33; + double totdivbeta; + double trcdda; + double trcddf; + double trDZsym; + double Z1; + double Z2; + double Z3; + double Zinv1; + double Zinv2; + double Zinv3; + + chipsipower = + -4.; + + K = + Khat + 2. * Theta; + + dK1 = + dKhat1 + 2. * dTheta1; + + dK2 = + dKhat2 + 2. * dTheta2; + + dK3 = + dKhat3 + 2. * dTheta3; + + detginv = + 1 / (2. * g12 * g13 * g23 - g33 * pow2(g12) + g22 * (g11 * g33 - pow2(g13)) - + g11 * pow2(g23)); + + ginv11 = + detginv * (g22 * g33 - pow2(g23)); + + ginv12 = + detginv * (g13 * g23 - g12 * g33); + + ginv13 = + detginv * (-(g13 * g22) + g12 * g23); + + ginv22 = + detginv * (g11 * g33 - pow2(g13)); + + ginv23 = + detginv * (g12 * g13 - g11 * g23); + + ginv33 = + detginv * (g11 * g22 - pow2(g12)); + + dginv111 = + -2. * (delg123 * ginv12 * ginv13 + ginv11 * (delg112 * ginv12 + delg113 * ginv13)) - + delg111 * pow2(ginv11) - delg122 * pow2(ginv12) - delg133 * pow2(ginv13); + + dginv112 = + -(ginv11 * (delg111 * ginv12 + delg112 * ginv22 + delg113 * ginv23)) - + ginv12 * (delg113 * ginv13 + delg122 * ginv22 + delg123 * ginv23) - + ginv13 * (delg123 * ginv22 + delg133 * ginv23) - delg112 * pow2(ginv12); + + dginv113 = + -(ginv11 * (delg111 * ginv13 + delg112 * ginv23 + delg113 * ginv33)) - + ginv12 * (delg112 * ginv13 + delg122 * ginv23 + delg123 * ginv33) - + ginv13 * (delg123 * ginv23 + delg133 * ginv33) - delg113 * pow2(ginv13); + + dginv122 = + -2. * (delg123 * ginv22 * ginv23 + ginv12 * (delg112 * ginv22 + delg113 * ginv23)) - + delg111 * pow2(ginv12) - delg122 * pow2(ginv22) - delg133 * pow2(ginv23); + + dginv123 = + -(ginv13 * (delg112 * ginv22 + delg113 * ginv23)) - delg133 * ginv23 * ginv33 - + ginv12 * (delg111 * ginv13 + delg112 * ginv23 + delg113 * ginv33) - + ginv22 * (delg122 * ginv23 + delg123 * ginv33) - delg123 * pow2(ginv23); + + dginv133 = + -2. * (delg123 * ginv23 * ginv33 + ginv13 * (delg112 * ginv23 + delg113 * ginv33)) - + delg111 * pow2(ginv13) - delg122 * pow2(ginv23) - delg133 * pow2(ginv33); + + dginv211 = + -2. * (delg223 * ginv12 * ginv13 + ginv11 * (delg212 * ginv12 + delg213 * ginv13)) - + delg211 * pow2(ginv11) - delg222 * pow2(ginv12) - delg233 * pow2(ginv13); + + dginv212 = + -(ginv11 * (delg211 * ginv12 + delg212 * ginv22 + delg213 * ginv23)) - + ginv12 * (delg213 * ginv13 + delg222 * ginv22 + delg223 * ginv23) - + ginv13 * (delg223 * ginv22 + delg233 * ginv23) - delg212 * pow2(ginv12); + + dginv213 = + -(ginv11 * (delg211 * ginv13 + delg212 * ginv23 + delg213 * ginv33)) - + ginv12 * (delg212 * ginv13 + delg222 * ginv23 + delg223 * ginv33) - + ginv13 * (delg223 * ginv23 + delg233 * ginv33) - delg213 * pow2(ginv13); + + dginv222 = + -2. * (delg223 * ginv22 * ginv23 + ginv12 * (delg212 * ginv22 + delg213 * ginv23)) - + delg211 * pow2(ginv12) - delg222 * pow2(ginv22) - delg233 * pow2(ginv23); + + dginv223 = + -(ginv13 * (delg212 * ginv22 + delg213 * ginv23)) - delg233 * ginv23 * ginv33 - + ginv12 * (delg211 * ginv13 + delg212 * ginv23 + delg213 * ginv33) - + ginv22 * (delg222 * ginv23 + delg223 * ginv33) - delg223 * pow2(ginv23); + + dginv233 = + -2. * (delg223 * ginv23 * ginv33 + ginv13 * (delg212 * ginv23 + delg213 * ginv33)) - + delg211 * pow2(ginv13) - delg222 * pow2(ginv23) - delg233 * pow2(ginv33); + + dginv311 = + -2. * (delg323 * ginv12 * ginv13 + ginv11 * (delg312 * ginv12 + delg313 * ginv13)) - + delg311 * pow2(ginv11) - delg322 * pow2(ginv12) - delg333 * pow2(ginv13); + + dginv312 = + -(ginv11 * (delg311 * ginv12 + delg312 * ginv22 + delg313 * ginv23)) - + ginv12 * (delg313 * ginv13 + delg322 * ginv22 + delg323 * ginv23) - + ginv13 * (delg323 * ginv22 + delg333 * ginv23) - delg312 * pow2(ginv12); + + dginv313 = + -(ginv11 * (delg311 * ginv13 + delg312 * ginv23 + delg313 * ginv33)) - + ginv12 * (delg312 * ginv13 + delg322 * ginv23 + delg323 * ginv33) - + ginv13 * (delg323 * ginv23 + delg333 * ginv33) - delg313 * pow2(ginv13); + + dginv322 = + -2. * (delg323 * ginv22 * ginv23 + ginv12 * (delg312 * ginv22 + delg313 * ginv23)) - + delg311 * pow2(ginv12) - delg322 * pow2(ginv22) - delg333 * pow2(ginv23); + + dginv323 = + -(ginv13 * (delg312 * ginv22 + delg313 * ginv23)) - delg333 * ginv23 * ginv33 - + ginv12 * (delg311 * ginv13 + delg312 * ginv23 + delg313 * ginv33) - + ginv22 * (delg322 * ginv23 + delg323 * ginv33) - delg323 * pow2(ginv23); + + dginv333 = + -2. * (delg323 * ginv23 * ginv33 + ginv13 * (delg312 * ginv23 + delg313 * ginv33)) - + delg311 * pow2(ginv13) - delg322 * pow2(ginv23) - delg333 * pow2(ginv33); + + gammado111 = + 0.5 * delg111; + + gammado112 = + 0.5 * delg211; + + gammado113 = + 0.5 * delg311; + + gammado122 = + -0.5 * delg122 + delg212; + + gammado123 = + 0.5 * (-delg123 + delg213 + delg312); + + gammado133 = + -0.5 * delg133 + delg313; + + gammado211 = + delg112 - 0.5 * delg211; + + gammado212 = + 0.5 * delg122; + + gammado213 = + 0.5 * (delg123 - delg213 + delg312); + + gammado222 = + 0.5 * delg222; + + gammado223 = + 0.5 * delg322; + + gammado233 = + -0.5 * delg233 + delg323; + + gammado311 = + delg113 - 0.5 * delg311; + + gammado312 = + 0.5 * (delg123 + delg213 - delg312); + + gammado313 = + 0.5 * delg133; + + gammado322 = + delg223 - 0.5 * delg322; + + gammado323 = + 0.5 * delg233; + + gammado333 = + 0.5 * delg333; + + gamma111 = + gammado111 * ginv11 + gammado211 * ginv12 + gammado311 * ginv13; + + gamma112 = + gammado112 * ginv11 + gammado212 * ginv12 + gammado312 * ginv13; + + gamma113 = + gammado113 * ginv11 + gammado213 * ginv12 + gammado313 * ginv13; + + gamma122 = + gammado122 * ginv11 + gammado222 * ginv12 + gammado322 * ginv13; + + gamma123 = + gammado123 * ginv11 + gammado223 * ginv12 + gammado323 * ginv13; + + gamma133 = + gammado133 * ginv11 + gammado233 * ginv12 + gammado333 * ginv13; + + gamma211 = + gammado111 * ginv12 + gammado211 * ginv22 + gammado311 * ginv23; + + gamma212 = + gammado112 * ginv12 + gammado212 * ginv22 + gammado312 * ginv23; + + gamma213 = + gammado113 * ginv12 + gammado213 * ginv22 + gammado313 * ginv23; + + gamma222 = + gammado122 * ginv12 + gammado222 * ginv22 + gammado322 * ginv23; + + gamma223 = + gammado123 * ginv12 + gammado223 * ginv22 + gammado323 * ginv23; + + gamma233 = + gammado133 * ginv12 + gammado233 * ginv22 + gammado333 * ginv23; + + gamma311 = + gammado111 * ginv13 + gammado211 * ginv23 + gammado311 * ginv33; + + gamma312 = + gammado112 * ginv13 + gammado212 * ginv23 + gammado312 * ginv33; + + gamma313 = + gammado113 * ginv13 + gammado213 * ginv23 + gammado313 * ginv33; + + gamma322 = + gammado122 * ginv13 + gammado222 * ginv23 + gammado322 * ginv33; + + gamma323 = + gammado123 * ginv13 + gammado223 * ginv23 + gammado323 * ginv33; + + gamma333 = + gammado133 * ginv13 + gammado233 * ginv23 + gammado333 * ginv33; + + Gfromg1 = + gamma111 * ginv11 + gamma122 * ginv22 + + 2. * (gamma112 * ginv12 + gamma113 * ginv13 + gamma123 * ginv23) + gamma133 * ginv33; + + Gfromg2 = + gamma211 * ginv11 + gamma222 * ginv22 + + 2. * (gamma212 * ginv12 + gamma213 * ginv13 + gamma223 * ginv23) + gamma233 * ginv33; + + Gfromg3 = + gamma311 * ginv11 + gamma322 * ginv22 + + 2. * (gamma312 * ginv12 + gamma313 * ginv13 + gamma323 * ginv23) + gamma333 * ginv33; + + R11 = + delG11 * g11 + delG12 * g12 + delG13 * g13 + gammado111 * Gfromg1 + + gammado112 * Gfromg2 + gammado113 * Gfromg3 + + (-0.5 * deldelg1111 + 3. * gamma111 * gammado111 + + 2. * (gamma211 * gammado112 + gamma311 * gammado113) + + gamma211 * gammado211 + gamma311 * gammado311) * + ginv11 + + (-deldelg1211 + 3. * (gamma112 * gammado111 + gamma111 * gammado112) + + 2. * (gamma212 * gammado112 + gamma312 * gammado113 + + gamma211 * gammado122 + gamma311 * gammado123) + + gamma212 * gammado211 + + gamma211 * gammado212 + gamma312 * gammado311 + gamma311 * gammado312) * + ginv12 + + (-deldelg1311 + 3. * (gamma113 * gammado111 + gamma111 * gammado113) + + 2. * (gamma213 * gammado112 + gamma313 * gammado113 + + gamma211 * gammado123 + gamma311 * gammado133) + + gamma213 * gammado211 + + gamma211 * gammado213 + gamma313 * gammado311 + gamma311 * gammado313) * + ginv13 + + (-0.5 * deldelg2211 + 3. * gamma112 * gammado112 + + 2. * (gamma212 * gammado122 + gamma312 * gammado123) + + gamma212 * gammado212 + gamma312 * gammado312) * + ginv22 + + (-deldelg2311 + 3. * (gamma113 * gammado112 + gamma112 * gammado113) + + 2. * (gamma213 * gammado122 + (gamma212 + gamma313) * gammado123 + + gamma312 * gammado133) + + gamma213 * gammado212 + gamma212 * gammado213 + + gamma313 * gammado312 + gamma312 * gammado313) * + ginv23 + + (-0.5 * deldelg3311 + 3. * gamma113 * gammado113 + + 2. * (gamma213 * gammado123 + gamma313 * gammado133) + gamma213 * gammado213 + + gamma313 * gammado313) * + ginv33; + + R12 = + 0.5 * (delG21 * g11 + (delG11 + delG22) * g12 + delG23 * g13 + delG12 * g22 + + delG13 * g23 + (gammado112 + gammado211) * Gfromg1 + + (gammado122 + gammado212) * Gfromg2 + (gammado123 + gammado213) * Gfromg3) + + (-0.5 * deldelg1112 + gamma112 * gammado111 + + (gamma111 + gamma212) * gammado112 + gamma312 * gammado113 + + gamma111 * gammado211 + 2. * gamma211 * gammado212 + + gamma311 * (gammado213 + gammado312)) * + ginv11 + + (-deldelg1212 + gamma122 * gammado111 + + (2. * gamma112 + gamma222) * gammado112 + gamma322 * gammado113 + + (gamma111 + gamma212) * gammado122 + gamma112 * gammado211 + + (gamma111 + 2. * gamma212) * gammado212 + 2. * gamma211 * gammado222 + + gamma312 * (gammado123 + gammado213 + gammado312) + + gamma311 * (gammado223 + gammado322)) * + ginv12 + + (-deldelg1312 + gamma123 * gammado111 + (gamma113 + gamma223) * gammado112 + + (gamma112 + gamma323) * gammado113 + (gamma111 + gamma212) * gammado123 + + gamma312 * gammado133 + gamma113 * gammado211 + + (gamma111 + gamma313) * gammado213 + + 2. * (gamma213 * gammado212 + gamma211 * gammado223) + + gamma313 * gammado312 + gamma311 * (gammado233 + gammado323)) * + ginv13 + + (-0.5 * deldelg2212 + gamma122 * gammado112 + + (gamma112 + gamma222) * gammado122 + gamma322 * gammado123 + + gamma112 * gammado212 + 2. * gamma212 * gammado222 + + gamma312 * (gammado223 + gammado322)) * + ginv22 + + (-deldelg2312 + gamma123 * gammado112 + gamma122 * gammado113 + + (gamma113 + gamma223) * gammado122 + + (gamma112 + gamma222 + gamma323) * gammado123 + gamma322 * gammado133 + + gamma113 * gammado212 + gamma112 * gammado213 + + 2. * (gamma213 * gammado222 + gamma212 * gammado223) + + gamma313 * (gammado223 + gammado322) + + gamma312 * (gammado233 + gammado323)) * + ginv23 + + (-0.5 * deldelg3312 + gamma123 * gammado113 + + (gamma113 + gamma223) * gammado123 + gamma323 * gammado133 + + gamma113 * gammado213 + 2. * gamma213 * gammado223 + + gamma313 * (gammado233 + gammado323)) * + ginv33; + + R13 = + 0.5 * (delG31 * g11 + delG32 * g12 + (delG11 + delG33) * g13 + delG12 * g23 + + delG13 * g33 + (gammado113 + gammado311) * Gfromg1 + + (gammado123 + gammado312) * Gfromg2 + (gammado133 + gammado313) * Gfromg3) + + (-0.5 * deldelg1113 + gamma113 * gammado111 + gamma213 * gammado112 + + (gamma111 + gamma313) * gammado113 + gamma111 * gammado311 + + gamma211 * (gammado213 + gammado312) + 2. * gamma311 * gammado313) * + ginv11 + + (-deldelg1213 + gamma123 * gammado111 + (gamma113 + gamma223) * gammado112 + + (gamma112 + gamma323) * gammado113 + gamma213 * gammado122 + + (gamma111 + gamma313) * gammado123 + gamma112 * gammado311 + + gamma111 * gammado312 + gamma212 * (gammado213 + gammado312) + + gamma211 * (gammado223 + gammado322) + + 2. * (gamma312 * gammado313 + gamma311 * gammado323)) * + ginv12 + + (-deldelg1313 + gamma133 * gammado111 + gamma233 * gammado112 + + (2. * gamma113 + gamma333) * gammado113 + + (gamma111 + gamma313) * gammado133 + gamma113 * gammado311 + + gamma213 * (gammado123 + gammado213 + gammado312) + + (gamma111 + 2. * gamma313) * gammado313 + + gamma211 * (gammado233 + gammado323) + 2. * gamma311 * gammado333) * + ginv13 + + (-0.5 * deldelg2213 + gamma123 * gammado112 + gamma223 * gammado122 + + (gamma112 + gamma323) * gammado123 + gamma112 * gammado312 + + gamma212 * (gammado223 + gammado322) + 2. * gamma312 * gammado323) * + ginv22 + + (-deldelg2313 + gamma133 * gammado112 + gamma123 * gammado113 + + gamma233 * gammado122 + (gamma113 + gamma223 + gamma333) * gammado123 + + (gamma112 + gamma323) * gammado133 + gamma113 * gammado312 + + gamma112 * gammado313 + gamma213 * (gammado223 + gammado322) + + gamma212 * (gammado233 + gammado323) + + 2. * (gamma313 * gammado323 + gamma312 * gammado333)) * + ginv23 + + (-0.5 * deldelg3313 + gamma133 * gammado113 + gamma233 * gammado123 + + (gamma113 + gamma333) * gammado133 + gamma113 * gammado313 + + gamma213 * (gammado233 + gammado323) + 2. * gamma313 * gammado333) * + ginv33; + + R22 = + delG21 * g12 + delG22 * g22 + delG23 * g23 + gammado212 * Gfromg1 + + gammado222 * Gfromg2 + gammado223 * Gfromg3 + + (-0.5 * deldelg1122 + gamma112 * (gammado112 + 2. * gammado211) + + 3. * gamma212 * gammado212 + gamma312 * (2. * gammado213 + gammado312)) * + ginv11 + + (-deldelg1222 + gamma122 * (gammado112 + 2. * gammado211) + + gamma112 * (gammado122 + 2. * gammado212) + + 3. * (gamma222 * gammado212 + gamma212 * gammado222) + + 2. * (gamma322 * gammado213 + gamma312 * gammado223) + + gamma322 * gammado312 + gamma312 * gammado322) * + ginv12 + + (-deldelg1322 + gamma123 * (gammado112 + 2. * gammado211) + + gamma112 * (gammado123 + 2. * gammado213) + + 3. * (gamma223 * gammado212 + gamma212 * gammado223) + + 2. * (gamma323 * gammado213 + gamma312 * gammado233) + + gamma323 * gammado312 + gamma312 * gammado323) * + ginv13 + + (-0.5 * deldelg2222 + gamma122 * (gammado122 + 2. * gammado212) + + 3. * gamma222 * gammado222 + gamma322 * (2. * gammado223 + gammado322)) * + ginv22 + + (-deldelg2322 + gamma123 * (gammado122 + 2. * gammado212) + + gamma122 * (gammado123 + 2. * gammado213) + + 3. * (gamma223 * gammado222 + gamma222 * gammado223) + + 2. * (gamma323 * gammado223 + gamma322 * gammado233) + + gamma323 * gammado322 + gamma322 * gammado323) * + ginv23 + + (-0.5 * deldelg3322 + gamma123 * (gammado123 + 2. * gammado213) + + 3. * gamma223 * gammado223 + gamma323 * (2. * gammado233 + gammado323)) * + ginv33; + + R23 = + 0.5 * (delG31 * g12 + delG21 * g13 + delG32 * g22 + (delG22 + delG33) * g23 + + delG23 * g33 + (gammado213 + gammado312) * Gfromg1 + + (gammado223 + gammado322) * Gfromg2 + (gammado233 + gammado323) * Gfromg3) + + (-0.5 * deldelg1123 + gamma113 * gammado211 + gamma213 * gammado212 + + (gamma212 + gamma313) * gammado213 + + gamma112 * (gammado113 + gammado311) + gamma212 * gammado312 + + 2. * gamma312 * gammado313) * + ginv11 + + (-deldelg1223 + gamma123 * gammado211 + (gamma113 + gamma223) * gammado212 + + (gamma222 + gamma323) * gammado213 + gamma213 * gammado222 + + (gamma212 + gamma313) * gammado223 + + gamma122 * (gammado113 + gammado311) + gamma222 * gammado312 + + gamma112 * (gammado123 + gammado312) + gamma212 * gammado322 + + 2. * (gamma322 * gammado313 + gamma312 * gammado323)) * + ginv12 + + (-deldelg1323 + gamma133 * gammado211 + gamma233 * gammado212 + + (gamma113 + gamma223 + gamma333) * gammado213 + gamma213 * gammado223 + + (gamma212 + gamma313) * gammado233 + + gamma123 * (gammado113 + gammado311) + gamma223 * gammado312 + + gamma112 * (gammado133 + gammado313) + gamma212 * gammado323 + + 2. * (gamma323 * gammado313 + gamma312 * gammado333)) * + ginv13 + + (-0.5 * deldelg2223 + gamma123 * gammado212 + gamma223 * gammado222 + + (gamma222 + gamma323) * gammado223 + + gamma122 * (gammado123 + gammado312) + gamma222 * gammado322 + + 2. * gamma322 * gammado323) * + ginv22 + + (-deldelg2323 + gamma133 * gammado212 + gamma233 * gammado222 + + (2. * gamma223 + gamma333) * gammado223 + + (gamma222 + gamma323) * gammado233 + + gamma123 * (gammado123 + gammado213 + gammado312) + + gamma122 * (gammado133 + gammado313) + gamma223 * gammado322 + + (gamma222 + 2. * gamma323) * gammado323 + 2. * gamma322 * gammado333) * + ginv23 + + (-0.5 * deldelg3323 + gamma133 * gammado213 + gamma233 * gammado223 + + (gamma223 + gamma333) * gammado233 + + gamma123 * (gammado133 + gammado313) + gamma223 * gammado323 + + 2. * gamma323 * gammado333) * + ginv33; + + R33 = + delG31 * g13 + delG32 * g23 + delG33 * g33 + gammado313 * Gfromg1 + + gammado323 * Gfromg2 + gammado333 * Gfromg3 + + (-0.5 * deldelg1133 + gamma113 * (gammado113 + 2. * gammado311) + + gamma213 * (gammado213 + 2. * gammado312) + 3. * gamma313 * gammado313) * + ginv11 + + (-deldelg1233 + gamma123 * (gammado113 + 2. * gammado311) + + gamma113 * (gammado123 + 2. * gammado312) + + gamma223 * (gammado213 + 2. * gammado312) + + gamma213 * (gammado223 + 2. * gammado322) + + 3. * (gamma323 * gammado313 + gamma313 * gammado323)) * + ginv12 + + (-deldelg1333 + gamma133 * (gammado113 + 2. * gammado311) + + gamma233 * (gammado213 + 2. * gammado312) + + gamma113 * (gammado133 + 2. * gammado313) + + gamma213 * (gammado233 + 2. * gammado323) + + 3. * (gamma333 * gammado313 + gamma313 * gammado333)) * + ginv13 + + (-0.5 * deldelg2233 + gamma123 * (gammado123 + 2. * gammado312) + + gamma223 * (gammado223 + 2. * gammado322) + 3. * gamma323 * gammado323) * + ginv22 + + (-deldelg2333 + gamma133 * (gammado123 + 2. * gammado312) + + gamma123 * (gammado133 + 2. * gammado313) + + gamma233 * (gammado223 + 2. * gammado322) + + gamma223 * (gammado233 + 2. * gammado323) + + 3. * (gamma333 * gammado323 + gamma323 * gammado333)) * + ginv23 + + (-0.5 * deldelg3333 + gamma133 * (gammado133 + 2. * gammado313) + + gamma233 * (gammado233 + 2. * gammado323) + 3. * gamma333 * gammado333) * + ginv33; + + chiguard = + chiDivFloor; + + chiguarded = + chi; + + if (chiguarded < chiguard) + chiguarded = chiguard; + + ff = + chiguarded; + + oochipsipower = + 1 / chipsipower; + + f = + oochipsipower * log(ff); + + psim4 = + exp(-4. * f); + + df1 = + (dchi1 * oochipsipower) / chiguarded; + + df2 = + (dchi2 * oochipsipower) / chiguarded; + + df3 = + (dchi3 * oochipsipower) / chiguarded; + + ddf11 = + (ddchi11 * oochipsipower) / chiguarded - chipsipower * pow2(df1); + + ddf12 = + -(chipsipower * df1 * df2) + (ddchi12 * oochipsipower) / chiguarded; + + ddf13 = + -(chipsipower * df1 * df3) + (ddchi13 * oochipsipower) / chiguarded; + + ddf22 = + (ddchi22 * oochipsipower) / chiguarded - chipsipower * pow2(df2); + + ddf23 = + -(chipsipower * df2 * df3) + (ddchi23 * oochipsipower) / chiguarded; + + ddf33 = + (ddchi33 * oochipsipower) / chiguarded - chipsipower * pow2(df3); + + cddf11 = + ddf11 - df1 * gamma111 - df2 * gamma211 - df3 * gamma311; + + cddf12 = + ddf12 - df1 * gamma112 - df2 * gamma212 - df3 * gamma312; + + cddf13 = + ddf13 - df1 * gamma113 - df2 * gamma213 - df3 * gamma313; + + cddf22 = + ddf22 - df1 * gamma122 - df2 * gamma222 - df3 * gamma322; + + cddf23 = + ddf23 - df1 * gamma123 - df2 * gamma223 - df3 * gamma323; + + cddf33 = + ddf33 - df1 * gamma133 - df2 * gamma233 - df3 * gamma333; + + trcddf = + cddf11 * ginv11 + cddf22 * ginv22 + + 2. * (cddf12 * ginv12 + cddf13 * ginv13 + cddf23 * ginv23) + cddf33 * ginv33; + + Rphi11 = + -2. * (cddf11 + g11 * trcddf) + (4. - 4. * g11 * ginv11) * pow2(df1) - + g11 * (8. * (df1 * (df2 * ginv12 + df3 * ginv13) + df2 * df3 * ginv23) + + 4. * (ginv22 * pow2(df2) + ginv33 * pow2(df3))); + + Rphi12 = + df1 * df2 * (4. - 8. * g12 * ginv12) - 2. * (cddf12 + g12 * trcddf) - + g12 * (8. * df3 * (df1 * ginv13 + df2 * ginv23) + + 4. * (ginv11 * pow2(df1) + ginv22 * pow2(df2) + ginv33 * pow2(df3))); + + Rphi13 = + df1 * (4. * df3 - 8. * df2 * g13 * ginv12) - 2. * (cddf13 + g13 * trcddf) - + g13 * (8. * df3 * (df1 * ginv13 + df2 * ginv23) + + 4. * (ginv11 * pow2(df1) + ginv22 * pow2(df2) + ginv33 * pow2(df3))); + + Rphi22 = + -2. * (cddf22 + g22 * trcddf) + (4. - 4. * g22 * ginv22) * pow2(df2) - + g22 * (8. * (df1 * (df2 * ginv12 + df3 * ginv13) + df2 * df3 * ginv23) + + 4. * (ginv11 * pow2(df1) + ginv33 * pow2(df3))); + + Rphi23 = + df2 * (-8. * df1 * g23 * ginv12 + df3 * (4. - 8. * g23 * ginv23)) - + 2. * (cddf23 + g23 * trcddf) - g23 * (8. * df1 * df3 * ginv13 + 4. * (ginv11 * pow2(df1) + ginv22 * pow2(df2) + ginv33 * pow2(df3))); + + Rphi33 = + -2. * (cddf33 + g33 * trcddf) - g33 * (8. * (df1 * (df2 * ginv12 + df3 * ginv13) + df2 * df3 * ginv23) + 4. * (ginv11 * pow2(df1) + ginv22 * pow2(df2))) + + (4. - 4. * g33 * ginv33) * pow2(df3); + + cdda11 = + dda11 - da2 * gamma211 - da3 * gamma311 + + da1 * (-gamma111 + df1 * (-4. + 2. * g11 * ginv11)) + + 2. * g11 * ((da2 * df1 + da1 * df2) * ginv12 + (da3 * df1 + da1 * df3) * ginv13 + da2 * df2 * ginv22 + (da3 * df2 + da2 * df3) * ginv23 + da3 * df3 * ginv33); + + cdda12 = + dda12 - da1 * gamma112 - da2 * gamma212 - da3 * gamma312 + + 2. * (-(da2 * df1) - da1 * df2 + g12 * (da1 * df1 * ginv11 + (da2 * df1 + da1 * df2) * ginv12 + (da3 * df1 + da1 * df3) * ginv13 + da2 * df2 * ginv22 + (da3 * df2 + da2 * df3) * ginv23 + da3 * df3 * ginv33)); + + cdda13 = + dda13 - da1 * gamma113 - da2 * gamma213 - da3 * gamma313 + + 2. * (-(da3 * df1) - da1 * df3 + g13 * (da1 * df1 * ginv11 + (da2 * df1 + da1 * df2) * ginv12 + (da3 * df1 + da1 * df3) * ginv13 + da2 * df2 * ginv22 + (da3 * df2 + da2 * df3) * ginv23 + da3 * df3 * ginv33)); + + cdda22 = + dda22 - da1 * gamma122 - da2 * (4. * df2 + gamma222) - da3 * gamma322 + + 2. * g22 * (da1 * df1 * ginv11 + (da2 * df1 + da1 * df2) * ginv12 + (da3 * df1 + da1 * df3) * ginv13 + da2 * df2 * ginv22 + (da3 * df2 + da2 * df3) * ginv23 + da3 * df3 * ginv33); + + cdda23 = + dda23 - da1 * gamma123 - da2 * gamma223 - da3 * gamma323 + + 2. * (-(da3 * df2) - da2 * df3 + g23 * (da1 * df1 * ginv11 + (da2 * df1 + da1 * df2) * ginv12 + (da3 * df1 + da1 * df3) * ginv13 + da2 * df2 * ginv22 + (da3 * df2 + da2 * df3) * ginv23 + da3 * df3 * ginv33)); + + cdda33 = + dda33 - da1 * gamma133 - da2 * gamma233 - da3 * (4. * df3 + gamma333) + + 2. * g33 * (da1 * df1 * ginv11 + (da2 * df1 + da1 * df2) * ginv12 + (da3 * df1 + da1 * df3) * ginv13 + da2 * df2 * ginv22 + (da3 * df2 + da2 * df3) * ginv23 + da3 * df3 * ginv33); + + trcdda = + (cdda11 * ginv11 + cdda22 * ginv22 + + 2. * (cdda12 * ginv12 + cdda13 * ginv13 + cdda23 * ginv23) + cdda33 * ginv33) * + psim4; + + AA11 = + 2. * (A11 * (A12 * ginv12 + A13 * ginv13) + A12 * A13 * ginv23) + ginv11 * pow2(A11) + + ginv22 * pow2(A12) + ginv33 * pow2(A13); + + AA12 = + (A12 * A13 + A11 * A23) * ginv13 + A12 * (A11 * ginv11 + A22 * ginv22) + + (A13 * A22 + A12 * A23) * ginv23 + A13 * A23 * ginv33 + ginv12 * (A11 * A22 + pow2(A12)); + + AA13 = + (A12 * A13 + A11 * A23) * ginv12 + A12 * A23 * ginv22 + (A13 * A23 + A12 * A33) * ginv23 + + A13 * (A11 * ginv11 + A33 * ginv33) + ginv13 * (A11 * A33 + pow2(A13)); + + AA22 = + 2. * (A12 * (A22 * ginv12 + A23 * ginv13) + A22 * A23 * ginv23) + ginv11 * pow2(A12) + + ginv22 * pow2(A22) + ginv33 * pow2(A23); + + AA23 = + A12 * A13 * ginv11 + (A13 * A22 + A12 * A23) * ginv12 + (A13 * A23 + A12 * A33) * ginv13 + + A23 * (A22 * ginv22 + A33 * ginv33) + ginv23 * (A22 * A33 + pow2(A23)); + + AA33 = + 2. * (A13 * (A23 * ginv12 + A33 * ginv13) + A23 * A33 * ginv23) + ginv11 * pow2(A13) + + ginv22 * pow2(A23) + ginv33 * pow2(A33); + + cAA = + AA11 * ginv11 + AA22 * ginv22 + 2. * (AA12 * ginv12 + AA13 * ginv13 + AA23 * ginv23) + + AA33 * ginv33; + + Ainv11 = + 2. * (A23 * ginv12 * ginv13 + ginv11 * (A12 * ginv12 + A13 * ginv13)) + + A11 * pow2(ginv11) + A22 * pow2(ginv12) + A33 * pow2(ginv13); + + Ainv12 = + ginv11 * (A11 * ginv12 + A12 * ginv22 + A13 * ginv23) + + ginv12 * (A13 * ginv13 + A22 * ginv22 + A23 * ginv23) + + ginv13 * (A23 * ginv22 + A33 * ginv23) + A12 * pow2(ginv12); + + Ainv13 = + ginv11 * (A11 * ginv13 + A12 * ginv23 + A13 * ginv33) + + ginv12 * (A12 * ginv13 + A22 * ginv23 + A23 * ginv33) + + ginv13 * (A23 * ginv23 + A33 * ginv33) + A13 * pow2(ginv13); + + Ainv22 = + 2. * (A23 * ginv22 * ginv23 + ginv12 * (A12 * ginv22 + A13 * ginv23)) + + A11 * pow2(ginv12) + A22 * pow2(ginv22) + A33 * pow2(ginv23); + + Ainv23 = + ginv13 * (A12 * ginv22 + A13 * ginv23) + A33 * ginv23 * ginv33 + + ginv12 * (A11 * ginv13 + A12 * ginv23 + A13 * ginv33) + + ginv22 * (A22 * ginv23 + A23 * ginv33) + A23 * pow2(ginv23); + + Ainv33 = + 2. * (A23 * ginv23 * ginv33 + ginv13 * (A12 * ginv23 + A13 * ginv33)) + + A11 * pow2(ginv13) + A22 * pow2(ginv23) + A33 * pow2(ginv33); + + divAinv1 = + (-1.5 * (Ainv11 * dchi1 + Ainv12 * dchi2 + Ainv13 * dchi3)) / chiguarded + + Ainv11 * gamma111 + Ainv22 * gamma122 + + 2. * (Ainv12 * gamma112 + Ainv13 * gamma113 + Ainv23 * gamma123) + + Ainv33 * gamma133 - (0.66666666666666666667 * dKhat1 + 0.33333333333333333333 * dTheta1) * ginv11 - + (0.66666666666666666667 * dKhat2 + 0.33333333333333333333 * dTheta2) * ginv12 - + (0.66666666666666666667 * dKhat3 + 0.33333333333333333333 * dTheta3) * ginv13; + + divAinv2 = + (-1.5 * (Ainv12 * dchi1 + Ainv22 * dchi2 + Ainv23 * dchi3)) / chiguarded + + Ainv11 * gamma211 + Ainv22 * gamma222 + + 2. * (Ainv12 * gamma212 + Ainv13 * gamma213 + Ainv23 * gamma223) + + Ainv33 * gamma233 - (0.66666666666666666667 * dKhat1 + 0.33333333333333333333 * dTheta1) * ginv12 - + (0.66666666666666666667 * dKhat2 + 0.33333333333333333333 * dTheta2) * ginv22 - + (0.66666666666666666667 * dKhat3 + 0.33333333333333333333 * dTheta3) * ginv23; + + divAinv3 = + (-1.5 * (Ainv13 * dchi1 + Ainv23 * dchi2 + Ainv33 * dchi3)) / chiguarded + + Ainv11 * gamma311 + Ainv22 * gamma322 + + 2. * (Ainv12 * gamma312 + Ainv13 * gamma313 + Ainv23 * gamma323) + + Ainv33 * gamma333 - (0.66666666666666666667 * dKhat1 + 0.33333333333333333333 * dTheta1) * ginv13 - + (0.66666666666666666667 * dKhat2 + 0.33333333333333333333 * dTheta2) * ginv23 - + (0.66666666666666666667 * dKhat3 + 0.33333333333333333333 * dTheta3) * ginv33; + + Rhat = + psim4 * (ginv11 * (R11 + Rphi11) + ginv22 * (R22 + Rphi22) + + 2. * (ginv12 * (R12 + Rphi12) + ginv13 * (R13 + Rphi13) + + ginv23 * (R23 + Rphi23)) + + ginv33 * (R33 + Rphi33)); + + Hhat = + -cAA + Rhat + 0.66666666666666666667 * pow2(K); + + divbeta = + db11 + db22 + db33; + + totdivbeta = + 0.66666666666666666667 * divbeta; + + ootddivbeta1 = + 0.33333333333333333333 * (ddb111 + ddb122 + ddb133); + + ootddivbeta2 = + 0.33333333333333333333 * (ddb121 + ddb222 + ddb233); + + ootddivbeta3 = + 0.33333333333333333333 * (ddb131 + ddb232 + ddb333); + + lieg11 = + 2. * (db11 * g11 + db12 * g12 + db13 * g13) - g11 * totdivbeta; + + lieg12 = + db21 * g11 + db23 * g13 + db12 * g22 + db13 * g23 + g12 * (db11 + db22 - totdivbeta); + + lieg13 = + db31 * g11 + db32 * g12 + db12 * g23 + db13 * g33 + g13 * (db11 + db33 - totdivbeta); + + lieg22 = + 2. * (db21 * g12 + db22 * g22 + db23 * g23) - g22 * totdivbeta; + + lieg23 = + db31 * g12 + db21 * g13 + db32 * g22 + db23 * g33 + g23 * (db22 + db33 - totdivbeta); + + lieg33 = + 2. * (db31 * g13 + db32 * g23 + db33 * g33) - g33 * totdivbeta; + + lieA11 = + 2. * (A11 * db11 + A12 * db12 + A13 * db13) - A11 * totdivbeta; + + lieA12 = + A22 * db12 + A23 * db13 + A11 * db21 + A13 * db23 + A12 * (db11 + db22 - totdivbeta); + + lieA13 = + A23 * db12 + A33 * db13 + A11 * db31 + A12 * db32 + A13 * (db11 + db33 - totdivbeta); + + lieA22 = + 2. * (A12 * db21 + A22 * db22 + A23 * db23) - A22 * totdivbeta; + + lieA23 = + A13 * db21 + A33 * db23 + A12 * db31 + A22 * db32 + A23 * (db22 + db33 - totdivbeta); + + lieA33 = + 2. * (A13 * db31 + A23 * db32 + A33 * db33) - A33 * totdivbeta; + + liechi = + 0.16666666666666666667 * chiguarded * chipsipower * divbeta; + + pseudolieG1 = + -(db11 * Gfromg1) - db21 * Gfromg2 - db31 * Gfromg3 + ddb221 * ginv22 + + 2. * ddb231 * ginv23 + ddb331 * ginv33 + ginv11 * (ddb111 + ootddivbeta1) + + ginv12 * (2. * ddb121 + ootddivbeta2) + ginv13 * (2. * ddb131 + ootddivbeta3) + + Gfromg1 * totdivbeta; + + pseudolieG2 = + -(db12 * Gfromg1) - db22 * Gfromg2 - db32 * Gfromg3 + ddb112 * ginv11 + + 2. * ddb132 * ginv13 + ddb332 * ginv33 + ginv12 * (2. * ddb122 + ootddivbeta1) + + ginv22 * (ddb222 + ootddivbeta2) + ginv23 * (2. * ddb232 + ootddivbeta3) + + Gfromg2 * totdivbeta; + + pseudolieG3 = + -(db13 * Gfromg1) - db23 * Gfromg2 - db33 * Gfromg3 + ddb113 * ginv11 + + 2. * ddb123 * ginv12 + ddb223 * ginv22 + ginv13 * (2. * ddb133 + ootddivbeta1) + + ginv23 * (2. * ddb233 + ootddivbeta2) + ginv33 * (ddb333 + ootddivbeta3) + + Gfromg3 * totdivbeta; + + rg11 = + -2. * A11 * alpha + lieg11; + + rg12 = + -2. * A12 * alpha + lieg12; + + rg13 = + -2. * A13 * alpha + lieg13; + + rg22 = + -2. * A22 * alpha + lieg22; + + rg23 = + -2. * A23 * alpha + lieg23; + + rg33 = + -2. * A33 * alpha + lieg33; + + rA11 = + lieA11 + alpha * (-2. * AA11 + A11 * K + psim4 * R11 - 0.33333333333333333333 * g11 * Rhat) + psim4 * (-cdda11 + alpha * Rphi11) + + 0.33333333333333333333 * g11 * trcdda; + + rA12 = + lieA12 + alpha * (-2. * AA12 + A12 * K + psim4 * R12 - 0.33333333333333333333 * g12 * Rhat) + psim4 * (-cdda12 + alpha * Rphi12) + + 0.33333333333333333333 * g12 * trcdda; + + rA13 = + lieA13 + alpha * (-2. * AA13 + A13 * K + psim4 * R13 - 0.33333333333333333333 * g13 * Rhat) + psim4 * (-cdda13 + alpha * Rphi13) + + 0.33333333333333333333 * g13 * trcdda; + + rA22 = + lieA22 + alpha * (-2. * AA22 + A22 * K + psim4 * R22 - 0.33333333333333333333 * g22 * Rhat) + psim4 * (-cdda22 + alpha * Rphi22) + + 0.33333333333333333333 * g22 * trcdda; + + rA23 = + lieA23 + alpha * (-2. * AA23 + A23 * K + psim4 * R23 - 0.33333333333333333333 * g23 * Rhat) + psim4 * (-cdda23 + alpha * Rphi23) + + 0.33333333333333333333 * g23 * trcdda; + + rA33 = + lieA33 + alpha * (-2. * AA33 + A33 * K + psim4 * R33 - 0.33333333333333333333 * g33 * Rhat) + psim4 * (-cdda33 + alpha * Rphi33) + + 0.33333333333333333333 * g33 * trcdda; + + rG1 = + -2. * (Ainv11 * da1 + Ainv12 * da2 + Ainv13 * da3) + + alpha * (2. * divAinv1 + 2. * (-G1 + Gfromg1) * kappa1) + pseudolieG1; + + rG2 = + -2. * (Ainv12 * da1 + Ainv22 * da2 + Ainv23 * da3) + + alpha * (2. * divAinv2 + 2. * (-G2 + Gfromg2) * kappa1) + pseudolieG2; + + rG3 = + -2. * (Ainv13 * da1 + Ainv23 * da2 + Ainv33 * da3) + + alpha * (2. * divAinv3 + 2. * (-G3 + Gfromg3) * kappa1) + pseudolieG3; + + rKhat = + -trcdda + alpha * (cAA + kappa1 * (Theta - kappa2 * Theta) + + 0.33333333333333333333 * pow2(K)); + + rchi = + -0.16666666666666666667 * alpha * chiguarded * chipsipower * K + liechi; + + rTheta = + alpha * (0.5 * Hhat - kappa1 * (2. + kappa2) * Theta); +} + + +#undef pow4 +#undef pow3 +#undef pow2 + +__device__ __forceinline__ void z4c_contract_gamma_device_host_order( + const double gxx, const double gxy, const double gxz, + const double gyy, const double gyz, const double gzz, + const double gxxx, const double gxyx, const double gxzx, + const double gyyx, const double gyzx, const double gzzx, + const double gxxy, const double gxyy, const double gxzy, + const double gyyy, const double gyzy, const double gzzy, + const double gxxz, const double gxyz, const double gxzz, + const double gyyz, const double gyzz, const double gzzz, + double &Gamxa, double &Gamya, double &Gamza) +{ + double det = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - + gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz; + const double gupxx = (gyy * gzz - gyz * gyz) / det; + const double gupxy = -(gxy * gzz - gyz * gxz) / det; + const double gupxz = (gxy * gyz - gyy * gxz) / det; + const double gupyy = (gxx * gzz - gxz * gxz) / det; + const double gupyz = -(gxx * gyz - gxy * gxz) / det; + const double gupzz = (gxx * gyy - gxy * gxy) / det; + + const double Gamxxx = 0.5 * (gupxx * gxxx + gupxy * (2.0 * gxyx - gxxy) + gupxz * (2.0 * gxzx - gxxz)); + const double Gamyxx = 0.5 * (gupxy * gxxx + gupyy * (2.0 * gxyx - gxxy) + gupyz * (2.0 * gxzx - gxxz)); + const double Gamzxx = 0.5 * (gupxz * gxxx + gupyz * (2.0 * gxyx - gxxy) + gupzz * (2.0 * gxzx - gxxz)); + + const double Gamxyy = 0.5 * (gupxx * (2.0 * gxyy - gyyx) + gupxy * gyyy + gupxz * (2.0 * gyzy - gyyz)); + const double Gamyyy = 0.5 * (gupxy * (2.0 * gxyy - gyyx) + gupyy * gyyy + gupyz * (2.0 * gyzy - gyyz)); + const double Gamzyy = 0.5 * (gupxz * (2.0 * gxyy - gyyx) + gupyz * gyyy + gupzz * (2.0 * gyzy - gyyz)); + + const double Gamxzz = 0.5 * (gupxx * (2.0 * gxzz - gzzx) + gupxy * (2.0 * gyzz - gzzy) + gupxz * gzzz); + const double Gamyzz = 0.5 * (gupxy * (2.0 * gxzz - gzzx) + gupyy * (2.0 * gyzz - gzzy) + gupyz * gzzz); + const double Gamzzz = 0.5 * (gupxz * (2.0 * gxzz - gzzx) + gupyz * (2.0 * gyzz - gzzy) + gupzz * gzzz); + + const double Gamxxy = 0.5 * (gupxx * gxxy + gupxy * gyyx + gupxz * (gxzy + gyzx - gxyz)); + const double Gamyxy = 0.5 * (gupxy * gxxy + gupyy * gyyx + gupyz * (gxzy + gyzx - gxyz)); + const double Gamzxy = 0.5 * (gupxz * gxxy + gupyz * gyyx + gupzz * (gxzy + gyzx - gxyz)); + + const double Gamxxz = 0.5 * (gupxx * gxxz + gupxy * (gxyz + gyzx - gxzy) + gupxz * gzzx); + const double Gamyxz = 0.5 * (gupxy * gxxz + gupyy * (gxyz + gyzx - gxzy) + gupyz * gzzx); + const double Gamzxz = 0.5 * (gupxz * gxxz + gupyz * (gxyz + gyzx - gxzy) + gupzz * gzzx); + + const double Gamxyz = 0.5 * (gupxx * (gxyz + gxzy - gyzx) + gupxy * gyyz + gupxz * gzzy); + const double Gamyyz = 0.5 * (gupxy * (gxyz + gxzy - gyzx) + gupyy * gyyz + gupyz * gzzy); + const double Gamzyz = 0.5 * (gupxz * (gxyz + gxzy - gyzx) + gupyz * gyyz + gupzz * gzzy); + + Gamxa = gupxx * Gamxxx + gupyy * Gamxyy + gupzz * Gamxzz + + 2.0 * (gupxy * Gamxxy + gupxz * Gamxxz + gupyz * Gamxyz); + Gamya = gupxx * Gamyxx + gupyy * Gamyyy + gupzz * Gamyzz + + 2.0 * (gupxy * Gamyxy + gupxz * Gamyxz + gupyz * Gamyyz); + Gamza = gupxx * Gamzxx + gupyy * Gamzyy + gupzz * Gamzzz + + 2.0 * (gupxy * Gamzxy + gupxz * Gamzxz + gupyz * Gamzyz); +} + +__global__ __launch_bounds__(128, 2) +void kern_z4c_point_rhs( + const double *Axx, const double *Axy, const double *Axz, + const double *Ayy, const double *Ayz, const double *Azz, + const double *alpn1, const double *dtSfx, const double *dtSfy, const double *dtSfz, + const double *betax, const double *betay, const double *betaz, + const double *chin1, + const double *Lapx, const double *Lapy, const double *Lapz, + const double *Axxx, const double *Axyx, const double *Axzx, const double *Ayyx, const double *Ayzx, const double *Azzx, + const double *Axxy, const double *Axyy, const double *Axzy, const double *Ayyy, const double *Ayzy, const double *Azzy, + const double *Axxz, const double *Axyz, const double *Axzz, const double *Ayyz, const double *Ayzz, const double *Azzz, + const double *betaxx, const double *dBxx, const double *betayx, const double *dByx, const double *betazx, const double *dBzx, + const double *betaxy, const double *dBxy, const double *betayy, const double *dByy, const double *betazy, const double *dBzy, + const double *betaxz, const double *dBxz, const double *betayz, const double *dByz, const double *betazz, const double *dBzz, + const double *chix, const double *chiy, const double *chiz, + const double *Lapxx, const double *Lapxy, const double *Lapxz, const double *Lapyy, const double *Lapyz, const double *Lapzz, + const double *sfxxx, const double *sfyxx, const double *sfzxx, + const double *sfxxy, const double *sfyxy, const double *sfzxy, + const double *sfxxz, const double *sfyxz, const double *sfzxz, + const double *sfxyy, const double *sfyyy, const double *sfzyy, + const double *sfxyz, const double *sfyyz, const double *sfzyz, + const double *sfxzz, const double *sfyzz, const double *sfzzz, + const double *chixx, const double *chixy, const double *chixz, const double *chiyy, const double *chiyz, const double *chizz, + const double *gxxxx, const double *gxyxx, const double *gxzxx, const double *gyyxx, const double *gyzxx, const double *gzzxx, + const double *gxxxy, const double *gxyxy, const double *gxzxy, const double *gyyxy, const double *gyzxy, const double *gzzxy, + const double *gxxxz, const double *gxyxz, const double *gxzxz, const double *gyyxz, const double *gyzxz, const double *gzzxz, + const double *gxxyy, const double *gxyyy, const double *gxzyy, const double *gyyyy, const double *gyzyy, const double *gzzyy, + const double *gxxyz, const double *gxyyz, const double *gxzyz, const double *gyyyz, const double *gyzyz, const double *gzzyz, + const double *gxxzz, const double *gxyzz, const double *gxzzz, const double *gyyzz, const double *gyzzz, const double *gzzzz, + const double *Gamxx, const double *gxxx, const double *gxyx, const double *gxzx, + const double *Gamyx, const double *gyyx, const double *gyzx, + const double *Gamzx, const double *gzzx, + const double *Gamxy, const double *gxxy, const double *gxyy, const double *gxzy, + const double *Gamyy, const double *gyyy, const double *gyzy, + const double *Gamzy, const double *gzzy, + const double *Gamxz, const double *gxxz, const double *gxyz, const double *gxzz, + const double *Gamyz, const double *gyyz, const double *gyzz, + const double *Gamzz, const double *gzzz, + const double *Kx, const double *Ky, const double *Kz, + const double *TZx, const double *TZy, const double *TZz, + const double *Gamx, const double *gxx, const double *gxy, const double *gxz, + const double *Gamy, const double *gyy, const double *gyz, + const double *Gamz, const double *gzz, const double *trK, const double *TZ, + double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, + double *chi_rhs, + double *Gamx_rhs, double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, + double *Gamy_rhs, double *gyy_rhs, double *gyz_rhs, + double *Gamz_rhs, double *gzz_rhs, double *trK_rhs, double *TZ_rhs, + double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, + double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs) +{ + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_gp.all; idx += blockDim.x * gridDim.x) { + double point_kappa1 = 0.0; + double kappa2 = 0.0; + double chi_div_floor = 1.0e-5; + double rA11 = 0.0, rA12 = 0.0, rA13 = 0.0, rA22 = 0.0, rA23 = 0.0, rA33 = 0.0; + double rchi = 0.0, rG1 = 0.0, rg11 = 0.0, rg12 = 0.0, rg13 = 0.0; + double rG2 = 0.0, rg22 = 0.0, rg23 = 0.0, rG3 = 0.0, rg33 = 0.0, rKhat = 0.0, rTheta = 0.0; + + z4c_rhs_point_device( + Axx[idx], Axy[idx], Axz[idx], Ayy[idx], Ayz[idx], Azz[idx], + alpn1[idx], dtSfx[idx], dtSfy[idx], dtSfz[idx], + betax[idx], betay[idx], betaz[idx], + chin1[idx], chi_div_floor, + Lapx[idx], + Axxx[idx], Axyx[idx], Axzx[idx], Ayyx[idx], Ayzx[idx], Azzx[idx], + Lapy[idx], + Axxy[idx], Axyy[idx], Axzy[idx], Ayyy[idx], Ayzy[idx], Azzy[idx], + Lapz[idx], + Axxz[idx], Axyz[idx], Axzz[idx], Ayyz[idx], Ayzz[idx], Azzz[idx], + betaxx[idx], dBxx[idx], betayx[idx], dByx[idx], betazx[idx], dBzx[idx], + betaxy[idx], dBxy[idx], betayy[idx], dByy[idx], betazy[idx], dBzy[idx], + betaxz[idx], dBxz[idx], betayz[idx], dByz[idx], betazz[idx], dBzz[idx], + chix[idx], chiy[idx], chiz[idx], + Lapxx[idx], Lapxy[idx], Lapxz[idx], Lapyy[idx], Lapyz[idx], Lapzz[idx], + sfxxx[idx], sfyxx[idx], sfzxx[idx], + sfxxy[idx], sfyxy[idx], sfzxy[idx], + sfxxz[idx], sfyxz[idx], sfzxz[idx], + sfxyy[idx], sfyyy[idx], sfzyy[idx], + sfxyz[idx], sfyyz[idx], sfzyz[idx], + sfxzz[idx], sfyzz[idx], sfzzz[idx], + chixx[idx], chixy[idx], chixz[idx], chiyy[idx], chiyz[idx], chizz[idx], + gxxxx[idx], gxyxx[idx], gxzxx[idx], gyyxx[idx], gyzxx[idx], gzzxx[idx], + gxxxy[idx], gxyxy[idx], gxzxy[idx], gyyxy[idx], gyzxy[idx], gzzxy[idx], + gxxxz[idx], gxyxz[idx], gxzxz[idx], gyyxz[idx], gyzxz[idx], gzzxz[idx], + gxxyy[idx], gxyyy[idx], gxzyy[idx], gyyyy[idx], gyzyy[idx], gzzyy[idx], + gxxyz[idx], gxyyz[idx], gxzyz[idx], gyyyz[idx], gyzyz[idx], gzzyz[idx], + gxxzz[idx], gxyzz[idx], gxzzz[idx], gyyzz[idx], gyzzz[idx], gzzzz[idx], + Gamxx[idx], gxxx[idx], gxyx[idx], gxzx[idx], + Gamyx[idx], gyyx[idx], gyzx[idx], + Gamzx[idx], gzzx[idx], + Gamxy[idx], gxxy[idx], gxyy[idx], gxzy[idx], + Gamyy[idx], gyyy[idx], gyzy[idx], + Gamzy[idx], gzzy[idx], + Gamxz[idx], gxxz[idx], gxyz[idx], gxzz[idx], + Gamyz[idx], gyyz[idx], gyzz[idx], + Gamzz[idx], gzzz[idx], + Kx[idx], Ky[idx], Kz[idx], + TZx[idx], TZy[idx], TZz[idx], + Gamx[idx], gxx[idx], gxy[idx], gxz[idx], + Gamy[idx], gyy[idx], gyz[idx], + Gamz[idx], gzz[idx], + point_kappa1, kappa2, + trK[idx], + rA11, rA12, rA13, rA22, rA23, rA33, + rchi, + rG1, rg11, rg12, rg13, + rG2, rg22, rg23, + rG3, rg33, rKhat, rTheta, TZ[idx]); + + Axx_rhs[idx] = rA11; + Axy_rhs[idx] = rA12; + Axz_rhs[idx] = rA13; + Ayy_rhs[idx] = rA22; + Ayz_rhs[idx] = rA23; + Azz_rhs[idx] = rA33; + chi_rhs[idx] = rchi; + Gamx_rhs[idx] = rG1; + gxx_rhs[idx] = rg11; + gxy_rhs[idx] = rg12; + gxz_rhs[idx] = rg13; + Gamy_rhs[idx] = rG2; + gyy_rhs[idx] = rg22; + gyz_rhs[idx] = rg23; + Gamz_rhs[idx] = rG3; + gzz_rhs[idx] = rg33; + trK_rhs[idx] = rKhat; + TZ_rhs[idx] = rTheta; + + Lap_rhs[idx] = -2.0 * alpn1[idx] * trK[idx]; +#if (GAUGE == 0) + betax_rhs[idx] = 0.75 * dtSfx[idx]; + betay_rhs[idx] = 0.75 * dtSfy[idx]; + betaz_rhs[idx] = 0.75 * dtSfz[idx]; + dtSfx_rhs[idx] = Gamx_rhs[idx] - 2.0 * dtSfx[idx]; + dtSfy_rhs[idx] = Gamy_rhs[idx] - 2.0 * dtSfy[idx]; + dtSfz_rhs[idx] = Gamz_rhs[idx] - 2.0 * dtSfz[idx]; +#elif (GAUGE == 1) + betax_rhs[idx] = Gamx[idx] - 2.0 * betax[idx]; + betay_rhs[idx] = Gamy[idx] - 2.0 * betay[idx]; + betaz_rhs[idx] = Gamz[idx] - 2.0 * betaz[idx]; + dtSfx_rhs[idx] = 0.0; + dtSfy_rhs[idx] = 0.0; + dtSfz_rhs[idx] = 0.0; +#else +#error "z4c_rhs_cuda.cu currently supports GAUGE == 0 or GAUGE == 1 for Z4C" +#endif + } +} + +__global__ __launch_bounds__(128, 4) +void kern_z4c_damping(const double *alpn1, + const double *gxx, const double *gxy, const double *gxz, + const double *gyy, const double *gyz, const double *gzz, + const double *gxxx, const double *gxyx, const double *gxzx, + const double *gyyx, const double *gyzx, const double *gzzx, + const double *gxxy, const double *gxyy, const double *gxzy, + const double *gyyy, const double *gyzy, const double *gzzy, + const double *gxxz, const double *gxyz, const double *gxzz, + const double *gyyz, const double *gyzz, const double *gzzz, + const double *TZ, const double *Gamx, const double *Gamy, const double *Gamz, + double *TZ_rhs, double *trK_rhs, + double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs) +{ + const double kappa1 = 2.0e-2; + const double kappa2 = 0.0; + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_gp.all; idx += blockDim.x * gridDim.x) { + double Gamxa = 0.0, Gamya = 0.0, Gamza = 0.0; + z4c_contract_gamma_device_host_order( + gxx[idx], gxy[idx], gxz[idx], gyy[idx], gyz[idx], gzz[idx], + gxxx[idx], gxyx[idx], gxzx[idx], gyyx[idx], gyzx[idx], gzzx[idx], + gxxy[idx], gxyy[idx], gxzy[idx], gyyy[idx], gyzy[idx], gzzy[idx], + gxxz[idx], gxyz[idx], gxzz[idx], gyyz[idx], gyzz[idx], gzzz[idx], + Gamxa, Gamya, Gamza); + const double a = alpn1[idx]; + const double theta = TZ[idx]; + TZ_rhs[idx] -= a * (2.0 + kappa2) * kappa1 * theta; + trK_rhs[idx] += a * kappa1 * (1.0 - kappa2) * theta; + Gamx_rhs[idx] -= 2.0 * a * kappa1 * (Gamx[idx] - Gamxa); + Gamy_rhs[idx] -= 2.0 * a * kappa1 * (Gamy[idx] - Gamya); + Gamz_rhs[idx] -= 2.0 * a * kappa1 * (Gamz[idx] - Gamza); + } +} + +static void gpu_z4c_lopsided_kodis_batch(double eps_val, int all, + int do_lopsided, int do_kodis) +{ + const int z4c_field_count = +#if (GAUGE == 0) + 25; +#else + 22; +#endif + const int adv_slots[25] = { + S_chi, S_trK, S_gxx, S_gxy, S_gxz, S_gyy, S_gyz, S_gzz, + S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz, + S_Gamx, S_Gamy, S_Gamz, S_Lap, S_betax, S_betay, S_betaz, +#if (GAUGE == 0) + S_dtSfx, S_dtSfy, S_dtSfz, +#endif + S_TZ + }; + const int ko_slots[25] = { + S_chi, S_trK, S_gxx, S_gxy, S_gxz, S_gyy, S_gyz, S_gzz, + S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz, + S_Gamx, S_Gamy, S_Gamz, S_Lap, S_betax, S_betay, S_betaz, +#if (GAUGE == 0) + S_dtSfx, S_dtSfy, S_dtSfz, +#endif + S_TZ + }; + const int rhs_slots[25] = { + S_chi_rhs, S_trK_rhs, S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs, + S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs, + S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs, S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs, +#if (GAUGE == 0) + S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs, +#endif + S_TZ_rhs + }; + const int soa_signs[3 * 25] = { + 1, 1, 1, + 1, 1, 1, + 1, 1, 1, + -1, -1, 1, + -1, 1, -1, + 1, 1, 1, + 1, -1, -1, + 1, 1, 1, + 1, 1, 1, + -1, -1, 1, + -1, 1, -1, + 1, 1, 1, + 1, -1, -1, + 1, 1, 1, + -1, 1, 1, + 1, -1, 1, + 1, 1, -1, + 1, 1, 1, + -1, 1, 1, + 1, -1, 1, + 1, 1, -1, +#if (GAUGE == 0) + -1, 1, 1, + 1, -1, 1, + 1, 1, -1, +#endif + 1, 1, 1 + }; + LopsidedKodisTables tables = {}; + for (int i = 0; i < z4c_field_count; ++i) { + tables.adv_fields[i] = g_buf.slot[adv_slots[i]]; + tables.ko_fields[i] = g_buf.slot[ko_slots[i]]; + tables.rhs_fields[i] = g_buf.slot[rhs_slots[i]]; + tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0]; + tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1]; + tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2]; + } + dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)z4c_field_count); + kern_lopsided_kodis_batched<<>>( + g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], + tables, eps_val, do_lopsided, do_kodis); +} + +static void launch_z4c_rhs_pipeline(int all, double eps) +{ + const int SYM = 1; + const int ANTI = -1; +#define D(s) g_buf.slot[s] + kern_phase1_prep<<>>( + D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz), + D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz)); + + { + double *src_fields[] = { + D(S_betax), D(S_betay), D(S_betaz), + D(S_dtSfx), D(S_dtSfy), D(S_dtSfz), + D(S_chi), D(S_dxx), D(S_gxy), D(S_gxz), D(S_dyy), D(S_gyz), D(S_dzz), + D(S_Gamx), D(S_Gamy), D(S_Gamz), D(S_Lap), D(S_trK), D(S_TZ), + D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz) + }; + double *fx_fields[] = { + D(S_betaxx), D(S_betayx), D(S_betazx), + D(S_dBxx), D(S_dByx), D(S_dBzx), + D(S_chix), D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx), + D(S_Gamxx), D(S_Gamyx), D(S_Gamzx), D(S_Lapx), D(S_Kx), D(S_TZx), + D(S_Axxx), D(S_Axyx), D(S_Axzx), D(S_Ayyx), D(S_Ayzx), D(S_Azzx) + }; + double *fy_fields[] = { + D(S_betaxy), D(S_betayy), D(S_betazy), + D(S_dBxy), D(S_dByy), D(S_dBzy), + D(S_chiy), D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy), + D(S_Gamxy), D(S_Gamyy_t), D(S_Gamzy), D(S_Lapy), D(S_Ky), D(S_TZy), + D(S_Axxy), D(S_Axyy), D(S_Axzy), D(S_Ayyy), D(S_Ayzy), D(S_Azzy) + }; + double *fz_fields[] = { + D(S_betaxz), D(S_betayz), D(S_betazz), + D(S_dBxz), D(S_dByz), D(S_dBzz), + D(S_chiz), D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz), + D(S_Gamxz), D(S_Gamyz_t), D(S_Gamzz_t), D(S_Lapz), D(S_Kz), D(S_TZz), + D(S_Axxz), D(S_Axyz), D(S_Axzz), D(S_Ayyz), D(S_Ayzz), D(S_Azzz) + }; + const int soa_signs[] = { + ANTI, SYM, SYM, SYM, ANTI, SYM, SYM, SYM, ANTI, + ANTI, SYM, SYM, SYM, ANTI, SYM, SYM, SYM, ANTI, + SYM, SYM, SYM, + SYM, SYM, SYM, ANTI, ANTI, SYM, ANTI, SYM, ANTI, + SYM, SYM, SYM, SYM, ANTI, ANTI, SYM, SYM, SYM, + ANTI, SYM, SYM, SYM, ANTI, SYM, SYM, SYM, ANTI, + SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, + SYM, SYM, SYM, ANTI, ANTI, SYM, ANTI, SYM, ANTI, + SYM, SYM, SYM, SYM, ANTI, ANTI, SYM, SYM, SYM + }; + gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])), + src_fields, fx_fields, fy_fields, fz_fields, soa_signs, all); + } + + { + double *src_fields[] = { + D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz), + D(S_betax), D(S_betay), D(S_betaz), D(S_chi), D(S_Lap) + }; + double *fxx_fields[] = { + D(S_gxxxx), D(S_gyyxx), D(S_gzzxx), D(S_gxyxx), D(S_gxzxx), D(S_gyzxx), + D(S_sfxxx), D(S_sfyxx), D(S_sfzxx), D(S_chixx), D(S_Lapxx) + }; + double *fxy_fields[] = { + D(S_gxxxy), D(S_gyyxy), D(S_gzzxy), D(S_gxyxy), D(S_gxzxy), D(S_gyzxy), + D(S_sfxxy), D(S_sfyxy), D(S_sfzxy), D(S_chixy), D(S_Lapxy) + }; + double *fxz_fields[] = { + D(S_gxxxz), D(S_gyyxz), D(S_gzzxz), D(S_gxyxz), D(S_gxzxz), D(S_gyzxz), + D(S_sfxxz), D(S_sfyxz), D(S_sfzxz), D(S_chixz), D(S_Lapxz) + }; + double *fyy_fields[] = { + D(S_gxxyy), D(S_gyyyy), D(S_gzzyy), D(S_gxyyy), D(S_gxzyy), D(S_gyzyy), + D(S_sfxyy), D(S_sfyyy), D(S_sfzyy), D(S_chiyy), D(S_Lapyy) + }; + double *fyz_fields[] = { + D(S_gxxyz), D(S_gyyyz), D(S_gzzyz), D(S_gxyyz), D(S_gxzyz), D(S_gyzyz), + D(S_sfxyz), D(S_sfyyz), D(S_sfzyz), D(S_chiyz), D(S_Lapyz) + }; + double *fzz_fields[] = { + D(S_gxxzz), D(S_gyyzz), D(S_gzzzz), D(S_gxyzz), D(S_gxzzz), D(S_gyzzz), + D(S_sfxzz), D(S_sfyzz), D(S_sfzzz), D(S_chizz), D(S_Lapzz) + }; + const int soa_signs[] = { + SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, SYM, + ANTI, ANTI, SYM, ANTI, SYM, ANTI, SYM, ANTI, ANTI, + ANTI, SYM, SYM, SYM, ANTI, SYM, SYM, SYM, ANTI, + SYM, SYM, SYM, SYM, SYM, SYM + }; + gpu_fdderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])), + src_fields, fxx_fields, fxy_fields, fxz_fields, + fyy_fields, fyz_fields, fzz_fields, soa_signs, all); + } + + kern_z4c_point_rhs<<>>( + D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), + D(S_alpn1), D(S_dtSfx), D(S_dtSfy), D(S_dtSfz), + D(S_betax), D(S_betay), D(S_betaz), D(S_chin1), + D(S_Lapx), D(S_Lapy), D(S_Lapz), + D(S_Axxx), D(S_Axyx), D(S_Axzx), D(S_Ayyx), D(S_Ayzx), D(S_Azzx), + D(S_Axxy), D(S_Axyy), D(S_Axzy), D(S_Ayyy), D(S_Ayzy), D(S_Azzy), + D(S_Axxz), D(S_Axyz), D(S_Axzz), D(S_Ayyz), D(S_Ayzz), D(S_Azzz), + D(S_betaxx), D(S_dBxx), D(S_betayx), D(S_dByx), D(S_betazx), D(S_dBzx), + D(S_betaxy), D(S_dBxy), D(S_betayy), D(S_dByy), D(S_betazy), D(S_dBzy), + D(S_betaxz), D(S_dBxz), D(S_betayz), D(S_dByz), D(S_betazz), D(S_dBzz), + D(S_chix), D(S_chiy), D(S_chiz), + D(S_Lapxx), D(S_Lapxy), D(S_Lapxz), D(S_Lapyy), D(S_Lapyz), D(S_Lapzz), + D(S_sfxxx), D(S_sfyxx), D(S_sfzxx), D(S_sfxxy), D(S_sfyxy), D(S_sfzxy), + D(S_sfxxz), D(S_sfyxz), D(S_sfzxz), D(S_sfxyy), D(S_sfyyy), D(S_sfzyy), + D(S_sfxyz), D(S_sfyyz), D(S_sfzyz), D(S_sfxzz), D(S_sfyzz), D(S_sfzzz), + D(S_chixx), D(S_chixy), D(S_chixz), D(S_chiyy), D(S_chiyz), D(S_chizz), + D(S_gxxxx), D(S_gxyxx), D(S_gxzxx), D(S_gyyxx), D(S_gyzxx), D(S_gzzxx), + D(S_gxxxy), D(S_gxyxy), D(S_gxzxy), D(S_gyyxy), D(S_gyzxy), D(S_gzzxy), + D(S_gxxxz), D(S_gxyxz), D(S_gxzxz), D(S_gyyxz), D(S_gyzxz), D(S_gzzxz), + D(S_gxxyy), D(S_gxyyy), D(S_gxzyy), D(S_gyyyy), D(S_gyzyy), D(S_gzzyy), + D(S_gxxyz), D(S_gxyyz), D(S_gxzyz), D(S_gyyyz), D(S_gyzyz), D(S_gzzyz), + D(S_gxxzz), D(S_gxyzz), D(S_gxzzz), D(S_gyyzz), D(S_gyzzz), D(S_gzzzz), + D(S_Gamxx), D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_Gamyx), D(S_gyyx), D(S_gyzx), D(S_Gamzx), D(S_gzzx), + D(S_Gamxy), D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_Gamyy_t), D(S_gyyy), D(S_gyzy), D(S_Gamzy), D(S_gzzy), + D(S_Gamxz), D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_Gamyz_t), D(S_gyyz), D(S_gyzz), D(S_Gamzz_t), D(S_gzzz), + D(S_Kx), D(S_Ky), D(S_Kz), D(S_TZx), D(S_TZy), D(S_TZz), + D(S_Gamx), D(S_gxx), D(S_gxy), D(S_gxz), D(S_Gamy), D(S_gyy), D(S_gyz), D(S_Gamz), D(S_gzz), D(S_trK), D(S_TZ), + D(S_Axx_rhs), D(S_Axy_rhs), D(S_Axz_rhs), D(S_Ayy_rhs), D(S_Ayz_rhs), D(S_Azz_rhs), + D(S_chi_rhs), D(S_Gamx_rhs), D(S_gxx_rhs), D(S_gxy_rhs), D(S_gxz_rhs), + D(S_Gamy_rhs), D(S_gyy_rhs), D(S_gyz_rhs), D(S_Gamz_rhs), D(S_gzz_rhs), D(S_trK_rhs), D(S_TZ_rhs), + D(S_Lap_rhs), D(S_betax_rhs), D(S_betay_rhs), D(S_betaz_rhs), D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs)); + + gpu_z4c_lopsided_kodis_batch(0.0, all, 1, 0); + + kern_z4c_damping<<>>( + D(S_alpn1), D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), + D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx), + D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy), + D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz), + D(S_TZ), D(S_Gamx), D(S_Gamy), D(S_Gamz), + D(S_TZ_rhs), D(S_trK_rhs), D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs)); + + gpu_z4c_lopsided_kodis_batch(eps, all, 0, 1); +#undef D +} + +} // namespace z4c_cuda + +extern "C" void f_constraint_bssn(int *, double *, double *, double *, + double *, double *, + double *, double *, double *, double *, double *, double *, + double *, double *, double *, double *, double *, double *, + double *, double *, double *, + double *, double *, double *, double *, double *, double *, double *, double *, double *, double *, + double *, double *, double *, double *, double *, double *, + double *, double *, double *, double *, double *, double *, + double *, double *, double *, double *, double *, double *, + double *, double *, double *, double *, double *, double *, double *, double *, + double *, double *, double *, + int &); + +static int compute_rhs_z4c_cuda_host( + int *ex, double &T, double *X, double *Y, double *Z, + double *chi_state, double *chi_constraints, double *trK, + double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, + double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, + double *Gamx, double *Gamy, double *Gamz, + double *Lap, double *betax, double *betay, double *betaz, + double *dtSfx, double *dtSfy, double *dtSfz, + double *TZ, + double *chi_rhs, double *trK_rhs, + double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, + double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, + double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, + double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, + double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, + double *TZ_rhs, + double *rho, double *Sx, double *Sy, double *Sz, + double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, + double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, + double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, + double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, + double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, + double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon, + int &Symmetry, int &Lev, double &eps, int &co) +{ + (void)T; + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + + const int nx = ex[0], ny = ex[1], nz = ex[2]; + const int all = nx * ny * nz; + const size_t bytes = (size_t)all * sizeof(double); + setup_grid_params(ex, X, Y, Z, Symmetry, eps, co); + +#define D(s) g_buf.slot[s] + double *h2d_src[] = { + chi_state, trK, dxx, gxy, gxz, dyy, gyz, dzz, + Axx, Axy, Axz, Ayy, Ayz, Azz, + Gamx, Gamy, Gamz, + Lap, betax, betay, betaz, + dtSfx, dtSfy, dtSfz, TZ, + rho, Sx, Sy, Sz, + Sxx, Sxy, Sxz, Syy, Syz, Szz + }; + const int h2d_count = (int)(sizeof(h2d_src) / sizeof(h2d_src[0])); + for (int s = 0; s < h2d_count; ++s) { + std::memcpy(g_buf.h_stage + (size_t)s * all, h2d_src[s], bytes); + } + CUDA_CHECK(cudaMemcpy(D(S_chi), g_buf.h_stage, (size_t)h2d_count * bytes, cudaMemcpyHostToDevice)); + + launch_z4c_rhs_pipeline(all, eps); + + const int rhs_slots[] = { + S_chi_rhs, S_trK_rhs, + S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs, + S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs, + S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs, + S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs, + S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs, S_TZ_rhs + }; + double *rhs_dst[] = { + chi_rhs, trK_rhs, + gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs, + Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs, + Gamx_rhs, Gamy_rhs, Gamz_rhs, + Lap_rhs, betax_rhs, betay_rhs, betaz_rhs, + dtSfx_rhs, dtSfy_rhs, dtSfz_rhs, TZ_rhs + }; + const int rhs_count = (int)(sizeof(rhs_slots) / sizeof(rhs_slots[0])); + for (int i = 0; i < rhs_count; ++i) { + CUDA_CHECK(cudaMemcpy(g_buf.h_stage + (size_t)i * all, D(rhs_slots[i]), bytes, cudaMemcpyDeviceToHost)); + } + for (int i = 0; i < rhs_count; ++i) { + std::memcpy(rhs_dst[i], g_buf.h_stage + (size_t)i * all, bytes); + } +#undef D + + if (co == 0) { +#if (ABV == 0) + f_ricci_gamma(ex, X, Y, Z, + chi_constraints, + dxx, gxy, gxz, dyy, gyz, dzz, + Gamx, Gamy, Gamz, + Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz, + Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz, + Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz, + Rxx, Rxy, Rxz, Ryy, Ryz, Rzz, + Symmetry); +#endif + f_constraint_bssn(ex, X, Y, Z, + chi_constraints, trK, + dxx, gxy, gxz, dyy, gyz, dzz, + Axx, Axy, Axz, Ayy, Ayz, Azz, + Gamx, Gamy, Gamz, + Lap, betax, betay, betaz, rho, Sx, Sy, Sz, + Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz, + Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz, + Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz, + Rxx, Rxy, Rxz, Ryy, Ryz, Rzz, + Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon, + Symmetry); + } + return 0; +} + +extern "C" int f_compute_rhs_Z4c(int *ex, double &T, + double *X, double *Y, double *Z, + double *chi, double *trK, + double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, + double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, + double *Gamx, double *Gamy, double *Gamz, + double *Lap, double *betax, double *betay, double *betaz, + double *dtSfx, double *dtSfy, double *dtSfz, + double *TZ, + double *chi_rhs, double *trK_rhs, + double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, + double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, + double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, + double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, + double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, + double *TZ_rhs, + double *rho, double *Sx, double *Sy, double *Sz, + double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, + double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, + double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, + double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, + double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, + double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon, + int &Symmetry, int &Lev, double &eps, int &co) +{ + return compute_rhs_z4c_cuda_host( + ex, T, X, Y, Z, + chi, chi, trK, + dxx, gxy, gxz, dyy, gyz, dzz, + Axx, Axy, Axz, Ayy, Ayz, Azz, + Gamx, Gamy, Gamz, + Lap, betax, betay, betaz, + dtSfx, dtSfy, dtSfz, + TZ, + chi_rhs, trK_rhs, + gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs, + Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs, + Gamx_rhs, Gamy_rhs, Gamz_rhs, + Lap_rhs, betax_rhs, betay_rhs, betaz_rhs, + dtSfx_rhs, dtSfy_rhs, dtSfz_rhs, + TZ_rhs, + rho, Sx, Sy, Sz, + Sxx, Sxy, Sxz, Syy, Syz, Szz, + Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz, + Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz, + Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz, + Rxx, Rxy, Rxz, Ryy, Ryz, Rzz, + Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon, + Symmetry, Lev, eps, co); +} + +extern "C" int f_compute_rhs_Z4cnot(int *ex, double &T, + double *X, double *Y, double *Z, + double *chi, double *trK, + double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, + double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, + double *Gamx, double *Gamy, double *Gamz, + double *Lap, double *betax, double *betay, double *betaz, + double *dtSfx, double *dtSfy, double *dtSfz, + double *TZ, + double *chi_rhs, double *trK_rhs, + double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, + double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, + double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, + double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, + double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, + double *TZ_rhs, + double *rho, double *Sx, double *Sy, double *Sz, + double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, + double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, + double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, + double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, + double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, + double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon, + int &Symmetry, int &Lev, double &eps, int &co, double &chitiny) +{ + const int all = ex[0] * ex[1] * ex[2]; + std::vector chi_clamped(chi, chi + all); + for (int i = 0; i < all; ++i) { + if (chi_clamped[i] < chitiny) chi_clamped[i] = chitiny; + } + return compute_rhs_z4c_cuda_host( + ex, T, X, Y, Z, + chi_clamped.data(), chi, trK, + dxx, gxy, gxz, dyy, gyz, dzz, + Axx, Axy, Axz, Ayy, Ayz, Azz, + Gamx, Gamy, Gamz, + Lap, betax, betay, betaz, + dtSfx, dtSfy, dtSfz, + TZ, + chi_rhs, trK_rhs, + gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs, + Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs, + Gamx_rhs, Gamy_rhs, Gamz_rhs, + Lap_rhs, betax_rhs, betay_rhs, betaz_rhs, + dtSfx_rhs, dtSfy_rhs, dtSfz_rhs, + TZ_rhs, + rho, Sx, Sy, Sz, + Sxx, Sxy, Sxz, Syy, Syz, Szz, + Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz, + Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz, + Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz, + Rxx, Rxy, Rxz, Ryy, Ryz, Rzz, + Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon, + Symmetry, Lev, eps, co); +} + +extern "C" int z4c_cuda_rk4_substep(void *block_tag, + int *ex, double *X, double *Y, double *Z, + double **state_host_in, + double **state_host_out, + const double *propspeed, + const double *soa_flat, + const double *bbox, + double &dT, + double &T, + int &RK4, + int &apply_bam_bc, + int &Symmetry, + int &Lev, + double &eps, + int &co, + int &keep_resident_state, + int &apply_enforce_ga, + double &chitiny) +{ + (void)T; + (void)state_host_out; + + if (RK4 < 0 || RK4 > 3) return 1; + + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + + const bool profile = cuda_profile_enabled(); + const double t_total0 = profile ? cuda_profile_now_ms() : 0.0; + double state_ms = 0.0; + double rhs_ms = 0.0; + double bc_ms = 0.0; + double finalize_ms = 0.0; + double output_ms = 0.0; + + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const size_t bytes = all * sizeof(double); + int touch_xmin = 0, touch_xmax = 0; + int touch_ymin = 0, touch_ymax = 0; + int touch_zmin = 0, touch_zmax = 0; + + setup_grid_params(ex, X, Y, Z, Symmetry, eps, co); + if (Lev > 0) { + compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); + } + + StepContext &ctx = ensure_step_ctx(block_tag, all); + const bool use_resident_state = (keep_resident_state != 0); + if (use_resident_state) { + bind_state_input_slots(ctx.d_state_curr); + bind_state_output_slots(ctx.d_state_next); + } + + double t0 = profile ? cuda_profile_now_ms() : 0.0; + if (!use_resident_state || RK4 == 0 || !ctx.state_ready) { + upload_state_inputs(state_host_in, all); + } + if (apply_enforce_ga) { + kern_enforce_ga_cuda<<>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz], + g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz], + g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz], + g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]); + } + if (RK4 == 0) { + CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi], + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyDeviceToDevice)); + } + if (profile) { + cuda_profile_sync(); + state_ms += cuda_profile_now_ms() - t0; + } + + t0 = profile ? cuda_profile_now_ms() : 0.0; + launch_z4c_rhs_pipeline((int)all, eps); + if (profile) { + cuda_profile_sync(); + rhs_ms += cuda_profile_now_ms() - t0; + } + + t0 = profile ? cuda_profile_now_ms() : 0.0; + if (apply_bam_bc) { + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + gpu_sommerfeld_routbam(g_buf.slot[k_state_input_slots[i]], + g_buf.slot[k_state_rhs_slots[i]], + propspeed[i], + soa_flat[3 * i + 0], + soa_flat[3 * i + 1], + soa_flat[3 * i + 2], + X, Y, Z, bbox, Symmetry); + } + } + if (profile) { + cuda_profile_sync(); + bc_ms += cuda_profile_now_ms() - t0; + } + + t0 = profile ? cuda_profile_now_ms() : 0.0; + gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny); + if (Lev > 0) { + gpu_copy_patch_boundary_batch(ctx, (int)all, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); + } + if (profile) { + cuda_profile_sync(); + finalize_ms += cuda_profile_now_ms() - t0; + } + + t0 = profile ? cuda_profile_now_ms() : 0.0; + if (use_resident_state) { + std::swap(ctx.d_state_curr_mem, ctx.d_state_next_mem); + ctx.d_state_curr.swap(ctx.d_state_next); + ctx.state_ready = true; + } else { + download_state_outputs(state_host_out, all); + } + if (profile) { + cuda_profile_sync(); + output_ms += cuda_profile_now_ms() - t0; + CudaProfileStats &stats = cuda_profile_stats(); + stats.calls++; + stats.total_ms += cuda_profile_now_ms() - t_total0; + stats.state_ms += state_ms; + stats.rhs_ms += rhs_ms; + stats.bc_ms += bc_ms; + stats.finalize_ms += finalize_ms; + stats.output_ms += output_ms; + cuda_profile_maybe_log(); + } + return 0; +} + +extern "C" int z4c_cuda_download_resident_state(void *block_tag, + int *ex, + double **state_host_out) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + download_resident_state(block_tag, ex, state_host_out); + return 0; +} + +extern "C" int z4c_cuda_pack_state_region_to_host_buffer(void *block_tag, + int state_index, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex, + i0, j0, k0, sx, sy, sz, + cudaMemcpyDeviceToHost); + return 0; +} + +extern "C" int z4c_cuda_unpack_state_region_from_host_buffer(void *block_tag, + int state_index, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex, + i0, j0, k0, sx, sy, sz, + cudaMemcpyHostToDevice); + return 0; +} + +extern "C" int z4c_cuda_pack_state_batch_to_host_buffer(void *block_tag, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex, + i0, j0, k0, sx, sy, sz, + cudaMemcpyDeviceToHost); + return 0; +} + +extern "C" int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex, + i0, j0, k0, sx, sy, sz, + cudaMemcpyHostToDevice); + return 0; +} + +extern "C" int z4c_cuda_download_state_subset(void *block_tag, + int *ex, + int subset_count, + const int *state_indices, + double **state_host_out) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_subset(block_tag, ex, subset_count, state_indices, state_host_out, + cudaMemcpyDeviceToHost); + return 0; +} + +extern "C" int z4c_cuda_upload_state_subset(void *block_tag, + int *ex, + int subset_count, + const int *state_indices, + double **state_host_in) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + copy_state_subset(block_tag, ex, subset_count, state_indices, state_host_in, + cudaMemcpyHostToDevice); + return 0; +} + +extern "C" int z4c_cuda_has_resident_state(void *block_tag) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + return has_resident_state(block_tag) ? 1 : 0; +} + +extern "C" void z4c_cuda_release_step_ctx(void *block_tag) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + release_step_ctx(block_tag); +} diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.h b/AMSS_NCKU_source/z4c_rhs_cuda.h new file mode 100644 index 0000000..a1be80b --- /dev/null +++ b/AMSS_NCKU_source/z4c_rhs_cuda.h @@ -0,0 +1,83 @@ +#ifndef Z4C_RHS_CUDA_H +#define Z4C_RHS_CUDA_H + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + Z4C_CUDA_STATE_COUNT = 25 +}; + +int z4c_cuda_rk4_substep(void *block_tag, + int *ex, double *X, double *Y, double *Z, + double **state_host_in, + double **state_host_out, + const double *propspeed, + const double *soa_flat, + const double *bbox, + double &dT, + double &T, + int &RK4, + int &apply_bam_bc, + int &Symmetry, + int &Lev, + double &eps, + int &co, + int &keep_resident_state, + int &apply_enforce_ga, + double &chitiny); + +int z4c_cuda_download_resident_state(void *block_tag, + int *ex, + double **state_host_out); + +int z4c_cuda_pack_state_region_to_host_buffer(void *block_tag, + int state_index, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int z4c_cuda_unpack_state_region_from_host_buffer(void *block_tag, + int state_index, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int z4c_cuda_pack_state_batch_to_host_buffer(void *block_tag, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag, + int state_count, + double *host_buffer, + int *ex, + int i0, int j0, int k0, + int sx, int sy, int sz); + +int z4c_cuda_download_state_subset(void *block_tag, + int *ex, + int subset_count, + const int *state_indices, + double **state_host_out); + +int z4c_cuda_upload_state_subset(void *block_tag, + int *ex, + int subset_count, + const int *state_indices, + double **state_host_in); + +int z4c_cuda_has_resident_state(void *block_tag); + +void z4c_cuda_release_step_ctx(void *block_tag); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/makefile_and_run.py b/makefile_and_run.py index 3419a05..4f7c377 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -70,9 +70,9 @@ def makefile_ABE(): ## Build command with CPU binding to nohz_full cores if (input_data.GPU_Calculation == "no"): - makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=0 ABE" + makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=0 USE_CUDA_Z4C=0 ABE" elif (input_data.GPU_Calculation == "yes"): - makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=1 ABE_CUDA" + makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=1 USE_CUDA_Z4C=1 ABE_CUDA" else: print( " CPU/GPU numerical calculation setting is wrong " ) print( )