diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index a01fea2..97b3e61 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -21,9 +21,12 @@ using namespace std; #include "Ansorg.h" #include "fmisc.h" #include "Parallel.h" -#include "bssn_class.h" -#include "bssn_rhs.h" -#include "initial_puncture.h" +#include "bssn_class.h" +#include "bssn_rhs.h" +#if USE_CUDA_BSSN +#include "bssn_rhs_cuda.h" +#endif +#include "initial_puncture.h" #include "enforce_algebra.h" #include "rungekutta4_rout.h" #include "sommerfeld_rout.h" @@ -47,6 +50,35 @@ using namespace std; #define BSSN_ENABLE_MEM_USAGE_LOG 0 #endif +#if USE_CUDA_BSSN +namespace { + +bool fill_bssn_cuda_views(Block *cg, MyList *vars, + double **host_views, + double *propspeeds = nullptr, + double *soa_flat = nullptr) +{ + int idx = 0; + while (vars && idx < BSSN_CUDA_STATE_COUNT) + { + host_views[idx] = cg->fgfs[vars->data->sgfn]; + if (propspeeds) + propspeeds[idx] = vars->data->propspeed; + if (soa_flat) + { + soa_flat[3 * idx + 0] = vars->data->SoA[0]; + soa_flat[3 * idx + 1] = vars->data->SoA[1]; + soa_flat[3 * idx + 2] = vars->data->SoA[2]; + } + vars = vars->next; + ++idx; + } + return idx == BSSN_CUDA_STATE_COUNT && vars == 0; +} + +} // namespace +#endif + //================================================================================================ // define bssn_class @@ -3104,104 +3136,148 @@ void bssn_class::Step(int lev, int YN) cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); #endif - if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], - cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], - cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], - cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], - cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], - cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn], - cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn], - cg->fgfs[Lap0->sgfn], - cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn], - cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn], - cg->fgfs[phi_rhs->sgfn], cg->fgfs[trK_rhs->sgfn], - cg->fgfs[gxx_rhs->sgfn], cg->fgfs[gxy_rhs->sgfn], cg->fgfs[gxz_rhs->sgfn], - cg->fgfs[gyy_rhs->sgfn], cg->fgfs[gyz_rhs->sgfn], cg->fgfs[gzz_rhs->sgfn], - cg->fgfs[Axx_rhs->sgfn], cg->fgfs[Axy_rhs->sgfn], cg->fgfs[Axz_rhs->sgfn], - cg->fgfs[Ayy_rhs->sgfn], cg->fgfs[Ayz_rhs->sgfn], cg->fgfs[Azz_rhs->sgfn], - cg->fgfs[Gmx_rhs->sgfn], cg->fgfs[Gmy_rhs->sgfn], cg->fgfs[Gmz_rhs->sgfn], - cg->fgfs[Lap_rhs->sgfn], - cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn], - cg->fgfs[dtSfx_rhs->sgfn], cg->fgfs[dtSfy_rhs->sgfn], cg->fgfs[dtSfz_rhs->sgfn], - cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn], - cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], - cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn], - cg->fgfs[Gamxxx->sgfn], cg->fgfs[Gamxxy->sgfn], cg->fgfs[Gamxxz->sgfn], - cg->fgfs[Gamxyy->sgfn], cg->fgfs[Gamxyz->sgfn], cg->fgfs[Gamxzz->sgfn], - cg->fgfs[Gamyxx->sgfn], cg->fgfs[Gamyxy->sgfn], cg->fgfs[Gamyxz->sgfn], - cg->fgfs[Gamyyy->sgfn], cg->fgfs[Gamyyz->sgfn], cg->fgfs[Gamyzz->sgfn], - cg->fgfs[Gamzxx->sgfn], cg->fgfs[Gamzxy->sgfn], cg->fgfs[Gamzxz->sgfn], - cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn], - cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], - cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn], - cg->fgfs[Cons_Ham->sgfn], - cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], - cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], - Symmetry, lev, ndeps, pre)) - { - cout << "find NaN in domain: (" - << cg->bbox[0] << ":" << cg->bbox[3] << "," - << cg->bbox[1] << ":" << cg->bbox[4] << "," - << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; - ERROR = 1; - } - - // rk4 substep and boundary - { - MyList *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here - while (varl0) - { -#if (SommerType == 0) -#ifndef WithShell - if (lev == 0) // sommerfeld indeed - f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2], - Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], - Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], - cg->fgfs[varlrhs->data->sgfn], - cg->fgfs[varl0->data->sgfn], - varl0->data->propspeed, varl0->data->SoA, - Symmetry); - -#endif -#endif - f_rungekutta4_rout(cg->shape, dT_lev, - cg->fgfs[varl0->data->sgfn], - cg->fgfs[varl->data->sgfn], - cg->fgfs[varlrhs->data->sgfn], - iter_count); -#ifndef WithShell - if (lev > 0) // fix BD point -#endif - f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], - Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], - Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], - dT_lev, - cg->fgfs[phi0->sgfn], - cg->fgfs[Lap0->sgfn], - cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], - varl0->data->SoA, - Symmetry, cor); - -#if (SommerType == 1) -#warning "shell part still bam type" - if (lev == 0) // Shibata type sommerfeld - f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], - Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], - Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], - dT_lev, - cg->fgfs[phi0->sgfn], - cg->fgfs[Lap0->sgfn], - cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], - varl0->data->SoA, - Symmetry, pre); -#endif - - varl0 = varl0->next; - varl = varl->next; - varlrhs = varlrhs->next; - } - } - f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny); + bool used_gpu_substep = false; +#if USE_CUDA_BSSN + { + double *state_in[BSSN_CUDA_STATE_COUNT]; + double *state_out[BSSN_CUDA_STATE_COUNT]; + double *matter[BSSN_CUDA_MATTER_COUNT] = { + cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn], + cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], + cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn]}; + double propspeed[BSSN_CUDA_STATE_COUNT]; + double soa_flat[3 * BSSN_CUDA_STATE_COUNT]; + if (!fill_bssn_cuda_views(cg, StateList, state_in, propspeed, soa_flat) || + !fill_bssn_cuda_views(cg, SynchList_pre, state_out)) + { + cout << "CUDA BSSN state list mismatch on predictor step" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int apply_bam_bc = 0; +#if (SommerType == 0) +#ifndef WithShell + apply_bam_bc = (lev == 0) ? 1 : 0; +#endif +#endif + if (bssn_cuda_rk4_substep(cg, + cg->shape, cg->X[0], cg->X[1], cg->X[2], + state_in, state_out, matter, + propspeed, soa_flat, Pp->data->bbox, + dT_lev, TRK4, iter_count, apply_bam_bc, + Symmetry, lev, ndeps, pre)) + { + cout << "CUDA predictor substep failed in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + used_gpu_substep = true; + } +#endif + if (!used_gpu_substep) + { + if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], + cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], + cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], + cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn], + cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn], + cg->fgfs[Lap0->sgfn], + cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn], + cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn], + cg->fgfs[phi_rhs->sgfn], cg->fgfs[trK_rhs->sgfn], + cg->fgfs[gxx_rhs->sgfn], cg->fgfs[gxy_rhs->sgfn], cg->fgfs[gxz_rhs->sgfn], + cg->fgfs[gyy_rhs->sgfn], cg->fgfs[gyz_rhs->sgfn], cg->fgfs[gzz_rhs->sgfn], + cg->fgfs[Axx_rhs->sgfn], cg->fgfs[Axy_rhs->sgfn], cg->fgfs[Axz_rhs->sgfn], + cg->fgfs[Ayy_rhs->sgfn], cg->fgfs[Ayz_rhs->sgfn], cg->fgfs[Azz_rhs->sgfn], + cg->fgfs[Gmx_rhs->sgfn], cg->fgfs[Gmy_rhs->sgfn], cg->fgfs[Gmz_rhs->sgfn], + cg->fgfs[Lap_rhs->sgfn], + cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn], + cg->fgfs[dtSfx_rhs->sgfn], cg->fgfs[dtSfy_rhs->sgfn], cg->fgfs[dtSfz_rhs->sgfn], + cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn], + cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], + cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn], + cg->fgfs[Gamxxx->sgfn], cg->fgfs[Gamxxy->sgfn], cg->fgfs[Gamxxz->sgfn], + cg->fgfs[Gamxyy->sgfn], cg->fgfs[Gamxyz->sgfn], cg->fgfs[Gamxzz->sgfn], + cg->fgfs[Gamyxx->sgfn], cg->fgfs[Gamyxy->sgfn], cg->fgfs[Gamyxz->sgfn], + cg->fgfs[Gamyyy->sgfn], cg->fgfs[Gamyyz->sgfn], cg->fgfs[Gamyzz->sgfn], + cg->fgfs[Gamzxx->sgfn], cg->fgfs[Gamzxy->sgfn], cg->fgfs[Gamzxz->sgfn], + cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn], + cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], + cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn], + cg->fgfs[Cons_Ham->sgfn], + cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], + cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], + Symmetry, lev, ndeps, pre)) + { + cout << "find NaN in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + } + + // rk4 substep boundary fix + { + MyList *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here + while (varl0) + { +#if !USE_CUDA_BSSN +#if (SommerType == 0) +#ifndef WithShell + if (lev == 0) // sommerfeld indeed + f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2], + Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], + Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], + cg->fgfs[varlrhs->data->sgfn], + cg->fgfs[varl0->data->sgfn], + varl0->data->propspeed, varl0->data->SoA, + Symmetry); + +#endif +#endif + f_rungekutta4_rout(cg->shape, dT_lev, + cg->fgfs[varl0->data->sgfn], + cg->fgfs[varl->data->sgfn], + cg->fgfs[varlrhs->data->sgfn], + iter_count); +#endif +#ifndef WithShell + if (lev > 0) // fix BD point +#endif + f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], + Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], + Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], + dT_lev, + cg->fgfs[phi0->sgfn], + cg->fgfs[Lap0->sgfn], + cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], + varl0->data->SoA, + Symmetry, cor); + +#if (SommerType == 1) +#warning "shell part still bam type" + if (lev == 0) // Shibata type sommerfeld + f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], + Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], + Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], + dT_lev, + cg->fgfs[phi0->sgfn], + cg->fgfs[Lap0->sgfn], + cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn], + varl0->data->SoA, + Symmetry, pre); +#endif + + varl0 = varl0->next; + varl = varl->next; + varlrhs = varlrhs->next; + } + } + f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny); } if (BP == Pp->data->ble) break; @@ -3469,102 +3545,148 @@ void bssn_class::Step(int lev, int YN) cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); #endif - if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], - cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn], - cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], - cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], - cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], - cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn], - cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn], - cg->fgfs[Lap->sgfn], - cg->fgfs[Sfx->sgfn], cg->fgfs[Sfy->sgfn], cg->fgfs[Sfz->sgfn], - cg->fgfs[dtSfx->sgfn], cg->fgfs[dtSfy->sgfn], cg->fgfs[dtSfz->sgfn], - cg->fgfs[phi1->sgfn], cg->fgfs[trK1->sgfn], - cg->fgfs[gxx1->sgfn], cg->fgfs[gxy1->sgfn], cg->fgfs[gxz1->sgfn], - cg->fgfs[gyy1->sgfn], cg->fgfs[gyz1->sgfn], cg->fgfs[gzz1->sgfn], - cg->fgfs[Axx1->sgfn], cg->fgfs[Axy1->sgfn], cg->fgfs[Axz1->sgfn], - cg->fgfs[Ayy1->sgfn], cg->fgfs[Ayz1->sgfn], cg->fgfs[Azz1->sgfn], - cg->fgfs[Gmx1->sgfn], cg->fgfs[Gmy1->sgfn], cg->fgfs[Gmz1->sgfn], - cg->fgfs[Lap1->sgfn], - cg->fgfs[Sfx1->sgfn], cg->fgfs[Sfy1->sgfn], cg->fgfs[Sfz1->sgfn], - cg->fgfs[dtSfx1->sgfn], cg->fgfs[dtSfy1->sgfn], cg->fgfs[dtSfz1->sgfn], - cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn], - cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], - cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn], - cg->fgfs[Gamxxx->sgfn], cg->fgfs[Gamxxy->sgfn], cg->fgfs[Gamxxz->sgfn], - cg->fgfs[Gamxyy->sgfn], cg->fgfs[Gamxyz->sgfn], cg->fgfs[Gamxzz->sgfn], - cg->fgfs[Gamyxx->sgfn], cg->fgfs[Gamyxy->sgfn], cg->fgfs[Gamyxz->sgfn], - cg->fgfs[Gamyyy->sgfn], cg->fgfs[Gamyyz->sgfn], cg->fgfs[Gamyzz->sgfn], - cg->fgfs[Gamzxx->sgfn], cg->fgfs[Gamzxy->sgfn], cg->fgfs[Gamzxz->sgfn], - cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn], - cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], - cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn], - cg->fgfs[Cons_Ham->sgfn], - cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], - cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], - Symmetry, lev, ndeps, cor)) - { - cout << "find NaN in domain: (" - << cg->bbox[0] << ":" << cg->bbox[3] << "," - << cg->bbox[1] << ":" << cg->bbox[4] << "," - << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; - ERROR = 1; - } - // rk4 substep and boundary - { - MyList *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; // we do not check the correspondence here - while (varl0) - { -#if (SommerType == 0) -#ifndef WithShell - if (lev == 0) // sommerfeld indeed - f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2], - Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], - Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], - cg->fgfs[varl1->data->sgfn], - cg->fgfs[varl->data->sgfn], varl0->data->propspeed, varl0->data->SoA, - Symmetry); -#endif -#endif - f_rungekutta4_rout(cg->shape, dT_lev, - cg->fgfs[varl0->data->sgfn], - cg->fgfs[varl1->data->sgfn], - cg->fgfs[varlrhs->data->sgfn], - iter_count); - -#ifndef WithShell - if (lev > 0) // fix BD point -#endif - f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], - Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], - Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], - dT_lev, - cg->fgfs[phi0->sgfn], - cg->fgfs[Lap0->sgfn], - cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn], - varl0->data->SoA, - Symmetry, cor); - -#if (SommerType == 1) - if (lev == 1) // shibata type sommerfeld - f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], - Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], - Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], - dT_lev, - cg->fgfs[phi0->sgfn], - cg->fgfs[Lap0->sgfn], - cg->fgfs[varl->data->sgfn], cg->fgfs[varl1->data->sgfn], - varl0->data->SoA, - Symmetry, cor); -#endif - - varl0 = varl0->next; - varl = varl->next; - varl1 = varl1->next; - varlrhs = varlrhs->next; - } - } - f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny); + bool used_gpu_substep = false; +#if USE_CUDA_BSSN + { + double *state_in[BSSN_CUDA_STATE_COUNT]; + double *state_out[BSSN_CUDA_STATE_COUNT]; + double *matter[BSSN_CUDA_MATTER_COUNT] = { + cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn], + cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], + cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn]}; + double propspeed[BSSN_CUDA_STATE_COUNT]; + double soa_flat[3 * BSSN_CUDA_STATE_COUNT]; + if (!fill_bssn_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) || + !fill_bssn_cuda_views(cg, SynchList_cor, state_out)) + { + cout << "CUDA BSSN state list mismatch on corrector step" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int apply_bam_bc = 0; +#if (SommerType == 0) +#ifndef WithShell + apply_bam_bc = (lev == 0) ? 1 : 0; +#endif +#endif + if (bssn_cuda_rk4_substep(cg, + cg->shape, cg->X[0], cg->X[1], cg->X[2], + state_in, state_out, matter, + propspeed, soa_flat, Pp->data->bbox, + dT_lev, TRK4, iter_count, apply_bam_bc, + Symmetry, lev, ndeps, cor)) + { + cout << "CUDA corrector substep failed in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + used_gpu_substep = true; + } +#endif + if (!used_gpu_substep) + { + if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], + cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn], + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], + cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn], + cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn], + cg->fgfs[Lap->sgfn], + cg->fgfs[Sfx->sgfn], cg->fgfs[Sfy->sgfn], cg->fgfs[Sfz->sgfn], + cg->fgfs[dtSfx->sgfn], cg->fgfs[dtSfy->sgfn], cg->fgfs[dtSfz->sgfn], + cg->fgfs[phi1->sgfn], cg->fgfs[trK1->sgfn], + cg->fgfs[gxx1->sgfn], cg->fgfs[gxy1->sgfn], cg->fgfs[gxz1->sgfn], + cg->fgfs[gyy1->sgfn], cg->fgfs[gyz1->sgfn], cg->fgfs[gzz1->sgfn], + cg->fgfs[Axx1->sgfn], cg->fgfs[Axy1->sgfn], cg->fgfs[Axz1->sgfn], + cg->fgfs[Ayy1->sgfn], cg->fgfs[Ayz1->sgfn], cg->fgfs[Azz1->sgfn], + cg->fgfs[Gmx1->sgfn], cg->fgfs[Gmy1->sgfn], cg->fgfs[Gmz1->sgfn], + cg->fgfs[Lap1->sgfn], + cg->fgfs[Sfx1->sgfn], cg->fgfs[Sfy1->sgfn], cg->fgfs[Sfz1->sgfn], + cg->fgfs[dtSfx1->sgfn], cg->fgfs[dtSfy1->sgfn], cg->fgfs[dtSfz1->sgfn], + cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn], + cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], + cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn], + cg->fgfs[Gamxxx->sgfn], cg->fgfs[Gamxxy->sgfn], cg->fgfs[Gamxxz->sgfn], + cg->fgfs[Gamxyy->sgfn], cg->fgfs[Gamxyz->sgfn], cg->fgfs[Gamxzz->sgfn], + cg->fgfs[Gamyxx->sgfn], cg->fgfs[Gamyxy->sgfn], cg->fgfs[Gamyxz->sgfn], + cg->fgfs[Gamyyy->sgfn], cg->fgfs[Gamyyz->sgfn], cg->fgfs[Gamyzz->sgfn], + cg->fgfs[Gamzxx->sgfn], cg->fgfs[Gamzxy->sgfn], cg->fgfs[Gamzxz->sgfn], + cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn], + cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], + cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn], + cg->fgfs[Cons_Ham->sgfn], + cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], + cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], + Symmetry, lev, ndeps, cor)) + { + cout << "find NaN in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + } + // rk4 substep boundary fix + { + MyList *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; + // we do not check the correspondence here + while (varl0) + { +#if !USE_CUDA_BSSN +#if (SommerType == 0) +#ifndef WithShell + if (lev == 0) // sommerfeld indeed + f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2], + Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], + Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], + cg->fgfs[varl1->data->sgfn], + cg->fgfs[varl->data->sgfn], + varl0->data->propspeed, varl0->data->SoA, + Symmetry); +#endif +#endif + f_rungekutta4_rout(cg->shape, dT_lev, + cg->fgfs[varl0->data->sgfn], + cg->fgfs[varl1->data->sgfn], + cg->fgfs[varlrhs->data->sgfn], + iter_count); +#endif + +#ifndef WithShell + if (lev > 0) // fix BD point +#endif + f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], + Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], + Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], + dT_lev, + cg->fgfs[phi0->sgfn], + cg->fgfs[Lap0->sgfn], + cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn], + varl0->data->SoA, + Symmetry, cor); + +#if (SommerType == 1) + if (lev == 1) // shibata type sommerfeld + f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2], + Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2], + Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5], + dT_lev, + cg->fgfs[phi0->sgfn], + cg->fgfs[Lap0->sgfn], + cg->fgfs[varl->data->sgfn], cg->fgfs[varl1->data->sgfn], + varl0->data->SoA, + Symmetry, cor); +#endif + + varl0 = varl0->next; + varl = varl->next; + varl1 = varl1->next; + varlrhs = varlrhs->next; + } + } + f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny); } if (BP == Pp->data->ble) break; diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 640b1c1..15b5ba5 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -5,10 +5,12 @@ * Compile with nvcc, link bssn_rhs_cuda.o in place of bssn_rhs_c.o. */ +#include #include #include #include #include +#include #include #include "macrodef.h" #include "bssn_rhs.h" @@ -219,6 +221,40 @@ static const int STAGE_SLOT_COUNT = ? H2D_INPUT_SLOT_COUNT : (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT); +static constexpr int BSSN_STATE_COUNT = 24; +static constexpr int BSSN_MATTER_COUNT = 10; + +static const int k_state_input_slots[BSSN_STATE_COUNT] = { + S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz, + S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz, + S_Gamx, S_Gamy, S_Gamz, + S_Lap, S_betax, S_betay, S_betaz, + S_dtSfx, S_dtSfy, S_dtSfz +}; + +static const int k_state_rhs_slots[BSSN_STATE_COUNT] = { + S_chi_rhs, S_trK_rhs, + S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs, + S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs, + S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs, + S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs, + S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs +}; + +static const int k_matter_slots[BSSN_MATTER_COUNT] = { + S_rho, S_Sx, S_Sy, S_Sz, S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz +}; + +struct StepContext { + double *d_state0_mem; + double *d_accum_mem; + std::array d_state0; + std::array d_accum; + size_t cap_all; +}; + +static std::unordered_map g_step_ctx; + static void ensure_gpu_buffers(int nx, int ny, int nz) { size_t all = (size_t)nx * ny * nz; size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2); @@ -269,6 +305,38 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) { g_buf.prev_ny = ny; g_buf.prev_nz = nz; } + +static StepContext &ensure_step_ctx(void *block_tag, size_t all) +{ + StepContext &ctx = g_step_ctx[block_tag]; + if (ctx.cap_all < all) { + if (ctx.d_state0_mem) { + cudaFree(ctx.d_state0_mem); + ctx.d_state0_mem = nullptr; + } + if (ctx.d_accum_mem) { + cudaFree(ctx.d_accum_mem); + ctx.d_accum_mem = nullptr; + } + CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double))); + ctx.cap_all = all; + } + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all; + ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all; + } + return ctx; +} + +static void release_step_ctx(void *block_tag) +{ + auto it = g_step_ctx.find(block_tag); + if (it == g_step_ctx.end()) return; + if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem); + if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem); + g_step_ctx.erase(it); +} /* ================================================================== */ /* A. Symmetry boundary kernels (ord=2 and ord=3) */ @@ -937,6 +1005,177 @@ static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs, kern_kodis<<>>(fh, d_f_rhs, eps_val); } } + +__global__ void kern_rk4_finalize(const double * __restrict__ f0, + double * __restrict__ frhs, + double * __restrict__ accum, + double dT, + int rk4_stage) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < d_gp.all; + i += blockDim.x * gridDim.x) + { + const double rhs = frhs[i]; + switch (rk4_stage) { + case 0: + accum[i] = rhs; + frhs[i] = f0[i] + 0.5 * dT * rhs; + break; + case 1: + accum[i] += 2.0 * rhs; + frhs[i] = f0[i] + 0.5 * dT * rhs; + break; + case 2: + accum[i] += 2.0 * rhs; + frhs[i] = f0[i] + dT * rhs; + break; + default: + frhs[i] = f0[i] + (dT / 6.0) * (accum[i] + rhs); + break; + } + } +} + +__global__ void kern_lowerboundset_cuda(double * __restrict__ chi, double tinny) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < d_gp.all; + i += blockDim.x * gridDim.x) + { + if (chi[i] < tinny) chi[i] = tinny; + } +} + +enum SommerFace { + FACE_XMAX = 0, + FACE_YMAX = 1, + FACE_ZMAX = 2, + FACE_XMIN = 3, + FACE_YMIN = 4, + FACE_ZMIN = 5 +}; + +__global__ void kern_sommerfeld_face_bam(const double * __restrict__ fh, + double * __restrict__ f_rhs, + int face, + double velocity, + double x0, + double y0, + double z0) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int imin = (d_gp.Symmetry > 1 && fabs(x0) < d_gp.dX) ? 0 : 1; + const int jmin = (d_gp.Symmetry > 1 && fabs(y0) < d_gp.dY) ? 0 : 1; + const int kmin = (d_gp.Symmetry > 0 && fabs(z0) < d_gp.dZ) ? 0 : 1; + const int imax = nx, jmax = ny, kmax = nz; + const int plane_count = + (face == FACE_XMAX || face == FACE_XMIN) ? ny * nz : + (face == FACE_YMAX || face == FACE_YMIN) ? nx * nz : + nx * ny; + + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < plane_count; + tid += blockDim.x * gridDim.x) + { + int i0 = 0, j0 = 0, k0 = 0; + if (face == FACE_XMAX || face == FACE_XMIN) { + j0 = tid % ny; + k0 = tid / ny; + i0 = (face == FACE_XMAX) ? (nx - 1) : 0; + } else if (face == FACE_YMAX || face == FACE_YMIN) { + i0 = tid % nx; + k0 = tid / nx; + j0 = (face == FACE_YMAX) ? (ny - 1) : 0; + } else { + i0 = tid % nx; + j0 = tid / nx; + k0 = (face == FACE_ZMAX) ? (nz - 1) : 0; + } + + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + const int p = idx_ex_d(i0, j0, k0); + + const double x = x0 + i0 * d_gp.dX; + const double y = y0 + j0 * d_gp.dY; + const double z = z0 + k0 * d_gp.dZ; + const double r = sqrt(x * x + y * y + z * z); + if (r == 0.0) continue; + + const double wx = velocity * x / r; + const double wy = velocity * y / r; + const double wz = velocity * z / r; + + double fx = 0.0, fy = 0.0, fz = 0.0; + if (wx > 0.0) { + if (iF - 2 >= imin) fx = d_gp.d2dx * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF - 2, jF, kF)]); + else if (iF - 1 >= imin) fx = d_gp.d2dx * (-fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF + 1, jF, kF)]); + else fx = d_gp.d2dx * (-fh[idx_fh2(iF + 2, jF, kF)] + 4.0 * fh[idx_fh2(iF + 1, jF, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]); + } else if (wx < 0.0) { + if (iF + 2 <= imax) fx = d_gp.d2dx * (-fh[idx_fh2(iF + 2, jF, kF)] + 4.0 * fh[idx_fh2(iF + 1, jF, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]); + else if (iF + 1 <= imax) fx = d_gp.d2dx * (-fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF + 1, jF, kF)]); + else fx = d_gp.d2dx * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF - 2, jF, kF)]); + } + + if (wy > 0.0) { + if (jF - 2 >= jmin) fy = d_gp.d2dy * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF - 2, kF)]); + else if (jF - 1 >= jmin) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF + 1, kF)]); + else fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF + 2, kF)] + 4.0 * fh[idx_fh2(iF, jF + 1, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]); + } else if (wy < 0.0) { + if (jF + 2 <= jmax) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF + 2, kF)] + 4.0 * fh[idx_fh2(iF, jF + 1, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]); + else if (jF + 1 <= jmax) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF + 1, kF)]); + else fy = d_gp.d2dy * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF - 2, kF)]); + } + + if (wz > 0.0) { + if (kF - 2 >= kmin) fz = d_gp.d2dz * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF - 2)]); + else if (kF - 1 >= kmin) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF + 1)]); + else fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF + 2)] + 4.0 * fh[idx_fh2(iF, jF, kF + 1)] - 3.0 * fh[idx_fh2(iF, jF, kF)]); + } else if (wz < 0.0) { + if (kF + 2 <= kmax) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF + 2)] + 4.0 * fh[idx_fh2(iF, jF, kF + 1)] - 3.0 * fh[idx_fh2(iF, jF, kF)]); + else if (kF + 1 <= kmax) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF + 1)]); + else fz = d_gp.d2dz * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF - 2)]); + } + + f_rhs[p] = -velocity * (fx * x + fy * y + fz * z + fh[idx_fh2(iF, jF, kF)]) / r; + } +} + +static void gpu_sommerfeld_routbam(double *d_f0, double *d_f_rhs, + double velocity, + double SoA0, double SoA1, double SoA2, + double *X, double *Y, double *Z, + const double *bbox, + int Symmetry) +{ + if (velocity == 0.0) return; + const int nx = g_buf.prev_nx; + const int ny = g_buf.prev_ny; + const int nz = g_buf.prev_nz; + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + const bool touch_xmax = fabs(X[nx - 1] - bbox[3]) < dX; + const bool touch_ymax = fabs(Y[ny - 1] - bbox[4]) < dY; + const bool touch_zmax = fabs(Z[nz - 1] - bbox[5]) < dZ; + const bool touch_xmin = fabs(X[0] - bbox[0]) < dX && + !(Symmetry == 2 && fabs(bbox[0]) < dX * 0.5); + const bool touch_ymin = fabs(Y[0] - bbox[1]) < dY && + !(Symmetry == 2 && fabs(bbox[1]) < dY * 0.5); + const bool touch_zmin = fabs(Z[0] - bbox[2]) < dZ && + !(Symmetry > 0 && fabs(bbox[2]) < dZ * 0.5); + + const size_t w_pack = (size_t)(nx + 2) * (ny + 2) * (nz + 2); + kern_symbd_pack_ord2<<>>(d_f0, g_buf.d_fh2, SoA0, SoA1, SoA2); + if (touch_xmax) kern_sommerfeld_face_bam<<>>(g_buf.d_fh2, d_f_rhs, FACE_XMAX, velocity, X[0], Y[0], Z[0]); + if (touch_ymax) kern_sommerfeld_face_bam<<>>(g_buf.d_fh2, d_f_rhs, FACE_YMAX, velocity, X[0], Y[0], Z[0]); + if (touch_zmax) kern_sommerfeld_face_bam<<>>(g_buf.d_fh2, d_f_rhs, FACE_ZMAX, velocity, X[0], Y[0], Z[0]); + if (touch_xmin) kern_sommerfeld_face_bam<<>>(g_buf.d_fh2, d_f_rhs, FACE_XMIN, velocity, X[0], Y[0], Z[0]); + if (touch_ymin) kern_sommerfeld_face_bam<<>>(g_buf.d_fh2, d_f_rhs, FACE_YMIN, velocity, X[0], Y[0], Z[0]); + if (touch_zmin) kern_sommerfeld_face_bam<<>>(g_buf.d_fh2, d_f_rhs, FACE_ZMIN, velocity, X[0], Y[0], Z[0]); +} /* ================================================================== */ /* C. Point-wise computation kernels */ @@ -2075,14 +2314,435 @@ void kern_phase18_constraints( +uxy*my_xz+uxz*mz_xz+uyz*mz_yz - F2o3*Kz[i] - F8*PI_V*Sz_m[i]; } -} +} + +static void setup_grid_params(int *ex, + double *X, double *Y, double *Z, + int Symmetry, double eps, int co) +{ + const int nx = ex[0]; + const int ny = ex[1]; + const int nz = ex[2]; + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + const int NO_SYMM = 0; + const int EQ_SYMM = 1; + + ensure_gpu_buffers(nx, ny, nz); + + GridParams gp = {}; + gp.ex[0] = nx; + gp.ex[1] = ny; + gp.ex[2] = nz; + gp.all = nx * ny * nz; + gp.dX = dX; + gp.dY = dY; + gp.dZ = dZ; + gp.d12dx = 1.0 / (12.0 * dX); + gp.d12dy = 1.0 / (12.0 * dY); + gp.d12dz = 1.0 / (12.0 * dZ); + gp.d2dx = 1.0 / (2.0 * dX); + gp.d2dy = 1.0 / (2.0 * dY); + gp.d2dz = 1.0 / (2.0 * dZ); + gp.Fdxdx = 1.0 / (12.0 * dX * dX); + gp.Fdydy = 1.0 / (12.0 * dY * dY); + gp.Fdzdz = 1.0 / (12.0 * dZ * dZ); + gp.Sdxdx = 1.0 / (dX * dX); + gp.Sdydy = 1.0 / (dY * dY); + gp.Sdzdz = 1.0 / (dZ * dZ); + gp.Fdxdy = 1.0 / (144.0 * dX * dY); + gp.Fdxdz = 1.0 / (144.0 * dX * dZ); + gp.Fdydz = 1.0 / (144.0 * dY * dZ); + gp.Sdxdy = 0.25 / (dX * dY); + gp.Sdxdz = 0.25 / (dX * dZ); + gp.Sdydz = 0.25 / (dY * dZ); + gp.iminF = 1; + gp.jminF = 1; + gp.kminF = 1; + gp.imaxF = nx; + gp.jmaxF = ny; + gp.kmaxF = nz; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = -1; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = -1; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = -1; + gp.iminF3 = 1; + gp.jminF3 = 1; + gp.kminF3 = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = -2; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = -2; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = -2; + gp.Symmetry = Symmetry; + gp.eps = eps; + gp.co = co; + gp.fh2_nx = nx + 2; + gp.fh2_ny = ny + 2; + gp.fh2_nz = nz + 2; + gp.fh3_nx = nx + 3; + gp.fh3_ny = ny + 3; + gp.fh3_nz = nz + 3; + CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams))); +} + +static void upload_state_and_matter(double **state_host, + double **matter_host, + size_t all) +{ + static_assert(BSSN_STATE_COUNT + BSSN_MATTER_COUNT == H2D_INPUT_SLOT_COUNT, + "state + matter upload must match contiguous input slots"); + const size_t bytes = all * sizeof(double); + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes); + } + for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { + std::memcpy(g_buf.h_stage + (size_t)(BSSN_STATE_COUNT + i) * all, + matter_host[i], bytes); + } + CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage, + (size_t)H2D_INPUT_SLOT_COUNT * bytes, + cudaMemcpyHostToDevice)); +} + +static void launch_rhs_pipeline(int all, double eps, int co) +{ + const double SYM = 1.0; + const double ANTI = -1.0; + + #define D(s) g_buf.slot[s] + + kern_phase1_prep<<>>( + D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz), + D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz)); + + gpu_fderivs(D(S_betax), D(S_betaxx),D(S_betaxy),D(S_betaxz), ANTI,SYM,SYM, all); + gpu_fderivs(D(S_betay), D(S_betayx),D(S_betayy),D(S_betayz), SYM,ANTI,SYM, all); + gpu_fderivs(D(S_betaz), D(S_betazx),D(S_betazy),D(S_betazz), SYM,SYM,ANTI, all); + gpu_fderivs(D(S_chi), D(S_chix),D(S_chiy),D(S_chiz), SYM,SYM,SYM, all); + gpu_fderivs(D(S_dxx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all); + gpu_fderivs(D(S_gxy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all); + gpu_fderivs(D(S_gxz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all); + gpu_fderivs(D(S_dyy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all); + gpu_fderivs(D(S_gyz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all); + gpu_fderivs(D(S_dzz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all); + gpu_fderivs(D(S_Lap), D(S_Lapx),D(S_Lapy),D(S_Lapz), SYM,SYM,SYM, all); + gpu_fderivs(D(S_trK), D(S_Kx),D(S_Ky),D(S_Kz), SYM,SYM,SYM, all); + + kern_phase2_metric_rhs<<>>( + D(S_alpn1), D(S_chin1), + D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), + D(S_trK), + D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), + D(S_betaxx), D(S_betaxy), D(S_betaxz), + D(S_betayx), D(S_betayy), D(S_betayz), + D(S_betazx), D(S_betazy), D(S_betazz), + D(S_chi_rhs), D(S_gxx_rhs), D(S_gyy_rhs), D(S_gzz_rhs), + D(S_gxy_rhs), D(S_gyz_rhs), D(S_gxz_rhs)); + + kern_phase2_inverse<<>>( + D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz)); + + if (co == 0) { + kern_phase3_gamma_constraint<<>>( + D(S_Gamx), D(S_Gamy), D(S_Gamz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx), + D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy), + D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz), + D(S_Gmx_Res), D(S_Gmy_Res), D(S_Gmz_Res)); + } + + kern_phase4_christoffel<<>>( + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx), + D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy), + D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz), + D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz), + D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz), + D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz), + D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), + D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), + D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz)); + + kern_phase5_raise_A<<>>( + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), + D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz)); + + kern_phase6_gamma_rhs_part1<<>>( + D(S_Lapx), D(S_Lapy), D(S_Lapz), + D(S_alpn1), D(S_chin1), + D(S_chix), D(S_chiy), D(S_chiz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_Kx), D(S_Ky), D(S_Kz), + D(S_Sx), D(S_Sy), D(S_Sz), + D(S_Rxx), D(S_Rxy), D(S_Rxz), + D(S_Ryy), D(S_Ryz), D(S_Rzz), + D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz), + D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz), + D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz), + D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), + D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), + D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), + D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs)); + + gpu_fdderivs(D(S_betax), D(S_gxxx),D(S_gxyx),D(S_gxzx), + D(S_gyyx),D(S_gyzx),D(S_gzzx), ANTI,SYM,SYM, all); + gpu_fdderivs(D(S_betay), D(S_gxxy),D(S_gxyy),D(S_gxzy), + D(S_gyyy),D(S_gyzy),D(S_gzzy), SYM,ANTI,SYM, all); + gpu_fdderivs(D(S_betaz), D(S_gxxz),D(S_gxyz),D(S_gxzz), + D(S_gyyz),D(S_gyzz),D(S_gzzz), SYM,SYM,ANTI, all); + gpu_fderivs(D(S_Gamx), D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), ANTI,SYM,SYM, all); + gpu_fderivs(D(S_Gamy), D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), SYM,ANTI,SYM, all); + gpu_fderivs(D(S_Gamz), D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), SYM,SYM,ANTI, all); + + kern_phase8_gamma_rhs_part2<<>>( + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), + D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), + D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz), + D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), + D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), + D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), + D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz), + D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz), + D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz), + D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz), + D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz), + D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz), + D(S_betaxx),D(S_betaxy),D(S_betaxz), + D(S_betayx),D(S_betayy),D(S_betayz), + D(S_betazx),D(S_betazy),D(S_betazz), + D(S_Gamx_rhs),D(S_Gamy_rhs),D(S_Gamz_rhs), + D(S_Gamxa),D(S_Gamya),D(S_Gamza)); + + kern_phase9_christoffel_contract<<>>( + D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz), + D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz), + D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz), + D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz), + D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz), + D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz), + D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz), + D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), + D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), + D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz)); + + gpu_fdderivs(D(S_dxx), D(S_fxx),D(S_fxy),D(S_fxz), + D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); + kern_phase10_ricci_contract<<>>( + D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), + D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxx)); + + gpu_fdderivs(D(S_dyy), D(S_fxx),D(S_fxy),D(S_fxz), + D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); + kern_phase10_ricci_contract<<>>( + D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), + D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryy)); + + gpu_fdderivs(D(S_dzz), D(S_fxx),D(S_fxy),D(S_fxz), + D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); + kern_phase10_ricci_contract<<>>( + D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), + D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rzz)); + + gpu_fdderivs(D(S_gxy), D(S_fxx),D(S_fxy),D(S_fxz), + D(S_fyy),D(S_fyz),D(S_fzz), ANTI,ANTI,SYM, all); + kern_phase10_ricci_contract<<>>( + D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), + D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxy)); + + gpu_fdderivs(D(S_gxz), D(S_fxx),D(S_fxy),D(S_fxz), + D(S_fyy),D(S_fyz),D(S_fzz), ANTI,SYM,ANTI, all); + kern_phase10_ricci_contract<<>>( + D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), + D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxz)); + + gpu_fdderivs(D(S_gyz), D(S_fxx),D(S_fxy),D(S_fxz), + D(S_fyy),D(S_fyz),D(S_fzz), SYM,ANTI,ANTI, all); + kern_phase10_ricci_contract<<>>( + D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), + D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryz)); + + kern_phase11_ricci_diag<<>>( + D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz), + D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), + D(S_Gamxa),D(S_Gamya),D(S_Gamza), + D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), + D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), + D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), + D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz), + D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz), + D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz), + D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz), + D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz), + D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz), + D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), + D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), + D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz), + D(S_Rxx),D(S_Ryy),D(S_Rzz)); + + kern_phase11_ricci_offdiag<<>>( + D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz), + D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz), + D(S_Gamxa),D(S_Gamya),D(S_Gamza), + D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), + D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), + D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), + D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz), + D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz), + D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz), + D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz), + D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz), + D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz), + D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), + D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), + D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz), + D(S_Rxy),D(S_Rxz),D(S_Ryz)); + + gpu_fdderivs(D(S_chi), D(S_fxx),D(S_fxy),D(S_fxz), + D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); + + kern_phase13_chi_correction<<>>( + D(S_chin1), + D(S_chix), D(S_chiy), D(S_chiz), + D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz), + D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz), + D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz), + D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), + D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), + D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), + D(S_fxx), D(S_fxy), D(S_fxz), + D(S_fyy), D(S_fyz), D(S_fzz), + D(S_Rxx), D(S_Rxy), D(S_Rxz), + D(S_Ryy), D(S_Ryz), D(S_Rzz)); + + gpu_fdderivs(D(S_Lap), D(S_fxx),D(S_fxy),D(S_fxz), + D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all); + gpu_fderivs(D(S_chi), D(S_dtSfx_rhs),D(S_dtSfy_rhs),D(S_dtSfz_rhs), + SYM,SYM,SYM, all); + + kern_phase15_trK_Aij_gauge<<>>( + D(S_alpn1), D(S_chin1), + D(S_chix), D(S_chiy), D(S_chiz), + D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_trK), + D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), + D(S_Lapx), D(S_Lapy), D(S_Lapz), + D(S_betaxx), D(S_betaxy), D(S_betaxz), + D(S_betayx), D(S_betayy), D(S_betayz), + D(S_betazx), D(S_betazy), D(S_betazz), + D(S_rho), + D(S_Sx), D(S_Sy), D(S_Sz), + D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz), + D(S_dtSfx), D(S_dtSfy), D(S_dtSfz), + D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz), + D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz), + D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz), + D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz), + D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), + D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), + D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), + D(S_fxx), D(S_fxy), D(S_fxz), + D(S_fyy), D(S_fyz), D(S_fzz), + D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs), + D(S_trK_rhs), + D(S_Axx_rhs), D(S_Axy_rhs), D(S_Axz_rhs), + D(S_Ayy_rhs), D(S_Ayz_rhs), D(S_Azz_rhs), + D(S_Lap_rhs), + D(S_betax_rhs), D(S_betay_rhs), D(S_betaz_rhs), + D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs), + D(S_f_arr), D(S_S_arr)); + + gpu_lopsided_kodis(D(S_gxx), D(S_dxx), D(S_gxx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_Gamz), D(S_Gamz), D(S_Gamz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); + gpu_lopsided_kodis(D(S_gxy), D(S_gxy), D(S_gxy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all); + gpu_lopsided_kodis(D(S_Lap), D(S_Lap), D(S_Lap_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_gxz), D(S_gxz), D(S_gxz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all); + gpu_lopsided_kodis(D(S_betax), D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_gyy), D(S_dyy), D(S_gyy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_betay), D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); + gpu_lopsided_kodis(D(S_gyz), D(S_gyz), D(S_gyz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all); + gpu_lopsided_kodis(D(S_betaz), D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); + gpu_lopsided_kodis(D(S_gzz), D(S_dzz), D(S_gzz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_dtSfx), D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_Axx), D(S_Axx), D(S_Axx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_dtSfy), D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); + gpu_lopsided_kodis(D(S_Axy), D(S_Axy), D(S_Axy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all); + gpu_lopsided_kodis(D(S_dtSfz), D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); + gpu_lopsided_kodis(D(S_Axz), D(S_Axz), D(S_Axz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all); + gpu_lopsided_kodis(D(S_Ayy), D(S_Ayy), D(S_Ayy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_Ayz), D(S_Ayz), D(S_Ayz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all); + gpu_lopsided_kodis(D(S_Azz), D(S_Azz), D(S_Azz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_chi), D(S_chi), D(S_chi_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_trK), D(S_trK), D(S_trK_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_Gamx), D(S_Gamx), D(S_Gamx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_Gamy), D(S_Gamy), D(S_Gamy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); + + if (co == 0) { + gpu_fderivs(D(S_Axx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all); + gpu_fderivs(D(S_Axy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all); + gpu_fderivs(D(S_Axz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all); + gpu_fderivs(D(S_Ayy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all); + gpu_fderivs(D(S_Ayz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all); + gpu_fderivs(D(S_Azz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all); + + kern_phase18_constraints<<>>( + D(S_chin1), + D(S_chix), D(S_chiy), D(S_chiz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_trK), + D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), + D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz), + D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz), + D(S_Kx), D(S_Ky), D(S_Kz), + D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz), + D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz), + D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz), + D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz), + D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz), + D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz), + D(S_gxxx), D(S_gxxy), D(S_gxxz), + D(S_gxyx), D(S_gxyy), D(S_gxyz), + D(S_gxzx), D(S_gxzy), D(S_gxzz), + D(S_gyyx), D(S_gyyy), D(S_gyyz), + D(S_gyzx), D(S_gyzy), D(S_gyzz), + D(S_gzzx), D(S_gzzy), D(S_gzzz), + D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res)); + } + + #undef D +} + +static void download_state_outputs(double **state_host_out, size_t all) +{ + const size_t bytes = all * sizeof(double); + CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_chi_rhs], + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyDeviceToHost)); + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); + } +} + +/* ================================================================== */ +/* Main host function — drop-in replacement for bssn_rhs_c.C */ +/* ================================================================== */ -/* ================================================================== */ -/* Main host function — drop-in replacement for bssn_rhs_c.C */ -/* ================================================================== */ - -extern "C" -int f_compute_rhs_bssn(int *ex, double &T, +extern "C" +int f_compute_rhs_bssn(int *ex, double &T, double *X, double *Y, double *Z, double *chi, double *trK, double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, @@ -2563,3 +3223,76 @@ int f_compute_rhs_bssn(int *ex, double &T, #undef D return 0; } + +extern "C" +int bssn_cuda_rk4_substep(void *block_tag, + int *ex, double *X, double *Y, double *Z, + double **state_host_in, + double **state_host_out, + double **matter_host, + const double *propspeed, + const double *soa_flat, + const double *bbox, + double &dT, + double &T, + int &RK4, + int &apply_bam_bc, + int &Symmetry, + int &Lev, + double &eps, + int &co) +{ + (void)T; + (void)Lev; + + if (RK4 < 0 || RK4 > 3) return 1; + + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const size_t bytes = all * sizeof(double); + + setup_grid_params(ex, X, Y, Z, Symmetry, eps, co); + upload_state_and_matter(state_host_in, matter_host, all); + + StepContext &ctx = ensure_step_ctx(block_tag, all); + if (RK4 == 0) { + CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi], + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyDeviceToDevice)); + } + + launch_rhs_pipeline((int)all, eps, co); + + if (apply_bam_bc) { + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + gpu_sommerfeld_routbam(g_buf.slot[k_state_input_slots[i]], + g_buf.slot[k_state_rhs_slots[i]], + propspeed[i], + soa_flat[3 * i + 0], + soa_flat[3 * i + 1], + soa_flat[3 * i + 2], + X, Y, Z, bbox, Symmetry); + } + } + + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + kern_rk4_finalize<<>>(ctx.d_state0[i], + g_buf.slot[k_state_rhs_slots[i]], + ctx.d_accum[i], + dT, + RK4); + } + + download_state_outputs(state_host_out, all); + return 0; +} + +extern "C" +void bssn_cuda_release_step_ctx(void *block_tag) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + release_step_ctx(block_tag); +} diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index cec0111..4d3a41f 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -1,13 +1,18 @@ #ifndef BSSN_RHS_CUDA_H #define BSSN_RHS_CUDA_H -#ifdef __cplusplus -extern "C" { -#endif +#ifdef __cplusplus +extern "C" { +#endif + +enum { + BSSN_CUDA_STATE_COUNT = 24, + BSSN_CUDA_MATTER_COUNT = 10 +}; -int f_compute_rhs_bssn(int *ex, double &T, - double *X, double *Y, double *Z, - double *chi, double *trK, +int f_compute_rhs_bssn(int *ex, double &T, + double *X, double *Y, double *Z, + double *chi, double *trK, double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, double *Gamx, double *Gamy, double *Gamz, @@ -26,11 +31,30 @@ int f_compute_rhs_bssn(int *ex, double &T, double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, - double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, - int &Symmetry, int &Lev, double &eps, int &co); - -#ifdef __cplusplus -} + double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, + int &Symmetry, int &Lev, double &eps, int &co); + +int bssn_cuda_rk4_substep(void *block_tag, + int *ex, double *X, double *Y, double *Z, + double **state_host_in, + double **state_host_out, + double **matter_host, + const double *propspeed, + const double *soa_flat, + const double *bbox, + double &dT, + double &T, + int &RK4, + int &apply_bam_bc, + int &Symmetry, + int &Lev, + double &eps, + int &co); + +void bssn_cuda_release_step_ctx(void *block_tag); + +#ifdef __cplusplus +} #endif #endif diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index 7a8878b..6541f72 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -42,16 +42,16 @@ endif .for.o: $(f77) -c $< -o $@ -.cu.o: - $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) - -# CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers) -bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h - $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) - -# C rewrite of BSSN RHS kernel and helpers -bssn_rhs_c.o: bssn_rhs_c.C - ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ +.cu.o: + $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) + +# CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers) +bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h + $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) + +# C rewrite of BSSN RHS kernel and helpers +bssn_rhs_c.o: bssn_rhs_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ fderivs_c.o: fderivs_c.C ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ @@ -83,46 +83,49 @@ TwoPunctures.o: TwoPunctures.C TwoPunctureABE.o: TwoPunctureABE.C ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ -# Input files - -## CUDA BSSN RHS switch -## 1 : use the rewritten CUDA bssn_rhs backend -## 0 : keep the normal CPU/Fortran selection below -USE_CUDA_BSSN ?= 0 - -## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran) -ifeq ($(USE_CXX_KERNELS),0) -# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below -CFILES_CPU = -else -# C++ mode (default): C rewrite of bssn_rhs and helper kernels -CFILES_CPU = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o -endif - -CFILES_CUDA_BSSN = bssn_rhs_cuda.o - -ifeq ($(USE_CUDA_BSSN),1) -CFILES = $(CFILES_CUDA_BSSN) -else -CFILES = $(CFILES_CPU) -endif - -## RK4 kernel switch (independent from USE_CXX_KERNELS) -ifeq ($(USE_CXX_RK4),1) -RK4_C_OBJ = rungekutta4_rout_c.o -RK4_F90_OBJ = -else -RK4_C_OBJ = -RK4_F90_OBJ = rungekutta4_rout.o -endif - -CFILES += $(RK4_C_OBJ) -ABE_CUDA_CFILES = $(CFILES_CUDA_BSSN) $(RK4_C_OBJ) - -ABE_LDLIBS = $(LDLIBS) -ifeq ($(USE_CUDA_BSSN),1) -ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH) -endif +# Input files + +## CUDA BSSN RHS switch +## 1 : use the rewritten CUDA bssn_rhs backend +## 0 : keep the normal CPU/Fortran selection below +USE_CUDA_BSSN ?= 1 + +CXXAPPFLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN) +CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN) + +## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran) +ifeq ($(USE_CXX_KERNELS),0) +# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below +CFILES_CPU = +else +# C++ mode (default): C rewrite of bssn_rhs and helper kernels +CFILES_CPU = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o +endif + +CFILES_CUDA_BSSN = bssn_rhs_cuda.o + +ifeq ($(USE_CUDA_BSSN),1) +CFILES = $(CFILES_CUDA_BSSN) +else +CFILES = $(CFILES_CPU) +endif + +## RK4 kernel switch (independent from USE_CXX_KERNELS) +ifeq ($(USE_CXX_RK4),1) +RK4_C_OBJ = rungekutta4_rout_c.o +RK4_F90_OBJ = +else +RK4_C_OBJ = +RK4_F90_OBJ = rungekutta4_rout.o +endif + +CFILES += $(RK4_C_OBJ) +ABE_CUDA_CFILES = $(CFILES_CUDA_BSSN) $(RK4_C_OBJ) + +ABE_LDLIBS = $(LDLIBS) +ifeq ($(USE_CUDA_BSSN),1) +ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH) +endif C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\ @@ -171,8 +174,8 @@ TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o #CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o -# file dependences -$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh +# file dependences +$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\ @@ -195,7 +198,7 @@ $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ $(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h -$(C++FILES) $(C++FILES_GPU) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h +$(C++FILES) $(C++FILES_GPU) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h TwoPunctureFILES: TwoPunctures.h @@ -203,18 +206,18 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h misc.o : zbesh.o -# projects -ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(ABE_LDLIBS) - -ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH) - -#ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) -# $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) +# projects +ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) + $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(ABE_LDLIBS) + +ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) + $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH) + +#ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) +# $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) TwoPunctureABE: $(TwoPunctureFILES) $(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) -clean: - rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f +clean: + rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f