Integrate CUDA support into RK4 substep execution

This commit is contained in:
2026-04-12 22:11:44 +08:00
parent 86a683de26
commit 4fa12a2009
4 changed files with 1161 additions and 279 deletions

View File

@@ -21,9 +21,12 @@ using namespace std;
#include "Ansorg.h"
#include "fmisc.h"
#include "Parallel.h"
#include "bssn_class.h"
#include "bssn_rhs.h"
#include "initial_puncture.h"
#include "bssn_class.h"
#include "bssn_rhs.h"
#if USE_CUDA_BSSN
#include "bssn_rhs_cuda.h"
#endif
#include "initial_puncture.h"
#include "enforce_algebra.h"
#include "rungekutta4_rout.h"
#include "sommerfeld_rout.h"
@@ -47,6 +50,35 @@ using namespace std;
#define BSSN_ENABLE_MEM_USAGE_LOG 0
#endif
#if USE_CUDA_BSSN
namespace {
bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
double **host_views,
double *propspeeds = nullptr,
double *soa_flat = nullptr)
{
int idx = 0;
while (vars && idx < BSSN_CUDA_STATE_COUNT)
{
host_views[idx] = cg->fgfs[vars->data->sgfn];
if (propspeeds)
propspeeds[idx] = vars->data->propspeed;
if (soa_flat)
{
soa_flat[3 * idx + 0] = vars->data->SoA[0];
soa_flat[3 * idx + 1] = vars->data->SoA[1];
soa_flat[3 * idx + 2] = vars->data->SoA[2];
}
vars = vars->next;
++idx;
}
return idx == BSSN_CUDA_STATE_COUNT && vars == 0;
}
} // namespace
#endif
//================================================================================================
// define bssn_class
@@ -3104,104 +3136,148 @@ void bssn_class::Step(int lev, int YN)
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif
if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[phi_rhs->sgfn], cg->fgfs[trK_rhs->sgfn],
cg->fgfs[gxx_rhs->sgfn], cg->fgfs[gxy_rhs->sgfn], cg->fgfs[gxz_rhs->sgfn],
cg->fgfs[gyy_rhs->sgfn], cg->fgfs[gyz_rhs->sgfn], cg->fgfs[gzz_rhs->sgfn],
cg->fgfs[Axx_rhs->sgfn], cg->fgfs[Axy_rhs->sgfn], cg->fgfs[Axz_rhs->sgfn],
cg->fgfs[Ayy_rhs->sgfn], cg->fgfs[Ayz_rhs->sgfn], cg->fgfs[Azz_rhs->sgfn],
cg->fgfs[Gmx_rhs->sgfn], cg->fgfs[Gmy_rhs->sgfn], cg->fgfs[Gmz_rhs->sgfn],
cg->fgfs[Lap_rhs->sgfn],
cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn],
cg->fgfs[dtSfx_rhs->sgfn], cg->fgfs[dtSfy_rhs->sgfn], cg->fgfs[dtSfz_rhs->sgfn],
cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
cg->fgfs[Gamxxx->sgfn], cg->fgfs[Gamxxy->sgfn], cg->fgfs[Gamxxz->sgfn],
cg->fgfs[Gamxyy->sgfn], cg->fgfs[Gamxyz->sgfn], cg->fgfs[Gamxzz->sgfn],
cg->fgfs[Gamyxx->sgfn], cg->fgfs[Gamyxy->sgfn], cg->fgfs[Gamyxz->sgfn],
cg->fgfs[Gamyyy->sgfn], cg->fgfs[Gamyyz->sgfn], cg->fgfs[Gamyzz->sgfn],
cg->fgfs[Gamzxx->sgfn], cg->fgfs[Gamzxy->sgfn], cg->fgfs[Gamzxz->sgfn],
cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn],
cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
cg->fgfs[Cons_Ham->sgfn],
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
Symmetry, lev, ndeps, pre))
{
cout << "find NaN in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
// rk4 substep and boundary
{
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here
while (varl0)
{
#if (SommerType == 0)
#ifndef WithShell
if (lev == 0) // sommerfeld indeed
f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
cg->fgfs[varlrhs->data->sgfn],
cg->fgfs[varl0->data->sgfn],
varl0->data->propspeed, varl0->data->SoA,
Symmetry);
#endif
#endif
f_rungekutta4_rout(cg->shape, dT_lev,
cg->fgfs[varl0->data->sgfn],
cg->fgfs[varl->data->sgfn],
cg->fgfs[varlrhs->data->sgfn],
iter_count);
#ifndef WithShell
if (lev > 0) // fix BD point
#endif
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
dT_lev,
cg->fgfs[phi0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn],
varl0->data->SoA,
Symmetry, cor);
#if (SommerType == 1)
#warning "shell part still bam type"
if (lev == 0) // Shibata type sommerfeld
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
dT_lev,
cg->fgfs[phi0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn],
varl0->data->SoA,
Symmetry, pre);
#endif
varl0 = varl0->next;
varl = varl->next;
varlrhs = varlrhs->next;
}
}
f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
bool used_gpu_substep = false;
#if USE_CUDA_BSSN
{
double *state_in[BSSN_CUDA_STATE_COUNT];
double *state_out[BSSN_CUDA_STATE_COUNT];
double *matter[BSSN_CUDA_MATTER_COUNT] = {
cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn]};
double propspeed[BSSN_CUDA_STATE_COUNT];
double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
!fill_bssn_cuda_views(cg, SynchList_pre, state_out))
{
cout << "CUDA BSSN state list mismatch on predictor step" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
int apply_bam_bc = 0;
#if (SommerType == 0)
#ifndef WithShell
apply_bam_bc = (lev == 0) ? 1 : 0;
#endif
#endif
if (bssn_cuda_rk4_substep(cg,
cg->shape, cg->X[0], cg->X[1], cg->X[2],
state_in, state_out, matter,
propspeed, soa_flat, Pp->data->bbox,
dT_lev, TRK4, iter_count, apply_bam_bc,
Symmetry, lev, ndeps, pre))
{
cout << "CUDA predictor substep failed in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
used_gpu_substep = true;
}
#endif
if (!used_gpu_substep)
{
if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[phi_rhs->sgfn], cg->fgfs[trK_rhs->sgfn],
cg->fgfs[gxx_rhs->sgfn], cg->fgfs[gxy_rhs->sgfn], cg->fgfs[gxz_rhs->sgfn],
cg->fgfs[gyy_rhs->sgfn], cg->fgfs[gyz_rhs->sgfn], cg->fgfs[gzz_rhs->sgfn],
cg->fgfs[Axx_rhs->sgfn], cg->fgfs[Axy_rhs->sgfn], cg->fgfs[Axz_rhs->sgfn],
cg->fgfs[Ayy_rhs->sgfn], cg->fgfs[Ayz_rhs->sgfn], cg->fgfs[Azz_rhs->sgfn],
cg->fgfs[Gmx_rhs->sgfn], cg->fgfs[Gmy_rhs->sgfn], cg->fgfs[Gmz_rhs->sgfn],
cg->fgfs[Lap_rhs->sgfn],
cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn],
cg->fgfs[dtSfx_rhs->sgfn], cg->fgfs[dtSfy_rhs->sgfn], cg->fgfs[dtSfz_rhs->sgfn],
cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
cg->fgfs[Gamxxx->sgfn], cg->fgfs[Gamxxy->sgfn], cg->fgfs[Gamxxz->sgfn],
cg->fgfs[Gamxyy->sgfn], cg->fgfs[Gamxyz->sgfn], cg->fgfs[Gamxzz->sgfn],
cg->fgfs[Gamyxx->sgfn], cg->fgfs[Gamyxy->sgfn], cg->fgfs[Gamyxz->sgfn],
cg->fgfs[Gamyyy->sgfn], cg->fgfs[Gamyyz->sgfn], cg->fgfs[Gamyzz->sgfn],
cg->fgfs[Gamzxx->sgfn], cg->fgfs[Gamzxy->sgfn], cg->fgfs[Gamzxz->sgfn],
cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn],
cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
cg->fgfs[Cons_Ham->sgfn],
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
Symmetry, lev, ndeps, pre))
{
cout << "find NaN in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
}
// rk4 substep boundary fix
{
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here
while (varl0)
{
#if !USE_CUDA_BSSN
#if (SommerType == 0)
#ifndef WithShell
if (lev == 0) // sommerfeld indeed
f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
cg->fgfs[varlrhs->data->sgfn],
cg->fgfs[varl0->data->sgfn],
varl0->data->propspeed, varl0->data->SoA,
Symmetry);
#endif
#endif
f_rungekutta4_rout(cg->shape, dT_lev,
cg->fgfs[varl0->data->sgfn],
cg->fgfs[varl->data->sgfn],
cg->fgfs[varlrhs->data->sgfn],
iter_count);
#endif
#ifndef WithShell
if (lev > 0) // fix BD point
#endif
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
dT_lev,
cg->fgfs[phi0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn],
varl0->data->SoA,
Symmetry, cor);
#if (SommerType == 1)
#warning "shell part still bam type"
if (lev == 0) // Shibata type sommerfeld
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
dT_lev,
cg->fgfs[phi0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn],
varl0->data->SoA,
Symmetry, pre);
#endif
varl0 = varl0->next;
varl = varl->next;
varlrhs = varlrhs->next;
}
}
f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
}
if (BP == Pp->data->ble)
break;
@@ -3469,102 +3545,148 @@ void bssn_class::Step(int lev, int YN)
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#endif
if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
cg->fgfs[Lap->sgfn],
cg->fgfs[Sfx->sgfn], cg->fgfs[Sfy->sgfn], cg->fgfs[Sfz->sgfn],
cg->fgfs[dtSfx->sgfn], cg->fgfs[dtSfy->sgfn], cg->fgfs[dtSfz->sgfn],
cg->fgfs[phi1->sgfn], cg->fgfs[trK1->sgfn],
cg->fgfs[gxx1->sgfn], cg->fgfs[gxy1->sgfn], cg->fgfs[gxz1->sgfn],
cg->fgfs[gyy1->sgfn], cg->fgfs[gyz1->sgfn], cg->fgfs[gzz1->sgfn],
cg->fgfs[Axx1->sgfn], cg->fgfs[Axy1->sgfn], cg->fgfs[Axz1->sgfn],
cg->fgfs[Ayy1->sgfn], cg->fgfs[Ayz1->sgfn], cg->fgfs[Azz1->sgfn],
cg->fgfs[Gmx1->sgfn], cg->fgfs[Gmy1->sgfn], cg->fgfs[Gmz1->sgfn],
cg->fgfs[Lap1->sgfn],
cg->fgfs[Sfx1->sgfn], cg->fgfs[Sfy1->sgfn], cg->fgfs[Sfz1->sgfn],
cg->fgfs[dtSfx1->sgfn], cg->fgfs[dtSfy1->sgfn], cg->fgfs[dtSfz1->sgfn],
cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
cg->fgfs[Gamxxx->sgfn], cg->fgfs[Gamxxy->sgfn], cg->fgfs[Gamxxz->sgfn],
cg->fgfs[Gamxyy->sgfn], cg->fgfs[Gamxyz->sgfn], cg->fgfs[Gamxzz->sgfn],
cg->fgfs[Gamyxx->sgfn], cg->fgfs[Gamyxy->sgfn], cg->fgfs[Gamyxz->sgfn],
cg->fgfs[Gamyyy->sgfn], cg->fgfs[Gamyyz->sgfn], cg->fgfs[Gamyzz->sgfn],
cg->fgfs[Gamzxx->sgfn], cg->fgfs[Gamzxy->sgfn], cg->fgfs[Gamzxz->sgfn],
cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn],
cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
cg->fgfs[Cons_Ham->sgfn],
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
Symmetry, lev, ndeps, cor))
{
cout << "find NaN in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
// rk4 substep and boundary
{
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; // we do not check the correspondence here
while (varl0)
{
#if (SommerType == 0)
#ifndef WithShell
if (lev == 0) // sommerfeld indeed
f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
cg->fgfs[varl1->data->sgfn],
cg->fgfs[varl->data->sgfn], varl0->data->propspeed, varl0->data->SoA,
Symmetry);
#endif
#endif
f_rungekutta4_rout(cg->shape, dT_lev,
cg->fgfs[varl0->data->sgfn],
cg->fgfs[varl1->data->sgfn],
cg->fgfs[varlrhs->data->sgfn],
iter_count);
#ifndef WithShell
if (lev > 0) // fix BD point
#endif
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
dT_lev,
cg->fgfs[phi0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn],
varl0->data->SoA,
Symmetry, cor);
#if (SommerType == 1)
if (lev == 1) // shibata type sommerfeld
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
dT_lev,
cg->fgfs[phi0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[varl->data->sgfn], cg->fgfs[varl1->data->sgfn],
varl0->data->SoA,
Symmetry, cor);
#endif
varl0 = varl0->next;
varl = varl->next;
varl1 = varl1->next;
varlrhs = varlrhs->next;
}
}
f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
bool used_gpu_substep = false;
#if USE_CUDA_BSSN
{
double *state_in[BSSN_CUDA_STATE_COUNT];
double *state_out[BSSN_CUDA_STATE_COUNT];
double *matter[BSSN_CUDA_MATTER_COUNT] = {
cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn]};
double propspeed[BSSN_CUDA_STATE_COUNT];
double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
!fill_bssn_cuda_views(cg, SynchList_cor, state_out))
{
cout << "CUDA BSSN state list mismatch on corrector step" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
int apply_bam_bc = 0;
#if (SommerType == 0)
#ifndef WithShell
apply_bam_bc = (lev == 0) ? 1 : 0;
#endif
#endif
if (bssn_cuda_rk4_substep(cg,
cg->shape, cg->X[0], cg->X[1], cg->X[2],
state_in, state_out, matter,
propspeed, soa_flat, Pp->data->bbox,
dT_lev, TRK4, iter_count, apply_bam_bc,
Symmetry, lev, ndeps, cor))
{
cout << "CUDA corrector substep failed in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
used_gpu_substep = true;
}
#endif
if (!used_gpu_substep)
{
if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
cg->fgfs[Lap->sgfn],
cg->fgfs[Sfx->sgfn], cg->fgfs[Sfy->sgfn], cg->fgfs[Sfz->sgfn],
cg->fgfs[dtSfx->sgfn], cg->fgfs[dtSfy->sgfn], cg->fgfs[dtSfz->sgfn],
cg->fgfs[phi1->sgfn], cg->fgfs[trK1->sgfn],
cg->fgfs[gxx1->sgfn], cg->fgfs[gxy1->sgfn], cg->fgfs[gxz1->sgfn],
cg->fgfs[gyy1->sgfn], cg->fgfs[gyz1->sgfn], cg->fgfs[gzz1->sgfn],
cg->fgfs[Axx1->sgfn], cg->fgfs[Axy1->sgfn], cg->fgfs[Axz1->sgfn],
cg->fgfs[Ayy1->sgfn], cg->fgfs[Ayz1->sgfn], cg->fgfs[Azz1->sgfn],
cg->fgfs[Gmx1->sgfn], cg->fgfs[Gmy1->sgfn], cg->fgfs[Gmz1->sgfn],
cg->fgfs[Lap1->sgfn],
cg->fgfs[Sfx1->sgfn], cg->fgfs[Sfy1->sgfn], cg->fgfs[Sfz1->sgfn],
cg->fgfs[dtSfx1->sgfn], cg->fgfs[dtSfy1->sgfn], cg->fgfs[dtSfz1->sgfn],
cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
cg->fgfs[Gamxxx->sgfn], cg->fgfs[Gamxxy->sgfn], cg->fgfs[Gamxxz->sgfn],
cg->fgfs[Gamxyy->sgfn], cg->fgfs[Gamxyz->sgfn], cg->fgfs[Gamxzz->sgfn],
cg->fgfs[Gamyxx->sgfn], cg->fgfs[Gamyxy->sgfn], cg->fgfs[Gamyxz->sgfn],
cg->fgfs[Gamyyy->sgfn], cg->fgfs[Gamyyz->sgfn], cg->fgfs[Gamyzz->sgfn],
cg->fgfs[Gamzxx->sgfn], cg->fgfs[Gamzxy->sgfn], cg->fgfs[Gamzxz->sgfn],
cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn],
cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
cg->fgfs[Cons_Ham->sgfn],
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
Symmetry, lev, ndeps, cor))
{
cout << "find NaN in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
}
// rk4 substep boundary fix
{
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList;
// we do not check the correspondence here
while (varl0)
{
#if !USE_CUDA_BSSN
#if (SommerType == 0)
#ifndef WithShell
if (lev == 0) // sommerfeld indeed
f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
cg->fgfs[varl1->data->sgfn],
cg->fgfs[varl->data->sgfn],
varl0->data->propspeed, varl0->data->SoA,
Symmetry);
#endif
#endif
f_rungekutta4_rout(cg->shape, dT_lev,
cg->fgfs[varl0->data->sgfn],
cg->fgfs[varl1->data->sgfn],
cg->fgfs[varlrhs->data->sgfn],
iter_count);
#endif
#ifndef WithShell
if (lev > 0) // fix BD point
#endif
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
dT_lev,
cg->fgfs[phi0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn],
varl0->data->SoA,
Symmetry, cor);
#if (SommerType == 1)
if (lev == 1) // shibata type sommerfeld
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
dT_lev,
cg->fgfs[phi0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[varl->data->sgfn], cg->fgfs[varl1->data->sgfn],
varl0->data->SoA,
Symmetry, cor);
#endif
varl0 = varl0->next;
varl = varl->next;
varl1 = varl1->next;
varlrhs = varlrhs->next;
}
}
f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
}
if (BP == Pp->data->ble)
break;

View File

@@ -5,10 +5,12 @@
* Compile with nvcc, link bssn_rhs_cuda.o in place of bssn_rhs_c.o.
*/
#include <array>
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cstring>
#include <unordered_map>
#include <cuda_runtime.h>
#include "macrodef.h"
#include "bssn_rhs.h"
@@ -219,6 +221,40 @@ static const int STAGE_SLOT_COUNT =
? H2D_INPUT_SLOT_COUNT
: (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT);
static constexpr int BSSN_STATE_COUNT = 24;
static constexpr int BSSN_MATTER_COUNT = 10;
static const int k_state_input_slots[BSSN_STATE_COUNT] = {
S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz,
S_Gamx, S_Gamy, S_Gamz,
S_Lap, S_betax, S_betay, S_betaz,
S_dtSfx, S_dtSfy, S_dtSfz
};
static const int k_state_rhs_slots[BSSN_STATE_COUNT] = {
S_chi_rhs, S_trK_rhs,
S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs,
S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs,
S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs,
S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs,
S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs
};
static const int k_matter_slots[BSSN_MATTER_COUNT] = {
S_rho, S_Sx, S_Sy, S_Sz, S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz
};
struct StepContext {
double *d_state0_mem;
double *d_accum_mem;
std::array<double *, BSSN_STATE_COUNT> d_state0;
std::array<double *, BSSN_STATE_COUNT> d_accum;
size_t cap_all;
};
static std::unordered_map<void *, StepContext> g_step_ctx;
static void ensure_gpu_buffers(int nx, int ny, int nz) {
size_t all = (size_t)nx * ny * nz;
size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2);
@@ -269,6 +305,38 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
g_buf.prev_ny = ny;
g_buf.prev_nz = nz;
}
static StepContext &ensure_step_ctx(void *block_tag, size_t all)
{
StepContext &ctx = g_step_ctx[block_tag];
if (ctx.cap_all < all) {
if (ctx.d_state0_mem) {
cudaFree(ctx.d_state0_mem);
ctx.d_state0_mem = nullptr;
}
if (ctx.d_accum_mem) {
cudaFree(ctx.d_accum_mem);
ctx.d_accum_mem = nullptr;
}
CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
ctx.cap_all = all;
}
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
}
return ctx;
}
static void release_step_ctx(void *block_tag)
{
auto it = g_step_ctx.find(block_tag);
if (it == g_step_ctx.end()) return;
if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem);
if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem);
g_step_ctx.erase(it);
}
/* ================================================================== */
/* A. Symmetry boundary kernels (ord=2 and ord=3) */
@@ -937,6 +1005,177 @@ static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
}
}
__global__ void kern_rk4_finalize(const double * __restrict__ f0,
double * __restrict__ frhs,
double * __restrict__ accum,
double dT,
int rk4_stage)
{
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < d_gp.all;
i += blockDim.x * gridDim.x)
{
const double rhs = frhs[i];
switch (rk4_stage) {
case 0:
accum[i] = rhs;
frhs[i] = f0[i] + 0.5 * dT * rhs;
break;
case 1:
accum[i] += 2.0 * rhs;
frhs[i] = f0[i] + 0.5 * dT * rhs;
break;
case 2:
accum[i] += 2.0 * rhs;
frhs[i] = f0[i] + dT * rhs;
break;
default:
frhs[i] = f0[i] + (dT / 6.0) * (accum[i] + rhs);
break;
}
}
}
__global__ void kern_lowerboundset_cuda(double * __restrict__ chi, double tinny)
{
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < d_gp.all;
i += blockDim.x * gridDim.x)
{
if (chi[i] < tinny) chi[i] = tinny;
}
}
enum SommerFace {
FACE_XMAX = 0,
FACE_YMAX = 1,
FACE_ZMAX = 2,
FACE_XMIN = 3,
FACE_YMIN = 4,
FACE_ZMIN = 5
};
__global__ void kern_sommerfeld_face_bam(const double * __restrict__ fh,
double * __restrict__ f_rhs,
int face,
double velocity,
double x0,
double y0,
double z0)
{
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
const int imin = (d_gp.Symmetry > 1 && fabs(x0) < d_gp.dX) ? 0 : 1;
const int jmin = (d_gp.Symmetry > 1 && fabs(y0) < d_gp.dY) ? 0 : 1;
const int kmin = (d_gp.Symmetry > 0 && fabs(z0) < d_gp.dZ) ? 0 : 1;
const int imax = nx, jmax = ny, kmax = nz;
const int plane_count =
(face == FACE_XMAX || face == FACE_XMIN) ? ny * nz :
(face == FACE_YMAX || face == FACE_YMIN) ? nx * nz :
nx * ny;
for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
tid < plane_count;
tid += blockDim.x * gridDim.x)
{
int i0 = 0, j0 = 0, k0 = 0;
if (face == FACE_XMAX || face == FACE_XMIN) {
j0 = tid % ny;
k0 = tid / ny;
i0 = (face == FACE_XMAX) ? (nx - 1) : 0;
} else if (face == FACE_YMAX || face == FACE_YMIN) {
i0 = tid % nx;
k0 = tid / nx;
j0 = (face == FACE_YMAX) ? (ny - 1) : 0;
} else {
i0 = tid % nx;
j0 = tid / nx;
k0 = (face == FACE_ZMAX) ? (nz - 1) : 0;
}
const int iF = i0 + 1;
const int jF = j0 + 1;
const int kF = k0 + 1;
const int p = idx_ex_d(i0, j0, k0);
const double x = x0 + i0 * d_gp.dX;
const double y = y0 + j0 * d_gp.dY;
const double z = z0 + k0 * d_gp.dZ;
const double r = sqrt(x * x + y * y + z * z);
if (r == 0.0) continue;
const double wx = velocity * x / r;
const double wy = velocity * y / r;
const double wz = velocity * z / r;
double fx = 0.0, fy = 0.0, fz = 0.0;
if (wx > 0.0) {
if (iF - 2 >= imin) fx = d_gp.d2dx * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF - 2, jF, kF)]);
else if (iF - 1 >= imin) fx = d_gp.d2dx * (-fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF + 1, jF, kF)]);
else fx = d_gp.d2dx * (-fh[idx_fh2(iF + 2, jF, kF)] + 4.0 * fh[idx_fh2(iF + 1, jF, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
} else if (wx < 0.0) {
if (iF + 2 <= imax) fx = d_gp.d2dx * (-fh[idx_fh2(iF + 2, jF, kF)] + 4.0 * fh[idx_fh2(iF + 1, jF, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
else if (iF + 1 <= imax) fx = d_gp.d2dx * (-fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF + 1, jF, kF)]);
else fx = d_gp.d2dx * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF - 2, jF, kF)]);
}
if (wy > 0.0) {
if (jF - 2 >= jmin) fy = d_gp.d2dy * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF - 2, kF)]);
else if (jF - 1 >= jmin) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF + 1, kF)]);
else fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF + 2, kF)] + 4.0 * fh[idx_fh2(iF, jF + 1, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
} else if (wy < 0.0) {
if (jF + 2 <= jmax) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF + 2, kF)] + 4.0 * fh[idx_fh2(iF, jF + 1, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
else if (jF + 1 <= jmax) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF + 1, kF)]);
else fy = d_gp.d2dy * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF - 2, kF)]);
}
if (wz > 0.0) {
if (kF - 2 >= kmin) fz = d_gp.d2dz * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF - 2)]);
else if (kF - 1 >= kmin) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF + 1)]);
else fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF + 2)] + 4.0 * fh[idx_fh2(iF, jF, kF + 1)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
} else if (wz < 0.0) {
if (kF + 2 <= kmax) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF + 2)] + 4.0 * fh[idx_fh2(iF, jF, kF + 1)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
else if (kF + 1 <= kmax) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF + 1)]);
else fz = d_gp.d2dz * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF - 2)]);
}
f_rhs[p] = -velocity * (fx * x + fy * y + fz * z + fh[idx_fh2(iF, jF, kF)]) / r;
}
}
static void gpu_sommerfeld_routbam(double *d_f0, double *d_f_rhs,
double velocity,
double SoA0, double SoA1, double SoA2,
double *X, double *Y, double *Z,
const double *bbox,
int Symmetry)
{
if (velocity == 0.0) return;
const int nx = g_buf.prev_nx;
const int ny = g_buf.prev_ny;
const int nz = g_buf.prev_nz;
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
const bool touch_xmax = fabs(X[nx - 1] - bbox[3]) < dX;
const bool touch_ymax = fabs(Y[ny - 1] - bbox[4]) < dY;
const bool touch_zmax = fabs(Z[nz - 1] - bbox[5]) < dZ;
const bool touch_xmin = fabs(X[0] - bbox[0]) < dX &&
!(Symmetry == 2 && fabs(bbox[0]) < dX * 0.5);
const bool touch_ymin = fabs(Y[0] - bbox[1]) < dY &&
!(Symmetry == 2 && fabs(bbox[1]) < dY * 0.5);
const bool touch_zmin = fabs(Z[0] - bbox[2]) < dZ &&
!(Symmetry > 0 && fabs(bbox[2]) < dZ * 0.5);
const size_t w_pack = (size_t)(nx + 2) * (ny + 2) * (nz + 2);
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f0, g_buf.d_fh2, SoA0, SoA1, SoA2);
if (touch_xmax) kern_sommerfeld_face_bam<<<grid((size_t)ny * nz), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_XMAX, velocity, X[0], Y[0], Z[0]);
if (touch_ymax) kern_sommerfeld_face_bam<<<grid((size_t)nx * nz), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_YMAX, velocity, X[0], Y[0], Z[0]);
if (touch_zmax) kern_sommerfeld_face_bam<<<grid((size_t)nx * ny), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_ZMAX, velocity, X[0], Y[0], Z[0]);
if (touch_xmin) kern_sommerfeld_face_bam<<<grid((size_t)ny * nz), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_XMIN, velocity, X[0], Y[0], Z[0]);
if (touch_ymin) kern_sommerfeld_face_bam<<<grid((size_t)nx * nz), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_YMIN, velocity, X[0], Y[0], Z[0]);
if (touch_zmin) kern_sommerfeld_face_bam<<<grid((size_t)nx * ny), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_ZMIN, velocity, X[0], Y[0], Z[0]);
}
/* ================================================================== */
/* C. Point-wise computation kernels */
@@ -2075,14 +2314,435 @@ void kern_phase18_constraints(
+uxy*my_xz+uxz*mz_xz+uyz*mz_yz
- F2o3*Kz[i] - F8*PI_V*Sz_m[i];
}
}
}
static void setup_grid_params(int *ex,
double *X, double *Y, double *Z,
int Symmetry, double eps, int co)
{
const int nx = ex[0];
const int ny = ex[1];
const int nz = ex[2];
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
const int NO_SYMM = 0;
const int EQ_SYMM = 1;
ensure_gpu_buffers(nx, ny, nz);
GridParams gp = {};
gp.ex[0] = nx;
gp.ex[1] = ny;
gp.ex[2] = nz;
gp.all = nx * ny * nz;
gp.dX = dX;
gp.dY = dY;
gp.dZ = dZ;
gp.d12dx = 1.0 / (12.0 * dX);
gp.d12dy = 1.0 / (12.0 * dY);
gp.d12dz = 1.0 / (12.0 * dZ);
gp.d2dx = 1.0 / (2.0 * dX);
gp.d2dy = 1.0 / (2.0 * dY);
gp.d2dz = 1.0 / (2.0 * dZ);
gp.Fdxdx = 1.0 / (12.0 * dX * dX);
gp.Fdydy = 1.0 / (12.0 * dY * dY);
gp.Fdzdz = 1.0 / (12.0 * dZ * dZ);
gp.Sdxdx = 1.0 / (dX * dX);
gp.Sdydy = 1.0 / (dY * dY);
gp.Sdzdz = 1.0 / (dZ * dZ);
gp.Fdxdy = 1.0 / (144.0 * dX * dY);
gp.Fdxdz = 1.0 / (144.0 * dX * dZ);
gp.Fdydz = 1.0 / (144.0 * dY * dZ);
gp.Sdxdy = 0.25 / (dX * dY);
gp.Sdxdz = 0.25 / (dX * dZ);
gp.Sdydz = 0.25 / (dY * dZ);
gp.iminF = 1;
gp.jminF = 1;
gp.kminF = 1;
gp.imaxF = nx;
gp.jmaxF = ny;
gp.kmaxF = nz;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = -1;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = -1;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = -1;
gp.iminF3 = 1;
gp.jminF3 = 1;
gp.kminF3 = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = -2;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = -2;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = -2;
gp.Symmetry = Symmetry;
gp.eps = eps;
gp.co = co;
gp.fh2_nx = nx + 2;
gp.fh2_ny = ny + 2;
gp.fh2_nz = nz + 2;
gp.fh3_nx = nx + 3;
gp.fh3_ny = ny + 3;
gp.fh3_nz = nz + 3;
CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
}
static void upload_state_and_matter(double **state_host,
double **matter_host,
size_t all)
{
static_assert(BSSN_STATE_COUNT + BSSN_MATTER_COUNT == H2D_INPUT_SLOT_COUNT,
"state + matter upload must match contiguous input slots");
const size_t bytes = all * sizeof(double);
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
}
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
std::memcpy(g_buf.h_stage + (size_t)(BSSN_STATE_COUNT + i) * all,
matter_host[i], bytes);
}
CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
(size_t)H2D_INPUT_SLOT_COUNT * bytes,
cudaMemcpyHostToDevice));
}
static void launch_rhs_pipeline(int all, double eps, int co)
{
const double SYM = 1.0;
const double ANTI = -1.0;
#define D(s) g_buf.slot[s]
kern_phase1_prep<<<grid(all),BLK>>>(
D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz),
D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz));
gpu_fderivs(D(S_betax), D(S_betaxx),D(S_betaxy),D(S_betaxz), ANTI,SYM,SYM, all);
gpu_fderivs(D(S_betay), D(S_betayx),D(S_betayy),D(S_betayz), SYM,ANTI,SYM, all);
gpu_fderivs(D(S_betaz), D(S_betazx),D(S_betazy),D(S_betazz), SYM,SYM,ANTI, all);
gpu_fderivs(D(S_chi), D(S_chix),D(S_chiy),D(S_chiz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_dxx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_gxy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all);
gpu_fderivs(D(S_gxz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all);
gpu_fderivs(D(S_dyy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_gyz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all);
gpu_fderivs(D(S_dzz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_Lap), D(S_Lapx),D(S_Lapy),D(S_Lapz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_trK), D(S_Kx),D(S_Ky),D(S_Kz), SYM,SYM,SYM, all);
kern_phase2_metric_rhs<<<grid(all),BLK>>>(
D(S_alpn1), D(S_chin1),
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
D(S_trK),
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
D(S_betaxx), D(S_betaxy), D(S_betaxz),
D(S_betayx), D(S_betayy), D(S_betayz),
D(S_betazx), D(S_betazy), D(S_betazz),
D(S_chi_rhs), D(S_gxx_rhs), D(S_gyy_rhs), D(S_gzz_rhs),
D(S_gxy_rhs), D(S_gyz_rhs), D(S_gxz_rhs));
kern_phase2_inverse<<<grid(all),BLK>>>(
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
D(S_gupxx), D(S_gupxy), D(S_gupxz),
D(S_gupyy), D(S_gupyz), D(S_gupzz));
if (co == 0) {
kern_phase3_gamma_constraint<<<grid(all),BLK>>>(
D(S_Gamx), D(S_Gamy), D(S_Gamz),
D(S_gupxx), D(S_gupxy), D(S_gupxz),
D(S_gupyy), D(S_gupyz), D(S_gupzz),
D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
D(S_Gmx_Res), D(S_Gmy_Res), D(S_Gmz_Res));
}
kern_phase4_christoffel<<<grid(all),BLK>>>(
D(S_gupxx), D(S_gupxy), D(S_gupxz),
D(S_gupyy), D(S_gupyz), D(S_gupzz),
D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz));
kern_phase5_raise_A<<<grid(all),BLK>>>(
D(S_gupxx), D(S_gupxy), D(S_gupxz),
D(S_gupyy), D(S_gupyz), D(S_gupzz),
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz));
kern_phase6_gamma_rhs_part1<<<grid(all),BLK>>>(
D(S_Lapx), D(S_Lapy), D(S_Lapz),
D(S_alpn1), D(S_chin1),
D(S_chix), D(S_chiy), D(S_chiz),
D(S_gupxx), D(S_gupxy), D(S_gupxz),
D(S_gupyy), D(S_gupyz), D(S_gupzz),
D(S_Kx), D(S_Ky), D(S_Kz),
D(S_Sx), D(S_Sy), D(S_Sz),
D(S_Rxx), D(S_Rxy), D(S_Rxz),
D(S_Ryy), D(S_Ryz), D(S_Rzz),
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs));
gpu_fdderivs(D(S_betax), D(S_gxxx),D(S_gxyx),D(S_gxzx),
D(S_gyyx),D(S_gyzx),D(S_gzzx), ANTI,SYM,SYM, all);
gpu_fdderivs(D(S_betay), D(S_gxxy),D(S_gxyy),D(S_gxzy),
D(S_gyyy),D(S_gyzy),D(S_gzzy), SYM,ANTI,SYM, all);
gpu_fdderivs(D(S_betaz), D(S_gxxz),D(S_gxyz),D(S_gxzz),
D(S_gyyz),D(S_gyzz),D(S_gzzz), SYM,SYM,ANTI, all);
gpu_fderivs(D(S_Gamx), D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), ANTI,SYM,SYM, all);
gpu_fderivs(D(S_Gamy), D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), SYM,ANTI,SYM, all);
gpu_fderivs(D(S_Gamz), D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), SYM,SYM,ANTI, all);
kern_phase8_gamma_rhs_part2<<<grid(all),BLK>>>(
D(S_gupxx), D(S_gupxy), D(S_gupxz),
D(S_gupyy), D(S_gupyz), D(S_gupzz),
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
D(S_Gamxx),D(S_Gamxy),D(S_Gamxz),
D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t),
D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t),
D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
D(S_betaxx),D(S_betaxy),D(S_betaxz),
D(S_betayx),D(S_betayy),D(S_betayz),
D(S_betazx),D(S_betazy),D(S_betazz),
D(S_Gamx_rhs),D(S_Gamy_rhs),D(S_Gamz_rhs),
D(S_Gamxa),D(S_Gamya),D(S_Gamza));
kern_phase9_christoffel_contract<<<grid(all),BLK>>>(
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
gpu_fdderivs(D(S_dxx), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxx));
gpu_fdderivs(D(S_dyy), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryy));
gpu_fdderivs(D(S_dzz), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rzz));
gpu_fdderivs(D(S_gxy), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,ANTI,SYM, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxy));
gpu_fdderivs(D(S_gxz), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), ANTI,SYM,ANTI, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Rxz));
gpu_fdderivs(D(S_gyz), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,ANTI,ANTI, all);
kern_phase10_ricci_contract<<<grid(all),BLK>>>(
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_fxx),D(S_fxy),D(S_fxz),D(S_fyy),D(S_fyz),D(S_fzz), D(S_Ryz));
kern_phase11_ricci_diag<<<grid(all),BLK>>>(
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_Gamxa),D(S_Gamya),D(S_Gamza),
D(S_Gamxx),D(S_Gamxy),D(S_Gamxz),
D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t),
D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t),
D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
D(S_Rxx),D(S_Ryy),D(S_Rzz));
kern_phase11_ricci_offdiag<<<grid(all),BLK>>>(
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
D(S_Gamxa),D(S_Gamya),D(S_Gamza),
D(S_Gamxx),D(S_Gamxy),D(S_Gamxz),
D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t),
D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t),
D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
D(S_Rxy),D(S_Rxz),D(S_Ryz));
gpu_fdderivs(D(S_chi), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
kern_phase13_chi_correction<<<grid(all),BLK>>>(
D(S_chin1),
D(S_chix), D(S_chiy), D(S_chiz),
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
D(S_gupxx), D(S_gupxy), D(S_gupxz),
D(S_gupyy), D(S_gupyz), D(S_gupzz),
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
D(S_fxx), D(S_fxy), D(S_fxz),
D(S_fyy), D(S_fyz), D(S_fzz),
D(S_Rxx), D(S_Rxy), D(S_Rxz),
D(S_Ryy), D(S_Ryz), D(S_Rzz));
gpu_fdderivs(D(S_Lap), D(S_fxx),D(S_fxy),D(S_fxz),
D(S_fyy),D(S_fyz),D(S_fzz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_chi), D(S_dtSfx_rhs),D(S_dtSfy_rhs),D(S_dtSfz_rhs),
SYM,SYM,SYM, all);
kern_phase15_trK_Aij_gauge<<<grid(all),BLK>>>(
D(S_alpn1), D(S_chin1),
D(S_chix), D(S_chiy), D(S_chiz),
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
D(S_gupxx), D(S_gupxy), D(S_gupxz),
D(S_gupyy), D(S_gupyz), D(S_gupzz),
D(S_trK),
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
D(S_Lapx), D(S_Lapy), D(S_Lapz),
D(S_betaxx), D(S_betaxy), D(S_betaxz),
D(S_betayx), D(S_betayy), D(S_betayz),
D(S_betazx), D(S_betazy), D(S_betazz),
D(S_rho),
D(S_Sx), D(S_Sy), D(S_Sz),
D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz),
D(S_dtSfx), D(S_dtSfy), D(S_dtSfz),
D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
D(S_fxx), D(S_fxy), D(S_fxz),
D(S_fyy), D(S_fyz), D(S_fzz),
D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs),
D(S_trK_rhs),
D(S_Axx_rhs), D(S_Axy_rhs), D(S_Axz_rhs),
D(S_Ayy_rhs), D(S_Ayz_rhs), D(S_Azz_rhs),
D(S_Lap_rhs),
D(S_betax_rhs), D(S_betay_rhs), D(S_betaz_rhs),
D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs),
D(S_f_arr), D(S_S_arr));
gpu_lopsided_kodis(D(S_gxx), D(S_dxx), D(S_gxx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided_kodis(D(S_Gamz), D(S_Gamz), D(S_Gamz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all);
gpu_lopsided_kodis(D(S_gxy), D(S_gxy), D(S_gxy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all);
gpu_lopsided_kodis(D(S_Lap), D(S_Lap), D(S_Lap_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided_kodis(D(S_gxz), D(S_gxz), D(S_gxz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all);
gpu_lopsided_kodis(D(S_betax), D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all);
gpu_lopsided_kodis(D(S_gyy), D(S_dyy), D(S_gyy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided_kodis(D(S_betay), D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all);
gpu_lopsided_kodis(D(S_gyz), D(S_gyz), D(S_gyz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all);
gpu_lopsided_kodis(D(S_betaz), D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all);
gpu_lopsided_kodis(D(S_gzz), D(S_dzz), D(S_gzz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided_kodis(D(S_dtSfx), D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all);
gpu_lopsided_kodis(D(S_Axx), D(S_Axx), D(S_Axx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided_kodis(D(S_dtSfy), D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all);
gpu_lopsided_kodis(D(S_Axy), D(S_Axy), D(S_Axy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all);
gpu_lopsided_kodis(D(S_dtSfz), D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all);
gpu_lopsided_kodis(D(S_Axz), D(S_Axz), D(S_Axz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all);
gpu_lopsided_kodis(D(S_Ayy), D(S_Ayy), D(S_Ayy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided_kodis(D(S_Ayz), D(S_Ayz), D(S_Ayz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all);
gpu_lopsided_kodis(D(S_Azz), D(S_Azz), D(S_Azz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided_kodis(D(S_chi), D(S_chi), D(S_chi_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided_kodis(D(S_trK), D(S_trK), D(S_trK_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided_kodis(D(S_Gamx), D(S_Gamx), D(S_Gamx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all);
gpu_lopsided_kodis(D(S_Gamy), D(S_Gamy), D(S_Gamy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all);
if (co == 0) {
gpu_fderivs(D(S_Axx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_Axy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all);
gpu_fderivs(D(S_Axz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all);
gpu_fderivs(D(S_Ayy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_Ayz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all);
gpu_fderivs(D(S_Azz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all);
kern_phase18_constraints<<<grid(all),BLK>>>(
D(S_chin1),
D(S_chix), D(S_chiy), D(S_chiz),
D(S_gupxx), D(S_gupxy), D(S_gupxz),
D(S_gupyy), D(S_gupyz), D(S_gupzz),
D(S_trK),
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz),
D(S_Kx), D(S_Ky), D(S_Kz),
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
D(S_gxxx), D(S_gxxy), D(S_gxxz),
D(S_gxyx), D(S_gxyy), D(S_gxyz),
D(S_gxzx), D(S_gxzy), D(S_gxzz),
D(S_gyyx), D(S_gyyy), D(S_gyyz),
D(S_gyzx), D(S_gyzy), D(S_gyzz),
D(S_gzzx), D(S_gzzy), D(S_gzzz),
D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res));
}
#undef D
}
static void download_state_outputs(double **state_host_out, size_t all)
{
const size_t bytes = all * sizeof(double);
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_chi_rhs],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToHost));
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
}
}
/* ================================================================== */
/* Main host function — drop-in replacement for bssn_rhs_c.C */
/* ================================================================== */
/* ================================================================== */
/* Main host function — drop-in replacement for bssn_rhs_c.C */
/* ================================================================== */
extern "C"
int f_compute_rhs_bssn(int *ex, double &T,
extern "C"
int f_compute_rhs_bssn(int *ex, double &T,
double *X, double *Y, double *Z,
double *chi, double *trK,
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
@@ -2563,3 +3223,76 @@ int f_compute_rhs_bssn(int *ex, double &T,
#undef D
return 0;
}
extern "C"
int bssn_cuda_rk4_substep(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double **state_host_out,
double **matter_host,
const double *propspeed,
const double *soa_flat,
const double *bbox,
double &dT,
double &T,
int &RK4,
int &apply_bam_bc,
int &Symmetry,
int &Lev,
double &eps,
int &co)
{
(void)T;
(void)Lev;
if (RK4 < 0 || RK4 > 3) return 1;
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const size_t bytes = all * sizeof(double);
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
upload_state_and_matter(state_host_in, matter_host, all);
StepContext &ctx = ensure_step_ctx(block_tag, all);
if (RK4 == 0) {
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToDevice));
}
launch_rhs_pipeline((int)all, eps, co);
if (apply_bam_bc) {
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
gpu_sommerfeld_routbam(g_buf.slot[k_state_input_slots[i]],
g_buf.slot[k_state_rhs_slots[i]],
propspeed[i],
soa_flat[3 * i + 0],
soa_flat[3 * i + 1],
soa_flat[3 * i + 2],
X, Y, Z, bbox, Symmetry);
}
}
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_state0[i],
g_buf.slot[k_state_rhs_slots[i]],
ctx.d_accum[i],
dT,
RK4);
}
download_state_outputs(state_host_out, all);
return 0;
}
extern "C"
void bssn_cuda_release_step_ctx(void *block_tag)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
release_step_ctx(block_tag);
}

View File

@@ -1,13 +1,18 @@
#ifndef BSSN_RHS_CUDA_H
#define BSSN_RHS_CUDA_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __cplusplus
extern "C" {
#endif
enum {
BSSN_CUDA_STATE_COUNT = 24,
BSSN_CUDA_MATTER_COUNT = 10
};
int f_compute_rhs_bssn(int *ex, double &T,
double *X, double *Y, double *Z,
double *chi, double *trK,
int f_compute_rhs_bssn(int *ex, double &T,
double *X, double *Y, double *Z,
double *chi, double *trK,
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
double *Gamx, double *Gamy, double *Gamz,
@@ -26,11 +31,30 @@ int f_compute_rhs_bssn(int *ex, double &T,
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co);
#ifdef __cplusplus
}
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co);
int bssn_cuda_rk4_substep(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double **state_host_out,
double **matter_host,
const double *propspeed,
const double *soa_flat,
const double *bbox,
double &dT,
double &T,
int &RK4,
int &apply_bam_bc,
int &Symmetry,
int &Lev,
double &eps,
int &co);
void bssn_cuda_release_step_ctx(void *block_tag);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -42,16 +42,16 @@ endif
.for.o:
$(f77) -c $< -o $@
.cu.o:
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
# CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
# C rewrite of BSSN RHS kernel and helpers
bssn_rhs_c.o: bssn_rhs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
.cu.o:
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
# CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
# C rewrite of BSSN RHS kernel and helpers
bssn_rhs_c.o: bssn_rhs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
fderivs_c.o: fderivs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
@@ -83,46 +83,49 @@ TwoPunctures.o: TwoPunctures.C
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
# Input files
## CUDA BSSN RHS switch
## 1 : use the rewritten CUDA bssn_rhs backend
## 0 : keep the normal CPU/Fortran selection below
USE_CUDA_BSSN ?= 0
## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
ifeq ($(USE_CXX_KERNELS),0)
# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
CFILES_CPU =
else
# C++ mode (default): C rewrite of bssn_rhs and helper kernels
CFILES_CPU = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
endif
CFILES_CUDA_BSSN = bssn_rhs_cuda.o
ifeq ($(USE_CUDA_BSSN),1)
CFILES = $(CFILES_CUDA_BSSN)
else
CFILES = $(CFILES_CPU)
endif
## RK4 kernel switch (independent from USE_CXX_KERNELS)
ifeq ($(USE_CXX_RK4),1)
RK4_C_OBJ = rungekutta4_rout_c.o
RK4_F90_OBJ =
else
RK4_C_OBJ =
RK4_F90_OBJ = rungekutta4_rout.o
endif
CFILES += $(RK4_C_OBJ)
ABE_CUDA_CFILES = $(CFILES_CUDA_BSSN) $(RK4_C_OBJ)
ABE_LDLIBS = $(LDLIBS)
ifeq ($(USE_CUDA_BSSN),1)
ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH)
endif
# Input files
## CUDA BSSN RHS switch
## 1 : use the rewritten CUDA bssn_rhs backend
## 0 : keep the normal CPU/Fortran selection below
USE_CUDA_BSSN ?= 1
CXXAPPFLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
ifeq ($(USE_CXX_KERNELS),0)
# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
CFILES_CPU =
else
# C++ mode (default): C rewrite of bssn_rhs and helper kernels
CFILES_CPU = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
endif
CFILES_CUDA_BSSN = bssn_rhs_cuda.o
ifeq ($(USE_CUDA_BSSN),1)
CFILES = $(CFILES_CUDA_BSSN)
else
CFILES = $(CFILES_CPU)
endif
## RK4 kernel switch (independent from USE_CXX_KERNELS)
ifeq ($(USE_CXX_RK4),1)
RK4_C_OBJ = rungekutta4_rout_c.o
RK4_F90_OBJ =
else
RK4_C_OBJ =
RK4_F90_OBJ = rungekutta4_rout.o
endif
CFILES += $(RK4_C_OBJ)
ABE_CUDA_CFILES = $(CFILES_CUDA_BSSN) $(RK4_C_OBJ)
ABE_LDLIBS = $(LDLIBS)
ifeq ($(USE_CUDA_BSSN),1)
ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH)
endif
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o bssn_class.o surface_integral.o ShellPatch.o\
@@ -171,8 +174,8 @@ TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
#CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
# file dependences
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
# file dependences
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
@@ -195,7 +198,7 @@ $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
$(C++FILES) $(C++FILES_GPU) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
$(C++FILES) $(C++FILES_GPU) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
TwoPunctureFILES: TwoPunctures.h
@@ -203,18 +206,18 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
misc.o : zbesh.o
# projects
ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(ABE_LDLIBS)
ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
#ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
# $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
# projects
ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(ABE_LDLIBS)
ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
#ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
# $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean:
rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f
clean:
rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f