Add zero matter handling and interpolation for resident state in CUDA BSSN
This commit is contained in:
@@ -112,6 +112,182 @@ bool bssn_cuda_sync_bh_fields(Block *cg, var *forx, var *fory, var *forz, bool u
|
|||||||
return bssn_cuda_sync_subset(cg, 3, k_bssn_cuda_bh_state_indices, bh_fields, upload);
|
return bssn_cuda_sync_subset(cg, 3, k_bssn_cuda_bh_state_indices, bh_fields, upload);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool bssn_cuda_patch_contains_point(Patch *patch, const double *point)
|
||||||
|
{
|
||||||
|
if (!patch)
|
||||||
|
return false;
|
||||||
|
for (int d = 0; d < dim; d++)
|
||||||
|
{
|
||||||
|
const double h = patch->getdX(d);
|
||||||
|
const double lo = patch->bbox[d] + patch->lli[d] * h;
|
||||||
|
const double hi = patch->bbox[dim + d] - patch->uui[d] * h;
|
||||||
|
if (point[d] < lo || point[d] > hi)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool bssn_cuda_point_in_block(Patch *patch, Block *block,
|
||||||
|
const double *point, const double *DH)
|
||||||
|
{
|
||||||
|
if (!patch || !block)
|
||||||
|
return false;
|
||||||
|
for (int d = 0; d < dim; d++)
|
||||||
|
{
|
||||||
|
double llb;
|
||||||
|
double uub;
|
||||||
|
#ifdef Vertex
|
||||||
|
#ifdef Cell
|
||||||
|
#error Both Cell and Vertex are defined
|
||||||
|
#endif
|
||||||
|
llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
|
||||||
|
? block->bbox[d] + patch->lli[d] * DH[d]
|
||||||
|
: block->bbox[d] + (ghost_width - 0.5) * DH[d];
|
||||||
|
uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
|
||||||
|
? block->bbox[dim + d] - patch->uui[d] * DH[d]
|
||||||
|
: block->bbox[dim + d] - (ghost_width - 0.5) * DH[d];
|
||||||
|
#else
|
||||||
|
#ifdef Cell
|
||||||
|
llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
|
||||||
|
? block->bbox[d] + patch->lli[d] * DH[d]
|
||||||
|
: block->bbox[d] + ghost_width * DH[d];
|
||||||
|
uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
|
||||||
|
? block->bbox[dim + d] - patch->uui[d] * DH[d]
|
||||||
|
: block->bbox[dim + d] - ghost_width * DH[d];
|
||||||
|
#else
|
||||||
|
#error Not define Vertex nor Cell
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bssn_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn)
|
||||||
|
{
|
||||||
|
if (!coords || n <= ordn)
|
||||||
|
return 0;
|
||||||
|
int cxi = int((x - coords[0]) / dx + 0.4) + 1;
|
||||||
|
int start = cxi - ordn / 2;
|
||||||
|
if (start < 0)
|
||||||
|
start = 0;
|
||||||
|
const int max_start = n - ordn;
|
||||||
|
if (start > max_start)
|
||||||
|
start = max_start;
|
||||||
|
return start;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
|
||||||
|
int myrank,
|
||||||
|
const double *point,
|
||||||
|
var *forx, var *fory, var *forz,
|
||||||
|
int Symmetry,
|
||||||
|
double *shellf)
|
||||||
|
{
|
||||||
|
const int ordn = 2 * ghost_width;
|
||||||
|
int owner_rank = -1;
|
||||||
|
|
||||||
|
shellf[0] = shellf[1] = shellf[2] = 0.0;
|
||||||
|
|
||||||
|
MyList<Patch> *PL = PatL;
|
||||||
|
while (PL)
|
||||||
|
{
|
||||||
|
Patch *patch = PL->data;
|
||||||
|
if (!bssn_cuda_patch_contains_point(patch, point))
|
||||||
|
{
|
||||||
|
PL = PL->next;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
double DH[dim];
|
||||||
|
for (int d = 0; d < dim; d++)
|
||||||
|
DH[d] = patch->getdX(d);
|
||||||
|
|
||||||
|
MyList<Block> *BP = patch->blb;
|
||||||
|
while (BP)
|
||||||
|
{
|
||||||
|
Block *block = BP->data;
|
||||||
|
if (bssn_cuda_point_in_block(patch, block, point, DH))
|
||||||
|
{
|
||||||
|
owner_rank = block->rank;
|
||||||
|
if (myrank == owner_rank)
|
||||||
|
{
|
||||||
|
int interp_ordn = ordn;
|
||||||
|
int interp_sym = Symmetry;
|
||||||
|
double x = point[0];
|
||||||
|
double y = point[1];
|
||||||
|
double z = point[2];
|
||||||
|
|
||||||
|
if (bssn_cuda_has_resident_state(block) &&
|
||||||
|
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
|
||||||
|
{
|
||||||
|
const int sx = ordn;
|
||||||
|
const int sy = ordn;
|
||||||
|
const int sz = ordn;
|
||||||
|
const int region_all = sx * sy * sz;
|
||||||
|
const int i0 = bssn_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
|
||||||
|
const int j0 = bssn_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
|
||||||
|
const int k0 = bssn_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
|
||||||
|
double packed_fields[3 * region_all];
|
||||||
|
var *vars[3] = {forx, fory, forz};
|
||||||
|
for (int f = 0; f < 3; f++)
|
||||||
|
{
|
||||||
|
if (bssn_cuda_pack_state_region_to_host_buffer(block,
|
||||||
|
k_bssn_cuda_bh_state_indices[f],
|
||||||
|
packed_fields + f * region_all,
|
||||||
|
block->shape,
|
||||||
|
i0, j0, k0,
|
||||||
|
sx, sy, sz) != 0)
|
||||||
|
{
|
||||||
|
cout << "CUDA BH tile download failed" << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
int tile_shape[3] = {sx, sy, sz};
|
||||||
|
f_global_interp(tile_shape,
|
||||||
|
block->X[0] + i0,
|
||||||
|
block->X[1] + j0,
|
||||||
|
block->X[2] + k0,
|
||||||
|
packed_fields + f * region_all,
|
||||||
|
shellf[f],
|
||||||
|
x, y, z,
|
||||||
|
interp_ordn,
|
||||||
|
vars[f]->SoA,
|
||||||
|
interp_sym);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
|
||||||
|
block->fgfs[forx->sgfn], shellf[0],
|
||||||
|
x, y, z, interp_ordn, forx->SoA, interp_sym);
|
||||||
|
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
|
||||||
|
block->fgfs[fory->sgfn], shellf[1],
|
||||||
|
x, y, z, interp_ordn, fory->SoA, interp_sym);
|
||||||
|
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
|
||||||
|
block->fgfs[forz->sgfn], shellf[2],
|
||||||
|
x, y, z, interp_ordn, forz->SoA, interp_sym);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (BP == patch->ble)
|
||||||
|
break;
|
||||||
|
BP = BP->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (owner_rank >= 0)
|
||||||
|
break;
|
||||||
|
PL = PL->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (owner_rank < 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
|
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
|
||||||
{
|
{
|
||||||
MyList<Patch> *Pp = PatL;
|
MyList<Patch> *Pp = PatL;
|
||||||
@@ -3254,6 +3430,8 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
}
|
}
|
||||||
int apply_bam_bc = 0;
|
int apply_bam_bc = 0;
|
||||||
|
// bssn_class does not evolve matter source fields; they remain zero-initialized.
|
||||||
|
int use_zero_matter = 1;
|
||||||
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
||||||
int apply_enforce_ga = 0;
|
int apply_enforce_ga = 0;
|
||||||
#if (AGM == 0)
|
#if (AGM == 0)
|
||||||
@@ -3270,6 +3448,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
propspeed, soa_flat, Pp->data->bbox,
|
propspeed, soa_flat, Pp->data->bbox,
|
||||||
dT_lev, TRK4, iter_count, apply_bam_bc,
|
dT_lev, TRK4, iter_count, apply_bam_bc,
|
||||||
Symmetry, lev, ndeps, pre,
|
Symmetry, lev, ndeps, pre,
|
||||||
|
use_zero_matter,
|
||||||
keep_resident_state, apply_enforce_ga, chitiny))
|
keep_resident_state, apply_enforce_ga, chitiny))
|
||||||
{
|
{
|
||||||
cout << "CUDA predictor substep failed in domain: ("
|
cout << "CUDA predictor substep failed in domain: ("
|
||||||
@@ -3573,12 +3752,8 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
const bool need_analysis_state_after_predictor =
|
const bool need_analysis_state_after_predictor =
|
||||||
(lev == a_lev) && (LastAnas + dT_lev >= AnasTime);
|
(lev == a_lev) && (LastAnas + dT_lev >= AnasTime);
|
||||||
const bool need_bh_state_after_predictor =
|
|
||||||
(BH_num > 0) && (lev == GH->levels - 1);
|
|
||||||
if (use_cuda_resident_sync && need_analysis_state_after_predictor)
|
if (use_cuda_resident_sync && need_analysis_state_after_predictor)
|
||||||
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
|
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
|
||||||
else if (use_cuda_resident_sync && need_bh_state_after_predictor)
|
|
||||||
bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx, Sfy, Sfz);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
@@ -3687,6 +3862,8 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
}
|
}
|
||||||
int apply_bam_bc = 0;
|
int apply_bam_bc = 0;
|
||||||
|
// bssn_class does not evolve matter source fields; they remain zero-initialized.
|
||||||
|
int use_zero_matter = 1;
|
||||||
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
||||||
int apply_enforce_ga = 0;
|
int apply_enforce_ga = 0;
|
||||||
#if (AGM == 0)
|
#if (AGM == 0)
|
||||||
@@ -3705,6 +3882,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
propspeed, soa_flat, Pp->data->bbox,
|
propspeed, soa_flat, Pp->data->bbox,
|
||||||
dT_lev, TRK4, iter_count, apply_bam_bc,
|
dT_lev, TRK4, iter_count, apply_bam_bc,
|
||||||
Symmetry, lev, ndeps, cor,
|
Symmetry, lev, ndeps, cor,
|
||||||
|
use_zero_matter,
|
||||||
keep_resident_state, apply_enforce_ga, chitiny))
|
keep_resident_state, apply_enforce_ga, chitiny))
|
||||||
{
|
{
|
||||||
cout << "CUDA corrector substep failed in domain: ("
|
cout << "CUDA corrector substep failed in domain: ("
|
||||||
@@ -4055,11 +4233,6 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if USE_CUDA_BSSN
|
|
||||||
if (use_cuda_resident_sync && BH_num > 0 && lev == GH->levels - 1 && iter_count < 3)
|
|
||||||
bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx1, Sfy1, Sfz1);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// swap time level
|
// swap time level
|
||||||
if (iter_count < 3)
|
if (iter_count < 3)
|
||||||
{
|
{
|
||||||
@@ -7145,6 +7318,17 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
|
|||||||
|
|
||||||
int lev = ilev;
|
int lev = ilev;
|
||||||
|
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
if (bssn_cuda_use_resident_sync(lev) &&
|
||||||
|
bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf))
|
||||||
|
{
|
||||||
|
BH_RHS[n][0] = -shellf[0];
|
||||||
|
BH_RHS[n][1] = -shellf[1];
|
||||||
|
BH_RHS[n][2] = -shellf[2];
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if (PSTR == 0)
|
#if (PSTR == 0)
|
||||||
while (!Parallel::PatList_Interp_Points(GH->PatL[lev], DG_List, 1, pox, shellf, Symmetry))
|
while (!Parallel::PatList_Interp_Points(GH->PatL[lev], DG_List, 1, pox, shellf, Symmetry))
|
||||||
#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
|
#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
|
||||||
|
|||||||
@@ -4665,6 +4665,13 @@ static void upload_matter_cache(StepContext &ctx,
|
|||||||
ctx.matter_ready = true;
|
ctx.matter_ready = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void zero_matter_cache(StepContext &ctx, size_t all)
|
||||||
|
{
|
||||||
|
CUDA_CHECK(cudaMemset(ctx.d_matter_mem, 0,
|
||||||
|
(size_t)BSSN_MATTER_COUNT * all * sizeof(double)));
|
||||||
|
ctx.matter_ready = true;
|
||||||
|
}
|
||||||
|
|
||||||
static void bind_matter_slots(const StepContext &ctx)
|
static void bind_matter_slots(const StepContext &ctx)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
|
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
|
||||||
@@ -5630,6 +5637,7 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
|||||||
int &Lev,
|
int &Lev,
|
||||||
double &eps,
|
double &eps,
|
||||||
int &co,
|
int &co,
|
||||||
|
int &use_zero_matter,
|
||||||
int &keep_resident_state,
|
int &keep_resident_state,
|
||||||
int &apply_enforce_ga,
|
int &apply_enforce_ga,
|
||||||
double &chitiny)
|
double &chitiny)
|
||||||
@@ -5686,12 +5694,17 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
|||||||
|
|
||||||
t0 = profile ? cuda_profile_now_ms() : 0.0;
|
t0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||||
if (RK4 == 0) {
|
if (RK4 == 0) {
|
||||||
upload_matter_cache(ctx, matter_host, all);
|
if (use_zero_matter) {
|
||||||
|
if (!ctx.matter_ready) zero_matter_cache(ctx, all);
|
||||||
|
} else {
|
||||||
|
upload_matter_cache(ctx, matter_host, all);
|
||||||
|
}
|
||||||
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
|
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
|
||||||
(size_t)BSSN_STATE_COUNT * bytes,
|
(size_t)BSSN_STATE_COUNT * bytes,
|
||||||
cudaMemcpyDeviceToDevice));
|
cudaMemcpyDeviceToDevice));
|
||||||
} else if (!ctx.matter_ready) {
|
} else if (!ctx.matter_ready) {
|
||||||
upload_matter_cache(ctx, matter_host, all);
|
if (use_zero_matter) zero_matter_cache(ctx, all);
|
||||||
|
else upload_matter_cache(ctx, matter_host, all);
|
||||||
}
|
}
|
||||||
bind_matter_slots(ctx);
|
bind_matter_slots(ctx);
|
||||||
if (profile) {
|
if (profile) {
|
||||||
|
|||||||
@@ -50,6 +50,7 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
|||||||
int &Lev,
|
int &Lev,
|
||||||
double &eps,
|
double &eps,
|
||||||
int &co,
|
int &co,
|
||||||
|
int &use_zero_matter,
|
||||||
int &keep_resident_state,
|
int &keep_resident_state,
|
||||||
int &apply_enforce_ga,
|
int &apply_enforce_ga,
|
||||||
double &chitiny);
|
double &chitiny);
|
||||||
|
|||||||
Reference in New Issue
Block a user