Batch GA/BH subset sync with indexed GPU pack/unpack buffers
This commit is contained in:
@@ -145,6 +145,18 @@ int cuda_seg_begin(const Parallel::gridseg *seg, Block *bg, int dir)
|
|||||||
return (int)floor((seg->llb[dir] - bg->bbox[dir]) / dx + 0.5);
|
return (int)floor((seg->llb[dir] - bg->bbox[dir]) / dx + 0.5);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int cuda_state_var_count(MyList<var> *src_vars, MyList<var> *dst_vars)
|
||||||
|
{
|
||||||
|
int count = 0;
|
||||||
|
while (src_vars && dst_vars)
|
||||||
|
{
|
||||||
|
++count;
|
||||||
|
src_vars = src_vars->next;
|
||||||
|
dst_vars = dst_vars->next;
|
||||||
|
}
|
||||||
|
return (src_vars || dst_vars) ? -1 : count;
|
||||||
|
}
|
||||||
|
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type)
|
bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type)
|
||||||
{
|
{
|
||||||
@@ -163,15 +175,17 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type)
|
|||||||
bool cuda_direct_pack_segment(double *buffer,
|
bool cuda_direct_pack_segment(double *buffer,
|
||||||
const Parallel::gridseg *src,
|
const Parallel::gridseg *src,
|
||||||
const Parallel::gridseg *dst,
|
const Parallel::gridseg *dst,
|
||||||
int state_index)
|
int state_count)
|
||||||
{
|
{
|
||||||
|
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
|
||||||
|
return false;
|
||||||
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
const int i0 = cuda_seg_begin(dst, src->Bg, 0);
|
const int i0 = cuda_seg_begin(dst, src->Bg, 0);
|
||||||
const int j0 = cuda_seg_begin(dst, src->Bg, 1);
|
const int j0 = cuda_seg_begin(dst, src->Bg, 1);
|
||||||
const int k0 = cuda_seg_begin(dst, src->Bg, 2);
|
const int k0 = cuda_seg_begin(dst, src->Bg, 2);
|
||||||
const bool ok = bssn_cuda_pack_state_region_to_host_buffer(src->Bg, state_index, buffer, src->Bg->shape,
|
const bool ok = bssn_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape,
|
||||||
i0, j0, k0,
|
i0, j0, k0,
|
||||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||||
if (sync_profile_enabled())
|
if (sync_profile_enabled())
|
||||||
sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
|
sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
|
||||||
return ok;
|
return ok;
|
||||||
@@ -179,15 +193,17 @@ bool cuda_direct_pack_segment(double *buffer,
|
|||||||
|
|
||||||
bool cuda_direct_unpack_segment(double *buffer,
|
bool cuda_direct_unpack_segment(double *buffer,
|
||||||
const Parallel::gridseg *dst,
|
const Parallel::gridseg *dst,
|
||||||
int state_index)
|
int state_count)
|
||||||
{
|
{
|
||||||
|
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
|
||||||
|
return false;
|
||||||
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
|
const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
|
||||||
const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
|
const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
|
||||||
const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
|
const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
|
||||||
const bool ok = bssn_cuda_unpack_state_region_from_host_buffer(dst->Bg, state_index, buffer, dst->Bg->shape,
|
const bool ok = bssn_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape,
|
||||||
i0, j0, k0,
|
i0, j0, k0,
|
||||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||||
if (sync_profile_enabled())
|
if (sync_profile_enabled())
|
||||||
sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
|
sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
|
||||||
return ok;
|
return ok;
|
||||||
@@ -3923,15 +3939,8 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
|||||||
|
|
||||||
MyList<var> *varls, *varld;
|
MyList<var> *varls, *varld;
|
||||||
|
|
||||||
varls = VarLists;
|
const int state_count = cuda_state_var_count(VarLists, VarListd);
|
||||||
varld = VarListd;
|
if (state_count < 0)
|
||||||
while (varls && varld)
|
|
||||||
{
|
|
||||||
varls = varls->next;
|
|
||||||
varld = varld->next;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (varls || varld)
|
|
||||||
{
|
{
|
||||||
cout << "error in short data packer, var lists does not match." << endl;
|
cout << "error in short data packer, var lists does not match." << endl;
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
@@ -3961,7 +3970,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
|||||||
bool handled_by_cuda = false;
|
bool handled_by_cuda = false;
|
||||||
if (dir == PACK && cuda_can_direct_pack(src->data, dst->data, type))
|
if (dir == PACK && cuda_can_direct_pack(src->data, dst->data, type))
|
||||||
{
|
{
|
||||||
handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_idx);
|
handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count);
|
||||||
if (!handled_by_cuda)
|
if (!handled_by_cuda)
|
||||||
{
|
{
|
||||||
cout << "Parallel::data_packer: CUDA direct pack failed." << endl;
|
cout << "Parallel::data_packer: CUDA direct pack failed." << endl;
|
||||||
@@ -3970,7 +3979,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
|||||||
}
|
}
|
||||||
else if (dir == UNPACK && cuda_can_direct_unpack(dst->data, type))
|
else if (dir == UNPACK && cuda_can_direct_unpack(dst->data, type))
|
||||||
{
|
{
|
||||||
handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_idx);
|
handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_count);
|
||||||
if (!handled_by_cuda)
|
if (!handled_by_cuda)
|
||||||
{
|
{
|
||||||
cout << "Parallel::data_packer: CUDA direct unpack failed." << endl;
|
cout << "Parallel::data_packer: CUDA direct unpack failed." << endl;
|
||||||
@@ -4005,6 +4014,16 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
|||||||
dst->data->llb, dst->data->uub);
|
dst->data->llb, dst->data->uub);
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
size_out += (state_count - 1) * dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
|
||||||
|
while (varls->next && varld->next)
|
||||||
|
{
|
||||||
|
varls = varls->next;
|
||||||
|
varld = varld->next;
|
||||||
|
++state_idx;
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
|
size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
|
||||||
|
|||||||
@@ -53,6 +53,12 @@ using namespace std;
|
|||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
static const int k_bssn_cuda_bh_state_indices[3] = {18, 19, 20};
|
||||||
|
static const int k_bssn_cuda_ga_state_indices[12] = {
|
||||||
|
2, 3, 4, 5, 6, 7,
|
||||||
|
8, 9, 10, 11, 12, 13
|
||||||
|
};
|
||||||
|
|
||||||
bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
|
bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
|
||||||
double **host_views,
|
double **host_views,
|
||||||
double *propspeeds = nullptr,
|
double *propspeeds = nullptr,
|
||||||
@@ -82,11 +88,48 @@ bool bssn_cuda_use_resident_sync(int lev)
|
|||||||
(void)lev;
|
(void)lev;
|
||||||
return false;
|
return false;
|
||||||
#else
|
#else
|
||||||
return lev == 0;
|
return true;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank)
|
bool bssn_cuda_sync_subset(Block *cg,
|
||||||
|
int subset_count,
|
||||||
|
const int *state_indices,
|
||||||
|
double **host_views,
|
||||||
|
bool upload)
|
||||||
|
{
|
||||||
|
if (!cg || subset_count <= 0)
|
||||||
|
return true;
|
||||||
|
if (!bssn_cuda_has_resident_state(cg))
|
||||||
|
return true;
|
||||||
|
if (upload)
|
||||||
|
return bssn_cuda_upload_state_subset(cg, cg->shape, subset_count, state_indices, host_views) == 0;
|
||||||
|
return bssn_cuda_download_state_subset(cg, cg->shape, subset_count, state_indices, host_views) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool bssn_cuda_sync_ga_fields(Block *cg, MyList<var> *vars, bool upload)
|
||||||
|
{
|
||||||
|
double *ga_fields[12];
|
||||||
|
int idx = 0;
|
||||||
|
while (vars && idx < 12)
|
||||||
|
{
|
||||||
|
ga_fields[idx++] = cg->fgfs[vars->data->sgfn];
|
||||||
|
vars = vars->next;
|
||||||
|
}
|
||||||
|
if (idx != 12)
|
||||||
|
return false;
|
||||||
|
return bssn_cuda_sync_subset(cg, 12, k_bssn_cuda_ga_state_indices, ga_fields, upload);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool bssn_cuda_sync_bh_fields(Block *cg, var *forx, var *fory, var *forz, bool upload)
|
||||||
|
{
|
||||||
|
double *bh_fields[3] = {
|
||||||
|
cg->fgfs[forx->sgfn], cg->fgfs[fory->sgfn], cg->fgfs[forz->sgfn]
|
||||||
|
};
|
||||||
|
return bssn_cuda_sync_subset(cg, 3, k_bssn_cuda_bh_state_indices, bh_fields, upload);
|
||||||
|
}
|
||||||
|
|
||||||
|
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
|
||||||
{
|
{
|
||||||
MyList<Patch> *Pp = PatL;
|
MyList<Patch> *Pp = PatL;
|
||||||
while (Pp)
|
while (Pp)
|
||||||
@@ -108,7 +151,32 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
|
|||||||
cout << "CUDA resident state download failed" << endl;
|
cout << "CUDA resident state download failed" << endl;
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
}
|
}
|
||||||
bssn_cuda_release_step_ctx(cg);
|
if (release_ctx)
|
||||||
|
bssn_cuda_release_step_ctx(cg);
|
||||||
|
}
|
||||||
|
if (BP == Pp->data->ble)
|
||||||
|
break;
|
||||||
|
BP = BP->next;
|
||||||
|
}
|
||||||
|
Pp = Pp->next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void bssn_cuda_sync_level_bh_fields(MyList<Patch> *PatL,
|
||||||
|
int myrank,
|
||||||
|
var *forx, var *fory, var *forz)
|
||||||
|
{
|
||||||
|
MyList<Patch> *Pp = PatL;
|
||||||
|
while (Pp)
|
||||||
|
{
|
||||||
|
MyList<Block> *BP = Pp->data->blb;
|
||||||
|
while (BP)
|
||||||
|
{
|
||||||
|
Block *cg = BP->data;
|
||||||
|
if (myrank == cg->rank && !bssn_cuda_sync_bh_fields(cg, forx, fory, forz, false))
|
||||||
|
{
|
||||||
|
cout << "CUDA BH state subset download failed" << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
}
|
}
|
||||||
if (BP == Pp->data->ble)
|
if (BP == Pp->data->ble)
|
||||||
break;
|
break;
|
||||||
@@ -3187,6 +3255,24 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
bool used_gpu_substep = false;
|
bool used_gpu_substep = false;
|
||||||
bool used_gpu_resident_state = false;
|
bool used_gpu_resident_state = false;
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
|
if (use_cuda_resident_sync)
|
||||||
|
{
|
||||||
|
if (!bssn_cuda_sync_ga_fields(cg, StateList->next->next, false))
|
||||||
|
{
|
||||||
|
cout << "CUDA predictor GA subset download failed" << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
f_enforce_ga(cg->shape,
|
||||||
|
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
||||||
|
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
|
||||||
|
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
|
||||||
|
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
|
||||||
|
if (!bssn_cuda_sync_ga_fields(cg, StateList->next->next, true))
|
||||||
|
{
|
||||||
|
cout << "CUDA predictor GA subset upload failed" << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
{
|
{
|
||||||
double *state_in[BSSN_CUDA_STATE_COUNT];
|
double *state_in[BSSN_CUDA_STATE_COUNT];
|
||||||
double *state_out[BSSN_CUDA_STATE_COUNT];
|
double *state_out[BSSN_CUDA_STATE_COUNT];
|
||||||
@@ -3206,7 +3292,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
||||||
int apply_enforce_ga = 0;
|
int apply_enforce_ga = 0;
|
||||||
#if (AGM == 0)
|
#if (AGM == 0)
|
||||||
apply_enforce_ga = 1;
|
apply_enforce_ga = use_cuda_resident_sync ? 0 : 1;
|
||||||
#endif
|
#endif
|
||||||
#if (SommerType == 0)
|
#if (SommerType == 0)
|
||||||
#ifndef WithShell
|
#ifndef WithShell
|
||||||
@@ -3519,6 +3605,17 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
#endif
|
#endif
|
||||||
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
|
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
|
||||||
|
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
const bool need_analysis_state_after_predictor =
|
||||||
|
(lev == a_lev) && (LastAnas + dT_lev >= AnasTime);
|
||||||
|
const bool need_bh_state_after_predictor =
|
||||||
|
(BH_num > 0) && (lev == GH->levels - 1);
|
||||||
|
if (use_cuda_resident_sync && need_analysis_state_after_predictor)
|
||||||
|
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
|
||||||
|
else if (use_cuda_resident_sync && need_bh_state_after_predictor)
|
||||||
|
bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx, Sfy, Sfz);
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
// Complete non-blocking error reduction and check
|
// Complete non-blocking error reduction and check
|
||||||
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
|
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
|
||||||
@@ -3609,6 +3706,24 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
bool used_gpu_substep = false;
|
bool used_gpu_substep = false;
|
||||||
bool used_gpu_resident_state = false;
|
bool used_gpu_resident_state = false;
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
|
if (use_cuda_resident_sync)
|
||||||
|
{
|
||||||
|
if (!bssn_cuda_sync_ga_fields(cg, SynchList_pre->next->next, false))
|
||||||
|
{
|
||||||
|
cout << "CUDA corrector GA subset download failed" << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
f_enforce_ga(cg->shape,
|
||||||
|
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
||||||
|
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
|
||||||
|
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
|
||||||
|
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
|
||||||
|
if (!bssn_cuda_sync_ga_fields(cg, SynchList_pre->next->next, true))
|
||||||
|
{
|
||||||
|
cout << "CUDA corrector GA subset upload failed" << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
{
|
{
|
||||||
double *state_in[BSSN_CUDA_STATE_COUNT];
|
double *state_in[BSSN_CUDA_STATE_COUNT];
|
||||||
double *state_out[BSSN_CUDA_STATE_COUNT];
|
double *state_out[BSSN_CUDA_STATE_COUNT];
|
||||||
@@ -3628,9 +3743,9 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
||||||
int apply_enforce_ga = 0;
|
int apply_enforce_ga = 0;
|
||||||
#if (AGM == 0)
|
#if (AGM == 0)
|
||||||
apply_enforce_ga = 1;
|
apply_enforce_ga = use_cuda_resident_sync ? 0 : 1;
|
||||||
#elif (AGM == 1)
|
#elif (AGM == 1)
|
||||||
apply_enforce_ga = (iter_count == 3) ? 1 : 0;
|
apply_enforce_ga = (iter_count == 3 && !use_cuda_resident_sync) ? 1 : 0;
|
||||||
#endif
|
#endif
|
||||||
#if (SommerType == 0)
|
#if (SommerType == 0)
|
||||||
#ifndef WithShell
|
#ifndef WithShell
|
||||||
@@ -3993,6 +4108,11 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
if (use_cuda_resident_sync && BH_num > 0 && lev == GH->levels - 1 && iter_count < 3)
|
||||||
|
bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx1, Sfy1, Sfz1);
|
||||||
|
#endif
|
||||||
|
|
||||||
// swap time level
|
// swap time level
|
||||||
if (iter_count < 3)
|
if (iter_count < 3)
|
||||||
{
|
{
|
||||||
@@ -4046,7 +4166,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
if (use_cuda_resident_sync)
|
if (use_cuda_resident_sync)
|
||||||
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank);
|
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
|
||||||
#endif
|
#endif
|
||||||
#if (RPS == 0)
|
#if (RPS == 0)
|
||||||
// mesh refinement boundary part
|
// mesh refinement boundary part
|
||||||
|
|||||||
@@ -362,6 +362,8 @@ static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = {
|
|||||||
S_Ayz_rhs, S_Azz_rhs, S_chi_rhs, S_trK_rhs, S_Gamx_rhs, S_Gamy_rhs
|
S_Ayz_rhs, S_Azz_rhs, S_chi_rhs, S_trK_rhs, S_Gamx_rhs, S_Gamy_rhs
|
||||||
};
|
};
|
||||||
|
|
||||||
|
__constant__ int d_subset_state_indices[BSSN_STATE_COUNT];
|
||||||
|
|
||||||
static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = {
|
static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = {
|
||||||
1, 1, 1,
|
1, 1, 1,
|
||||||
1, 1, -1,
|
1, 1, -1,
|
||||||
@@ -395,19 +397,25 @@ struct StepContext {
|
|||||||
double *d_state_curr_mem;
|
double *d_state_curr_mem;
|
||||||
double *d_state_next_mem;
|
double *d_state_next_mem;
|
||||||
double *d_matter_mem;
|
double *d_matter_mem;
|
||||||
|
double *d_comm_mem;
|
||||||
|
double *h_comm_mem;
|
||||||
std::array<double *, BSSN_STATE_COUNT> d_state0;
|
std::array<double *, BSSN_STATE_COUNT> d_state0;
|
||||||
std::array<double *, BSSN_STATE_COUNT> d_accum;
|
std::array<double *, BSSN_STATE_COUNT> d_accum;
|
||||||
std::array<double *, BSSN_STATE_COUNT> d_state_curr;
|
std::array<double *, BSSN_STATE_COUNT> d_state_curr;
|
||||||
std::array<double *, BSSN_STATE_COUNT> d_state_next;
|
std::array<double *, BSSN_STATE_COUNT> d_state_next;
|
||||||
std::array<double *, BSSN_MATTER_COUNT> d_matter;
|
std::array<double *, BSSN_MATTER_COUNT> d_matter;
|
||||||
size_t cap_all;
|
size_t cap_all;
|
||||||
|
size_t cap_comm;
|
||||||
|
bool h_comm_pinned;
|
||||||
|
size_t cap_h_comm;
|
||||||
bool matter_ready;
|
bool matter_ready;
|
||||||
bool state_ready;
|
bool state_ready;
|
||||||
|
|
||||||
StepContext()
|
StepContext()
|
||||||
: d_state0_mem(nullptr), d_accum_mem(nullptr),
|
: d_state0_mem(nullptr), d_accum_mem(nullptr),
|
||||||
d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
|
d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
|
||||||
d_matter_mem(nullptr), cap_all(0),
|
d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr),
|
||||||
|
cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0),
|
||||||
matter_ready(false), state_ready(false)
|
matter_ready(false), state_ready(false)
|
||||||
{
|
{
|
||||||
d_state0.fill(nullptr);
|
d_state0.fill(nullptr);
|
||||||
@@ -584,11 +592,65 @@ static void release_step_ctx(void *block_tag)
|
|||||||
{
|
{
|
||||||
auto it = g_step_ctx.find(block_tag);
|
auto it = g_step_ctx.find(block_tag);
|
||||||
if (it == g_step_ctx.end()) return;
|
if (it == g_step_ctx.end()) return;
|
||||||
|
if (it->second.d_comm_mem) {
|
||||||
|
cudaFree(it->second.d_comm_mem);
|
||||||
|
it->second.d_comm_mem = nullptr;
|
||||||
|
it->second.cap_comm = 0;
|
||||||
|
}
|
||||||
|
if (it->second.h_comm_mem) {
|
||||||
|
if (it->second.h_comm_pinned) cudaFreeHost(it->second.h_comm_mem);
|
||||||
|
else free(it->second.h_comm_mem);
|
||||||
|
it->second.h_comm_mem = nullptr;
|
||||||
|
it->second.h_comm_pinned = false;
|
||||||
|
it->second.cap_h_comm = 0;
|
||||||
|
}
|
||||||
StepAllocation alloc = detach_step_allocation(it->second);
|
StepAllocation alloc = detach_step_allocation(it->second);
|
||||||
recycle_step_allocation(alloc);
|
recycle_step_allocation(alloc);
|
||||||
g_step_ctx.erase(it);
|
g_step_ctx.erase(it);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static double *ensure_step_comm_buffer(StepContext &ctx, size_t needed_doubles)
|
||||||
|
{
|
||||||
|
if (needed_doubles == 0) return nullptr;
|
||||||
|
if (ctx.cap_comm < needed_doubles) {
|
||||||
|
if (ctx.d_comm_mem) {
|
||||||
|
CUDA_CHECK(cudaFree(ctx.d_comm_mem));
|
||||||
|
ctx.d_comm_mem = nullptr;
|
||||||
|
}
|
||||||
|
CUDA_CHECK(cudaMalloc(&ctx.d_comm_mem, needed_doubles * sizeof(double)));
|
||||||
|
ctx.cap_comm = needed_doubles;
|
||||||
|
}
|
||||||
|
return ctx.d_comm_mem;
|
||||||
|
}
|
||||||
|
|
||||||
|
static double *ensure_step_host_comm_buffer(StepContext &ctx, size_t needed_doubles)
|
||||||
|
{
|
||||||
|
if (needed_doubles == 0) return nullptr;
|
||||||
|
if (ctx.cap_h_comm < needed_doubles) {
|
||||||
|
if (ctx.h_comm_mem) {
|
||||||
|
if (ctx.h_comm_pinned) cudaFreeHost(ctx.h_comm_mem);
|
||||||
|
else free(ctx.h_comm_mem);
|
||||||
|
ctx.h_comm_mem = nullptr;
|
||||||
|
ctx.h_comm_pinned = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t bytes = needed_doubles * sizeof(double);
|
||||||
|
cudaError_t err = cudaMallocHost((void **)&ctx.h_comm_mem, bytes);
|
||||||
|
if (err == cudaSuccess) {
|
||||||
|
ctx.h_comm_pinned = true;
|
||||||
|
} else {
|
||||||
|
ctx.h_comm_mem = (double *)malloc(bytes);
|
||||||
|
ctx.h_comm_pinned = false;
|
||||||
|
if (!ctx.h_comm_mem) {
|
||||||
|
fprintf(stderr, "Host comm allocation failed (%zu bytes)\n", bytes);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ctx.cap_h_comm = needed_doubles;
|
||||||
|
}
|
||||||
|
return ctx.h_comm_mem;
|
||||||
|
}
|
||||||
|
|
||||||
static void upload_grid_params_if_needed(const GridParams &gp)
|
static void upload_grid_params_if_needed(const GridParams &gp)
|
||||||
{
|
{
|
||||||
if (!g_gp_host_cache_valid ||
|
if (!g_gp_host_cache_valid ||
|
||||||
@@ -1681,7 +1743,7 @@ __global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
|
|||||||
- lgxy * lgxy * lgzz
|
- lgxy * lgxy * lgzz
|
||||||
- lgxx * lgyz * lgyz;
|
- lgxx * lgyz * lgyz;
|
||||||
|
|
||||||
lscale = ONE / cbrt(lscale);
|
lscale = ONE / pow(lscale, F1O3);
|
||||||
|
|
||||||
lgxx *= lscale;
|
lgxx *= lscale;
|
||||||
lgxy *= lscale;
|
lgxy *= lscale;
|
||||||
@@ -3446,6 +3508,88 @@ static void download_state_outputs(double **state_host_out, size_t all)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
|
||||||
|
double * __restrict__ dst,
|
||||||
|
int nx, int ny,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz,
|
||||||
|
int region_all,
|
||||||
|
int state_count,
|
||||||
|
int all)
|
||||||
|
{
|
||||||
|
const size_t total = (size_t)region_all * (size_t)state_count;
|
||||||
|
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
tid < total;
|
||||||
|
tid += (size_t)blockDim.x * gridDim.x)
|
||||||
|
{
|
||||||
|
const int state_index = (int)(tid / (size_t)region_all);
|
||||||
|
const int local = (int)(tid - (size_t)state_index * region_all);
|
||||||
|
const int ii = local % sx;
|
||||||
|
const int jj = (local / sx) % sy;
|
||||||
|
const int kk = local / (sx * sy);
|
||||||
|
const int src = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
|
||||||
|
dst[tid] = src_mem[(size_t)state_index * all + src];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem,
|
||||||
|
const double * __restrict__ src,
|
||||||
|
int nx, int ny,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz,
|
||||||
|
int region_all,
|
||||||
|
int state_count,
|
||||||
|
int all)
|
||||||
|
{
|
||||||
|
const size_t total = (size_t)region_all * (size_t)state_count;
|
||||||
|
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
tid < total;
|
||||||
|
tid += (size_t)blockDim.x * gridDim.x)
|
||||||
|
{
|
||||||
|
const int state_index = (int)(tid / (size_t)region_all);
|
||||||
|
const int local = (int)(tid - (size_t)state_index * region_all);
|
||||||
|
const int ii = local % sx;
|
||||||
|
const int jj = (local / sx) % sy;
|
||||||
|
const int kk = local / (sx * sy);
|
||||||
|
const int dst = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
|
||||||
|
dst_mem[(size_t)state_index * all + dst] = src[tid];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void kern_pack_state_subset(const double * __restrict__ src_mem,
|
||||||
|
double * __restrict__ dst,
|
||||||
|
int subset_count,
|
||||||
|
int all)
|
||||||
|
{
|
||||||
|
const size_t total = (size_t)subset_count * (size_t)all;
|
||||||
|
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
tid < total;
|
||||||
|
tid += (size_t)blockDim.x * gridDim.x)
|
||||||
|
{
|
||||||
|
const int subset_slot = (int)(tid / (size_t)all);
|
||||||
|
const int state_index = d_subset_state_indices[subset_slot];
|
||||||
|
const int src = (int)(tid - (size_t)subset_slot * all);
|
||||||
|
dst[tid] = src_mem[(size_t)state_index * all + src];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void kern_unpack_state_subset(double * __restrict__ dst_mem,
|
||||||
|
const double * __restrict__ src,
|
||||||
|
int subset_count,
|
||||||
|
int all)
|
||||||
|
{
|
||||||
|
const size_t total = (size_t)subset_count * (size_t)all;
|
||||||
|
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
tid < total;
|
||||||
|
tid += (size_t)blockDim.x * gridDim.x)
|
||||||
|
{
|
||||||
|
const int subset_slot = (int)(tid / (size_t)all);
|
||||||
|
const int state_index = d_subset_state_indices[subset_slot];
|
||||||
|
const int dst = (int)(tid - (size_t)subset_slot * all);
|
||||||
|
dst_mem[(size_t)state_index * all + dst] = src[tid];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void copy_state_region_cuda(void *block_tag,
|
static void copy_state_region_cuda(void *block_tag,
|
||||||
int state_index,
|
int state_index,
|
||||||
double *host_state,
|
double *host_state,
|
||||||
@@ -3508,6 +3652,41 @@ static void copy_state_region_packed_cuda(void *block_tag,
|
|||||||
CUDA_CHECK(cudaMemcpy3D(&p));
|
CUDA_CHECK(cudaMemcpy3D(&p));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void copy_state_region_packed_batch_cuda(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *host_buffer,
|
||||||
|
const int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz,
|
||||||
|
cudaMemcpyKind kind)
|
||||||
|
{
|
||||||
|
if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
|
||||||
|
if (sx <= 0 || sy <= 0 || sz <= 0) return;
|
||||||
|
|
||||||
|
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
|
||||||
|
const int region_all = sx * sy * sz;
|
||||||
|
const size_t total_doubles = (size_t)state_count * (size_t)region_all;
|
||||||
|
double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
|
||||||
|
|
||||||
|
if (kind == cudaMemcpyDeviceToHost) {
|
||||||
|
kern_pack_state_region_batch<<<grid(total_doubles), BLK>>>(
|
||||||
|
ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
|
||||||
|
i0, j0, k0, sx, sy, sz, region_all, state_count,
|
||||||
|
ex[0] * ex[1] * ex[2]);
|
||||||
|
CUDA_CHECK(cudaMemcpy(host_buffer, d_comm,
|
||||||
|
total_doubles * sizeof(double),
|
||||||
|
cudaMemcpyDeviceToHost));
|
||||||
|
} else {
|
||||||
|
CUDA_CHECK(cudaMemcpy(d_comm, host_buffer,
|
||||||
|
total_doubles * sizeof(double),
|
||||||
|
cudaMemcpyHostToDevice));
|
||||||
|
kern_unpack_state_region_batch<<<grid(total_doubles), BLK>>>(
|
||||||
|
ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
|
||||||
|
i0, j0, k0, sx, sy, sz, region_all, state_count,
|
||||||
|
ex[0] * ex[1] * ex[2]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
|
static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
|
||||||
{
|
{
|
||||||
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||||
@@ -3521,6 +3700,63 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void copy_state_subset(void *block_tag,
|
||||||
|
int *ex,
|
||||||
|
int subset_count,
|
||||||
|
const int *state_indices,
|
||||||
|
double **state_host,
|
||||||
|
cudaMemcpyKind kind)
|
||||||
|
{
|
||||||
|
if (subset_count <= 0) return;
|
||||||
|
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||||
|
const size_t bytes = all * sizeof(double);
|
||||||
|
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
||||||
|
int active_state_indices[BSSN_STATE_COUNT];
|
||||||
|
double *active_state_host[BSSN_STATE_COUNT];
|
||||||
|
int active_count = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < subset_count; ++i) {
|
||||||
|
const int state_index = state_indices[i];
|
||||||
|
if (state_index < 0 || state_index >= BSSN_STATE_COUNT) continue;
|
||||||
|
if (!state_host[i]) continue;
|
||||||
|
active_state_indices[active_count] = state_index;
|
||||||
|
active_state_host[active_count] = state_host[i];
|
||||||
|
++active_count;
|
||||||
|
}
|
||||||
|
if (active_count <= 0) return;
|
||||||
|
|
||||||
|
const size_t total_doubles = (size_t)active_count * all;
|
||||||
|
double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
|
||||||
|
double *h_comm = ensure_step_host_comm_buffer(ctx, total_doubles);
|
||||||
|
|
||||||
|
CUDA_CHECK(cudaMemcpyToSymbol(d_subset_state_indices, active_state_indices,
|
||||||
|
(size_t)active_count * sizeof(int),
|
||||||
|
0, cudaMemcpyHostToDevice));
|
||||||
|
if (kind == cudaMemcpyDeviceToHost) {
|
||||||
|
kern_pack_state_subset<<<grid(total_doubles), BLK>>>(
|
||||||
|
ctx.d_state_curr_mem, d_comm, active_count, (int)all);
|
||||||
|
CUDA_CHECK(cudaMemcpy(h_comm, d_comm,
|
||||||
|
total_doubles * sizeof(double),
|
||||||
|
cudaMemcpyDeviceToHost));
|
||||||
|
for (int i = 0; i < active_count; ++i) {
|
||||||
|
std::memcpy(active_state_host[i],
|
||||||
|
h_comm + (size_t)i * all,
|
||||||
|
bytes);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int i = 0; i < active_count; ++i) {
|
||||||
|
std::memcpy(h_comm + (size_t)i * all,
|
||||||
|
active_state_host[i],
|
||||||
|
bytes);
|
||||||
|
}
|
||||||
|
CUDA_CHECK(cudaMemcpy(d_comm, h_comm,
|
||||||
|
total_doubles * sizeof(double),
|
||||||
|
cudaMemcpyHostToDevice));
|
||||||
|
kern_unpack_state_subset<<<grid(total_doubles), BLK>>>(
|
||||||
|
ctx.d_state_curr_mem, d_comm, active_count, (int)all);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static bool has_resident_state(void *block_tag)
|
static bool has_resident_state(void *block_tag)
|
||||||
{
|
{
|
||||||
auto it = g_step_ctx.find(block_tag);
|
auto it = g_step_ctx.find(block_tag);
|
||||||
@@ -4186,6 +4422,66 @@ int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *host_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz)
|
||||||
|
{
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
|
||||||
|
i0, j0, k0, sx, sy, sz,
|
||||||
|
cudaMemcpyDeviceToHost);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *host_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz)
|
||||||
|
{
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
|
||||||
|
i0, j0, k0, sx, sy, sz,
|
||||||
|
cudaMemcpyHostToDevice);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
int bssn_cuda_download_state_subset(void *block_tag,
|
||||||
|
int *ex,
|
||||||
|
int subset_count,
|
||||||
|
const int *state_indices,
|
||||||
|
double **state_host_out)
|
||||||
|
{
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
copy_state_subset(block_tag, ex, subset_count, state_indices, state_host_out,
|
||||||
|
cudaMemcpyDeviceToHost);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
int bssn_cuda_upload_state_subset(void *block_tag,
|
||||||
|
int *ex,
|
||||||
|
int subset_count,
|
||||||
|
const int *state_indices,
|
||||||
|
double **state_host_in)
|
||||||
|
{
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
copy_state_subset(block_tag, ex, subset_count, state_indices, state_host_in,
|
||||||
|
cudaMemcpyHostToDevice);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
int bssn_cuda_has_resident_state(void *block_tag)
|
int bssn_cuda_has_resident_state(void *block_tag)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -86,6 +86,32 @@ int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
|
|||||||
int i0, int j0, int k0,
|
int i0, int j0, int k0,
|
||||||
int sx, int sy, int sz);
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *host_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *host_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int bssn_cuda_download_state_subset(void *block_tag,
|
||||||
|
int *ex,
|
||||||
|
int subset_count,
|
||||||
|
const int *state_indices,
|
||||||
|
double **state_host_out);
|
||||||
|
|
||||||
|
int bssn_cuda_upload_state_subset(void *block_tag,
|
||||||
|
int *ex,
|
||||||
|
int subset_count,
|
||||||
|
const int *state_indices,
|
||||||
|
double **state_host_in);
|
||||||
|
|
||||||
int bssn_cuda_has_resident_state(void *block_tag);
|
int bssn_cuda_has_resident_state(void *block_tag);
|
||||||
|
|
||||||
void bssn_cuda_release_step_ctx(void *block_tag);
|
void bssn_cuda_release_step_ctx(void *block_tag);
|
||||||
|
|||||||
Reference in New Issue
Block a user