Batch GA/BH subset sync with indexed GPU pack/unpack buffers
This commit is contained in:
@@ -53,6 +53,12 @@ using namespace std;
|
||||
#if USE_CUDA_BSSN
|
||||
namespace {
|
||||
|
||||
// State indices handed to bssn_cuda_sync_subset by bssn_cuda_sync_bh_fields,
// one per host buffer in (forx, fory, forz) order.
// NOTE(review): the values must agree with the device-side state layout, which
// is not visible in this file section -- confirm before changing.
static const int k_bssn_cuda_bh_state_indices[3] = {18, 19, 20};
|
||||
// State indices handed to bssn_cuda_sync_subset by bssn_cuda_sync_ga_fields,
// one per variable taken (in order) from the list passed to that function.
// NOTE(review): callers in Step pass the state list advanced by two nodes and
// run f_enforce_ga on gxx..gzz / Axx..Azz between download and upload, so these
// presumably select those twelve fields -- confirm against the state layout.
static const int k_bssn_cuda_ga_state_indices[12] = {
|
||||
2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13
|
||||
};
|
||||
|
||||
bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
|
||||
double **host_views,
|
||||
double *propspeeds = nullptr,
|
||||
@@ -82,11 +88,48 @@ bool bssn_cuda_use_resident_sync(int lev)
|
||||
(void)lev;
|
||||
return false;
|
||||
#else
|
||||
return lev == 0;
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank)
|
||||
// Transfers a subset of state variables for one block through the
// bssn_cuda_upload_state_subset / bssn_cuda_download_state_subset helpers.
//
//   cg            block whose state is synced; its shape is forwarded as-is
//   subset_count  number of entries in state_indices and host_views
//   state_indices state slots to transfer
//   host_views    host buffers, one per entry of state_indices
//   upload        true selects the upload helper, false the download helper
//
// Returns true when there is nothing to do (null block, non-positive count, or
// the block has no resident state according to bssn_cuda_has_resident_state);
// otherwise returns true only if the selected helper returns 0.
bool bssn_cuda_sync_subset(Block *cg,
                           int subset_count,
                           const int *state_indices,
                           double **host_views,
                           bool upload)
{
  // Short-circuit order matters: the resident-state query is only made for a
  // non-null block with a positive subset size.
  const bool nothing_to_sync =
      (cg == nullptr) || (subset_count < 1) || !bssn_cuda_has_resident_state(cg);
  if (nothing_to_sync)
  {
    return true;
  }

  return (upload
              ? bssn_cuda_upload_state_subset(cg, cg->shape, subset_count, state_indices, host_views)
              : bssn_cuda_download_state_subset(cg, cg->shape, subset_count, state_indices, host_views)) == 0;
}
|
||||
|
||||
// Collects the host buffers of the leading entries of `vars` (as many as
// k_bssn_cuda_ga_state_indices holds) and syncs them with the resident state
// via bssn_cuda_sync_subset.
//
//   cg      block owning the host buffers (cg->fgfs indexed by each var's sgfn)
//   vars    variable list; only the first entries up to the table size are used
//   upload  forwarded to bssn_cuda_sync_subset to select the transfer direction
//
// Returns false when the host buffers cannot be resolved: `cg` is null, the
// list is shorter than the index table, or a list node carries no variable.
// Otherwise returns the result of bssn_cuda_sync_subset.
bool bssn_cuda_sync_ga_fields(Block *cg, MyList<var> *vars, bool upload)
{
  // Derive the count from the index table so the buffer array, the loop bound
  // and the forwarded count cannot drift apart.
  const int ga_count = static_cast<int>(sizeof(k_bssn_cuda_ga_state_indices) /
                                        sizeof(k_bssn_cuda_ga_state_indices[0]));

  // cg->fgfs is dereferenced below, so a missing block must be rejected here;
  // the null check inside bssn_cuda_sync_subset would come too late.
  if (!cg)
    return false;

  double *ga_fields[ga_count];
  int filled = 0;
  for (MyList<var> *node = vars; node && filled < ga_count; node = node->next)
  {
    if (!node->data)
      return false; // list node without a variable: no buffer to resolve
    ga_fields[filled++] = cg->fgfs[node->data->sgfn];
  }

  // A list shorter than the index table cannot supply every buffer.
  if (filled != ga_count)
    return false;

  return bssn_cuda_sync_subset(cg, ga_count, k_bssn_cuda_ga_state_indices, ga_fields, upload);
}
|
||||
|
||||
// Syncs the three host buffers selected by forx / fory / forz with the resident
// state slots listed in k_bssn_cuda_bh_state_indices, via bssn_cuda_sync_subset.
//
//   cg                block owning the host buffers (cg->fgfs indexed by sgfn)
//   forx, fory, forz  variables whose buffers are transferred, in table order
//   upload            forwarded to bssn_cuda_sync_subset to select the direction
//
// Returns false when any pointer argument is null (the host buffers cannot be
// resolved); otherwise returns the result of bssn_cuda_sync_subset.
bool bssn_cuda_sync_bh_fields(Block *cg, var *forx, var *fory, var *forz, bool upload)
{
  // Every pointer is dereferenced while building the buffer table, so validate
  // them up front instead of relying on the callee's later null check.
  if (!cg || !forx || !fory || !forz)
    return false;

  double *bh_fields[3] = {
      cg->fgfs[forx->sgfn],
      cg->fgfs[fory->sgfn],
      cg->fgfs[forz->sgfn]};

  return bssn_cuda_sync_subset(cg, 3, k_bssn_cuda_bh_state_indices, bh_fields, upload);
}
|
||||
|
||||
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
|
||||
{
|
||||
MyList<Patch> *Pp = PatL;
|
||||
while (Pp)
|
||||
@@ -108,7 +151,32 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
|
||||
cout << "CUDA resident state download failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
bssn_cuda_release_step_ctx(cg);
|
||||
if (release_ctx)
|
||||
bssn_cuda_release_step_ctx(cg);
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
}
|
||||
|
||||
void bssn_cuda_sync_level_bh_fields(MyList<Patch> *PatL,
|
||||
int myrank,
|
||||
var *forx, var *fory, var *forz)
|
||||
{
|
||||
MyList<Patch> *Pp = PatL;
|
||||
while (Pp)
|
||||
{
|
||||
MyList<Block> *BP = Pp->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank && !bssn_cuda_sync_bh_fields(cg, forx, fory, forz, false))
|
||||
{
|
||||
cout << "CUDA BH state subset download failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
@@ -3187,6 +3255,24 @@ void bssn_class::Step(int lev, int YN)
|
||||
bool used_gpu_substep = false;
|
||||
bool used_gpu_resident_state = false;
|
||||
#if USE_CUDA_BSSN
|
||||
if (use_cuda_resident_sync)
|
||||
{
|
||||
if (!bssn_cuda_sync_ga_fields(cg, StateList->next->next, false))
|
||||
{
|
||||
cout << "CUDA predictor GA subset download failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
f_enforce_ga(cg->shape,
|
||||
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
||||
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
|
||||
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
|
||||
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
|
||||
if (!bssn_cuda_sync_ga_fields(cg, StateList->next->next, true))
|
||||
{
|
||||
cout << "CUDA predictor GA subset upload failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
{
|
||||
double *state_in[BSSN_CUDA_STATE_COUNT];
|
||||
double *state_out[BSSN_CUDA_STATE_COUNT];
|
||||
@@ -3206,7 +3292,7 @@ void bssn_class::Step(int lev, int YN)
|
||||
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
||||
int apply_enforce_ga = 0;
|
||||
#if (AGM == 0)
|
||||
apply_enforce_ga = 1;
|
||||
apply_enforce_ga = use_cuda_resident_sync ? 0 : 1;
|
||||
#endif
|
||||
#if (SommerType == 0)
|
||||
#ifndef WithShell
|
||||
@@ -3519,6 +3605,17 @@ void bssn_class::Step(int lev, int YN)
|
||||
#endif
|
||||
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
const bool need_analysis_state_after_predictor =
|
||||
(lev == a_lev) && (LastAnas + dT_lev >= AnasTime);
|
||||
const bool need_bh_state_after_predictor =
|
||||
(BH_num > 0) && (lev == GH->levels - 1);
|
||||
if (use_cuda_resident_sync && need_analysis_state_after_predictor)
|
||||
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
|
||||
else if (use_cuda_resident_sync && need_bh_state_after_predictor)
|
||||
bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx, Sfy, Sfz);
|
||||
#endif
|
||||
|
||||
#ifdef WithShell
|
||||
// Complete non-blocking error reduction and check
|
||||
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
|
||||
@@ -3609,6 +3706,24 @@ void bssn_class::Step(int lev, int YN)
|
||||
bool used_gpu_substep = false;
|
||||
bool used_gpu_resident_state = false;
|
||||
#if USE_CUDA_BSSN
|
||||
if (use_cuda_resident_sync)
|
||||
{
|
||||
if (!bssn_cuda_sync_ga_fields(cg, SynchList_pre->next->next, false))
|
||||
{
|
||||
cout << "CUDA corrector GA subset download failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
f_enforce_ga(cg->shape,
|
||||
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
||||
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
|
||||
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
|
||||
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
|
||||
if (!bssn_cuda_sync_ga_fields(cg, SynchList_pre->next->next, true))
|
||||
{
|
||||
cout << "CUDA corrector GA subset upload failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
{
|
||||
double *state_in[BSSN_CUDA_STATE_COUNT];
|
||||
double *state_out[BSSN_CUDA_STATE_COUNT];
|
||||
@@ -3628,9 +3743,9 @@ void bssn_class::Step(int lev, int YN)
|
||||
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
||||
int apply_enforce_ga = 0;
|
||||
#if (AGM == 0)
|
||||
apply_enforce_ga = 1;
|
||||
apply_enforce_ga = use_cuda_resident_sync ? 0 : 1;
|
||||
#elif (AGM == 1)
|
||||
apply_enforce_ga = (iter_count == 3) ? 1 : 0;
|
||||
apply_enforce_ga = (iter_count == 3 && !use_cuda_resident_sync) ? 1 : 0;
|
||||
#endif
|
||||
#if (SommerType == 0)
|
||||
#ifndef WithShell
|
||||
@@ -3993,6 +4108,11 @@ void bssn_class::Step(int lev, int YN)
|
||||
}
|
||||
#endif
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
if (use_cuda_resident_sync && BH_num > 0 && lev == GH->levels - 1 && iter_count < 3)
|
||||
bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx1, Sfy1, Sfz1);
|
||||
#endif
|
||||
|
||||
// swap time level
|
||||
if (iter_count < 3)
|
||||
{
|
||||
@@ -4046,7 +4166,7 @@ void bssn_class::Step(int lev, int YN)
|
||||
}
|
||||
#if USE_CUDA_BSSN
|
||||
if (use_cuda_resident_sync)
|
||||
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank);
|
||||
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
|
||||
#endif
|
||||
#if (RPS == 0)
|
||||
// mesh refinement boundary part
|
||||
|
||||
Reference in New Issue
Block a user