Batch GA/BH subset sync with indexed GPU pack/unpack buffers

This commit is contained in:
2026-04-13 20:27:30 +08:00
parent c5d1268dd1
commit e952ee8e91
4 changed files with 495 additions and 34 deletions

View File

@@ -53,6 +53,12 @@ using namespace std;
#if USE_CUDA_BSSN
namespace {
// State indices handed to bssn_cuda_sync_subset by bssn_cuda_sync_bh_fields,
// alongside the three host arrays built from forx / fory / forz.
// NOTE(review): the numeric values presumably mirror the CUDA-side state
// layout -- confirm against the kernel's state ordering.
static const int k_bssn_cuda_bh_state_indices[3] = {18, 19, 20};
// State indices handed to bssn_cuda_sync_subset by bssn_cuda_sync_ga_fields,
// alongside the 12 host arrays gathered from the variable list it receives.
// NOTE(review): same caveat as above -- values are assumed to match the
// CUDA-side state layout.
static const int k_bssn_cuda_ga_state_indices[12] = {
2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13
};
bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
double **host_views,
double *propspeeds = nullptr,
@@ -82,11 +88,48 @@ bool bssn_cuda_use_resident_sync(int lev)
(void)lev;
return false;
#else
return lev == 0;
return true;
#endif
}
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank)
// Transfers a subset of state fields between host arrays and the resident
// CUDA state attached to a block.
//
//   cg            block whose resident state is synchronised
//   subset_count  number of entries in state_indices / host_views
//   state_indices state indices of the fields to transfer
//   host_views    host arrays forwarded with state_indices
//   upload        true  -> bssn_cuda_upload_state_subset
//                 false -> bssn_cuda_download_state_subset
//
// Returns true when there is nothing to do (null block, non-positive count,
// or no resident state for the block) or when the transfer call returns 0;
// false otherwise.
bool bssn_cuda_sync_subset(Block *cg,
                           int subset_count,
                           const int *state_indices,
                           double **host_views,
                           bool upload)
{
  // Short-circuit evaluation keeps bssn_cuda_has_resident_state from being
  // called for a null block or an empty request.
  const bool nothing_requested = (cg == nullptr) || (subset_count <= 0);
  if (nothing_requested || !bssn_cuda_has_resident_state(cg))
    return true;

  if (!upload)
  {
    return bssn_cuda_download_state_subset(cg, cg->shape, subset_count,
                                           state_indices, host_views) == 0;
  }
  return bssn_cuda_upload_state_subset(cg, cg->shape, subset_count,
                                       state_indices, host_views) == 0;
}
// Synchronises the GA field subset of a block with its resident CUDA state.
//
//   cg      block to synchronise; a null block is a successful no-op,
//           matching bssn_cuda_sync_subset
//   vars    variable list whose leading entries name the fields; it must
//           hold at least as many entries as k_bssn_cuda_ga_state_indices
//   upload  true copies host -> resident state, false copies resident -> host
//
// Returns false when the list is too short or contains an entry without
// data; otherwise returns the result of bssn_cuda_sync_subset.
bool bssn_cuda_sync_ga_fields(Block *cg, MyList<var> *vars, bool upload)
{
  // Tie the field count to the index table so the two cannot drift apart.
  const int kGaFieldCount =
      static_cast<int>(sizeof(k_bssn_cuda_ga_state_indices) /
                       sizeof(k_bssn_cuda_ga_state_indices[0]));

  double *ga_fields[kGaFieldCount];
  int found = 0;
  for (MyList<var> *node = vars; node && found < kGaFieldCount; node = node->next)
  {
    if (!node->data)
      return false;
    // Never touch cg->fgfs for a null block; bssn_cuda_sync_subset ignores
    // the host views in that case and reports success.
    ga_fields[found++] = cg ? cg->fgfs[node->data->sgfn] : nullptr;
  }
  if (found != kGaFieldCount)
    return false;

  return bssn_cuda_sync_subset(cg, kGaFieldCount, k_bssn_cuda_ga_state_indices,
                               ga_fields, upload);
}
// Synchronises the three BH fields of a block with its resident CUDA state.
//
//   cg                block to synchronise; a null block is a successful
//                     no-op, matching bssn_cuda_sync_subset
//   forx, fory, forz  variables whose sgfn selects the host arrays in cg->fgfs
//   upload            true copies host -> resident state, false copies
//                     resident -> host
//
// Returns false when any of the three variables is null; otherwise returns
// the result of bssn_cuda_sync_subset.
bool bssn_cuda_sync_bh_fields(Block *cg, var *forx, var *fory, var *forz, bool upload)
{
  // Reject unusable arguments before dereferencing anything.
  if (!forx || !fory || !forz)
    return false;
  if (!cg)
    return true;

  double *bh_fields[3] = {
      cg->fgfs[forx->sgfn], cg->fgfs[fory->sgfn], cg->fgfs[forz->sgfn]};
  return bssn_cuda_sync_subset(cg, 3, k_bssn_cuda_bh_state_indices, bh_fields, upload);
}
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
{
MyList<Patch> *Pp = PatL;
while (Pp)
@@ -108,7 +151,32 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
cout << "CUDA resident state download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
bssn_cuda_release_step_ctx(cg);
if (release_ctx)
bssn_cuda_release_step_ctx(cg);
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
}
void bssn_cuda_sync_level_bh_fields(MyList<Patch> *PatL,
int myrank,
var *forx, var *fory, var *forz)
{
MyList<Patch> *Pp = PatL;
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank && !bssn_cuda_sync_bh_fields(cg, forx, fory, forz, false))
{
cout << "CUDA BH state subset download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (BP == Pp->data->ble)
break;
@@ -3187,6 +3255,24 @@ void bssn_class::Step(int lev, int YN)
bool used_gpu_substep = false;
bool used_gpu_resident_state = false;
#if USE_CUDA_BSSN
if (use_cuda_resident_sync)
{
if (!bssn_cuda_sync_ga_fields(cg, StateList->next->next, false))
{
cout << "CUDA predictor GA subset download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
f_enforce_ga(cg->shape,
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
if (!bssn_cuda_sync_ga_fields(cg, StateList->next->next, true))
{
cout << "CUDA predictor GA subset upload failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
{
double *state_in[BSSN_CUDA_STATE_COUNT];
double *state_out[BSSN_CUDA_STATE_COUNT];
@@ -3206,7 +3292,7 @@ void bssn_class::Step(int lev, int YN)
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
int apply_enforce_ga = 0;
#if (AGM == 0)
apply_enforce_ga = 1;
apply_enforce_ga = use_cuda_resident_sync ? 0 : 1;
#endif
#if (SommerType == 0)
#ifndef WithShell
@@ -3519,6 +3605,17 @@ void bssn_class::Step(int lev, int YN)
#endif
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
#if USE_CUDA_BSSN
// With resident sync enabled, pull back to the host only what the code after
// the predictor needs.
// Full state: required when this is level a_lev and LastAnas + dT_lev reaches
// AnasTime.
const bool need_analysis_state_after_predictor =
(lev == a_lev) && (LastAnas + dT_lev >= AnasTime);
// BH fields only: required when BH_num > 0 on the last level index
// (GH->levels - 1).
const bool need_bh_state_after_predictor =
(BH_num > 0) && (lev == GH->levels - 1);
// The full download is requested with release_ctx == false, so the step
// context is not released here. It takes precedence over the BH subset
// download.
// NOTE(review): presumably the full download already covers the BH fields --
// confirm that SynchList_pre contains Sfx/Sfy/Sfz.
if (use_cuda_resident_sync && need_analysis_state_after_predictor)
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
else if (use_cuda_resident_sync && need_bh_state_after_predictor)
bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx, Sfy, Sfz);
#endif
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
@@ -3609,6 +3706,24 @@ void bssn_class::Step(int lev, int YN)
bool used_gpu_substep = false;
bool used_gpu_resident_state = false;
#if USE_CUDA_BSSN
if (use_cuda_resident_sync)
{
if (!bssn_cuda_sync_ga_fields(cg, SynchList_pre->next->next, false))
{
cout << "CUDA corrector GA subset download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
f_enforce_ga(cg->shape,
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
if (!bssn_cuda_sync_ga_fields(cg, SynchList_pre->next->next, true))
{
cout << "CUDA corrector GA subset upload failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
{
double *state_in[BSSN_CUDA_STATE_COUNT];
double *state_out[BSSN_CUDA_STATE_COUNT];
@@ -3628,9 +3743,9 @@ void bssn_class::Step(int lev, int YN)
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
int apply_enforce_ga = 0;
#if (AGM == 0)
apply_enforce_ga = 1;
apply_enforce_ga = use_cuda_resident_sync ? 0 : 1;
#elif (AGM == 1)
apply_enforce_ga = (iter_count == 3) ? 1 : 0;
apply_enforce_ga = (iter_count == 3 && !use_cuda_resident_sync) ? 1 : 0;
#endif
#if (SommerType == 0)
#ifndef WithShell
@@ -3993,6 +4108,11 @@ void bssn_class::Step(int lev, int YN)
}
#endif
#if USE_CUDA_BSSN
// Under resident sync, download only the BH fields Sfx1/Sfy1/Sfz1 to the host
// for iterations before the third, when BH_num > 0 on the last level index
// (GH->levels - 1).
// NOTE(review): iter_count == 3 is presumably covered by the full-state
// download at the end of the step -- confirm.
if (use_cuda_resident_sync && BH_num > 0 && lev == GH->levels - 1 && iter_count < 3)
bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx1, Sfy1, Sfz1);
#endif
// swap time level
if (iter_count < 3)
{
@@ -4046,7 +4166,7 @@ void bssn_class::Step(int lev, int YN)
}
#if USE_CUDA_BSSN
if (use_cuda_resident_sync)
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank);
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
#endif
#if (RPS == 0)
// mesh refinement boundary part