Batch GA/BH subset sync with indexed GPU pack/unpack buffers
This commit is contained in:
@@ -145,6 +145,18 @@ int cuda_seg_begin(const Parallel::gridseg *seg, Block *bg, int dir)
|
||||
return (int)floor((seg->llb[dir] - bg->bbox[dir]) / dx + 0.5);
|
||||
}
|
||||
|
||||
// Walk the source and destination variable lists in lockstep and report how
// many paired entries they contain.
//
// Returns the number of node pairs visited when both lists end at the same
// step, or -1 when one list still has nodes after the other has run out.
// Two empty (null) lists yield 0.
int cuda_state_var_count(MyList<var> *src_vars, MyList<var> *dst_vars)
{
  int paired = 0;
  for (;;)
  {
    // Stop as soon as either list is exhausted.
    if (!src_vars || !dst_vars)
      break;
    src_vars = src_vars->next;
    dst_vars = dst_vars->next;
    paired += 1;
  }

  // The lists match only if both cursors reached the end together.
  const bool same_length = !src_vars && !dst_vars;
  return same_length ? paired : -1;
}
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type)
|
||||
{
|
||||
@@ -163,15 +175,17 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type)
|
||||
bool cuda_direct_pack_segment(double *buffer,
|
||||
const Parallel::gridseg *src,
|
||||
const Parallel::gridseg *dst,
|
||||
int state_index)
|
||||
int state_count)
|
||||
{
|
||||
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
|
||||
return false;
|
||||
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||
const int i0 = cuda_seg_begin(dst, src->Bg, 0);
|
||||
const int j0 = cuda_seg_begin(dst, src->Bg, 1);
|
||||
const int k0 = cuda_seg_begin(dst, src->Bg, 2);
|
||||
const bool ok = bssn_cuda_pack_state_region_to_host_buffer(src->Bg, state_index, buffer, src->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||
const bool ok = bssn_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||
if (sync_profile_enabled())
|
||||
sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
|
||||
return ok;
|
||||
@@ -179,15 +193,17 @@ bool cuda_direct_pack_segment(double *buffer,
|
||||
|
||||
bool cuda_direct_unpack_segment(double *buffer,
|
||||
const Parallel::gridseg *dst,
|
||||
int state_index)
|
||||
int state_count)
|
||||
{
|
||||
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
|
||||
return false;
|
||||
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||
const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
|
||||
const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
|
||||
const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
|
||||
const bool ok = bssn_cuda_unpack_state_region_from_host_buffer(dst->Bg, state_index, buffer, dst->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||
const bool ok = bssn_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||
if (sync_profile_enabled())
|
||||
sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
|
||||
return ok;
|
||||
@@ -3921,21 +3937,14 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
||||
if (!src || !dst)
|
||||
return size_out;
|
||||
|
||||
MyList<var> *varls, *varld;
|
||||
|
||||
varls = VarLists;
|
||||
varld = VarListd;
|
||||
while (varls && varld)
|
||||
{
|
||||
varls = varls->next;
|
||||
varld = varld->next;
|
||||
}
|
||||
|
||||
if (varls || varld)
|
||||
{
|
||||
cout << "error in short data packer, var lists does not match." << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
MyList<var> *varls, *varld;
|
||||
|
||||
const int state_count = cuda_state_var_count(VarLists, VarListd);
|
||||
if (state_count < 0)
|
||||
{
|
||||
cout << "error in short data packer, var lists does not match." << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
|
||||
int type; /* 1 copy, 2 restrict, 3 prolong */
|
||||
if (src->data->Bg->lev == dst->data->Bg->lev)
|
||||
@@ -3961,7 +3970,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
||||
bool handled_by_cuda = false;
|
||||
if (dir == PACK && cuda_can_direct_pack(src->data, dst->data, type))
|
||||
{
|
||||
handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_idx);
|
||||
handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count);
|
||||
if (!handled_by_cuda)
|
||||
{
|
||||
cout << "Parallel::data_packer: CUDA direct pack failed." << endl;
|
||||
@@ -3970,7 +3979,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
||||
}
|
||||
else if (dir == UNPACK && cuda_can_direct_unpack(dst->data, type))
|
||||
{
|
||||
handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_idx);
|
||||
handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_count);
|
||||
if (!handled_by_cuda)
|
||||
{
|
||||
cout << "Parallel::data_packer: CUDA direct unpack failed." << endl;
|
||||
@@ -4005,6 +4014,16 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
||||
dst->data->llb, dst->data->uub);
|
||||
#if USE_CUDA_BSSN
|
||||
}
|
||||
else
|
||||
{
|
||||
size_out += (state_count - 1) * dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
|
||||
while (varls->next && varld->next)
|
||||
{
|
||||
varls = varls->next;
|
||||
varld = varld->next;
|
||||
++state_idx;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
|
||||
|
||||
Reference in New Issue
Block a user