Checkpoint Z4C CUDA resident sync progress
This commit is contained in:
@@ -424,14 +424,7 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type, MyList<var>
|
||||
return false;
|
||||
if (z4c_cuda_has_resident_state(dst->Bg) == 0)
|
||||
return false;
|
||||
if (type != 1 && VarListd)
|
||||
{
|
||||
double *view_ptrs[Z4C_CUDA_STATE_COUNT];
|
||||
if (!cuda_build_z4c_host_views(dst->Bg, VarListd, Z4C_CUDA_STATE_COUNT, view_ptrs))
|
||||
return false;
|
||||
if (z4c_cuda_resident_state_matches(dst->Bg, view_ptrs) == 0)
|
||||
return false;
|
||||
}
|
||||
(void)VarListd;
|
||||
return true;
|
||||
#elif USE_CUDA_BSSN
|
||||
return bssn_cuda_has_resident_state(dst->Bg) != 0;
|
||||
@@ -460,9 +453,16 @@ bool cuda_direct_pack_segment(double *buffer,
|
||||
const int j0 = cuda_seg_begin(dst, src->Bg, 1);
|
||||
const int k0 = cuda_seg_begin(dst, src->Bg, 2);
|
||||
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||
const bool ok = z4c_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||
double *views[Z4C_CUDA_STATE_COUNT];
|
||||
const bool have_views = cuda_build_z4c_host_views(src->Bg, VarLists, state_count, views);
|
||||
const bool ok = have_views
|
||||
? z4c_cuda_pack_state_batch_to_host_buffer_for_host_views(
|
||||
src->Bg, views, state_count, buffer, src->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0
|
||||
: z4c_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||
#else
|
||||
double *views[BSSN_CUDA_STATE_COUNT];
|
||||
const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
|
||||
@@ -500,9 +500,16 @@ bool cuda_direct_unpack_segment(double *buffer,
|
||||
const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
|
||||
const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
|
||||
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||
const bool ok = z4c_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||
double *views[Z4C_CUDA_STATE_COUNT];
|
||||
const bool have_views = cuda_build_z4c_host_views(dst->Bg, VarListd, state_count, views);
|
||||
const bool ok = have_views
|
||||
? z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views(
|
||||
dst->Bg, views, state_count, buffer, dst->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0
|
||||
: z4c_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||
#else
|
||||
double *views[BSSN_CUDA_STATE_COUNT];
|
||||
const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
|
||||
@@ -703,40 +710,60 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
|
||||
{
|
||||
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||
bool ok = false;
|
||||
double *views[Z4C_CUDA_STATE_COUNT];
|
||||
double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
|
||||
const bool have_views = cuda_build_z4c_host_views(src->Bg, VarLists, state_count, views);
|
||||
const bool have_soa = cuda_build_state_soa(VarLists, state_count, soa_flat);
|
||||
if (type == 1)
|
||||
{
|
||||
const int i0 = cuda_seg_begin(dst, src->Bg, 0);
|
||||
const int j0 = cuda_seg_begin(dst, src->Bg, 1);
|
||||
const int k0 = cuda_seg_begin(dst, src->Bg, 2);
|
||||
ok = z4c_cuda_pack_state_batch_to_device_buffer(
|
||||
src->Bg, state_count, buffer, src->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||
ok = have_views
|
||||
? z4c_cuda_pack_state_batch_to_device_buffer_for_host_views(
|
||||
src->Bg, views, state_count, buffer, src->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0
|
||||
: z4c_cuda_pack_state_batch_to_device_buffer(
|
||||
src->Bg, state_count, buffer, src->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||
}
|
||||
else if (type == 2)
|
||||
{
|
||||
int first_fine[3];
|
||||
if (!cuda_cell_gw3_restrict_params(src, dst, first_fine))
|
||||
return false;
|
||||
ok = z4c_cuda_restrict_state_batch_to_device_buffer(
|
||||
src->Bg, state_count, buffer, src->Bg->shape,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||
first_fine[0], first_fine[1], first_fine[2],
|
||||
have_soa ? soa_flat : 0) == 0;
|
||||
ok = have_views
|
||||
? z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views(
|
||||
src->Bg, views, state_count, buffer, src->Bg->shape,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||
first_fine[0], first_fine[1], first_fine[2],
|
||||
have_soa ? soa_flat : 0) == 0
|
||||
: z4c_cuda_restrict_state_batch_to_device_buffer(
|
||||
src->Bg, state_count, buffer, src->Bg->shape,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||
first_fine[0], first_fine[1], first_fine[2],
|
||||
have_soa ? soa_flat : 0) == 0;
|
||||
}
|
||||
else if (type == 3)
|
||||
{
|
||||
int first_fine_ii[3], coarse_lb[3];
|
||||
if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb))
|
||||
return false;
|
||||
ok = z4c_cuda_prolong_state_batch_to_device_buffer(
|
||||
src->Bg, state_count, buffer, src->Bg->shape,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
|
||||
coarse_lb[0], coarse_lb[1], coarse_lb[2],
|
||||
have_soa ? soa_flat : 0) == 0;
|
||||
ok = have_views
|
||||
? z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views(
|
||||
src->Bg, views, state_count, buffer, src->Bg->shape,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
|
||||
coarse_lb[0], coarse_lb[1], coarse_lb[2],
|
||||
have_soa ? soa_flat : 0) == 0
|
||||
: z4c_cuda_prolong_state_batch_to_device_buffer(
|
||||
src->Bg, state_count, buffer, src->Bg->shape,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
|
||||
coarse_lb[0], coarse_lb[1], coarse_lb[2],
|
||||
have_soa ? soa_flat : 0) == 0;
|
||||
}
|
||||
if (sync_profile_enabled())
|
||||
sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
|
||||
@@ -919,10 +946,17 @@ bool cuda_direct_unpack_segment_from_device(double *buffer,
|
||||
const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
|
||||
const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
|
||||
const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
|
||||
const bool ok = z4c_cuda_unpack_state_batch_from_device_buffer(
|
||||
dst->Bg, state_count, buffer, dst->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||
double *views[Z4C_CUDA_STATE_COUNT];
|
||||
const bool have_views = cuda_build_z4c_host_views(dst->Bg, VarListd, state_count, views);
|
||||
const bool ok = have_views
|
||||
? z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views(
|
||||
dst->Bg, views, state_count, buffer, dst->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0
|
||||
: z4c_cuda_unpack_state_batch_from_device_buffer(
|
||||
dst->Bg, state_count, buffer, dst->Bg->shape,
|
||||
i0, j0, k0,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||
if (sync_profile_enabled())
|
||||
sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
|
||||
return ok;
|
||||
@@ -1074,23 +1108,39 @@ bool cuda_flush_device_segment_batch(Block *block,
|
||||
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||
if (state_count == Z4C_CUDA_STATE_COUNT)
|
||||
{
|
||||
double *views[Z4C_CUDA_STATE_COUNT];
|
||||
double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
|
||||
const bool have_views = cuda_build_z4c_host_views(block, vars, state_count, views);
|
||||
const bool have_soa = cuda_build_state_soa(vars, state_count, soa_flat);
|
||||
if (dir == PACK)
|
||||
{
|
||||
if (type == 2)
|
||||
return z4c_cuda_restrict_state_segments_to_device_buffer(
|
||||
block, state_count, data, block->shape, segment_count,
|
||||
meta.data(), have_soa ? soa_flat : 0) == 0;
|
||||
return have_views
|
||||
? z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views(
|
||||
block, views, state_count, data, block->shape, segment_count,
|
||||
meta.data(), have_soa ? soa_flat : 0) == 0
|
||||
: z4c_cuda_restrict_state_segments_to_device_buffer(
|
||||
block, state_count, data, block->shape, segment_count,
|
||||
meta.data(), have_soa ? soa_flat : 0) == 0;
|
||||
if (type == 3)
|
||||
return z4c_cuda_prolong_state_segments_to_device_buffer(
|
||||
block, state_count, data, block->shape, segment_count,
|
||||
meta.data(), have_soa ? soa_flat : 0) == 0;
|
||||
return z4c_cuda_pack_state_segments_to_device_buffer(
|
||||
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
|
||||
return have_views
|
||||
? z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views(
|
||||
block, views, state_count, data, block->shape, segment_count,
|
||||
meta.data(), have_soa ? soa_flat : 0) == 0
|
||||
: z4c_cuda_prolong_state_segments_to_device_buffer(
|
||||
block, state_count, data, block->shape, segment_count,
|
||||
meta.data(), have_soa ? soa_flat : 0) == 0;
|
||||
return have_views
|
||||
? z4c_cuda_pack_state_segments_to_device_buffer_for_host_views(
|
||||
block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
|
||||
: z4c_cuda_pack_state_segments_to_device_buffer(
|
||||
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
|
||||
}
|
||||
return z4c_cuda_unpack_state_segments_from_device_buffer(
|
||||
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
|
||||
return have_views
|
||||
? z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views(
|
||||
block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
|
||||
: z4c_cuda_unpack_state_segments_from_device_buffer(
|
||||
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
|
||||
}
|
||||
#endif
|
||||
#if USE_CUDA_BSSN
|
||||
@@ -5294,7 +5344,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
||||
dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
|
||||
dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (dir == UNPACK) // from target data to corresponding grid
|
||||
{
|
||||
f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
|
||||
|
||||
Reference in New Issue
Block a user