Optimize BSSN CUDA state transfers
This commit is contained in:
@@ -6,6 +6,7 @@
|
||||
#include "parameters.h"
|
||||
#include <cstdlib>
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
|
||||
#ifndef USE_CUDA_Z4C
|
||||
#define USE_CUDA_Z4C 0
|
||||
@@ -391,6 +392,113 @@ bool cuda_device_state_count_supported(int state_count)
|
||||
#endif
|
||||
}
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
/*
 * Execute one accumulated batch of segment descriptors for a single block.
 *
 * block       - block whose state is packed from / unpacked into; its
 *               `shape` member is forwarded to the BSSN CUDA routine.
 * data        - buffer handed unchanged to the pack/unpack routine.
 * state_count - number of state variables, forwarded unchanged.
 * meta        - flat descriptor array, eight ints per segment.
 * dir         - PACK selects the pack routine; any other value selects
 *               the unpack routine.
 *
 * Returns true when there is nothing to do (null block or empty meta) or
 * when the selected routine reports 0; false otherwise.
 */
bool cuda_flush_device_segment_batch(Block *block,
                                     double *data,
                                     int state_count,
                                     const std::vector<int> &meta,
                                     int dir)
{
  // An empty batch is trivially successful.
  if (meta.empty() || !block)
    return true;

  // Eight ints describe one segment; a trailing partial record is ignored
  // by the integer division.
  const int n_segments = (int)(meta.size() / 8);

  if (dir != PACK)
  {
    return bssn_cuda_unpack_state_segments_from_device_buffer(
               block, state_count, data, block->shape, n_segments, meta.data()) == 0;
  }

  return bssn_cuda_pack_state_segments_to_device_buffer(
             block, state_count, data, block->shape, n_segments, meta.data()) == 0;
}
|
||||
|
||||
int cuda_data_packer_device_batched(double *data,
|
||||
MyList<Parallel::gridseg> *src,
|
||||
MyList<Parallel::gridseg> *dst,
|
||||
int rank_in,
|
||||
int dir,
|
||||
MyList<var> *VarLists,
|
||||
MyList<var> *VarListd,
|
||||
int Symmetry)
|
||||
{
|
||||
(void)Symmetry;
|
||||
if (!data || (dir != PACK && dir != UNPACK) || !src || !dst)
|
||||
return -1;
|
||||
|
||||
int myrank;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||
|
||||
const int state_count = cuda_state_var_count(VarLists, VarListd);
|
||||
if (!cuda_device_state_count_supported(state_count))
|
||||
return -1;
|
||||
|
||||
int size_out = 0;
|
||||
Block *batch_block = 0;
|
||||
std::vector<int> batch_meta;
|
||||
batch_meta.reserve(64);
|
||||
|
||||
while (src && dst)
|
||||
{
|
||||
const bool active =
|
||||
(dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
|
||||
(dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank);
|
||||
if (active)
|
||||
{
|
||||
int type;
|
||||
if (src->data->Bg->lev == dst->data->Bg->lev)
|
||||
type = 1;
|
||||
else if (src->data->Bg->lev > dst->data->Bg->lev)
|
||||
type = 2;
|
||||
else
|
||||
type = 3;
|
||||
if (type != 1)
|
||||
return -1;
|
||||
|
||||
Block *block = (dir == PACK) ? src->data->Bg : dst->data->Bg;
|
||||
if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type)) ||
|
||||
(dir == UNPACK && !cuda_can_direct_unpack(dst->data, type)))
|
||||
return -1;
|
||||
|
||||
if (batch_block && batch_block != block)
|
||||
{
|
||||
if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir))
|
||||
return -1;
|
||||
batch_meta.clear();
|
||||
}
|
||||
batch_block = block;
|
||||
|
||||
const int i0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 0)
|
||||
: cuda_seg_begin(dst->data, block, 0);
|
||||
const int j0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 1)
|
||||
: cuda_seg_begin(dst->data, block, 1);
|
||||
const int k0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 2)
|
||||
: cuda_seg_begin(dst->data, block, 2);
|
||||
const int sx = dst->data->shape[0];
|
||||
const int sy = dst->data->shape[1];
|
||||
const int sz = dst->data->shape[2];
|
||||
const int region_all = sx * sy * sz;
|
||||
|
||||
batch_meta.push_back(i0);
|
||||
batch_meta.push_back(j0);
|
||||
batch_meta.push_back(k0);
|
||||
batch_meta.push_back(sx);
|
||||
batch_meta.push_back(sy);
|
||||
batch_meta.push_back(sz);
|
||||
batch_meta.push_back(region_all);
|
||||
batch_meta.push_back(size_out);
|
||||
|
||||
size_out += state_count * region_all;
|
||||
}
|
||||
src = src->next;
|
||||
dst = dst->next;
|
||||
}
|
||||
|
||||
if (batch_block)
|
||||
{
|
||||
if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir))
|
||||
return -1;
|
||||
}
|
||||
return size_out;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool cuda_segments_same_level(MyList<Parallel::gridseg> *src,
|
||||
MyList<Parallel::gridseg> *dst,
|
||||
int rank_in,
|
||||
@@ -465,6 +573,23 @@ int data_packer_with_device_buffer(double *data,
|
||||
MyList<var> *VarListd,
|
||||
int Symmetry)
|
||||
{
|
||||
#if USE_CUDA_BSSN
|
||||
const double batched_t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||
const int batched = cuda_data_packer_device_batched(data, src, dst, rank_in, dir,
|
||||
VarLists, VarListd, Symmetry);
|
||||
if (batched >= 0)
|
||||
{
|
||||
if (sync_profile_enabled())
|
||||
{
|
||||
const double dt = MPI_Wtime() - batched_t0;
|
||||
if (dir == PACK)
|
||||
sync_profile_stats().direct_pack_sec += dt;
|
||||
else if (dir == UNPACK)
|
||||
sync_profile_stats().direct_unpack_sec += dt;
|
||||
}
|
||||
return batched;
|
||||
}
|
||||
#endif
|
||||
s_cuda_aware_pack_active = true;
|
||||
int n = Parallel::data_packer(data, src, dst, rank_in, dir, VarLists, VarListd, Symmetry);
|
||||
s_cuda_aware_pack_active = false;
|
||||
|
||||
Reference in New Issue
Block a user