Checkpoint Z4C CUDA optimization progress
This commit is contained in:
@@ -193,6 +193,28 @@ bool cuda_build_bssn_host_views(Block *block,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||||
|
bool cuda_build_z4c_host_views(Block *block,
|
||||||
|
MyList<var> *vars,
|
||||||
|
int state_count,
|
||||||
|
double **views)
|
||||||
|
{
|
||||||
|
if (!block || !vars || !views || state_count != Z4C_CUDA_STATE_COUNT)
|
||||||
|
return false;
|
||||||
|
MyList<var> *v = vars;
|
||||||
|
for (int i = 0; i < Z4C_CUDA_STATE_COUNT; ++i)
|
||||||
|
{
|
||||||
|
if (!v)
|
||||||
|
return false;
|
||||||
|
views[i] = block->fgfs[v->data->sgfn];
|
||||||
|
if (!views[i])
|
||||||
|
return false;
|
||||||
|
v = v->next;
|
||||||
|
}
|
||||||
|
return v == 0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
bool cuda_build_state_soa(MyList<var> *vars,
|
bool cuda_build_state_soa(MyList<var> *vars,
|
||||||
int state_count,
|
int state_count,
|
||||||
double *soa_flat)
|
double *soa_flat)
|
||||||
@@ -220,6 +242,8 @@ int fortran_idint(double x)
|
|||||||
|
|
||||||
bool cuda_amr_restrict_device_enabled();
|
bool cuda_amr_restrict_device_enabled();
|
||||||
bool cuda_amr_prolong_device_enabled();
|
bool cuda_amr_prolong_device_enabled();
|
||||||
|
bool cuda_z4c_amr_prolong_device_enabled();
|
||||||
|
bool cuda_z4c_amr_unpack_device_enabled();
|
||||||
bool cuda_amr_restrict_compare_enabled();
|
bool cuda_amr_restrict_compare_enabled();
|
||||||
bool cuda_amr_restrict_batch_enabled();
|
bool cuda_amr_restrict_batch_enabled();
|
||||||
bool cuda_device_segment_batch_enabled();
|
bool cuda_device_segment_batch_enabled();
|
||||||
@@ -324,7 +348,8 @@ bool cuda_state_count_direct_supported(int state_count)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type)
|
bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst,
|
||||||
|
int type, MyList<var> *VarLists = 0)
|
||||||
{
|
{
|
||||||
if (!src || !dst || !src->Bg)
|
if (!src || !dst || !src->Bg)
|
||||||
return false;
|
return false;
|
||||||
@@ -342,7 +367,7 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg
|
|||||||
}
|
}
|
||||||
if (type == 2 && !cuda_amr_restrict_device_enabled())
|
if (type == 2 && !cuda_amr_restrict_device_enabled())
|
||||||
return false;
|
return false;
|
||||||
if (type == 3 && !cuda_amr_prolong_device_enabled())
|
if (type == 3 && (!cuda_amr_prolong_device_enabled() || !cuda_z4c_amr_prolong_device_enabled()))
|
||||||
return false;
|
return false;
|
||||||
if (type == 2) {
|
if (type == 2) {
|
||||||
int a[3];
|
int a[3];
|
||||||
@@ -354,7 +379,17 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg
|
|||||||
if (!cuda_cell_gw3_prolong_params(src, dst, a, b))
|
if (!cuda_cell_gw3_prolong_params(src, dst, a, b))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return z4c_cuda_has_resident_state(src->Bg) != 0;
|
if (z4c_cuda_has_resident_state(src->Bg) == 0)
|
||||||
|
return false;
|
||||||
|
if (type != 1 && VarLists)
|
||||||
|
{
|
||||||
|
double *view_ptrs[Z4C_CUDA_STATE_COUNT];
|
||||||
|
if (!cuda_build_z4c_host_views(src->Bg, VarLists, Z4C_CUDA_STATE_COUNT, view_ptrs))
|
||||||
|
return false;
|
||||||
|
if (z4c_cuda_resident_state_matches(src->Bg, view_ptrs) == 0)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
#elif USE_CUDA_BSSN
|
#elif USE_CUDA_BSSN
|
||||||
if (bssn_cuda_has_resident_state(src->Bg) == 0)
|
if (bssn_cuda_has_resident_state(src->Bg) == 0)
|
||||||
return false;
|
return false;
|
||||||
@@ -380,12 +415,24 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type)
|
bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type, MyList<var> *VarListd = 0)
|
||||||
{
|
{
|
||||||
if (type < 1 || type > 3 || !dst || !dst->Bg)
|
if (type < 1 || type > 3 || !dst || !dst->Bg)
|
||||||
return false;
|
return false;
|
||||||
#if USE_CUDA_Z4C && (ABEtype == 2)
|
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||||
return z4c_cuda_has_resident_state(dst->Bg) != 0;
|
if (type != 1 && !cuda_z4c_amr_unpack_device_enabled())
|
||||||
|
return false;
|
||||||
|
if (z4c_cuda_has_resident_state(dst->Bg) == 0)
|
||||||
|
return false;
|
||||||
|
if (type != 1 && VarListd)
|
||||||
|
{
|
||||||
|
double *view_ptrs[Z4C_CUDA_STATE_COUNT];
|
||||||
|
if (!cuda_build_z4c_host_views(dst->Bg, VarListd, Z4C_CUDA_STATE_COUNT, view_ptrs))
|
||||||
|
return false;
|
||||||
|
if (z4c_cuda_resident_state_matches(dst->Bg, view_ptrs) == 0)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
#elif USE_CUDA_BSSN
|
#elif USE_CUDA_BSSN
|
||||||
return bssn_cuda_has_resident_state(dst->Bg) != 0;
|
return bssn_cuda_has_resident_state(dst->Bg) != 0;
|
||||||
#else
|
#else
|
||||||
@@ -507,6 +554,28 @@ bool cuda_amr_prolong_device_enabled()
|
|||||||
return enabled != 0;
|
return enabled != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool cuda_z4c_amr_prolong_device_enabled()
|
||||||
|
{
|
||||||
|
static int enabled = -1;
|
||||||
|
if (enabled < 0)
|
||||||
|
{
|
||||||
|
const char *env = getenv("AMSS_CUDA_Z4C_AMR_PROLONG_DEVICE");
|
||||||
|
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||||
|
}
|
||||||
|
return enabled != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool cuda_z4c_amr_unpack_device_enabled()
|
||||||
|
{
|
||||||
|
static int enabled = -1;
|
||||||
|
if (enabled < 0)
|
||||||
|
{
|
||||||
|
const char *env = getenv("AMSS_CUDA_Z4C_AMR_UNPACK_DEVICE");
|
||||||
|
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||||
|
}
|
||||||
|
return enabled != 0;
|
||||||
|
}
|
||||||
|
|
||||||
bool cuda_amr_restrict_compare_enabled()
|
bool cuda_amr_restrict_compare_enabled()
|
||||||
{
|
{
|
||||||
static int enabled = -1;
|
static int enabled = -1;
|
||||||
@@ -1109,8 +1178,8 @@ int cuda_data_packer_device_batched(double *data,
|
|||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
Block *block = (dir == PACK) ? src->data->Bg : dst->data->Bg;
|
Block *block = (dir == PACK) ? src->data->Bg : dst->data->Bg;
|
||||||
if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type)) ||
|
if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type, VarLists)) ||
|
||||||
(dir == UNPACK && !cuda_can_direct_unpack(dst->data, type)))
|
(dir == UNPACK && !cuda_can_direct_unpack(dst->data, type, VarListd)))
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (batch_block && (batch_block != block || batch_type != type))
|
if (batch_block && (batch_block != block || batch_type != type))
|
||||||
@@ -1195,7 +1264,9 @@ bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
|
|||||||
int rank_in,
|
int rank_in,
|
||||||
int dir,
|
int dir,
|
||||||
int myrank,
|
int myrank,
|
||||||
int state_count)
|
int state_count,
|
||||||
|
MyList<var> *VarLists,
|
||||||
|
MyList<var> *VarListd)
|
||||||
{
|
{
|
||||||
bool has_work = false;
|
bool has_work = false;
|
||||||
while (src && dst)
|
while (src && dst)
|
||||||
@@ -1215,12 +1286,12 @@ bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
|
|||||||
type = 3;
|
type = 3;
|
||||||
if (dir == PACK)
|
if (dir == PACK)
|
||||||
{
|
{
|
||||||
if (!cuda_can_direct_pack(src->data, dst->data, type))
|
if (!cuda_can_direct_pack(src->data, dst->data, type, VarLists))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (!cuda_can_direct_unpack(dst->data, type))
|
if (!cuda_can_direct_unpack(dst->data, type, VarListd))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1318,11 +1389,13 @@ bool cuda_pack_to_device_eligible(MyList<Parallel::gridseg> *src,
|
|||||||
MyList<Parallel::gridseg> *dst,
|
MyList<Parallel::gridseg> *dst,
|
||||||
int rank_in,
|
int rank_in,
|
||||||
int state_count,
|
int state_count,
|
||||||
int myrank)
|
int myrank,
|
||||||
|
MyList<var> *VarLists,
|
||||||
|
MyList<var> *VarListd)
|
||||||
{
|
{
|
||||||
if (!cuda_aware_mpi_enabled() || !cuda_device_state_count_supported(state_count))
|
if (!cuda_aware_mpi_enabled() || !cuda_device_state_count_supported(state_count))
|
||||||
return false;
|
return false;
|
||||||
if (!cuda_segments_device_eligible(src, dst, rank_in, PACK, myrank, state_count))
|
if (!cuda_segments_device_eligible(src, dst, rank_in, PACK, myrank, state_count, VarLists, VarListd))
|
||||||
return false;
|
return false;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -1331,11 +1404,13 @@ bool cuda_recv_to_device_eligible(MyList<Parallel::gridseg> *src,
|
|||||||
MyList<Parallel::gridseg> *dst,
|
MyList<Parallel::gridseg> *dst,
|
||||||
int rank_in,
|
int rank_in,
|
||||||
int state_count,
|
int state_count,
|
||||||
int myrank)
|
int myrank,
|
||||||
|
MyList<var> *VarLists,
|
||||||
|
MyList<var> *VarListd)
|
||||||
{
|
{
|
||||||
if (!cuda_aware_mpi_enabled() || !cuda_device_state_count_supported(state_count))
|
if (!cuda_aware_mpi_enabled() || !cuda_device_state_count_supported(state_count))
|
||||||
return false;
|
return false;
|
||||||
if (!cuda_segments_device_eligible(src, dst, rank_in, UNPACK, myrank, state_count))
|
if (!cuda_segments_device_eligible(src, dst, rank_in, UNPACK, myrank, state_count, VarLists, VarListd))
|
||||||
return false;
|
return false;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -5133,7 +5208,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
|||||||
bool handled_by_cuda = false;
|
bool handled_by_cuda = false;
|
||||||
if (dir == PACK && (type == 1 || s_cuda_aware_pack_active) &&
|
if (dir == PACK && (type == 1 || s_cuda_aware_pack_active) &&
|
||||||
cuda_state_count_direct_supported(state_count) &&
|
cuda_state_count_direct_supported(state_count) &&
|
||||||
cuda_can_direct_pack(src->data, dst->data, type))
|
cuda_can_direct_pack(src->data, dst->data, type, VarLists))
|
||||||
{
|
{
|
||||||
if (s_cuda_aware_pack_active) {
|
if (s_cuda_aware_pack_active) {
|
||||||
handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry);
|
handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry);
|
||||||
@@ -5148,7 +5223,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
|||||||
}
|
}
|
||||||
else if (dir == UNPACK && (type == 1 || s_cuda_aware_pack_active) &&
|
else if (dir == UNPACK && (type == 1 || s_cuda_aware_pack_active) &&
|
||||||
cuda_state_count_direct_supported(state_count) &&
|
cuda_state_count_direct_supported(state_count) &&
|
||||||
cuda_can_direct_unpack(dst->data, type))
|
cuda_can_direct_unpack(dst->data, type, VarListd))
|
||||||
{
|
{
|
||||||
if (s_cuda_aware_pack_active) {
|
if (s_cuda_aware_pack_active) {
|
||||||
handled_by_cuda = cuda_direct_unpack_segment_from_device(data + size_out, dst->data, state_count, VarListd);
|
handled_by_cuda = cuda_direct_unpack_segment_from_device(data + size_out, dst->data, state_count, VarListd);
|
||||||
@@ -5229,6 +5304,24 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
|||||||
if (cuda_state_count_direct_supported(state_count) &&
|
if (cuda_state_count_direct_supported(state_count) &&
|
||||||
dst->data && dst->data->Bg)
|
dst->data && dst->data->Bg)
|
||||||
{
|
{
|
||||||
|
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||||
|
const char *z4c_amr_env = getenv("AMSS_CUDA_Z4C_AMR_DEVICE");
|
||||||
|
if (state_count == Z4C_CUDA_STATE_COUNT && type != 1 &&
|
||||||
|
z4c_amr_env && atoi(z4c_amr_env) != 0)
|
||||||
|
{
|
||||||
|
double *views[Z4C_CUDA_STATE_COUNT];
|
||||||
|
if (cuda_build_z4c_host_views(dst->data->Bg, VarListd, state_count, views) &&
|
||||||
|
z4c_cuda_resident_state_matches(dst->data->Bg, views) != 0)
|
||||||
|
{
|
||||||
|
if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data))
|
||||||
|
{
|
||||||
|
cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data))
|
if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data))
|
||||||
{
|
{
|
||||||
cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl;
|
cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl;
|
||||||
@@ -5906,8 +5999,8 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
|||||||
{
|
{
|
||||||
for (int n = 0; n < cpusize; n++)
|
for (int n = 0; n < cpusize; n++)
|
||||||
{
|
{
|
||||||
cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank) ? 1 : 0;
|
cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank, VarList1, VarList2) ? 1 : 0;
|
||||||
cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank) ? 1 : 0;
|
cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank, VarList1, VarList2) ? 1 : 0;
|
||||||
}
|
}
|
||||||
cache.recv_buf_is_dev[myrank] = (cache.send_buf_is_dev[myrank] && cache.recv_buf_is_dev[myrank]) ? 1 : 0;
|
cache.recv_buf_is_dev[myrank] = (cache.send_buf_is_dev[myrank] && cache.recv_buf_is_dev[myrank]) ? 1 : 0;
|
||||||
for (int n = 0; n < cpusize; n++)
|
for (int n = 0; n < cpusize; n++)
|
||||||
@@ -6225,8 +6318,8 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
|||||||
{
|
{
|
||||||
for (int n = 0; n < cpusize; n++)
|
for (int n = 0; n < cpusize; n++)
|
||||||
{
|
{
|
||||||
cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank) ? 1 : 0;
|
cache.send_buf_is_dev[n] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], n, state_count, myrank, VarList, VarList) ? 1 : 0;
|
||||||
cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank) ? 1 : 0;
|
cache.recv_buf_is_dev[n] = cuda_recv_to_device_eligible(src[n], dst[n], n, state_count, myrank, VarList, VarList) ? 1 : 0;
|
||||||
}
|
}
|
||||||
cache.recv_buf_is_dev[myrank] = (cache.send_buf_is_dev[myrank] && cache.recv_buf_is_dev[myrank]) ? 1 : 0;
|
cache.recv_buf_is_dev[myrank] = (cache.send_buf_is_dev[myrank] && cache.recv_buf_is_dev[myrank]) ? 1 : 0;
|
||||||
for (int n = 0; n < cpusize; n++)
|
for (int n = 0; n < cpusize; n++)
|
||||||
|
|||||||
@@ -679,7 +679,11 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
|
{
|
||||||
|
Parallel::AsyncSyncState async_pre;
|
||||||
|
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
|
||||||
|
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
|
||||||
|
}
|
||||||
|
|
||||||
if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime))
|
if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime))
|
||||||
z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
|
z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
|
||||||
@@ -778,7 +782,11 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
|
{
|
||||||
|
Parallel::AsyncSyncState async_cor;
|
||||||
|
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
|
||||||
|
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
|
||||||
|
}
|
||||||
|
|
||||||
if (BH_num > 0 && lev == GH->levels - 1)
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -462,25 +462,28 @@ struct StepContext {
|
|||||||
std::array<double *, BSSN_STATE_COUNT> d_state_curr;
|
std::array<double *, BSSN_STATE_COUNT> d_state_curr;
|
||||||
std::array<double *, BSSN_STATE_COUNT> d_state_next;
|
std::array<double *, BSSN_STATE_COUNT> d_state_next;
|
||||||
std::array<double *, BSSN_MATTER_COUNT> d_matter;
|
std::array<double *, BSSN_MATTER_COUNT> d_matter;
|
||||||
|
std::array<double *, BSSN_STATE_COUNT> resident_host;
|
||||||
size_t cap_all;
|
size_t cap_all;
|
||||||
size_t cap_comm;
|
size_t cap_comm;
|
||||||
bool h_comm_pinned;
|
bool h_comm_pinned;
|
||||||
size_t cap_h_comm;
|
size_t cap_h_comm;
|
||||||
bool matter_ready;
|
bool matter_ready;
|
||||||
bool state_ready;
|
bool state_ready;
|
||||||
|
bool resident_host_valid;
|
||||||
|
|
||||||
StepContext()
|
StepContext()
|
||||||
: d_state0_mem(nullptr), d_accum_mem(nullptr),
|
: d_state0_mem(nullptr), d_accum_mem(nullptr),
|
||||||
d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
|
d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
|
||||||
d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr),
|
d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr),
|
||||||
cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0),
|
cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0),
|
||||||
matter_ready(false), state_ready(false)
|
matter_ready(false), state_ready(false), resident_host_valid(false)
|
||||||
{
|
{
|
||||||
d_state0.fill(nullptr);
|
d_state0.fill(nullptr);
|
||||||
d_accum.fill(nullptr);
|
d_accum.fill(nullptr);
|
||||||
d_state_curr.fill(nullptr);
|
d_state_curr.fill(nullptr);
|
||||||
d_state_next.fill(nullptr);
|
d_state_next.fill(nullptr);
|
||||||
d_matter.fill(nullptr);
|
d_matter.fill(nullptr);
|
||||||
|
resident_host.fill(nullptr);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -544,6 +547,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
|
|||||||
ctx.d_state_curr.fill(nullptr);
|
ctx.d_state_curr.fill(nullptr);
|
||||||
ctx.d_state_next.fill(nullptr);
|
ctx.d_state_next.fill(nullptr);
|
||||||
ctx.d_matter.fill(nullptr);
|
ctx.d_matter.fill(nullptr);
|
||||||
|
ctx.resident_host.fill(nullptr);
|
||||||
|
ctx.resident_host_valid = false;
|
||||||
return alloc;
|
return alloc;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -562,6 +567,8 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc
|
|||||||
ctx.cap_h_comm = alloc.cap_h_comm;
|
ctx.cap_h_comm = alloc.cap_h_comm;
|
||||||
ctx.matter_ready = false;
|
ctx.matter_ready = false;
|
||||||
ctx.state_ready = false;
|
ctx.state_ready = false;
|
||||||
|
ctx.resident_host.fill(nullptr);
|
||||||
|
ctx.resident_host_valid = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void recycle_step_allocation(StepAllocation &alloc)
|
static void recycle_step_allocation(StepAllocation &alloc)
|
||||||
@@ -5794,6 +5801,37 @@ static bool has_resident_state(void *block_tag)
|
|||||||
return it != g_step_ctx.end() && it->second.state_ready;
|
return it != g_step_ctx.end() && it->second.state_ready;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool resident_key_usable(double **host_key)
|
||||||
|
{
|
||||||
|
if (!host_key) return false;
|
||||||
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
|
if (!host_key[i]) return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool resident_key_matches(const StepContext &ctx, double **host_key)
|
||||||
|
{
|
||||||
|
if (!ctx.state_ready || !ctx.resident_host_valid || !resident_key_usable(host_key))
|
||||||
|
return false;
|
||||||
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
|
if (ctx.resident_host[i] != host_key[i]) return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void set_resident_key(StepContext &ctx, double **host_key)
|
||||||
|
{
|
||||||
|
if (!resident_key_usable(host_key)) {
|
||||||
|
ctx.resident_host.fill(nullptr);
|
||||||
|
ctx.resident_host_valid = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i)
|
||||||
|
ctx.resident_host[i] = host_key[i];
|
||||||
|
ctx.resident_host_valid = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define pow2(x) ((x) * (x))
|
#define pow2(x) ((x) * (x))
|
||||||
@@ -7786,10 +7824,13 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag,
|
|||||||
bind_state_input_slots(ctx.d_state_curr);
|
bind_state_input_slots(ctx.d_state_curr);
|
||||||
bind_state_output_slots(ctx.d_state_next);
|
bind_state_output_slots(ctx.d_state_next);
|
||||||
}
|
}
|
||||||
|
|
||||||
double t0 = profile ? cuda_profile_now_ms() : 0.0;
|
double t0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||||
if (!use_resident_state || !ctx.state_ready) {
|
if (!use_resident_state || !ctx.state_ready) {
|
||||||
upload_state_inputs(state_host_in, all);
|
upload_state_inputs(state_host_in, all);
|
||||||
|
if (use_resident_state) {
|
||||||
|
ctx.state_ready = true;
|
||||||
|
set_resident_key(ctx, state_host_in);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (apply_enforce_ga) {
|
if (apply_enforce_ga) {
|
||||||
kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
|
kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
|
||||||
@@ -7849,6 +7890,7 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag,
|
|||||||
std::swap(ctx.d_state_curr_mem, ctx.d_state_next_mem);
|
std::swap(ctx.d_state_curr_mem, ctx.d_state_next_mem);
|
||||||
ctx.d_state_curr.swap(ctx.d_state_next);
|
ctx.d_state_curr.swap(ctx.d_state_next);
|
||||||
ctx.state_ready = true;
|
ctx.state_ready = true;
|
||||||
|
set_resident_key(ctx, state_host_out);
|
||||||
} else {
|
} else {
|
||||||
download_state_outputs(state_host_out, all);
|
download_state_outputs(state_host_out, all);
|
||||||
}
|
}
|
||||||
@@ -8154,6 +8196,17 @@ extern "C" int z4c_cuda_has_resident_state(void *block_tag)
|
|||||||
return has_resident_state(block_tag) ? 1 : 0;
|
return has_resident_state(block_tag) ? 1 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern "C" int z4c_cuda_resident_state_matches(void *block_tag,
|
||||||
|
double **state_host_key)
|
||||||
|
{
|
||||||
|
using namespace z4c_cuda;
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
auto it = g_step_ctx.find(block_tag);
|
||||||
|
if (it == g_step_ctx.end()) return 0;
|
||||||
|
return resident_key_matches(it->second, state_host_key) ? 1 : 0;
|
||||||
|
}
|
||||||
|
|
||||||
extern "C" void z4c_cuda_release_step_ctx(void *block_tag)
|
extern "C" void z4c_cuda_release_step_ctx(void *block_tag)
|
||||||
{
|
{
|
||||||
using namespace z4c_cuda;
|
using namespace z4c_cuda;
|
||||||
|
|||||||
@@ -142,6 +142,8 @@ int z4c_cuda_download_constraint_outputs(int *ex,
|
|||||||
double **constraint_host_out);
|
double **constraint_host_out);
|
||||||
|
|
||||||
int z4c_cuda_has_resident_state(void *block_tag);
|
int z4c_cuda_has_resident_state(void *block_tag);
|
||||||
|
int z4c_cuda_resident_state_matches(void *block_tag,
|
||||||
|
double **state_host_key);
|
||||||
|
|
||||||
void z4c_cuda_release_step_ctx(void *block_tag);
|
void z4c_cuda_release_step_ctx(void *block_tag);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user