Broaden cached CUDA sync paths
This commit is contained in:
@@ -608,6 +608,24 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
|
|||||||
MyList<var> *VarLists,
|
MyList<var> *VarLists,
|
||||||
int Symmetry)
|
int Symmetry)
|
||||||
{
|
{
|
||||||
|
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||||
|
if (state_count == Z4C_CUDA_STATE_COUNT)
|
||||||
|
{
|
||||||
|
if (type != 1)
|
||||||
|
return false;
|
||||||
|
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
|
const int i0 = cuda_seg_begin(dst, src->Bg, 0);
|
||||||
|
const int j0 = cuda_seg_begin(dst, src->Bg, 1);
|
||||||
|
const int k0 = cuda_seg_begin(dst, src->Bg, 2);
|
||||||
|
const bool ok = z4c_cuda_pack_state_batch_to_device_buffer(
|
||||||
|
src->Bg, state_count, buffer, src->Bg->shape,
|
||||||
|
i0, j0, k0,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||||
|
if (sync_profile_enabled())
|
||||||
|
sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
|
||||||
|
return ok;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
|
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
|
||||||
return false;
|
return false;
|
||||||
@@ -777,6 +795,22 @@ bool cuda_direct_unpack_segment_from_device(double *buffer,
|
|||||||
int state_count,
|
int state_count,
|
||||||
MyList<var> *VarListd)
|
MyList<var> *VarListd)
|
||||||
{
|
{
|
||||||
|
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||||
|
if (state_count == Z4C_CUDA_STATE_COUNT)
|
||||||
|
{
|
||||||
|
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
|
const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
|
||||||
|
const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
|
||||||
|
const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
|
||||||
|
const bool ok = z4c_cuda_unpack_state_batch_from_device_buffer(
|
||||||
|
dst->Bg, state_count, buffer, dst->Bg->shape,
|
||||||
|
i0, j0, k0,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||||
|
if (sync_profile_enabled())
|
||||||
|
sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
|
||||||
|
return ok;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
|
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
|
||||||
return false;
|
return false;
|
||||||
@@ -856,6 +890,10 @@ bool cuda_unpack_host_region_to_resident(Block *block,
|
|||||||
|
|
||||||
bool cuda_device_state_count_supported(int state_count)
|
bool cuda_device_state_count_supported(int state_count)
|
||||||
{
|
{
|
||||||
|
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||||
|
if (state_count == Z4C_CUDA_STATE_COUNT)
|
||||||
|
return true;
|
||||||
|
#endif
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT;
|
return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT;
|
||||||
#else
|
#else
|
||||||
@@ -933,6 +971,10 @@ int cuda_data_packer_device_batched(double *data,
|
|||||||
const int state_count = cuda_state_var_count(VarLists, VarListd);
|
const int state_count = cuda_state_var_count(VarLists, VarListd);
|
||||||
if (!cuda_device_state_count_supported(state_count))
|
if (!cuda_device_state_count_supported(state_count))
|
||||||
return -1;
|
return -1;
|
||||||
|
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||||
|
if (state_count == Z4C_CUDA_STATE_COUNT)
|
||||||
|
return -1;
|
||||||
|
#endif
|
||||||
|
|
||||||
int size_out = 0;
|
int size_out = 0;
|
||||||
Block *batch_block = 0;
|
Block *batch_block = 0;
|
||||||
|
|||||||
@@ -537,7 +537,7 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
|
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
|
||||||
|
|
||||||
if (BH_num > 0 && lev == GH->levels - 1)
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
{
|
{
|
||||||
@@ -635,7 +635,7 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
|
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
|
||||||
|
|
||||||
if (BH_num > 0 && lev == GH->levels - 1)
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -1221,7 +1221,7 @@ void bssnEM_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
|
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
|
||||||
|
|
||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
if (lev == 0)
|
if (lev == 0)
|
||||||
@@ -1683,7 +1683,7 @@ void bssnEM_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
|
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
|
||||||
|
|
||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
if (lev == 0)
|
if (lev == 0)
|
||||||
|
|||||||
@@ -993,7 +993,7 @@ void bssnEScalar_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
|
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
|
||||||
|
|
||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
if (lev == 0)
|
if (lev == 0)
|
||||||
@@ -1349,7 +1349,7 @@ void bssnEScalar_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
|
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
|
||||||
|
|
||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
if (lev == 0)
|
if (lev == 0)
|
||||||
|
|||||||
@@ -5224,6 +5224,36 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void copy_state_region_packed_batch_device_cuda(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
const int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz,
|
||||||
|
int pack_not_unpack)
|
||||||
|
{
|
||||||
|
if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
|
||||||
|
if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return;
|
||||||
|
|
||||||
|
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
|
||||||
|
const int region_all = sx * sy * sz;
|
||||||
|
dim3 launch_grid((unsigned int)grid((size_t)region_all),
|
||||||
|
(unsigned int)state_count);
|
||||||
|
|
||||||
|
if (pack_not_unpack) {
|
||||||
|
kern_pack_state_region_batch<<<launch_grid, BLK>>>(
|
||||||
|
ctx.d_state_curr_mem, device_buffer, ex[0], ex[1],
|
||||||
|
i0, j0, k0, sx, sy, sz, region_all, state_count,
|
||||||
|
ex[0] * ex[1] * ex[2]);
|
||||||
|
} else {
|
||||||
|
kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
|
||||||
|
ctx.d_state_curr_mem, device_buffer, ex[0], ex[1],
|
||||||
|
i0, j0, k0, sx, sy, sz, region_all, state_count,
|
||||||
|
ex[0] * ex[1] * ex[2]);
|
||||||
|
ctx.state_ready = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
|
static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
|
||||||
{
|
{
|
||||||
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||||
@@ -7451,6 +7481,36 @@ extern "C" int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern "C" int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz)
|
||||||
|
{
|
||||||
|
using namespace z4c_cuda;
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
|
||||||
|
i0, j0, k0, sx, sy, sz, 1);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz)
|
||||||
|
{
|
||||||
|
using namespace z4c_cuda;
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
|
||||||
|
i0, j0, k0, sx, sy, sz, 0);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
extern "C" int z4c_cuda_download_state_subset(void *block_tag,
|
extern "C" int z4c_cuda_download_state_subset(void *block_tag,
|
||||||
int *ex,
|
int *ex,
|
||||||
int subset_count,
|
int subset_count,
|
||||||
|
|||||||
@@ -60,6 +60,20 @@ int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
|
|||||||
int i0, int j0, int k0,
|
int i0, int j0, int k0,
|
||||||
int sx, int sy, int sz);
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
int z4c_cuda_download_state_subset(void *block_tag,
|
int z4c_cuda_download_state_subset(void *block_tag,
|
||||||
int *ex,
|
int *ex,
|
||||||
int subset_count,
|
int subset_count,
|
||||||
|
|||||||
Reference in New Issue
Block a user