Pack sync send buffers directly from GPU state
This commit is contained in:
@@ -3,6 +3,7 @@
|
|||||||
#include "fmisc.h"
|
#include "fmisc.h"
|
||||||
#include "prolongrestrict.h"
|
#include "prolongrestrict.h"
|
||||||
#include "bssn_cuda_ops.h"
|
#include "bssn_cuda_ops.h"
|
||||||
|
#include "bssn_gpu.h"
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
#include "parameters.h"
|
#include "parameters.h"
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
@@ -43,6 +44,85 @@ struct ParallelTransferContextGuard
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
bool parallel_can_gpu_pack_segments(MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst,
|
||||||
|
int rank_in, MyList<var> *VarLists, MyList<var> *VarListd)
|
||||||
|
{
|
||||||
|
int myrank;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
|
if (!src || !dst)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (src->data->Bg->lev != dst->data->Bg->lev)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
while (src && dst)
|
||||||
|
{
|
||||||
|
if ((dst->data->Bg->rank == rank_in) && (src->data->Bg->rank == myrank))
|
||||||
|
{
|
||||||
|
MyList<var> *varls = VarLists;
|
||||||
|
MyList<var> *varld = VarListd;
|
||||||
|
while (varls && varld)
|
||||||
|
{
|
||||||
|
(void)varld;
|
||||||
|
if (!bssn_gpu_find_device_buffer(src->data->Bg->fgfs[varls->data->sgfn]))
|
||||||
|
return false;
|
||||||
|
varls = varls->next;
|
||||||
|
varld = varld->next;
|
||||||
|
}
|
||||||
|
if (varls || varld)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
src = src->next;
|
||||||
|
dst = dst->next;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool parallel_gpu_pack_segments(double *data,
|
||||||
|
MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst,
|
||||||
|
int rank_in, MyList<var> *VarLists, MyList<var> *VarListd)
|
||||||
|
{
|
||||||
|
if (!data)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
int myrank;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
|
if (!src || !dst)
|
||||||
|
return false;
|
||||||
|
if (src->data->Bg->lev != dst->data->Bg->lev)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
int size_out = 0;
|
||||||
|
while (src && dst)
|
||||||
|
{
|
||||||
|
if ((dst->data->Bg->rank == rank_in) && (src->data->Bg->rank == myrank))
|
||||||
|
{
|
||||||
|
MyList<var> *varls = VarLists;
|
||||||
|
MyList<var> *varld = VarListd;
|
||||||
|
while (varls && varld)
|
||||||
|
{
|
||||||
|
(void)varld;
|
||||||
|
if (bssn_gpu_stage_download_region_to_buffer(src->data->Bg->fgfs[varls->data->sgfn],
|
||||||
|
src->data->Bg->shape,
|
||||||
|
src->data->Bg->bbox,
|
||||||
|
src->data->Bg->bbox + dim,
|
||||||
|
dst->data->shape,
|
||||||
|
dst->data->llb,
|
||||||
|
data + size_out))
|
||||||
|
return false;
|
||||||
|
size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
|
||||||
|
varls = varls->next;
|
||||||
|
varld = varld->next;
|
||||||
|
}
|
||||||
|
if (varls || varld)
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
src = src->next;
|
||||||
|
dst = dst->next;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
void parallel_report_mpi_error(const char *context, int errcode, int req_no)
|
void parallel_report_mpi_error(const char *context, int errcode, int req_no)
|
||||||
{
|
{
|
||||||
char errstr[MPI_MAX_ERROR_STRING];
|
char errstr[MPI_MAX_ERROR_STRING];
|
||||||
@@ -4843,27 +4923,30 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
|||||||
|
|
||||||
for (int node = 0; node < cpusize; node++)
|
for (int node = 0; node < cpusize; node++)
|
||||||
{
|
{
|
||||||
if (node == myrank)
|
if (node == myrank)
|
||||||
{
|
{
|
||||||
int length;
|
int length;
|
||||||
if (!cache.lengths_valid) {
|
if (!cache.lengths_valid) {
|
||||||
length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
|
length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
|
||||||
cache.recv_lengths[node] = length;
|
cache.recv_lengths[node] = length;
|
||||||
} else {
|
} else {
|
||||||
length = cache.recv_lengths[node];
|
length = cache.recv_lengths[node];
|
||||||
}
|
}
|
||||||
if (length > 0)
|
if (length > 0)
|
||||||
{
|
{
|
||||||
if (length > cache.recv_buf_caps[node])
|
if (length > cache.recv_buf_caps[node])
|
||||||
{
|
{
|
||||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
||||||
cache.recv_bufs[node] = new double[length];
|
cache.recv_bufs[node] = new double[length];
|
||||||
cache.recv_buf_caps[node] = length;
|
cache.recv_buf_caps[node] = length;
|
||||||
}
|
}
|
||||||
data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
|
bssn_gpu_prepare_host_buffer(cache.recv_bufs[node], length);
|
||||||
}
|
if (!parallel_can_gpu_pack_segments(src[myrank], dst[myrank], node, VarList, VarList) ||
|
||||||
}
|
!parallel_gpu_pack_segments(cache.recv_bufs[node], src[myrank], dst[myrank], node, VarList, VarList))
|
||||||
else
|
data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
{
|
{
|
||||||
int slength;
|
int slength;
|
||||||
if (!cache.lengths_valid) {
|
if (!cache.lengths_valid) {
|
||||||
@@ -4872,17 +4955,20 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
|||||||
} else {
|
} else {
|
||||||
slength = cache.send_lengths[node];
|
slength = cache.send_lengths[node];
|
||||||
}
|
}
|
||||||
if (slength > 0)
|
if (slength > 0)
|
||||||
{
|
{
|
||||||
if (slength > cache.send_buf_caps[node])
|
if (slength > cache.send_buf_caps[node])
|
||||||
{
|
{
|
||||||
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
|
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
|
||||||
cache.send_bufs[node] = new double[slength];
|
cache.send_bufs[node] = new double[slength];
|
||||||
cache.send_buf_caps[node] = slength;
|
cache.send_buf_caps[node] = slength;
|
||||||
}
|
}
|
||||||
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
|
bssn_gpu_prepare_host_buffer(cache.send_bufs[node], slength);
|
||||||
state.req_node[state.req_no] = node;
|
if (!parallel_can_gpu_pack_segments(src[myrank], dst[myrank], node, VarList, VarList) ||
|
||||||
state.req_is_recv[state.req_no] = 0;
|
!parallel_gpu_pack_segments(cache.send_bufs[node], src[myrank], dst[myrank], node, VarList, VarList))
|
||||||
|
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
|
||||||
|
state.req_node[state.req_no] = node;
|
||||||
|
state.req_is_recv[state.req_no] = 0;
|
||||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, state.mpi_tag, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, state.mpi_tag, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
||||||
}
|
}
|
||||||
int rlength;
|
int rlength;
|
||||||
|
|||||||
@@ -306,6 +306,26 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
|||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
auto can_pack_sync_from_device =
|
||||||
|
[&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
|
||||||
|
if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
|
||||||
|
while (seg)
|
||||||
|
{
|
||||||
|
MyList<var> *var_it = var_list;
|
||||||
|
while (var_it)
|
||||||
|
{
|
||||||
|
if (!bssn_gpu_find_device_buffer(seg->data->Bg->fgfs[var_it->data->sgfn]))
|
||||||
|
return false;
|
||||||
|
var_it = var_it->next;
|
||||||
|
}
|
||||||
|
seg = seg->next;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
MyList<Patch> *Pp = GH->PatL[lev];
|
MyList<Patch> *Pp = GH->PatL[lev];
|
||||||
while (Pp)
|
while (Pp)
|
||||||
{
|
{
|
||||||
@@ -341,7 +361,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
|||||||
Pp = Pp->next;
|
Pp = Pp->next;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!ERROR && sync_cache_pre[lev].valid)
|
if (!ERROR && sync_cache_pre[lev].valid && !can_pack_sync_from_device(SynchList_pre, sync_cache_pre[lev]))
|
||||||
refresh_stage_host_before_sync(SynchList_pre, sync_cache_pre[lev]);
|
refresh_stage_host_before_sync(SynchList_pre, sync_cache_pre[lev]);
|
||||||
|
|
||||||
MPI_Request err_req_pre;
|
MPI_Request err_req_pre;
|
||||||
@@ -433,7 +453,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
|||||||
Pp = Pp->next;
|
Pp = Pp->next;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!ERROR && sync_cache_cor[lev].valid && iter_count < 3)
|
if (!ERROR && sync_cache_cor[lev].valid && iter_count < 3 &&
|
||||||
|
!can_pack_sync_from_device(SynchList_cor, sync_cache_cor[lev]))
|
||||||
refresh_stage_host_before_sync(SynchList_cor, sync_cache_cor[lev]);
|
refresh_stage_host_before_sync(SynchList_cor, sync_cache_cor[lev]);
|
||||||
|
|
||||||
MPI_Request err_req_cor;
|
MPI_Request err_req_cor;
|
||||||
|
|||||||
@@ -1099,6 +1099,74 @@ int bssn_gpu_stage_download_region(double *host_ptr,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int bssn_gpu_stage_download_region_to_buffer(const double *host_src_ptr,
|
||||||
|
const int *full_shape,
|
||||||
|
const double *full_llb,
|
||||||
|
const double *full_uub,
|
||||||
|
const int *region_shape,
|
||||||
|
const double *region_llb,
|
||||||
|
double *host_dst_ptr)
|
||||||
|
{
|
||||||
|
if (!host_src_ptr || !host_dst_ptr || !full_shape || !full_llb || !full_uub || !region_shape || !region_llb)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
const double *device_ptr = bssn_gpu_find_device_buffer(host_src_ptr);
|
||||||
|
if (!device_ptr)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
int start[3] = {0, 0, 0};
|
||||||
|
for (int i = 0; i < 3; ++i)
|
||||||
|
{
|
||||||
|
if (full_shape[i] <= 0 || region_shape[i] <= 0)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
#ifdef Vertex
|
||||||
|
#ifdef Cell
|
||||||
|
#error Both Cell and Vertex are defined
|
||||||
|
#endif
|
||||||
|
const double dx = (full_uub[i] - full_llb[i]) / static_cast<double>(full_shape[i] - 1);
|
||||||
|
start[i] = static_cast<int>((region_llb[i] - full_llb[i]) / dx + 0.4);
|
||||||
|
#else
|
||||||
|
#ifdef Cell
|
||||||
|
const double dx = (full_uub[i] - full_llb[i]) / static_cast<double>(full_shape[i]);
|
||||||
|
start[i] = static_cast<int>((region_llb[i] - full_llb[i]) / dx + 0.4);
|
||||||
|
#else
|
||||||
|
#error Not define Vertex nor Cell
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (start[i] < 0 || start[i] + region_shape[i] > full_shape[i])
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaMemcpy3DParms parms = {};
|
||||||
|
parms.srcPtr = make_cudaPitchedPtr(const_cast<double *>(device_ptr),
|
||||||
|
static_cast<size_t>(full_shape[0]) * sizeof(double),
|
||||||
|
static_cast<size_t>(full_shape[0]),
|
||||||
|
static_cast<size_t>(full_shape[1]));
|
||||||
|
parms.dstPtr = make_cudaPitchedPtr(host_dst_ptr,
|
||||||
|
static_cast<size_t>(region_shape[0]) * sizeof(double),
|
||||||
|
static_cast<size_t>(region_shape[0]),
|
||||||
|
static_cast<size_t>(region_shape[1]));
|
||||||
|
parms.srcPos = make_cudaPos(static_cast<size_t>(start[0]) * sizeof(double),
|
||||||
|
static_cast<size_t>(start[1]),
|
||||||
|
static_cast<size_t>(start[2]));
|
||||||
|
parms.dstPos = make_cudaPos(0, 0, 0);
|
||||||
|
parms.extent = make_cudaExtent(static_cast<size_t>(region_shape[0]) * sizeof(double),
|
||||||
|
static_cast<size_t>(region_shape[1]),
|
||||||
|
static_cast<size_t>(region_shape[2]));
|
||||||
|
parms.kind = cudaMemcpyDeviceToHost;
|
||||||
|
|
||||||
|
cudaError_t err = cudaMemcpy3D(&parms);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
cerr << "cudaMemcpy3D(D2H region->buffer) failed: " << cudaGetErrorString(err) << endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
__global__ void test_const_address(double * testd){
|
__global__ void test_const_address(double * testd){
|
||||||
int _t = blockIdx.x*blockDim.x+threadIdx.x;
|
int _t = blockIdx.x*blockDim.x+threadIdx.x;
|
||||||
if(_t == 0)
|
if(_t == 0)
|
||||||
|
|||||||
@@ -84,6 +84,13 @@ int bssn_gpu_stage_download_region(double *host_ptr,
|
|||||||
const double *full_uub,
|
const double *full_uub,
|
||||||
const int *region_shape,
|
const int *region_shape,
|
||||||
const double *region_llb);
|
const double *region_llb);
|
||||||
|
int bssn_gpu_stage_download_region_to_buffer(const double *host_src_ptr,
|
||||||
|
const int *full_shape,
|
||||||
|
const double *full_llb,
|
||||||
|
const double *full_uub,
|
||||||
|
const int *region_shape,
|
||||||
|
const double *region_llb,
|
||||||
|
double *host_dst_ptr);
|
||||||
|
|
||||||
/** Init GPU side data in GPUMeta. */
|
/** Init GPU side data in GPUMeta. */
|
||||||
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
|
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
|
||||||
|
|||||||
Reference in New Issue
Block a user