Optimize BSSN CUDA resident AMR prolong path
This commit is contained in:
@@ -190,6 +190,25 @@ bool cuda_build_bssn_host_views(Block *block,
|
||||
}
|
||||
return v == 0;
|
||||
}
|
||||
|
||||
// Flatten the three SoA entries of every variable in `vars` into `soa_flat`.
//
// The output layout is variable-major: soa_flat[3 * i + c] receives
// SoA[c] (c = 0, 1, 2) of the i-th list node.
//
// Parameters:
//   vars        - head of the variable list; must hold exactly
//                 BSSN_CUDA_STATE_COUNT nodes.
//   state_count - must equal BSSN_CUDA_STATE_COUNT, otherwise the call fails.
//   soa_flat    - output array with room for 3 * BSSN_CUDA_STATE_COUNT doubles.
//
// Returns true only when the list length matches exactly and every node
// carries a payload. On failure `soa_flat` may have been partially written
// and must not be used.
bool cuda_build_bssn_soa(MyList<var> *vars,
                         int state_count,
                         double *soa_flat)
{
    if (!vars || !soa_flat || state_count != BSSN_CUDA_STATE_COUNT)
        return false;

    MyList<var> *v = vars;
    for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i)
    {
        // A list that is too short, or a node without a payload, cannot be
        // flattened; report failure instead of dereferencing a null pointer.
        if (!v || !v->data)
            return false;

        double *out = soa_flat + 3 * i;
        out[0] = v->data->SoA[0];
        out[1] = v->data->SoA[1];
        out[2] = v->data->SoA[2];

        v = v->next;
    }

    // Leftover nodes mean the list is longer than the expected state count.
    return v == 0;
}
|
||||
#endif
|
||||
|
||||
#if USE_CUDA_BSSN || USE_CUDA_Z4C
|
||||
@@ -198,6 +217,9 @@ int fortran_idint(double x)
|
||||
return (int)x;
|
||||
}
|
||||
|
||||
bool cuda_amr_restrict_device_enabled();
|
||||
bool cuda_amr_prolong_device_enabled();
|
||||
|
||||
bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src,
|
||||
const Parallel::gridseg *dst,
|
||||
int first_fine[3])
|
||||
@@ -226,7 +248,7 @@ bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src,
|
||||
const int lbc = fortran_idint((llbc - base) / CD + 0.4) + 1;
|
||||
const int lbf = fortran_idint((llbf - base) / FD + 0.4) + 1;
|
||||
first_fine[d] = 2 * lbc - lbf - 1;
|
||||
if (first_fine[d] - 2 < 0)
|
||||
if (first_fine[d] < 0)
|
||||
return false;
|
||||
if (first_fine[d] + 2 * (dst->shape[d] - 1) + 3 >= src->Bg->shape[d])
|
||||
return false;
|
||||
@@ -271,7 +293,7 @@ bool cuda_cell_gw3_prolong_params(const Parallel::gridseg *src,
|
||||
const int first_coarse = first_fine_ii[d] / 2 - coarse_lb[d];
|
||||
const int last_fine_ii = first_fine_ii[d] + dst->shape[d] - 1;
|
||||
const int last_coarse = last_fine_ii / 2 - coarse_lb[d];
|
||||
if (first_coarse - 2 < 0)
|
||||
if (first_coarse < -1)
|
||||
return false;
|
||||
if (last_coarse + 3 >= src->Bg->shape[d])
|
||||
return false;
|
||||
@@ -306,13 +328,21 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg
|
||||
#elif USE_CUDA_BSSN
|
||||
if (bssn_cuda_has_resident_state(src->Bg) == 0)
|
||||
return false;
|
||||
if (type == 1)
|
||||
return true;
|
||||
int a[3], b[3];
|
||||
if (type == 2)
|
||||
return cuda_cell_gw3_restrict_params(src, dst, a);
|
||||
if (type == 3)
|
||||
return cuda_cell_gw3_prolong_params(src, dst, a, b);
|
||||
if (type == 1)
|
||||
return true;
|
||||
int a[3], b[3];
|
||||
if (type == 2)
|
||||
{
|
||||
if (!cuda_amr_restrict_device_enabled())
|
||||
return false;
|
||||
return cuda_cell_gw3_restrict_params(src, dst, a);
|
||||
}
|
||||
if (type == 3)
|
||||
{
|
||||
if (!cuda_amr_prolong_device_enabled())
|
||||
return false;
|
||||
return cuda_cell_gw3_prolong_params(src, dst, a, b);
|
||||
}
|
||||
return false;
|
||||
#else
|
||||
(void)type;
|
||||
@@ -427,6 +457,28 @@ bool cuda_aware_mpi_enabled()
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
// Reports whether the AMSS_CUDA_AMR_RESTRICT_DEVICE environment variable
// enables the feature it gates. The variable is read on the first call only
// and the answer is cached for later calls.
//
// Opt-in: returns true only when the variable is set and atoi() of its value
// is non-zero; unset or zero yields false.
bool cuda_amr_restrict_device_enabled()
{
    // -1 = environment not consulted yet, 0 = disabled, 1 = enabled.
    static int cached = -1;
    if (cached >= 0)
        return cached == 1;

    const char *value = getenv("AMSS_CUDA_AMR_RESTRICT_DEVICE");
    cached = 0;
    if (value && atoi(value) != 0)
        cached = 1;
    return cached == 1;
}
|
||||
|
||||
// Reports whether the AMSS_CUDA_AMR_PROLONG_DEVICE environment variable
// leaves the feature it gates enabled. The variable is read on the first call
// only and the answer is cached for later calls.
//
// Opt-out: returns true when the variable is unset, and false only when it is
// set to a value for which atoi() yields zero.
bool cuda_amr_prolong_device_enabled()
{
    // -1 = environment not consulted yet, 0 = disabled, 1 = enabled.
    static int cached = -1;
    if (cached < 0)
    {
        const char *value = getenv("AMSS_CUDA_AMR_PROLONG_DEVICE");
        const bool disabled = value && atoi(value) == 0;
        cached = disabled ? 0 : 1;
    }
    return cached > 0;
}
|
||||
|
||||
bool cuda_mpi_diag_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
@@ -438,6 +490,17 @@ bool cuda_mpi_diag_enabled()
|
||||
return enabled != 0 || sync_profile_enabled();
|
||||
}
|
||||
|
||||
// Returns the limit taken from the AMSS_CUDA_MPI_DIAG_LIMIT environment
// variable. The variable is read on the first call only and the result is
// cached for later calls.
//
// A value for which atoi() is positive is used as-is; an unset variable or a
// zero/negative value falls back to 10.
int cuda_mpi_diag_limit()
{
    // Negative means "not resolved yet"; every resolved limit is positive.
    static int cached_limit = -1;
    if (cached_limit >= 0)
        return cached_limit;

    const char *text = getenv("AMSS_CUDA_MPI_DIAG_LIMIT");
    const int parsed = text ? atoi(text) : 0;
    cached_limit = (parsed > 0) ? parsed : 10;
    return cached_limit;
}
|
||||
|
||||
double *alloc_device_comm_buffer(int length)
|
||||
{
|
||||
if (length <= 0)
|
||||
@@ -486,9 +549,11 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
|
||||
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
|
||||
return false;
|
||||
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||
bool ok = false;
|
||||
double *views[BSSN_CUDA_STATE_COUNT];
|
||||
const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
|
||||
bool ok = false;
|
||||
double *views[BSSN_CUDA_STATE_COUNT];
|
||||
double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
|
||||
const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
|
||||
const bool have_soa = cuda_build_bssn_soa(VarLists, state_count, soa_flat);
|
||||
if (type == 1)
|
||||
{
|
||||
const int i0 = cuda_seg_begin(dst, src->Bg, 0);
|
||||
@@ -509,14 +574,15 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
|
||||
int first_fine[3];
|
||||
if (!cuda_cell_gw3_restrict_params(src, dst, first_fine))
|
||||
return false;
|
||||
ok = have_views
|
||||
? bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(
|
||||
src->Bg, views, state_count, buffer, src->Bg->shape,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||
first_fine[0], first_fine[1], first_fine[2]) == 0
|
||||
: bssn_cuda_restrict_state_batch_to_device_buffer(
|
||||
src->Bg, state_count, buffer, src->Bg->shape,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||
ok = have_views
|
||||
? bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(
|
||||
src->Bg, views, state_count, buffer, src->Bg->shape,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||
first_fine[0], first_fine[1], first_fine[2],
|
||||
have_soa ? soa_flat : 0) == 0
|
||||
: bssn_cuda_restrict_state_batch_to_device_buffer(
|
||||
src->Bg, state_count, buffer, src->Bg->shape,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||
first_fine[0], first_fine[1], first_fine[2]) == 0;
|
||||
}
|
||||
else if (type == 3)
|
||||
@@ -524,13 +590,14 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
|
||||
int first_fine_ii[3], coarse_lb[3];
|
||||
if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb))
|
||||
return false;
|
||||
ok = have_views
|
||||
? bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(
|
||||
src->Bg, views, state_count, buffer, src->Bg->shape,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
|
||||
coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0
|
||||
: bssn_cuda_prolong_state_batch_to_device_buffer(
|
||||
ok = have_views
|
||||
? bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(
|
||||
src->Bg, views, state_count, buffer, src->Bg->shape,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
|
||||
coarse_lb[0], coarse_lb[1], coarse_lb[2],
|
||||
have_soa ? soa_flat : 0) == 0
|
||||
: bssn_cuda_prolong_state_batch_to_device_buffer(
|
||||
src->Bg, state_count, buffer, src->Bg->shape,
|
||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
|
||||
@@ -643,19 +710,39 @@ bool cuda_flush_device_segment_batch(Block *block,
|
||||
int state_count,
|
||||
const std::vector<int> &meta,
|
||||
int dir,
|
||||
int type,
|
||||
MyList<var> *vars)
|
||||
{
|
||||
if (!block || meta.empty())
|
||||
return true;
|
||||
const int segment_count = (int)(meta.size() / 8);
|
||||
double *views[BSSN_CUDA_STATE_COUNT];
|
||||
const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views);
|
||||
if (dir == PACK)
|
||||
const int stride = (dir == PACK && type == 3) ? 11 : 8;
|
||||
const int segment_count = (int)(meta.size() / stride);
|
||||
double *views[BSSN_CUDA_STATE_COUNT];
|
||||
double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
|
||||
const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views);
|
||||
const bool have_soa = cuda_build_bssn_soa(vars, state_count, soa_flat);
|
||||
if (dir == PACK)
|
||||
{
|
||||
if (type == 2)
|
||||
return have_views
|
||||
? bssn_cuda_restrict_state_segments_to_device_buffer_for_host_views(
|
||||
block, views, state_count, data, block->shape, segment_count,
|
||||
meta.data(), have_soa ? soa_flat : 0) == 0
|
||||
: bssn_cuda_restrict_state_segments_to_device_buffer(
|
||||
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
|
||||
if (type == 3)
|
||||
return have_views
|
||||
? bssn_cuda_prolong_state_segments_to_device_buffer_for_host_views(
|
||||
block, views, state_count, data, block->shape, segment_count,
|
||||
meta.data(), have_soa ? soa_flat : 0) == 0
|
||||
: bssn_cuda_prolong_state_segments_to_device_buffer(
|
||||
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
|
||||
return have_views
|
||||
? bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(
|
||||
block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
|
||||
: bssn_cuda_pack_state_segments_to_device_buffer(
|
||||
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
|
||||
}
|
||||
return have_views
|
||||
? bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(
|
||||
block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
|
||||
@@ -685,6 +772,7 @@ int cuda_data_packer_device_batched(double *data,
|
||||
|
||||
int size_out = 0;
|
||||
Block *batch_block = 0;
|
||||
int batch_type = 0;
|
||||
std::vector<int> batch_meta;
|
||||
batch_meta.reserve(64);
|
||||
|
||||
@@ -702,42 +790,72 @@ int cuda_data_packer_device_batched(double *data,
|
||||
type = 2;
|
||||
else
|
||||
type = 3;
|
||||
if (type != 1)
|
||||
return -1;
|
||||
|
||||
Block *block = (dir == PACK) ? src->data->Bg : dst->data->Bg;
|
||||
if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type)) ||
|
||||
(dir == UNPACK && !cuda_can_direct_unpack(dst->data, type)))
|
||||
return -1;
|
||||
|
||||
if (batch_block && batch_block != block)
|
||||
if (batch_block && (batch_block != block || batch_type != type))
|
||||
{
|
||||
MyList<var> *batch_vars = (dir == PACK) ? VarLists : VarListd;
|
||||
if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_vars))
|
||||
if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_type, batch_vars))
|
||||
return -1;
|
||||
batch_meta.clear();
|
||||
}
|
||||
batch_block = block;
|
||||
batch_type = type;
|
||||
|
||||
const int i0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 0)
|
||||
: cuda_seg_begin(dst->data, block, 0);
|
||||
const int j0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 1)
|
||||
: cuda_seg_begin(dst->data, block, 1);
|
||||
const int k0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 2)
|
||||
: cuda_seg_begin(dst->data, block, 2);
|
||||
const int sx = dst->data->shape[0];
|
||||
const int sy = dst->data->shape[1];
|
||||
const int sz = dst->data->shape[2];
|
||||
const int region_all = sx * sy * sz;
|
||||
|
||||
batch_meta.push_back(i0);
|
||||
batch_meta.push_back(j0);
|
||||
batch_meta.push_back(k0);
|
||||
batch_meta.push_back(sx);
|
||||
batch_meta.push_back(sy);
|
||||
batch_meta.push_back(sz);
|
||||
batch_meta.push_back(region_all);
|
||||
batch_meta.push_back(size_out);
|
||||
if (dir == UNPACK || type == 1)
|
||||
{
|
||||
const int i0 = cuda_seg_begin(dst->data, block, 0);
|
||||
const int j0 = cuda_seg_begin(dst->data, block, 1);
|
||||
const int k0 = cuda_seg_begin(dst->data, block, 2);
|
||||
batch_meta.push_back(i0);
|
||||
batch_meta.push_back(j0);
|
||||
batch_meta.push_back(k0);
|
||||
batch_meta.push_back(sx);
|
||||
batch_meta.push_back(sy);
|
||||
batch_meta.push_back(sz);
|
||||
batch_meta.push_back(region_all);
|
||||
batch_meta.push_back(size_out);
|
||||
}
|
||||
else if (type == 2)
|
||||
{
|
||||
int first_fine[3];
|
||||
if (!cuda_cell_gw3_restrict_params(src->data, dst->data, first_fine))
|
||||
return -1;
|
||||
batch_meta.push_back(sx);
|
||||
batch_meta.push_back(sy);
|
||||
batch_meta.push_back(sz);
|
||||
batch_meta.push_back(region_all);
|
||||
batch_meta.push_back(size_out);
|
||||
batch_meta.push_back(first_fine[0]);
|
||||
batch_meta.push_back(first_fine[1]);
|
||||
batch_meta.push_back(first_fine[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
int first_fine_ii[3], coarse_lb[3];
|
||||
if (!cuda_cell_gw3_prolong_params(src->data, dst->data, first_fine_ii, coarse_lb))
|
||||
return -1;
|
||||
batch_meta.push_back(sx);
|
||||
batch_meta.push_back(sy);
|
||||
batch_meta.push_back(sz);
|
||||
batch_meta.push_back(region_all);
|
||||
batch_meta.push_back(size_out);
|
||||
batch_meta.push_back(first_fine_ii[0]);
|
||||
batch_meta.push_back(first_fine_ii[1]);
|
||||
batch_meta.push_back(first_fine_ii[2]);
|
||||
batch_meta.push_back(coarse_lb[0]);
|
||||
batch_meta.push_back(coarse_lb[1]);
|
||||
batch_meta.push_back(coarse_lb[2]);
|
||||
}
|
||||
|
||||
size_out += state_count * region_all;
|
||||
}
|
||||
@@ -748,7 +866,7 @@ int cuda_data_packer_device_batched(double *data,
|
||||
if (batch_block)
|
||||
{
|
||||
MyList<var> *batch_vars = (dir == PACK) ? VarLists : VarListd;
|
||||
if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_vars))
|
||||
if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_type, batch_vars))
|
||||
return -1;
|
||||
}
|
||||
return size_out;
|
||||
@@ -796,6 +914,89 @@ bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
|
||||
return has_work;
|
||||
}
|
||||
|
||||
// Diagnostic counters filled by cuda_collect_eligibility_stats().
// Kept as a plain aggregate so that `CudaEligibilityStats s = {};` zeroes
// every counter.
struct CudaEligibilityStats
{
    // Segment pairs matching the requested direction and rank pairing.
    int active;
    // Active pairs by level relation: source level equal to (type1),
    // greater than (type2) or less than (type3) the destination level.
    int type1, type2, type3;
    // Pairs skipped because a segment or its block pointer was null.
    int null_seg;
    // Pairs whose relevant block reported no resident CUDA state.
    int no_resident;
    // Pack-side type 2/3 pairs whose restrict/prolong parameter setup failed.
    int param_fail;
    // Calls rejected because the state count is not supported on the device.
    int unsupported_state;
};
|
||||
|
||||
// Walks the paired `src`/`dst` segment lists and accumulates diagnostic
// counters into `stats`. Only counts are updated; no data is transferred.
//
// Parameters:
//   src, dst    - parallel segment lists, advanced in lock-step until either ends.
//   rank_in     - the peer rank of interest.
//   dir         - PACK or UNPACK; selects which side must belong to `myrank`.
//   myrank      - rank of the calling process.
//   state_count - number of state variables; when unsupported, only
//                 `unsupported_state` is incremented and the lists are not walked.
//   stats       - counters to increment (never reset here).
//
// A pair with a null segment or block pointer is counted in `null_seg` and
// skipped before anything is dereferenced. Because its ranks cannot be read,
// such a pair is not counted in `active`.
void cuda_collect_eligibility_stats(MyList<Parallel::gridseg> *src,
                                    MyList<Parallel::gridseg> *dst,
                                    int rank_in,
                                    int dir,
                                    int myrank,
                                    int state_count,
                                    CudaEligibilityStats &stats)
{
    if (!cuda_device_state_count_supported(state_count))
    {
        stats.unsupported_state++;
        return;
    }

    for (; src && dst; src = src->next, dst = dst->next)
    {
        // Validate the pointers first: the rank test below reads through them.
        if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
        {
            stats.null_seg++;
            continue;
        }

        Block *src_block = src->data->Bg;
        Block *dst_block = dst->data->Bg;

        const bool active =
            (dir == PACK && dst_block->rank == rank_in && src_block->rank == myrank) ||
            (dir == UNPACK && src_block->rank == rank_in && dst_block->rank == myrank);
        if (!active)
            continue;
        stats.active++;

        // Classify by level relation: 1 = same level, 2 = source level is
        // greater than the destination level, 3 = source level is smaller.
        int type;
        if (src_block->lev == dst_block->lev)
            type = 1;
        else if (src_block->lev > dst_block->lev)
            type = 2;
        else
            type = 3;

        if (type == 1)
            stats.type1++;
        else if (type == 2)
            stats.type2++;
        else
            stats.type3++;

#if USE_CUDA_BSSN
        if (dir == PACK)
        {
            // Packing reads from the source block, so that block needs resident state.
            if (bssn_cuda_has_resident_state(src_block) == 0)
                stats.no_resident++;
            else if (type == 2)
            {
                int first_fine[3];
                if (!cuda_cell_gw3_restrict_params(src->data, dst->data, first_fine))
                    stats.param_fail++;
            }
            else if (type == 3)
            {
                int first_fine_ii[3], coarse_lb[3];
                if (!cuda_cell_gw3_prolong_params(src->data, dst->data, first_fine_ii, coarse_lb))
                    stats.param_fail++;
            }
        }
        else
        {
            // Unpacking writes into the destination block.
            if (bssn_cuda_has_resident_state(dst_block) == 0)
                stats.no_resident++;
        }
#else
        (void)type;
#endif
    }
}
|
||||
|
||||
bool cuda_pack_to_device_eligible(MyList<Parallel::gridseg> *src,
|
||||
MyList<Parallel::gridseg> *dst,
|
||||
int rank_in,
|
||||
@@ -5379,19 +5580,33 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
||||
cuda_device_sends += cache.send_buf_is_dev[n] ? 1 : 0;
|
||||
cuda_device_recvs += cache.recv_buf_is_dev[n] ? 1 : 0;
|
||||
}
|
||||
if (cuda_mpi_diag_enabled())
|
||||
{
|
||||
static int diag_reported = 0;
|
||||
int rep = diag_reported;
|
||||
if (myrank == 0 && rep < 10)
|
||||
{
|
||||
if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1))
|
||||
fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] transfer_cached: device_sends=%d "
|
||||
"device_recvs=%d cuda_aware_mpi=%d\n",
|
||||
myrank, cuda_device_sends, cuda_device_recvs,
|
||||
cuda_aware_mpi_enabled() ? 1 : 0);
|
||||
}
|
||||
}
|
||||
if (cuda_mpi_diag_enabled())
|
||||
{
|
||||
static int diag_reported = 0;
|
||||
int rep = diag_reported;
|
||||
if (myrank == 0 && rep < cuda_mpi_diag_limit())
|
||||
{
|
||||
if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1))
|
||||
{
|
||||
CudaEligibilityStats send_stats = {};
|
||||
CudaEligibilityStats recv_stats = {};
|
||||
for (int n = 0; n < cpusize; n++)
|
||||
{
|
||||
cuda_collect_eligibility_stats(src[myrank], dst[myrank], n, PACK, myrank, state_count, send_stats);
|
||||
cuda_collect_eligibility_stats(src[n], dst[n], n, UNPACK, myrank, state_count, recv_stats);
|
||||
}
|
||||
fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] transfer_cached: device_sends=%d "
|
||||
"device_recvs=%d cuda_aware_mpi=%d send_active=%d type=[%d,%d,%d] "
|
||||
"send_nores=%d send_param=%d recv_active=%d recv_type=[%d,%d,%d] recv_nores=%d\n",
|
||||
myrank, cuda_device_sends, cuda_device_recvs,
|
||||
cuda_aware_mpi_enabled() ? 1 : 0,
|
||||
send_stats.active, send_stats.type1, send_stats.type2, send_stats.type3,
|
||||
send_stats.no_resident, send_stats.param_fail,
|
||||
recv_stats.active, recv_stats.type1, recv_stats.type2, recv_stats.type3,
|
||||
recv_stats.no_resident);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -5688,7 +5903,7 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
||||
{
|
||||
static int diag_reported = 0;
|
||||
int rep = diag_reported;
|
||||
if (myrank == 0 && rep < 20)
|
||||
if (myrank == 0 && rep < cuda_mpi_diag_limit())
|
||||
{
|
||||
if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1))
|
||||
fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] Sync_start: device_sends=%d "
|
||||
|
||||
Reference in New Issue
Block a user