Optimize BSSN-EM 8th-order AMR transfers
This commit is contained in:
@@ -1426,6 +1426,231 @@ int cuda_data_packer_device_batched(double *data,
|
|||||||
}
|
}
|
||||||
return size_out;
|
return size_out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool cuda_ensure_bssn_block_resident_for_pack(Block *block,
|
||||||
|
MyList<var> *vars,
|
||||||
|
int state_count,
|
||||||
|
std::vector<Block *> &uploaded)
|
||||||
|
{
|
||||||
|
if (!block)
|
||||||
|
return false;
|
||||||
|
if (bssn_cuda_has_resident_state(block) != 0)
|
||||||
|
return true;
|
||||||
|
for (size_t i = 0; i < uploaded.size(); ++i)
|
||||||
|
{
|
||||||
|
if (uploaded[i] == block)
|
||||||
|
return bssn_cuda_has_resident_state(block) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
|
||||||
|
if (!cuda_build_bssn_host_views(block, vars, state_count, views))
|
||||||
|
return false;
|
||||||
|
if (bssn_cuda_upload_resident_state_count(block, block->shape, views, state_count) != 0)
|
||||||
|
return false;
|
||||||
|
uploaded.push_back(block);
|
||||||
|
return bssn_cuda_has_resident_state(block) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void cuda_host_batch_diag(const char *reason, int state_count, int type)
|
||||||
|
{
|
||||||
|
static int reported = 0;
|
||||||
|
const char *env = getenv("AMSS_CUDA_HOST_BATCH_DIAG");
|
||||||
|
if (!env || atoi(env) == 0 || reported >= 32)
|
||||||
|
return;
|
||||||
|
int rank = 0;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||||
|
fprintf(stderr,
|
||||||
|
"[AMSS-CUDA-HOST-BATCH][rank %d] disabled reason=%s state_count=%d type=%d\n",
|
||||||
|
rank, reason ? reason : "unknown", state_count, type);
|
||||||
|
fflush(stderr);
|
||||||
|
reported++;
|
||||||
|
}
|
||||||
|
|
||||||
|
int cuda_transfer_active_length_if_batched_eligible(MyList<Parallel::gridseg> *src,
|
||||||
|
MyList<Parallel::gridseg> *dst,
|
||||||
|
int rank_in,
|
||||||
|
int dir,
|
||||||
|
MyList<var> *VarLists,
|
||||||
|
int state_count,
|
||||||
|
int myrank,
|
||||||
|
std::vector<Block *> &uploaded)
|
||||||
|
{
|
||||||
|
if (dir != PACK && dir != UNPACK)
|
||||||
|
{
|
||||||
|
cuda_host_batch_diag("bad_dir", state_count, -1);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (!cuda_device_segment_batch_enabled())
|
||||||
|
{
|
||||||
|
cuda_host_batch_diag("segment_batch_off", state_count, -1);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (!cuda_device_state_count_supported(state_count))
|
||||||
|
{
|
||||||
|
cuda_host_batch_diag("unsupported_state_count", state_count, -1);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (cuda_amr_restrict_compare_enabled())
|
||||||
|
{
|
||||||
|
cuda_host_batch_diag("compare_enabled", state_count, -1);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int total = 0;
|
||||||
|
bool has_work = false;
|
||||||
|
bool has_amr = false;
|
||||||
|
while (src && dst)
|
||||||
|
{
|
||||||
|
const bool active =
|
||||||
|
(dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
|
||||||
|
(dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank);
|
||||||
|
if (active)
|
||||||
|
{
|
||||||
|
has_work = true;
|
||||||
|
if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
|
||||||
|
{
|
||||||
|
cuda_host_batch_diag("null_segment", state_count, -1);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
int type;
|
||||||
|
if (src->data->Bg->lev == dst->data->Bg->lev)
|
||||||
|
type = 1;
|
||||||
|
else if (src->data->Bg->lev > dst->data->Bg->lev)
|
||||||
|
type = 2;
|
||||||
|
else
|
||||||
|
type = 3;
|
||||||
|
|
||||||
|
if (type == 2 || type == 3)
|
||||||
|
has_amr = true;
|
||||||
|
if (dir == PACK && type == 2 && !cuda_amr_restrict_batch_enabled())
|
||||||
|
{
|
||||||
|
cuda_host_batch_diag("restrict_batch_off", state_count, type);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (dir == PACK)
|
||||||
|
{
|
||||||
|
if ((type == 2 || type == 3) &&
|
||||||
|
!cuda_ensure_bssn_block_resident_for_pack(src->data->Bg, VarLists,
|
||||||
|
state_count, uploaded))
|
||||||
|
{
|
||||||
|
cuda_host_batch_diag("resident_upload_failed", state_count, type);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (!cuda_can_direct_pack(src->data, dst->data, type))
|
||||||
|
{
|
||||||
|
cuda_host_batch_diag("direct_pack_ineligible", state_count, type);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (!cuda_can_direct_unpack(dst->data, type))
|
||||||
|
{
|
||||||
|
cuda_host_batch_diag("direct_unpack_ineligible", state_count, type);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
total += state_count * dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
|
||||||
|
}
|
||||||
|
src = src->next;
|
||||||
|
dst = dst->next;
|
||||||
|
}
|
||||||
|
if (!has_work)
|
||||||
|
return 0;
|
||||||
|
if (!has_amr)
|
||||||
|
{
|
||||||
|
cuda_host_batch_diag("no_amr_segment", state_count, -1);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
return total;
|
||||||
|
}
|
||||||
|
|
||||||
|
int cuda_data_packer_host_staged_batched(double *host_data,
|
||||||
|
MyList<Parallel::gridseg> *src,
|
||||||
|
MyList<Parallel::gridseg> *dst,
|
||||||
|
int rank_in,
|
||||||
|
int dir,
|
||||||
|
MyList<var> *VarLists,
|
||||||
|
MyList<var> *VarListd,
|
||||||
|
int Symmetry)
|
||||||
|
{
|
||||||
|
if (!host_data || !cuda_amr_host_staged_enabled())
|
||||||
|
{
|
||||||
|
cuda_host_batch_diag(!host_data ? "null_host_data" : "host_staged_off", -1, -1);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int myrank;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
|
|
||||||
|
const int state_count = cuda_state_var_count(VarLists, VarListd);
|
||||||
|
if (state_count < 0)
|
||||||
|
{
|
||||||
|
cuda_host_batch_diag("var_list_mismatch", state_count, -1);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Block *> uploaded;
|
||||||
|
const int total = cuda_transfer_active_length_if_batched_eligible(src, dst, rank_in,
|
||||||
|
dir, VarLists,
|
||||||
|
state_count, myrank,
|
||||||
|
uploaded);
|
||||||
|
if (total <= 0)
|
||||||
|
return total;
|
||||||
|
|
||||||
|
static double *stage_dev = 0;
|
||||||
|
static int stage_cap = 0;
|
||||||
|
if (total > stage_cap)
|
||||||
|
{
|
||||||
|
free_device_comm_buffer(stage_dev);
|
||||||
|
stage_dev = alloc_device_comm_buffer(total);
|
||||||
|
stage_cap = total;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dir == UNPACK)
|
||||||
|
{
|
||||||
|
cudaError_t h2d = cudaMemcpy(stage_dev, host_data, (size_t)total * sizeof(double),
|
||||||
|
cudaMemcpyHostToDevice);
|
||||||
|
if (h2d != cudaSuccess)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Parallel: CUDA host-staged batched unpack cudaMemcpy failed, err=%d\n",
|
||||||
|
(int)h2d);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
|
const int packed = cuda_data_packer_device_batched(stage_dev, src, dst, rank_in, dir,
|
||||||
|
VarLists, VarListd, Symmetry);
|
||||||
|
if (packed != total)
|
||||||
|
{
|
||||||
|
cuda_host_batch_diag("device_batched_failed", state_count, -1);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dir == PACK)
|
||||||
|
{
|
||||||
|
cudaError_t d2h = cudaMemcpy(host_data, stage_dev, (size_t)total * sizeof(double),
|
||||||
|
cudaMemcpyDeviceToHost);
|
||||||
|
if (d2h != cudaSuccess)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Parallel: CUDA host-staged batched pack cudaMemcpy failed, err=%d\n",
|
||||||
|
(int)d2h);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sync_profile_enabled())
|
||||||
|
{
|
||||||
|
const double dt = MPI_Wtime() - t0;
|
||||||
|
if (dir == PACK)
|
||||||
|
sync_profile_stats().direct_pack_sec += dt;
|
||||||
|
else
|
||||||
|
sync_profile_stats().direct_unpack_sec += dt;
|
||||||
|
}
|
||||||
|
return total;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
|
bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
|
||||||
@@ -5347,6 +5572,16 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
|||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
if (data && (dir == PACK || dir == UNPACK))
|
||||||
|
{
|
||||||
|
const int batched = cuda_data_packer_host_staged_batched(data, src, dst, rank_in, dir,
|
||||||
|
VarLists, VarListd, Symmetry);
|
||||||
|
if (batched >= 0)
|
||||||
|
return batched;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
int type; /* 1 copy, 2 restrict, 3 prolong */
|
int type; /* 1 copy, 2 restrict, 3 prolong */
|
||||||
if (src->data->Bg->lev == dst->data->Bg->lev)
|
if (src->data->Bg->lev == dst->data->Bg->lev)
|
||||||
type = 1;
|
type = 1;
|
||||||
|
|||||||
@@ -167,6 +167,12 @@ def _gpu_runtime_env():
|
|||||||
"AMSS_INTERP_GPU": "0",
|
"AMSS_INTERP_GPU": "0",
|
||||||
"AMSS_CUDA_AWARE_MPI": "0",
|
"AMSS_CUDA_AWARE_MPI": "0",
|
||||||
})
|
})
|
||||||
|
if finite_difference == "8th-order" and getattr(input_data, "Equation_Class", "") == "BSSN-EM":
|
||||||
|
defaults.update({
|
||||||
|
"AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
|
||||||
|
"AMSS_CUDA_AMR_RESTRICT_BATCH": "1",
|
||||||
|
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "1",
|
||||||
|
})
|
||||||
if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"):
|
if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"):
|
||||||
defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1"
|
defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1"
|
||||||
if getattr(input_data, "Equation_Class", "") == "Z4C":
|
if getattr(input_data, "Equation_Class", "") == "Z4C":
|
||||||
|
|||||||
Reference in New Issue
Block a user