Optimize BSSN CUDA resident AMR prolong path

This commit is contained in:
2026-04-30 10:58:15 +08:00
parent 1ee229a91f
commit 18e9c9cc50
3 changed files with 778 additions and 78 deletions

View File

@@ -190,6 +190,25 @@ bool cuda_build_bssn_host_views(Block *block,
}
return v == 0;
}
// Flattens the three SoA entries of every variable in `vars` into `soa_flat`,
// laid out as soa_flat[3 * i + c] for list position i and component c in 0..2.
//
// Parameters:
//   vars        - head of the variable list; must contain exactly
//                 BSSN_CUDA_STATE_COUNT nodes.
//   state_count - must equal BSSN_CUDA_STATE_COUNT, otherwise nothing is written.
//   soa_flat    - output array with room for 3 * BSSN_CUDA_STATE_COUNT doubles.
//
// Returns true only when the list has exactly BSSN_CUDA_STATE_COUNT nodes and
// every node carries a payload. On a false return `soa_flat` may have been
// partially written and must not be used.
bool cuda_build_bssn_soa(MyList<var> *vars,
                         int state_count,
                         double *soa_flat)
{
    if (!vars || !soa_flat || state_count != BSSN_CUDA_STATE_COUNT)
        return false;

    MyList<var> *v = vars;
    for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i)
    {
        // Reject a list that is too short, and also a node without a payload:
        // dereferencing a null `data` here would crash instead of letting the
        // caller fall back to the no-SoA path.
        if (!v || !v->data)
            return false;

        double *out = soa_flat + 3 * i;
        out[0] = v->data->SoA[0];
        out[1] = v->data->SoA[1];
        out[2] = v->data->SoA[2];
        v = v->next;
    }

    // A list longer than BSSN_CUDA_STATE_COUNT is treated as a mismatch.
    return v == 0;
}
#endif
#if USE_CUDA_BSSN || USE_CUDA_Z4C
@@ -198,6 +217,9 @@ int fortran_idint(double x)
return (int)x;
}
// Forward declarations of the environment-driven AMR toggles; the definitions
// appear later in this file. They are declared here so that the direct-pack
// eligibility check can consult them before the definitions are reached.
// Device-side restriction is off unless AMSS_CUDA_AMR_RESTRICT_DEVICE is set
// to a non-zero integer.
bool cuda_amr_restrict_device_enabled();
// Device-side prolongation is on unless AMSS_CUDA_AMR_PROLONG_DEVICE is set
// to a value that parses as zero.
bool cuda_amr_prolong_device_enabled();
bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src,
const Parallel::gridseg *dst,
int first_fine[3])
@@ -226,7 +248,7 @@ bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src,
const int lbc = fortran_idint((llbc - base) / CD + 0.4) + 1;
const int lbf = fortran_idint((llbf - base) / FD + 0.4) + 1;
first_fine[d] = 2 * lbc - lbf - 1;
if (first_fine[d] - 2 < 0)
if (first_fine[d] < 0)
return false;
if (first_fine[d] + 2 * (dst->shape[d] - 1) + 3 >= src->Bg->shape[d])
return false;
@@ -271,7 +293,7 @@ bool cuda_cell_gw3_prolong_params(const Parallel::gridseg *src,
const int first_coarse = first_fine_ii[d] / 2 - coarse_lb[d];
const int last_fine_ii = first_fine_ii[d] + dst->shape[d] - 1;
const int last_coarse = last_fine_ii / 2 - coarse_lb[d];
if (first_coarse - 2 < 0)
if (first_coarse < -1)
return false;
if (last_coarse + 3 >= src->Bg->shape[d])
return false;
@@ -306,13 +328,21 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg
#elif USE_CUDA_BSSN
if (bssn_cuda_has_resident_state(src->Bg) == 0)
return false;
if (type == 1)
return true;
int a[3], b[3];
if (type == 2)
return cuda_cell_gw3_restrict_params(src, dst, a);
if (type == 3)
return cuda_cell_gw3_prolong_params(src, dst, a, b);
if (type == 1)
return true;
int a[3], b[3];
if (type == 2)
{
if (!cuda_amr_restrict_device_enabled())
return false;
return cuda_cell_gw3_restrict_params(src, dst, a);
}
if (type == 3)
{
if (!cuda_amr_prolong_device_enabled())
return false;
return cuda_cell_gw3_prolong_params(src, dst, a, b);
}
return false;
#else
(void)type;
@@ -427,6 +457,28 @@ bool cuda_aware_mpi_enabled()
return enabled != 0;
}
// Reports whether device-side AMR restriction is switched on.
//
// The answer comes from the AMSS_CUDA_AMR_RESTRICT_DEVICE environment variable
// and is opt-in: only a value that atoi() parses as non-zero enables it. The
// variable is read on the first call only; later calls return the cached
// answer even if the environment changes.
bool cuda_amr_restrict_device_enabled()
{
    // -1 = not read yet, 0 = disabled, 1 = enabled.
    static int cached = -1;
    if (cached >= 0)
        return cached != 0;

    const char *value = getenv("AMSS_CUDA_AMR_RESTRICT_DEVICE");
    if (value != 0 && atoi(value) != 0)
        cached = 1;
    else
        cached = 0;
    return cached != 0;
}
// Reports whether device-side AMR prolongation is switched on.
//
// The answer comes from the AMSS_CUDA_AMR_PROLONG_DEVICE environment variable
// and is opt-out: it is enabled when the variable is unset, and disabled only
// when the variable is set to something atoi() parses as zero. The variable is
// read on the first call only; later calls return the cached answer.
bool cuda_amr_prolong_device_enabled()
{
    // -1 = not read yet, 0 = disabled, 1 = enabled.
    static int cached = -1;
    if (cached < 0)
    {
        const char *value = getenv("AMSS_CUDA_AMR_PROLONG_DEVICE");
        const bool switched_off = (value != 0) && (atoi(value) == 0);
        cached = switched_off ? 0 : 1;
    }
    return cached == 1;
}
bool cuda_mpi_diag_enabled()
{
static int enabled = -1;
@@ -438,6 +490,17 @@ bool cuda_mpi_diag_enabled()
return enabled != 0 || sync_profile_enabled();
}
// Returns the maximum number of CUDA/MPI diagnostic lines to print.
//
// The value is taken from AMSS_CUDA_MPI_DIAG_LIMIT when it parses (via atoi)
// to a positive integer; otherwise the limit is 10. The variable is read on
// the first call only and the result is cached for all later calls.
int cuda_mpi_diag_limit()
{
    // -1 = not read yet; any cached value is strictly positive.
    static int cached = -1;
    if (cached >= 0)
        return cached;

    const char *value = getenv("AMSS_CUDA_MPI_DIAG_LIMIT");
    const int parsed = value ? atoi(value) : 0;
    cached = (parsed > 0) ? parsed : 10;
    return cached;
}
double *alloc_device_comm_buffer(int length)
{
if (length <= 0)
@@ -486,9 +549,11 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
return false;
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
bool ok = false;
double *views[BSSN_CUDA_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
bool ok = false;
double *views[BSSN_CUDA_STATE_COUNT];
double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
const bool have_soa = cuda_build_bssn_soa(VarLists, state_count, soa_flat);
if (type == 1)
{
const int i0 = cuda_seg_begin(dst, src->Bg, 0);
@@ -509,14 +574,15 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
int first_fine[3];
if (!cuda_cell_gw3_restrict_params(src, dst, first_fine))
return false;
ok = have_views
? bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(
src->Bg, views, state_count, buffer, src->Bg->shape,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine[0], first_fine[1], first_fine[2]) == 0
: bssn_cuda_restrict_state_batch_to_device_buffer(
src->Bg, state_count, buffer, src->Bg->shape,
dst->shape[0], dst->shape[1], dst->shape[2],
ok = have_views
? bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(
src->Bg, views, state_count, buffer, src->Bg->shape,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine[0], first_fine[1], first_fine[2],
have_soa ? soa_flat : 0) == 0
: bssn_cuda_restrict_state_batch_to_device_buffer(
src->Bg, state_count, buffer, src->Bg->shape,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine[0], first_fine[1], first_fine[2]) == 0;
}
else if (type == 3)
@@ -524,13 +590,14 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
int first_fine_ii[3], coarse_lb[3];
if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb))
return false;
ok = have_views
? bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(
src->Bg, views, state_count, buffer, src->Bg->shape,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0
: bssn_cuda_prolong_state_batch_to_device_buffer(
ok = have_views
? bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(
src->Bg, views, state_count, buffer, src->Bg->shape,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
coarse_lb[0], coarse_lb[1], coarse_lb[2],
have_soa ? soa_flat : 0) == 0
: bssn_cuda_prolong_state_batch_to_device_buffer(
src->Bg, state_count, buffer, src->Bg->shape,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
@@ -643,19 +710,39 @@ bool cuda_flush_device_segment_batch(Block *block,
int state_count,
const std::vector<int> &meta,
int dir,
int type,
MyList<var> *vars)
{
if (!block || meta.empty())
return true;
const int segment_count = (int)(meta.size() / 8);
double *views[BSSN_CUDA_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views);
if (dir == PACK)
const int stride = (dir == PACK && type == 3) ? 11 : 8;
const int segment_count = (int)(meta.size() / stride);
double *views[BSSN_CUDA_STATE_COUNT];
double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views);
const bool have_soa = cuda_build_bssn_soa(vars, state_count, soa_flat);
if (dir == PACK)
{
if (type == 2)
return have_views
? bssn_cuda_restrict_state_segments_to_device_buffer_for_host_views(
block, views, state_count, data, block->shape, segment_count,
meta.data(), have_soa ? soa_flat : 0) == 0
: bssn_cuda_restrict_state_segments_to_device_buffer(
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
if (type == 3)
return have_views
? bssn_cuda_prolong_state_segments_to_device_buffer_for_host_views(
block, views, state_count, data, block->shape, segment_count,
meta.data(), have_soa ? soa_flat : 0) == 0
: bssn_cuda_prolong_state_segments_to_device_buffer(
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
return have_views
? bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(
block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
: bssn_cuda_pack_state_segments_to_device_buffer(
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
}
return have_views
? bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(
block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
@@ -685,6 +772,7 @@ int cuda_data_packer_device_batched(double *data,
int size_out = 0;
Block *batch_block = 0;
int batch_type = 0;
std::vector<int> batch_meta;
batch_meta.reserve(64);
@@ -702,42 +790,72 @@ int cuda_data_packer_device_batched(double *data,
type = 2;
else
type = 3;
if (type != 1)
return -1;
Block *block = (dir == PACK) ? src->data->Bg : dst->data->Bg;
if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type)) ||
(dir == UNPACK && !cuda_can_direct_unpack(dst->data, type)))
return -1;
if (batch_block && batch_block != block)
if (batch_block && (batch_block != block || batch_type != type))
{
MyList<var> *batch_vars = (dir == PACK) ? VarLists : VarListd;
if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_vars))
if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_type, batch_vars))
return -1;
batch_meta.clear();
}
batch_block = block;
batch_type = type;
const int i0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 0)
: cuda_seg_begin(dst->data, block, 0);
const int j0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 1)
: cuda_seg_begin(dst->data, block, 1);
const int k0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 2)
: cuda_seg_begin(dst->data, block, 2);
const int sx = dst->data->shape[0];
const int sy = dst->data->shape[1];
const int sz = dst->data->shape[2];
const int region_all = sx * sy * sz;
batch_meta.push_back(i0);
batch_meta.push_back(j0);
batch_meta.push_back(k0);
batch_meta.push_back(sx);
batch_meta.push_back(sy);
batch_meta.push_back(sz);
batch_meta.push_back(region_all);
batch_meta.push_back(size_out);
if (dir == UNPACK || type == 1)
{
const int i0 = cuda_seg_begin(dst->data, block, 0);
const int j0 = cuda_seg_begin(dst->data, block, 1);
const int k0 = cuda_seg_begin(dst->data, block, 2);
batch_meta.push_back(i0);
batch_meta.push_back(j0);
batch_meta.push_back(k0);
batch_meta.push_back(sx);
batch_meta.push_back(sy);
batch_meta.push_back(sz);
batch_meta.push_back(region_all);
batch_meta.push_back(size_out);
}
else if (type == 2)
{
int first_fine[3];
if (!cuda_cell_gw3_restrict_params(src->data, dst->data, first_fine))
return -1;
batch_meta.push_back(sx);
batch_meta.push_back(sy);
batch_meta.push_back(sz);
batch_meta.push_back(region_all);
batch_meta.push_back(size_out);
batch_meta.push_back(first_fine[0]);
batch_meta.push_back(first_fine[1]);
batch_meta.push_back(first_fine[2]);
}
else
{
int first_fine_ii[3], coarse_lb[3];
if (!cuda_cell_gw3_prolong_params(src->data, dst->data, first_fine_ii, coarse_lb))
return -1;
batch_meta.push_back(sx);
batch_meta.push_back(sy);
batch_meta.push_back(sz);
batch_meta.push_back(region_all);
batch_meta.push_back(size_out);
batch_meta.push_back(first_fine_ii[0]);
batch_meta.push_back(first_fine_ii[1]);
batch_meta.push_back(first_fine_ii[2]);
batch_meta.push_back(coarse_lb[0]);
batch_meta.push_back(coarse_lb[1]);
batch_meta.push_back(coarse_lb[2]);
}
size_out += state_count * region_all;
}
@@ -748,7 +866,7 @@ int cuda_data_packer_device_batched(double *data,
if (batch_block)
{
MyList<var> *batch_vars = (dir == PACK) ? VarLists : VarListd;
if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_vars))
if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_type, batch_vars))
return -1;
}
return size_out;
@@ -796,6 +914,89 @@ bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
return has_work;
}
// Counters filled in by cuda_collect_eligibility_stats() for the CUDA/MPI
// diagnostics. Callers zero the struct with `= {}`; member order is kept
// stable so the layout stays eight consecutive ints.
struct CudaEligibilityStats
{
    // Segment pairs that matched the requested direction and rank filter.
    int active;

    // Active pairs split by transfer type:
    //   type1 - source and destination blocks are on the same level,
    //   type2 - source level is greater than destination level,
    //   type3 - everything else.
    int type1, type2, type3;

    // Pairs with a missing segment or missing block pointer.
    int null_seg;

    // Pairs whose relevant block reported no resident device state.
    int no_resident;

    // Type 2/3 send pairs whose restrict/prolong parameter check was rejected.
    int param_fail;

    // Calls that bailed out because the state count is not supported.
    int unsupported_state;
};
// Walks the paired `src`/`dst` segment lists and tallies, for diagnostics only,
// how many segment pairs are relevant for this rank and why some of them may
// not be eligible for the device path.
//
// Parameters:
//   src, dst    - heads of the paired segment lists; walked in lock-step until
//                 either runs out.
//   rank_in     - the peer rank being considered.
//   dir         - PACK: count pairs sent from `myrank` to `rank_in`;
//                 UNPACK: count pairs received by `myrank` from `rank_in`.
//   myrank      - rank of the calling process.
//   state_count - number of state variables; if unsupported, only
//                 `stats.unsupported_state` is bumped and the lists are not
//                 walked.
//   stats       - counters that are incremented in place (never reset here).
//
// Pairs with a null segment or block pointer are counted in `stats.null_seg`
// and skipped before any field is read. Their direction/rank cannot be
// inspected, so they are not counted in `stats.active`.
void cuda_collect_eligibility_stats(MyList<Parallel::gridseg> *src,
                                    MyList<Parallel::gridseg> *dst,
                                    int rank_in,
                                    int dir,
                                    int myrank,
                                    int state_count,
                                    CudaEligibilityStats &stats)
{
    if (!cuda_device_state_count_supported(state_count))
    {
        stats.unsupported_state++;
        return;
    }

    for (; src && dst; src = src->next, dst = dst->next)
    {
        // Validate pointers BEFORE touching ->Bg->rank; the rank filter below
        // dereferences all four of them.
        if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
        {
            stats.null_seg++;
            continue;
        }

        const int src_rank = src->data->Bg->rank;
        const int dst_rank = dst->data->Bg->rank;
        const bool active =
            (dir == PACK && dst_rank == rank_in && src_rank == myrank) ||
            (dir == UNPACK && src_rank == rank_in && dst_rank == myrank);
        if (!active)
            continue;
        stats.active++;

        // Classify by relative refinement level of source and destination.
        int type;
        if (src->data->Bg->lev == dst->data->Bg->lev)
            type = 1;
        else if (src->data->Bg->lev > dst->data->Bg->lev)
            type = 2;
        else
            type = 3;

        if (type == 1)
            stats.type1++;
        else if (type == 2)
            stats.type2++;
        else
            stats.type3++;

#if USE_CUDA_BSSN
        if (dir == PACK)
        {
            // Sending: the source block must have resident device state, and
            // for type 2/3 the geometry check must accept the pair.
            if (bssn_cuda_has_resident_state(src->data->Bg) == 0)
                stats.no_resident++;
            else if (type == 2)
            {
                int first_fine[3];
                if (!cuda_cell_gw3_restrict_params(src->data, dst->data, first_fine))
                    stats.param_fail++;
            }
            else if (type == 3)
            {
                int first_fine_ii[3], coarse_lb[3];
                if (!cuda_cell_gw3_prolong_params(src->data, dst->data, first_fine_ii, coarse_lb))
                    stats.param_fail++;
            }
        }
        else
        {
            // Receiving: only the destination block's residency matters.
            if (bssn_cuda_has_resident_state(dst->data->Bg) == 0)
                stats.no_resident++;
        }
#else
        (void)type;
#endif
    }
}
bool cuda_pack_to_device_eligible(MyList<Parallel::gridseg> *src,
MyList<Parallel::gridseg> *dst,
int rank_in,
@@ -5379,19 +5580,33 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
cuda_device_sends += cache.send_buf_is_dev[n] ? 1 : 0;
cuda_device_recvs += cache.recv_buf_is_dev[n] ? 1 : 0;
}
if (cuda_mpi_diag_enabled())
{
static int diag_reported = 0;
int rep = diag_reported;
if (myrank == 0 && rep < 10)
{
if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1))
fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] transfer_cached: device_sends=%d "
"device_recvs=%d cuda_aware_mpi=%d\n",
myrank, cuda_device_sends, cuda_device_recvs,
cuda_aware_mpi_enabled() ? 1 : 0);
}
}
if (cuda_mpi_diag_enabled())
{
static int diag_reported = 0;
int rep = diag_reported;
if (myrank == 0 && rep < cuda_mpi_diag_limit())
{
if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1))
{
CudaEligibilityStats send_stats = {};
CudaEligibilityStats recv_stats = {};
for (int n = 0; n < cpusize; n++)
{
cuda_collect_eligibility_stats(src[myrank], dst[myrank], n, PACK, myrank, state_count, send_stats);
cuda_collect_eligibility_stats(src[n], dst[n], n, UNPACK, myrank, state_count, recv_stats);
}
fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] transfer_cached: device_sends=%d "
"device_recvs=%d cuda_aware_mpi=%d send_active=%d type=[%d,%d,%d] "
"send_nores=%d send_param=%d recv_active=%d recv_type=[%d,%d,%d] recv_nores=%d\n",
myrank, cuda_device_sends, cuda_device_recvs,
cuda_aware_mpi_enabled() ? 1 : 0,
send_stats.active, send_stats.type1, send_stats.type2, send_stats.type3,
send_stats.no_resident, send_stats.param_fail,
recv_stats.active, recv_stats.type1, recv_stats.type2, recv_stats.type3,
recv_stats.no_resident);
}
}
}
}
else
{
@@ -5688,7 +5903,7 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
{
static int diag_reported = 0;
int rep = diag_reported;
if (myrank == 0 && rep < 20)
if (myrank == 0 && rep < cuda_mpi_diag_limit())
{
if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1))
fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] Sync_start: device_sends=%d "