Complete BSSN-EScalar CUDA resident transfers

This commit is contained in:
2026-05-05 23:57:42 +08:00
parent 85fe29cc2e
commit ae64a22178
5 changed files with 995 additions and 72 deletions

View File

@@ -329,6 +329,19 @@ bool cuda_state_count_direct_supported(int state_count)
#endif
}
#if USE_CUDA_BSSN
bool cuda_prepare_inter_time_device_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_PREPARE_INTER_DEVICE");
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
}
return enabled != 0;
}
#endif
bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type)
{
if (!src || !dst || !src->Bg)
@@ -526,6 +539,26 @@ bool cuda_cached_device_buffers_enabled(int state_count)
return cuda_aware_mpi_enabled();
}
bool cuda_uncached_device_buffers_enabled(int state_count)
{
#if USE_CUDA_BSSN
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_UNCACHED_DEVICE_BUFFERS");
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
}
if (!enabled)
return false;
if (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT)
return false;
return cuda_aware_mpi_enabled();
#else
(void)state_count;
return false;
#endif
}
bool cuda_amr_restrict_device_enabled()
{
static int enabled = -1;
@@ -570,6 +603,17 @@ bool cuda_amr_restrict_compare_enabled()
return enabled != 0;
}
bool cuda_amr_prolong_compare_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_AMR_PROLONG_COMPARE");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
bool cuda_amr_restrict_batch_enabled()
{
static int enabled = -1;
@@ -673,6 +717,57 @@ void ensure_device_comm_buffer(double **buffers, int *caps, int idx, int length)
caps[idx] = length;
}
struct UncachedDeviceBuffers
{
int cpusize;
double **send_bufs;
double **recv_bufs;
int *send_caps;
int *recv_caps;
UncachedDeviceBuffers()
: cpusize(0), send_bufs(0), recv_bufs(0), send_caps(0), recv_caps(0)
{
}
};
UncachedDeviceBuffers &uncached_device_buffers()
{
static UncachedDeviceBuffers buffers;
return buffers;
}
void ensure_uncached_device_buffers(int cpusize)
{
UncachedDeviceBuffers &buffers = uncached_device_buffers();
if (buffers.cpusize == cpusize && buffers.send_bufs && buffers.recv_bufs)
return;
for (int i = 0; i < buffers.cpusize; ++i)
{
if (buffers.send_bufs && buffers.send_bufs[i])
free_device_comm_buffer(buffers.send_bufs[i]);
if (buffers.recv_bufs && buffers.recv_bufs[i])
free_device_comm_buffer(buffers.recv_bufs[i]);
}
delete[] buffers.send_bufs;
delete[] buffers.recv_bufs;
delete[] buffers.send_caps;
delete[] buffers.recv_caps;
buffers.cpusize = cpusize;
buffers.send_bufs = new double *[cpusize];
buffers.recv_bufs = new double *[cpusize];
buffers.send_caps = new int[cpusize];
buffers.recv_caps = new int[cpusize];
for (int i = 0; i < cpusize; ++i)
{
buffers.send_bufs[i] = 0;
buffers.recv_bufs[i] = 0;
buffers.send_caps[i] = 0;
buffers.recv_caps[i] = 0;
}
}
bool cuda_direct_pack_segment_to_device(double *buffer,
const Parallel::gridseg *src,
const Parallel::gridseg *dst,
@@ -849,10 +944,109 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
coarse_lb[0], coarse_lb[1], coarse_lb[2],
have_soa ? soa_flat : 0) == 0
: bssn_cuda_prolong_state_batch_to_device_buffer(
src->Bg, state_count, buffer, src->Bg->shape,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
src->Bg, state_count, buffer, src->Bg->shape,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0;
if (ok && cuda_amr_prolong_compare_enabled())
{
const int region_all = dst->shape[0] * dst->shape[1] * dst->shape[2];
const int total = state_count * region_all;
double *cpu = new double[total];
double *gpu = new double[total];
if (!cuda_download_resident_subset_to_host(src->Bg, VarLists, state_count))
{
delete[] cpu;
delete[] gpu;
return false;
}
int DIM = dim;
MyList<var> *v = VarLists;
for (int s = 0; s < state_count && v; ++s, v = v->next)
{
f_prolong3(DIM,
src->Bg->bbox,
src->Bg->bbox + dim,
src->Bg->shape,
src->Bg->fgfs[v->data->sgfn],
const_cast<double *>(dst->llb),
const_cast<double *>(dst->uub),
const_cast<int *>(dst->shape),
cpu + (size_t)s * region_all,
const_cast<double *>(dst->llb),
const_cast<double *>(dst->uub),
v->data->SoA,
Symmetry);
}
cudaError_t cerr = cudaMemcpy(gpu, buffer, (size_t)total * sizeof(double), cudaMemcpyDeviceToHost);
if (cerr != cudaSuccess)
{
fprintf(stderr, "Parallel: prolong compare cudaMemcpy failed, err=%d\n", (int)cerr);
delete[] cpu;
delete[] gpu;
return false;
}
double max_abs = 0.0;
double max_rel = 0.0;
int max_idx = -1;
for (int i = 0; i < total; ++i)
{
const double diff = fabs(cpu[i] - gpu[i]);
const double den = fmax(fabs(cpu[i]), fabs(gpu[i]));
const double rel = den > 0.0 ? diff / den : diff;
if (diff > max_abs)
{
max_abs = diff;
max_rel = rel;
max_idx = i;
}
}
static int report_count = 0;
const double tol = cuda_amr_restrict_compare_tol();
int rank = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (max_abs > tol || max_rel > tol)
{
const int state = max_idx / region_all;
const int local = max_idx - state * region_all;
const int ii = local % dst->shape[0];
const int jj = (local / dst->shape[0]) % dst->shape[1];
const int kk = local / (dst->shape[0] * dst->shape[1]);
if (report_count < cuda_amr_restrict_compare_limit())
{
fprintf(stderr,
"[AMSS-CUDA-PROLONG-CMP][rank %d] mismatch state=%d point=(%d,%d,%d) "
"shape=(%d,%d,%d) first_fine=(%d,%d,%d) coarse_lb=(%d,%d,%d) "
"max_abs=%.17e max_rel=%.17e cpu=%.17e gpu=%.17e src_lev=%d dst_lev=%d\n",
rank, state, ii, jj, kk,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
coarse_lb[0], coarse_lb[1], coarse_lb[2],
max_abs, max_rel, cpu[max_idx], gpu[max_idx],
src->Bg->lev, dst->Bg->lev);
fflush(stderr);
report_count++;
}
delete[] cpu;
delete[] gpu;
return false;
}
else if (report_count < cuda_amr_restrict_compare_limit())
{
fprintf(stderr,
"[AMSS-CUDA-PROLONG-CMP][rank %d] ok shape=(%d,%d,%d) "
"first_fine=(%d,%d,%d) coarse_lb=(%d,%d,%d) max_abs=%.17e max_rel=%.17e\n",
rank,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
coarse_lb[0], coarse_lb[1], coarse_lb[2],
max_abs, max_rel);
fflush(stderr);
report_count++;
}
delete[] cpu;
delete[] gpu;
}
}
if (sync_profile_enabled())
sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
@@ -941,22 +1135,34 @@ bool cuda_download_resident_subset_to_host(Block *block,
bool cuda_unpack_host_region_to_resident(Block *block,
int state_index,
double *buffer,
const Parallel::gridseg *dst)
const Parallel::gridseg *dst,
MyList<var> *vars,
int state_count)
{
#if USE_CUDA_BSSN
if (!block || !dst || state_index < 0 || state_index >= AMSS_BSSN_CUDA_MAX_STATE_COUNT)
if (!block || !dst || !vars || state_count <= 0 ||
state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT ||
state_index < 0 || state_index >= state_count)
return false;
if (bssn_cuda_has_resident_state(block) == 0)
return true;
double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
MyList<var> *v = vars;
for (int i = 0; i < state_count; ++i)
{
if (!v)
return false;
views[i] = block->fgfs[v->data->sgfn];
v = v->next;
}
const int i0 = cuda_seg_begin(dst, block, 0);
const int j0 = cuda_seg_begin(dst, block, 1);
const int k0 = cuda_seg_begin(dst, block, 2);
return bssn_cuda_unpack_state_region_from_host_buffer(
block, state_index, buffer, block->shape,
i0, j0, k0,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
return bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(
block, views, state_count, state_index, buffer, block->shape,
i0, j0, k0, dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
#else
(void)block; (void)state_index; (void)buffer; (void)dst;
(void)block; (void)state_index; (void)buffer; (void)dst; (void)vars; (void)state_count;
return false;
#endif
}
@@ -5178,8 +5384,9 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
if (cuda_state_count_direct_supported(state_count) &&
dst->data && dst->data->Bg && bssn_cuda_has_resident_state(dst->data->Bg))
{
if (type != 2 && type != 3 &&
!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data))
if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx,
data + size_out, dst->data,
VarListd, state_count))
{
cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
@@ -5318,13 +5525,40 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
double **rec_data = new double *[cpusize];
int *send_lengths = new int[cpusize];
int *recv_lengths = new int[cpusize];
int *send_req_index = new int[cpusize];
unsigned char *send_is_dev = new unsigned char[cpusize];
unsigned char *recv_is_dev = new unsigned char[cpusize];
for (node = 0; node < cpusize; node++)
{
send_data[node] = rec_data[node] = 0;
send_lengths[node] = recv_lengths[node] = 0;
send_req_index[node] = -1;
send_is_dev[node] = recv_is_dev[node] = 0;
}
#if USE_CUDA_BSSN || USE_CUDA_Z4C
const int state_count = cuda_state_var_count(VarList1, VarList2);
if (state_count < 0)
{
cout << "Parallel::transfer: variable lists do not match." << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
int cuda_device_sends = 0;
if (cuda_uncached_device_buffers_enabled(state_count))
{
ensure_uncached_device_buffers(cpusize);
for (node = 0; node < cpusize; node++)
{
send_is_dev[node] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], node, state_count, myrank) ? 1 : 0;
recv_is_dev[node] = cuda_recv_to_device_eligible(src[node], dst[node], node, state_count, myrank) ? 1 : 0;
}
recv_is_dev[myrank] = (send_is_dev[myrank] && recv_is_dev[myrank]) ? 1 : 0;
for (node = 0; node < cpusize; node++)
cuda_device_sends += send_is_dev[node] ? 1 : 0;
}
#endif
// Post receives first so peers can progress rendezvous early.
for (node = 0; node < cpusize; node++)
{
@@ -5333,13 +5567,22 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
if (recv_lengths[node] > 0)
{
rec_data[node] = new double[recv_lengths[node]];
if (!rec_data[node])
if (recv_is_dev[node])
{
cout << "out of memory when new in short transfer, place 1" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
UncachedDeviceBuffers &dev_buffers = uncached_device_buffers();
ensure_device_comm_buffer(dev_buffers.recv_bufs, dev_buffers.recv_caps, node, recv_lengths[node]);
MPI_Irecv((void *)dev_buffers.recv_bufs[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
}
else
{
rec_data[node] = new double[recv_lengths[node]];
if (!rec_data[node])
{
cout << "out of memory when new in short transfer, place 1" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
}
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 1;
req_no++;
@@ -5351,13 +5594,22 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
if (recv_lengths[myrank] > 0)
{
rec_data[myrank] = new double[recv_lengths[myrank]];
if (!rec_data[myrank])
if (recv_is_dev[myrank])
{
cout << "out of memory when new in short transfer, place 2" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
UncachedDeviceBuffers &dev_buffers = uncached_device_buffers();
ensure_device_comm_buffer(dev_buffers.recv_bufs, dev_buffers.recv_caps, myrank, recv_lengths[myrank]);
data_packer_with_device_buffer(dev_buffers.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
}
else
{
rec_data[myrank] = new double[recv_lengths[myrank]];
if (!rec_data[myrank])
{
cout << "out of memory when new in short transfer, place 2" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
}
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
}
// Pack and post sends.
@@ -5368,19 +5620,51 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (send_lengths[node] > 0)
{
send_data[node] = new double[send_lengths[node]];
if (!send_data[node])
if (send_is_dev[node])
{
cout << "out of memory when new in short transfer, place 3" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
UncachedDeviceBuffers &dev_buffers = uncached_device_buffers();
ensure_device_comm_buffer(dev_buffers.send_bufs, dev_buffers.send_caps, node, send_lengths[node]);
data_packer_with_device_buffer(dev_buffers.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
}
data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
else
{
send_data[node] = new double[send_lengths[node]];
if (!send_data[node])
{
cout << "out of memory when new in short transfer, place 3" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
}
send_req_index[node] = req_no;
req_node[req_no] = node;
req_is_recv[req_no] = 0;
req_no++;
}
}
#if USE_CUDA_BSSN || USE_CUDA_Z4C
if (cuda_device_sends > 0)
cudaDeviceSynchronize();
#endif
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
if (send_lengths[node] > 0)
{
const int req_idx = send_req_index[node];
if (req_idx < 0)
continue;
if (send_is_dev[node])
{
UncachedDeviceBuffers &dev_buffers = uncached_device_buffers();
MPI_Isend((void *)dev_buffers.send_bufs[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_idx);
}
else
{
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_idx);
}
}
}
// Unpack as soon as receive completes to reduce pure wait time.
while (pending_recv > 0)
@@ -5395,7 +5679,15 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
if (idx >= 0 && req_is_recv[idx])
{
int recv_node = req_node[idx];
data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
if (recv_is_dev[recv_node])
{
UncachedDeviceBuffers &dev_buffers = uncached_device_buffers();
data_packer_with_device_buffer(dev_buffers.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
}
else
{
data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
}
pending_recv--;
}
}
@@ -5403,7 +5695,12 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
if (rec_data[myrank])
if (recv_is_dev[myrank] && recv_lengths[myrank] > 0)
{
UncachedDeviceBuffers &dev_buffers = uncached_device_buffers();
data_packer_with_device_buffer(dev_buffers.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
}
else if (rec_data[myrank])
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
for (node = 0; node < cpusize; node++)
@@ -5423,6 +5720,9 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
delete[] rec_data;
delete[] send_lengths;
delete[] recv_lengths;
delete[] send_req_index;
delete[] send_is_dev;
delete[] recv_is_dev;
}
//
void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
@@ -7057,7 +7357,13 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
double *src2_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const int state_count = cuda_state_var_count(VarList1, VarList2);
if (cuda_state_count_direct_supported(state_count) &&
const bool have_cuda_views =
cuda_state_count_direct_supported(state_count) &&
cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views);
if (cuda_prepare_inter_time_device_enabled() &&
have_cuda_views &&
cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views) &&
@@ -7093,6 +7399,10 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
varl2 = varl2->next;
varl3 = varl3->next;
}
#if USE_CUDA_BSSN
if (have_cuda_views && bssn_cuda_has_resident_state(cg))
bssn_cuda_upload_resident_state_count(cg, cg->shape, dst_views, state_count);
#endif
}
if (BP == Pat->ble)
break;
@@ -7133,7 +7443,14 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
double *src3_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const int state_count = cuda_state_var_count(VarList1, VarList2);
if (cuda_state_count_direct_supported(state_count) &&
const bool have_cuda_views =
cuda_state_count_direct_supported(state_count) &&
cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) &&
cuda_build_bssn_host_views(cg, VarList4, state_count, dst_views);
if (cuda_prepare_inter_time_device_enabled() &&
have_cuda_views &&
cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) &&
@@ -7174,6 +7491,10 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
varl3 = varl3->next;
varl4 = varl4->next;
}
#if USE_CUDA_BSSN
if (have_cuda_views && bssn_cuda_has_resident_state(cg))
bssn_cuda_upload_resident_state_count(cg, cg->shape, dst_views, state_count);
#endif
}
if (BP == Pat->ble)
break;