Complete BSSN-EScalar CUDA resident transfers

This commit is contained in:
2026-05-05 23:57:42 +08:00
parent 85fe29cc2e
commit ae64a22178
5 changed files with 995 additions and 72 deletions

View File

@@ -329,6 +329,19 @@ bool cuda_state_count_direct_supported(int state_count)
#endif #endif
} }
#if USE_CUDA_BSSN
bool cuda_prepare_inter_time_device_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_PREPARE_INTER_DEVICE");
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
}
return enabled != 0;
}
#endif
bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type) bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type)
{ {
if (!src || !dst || !src->Bg) if (!src || !dst || !src->Bg)
@@ -526,6 +539,26 @@ bool cuda_cached_device_buffers_enabled(int state_count)
return cuda_aware_mpi_enabled(); return cuda_aware_mpi_enabled();
} }
bool cuda_uncached_device_buffers_enabled(int state_count)
{
#if USE_CUDA_BSSN
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_UNCACHED_DEVICE_BUFFERS");
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
}
if (!enabled)
return false;
if (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT)
return false;
return cuda_aware_mpi_enabled();
#else
(void)state_count;
return false;
#endif
}
bool cuda_amr_restrict_device_enabled() bool cuda_amr_restrict_device_enabled()
{ {
static int enabled = -1; static int enabled = -1;
@@ -570,6 +603,17 @@ bool cuda_amr_restrict_compare_enabled()
return enabled != 0; return enabled != 0;
} }
bool cuda_amr_prolong_compare_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_AMR_PROLONG_COMPARE");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
bool cuda_amr_restrict_batch_enabled() bool cuda_amr_restrict_batch_enabled()
{ {
static int enabled = -1; static int enabled = -1;
@@ -673,6 +717,57 @@ void ensure_device_comm_buffer(double **buffers, int *caps, int idx, int length)
caps[idx] = length; caps[idx] = length;
} }
struct UncachedDeviceBuffers
{
int cpusize;
double **send_bufs;
double **recv_bufs;
int *send_caps;
int *recv_caps;
UncachedDeviceBuffers()
: cpusize(0), send_bufs(0), recv_bufs(0), send_caps(0), recv_caps(0)
{
}
};
UncachedDeviceBuffers &uncached_device_buffers()
{
static UncachedDeviceBuffers buffers;
return buffers;
}
void ensure_uncached_device_buffers(int cpusize)
{
UncachedDeviceBuffers &buffers = uncached_device_buffers();
if (buffers.cpusize == cpusize && buffers.send_bufs && buffers.recv_bufs)
return;
for (int i = 0; i < buffers.cpusize; ++i)
{
if (buffers.send_bufs && buffers.send_bufs[i])
free_device_comm_buffer(buffers.send_bufs[i]);
if (buffers.recv_bufs && buffers.recv_bufs[i])
free_device_comm_buffer(buffers.recv_bufs[i]);
}
delete[] buffers.send_bufs;
delete[] buffers.recv_bufs;
delete[] buffers.send_caps;
delete[] buffers.recv_caps;
buffers.cpusize = cpusize;
buffers.send_bufs = new double *[cpusize];
buffers.recv_bufs = new double *[cpusize];
buffers.send_caps = new int[cpusize];
buffers.recv_caps = new int[cpusize];
for (int i = 0; i < cpusize; ++i)
{
buffers.send_bufs[i] = 0;
buffers.recv_bufs[i] = 0;
buffers.send_caps[i] = 0;
buffers.recv_caps[i] = 0;
}
}
bool cuda_direct_pack_segment_to_device(double *buffer, bool cuda_direct_pack_segment_to_device(double *buffer,
const Parallel::gridseg *src, const Parallel::gridseg *src,
const Parallel::gridseg *dst, const Parallel::gridseg *dst,
@@ -849,10 +944,109 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
coarse_lb[0], coarse_lb[1], coarse_lb[2], coarse_lb[0], coarse_lb[1], coarse_lb[2],
have_soa ? soa_flat : 0) == 0 have_soa ? soa_flat : 0) == 0
: bssn_cuda_prolong_state_batch_to_device_buffer( : bssn_cuda_prolong_state_batch_to_device_buffer(
src->Bg, state_count, buffer, src->Bg->shape, src->Bg, state_count, buffer, src->Bg->shape,
dst->shape[0], dst->shape[1], dst->shape[2], dst->shape[0], dst->shape[1], dst->shape[2],
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0; coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0;
if (ok && cuda_amr_prolong_compare_enabled())
{
const int region_all = dst->shape[0] * dst->shape[1] * dst->shape[2];
const int total = state_count * region_all;
double *cpu = new double[total];
double *gpu = new double[total];
if (!cuda_download_resident_subset_to_host(src->Bg, VarLists, state_count))
{
delete[] cpu;
delete[] gpu;
return false;
}
int DIM = dim;
MyList<var> *v = VarLists;
for (int s = 0; s < state_count && v; ++s, v = v->next)
{
f_prolong3(DIM,
src->Bg->bbox,
src->Bg->bbox + dim,
src->Bg->shape,
src->Bg->fgfs[v->data->sgfn],
const_cast<double *>(dst->llb),
const_cast<double *>(dst->uub),
const_cast<int *>(dst->shape),
cpu + (size_t)s * region_all,
const_cast<double *>(dst->llb),
const_cast<double *>(dst->uub),
v->data->SoA,
Symmetry);
}
cudaError_t cerr = cudaMemcpy(gpu, buffer, (size_t)total * sizeof(double), cudaMemcpyDeviceToHost);
if (cerr != cudaSuccess)
{
fprintf(stderr, "Parallel: prolong compare cudaMemcpy failed, err=%d\n", (int)cerr);
delete[] cpu;
delete[] gpu;
return false;
}
double max_abs = 0.0;
double max_rel = 0.0;
int max_idx = -1;
for (int i = 0; i < total; ++i)
{
const double diff = fabs(cpu[i] - gpu[i]);
const double den = fmax(fabs(cpu[i]), fabs(gpu[i]));
const double rel = den > 0.0 ? diff / den : diff;
if (diff > max_abs)
{
max_abs = diff;
max_rel = rel;
max_idx = i;
}
}
static int report_count = 0;
const double tol = cuda_amr_restrict_compare_tol();
int rank = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (max_abs > tol || max_rel > tol)
{
const int state = max_idx / region_all;
const int local = max_idx - state * region_all;
const int ii = local % dst->shape[0];
const int jj = (local / dst->shape[0]) % dst->shape[1];
const int kk = local / (dst->shape[0] * dst->shape[1]);
if (report_count < cuda_amr_restrict_compare_limit())
{
fprintf(stderr,
"[AMSS-CUDA-PROLONG-CMP][rank %d] mismatch state=%d point=(%d,%d,%d) "
"shape=(%d,%d,%d) first_fine=(%d,%d,%d) coarse_lb=(%d,%d,%d) "
"max_abs=%.17e max_rel=%.17e cpu=%.17e gpu=%.17e src_lev=%d dst_lev=%d\n",
rank, state, ii, jj, kk,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
coarse_lb[0], coarse_lb[1], coarse_lb[2],
max_abs, max_rel, cpu[max_idx], gpu[max_idx],
src->Bg->lev, dst->Bg->lev);
fflush(stderr);
report_count++;
}
delete[] cpu;
delete[] gpu;
return false;
}
else if (report_count < cuda_amr_restrict_compare_limit())
{
fprintf(stderr,
"[AMSS-CUDA-PROLONG-CMP][rank %d] ok shape=(%d,%d,%d) "
"first_fine=(%d,%d,%d) coarse_lb=(%d,%d,%d) max_abs=%.17e max_rel=%.17e\n",
rank,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
coarse_lb[0], coarse_lb[1], coarse_lb[2],
max_abs, max_rel);
fflush(stderr);
report_count++;
}
delete[] cpu;
delete[] gpu;
}
} }
if (sync_profile_enabled()) if (sync_profile_enabled())
sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
@@ -941,22 +1135,34 @@ bool cuda_download_resident_subset_to_host(Block *block,
bool cuda_unpack_host_region_to_resident(Block *block, bool cuda_unpack_host_region_to_resident(Block *block,
int state_index, int state_index,
double *buffer, double *buffer,
const Parallel::gridseg *dst) const Parallel::gridseg *dst,
MyList<var> *vars,
int state_count)
{ {
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
if (!block || !dst || state_index < 0 || state_index >= AMSS_BSSN_CUDA_MAX_STATE_COUNT) if (!block || !dst || !vars || state_count <= 0 ||
state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT ||
state_index < 0 || state_index >= state_count)
return false; return false;
if (bssn_cuda_has_resident_state(block) == 0) if (bssn_cuda_has_resident_state(block) == 0)
return true; return true;
double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
MyList<var> *v = vars;
for (int i = 0; i < state_count; ++i)
{
if (!v)
return false;
views[i] = block->fgfs[v->data->sgfn];
v = v->next;
}
const int i0 = cuda_seg_begin(dst, block, 0); const int i0 = cuda_seg_begin(dst, block, 0);
const int j0 = cuda_seg_begin(dst, block, 1); const int j0 = cuda_seg_begin(dst, block, 1);
const int k0 = cuda_seg_begin(dst, block, 2); const int k0 = cuda_seg_begin(dst, block, 2);
return bssn_cuda_unpack_state_region_from_host_buffer( return bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(
block, state_index, buffer, block->shape, block, views, state_count, state_index, buffer, block->shape,
i0, j0, k0, i0, j0, k0, dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
#else #else
(void)block; (void)state_index; (void)buffer; (void)dst; (void)block; (void)state_index; (void)buffer; (void)dst; (void)vars; (void)state_count;
return false; return false;
#endif #endif
} }
@@ -5178,8 +5384,9 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
if (cuda_state_count_direct_supported(state_count) && if (cuda_state_count_direct_supported(state_count) &&
dst->data && dst->data->Bg && bssn_cuda_has_resident_state(dst->data->Bg)) dst->data && dst->data->Bg && bssn_cuda_has_resident_state(dst->data->Bg))
{ {
if (type != 2 && type != 3 && if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx,
!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data)) data + size_out, dst->data,
VarListd, state_count))
{ {
cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl; cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl;
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
@@ -5318,13 +5525,40 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
double **rec_data = new double *[cpusize]; double **rec_data = new double *[cpusize];
int *send_lengths = new int[cpusize]; int *send_lengths = new int[cpusize];
int *recv_lengths = new int[cpusize]; int *recv_lengths = new int[cpusize];
int *send_req_index = new int[cpusize];
unsigned char *send_is_dev = new unsigned char[cpusize];
unsigned char *recv_is_dev = new unsigned char[cpusize];
for (node = 0; node < cpusize; node++) for (node = 0; node < cpusize; node++)
{ {
send_data[node] = rec_data[node] = 0; send_data[node] = rec_data[node] = 0;
send_lengths[node] = recv_lengths[node] = 0; send_lengths[node] = recv_lengths[node] = 0;
send_req_index[node] = -1;
send_is_dev[node] = recv_is_dev[node] = 0;
} }
#if USE_CUDA_BSSN || USE_CUDA_Z4C
const int state_count = cuda_state_var_count(VarList1, VarList2);
if (state_count < 0)
{
cout << "Parallel::transfer: variable lists do not match." << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
int cuda_device_sends = 0;
if (cuda_uncached_device_buffers_enabled(state_count))
{
ensure_uncached_device_buffers(cpusize);
for (node = 0; node < cpusize; node++)
{
send_is_dev[node] = cuda_pack_to_device_eligible(src[myrank], dst[myrank], node, state_count, myrank) ? 1 : 0;
recv_is_dev[node] = cuda_recv_to_device_eligible(src[node], dst[node], node, state_count, myrank) ? 1 : 0;
}
recv_is_dev[myrank] = (send_is_dev[myrank] && recv_is_dev[myrank]) ? 1 : 0;
for (node = 0; node < cpusize; node++)
cuda_device_sends += send_is_dev[node] ? 1 : 0;
}
#endif
// Post receives first so peers can progress rendezvous early. // Post receives first so peers can progress rendezvous early.
for (node = 0; node < cpusize; node++) for (node = 0; node < cpusize; node++)
{ {
@@ -5333,13 +5567,22 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
if (recv_lengths[node] > 0) if (recv_lengths[node] > 0)
{ {
rec_data[node] = new double[recv_lengths[node]]; if (recv_is_dev[node])
if (!rec_data[node])
{ {
cout << "out of memory when new in short transfer, place 1" << endl; UncachedDeviceBuffers &dev_buffers = uncached_device_buffers();
MPI_Abort(MPI_COMM_WORLD, 1); ensure_device_comm_buffer(dev_buffers.recv_bufs, dev_buffers.recv_caps, node, recv_lengths[node]);
MPI_Irecv((void *)dev_buffers.recv_bufs[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
}
else
{
rec_data[node] = new double[recv_lengths[node]];
if (!rec_data[node])
{
cout << "out of memory when new in short transfer, place 1" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
} }
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node; req_node[req_no] = node;
req_is_recv[req_no] = 1; req_is_recv[req_no] = 1;
req_no++; req_no++;
@@ -5351,13 +5594,22 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry); recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
if (recv_lengths[myrank] > 0) if (recv_lengths[myrank] > 0)
{ {
rec_data[myrank] = new double[recv_lengths[myrank]]; if (recv_is_dev[myrank])
if (!rec_data[myrank])
{ {
cout << "out of memory when new in short transfer, place 2" << endl; UncachedDeviceBuffers &dev_buffers = uncached_device_buffers();
MPI_Abort(MPI_COMM_WORLD, 1); ensure_device_comm_buffer(dev_buffers.recv_bufs, dev_buffers.recv_caps, myrank, recv_lengths[myrank]);
data_packer_with_device_buffer(dev_buffers.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
}
else
{
rec_data[myrank] = new double[recv_lengths[myrank]];
if (!rec_data[myrank])
{
cout << "out of memory when new in short transfer, place 2" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
} }
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
} }
// Pack and post sends. // Pack and post sends.
@@ -5368,19 +5620,51 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (send_lengths[node] > 0) if (send_lengths[node] > 0)
{ {
send_data[node] = new double[send_lengths[node]]; if (send_is_dev[node])
if (!send_data[node])
{ {
cout << "out of memory when new in short transfer, place 3" << endl; UncachedDeviceBuffers &dev_buffers = uncached_device_buffers();
MPI_Abort(MPI_COMM_WORLD, 1); ensure_device_comm_buffer(dev_buffers.send_bufs, dev_buffers.send_caps, node, send_lengths[node]);
data_packer_with_device_buffer(dev_buffers.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
} }
data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); else
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no); {
send_data[node] = new double[send_lengths[node]];
if (!send_data[node])
{
cout << "out of memory when new in short transfer, place 3" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
}
send_req_index[node] = req_no;
req_node[req_no] = node; req_node[req_no] = node;
req_is_recv[req_no] = 0; req_is_recv[req_no] = 0;
req_no++; req_no++;
} }
} }
#if USE_CUDA_BSSN || USE_CUDA_Z4C
if (cuda_device_sends > 0)
cudaDeviceSynchronize();
#endif
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
if (send_lengths[node] > 0)
{
const int req_idx = send_req_index[node];
if (req_idx < 0)
continue;
if (send_is_dev[node])
{
UncachedDeviceBuffers &dev_buffers = uncached_device_buffers();
MPI_Isend((void *)dev_buffers.send_bufs[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_idx);
}
else
{
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_idx);
}
}
}
// Unpack as soon as receive completes to reduce pure wait time. // Unpack as soon as receive completes to reduce pure wait time.
while (pending_recv > 0) while (pending_recv > 0)
@@ -5395,7 +5679,15 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
if (idx >= 0 && req_is_recv[idx]) if (idx >= 0 && req_is_recv[idx])
{ {
int recv_node = req_node[idx]; int recv_node = req_node[idx];
data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry); if (recv_is_dev[recv_node])
{
UncachedDeviceBuffers &dev_buffers = uncached_device_buffers();
data_packer_with_device_buffer(dev_buffers.recv_bufs[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
}
else
{
data_packer(rec_data[recv_node], src[recv_node], dst[recv_node], recv_node, UNPACK, VarList1, VarList2, Symmetry);
}
pending_recv--; pending_recv--;
} }
} }
@@ -5403,7 +5695,12 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
if (req_no > 0) MPI_Waitall(req_no, reqs, stats); if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
if (rec_data[myrank]) if (recv_is_dev[myrank] && recv_lengths[myrank] > 0)
{
UncachedDeviceBuffers &dev_buffers = uncached_device_buffers();
data_packer_with_device_buffer(dev_buffers.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
}
else if (rec_data[myrank])
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry); data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
for (node = 0; node < cpusize; node++) for (node = 0; node < cpusize; node++)
@@ -5423,6 +5720,9 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
delete[] rec_data; delete[] rec_data;
delete[] send_lengths; delete[] send_lengths;
delete[] recv_lengths; delete[] recv_lengths;
delete[] send_req_index;
delete[] send_is_dev;
delete[] recv_is_dev;
} }
// //
void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst, void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
@@ -7057,7 +7357,13 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
double *src2_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; double *src2_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const int state_count = cuda_state_var_count(VarList1, VarList2); const int state_count = cuda_state_var_count(VarList1, VarList2);
if (cuda_state_count_direct_supported(state_count) && const bool have_cuda_views =
cuda_state_count_direct_supported(state_count) &&
cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views);
if (cuda_prepare_inter_time_device_enabled() &&
have_cuda_views &&
cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views) && cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views) &&
@@ -7093,6 +7399,10 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
varl2 = varl2->next; varl2 = varl2->next;
varl3 = varl3->next; varl3 = varl3->next;
} }
#if USE_CUDA_BSSN
if (have_cuda_views && bssn_cuda_has_resident_state(cg))
bssn_cuda_upload_resident_state_count(cg, cg->shape, dst_views, state_count);
#endif
} }
if (BP == Pat->ble) if (BP == Pat->ble)
break; break;
@@ -7133,7 +7443,14 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
double *src3_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; double *src3_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const int state_count = cuda_state_var_count(VarList1, VarList2); const int state_count = cuda_state_var_count(VarList1, VarList2);
if (cuda_state_count_direct_supported(state_count) && const bool have_cuda_views =
cuda_state_count_direct_supported(state_count) &&
cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) &&
cuda_build_bssn_host_views(cg, VarList4, state_count, dst_views);
if (cuda_prepare_inter_time_device_enabled() &&
have_cuda_views &&
cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) && cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) &&
@@ -7174,6 +7491,10 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
varl3 = varl3->next; varl3 = varl3->next;
varl4 = varl4->next; varl4 = varl4->next;
} }
#if USE_CUDA_BSSN
if (have_cuda_views && bssn_cuda_has_resident_state(cg))
bssn_cuda_upload_resident_state_count(cg, cg->shape, dst_views, state_count);
#endif
} }
if (BP == Pat->ble) if (BP == Pat->ble)
break; break;

View File

@@ -90,6 +90,22 @@ bool bssn_escalar_cuda_keep_resident_after_step(int lev, int trfls_in, int analy
return false; return false;
if (lev == analysis_lev) if (lev == analysis_lev)
return false; return false;
static int release_only_level = -2;
if (release_only_level == -2)
{
const char *env = getenv("AMSS_CUDA_ESCALAR_RELEASE_ONLY_LEVEL");
release_only_level = (env && atoi(env) >= 0) ? atoi(env) : -1;
}
if (release_only_level >= 0)
return lev != release_only_level;
static int keep_level_limit = -2;
if (keep_level_limit == -2)
{
const char *env = getenv("AMSS_CUDA_ESCALAR_KEEP_LEVELS_BELOW");
keep_level_limit = (env && atoi(env) >= 0) ? atoi(env) : -1;
}
if (keep_level_limit >= 0)
return lev < keep_level_limit;
if (keep_all_levels) if (keep_all_levels)
return true; return true;
return lev < trfls_in; return lev < trfls_in;
@@ -125,6 +141,138 @@ bool bssn_escalar_timing_enabled()
return enabled != 0; return enabled != 0;
} }
bool bssn_escalar_cuda_post_rp_download_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_ESCALAR_POST_RP_DOWNLOAD");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
bool bssn_escalar_cuda_post_rp_download_level_enabled(int lev)
{
if (!bssn_escalar_cuda_post_rp_download_enabled())
return false;
static int min_level = -2;
if (min_level == -2)
{
const char *env = getenv("AMSS_CUDA_ESCALAR_POST_RP_MIN_LEVEL");
min_level = (env && atoi(env) >= 0) ? atoi(env) : -1;
}
return min_level < 0 || lev >= min_level;
}
bool bssn_escalar_cuda_post_swap_release_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_ESCALAR_POST_SWAP_RELEASE");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
bool bssn_escalar_cuda_pre_rp_release_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_ESCALAR_PRE_RP_RELEASE");
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
}
return enabled != 0;
}
bool bssn_escalar_cuda_bh_interp_resident_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_BH_INTERP_RESIDENT");
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
}
return enabled != 0;
}
bool bssn_escalar_cuda_prune_after_swap_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_ESCALAR_PRUNE_AFTER_SWAP");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
void bssn_escalar_cuda_upload_level_state(MyList<Patch> *PatL, MyList<var> *vars,
int myrank)
{
MyList<Patch> *Pp = PatL;
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{
double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
if (!fill_bssn_escalar_cuda_views(cg, vars, state_in))
{
cout << "CUDA BSSN-EScalar resident state list mismatch during upload" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (bssn_escalar_cuda_upload_resident_state(cg, cg->shape, state_in))
{
cout << "CUDA BSSN-EScalar resident state upload failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
}
void bssn_escalar_cuda_keep_only_level_state(MyList<Patch> *PatL, MyList<var> *vars,
int myrank)
{
MyList<Patch> *Pp = PatL;
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{
double *state_key[BSSN_ESCALAR_CUDA_STATE_COUNT];
if (!fill_bssn_escalar_cuda_views(cg, vars, state_key))
{
cout << "CUDA BSSN-EScalar resident state list mismatch during prune" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (bssn_escalar_cuda_keep_only_resident_state(cg, cg->shape, state_key))
{
cout << "CUDA BSSN-EScalar resident state prune failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
}
void bssn_escalar_timing_report(int myrank, int lev, int YN, double total, double rhs, void bssn_escalar_timing_report(int myrank, int lev, int YN, double total, double rhs,
double sync, double bh, double analysis, double swap, double sync, double bh, double analysis, double swap,
double resident, double rp) double resident, double rp)
@@ -1244,7 +1392,8 @@ void bssnEScalar_class::Step(int lev, int YN)
{ {
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
(void)use_cuda_resident_sync; if (use_cuda_resident_sync && !bssn_escalar_cuda_bh_interp_resident_enabled())
bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, false);
#endif #endif
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++) for (int ithBH = 0; ithBH < BH_num; ithBH++)
@@ -1670,7 +1819,8 @@ void bssnEScalar_class::Step(int lev, int YN)
{ {
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
(void)use_cuda_resident_sync; if (use_cuda_resident_sync && !bssn_escalar_cuda_bh_interp_resident_enabled())
bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
#endif #endif
compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev); compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++) for (int ithBH = 0; ithBH < BH_num; ithBH++)
@@ -1760,7 +1910,8 @@ void bssnEScalar_class::Step(int lev, int YN)
{ {
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
if (!bssn_escalar_cuda_keep_resident_after_step(lev, trfls, a_lev)) if (!bssn_escalar_cuda_keep_resident_after_step(lev, trfls, a_lev))
bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true); bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank,
bssn_escalar_cuda_pre_rp_release_enabled());
if (escalar_step_timing) if (escalar_step_timing)
escalar_t_resident += MPI_Wtime() - escalar_t0; escalar_t_resident += MPI_Wtime() - escalar_t0;
} }
@@ -1833,6 +1984,25 @@ void bssnEScalar_class::Step(int lev, int YN)
sPp = sPp->next; sPp = sPp->next;
} }
} }
#endif
#if USE_CUDA_BSSN
bool release_after_sync = false;
if (use_cuda_resident_sync && bssn_escalar_cuda_post_rp_download_level_enabled(lev))
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
release_after_sync = bssn_escalar_cuda_post_swap_release_enabled();
bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, release_after_sync);
if (escalar_step_timing)
escalar_t_resident += MPI_Wtime() - escalar_t0;
}
if (use_cuda_resident_sync && !release_after_sync &&
bssn_escalar_cuda_prune_after_swap_enabled())
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
bssn_escalar_cuda_keep_only_level_state(GH->PatL[lev], StateList, myrank);
if (escalar_step_timing)
escalar_t_resident += MPI_Wtime() - escalar_t0;
}
#endif #endif
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)

View File

@@ -102,7 +102,15 @@ bool amss_cached_rp_restrict_enabled()
{ {
static int enabled = -1; static int enabled = -1;
if (enabled < 0) if (enabled < 0)
enabled = amss_env_flag_enabled("AMSS_RP_CACHED_RESTRICT") ? 1 : 0; {
#if (ABEtype == 1)
enabled = 1;
#else
enabled = 0;
#endif
if (amss_env_flag_enabled("AMSS_RP_CACHED_RESTRICT"))
enabled = 1;
}
return enabled != 0; return enabled != 0;
} }
@@ -110,7 +118,15 @@ bool amss_cached_rp_outbd_enabled()
{ {
static int enabled = -1; static int enabled = -1;
if (enabled < 0) if (enabled < 0)
enabled = amss_env_flag_enabled("AMSS_RP_CACHED_OUTBD") ? 1 : 0; {
#if (ABEtype == 1)
enabled = 1;
#else
enabled = 0;
#endif
if (amss_env_flag_enabled("AMSS_RP_CACHED_OUTBD"))
enabled = 1;
}
return enabled != 0; return enabled != 0;
} }
@@ -118,7 +134,15 @@ bool amss_cached_rp_fine_sync_enabled()
{ {
static int enabled = -1; static int enabled = -1;
if (enabled < 0) if (enabled < 0)
enabled = amss_env_flag_enabled("AMSS_RP_CACHED_FINE_SYNC") ? 1 : 0; {
#if (ABEtype == 1)
enabled = 1;
#else
enabled = 0;
#endif
if (amss_env_flag_enabled("AMSS_RP_CACHED_FINE_SYNC"))
enabled = 1;
}
return enabled != 0; return enabled != 0;
} }
@@ -819,6 +843,17 @@ bool bssn_cuda_regrid_flush_enabled()
return enabled != 0; return enabled != 0;
} }
bool bssn_cuda_regrid_flush_always_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_REGRID_FLUSH_ALWAYS");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
bool bssn_cuda_will_regrid_onelevel(cgh *GH, int lev, int Symmetry, int BH_num, double **Porg0) bool bssn_cuda_will_regrid_onelevel(cgh *GH, int lev, int Symmetry, int BH_num, double **Porg0)
{ {
if (!GH || lev < GH->movls || lev >= GH->levels || !GH->PatL[lev]) if (!GH || lev < GH->movls || lev >= GH->levels || !GH->PatL[lev])
@@ -882,8 +917,11 @@ bool bssn_cuda_will_regrid_onelevel(cgh *GH, int lev, int Symmetry, int BH_num,
bool bssn_cuda_should_flush_before_regrid(cgh *GH, int lev, int Symmetry, int BH_num, double **Porg0) bool bssn_cuda_should_flush_before_regrid(cgh *GH, int lev, int Symmetry, int BH_num, double **Porg0)
{ {
return bssn_cuda_regrid_flush_enabled() && if (!bssn_cuda_regrid_flush_enabled())
bssn_cuda_will_regrid_onelevel(GH, lev, Symmetry, BH_num, Porg0); return false;
if (bssn_cuda_regrid_flush_always_enabled())
return GH && lev >= GH->movls && lev < GH->levels && GH->PatL[lev];
return bssn_cuda_will_regrid_onelevel(GH, lev, Symmetry, BH_num, Porg0);
} }
void bssn_cuda_sync_level_bh_fields(MyList<Patch> *PatL, void bssn_cuda_sync_level_bh_fields(MyList<Patch> *PatL,
@@ -925,6 +963,27 @@ bool bssn_constraint_recompute_from_state(int lev, bool level0_cache_valid)
} // namespace } // namespace
#endif #endif
#if USE_CUDA_BSSN
bool bssn_cuda_bh_interp_resident_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_BH_INTERP_RESIDENT");
if (env)
enabled = (atoi(env) != 0) ? 1 : 0;
#if (ABEtype == 1)
else
enabled = 1;
#else
else
enabled = 1;
#endif
}
return enabled != 0;
}
#endif
//================================================================================================ //================================================================================================
// define bssn_class // define bssn_class
@@ -3895,10 +3954,11 @@ void bssn_class::ParallelStep()
// a_stream<<lev<<": after calling GH->Regrid_Onelevel_aux for lower level"; // a_stream<<lev<<": after calling GH->Regrid_Onelevel_aux for lower level";
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],a_stream.str()); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],a_stream.str());
} }
}
}
#endif
} }
}
#endif
}
#ifdef WithShell #ifdef WithShell
SHStep(); SHStep();
@@ -8392,7 +8452,8 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
int lev = ilev; int lev = ilev;
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
if (bssn_cuda_use_resident_sync(lev) && if (bssn_cuda_bh_interp_resident_enabled() &&
bssn_cuda_use_resident_sync(lev) &&
bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf)) bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf))
{ {
BH_RHS[n][0] = -shellf[0]; BH_RHS[n][0] = -shellf[0];

View File

@@ -502,7 +502,7 @@ static const int STAGE_SLOT_COUNT =
static constexpr int BSSN_STATE_COUNT = 24; static constexpr int BSSN_STATE_COUNT = 24;
static constexpr int BSSN_MATTER_COUNT = 10; static constexpr int BSSN_MATTER_COUNT = 10;
static constexpr int BSSN_LK_FIELD_COUNT = 24; static constexpr int BSSN_LK_FIELD_COUNT = 24;
static constexpr int BSSN_RESIDENT_BANK_COUNT = 4; static constexpr int BSSN_RESIDENT_BANK_COUNT = 6;
static constexpr int BSSN_ESCALAR_STATE_COUNT = 26; static constexpr int BSSN_ESCALAR_STATE_COUNT = 26;
static constexpr int BSSN_RESIDENT_STATE_CAPACITY = BSSN_ESCALAR_STATE_COUNT; static constexpr int BSSN_RESIDENT_STATE_CAPACITY = BSSN_ESCALAR_STATE_COUNT;
@@ -5285,11 +5285,24 @@ static bool resident_key_matches(const StepContext &ctx, int bank, double **host
static int find_resident_bank_count(const StepContext &ctx, double **host_key, int state_count) static int find_resident_bank_count(const StepContext &ctx, double **host_key, int state_count)
{ {
if (!host_key) return -1; if (!host_key) return -1;
int best = -1;
unsigned long long best_age = 0;
int best_invalid = -1;
unsigned long long best_invalid_age = 0;
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
if (resident_key_matches_count(ctx, b, host_key, state_count)) if (!resident_key_matches_count(ctx, b, host_key, state_count))
return b; continue;
if (ctx.resident_valid[b]) {
if (best < 0 || ctx.resident_age[b] > best_age) {
best = b;
best_age = ctx.resident_age[b];
}
} else if (best_invalid < 0 || ctx.resident_age[b] > best_invalid_age) {
best_invalid = b;
best_invalid_age = ctx.resident_age[b];
}
} }
return -1; return (best >= 0) ? best : best_invalid;
} }
static int find_resident_bank_subset(const StepContext &ctx, static int find_resident_bank_subset(const StepContext &ctx,
@@ -5299,6 +5312,10 @@ static int find_resident_bank_subset(const StepContext &ctx,
{ {
if (!host_key || !state_indices || subset_count <= 0) if (!host_key || !state_indices || subset_count <= 0)
return -1; return -1;
int best = -1;
unsigned long long best_age = 0;
int best_invalid = -1;
unsigned long long best_invalid_age = 0;
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
bool match = true; bool match = true;
for (int i = 0; i < subset_count; ++i) { for (int i = 0; i < subset_count; ++i) {
@@ -5310,10 +5327,19 @@ static int find_resident_bank_subset(const StepContext &ctx,
break; break;
} }
} }
if (match) if (!match)
return b; continue;
if (ctx.resident_valid[b]) {
if (best < 0 || ctx.resident_age[b] > best_age) {
best = b;
best_age = ctx.resident_age[b];
}
} else if (best_invalid < 0 || ctx.resident_age[b] > best_invalid_age) {
best_invalid = b;
best_invalid_age = ctx.resident_age[b];
}
} }
return -1; return (best >= 0) ? best : best_invalid;
} }
static int find_resident_bank(const StepContext &ctx, double **host_key) static int find_resident_bank(const StepContext &ctx, double **host_key)
@@ -5373,6 +5399,16 @@ static void mark_resident_host_subset_clean(StepContext &ctx,
} }
} }
static void mark_resident_host_state_clean(StepContext &ctx,
int bank,
int state_index,
bool clean)
{
if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return;
if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return;
ctx.resident_host_clean[bank][state_index] = clean ? 1 : 0;
}
static void mark_resident_current_bank(StepContext &ctx, int bank) static void mark_resident_current_bank(StepContext &ctx, int bank)
{ {
if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return; if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return;
@@ -5632,6 +5668,12 @@ static int reserve_escalar_resident_output_bank(StepContext &ctx,
static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c); static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c);
static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx,
int avoid_a,
int avoid_b,
int avoid_c,
size_t all);
static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx, static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx,
double **host_key, double **host_key,
size_t all, size_t all,
@@ -5658,7 +5700,7 @@ static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx,
} }
} }
if (bank < 0) if (bank < 0)
bank = choose_escalar_resident_bank_for_reuse(ctx, avoid_a, all); bank = choose_escalar_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT); assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT);
ctx.resident_valid[bank] = false; ctx.resident_valid[bank] = false;
ctx.resident_age[bank] = ++ctx.resident_clock; ctx.resident_age[bank] = ++ctx.resident_clock;
@@ -5734,12 +5776,140 @@ static int reserve_resident_output_bank_avoiding(StepContext &ctx,
return bank; return bank;
} }
static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx,
int avoid_a,
int avoid_b,
int avoid_c,
size_t all)
{
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
if (!bank_is_avoided(b, avoid_a, avoid_b, avoid_c) && !ctx.resident_valid[b])
return b;
}
int best = -1;
unsigned long long best_age = 0;
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
if (bank_is_avoided(b, avoid_a, avoid_b, avoid_c)) continue;
if (best < 0 || ctx.resident_age[b] < best_age) {
best = b;
best_age = ctx.resident_age[b];
}
}
if (best < 0)
return choose_escalar_resident_bank_for_reuse(ctx, avoid_a, all);
writeback_resident_bank_count(ctx, best, all, BSSN_ESCALAR_STATE_COUNT);
ctx.resident_valid[best] = false;
ctx.resident_host[best].fill(nullptr);
ctx.resident_host_clean[best].fill(0);
ctx.resident_age[best] = 0;
if (ctx.current_bank == best) {
ctx.current_bank = -1;
ctx.d_state_curr_mem = nullptr;
ctx.d_state_curr.fill(nullptr);
}
update_state_ready(ctx);
return best;
}
static int ensure_resident_bank_avoiding(StepContext &ctx,
double **host_key,
size_t all,
bool upload_if_missing,
int avoid_a,
int avoid_b,
int avoid_c)
{
if (!resident_key_usable(host_key)) {
if (ctx.current_bank >= 0)
return ctx.current_bank;
return 0;
}
int bank = find_resident_bank(ctx, host_key);
if (bank >= 0) {
ctx.resident_age[bank] = ++ctx.resident_clock;
if (!ctx.resident_valid[bank] && upload_if_missing) {
bind_state_input_slots(ctx.d_resident[bank]);
upload_state_inputs(host_key, all);
ctx.resident_valid[bank] = true;
set_resident_host_clean(ctx, bank, true);
}
return bank;
}
bank = choose_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
assign_resident_key(ctx, bank, host_key);
if (upload_if_missing) {
bind_state_input_slots(ctx.d_resident[bank]);
upload_state_inputs(host_key, all);
ctx.resident_valid[bank] = true;
set_resident_host_clean(ctx, bank, true);
} else {
ctx.resident_valid[bank] = false;
set_resident_host_clean(ctx, bank, false);
}
update_state_ready(ctx);
return bank;
}
static int ensure_escalar_resident_bank_avoiding(StepContext &ctx,
double **host_key,
size_t all,
bool upload_if_missing,
int avoid_a,
int avoid_b,
int avoid_c)
{
if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) {
if (ctx.current_bank >= 0)
return ctx.current_bank;
return 0;
}
int bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT);
if (bank >= 0) {
ctx.resident_age[bank] = ++ctx.resident_clock;
if (!ctx.resident_valid[bank] && upload_if_missing) {
bind_escalar_state_input_slots(ctx.d_resident[bank]);
upload_escalar_state_inputs(host_key, all);
CUDA_CHECK(cudaDeviceSynchronize());
ctx.resident_valid[bank] = true;
set_resident_host_clean(ctx, bank, true);
}
return bank;
}
bank = choose_escalar_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT);
if (upload_if_missing) {
bind_escalar_state_input_slots(ctx.d_resident[bank]);
upload_escalar_state_inputs(host_key, all);
CUDA_CHECK(cudaDeviceSynchronize());
ctx.resident_valid[bank] = true;
set_resident_host_clean(ctx, bank, true);
} else {
ctx.resident_valid[bank] = false;
set_resident_host_clean(ctx, bank, false);
}
update_state_ready(ctx);
return bank;
}
static int active_or_keyed_bank(StepContext &ctx, static int active_or_keyed_bank(StepContext &ctx,
double **host_key, double **host_key,
size_t all, size_t all,
bool upload_if_missing) bool upload_if_missing,
int state_count = BSSN_STATE_COUNT)
{ {
if (resident_key_usable(host_key)) { if (state_count == BSSN_ESCALAR_STATE_COUNT &&
resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) {
int bank = ensure_escalar_resident_bank(ctx, host_key, all, upload_if_missing);
mark_resident_current_bank(ctx, bank);
return bank;
}
if (state_count == BSSN_STATE_COUNT && resident_key_usable(host_key)) {
int bank = ensure_resident_bank(ctx, host_key, all, upload_if_missing); int bank = ensure_resident_bank(ctx, host_key, all, upload_if_missing);
mark_resident_current_bank(ctx, bank); mark_resident_current_bank(ctx, bank);
return bank; return bank;
@@ -6200,6 +6370,8 @@ int bssn_escalar_cuda_rk4_substep(void *block_tag,
g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz], g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz], g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]); g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
if (use_resident_state && input_bank >= 0)
set_resident_host_clean(ctx, input_bank, false);
} }
if (RK4 == 0) { if (RK4 == 0) {
@@ -6957,9 +7129,10 @@ static void copy_state_region_cuda(void *block_tag,
ctx.resident_valid[bank] = true; ctx.resident_valid[bank] = true;
ctx.resident_age[bank] = ++ctx.resident_clock; ctx.resident_age[bank] = ++ctx.resident_clock;
mark_resident_current_bank(ctx, bank); mark_resident_current_bank(ctx, bank);
mark_resident_host_state_clean(ctx, bank, state_index, false);
update_state_ready(ctx); update_state_ready(ctx);
} else { } else {
ctx.resident_host_clean[bank][state_index] = 1; mark_resident_host_state_clean(ctx, bank, state_index, true);
} }
} }
@@ -6970,9 +7143,11 @@ static void copy_state_region_packed_cuda(void *block_tag,
int i0, int j0, int k0, int i0, int j0, int k0,
int sx, int sy, int sz, int sx, int sy, int sz,
cudaMemcpyKind kind, cudaMemcpyKind kind,
double **state_host_key = nullptr) double **state_host_key = nullptr,
int state_count = BSSN_STATE_COUNT)
{ {
if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return; if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return;
if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
if (sx <= 0 || sy <= 0 || sz <= 0) return; if (sx <= 0 || sy <= 0 || sz <= 0) return;
const size_t src_pitch = (size_t)ex[0] * sizeof(double); const size_t src_pitch = (size_t)ex[0] * sizeof(double);
@@ -6980,7 +7155,8 @@ static void copy_state_region_packed_cuda(void *block_tag,
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all, const int bank = active_or_keyed_bank(ctx, state_host_key, all,
kind == cudaMemcpyHostToDevice); kind == cudaMemcpyHostToDevice,
state_count);
double *base_mem = ctx.d_resident_mem[bank]; double *base_mem = ctx.d_resident_mem[bank];
cudaMemcpy3DParms p = {}; cudaMemcpy3DParms p = {};
@@ -7003,9 +7179,10 @@ static void copy_state_region_packed_cuda(void *block_tag,
ctx.resident_valid[bank] = true; ctx.resident_valid[bank] = true;
ctx.resident_age[bank] = ++ctx.resident_clock; ctx.resident_age[bank] = ++ctx.resident_clock;
mark_resident_current_bank(ctx, bank); mark_resident_current_bank(ctx, bank);
mark_resident_host_state_clean(ctx, bank, state_index, false);
update_state_ready(ctx); update_state_ready(ctx);
} else { } else {
ctx.resident_host_clean[bank][state_index] = 1; mark_resident_host_state_clean(ctx, bank, state_index, true);
} }
} }
@@ -7024,7 +7201,8 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all, const int bank = active_or_keyed_bank(ctx, state_host_key, all,
kind == cudaMemcpyHostToDevice); kind == cudaMemcpyHostToDevice,
state_count);
double *base_mem = ctx.d_resident_mem[bank]; double *base_mem = ctx.d_resident_mem[bank];
const int region_all = sx * sy * sz; const int region_all = sx * sy * sz;
const size_t total_doubles = (size_t)state_count * (size_t)region_all; const size_t total_doubles = (size_t)state_count * (size_t)region_all;
@@ -7057,6 +7235,7 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
ctx.resident_valid[bank] = true; ctx.resident_valid[bank] = true;
ctx.resident_age[bank] = ++ctx.resident_clock; ctx.resident_age[bank] = ++ctx.resident_clock;
mark_resident_current_bank(ctx, bank); mark_resident_current_bank(ctx, bank);
mark_resident_host_subset_clean(ctx, bank, state_count, nullptr, false);
update_state_ready(ctx); update_state_ready(ctx);
} }
} }
@@ -7067,10 +7246,15 @@ static void download_resident_state_count(void *block_tag, int *ex, double **sta
const size_t bytes = all * sizeof(double); const size_t bytes = all * sizeof(double);
StepContext &ctx = ensure_step_ctx(block_tag, all); StepContext &ctx = ensure_step_ctx(block_tag, all);
int bank = find_resident_bank_count(ctx, state_host_out, state_count); int bank = find_resident_bank_count(ctx, state_host_out, state_count);
bool bank_matches_output_key = (bank >= 0);
if (bank < 0) { if (bank < 0) {
bank = (ctx.current_bank >= 0) ? ctx.current_bank : active_or_keyed_bank(ctx, nullptr, all, false); bank = (ctx.current_bank >= 0) ? ctx.current_bank : active_or_keyed_bank(ctx, nullptr, all, false);
} }
mark_resident_current_bank(ctx, bank); mark_resident_current_bank(ctx, bank);
if (!bank_matches_output_key &&
resident_key_usable_count(state_host_out, state_count)) {
assign_resident_key_count(ctx, bank, state_host_out, state_count);
}
const bool profile = cuda_profile_enabled(); const bool profile = cuda_profile_enabled();
const double t0 = profile ? cuda_profile_now_ms() : 0.0; const double t0 = profile ? cuda_profile_now_ms() : 0.0;
static int direct_download = -1; static int direct_download = -1;
@@ -7117,6 +7301,72 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
download_resident_state_count(block_tag, ex, state_host_out, BSSN_STATE_COUNT); download_resident_state_count(block_tag, ex, state_host_out, BSSN_STATE_COUNT);
} }
static void upload_resident_state_count(void *block_tag, int *ex, double **state_host_in, int state_count)
{
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
StepContext &ctx = ensure_step_ctx(block_tag, all);
int bank = -1;
if (state_count == BSSN_ESCALAR_STATE_COUNT) {
bank = ensure_escalar_resident_bank(ctx, state_host_in, all, false);
bind_escalar_state_input_slots(ctx.d_resident[bank]);
upload_escalar_state_inputs(state_host_in, all);
} else if (state_count == BSSN_STATE_COUNT) {
bank = ensure_resident_bank(ctx, state_host_in, all, false);
bind_state_input_slots(ctx.d_resident[bank]);
upload_state_inputs(state_host_in, all);
} else {
return;
}
CUDA_CHECK(cudaDeviceSynchronize());
ctx.resident_valid[bank] = true;
ctx.resident_age[bank] = ++ctx.resident_clock;
set_resident_host_clean(ctx, bank, true);
mark_resident_current_bank(ctx, bank);
update_state_ready(ctx);
}
static void keep_only_resident_state_count(void *block_tag,
int *ex,
double **state_host_key,
int state_count)
{
if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY)
return;
auto it = g_step_ctx.find(block_tag);
if (it == g_step_ctx.end()) return;
StepContext &ctx = it->second;
const int keep_bank = find_resident_bank_count(ctx, state_host_key, state_count);
if (keep_bank < 0 || !ctx.resident_valid[keep_bank])
return;
auto keep_clean = ctx.resident_host_clean[keep_bank];
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
ctx.resident_valid[b] = false;
ctx.resident_host[b].fill(nullptr);
ctx.resident_host_clean[b].fill(0);
ctx.resident_age[b] = 0;
}
ctx.d_state_curr_mem = nullptr;
ctx.d_state_next_mem = nullptr;
ctx.d_state_curr.fill(nullptr);
ctx.d_state_next.fill(nullptr);
ctx.current_bank = -1;
ctx.resident_clock = 0;
ctx.matter_ready = false;
for (int i = 0; i < state_count; ++i) {
ctx.resident_host[keep_bank][i] = state_host_key[i];
ctx.resident_host_clean[keep_bank][i] = keep_clean[i] ? 1 : 0;
}
ctx.resident_valid[keep_bank] = true;
ctx.resident_age[keep_bank] = ++ctx.resident_clock;
mark_resident_current_bank(ctx, keep_bank);
(void)ex;
update_state_ready(ctx);
}
static bool download_resident_state_count_if_present(void *block_tag, static bool download_resident_state_count_if_present(void *block_tag,
int *ex, int *ex,
double **state_host_out, double **state_host_out,
@@ -7183,8 +7433,27 @@ static void copy_state_subset(void *block_tag,
const size_t bytes = all * sizeof(double); const size_t bytes = all * sizeof(double);
StepContext &ctx = ensure_step_ctx(block_tag, all); StepContext &ctx = ensure_step_ctx(block_tag, all);
double **full_key = (subset_count == BSSN_RESIDENT_STATE_CAPACITY) ? state_host : nullptr; double **full_key = (subset_count == BSSN_RESIDENT_STATE_CAPACITY) ? state_host : nullptr;
const int bank = active_or_keyed_bank(ctx, full_key, all, int bank = -1;
kind == cudaMemcpyHostToDevice); if (state_host) {
if (full_key) {
bank = (subset_count == BSSN_ESCALAR_STATE_COUNT)
? find_resident_bank_count(ctx, full_key, BSSN_ESCALAR_STATE_COUNT)
: find_resident_bank(ctx, full_key);
} else {
bank = find_resident_bank_subset(ctx, state_host, state_indices, subset_count);
}
if (kind == cudaMemcpyDeviceToHost &&
(bank < 0 || !ctx.resident_valid[bank])) {
bank = -1;
}
}
if (bank < 0) {
bank = active_or_keyed_bank(ctx, full_key, all,
kind == cudaMemcpyHostToDevice,
subset_count);
} else {
mark_resident_current_bank(ctx, bank);
}
double *base_mem = ctx.d_resident_mem[bank]; double *base_mem = ctx.d_resident_mem[bank];
int active_state_indices[BSSN_RESIDENT_STATE_CAPACITY]; int active_state_indices[BSSN_RESIDENT_STATE_CAPACITY];
double *active_state_host[BSSN_RESIDENT_STATE_CAPACITY]; double *active_state_host[BSSN_RESIDENT_STATE_CAPACITY];
@@ -7834,6 +8103,52 @@ int bssn_escalar_cuda_download_resident_state(void *block_tag,
return 0; return 0;
} }
extern "C"
int bssn_cuda_upload_resident_state_count(void *block_tag,
int *ex,
double **state_host_in,
int state_count)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT)
return 1;
upload_resident_state_count(block_tag, ex, state_host_in, state_count);
return 0;
}
extern "C"
int bssn_escalar_cuda_upload_resident_state(void *block_tag,
int *ex,
double **state_host_in)
{
return bssn_cuda_upload_resident_state_count(block_tag, ex, state_host_in,
BSSN_ESCALAR_STATE_COUNT);
}
extern "C"
int bssn_cuda_keep_only_resident_state_count(void *block_tag,
int *ex,
double **state_host_key,
int state_count)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT)
return 1;
keep_only_resident_state_count(block_tag, ex, state_host_key, state_count);
return 0;
}
extern "C"
int bssn_escalar_cuda_keep_only_resident_state(void *block_tag,
int *ex,
double **state_host_key)
{
return bssn_cuda_keep_only_resident_state_count(block_tag, ex, state_host_key,
BSSN_ESCALAR_STATE_COUNT);
}
extern "C" extern "C"
int bssn_cuda_download_resident_state_count_if_present(void *block_tag, int bssn_cuda_download_resident_state_count_if_present(void *block_tag,
int *ex, int *ex,
@@ -8032,6 +8347,28 @@ int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
return 0; return 0;
} }
extern "C"
int bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
int state_index,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
if (!state_host_key ||
(state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT))
return 1;
copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
i0, j0, k0, sx, sy, sz,
cudaMemcpyHostToDevice,
state_host_key, state_count);
return 0;
}
extern "C" extern "C"
int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag, int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
int state_count, int state_count,
@@ -8115,7 +8452,8 @@ static void copy_state_device_batch(void *block_tag,
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all, const int bank = active_or_keyed_bank(ctx, state_host_key, all,
pack_not_unpack == 0 || state_host_key != nullptr); pack_not_unpack == 0 || state_host_key != nullptr,
state_count);
double *base_mem = ctx.d_resident_mem[bank]; double *base_mem = ctx.d_resident_mem[bank];
const int region_all = sx * sy * sz; const int region_all = sx * sy * sz;
dim3 launch_grid((unsigned int)grid((size_t)region_all), dim3 launch_grid((unsigned int)grid((size_t)region_all),
@@ -8164,7 +8502,8 @@ static void copy_state_device_segments(void *block_tag,
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all, const int bank = active_or_keyed_bank(ctx, state_host_key, all,
pack_not_unpack == 0 || state_host_key != nullptr); pack_not_unpack == 0 || state_host_key != nullptr,
state_count);
double *base_mem = ctx.d_resident_mem[bank]; double *base_mem = ctx.d_resident_mem[bank];
int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8); int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8);
CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
@@ -8187,6 +8526,7 @@ static void copy_state_device_segments(void *block_tag,
ctx.resident_valid[bank] = true; ctx.resident_valid[bank] = true;
ctx.resident_age[bank] = ++ctx.resident_clock; ctx.resident_age[bank] = ++ctx.resident_clock;
mark_resident_current_bank(ctx, bank); mark_resident_current_bank(ctx, bank);
set_resident_host_clean(ctx, bank, false);
update_state_ready(ctx); update_state_ready(ctx);
} }
} }
@@ -8214,7 +8554,8 @@ static void restrict_state_device_segments(void *block_tag,
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all, const int bank = active_or_keyed_bank(ctx, state_host_key, all,
state_host_key != nullptr); state_host_key != nullptr,
state_count);
int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8); int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8);
CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
(size_t)segment_count * 8 * sizeof(int), (size_t)segment_count * 8 * sizeof(int),
@@ -8253,7 +8594,8 @@ static void prolong_state_device_segments(void *block_tag,
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all, const int bank = active_or_keyed_bank(ctx, state_host_key, all,
state_host_key != nullptr); state_host_key != nullptr,
state_count);
int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 11); int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 11);
CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
(size_t)segment_count * 11 * sizeof(int), (size_t)segment_count * 11 * sizeof(int),
@@ -8498,7 +8840,7 @@ int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_t
if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all, true); const int bank = active_or_keyed_bank(ctx, state_host_key, all, true, state_count);
const int region_all = sx * sy * sz; const int region_all = sx * sy * sz;
upload_comm_state_soa(state_soa, state_count); upload_comm_state_soa(state_soa, state_count);
dim3 launch_grid((unsigned int)grid((size_t)region_all), dim3 launch_grid((unsigned int)grid((size_t)region_all),
@@ -8555,7 +8897,7 @@ int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_ta
if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all, true); const int bank = active_or_keyed_bank(ctx, state_host_key, all, true, state_count);
const int region_all = sx * sy * sz; const int region_all = sx * sy * sz;
upload_comm_state_soa(state_soa, state_count); upload_comm_state_soa(state_soa, state_count);
dim3 launch_grid((unsigned int)grid((size_t)region_all), dim3 launch_grid((unsigned int)grid((size_t)region_all),
@@ -8650,7 +8992,8 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag,
src1_bank = ensure_escalar_resident_bank(ctx, src1_host_key, all, true); src1_bank = ensure_escalar_resident_bank(ctx, src1_host_key, all, true);
src2_bank = ensure_escalar_resident_bank(ctx, src2_host_key, all, true, src1_bank); src2_bank = ensure_escalar_resident_bank(ctx, src2_host_key, all, true, src1_bank);
src3_bank = (source_count == 3) src3_bank = (source_count == 3)
? ensure_escalar_resident_bank(ctx, src3_host_key, all, true, src1_bank) ? ensure_escalar_resident_bank_avoiding(ctx, src3_host_key, all, true,
src1_bank, src2_bank, -1)
: -1; : -1;
dst_bank = reserve_escalar_resident_output_bank_avoiding(ctx, dst_host_key, all, dst_bank = reserve_escalar_resident_output_bank_avoiding(ctx, dst_host_key, all,
src1_bank, src2_bank, src3_bank); src1_bank, src2_bank, src3_bank);
@@ -8658,7 +9001,8 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag,
src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true); src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true);
src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank); src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank);
src3_bank = (source_count == 3) src3_bank = (source_count == 3)
? ensure_resident_bank(ctx, src3_host_key, all, true, src1_bank) ? ensure_resident_bank_avoiding(ctx, src3_host_key, all, true,
src1_bank, src2_bank, -1)
: -1; : -1;
dst_bank = reserve_resident_output_bank_avoiding(ctx, dst_host_key, all, dst_bank = reserve_resident_output_bank_avoiding(ctx, dst_host_key, all,
src1_bank, src2_bank, src3_bank); src1_bank, src2_bank, src3_bank);

View File

@@ -104,6 +104,24 @@ int bssn_escalar_cuda_download_resident_state(void *block_tag,
int *ex, int *ex,
double **state_host_out); double **state_host_out);
int bssn_cuda_upload_resident_state_count(void *block_tag,
int *ex,
double **state_host_in,
int state_count);
int bssn_escalar_cuda_upload_resident_state(void *block_tag,
int *ex,
double **state_host_in);
int bssn_cuda_keep_only_resident_state_count(void *block_tag,
int *ex,
double **state_host_key,
int state_count);
int bssn_escalar_cuda_keep_only_resident_state(void *block_tag,
int *ex,
double **state_host_key);
int bssn_cuda_download_resident_state_count_if_present(void *block_tag, int bssn_cuda_download_resident_state_count_if_present(void *block_tag,
int *ex, int *ex,
double **state_host_out, double **state_host_out,
@@ -169,6 +187,15 @@ int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
int i0, int j0, int k0, int i0, int j0, int k0,
int sx, int sy, int sz); int sx, int sy, int sz);
int bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
int state_index,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag, int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
int state_count, int state_count,
double *host_buffer, double *host_buffer,