Batch GA/BH subset sync with indexed GPU pack/unpack buffers
This commit is contained in:
@@ -362,6 +362,8 @@ static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = {
|
||||
S_Ayz_rhs, S_Azz_rhs, S_chi_rhs, S_trK_rhs, S_Gamx_rhs, S_Gamy_rhs
|
||||
};
|
||||
|
||||
__constant__ int d_subset_state_indices[BSSN_STATE_COUNT];
|
||||
|
||||
static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = {
|
||||
1, 1, 1,
|
||||
1, 1, -1,
|
||||
@@ -395,19 +397,25 @@ struct StepContext {
|
||||
double *d_state_curr_mem;
|
||||
double *d_state_next_mem;
|
||||
double *d_matter_mem;
|
||||
double *d_comm_mem;
|
||||
double *h_comm_mem;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_state0;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_accum;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_state_curr;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_state_next;
|
||||
std::array<double *, BSSN_MATTER_COUNT> d_matter;
|
||||
size_t cap_all;
|
||||
size_t cap_comm;
|
||||
bool h_comm_pinned;
|
||||
size_t cap_h_comm;
|
||||
bool matter_ready;
|
||||
bool state_ready;
|
||||
|
||||
StepContext()
|
||||
: d_state0_mem(nullptr), d_accum_mem(nullptr),
|
||||
d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
|
||||
d_matter_mem(nullptr), cap_all(0),
|
||||
d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr),
|
||||
cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0),
|
||||
matter_ready(false), state_ready(false)
|
||||
{
|
||||
d_state0.fill(nullptr);
|
||||
@@ -584,11 +592,65 @@ static void release_step_ctx(void *block_tag)
|
||||
{
|
||||
auto it = g_step_ctx.find(block_tag);
|
||||
if (it == g_step_ctx.end()) return;
|
||||
if (it->second.d_comm_mem) {
|
||||
cudaFree(it->second.d_comm_mem);
|
||||
it->second.d_comm_mem = nullptr;
|
||||
it->second.cap_comm = 0;
|
||||
}
|
||||
if (it->second.h_comm_mem) {
|
||||
if (it->second.h_comm_pinned) cudaFreeHost(it->second.h_comm_mem);
|
||||
else free(it->second.h_comm_mem);
|
||||
it->second.h_comm_mem = nullptr;
|
||||
it->second.h_comm_pinned = false;
|
||||
it->second.cap_h_comm = 0;
|
||||
}
|
||||
StepAllocation alloc = detach_step_allocation(it->second);
|
||||
recycle_step_allocation(alloc);
|
||||
g_step_ctx.erase(it);
|
||||
}
|
||||
|
||||
static double *ensure_step_comm_buffer(StepContext &ctx, size_t needed_doubles)
|
||||
{
|
||||
if (needed_doubles == 0) return nullptr;
|
||||
if (ctx.cap_comm < needed_doubles) {
|
||||
if (ctx.d_comm_mem) {
|
||||
CUDA_CHECK(cudaFree(ctx.d_comm_mem));
|
||||
ctx.d_comm_mem = nullptr;
|
||||
}
|
||||
CUDA_CHECK(cudaMalloc(&ctx.d_comm_mem, needed_doubles * sizeof(double)));
|
||||
ctx.cap_comm = needed_doubles;
|
||||
}
|
||||
return ctx.d_comm_mem;
|
||||
}
|
||||
|
||||
static double *ensure_step_host_comm_buffer(StepContext &ctx, size_t needed_doubles)
|
||||
{
|
||||
if (needed_doubles == 0) return nullptr;
|
||||
if (ctx.cap_h_comm < needed_doubles) {
|
||||
if (ctx.h_comm_mem) {
|
||||
if (ctx.h_comm_pinned) cudaFreeHost(ctx.h_comm_mem);
|
||||
else free(ctx.h_comm_mem);
|
||||
ctx.h_comm_mem = nullptr;
|
||||
ctx.h_comm_pinned = false;
|
||||
}
|
||||
|
||||
const size_t bytes = needed_doubles * sizeof(double);
|
||||
cudaError_t err = cudaMallocHost((void **)&ctx.h_comm_mem, bytes);
|
||||
if (err == cudaSuccess) {
|
||||
ctx.h_comm_pinned = true;
|
||||
} else {
|
||||
ctx.h_comm_mem = (double *)malloc(bytes);
|
||||
ctx.h_comm_pinned = false;
|
||||
if (!ctx.h_comm_mem) {
|
||||
fprintf(stderr, "Host comm allocation failed (%zu bytes)\n", bytes);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
ctx.cap_h_comm = needed_doubles;
|
||||
}
|
||||
return ctx.h_comm_mem;
|
||||
}
|
||||
|
||||
static void upload_grid_params_if_needed(const GridParams &gp)
|
||||
{
|
||||
if (!g_gp_host_cache_valid ||
|
||||
@@ -1681,7 +1743,7 @@ __global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
|
||||
- lgxy * lgxy * lgzz
|
||||
- lgxx * lgyz * lgyz;
|
||||
|
||||
lscale = ONE / cbrt(lscale);
|
||||
lscale = ONE / pow(lscale, F1O3);
|
||||
|
||||
lgxx *= lscale;
|
||||
lgxy *= lscale;
|
||||
@@ -3446,6 +3508,88 @@ static void download_state_outputs(double **state_host_out, size_t all)
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
|
||||
double * __restrict__ dst,
|
||||
int nx, int ny,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz,
|
||||
int region_all,
|
||||
int state_count,
|
||||
int all)
|
||||
{
|
||||
const size_t total = (size_t)region_all * (size_t)state_count;
|
||||
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||
tid < total;
|
||||
tid += (size_t)blockDim.x * gridDim.x)
|
||||
{
|
||||
const int state_index = (int)(tid / (size_t)region_all);
|
||||
const int local = (int)(tid - (size_t)state_index * region_all);
|
||||
const int ii = local % sx;
|
||||
const int jj = (local / sx) % sy;
|
||||
const int kk = local / (sx * sy);
|
||||
const int src = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
|
||||
dst[tid] = src_mem[(size_t)state_index * all + src];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem,
|
||||
const double * __restrict__ src,
|
||||
int nx, int ny,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz,
|
||||
int region_all,
|
||||
int state_count,
|
||||
int all)
|
||||
{
|
||||
const size_t total = (size_t)region_all * (size_t)state_count;
|
||||
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||
tid < total;
|
||||
tid += (size_t)blockDim.x * gridDim.x)
|
||||
{
|
||||
const int state_index = (int)(tid / (size_t)region_all);
|
||||
const int local = (int)(tid - (size_t)state_index * region_all);
|
||||
const int ii = local % sx;
|
||||
const int jj = (local / sx) % sy;
|
||||
const int kk = local / (sx * sy);
|
||||
const int dst = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
|
||||
dst_mem[(size_t)state_index * all + dst] = src[tid];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kern_pack_state_subset(const double * __restrict__ src_mem,
|
||||
double * __restrict__ dst,
|
||||
int subset_count,
|
||||
int all)
|
||||
{
|
||||
const size_t total = (size_t)subset_count * (size_t)all;
|
||||
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||
tid < total;
|
||||
tid += (size_t)blockDim.x * gridDim.x)
|
||||
{
|
||||
const int subset_slot = (int)(tid / (size_t)all);
|
||||
const int state_index = d_subset_state_indices[subset_slot];
|
||||
const int src = (int)(tid - (size_t)subset_slot * all);
|
||||
dst[tid] = src_mem[(size_t)state_index * all + src];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kern_unpack_state_subset(double * __restrict__ dst_mem,
|
||||
const double * __restrict__ src,
|
||||
int subset_count,
|
||||
int all)
|
||||
{
|
||||
const size_t total = (size_t)subset_count * (size_t)all;
|
||||
for (size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
|
||||
tid < total;
|
||||
tid += (size_t)blockDim.x * gridDim.x)
|
||||
{
|
||||
const int subset_slot = (int)(tid / (size_t)all);
|
||||
const int state_index = d_subset_state_indices[subset_slot];
|
||||
const int dst = (int)(tid - (size_t)subset_slot * all);
|
||||
dst_mem[(size_t)state_index * all + dst] = src[tid];
|
||||
}
|
||||
}
|
||||
|
||||
static void copy_state_region_cuda(void *block_tag,
|
||||
int state_index,
|
||||
double *host_state,
|
||||
@@ -3508,6 +3652,41 @@ static void copy_state_region_packed_cuda(void *block_tag,
|
||||
CUDA_CHECK(cudaMemcpy3D(&p));
|
||||
}
|
||||
|
||||
static void copy_state_region_packed_batch_cuda(void *block_tag,
|
||||
int state_count,
|
||||
double *host_buffer,
|
||||
const int *ex,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz,
|
||||
cudaMemcpyKind kind)
|
||||
{
|
||||
if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
|
||||
if (sx <= 0 || sy <= 0 || sz <= 0) return;
|
||||
|
||||
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
|
||||
const int region_all = sx * sy * sz;
|
||||
const size_t total_doubles = (size_t)state_count * (size_t)region_all;
|
||||
double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
|
||||
|
||||
if (kind == cudaMemcpyDeviceToHost) {
|
||||
kern_pack_state_region_batch<<<grid(total_doubles), BLK>>>(
|
||||
ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
|
||||
i0, j0, k0, sx, sy, sz, region_all, state_count,
|
||||
ex[0] * ex[1] * ex[2]);
|
||||
CUDA_CHECK(cudaMemcpy(host_buffer, d_comm,
|
||||
total_doubles * sizeof(double),
|
||||
cudaMemcpyDeviceToHost));
|
||||
} else {
|
||||
CUDA_CHECK(cudaMemcpy(d_comm, host_buffer,
|
||||
total_doubles * sizeof(double),
|
||||
cudaMemcpyHostToDevice));
|
||||
kern_unpack_state_region_batch<<<grid(total_doubles), BLK>>>(
|
||||
ctx.d_state_curr_mem, d_comm, ex[0], ex[1],
|
||||
i0, j0, k0, sx, sy, sz, region_all, state_count,
|
||||
ex[0] * ex[1] * ex[2]);
|
||||
}
|
||||
}
|
||||
|
||||
static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
|
||||
{
|
||||
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||
@@ -3521,6 +3700,63 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
|
||||
}
|
||||
}
|
||||
|
||||
static void copy_state_subset(void *block_tag,
|
||||
int *ex,
|
||||
int subset_count,
|
||||
const int *state_indices,
|
||||
double **state_host,
|
||||
cudaMemcpyKind kind)
|
||||
{
|
||||
if (subset_count <= 0) return;
|
||||
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||
const size_t bytes = all * sizeof(double);
|
||||
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
||||
int active_state_indices[BSSN_STATE_COUNT];
|
||||
double *active_state_host[BSSN_STATE_COUNT];
|
||||
int active_count = 0;
|
||||
|
||||
for (int i = 0; i < subset_count; ++i) {
|
||||
const int state_index = state_indices[i];
|
||||
if (state_index < 0 || state_index >= BSSN_STATE_COUNT) continue;
|
||||
if (!state_host[i]) continue;
|
||||
active_state_indices[active_count] = state_index;
|
||||
active_state_host[active_count] = state_host[i];
|
||||
++active_count;
|
||||
}
|
||||
if (active_count <= 0) return;
|
||||
|
||||
const size_t total_doubles = (size_t)active_count * all;
|
||||
double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
|
||||
double *h_comm = ensure_step_host_comm_buffer(ctx, total_doubles);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_subset_state_indices, active_state_indices,
|
||||
(size_t)active_count * sizeof(int),
|
||||
0, cudaMemcpyHostToDevice));
|
||||
if (kind == cudaMemcpyDeviceToHost) {
|
||||
kern_pack_state_subset<<<grid(total_doubles), BLK>>>(
|
||||
ctx.d_state_curr_mem, d_comm, active_count, (int)all);
|
||||
CUDA_CHECK(cudaMemcpy(h_comm, d_comm,
|
||||
total_doubles * sizeof(double),
|
||||
cudaMemcpyDeviceToHost));
|
||||
for (int i = 0; i < active_count; ++i) {
|
||||
std::memcpy(active_state_host[i],
|
||||
h_comm + (size_t)i * all,
|
||||
bytes);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < active_count; ++i) {
|
||||
std::memcpy(h_comm + (size_t)i * all,
|
||||
active_state_host[i],
|
||||
bytes);
|
||||
}
|
||||
CUDA_CHECK(cudaMemcpy(d_comm, h_comm,
|
||||
total_doubles * sizeof(double),
|
||||
cudaMemcpyHostToDevice));
|
||||
kern_unpack_state_subset<<<grid(total_doubles), BLK>>>(
|
||||
ctx.d_state_curr_mem, d_comm, active_count, (int)all);
|
||||
}
|
||||
}
|
||||
|
||||
static bool has_resident_state(void *block_tag)
|
||||
{
|
||||
auto it = g_step_ctx.find(block_tag);
|
||||
@@ -4186,6 +4422,66 @@ int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
|
||||
int state_count,
|
||||
double *host_buffer,
|
||||
int *ex,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz)
|
||||
{
|
||||
init_gpu_dispatch();
|
||||
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||
copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
|
||||
i0, j0, k0, sx, sy, sz,
|
||||
cudaMemcpyDeviceToHost);
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
|
||||
int state_count,
|
||||
double *host_buffer,
|
||||
int *ex,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz)
|
||||
{
|
||||
init_gpu_dispatch();
|
||||
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||
copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
|
||||
i0, j0, k0, sx, sy, sz,
|
||||
cudaMemcpyHostToDevice);
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_download_state_subset(void *block_tag,
|
||||
int *ex,
|
||||
int subset_count,
|
||||
const int *state_indices,
|
||||
double **state_host_out)
|
||||
{
|
||||
init_gpu_dispatch();
|
||||
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||
copy_state_subset(block_tag, ex, subset_count, state_indices, state_host_out,
|
||||
cudaMemcpyDeviceToHost);
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_upload_state_subset(void *block_tag,
|
||||
int *ex,
|
||||
int subset_count,
|
||||
const int *state_indices,
|
||||
double **state_host_in)
|
||||
{
|
||||
init_gpu_dispatch();
|
||||
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||
copy_state_subset(block_tag, ex, subset_count, state_indices, state_host_in,
|
||||
cudaMemcpyHostToDevice);
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_has_resident_state(void *block_tag)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user