Register GPU transfer buffers as pinned host memory

This commit is contained in:
2026-04-09 18:36:10 +08:00
parent 5bc67ded06
commit b0dd069a2b
3 changed files with 128 additions and 1 deletions

View File

@@ -789,6 +789,11 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
double *stage_ptr = nullptr;
const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr;
bssn_gpu_prepare_host_buffer(state0, n);
if (need_boundary_input) bssn_gpu_prepare_host_buffer(boundary_src, n);
if (need_stage_input) bssn_gpu_prepare_host_buffer(stage_data, n);
bssn_gpu_prepare_host_buffer(rhs_accum, n);
ok = ok &&
(!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
(!need_boundary_input || copy_to_device_preferring_device(cache.boundary, boundary_src, bytes)) &&
@@ -931,6 +936,7 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_ho
bssn_gpu_register_device_buffer(chi, device_chi);
if (download_to_host)
{
bssn_gpu_prepare_host_buffer(chi, n);
cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
ok = err == cudaSuccess;
@@ -945,7 +951,9 @@ int bssn_cuda_download_buffer(int *ex, double *host_ptr)
if (!device_ptr)
return 1;
const size_t bytes = static_cast<size_t>(count_points(ex)) * sizeof(double);
const int n = count_points(ex);
bssn_gpu_prepare_host_buffer(host_ptr, n);
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{

View File

@@ -159,6 +159,16 @@ struct OwnedBufferRegistry
int mapped_buffer_count = 0;
};
struct PinnedHostRegistry
{
static const int max_buffers = 512;
const double *host_buffers[max_buffers] = {nullptr};
size_t capacities[max_buffers] = {0};
bool registered[max_buffers] = {false};
bool failed[max_buffers] = {false};
int buffer_count = 0;
};
GpuRhsCache &gpu_rhs_cache()
{
static GpuRhsCache cache;
@@ -177,6 +187,12 @@ OwnedBufferRegistry &owned_buffer_registry()
return registry;
}
PinnedHostRegistry &pinned_host_registry()
{
static thread_local PinnedHostRegistry registry;
return registry;
}
void reset_meta(Meta *meta)
{
memset(meta, 0, sizeof(Meta));
@@ -354,6 +370,61 @@ bool prepare_owned_buffer(const double *host_ptr, size_t count, bool zero_fill)
return true;
}
int find_pinned_host_slot(PinnedHostRegistry &registry, const double *host_ptr)
{
for (int i = 0; i < registry.buffer_count; ++i)
{
if (registry.host_buffers[i] == host_ptr)
return i;
}
if (registry.buffer_count >= PinnedHostRegistry::max_buffers)
return -1;
const int slot = registry.buffer_count++;
registry.host_buffers[slot] = host_ptr;
registry.capacities[slot] = 0;
registry.registered[slot] = false;
registry.failed[slot] = false;
return slot;
}
void ensure_host_buffer_registered(const double *host_ptr, size_t bytes)
{
if (!host_ptr || bytes == 0)
return;
PinnedHostRegistry &registry = pinned_host_registry();
const int slot = find_pinned_host_slot(registry, host_ptr);
if (slot < 0)
return;
if (registry.registered[slot] && registry.capacities[slot] >= bytes)
return;
if (registry.failed[slot] && registry.capacities[slot] >= bytes)
return;
if (registry.registered[slot])
{
cudaError_t unreg_err = cudaHostUnregister(const_cast<double *>(registry.host_buffers[slot]));
if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered)
cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl;
registry.registered[slot] = false;
}
cudaError_t err = cudaHostRegister(const_cast<double *>(host_ptr), bytes, cudaHostRegisterPortable);
if (err == cudaSuccess || err == cudaErrorHostMemoryAlreadyRegistered)
{
registry.registered[slot] = true;
registry.failed[slot] = false;
registry.capacities[slot] = bytes;
return;
}
registry.failed[slot] = true;
registry.capacities[slot] = bytes;
}
bool ensure_device_buffer(double **ptr, size_t count)
{
if (*ptr)
@@ -462,6 +533,7 @@ void cleanup_gpu_rhs_cache()
{
GpuRhsCache &cache = gpu_rhs_cache();
OwnedBufferRegistry &owned = owned_buffer_registry();
PinnedHostRegistry &pinned = pinned_host_registry();
if (!cache.allocated)
{
for (int i = 0; i < owned.mapped_buffer_count; ++i)
@@ -478,6 +550,20 @@ void cleanup_gpu_rhs_cache()
owned.host_buffers[i] = nullptr;
}
owned.mapped_buffer_count = 0;
for (int i = 0; i < pinned.buffer_count; ++i)
{
if (pinned.registered[i] && pinned.host_buffers[i])
{
cudaError_t unreg_err = cudaHostUnregister(const_cast<double *>(pinned.host_buffers[i]));
if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered)
cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl;
}
pinned.host_buffers[i] = nullptr;
pinned.capacities[i] = 0;
pinned.registered[i] = false;
pinned.failed[i] = false;
}
pinned.buffer_count = 0;
return;
}
@@ -509,6 +595,20 @@ void cleanup_gpu_rhs_cache()
owned.host_buffers[i] = nullptr;
}
owned.mapped_buffer_count = 0;
for (int i = 0; i < pinned.buffer_count; ++i)
{
if (pinned.registered[i] && pinned.host_buffers[i])
{
cudaError_t unreg_err = cudaHostUnregister(const_cast<double *>(pinned.host_buffers[i]));
if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered)
cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl;
}
pinned.host_buffers[i] = nullptr;
pinned.capacities[i] = 0;
pinned.registered[i] = false;
pinned.failed[i] = false;
}
pinned.buffer_count = 0;
}
bool register_gpu_rhs_cleanup()
@@ -837,13 +937,21 @@ void bssn_gpu_register_device_buffer(const double *host_ptr, const double *devic
map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
}
void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count)
{
if (count > 0)
ensure_host_buffer_registered(host_ptr, static_cast<size_t>(count) * sizeof(double));
}
int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count)
{
bssn_gpu_prepare_host_buffer(host_ptr, count);
return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), false) ? 0 : 1;
}
int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count)
{
bssn_gpu_prepare_host_buffer(host_ptr, count);
return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), true) ? 0 : 1;
}
@@ -861,6 +969,11 @@ int bssn_gpu_stage_upload_region(const double *host_ptr,
if (!device_ptr)
return 1;
int full_count = 1;
for (int i = 0; i < 3; ++i)
full_count *= full_shape[i];
bssn_gpu_prepare_host_buffer(host_ptr, full_count);
int start[3] = {0, 0, 0};
for (int i = 0; i < 3; ++i)
{
@@ -928,6 +1041,11 @@ int bssn_gpu_stage_download_region(double *host_ptr,
if (!device_ptr)
return 1;
int full_count = 1;
for (int i = 0; i < 3; ++i)
full_count *= full_shape[i];
bssn_gpu_prepare_host_buffer(host_ptr, full_count);
int start[3] = {0, 0, 0};
for (int i = 0; i < 3; ++i)
{

View File

@@ -69,6 +69,7 @@ int bssn_gpu_bind_process_device(int mpi_rank);
void bssn_gpu_clear_cached_device_buffers();
const double *bssn_gpu_find_device_buffer(const double *host_ptr);
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count);
int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count);
int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count);
int bssn_gpu_stage_upload_region(const double *host_ptr,