Register GPU transfer buffers as pinned host memory
This commit is contained in:
@@ -789,6 +789,11 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
double *stage_ptr = nullptr;
|
||||
const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr;
|
||||
|
||||
bssn_gpu_prepare_host_buffer(state0, n);
|
||||
if (need_boundary_input) bssn_gpu_prepare_host_buffer(boundary_src, n);
|
||||
if (need_stage_input) bssn_gpu_prepare_host_buffer(stage_data, n);
|
||||
bssn_gpu_prepare_host_buffer(rhs_accum, n);
|
||||
|
||||
ok = ok &&
|
||||
(!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
|
||||
(!need_boundary_input || copy_to_device_preferring_device(cache.boundary, boundary_src, bytes)) &&
|
||||
@@ -931,6 +936,7 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_ho
|
||||
bssn_gpu_register_device_buffer(chi, device_chi);
|
||||
if (download_to_host)
|
||||
{
|
||||
bssn_gpu_prepare_host_buffer(chi, n);
|
||||
cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
|
||||
ok = err == cudaSuccess;
|
||||
@@ -945,7 +951,9 @@ int bssn_cuda_download_buffer(int *ex, double *host_ptr)
|
||||
if (!device_ptr)
|
||||
return 1;
|
||||
|
||||
const size_t bytes = static_cast<size_t>(count_points(ex)) * sizeof(double);
|
||||
const int n = count_points(ex);
|
||||
bssn_gpu_prepare_host_buffer(host_ptr, n);
|
||||
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
||||
cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
|
||||
@@ -159,6 +159,16 @@ struct OwnedBufferRegistry
|
||||
int mapped_buffer_count = 0;
|
||||
};
|
||||
|
||||
struct PinnedHostRegistry
|
||||
{
|
||||
static const int max_buffers = 512;
|
||||
const double *host_buffers[max_buffers] = {nullptr};
|
||||
size_t capacities[max_buffers] = {0};
|
||||
bool registered[max_buffers] = {false};
|
||||
bool failed[max_buffers] = {false};
|
||||
int buffer_count = 0;
|
||||
};
|
||||
|
||||
GpuRhsCache &gpu_rhs_cache()
|
||||
{
|
||||
static GpuRhsCache cache;
|
||||
@@ -177,6 +187,12 @@ OwnedBufferRegistry &owned_buffer_registry()
|
||||
return registry;
|
||||
}
|
||||
|
||||
PinnedHostRegistry &pinned_host_registry()
|
||||
{
|
||||
static thread_local PinnedHostRegistry registry;
|
||||
return registry;
|
||||
}
|
||||
|
||||
void reset_meta(Meta *meta)
|
||||
{
|
||||
memset(meta, 0, sizeof(Meta));
|
||||
@@ -354,6 +370,61 @@ bool prepare_owned_buffer(const double *host_ptr, size_t count, bool zero_fill)
|
||||
return true;
|
||||
}
|
||||
|
||||
int find_pinned_host_slot(PinnedHostRegistry ®istry, const double *host_ptr)
|
||||
{
|
||||
for (int i = 0; i < registry.buffer_count; ++i)
|
||||
{
|
||||
if (registry.host_buffers[i] == host_ptr)
|
||||
return i;
|
||||
}
|
||||
|
||||
if (registry.buffer_count >= PinnedHostRegistry::max_buffers)
|
||||
return -1;
|
||||
|
||||
const int slot = registry.buffer_count++;
|
||||
registry.host_buffers[slot] = host_ptr;
|
||||
registry.capacities[slot] = 0;
|
||||
registry.registered[slot] = false;
|
||||
registry.failed[slot] = false;
|
||||
return slot;
|
||||
}
|
||||
|
||||
void ensure_host_buffer_registered(const double *host_ptr, size_t bytes)
|
||||
{
|
||||
if (!host_ptr || bytes == 0)
|
||||
return;
|
||||
|
||||
PinnedHostRegistry ®istry = pinned_host_registry();
|
||||
const int slot = find_pinned_host_slot(registry, host_ptr);
|
||||
if (slot < 0)
|
||||
return;
|
||||
|
||||
if (registry.registered[slot] && registry.capacities[slot] >= bytes)
|
||||
return;
|
||||
if (registry.failed[slot] && registry.capacities[slot] >= bytes)
|
||||
return;
|
||||
|
||||
if (registry.registered[slot])
|
||||
{
|
||||
cudaError_t unreg_err = cudaHostUnregister(const_cast<double *>(registry.host_buffers[slot]));
|
||||
if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered)
|
||||
cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl;
|
||||
registry.registered[slot] = false;
|
||||
}
|
||||
|
||||
cudaError_t err = cudaHostRegister(const_cast<double *>(host_ptr), bytes, cudaHostRegisterPortable);
|
||||
if (err == cudaSuccess || err == cudaErrorHostMemoryAlreadyRegistered)
|
||||
{
|
||||
registry.registered[slot] = true;
|
||||
registry.failed[slot] = false;
|
||||
registry.capacities[slot] = bytes;
|
||||
return;
|
||||
}
|
||||
|
||||
registry.failed[slot] = true;
|
||||
registry.capacities[slot] = bytes;
|
||||
}
|
||||
|
||||
bool ensure_device_buffer(double **ptr, size_t count)
|
||||
{
|
||||
if (*ptr)
|
||||
@@ -462,6 +533,7 @@ void cleanup_gpu_rhs_cache()
|
||||
{
|
||||
GpuRhsCache &cache = gpu_rhs_cache();
|
||||
OwnedBufferRegistry &owned = owned_buffer_registry();
|
||||
PinnedHostRegistry &pinned = pinned_host_registry();
|
||||
if (!cache.allocated)
|
||||
{
|
||||
for (int i = 0; i < owned.mapped_buffer_count; ++i)
|
||||
@@ -478,6 +550,20 @@ void cleanup_gpu_rhs_cache()
|
||||
owned.host_buffers[i] = nullptr;
|
||||
}
|
||||
owned.mapped_buffer_count = 0;
|
||||
for (int i = 0; i < pinned.buffer_count; ++i)
|
||||
{
|
||||
if (pinned.registered[i] && pinned.host_buffers[i])
|
||||
{
|
||||
cudaError_t unreg_err = cudaHostUnregister(const_cast<double *>(pinned.host_buffers[i]));
|
||||
if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered)
|
||||
cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl;
|
||||
}
|
||||
pinned.host_buffers[i] = nullptr;
|
||||
pinned.capacities[i] = 0;
|
||||
pinned.registered[i] = false;
|
||||
pinned.failed[i] = false;
|
||||
}
|
||||
pinned.buffer_count = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -509,6 +595,20 @@ void cleanup_gpu_rhs_cache()
|
||||
owned.host_buffers[i] = nullptr;
|
||||
}
|
||||
owned.mapped_buffer_count = 0;
|
||||
for (int i = 0; i < pinned.buffer_count; ++i)
|
||||
{
|
||||
if (pinned.registered[i] && pinned.host_buffers[i])
|
||||
{
|
||||
cudaError_t unreg_err = cudaHostUnregister(const_cast<double *>(pinned.host_buffers[i]));
|
||||
if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered)
|
||||
cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl;
|
||||
}
|
||||
pinned.host_buffers[i] = nullptr;
|
||||
pinned.capacities[i] = 0;
|
||||
pinned.registered[i] = false;
|
||||
pinned.failed[i] = false;
|
||||
}
|
||||
pinned.buffer_count = 0;
|
||||
}
|
||||
|
||||
bool register_gpu_rhs_cleanup()
|
||||
@@ -837,13 +937,21 @@ void bssn_gpu_register_device_buffer(const double *host_ptr, const double *devic
|
||||
map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
|
||||
}
|
||||
|
||||
void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count)
|
||||
{
|
||||
if (count > 0)
|
||||
ensure_host_buffer_registered(host_ptr, static_cast<size_t>(count) * sizeof(double));
|
||||
}
|
||||
|
||||
int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count)
|
||||
{
|
||||
bssn_gpu_prepare_host_buffer(host_ptr, count);
|
||||
return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), false) ? 0 : 1;
|
||||
}
|
||||
|
||||
int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count)
|
||||
{
|
||||
bssn_gpu_prepare_host_buffer(host_ptr, count);
|
||||
return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), true) ? 0 : 1;
|
||||
}
|
||||
|
||||
@@ -861,6 +969,11 @@ int bssn_gpu_stage_upload_region(const double *host_ptr,
|
||||
if (!device_ptr)
|
||||
return 1;
|
||||
|
||||
int full_count = 1;
|
||||
for (int i = 0; i < 3; ++i)
|
||||
full_count *= full_shape[i];
|
||||
bssn_gpu_prepare_host_buffer(host_ptr, full_count);
|
||||
|
||||
int start[3] = {0, 0, 0};
|
||||
for (int i = 0; i < 3; ++i)
|
||||
{
|
||||
@@ -928,6 +1041,11 @@ int bssn_gpu_stage_download_region(double *host_ptr,
|
||||
if (!device_ptr)
|
||||
return 1;
|
||||
|
||||
int full_count = 1;
|
||||
for (int i = 0; i < 3; ++i)
|
||||
full_count *= full_shape[i];
|
||||
bssn_gpu_prepare_host_buffer(host_ptr, full_count);
|
||||
|
||||
int start[3] = {0, 0, 0};
|
||||
for (int i = 0; i < 3; ++i)
|
||||
{
|
||||
|
||||
@@ -69,6 +69,7 @@ int bssn_gpu_bind_process_device(int mpi_rank);
|
||||
void bssn_gpu_clear_cached_device_buffers();
|
||||
const double *bssn_gpu_find_device_buffer(const double *host_ptr);
|
||||
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
|
||||
void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count);
|
||||
int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count);
|
||||
int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count);
|
||||
int bssn_gpu_stage_upload_region(const double *host_ptr,
|
||||
|
||||
Reference in New Issue
Block a user