Register GPU transfer buffers as pinned host memory

2026-04-09 18:36:10 +08:00
parent 5bc67ded06
commit b0dd069a2b
3 changed files with 128 additions and 1 deletions
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -789,6 +789,11 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
  double *stage_ptr = nullptr;
  const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr;

+  bssn_gpu_prepare_host_buffer(state0, n);
+  if (need_boundary_input) bssn_gpu_prepare_host_buffer(boundary_src, n);
+  if (need_stage_input) bssn_gpu_prepare_host_buffer(stage_data, n);
+  bssn_gpu_prepare_host_buffer(rhs_accum, n);
+
  ok = ok &&
       (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
       (!need_boundary_input || copy_to_device_preferring_device(cache.boundary, boundary_src, bytes)) &&
@@ -931,6 +936,7 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_ho
    bssn_gpu_register_device_buffer(chi, device_chi);
    if (download_to_host)
    {
+      bssn_gpu_prepare_host_buffer(chi, n);
      cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
      if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
      ok = err == cudaSuccess;
@@ -945,7 +951,9 @@ int bssn_cuda_download_buffer(int *ex, double *host_ptr)
  if (!device_ptr)
    return 1;

-  const size_t bytes = static_cast<size_t>(count_points(ex)) * sizeof(double);
+  const int n = count_points(ex);
+  bssn_gpu_prepare_host_buffer(host_ptr, n);
+  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -159,6 +159,16 @@ struct OwnedBufferRegistry
 	int mapped_buffer_count = 0;
 };

+struct PinnedHostRegistry
+{
+	static const int max_buffers = 512;
+	const double *host_buffers[max_buffers] = {nullptr};
+	size_t capacities[max_buffers] = {0};
+	bool registered[max_buffers] = {false};
+	bool failed[max_buffers] = {false};
+	int buffer_count = 0;
+};
+
 GpuRhsCache &gpu_rhs_cache()
 {
 	static GpuRhsCache cache;
@@ -177,6 +187,12 @@ OwnedBufferRegistry &owned_buffer_registry()
 	return registry;
 }

+PinnedHostRegistry &pinned_host_registry()
+{
+	static thread_local PinnedHostRegistry registry;
+	return registry;
+}
+
 void reset_meta(Meta *meta)
 {
 	memset(meta, 0, sizeof(Meta));
@@ -354,6 +370,61 @@ bool prepare_owned_buffer(const double *host_ptr, size_t count, bool zero_fill)
 	return true;
 }

+int find_pinned_host_slot(PinnedHostRegistry &registry, const double *host_ptr)
+{
+	for (int i = 0; i < registry.buffer_count; ++i)
+	{
+		if (registry.host_buffers[i] == host_ptr)
+			return i;
+	}
+
+	if (registry.buffer_count >= PinnedHostRegistry::max_buffers)
+		return -1;
+
+	const int slot = registry.buffer_count++;
+	registry.host_buffers[slot] = host_ptr;
+	registry.capacities[slot] = 0;
+	registry.registered[slot] = false;
+	registry.failed[slot] = false;
+	return slot;
+}
+
+void ensure_host_buffer_registered(const double *host_ptr, size_t bytes)
+{
+	if (!host_ptr || bytes == 0)
+		return;
+
+	PinnedHostRegistry &registry = pinned_host_registry();
+	const int slot = find_pinned_host_slot(registry, host_ptr);
+	if (slot < 0)
+		return;
+
+	if (registry.registered[slot] && registry.capacities[slot] >= bytes)
+		return;
+	if (registry.failed[slot] && registry.capacities[slot] >= bytes)
+		return;
+
+	if (registry.registered[slot])
+	{
+		cudaError_t unreg_err = cudaHostUnregister(const_cast<double *>(registry.host_buffers[slot]));
+		if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered)
+			cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl;
+		registry.registered[slot] = false;
+	}
+
+	cudaError_t err = cudaHostRegister(const_cast<double *>(host_ptr), bytes, cudaHostRegisterPortable);
+	if (err == cudaSuccess || err == cudaErrorHostMemoryAlreadyRegistered)
+	{
+		registry.registered[slot] = true;
+		registry.failed[slot] = false;
+		registry.capacities[slot] = bytes;
+		return;
+	}
+
+	registry.failed[slot] = true;
+	registry.capacities[slot] = bytes;
+}
+
 bool ensure_device_buffer(double **ptr, size_t count)
 {
 	if (*ptr)
@@ -462,6 +533,7 @@ void cleanup_gpu_rhs_cache()
 {
 	GpuRhsCache &cache = gpu_rhs_cache();
 	OwnedBufferRegistry &owned = owned_buffer_registry();
+	PinnedHostRegistry &pinned = pinned_host_registry();
 	if (!cache.allocated)
 	{
 		for (int i = 0; i < owned.mapped_buffer_count; ++i)
@@ -478,6 +550,20 @@ void cleanup_gpu_rhs_cache()
 			owned.host_buffers[i] = nullptr;
 		}
 		owned.mapped_buffer_count = 0;
+		for (int i = 0; i < pinned.buffer_count; ++i)
+		{
+			if (pinned.registered[i] && pinned.host_buffers[i])
+			{
+				cudaError_t unreg_err = cudaHostUnregister(const_cast<double *>(pinned.host_buffers[i]));
+				if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered)
+					cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl;
+			}
+			pinned.host_buffers[i] = nullptr;
+			pinned.capacities[i] = 0;
+			pinned.registered[i] = false;
+			pinned.failed[i] = false;
+		}
+		pinned.buffer_count = 0;
 		return;
 	}

@@ -509,6 +595,20 @@ void cleanup_gpu_rhs_cache()
 		owned.host_buffers[i] = nullptr;
 	}
 	owned.mapped_buffer_count = 0;
+	for (int i = 0; i < pinned.buffer_count; ++i)
+	{
+		if (pinned.registered[i] && pinned.host_buffers[i])
+		{
+			cudaError_t unreg_err = cudaHostUnregister(const_cast<double *>(pinned.host_buffers[i]));
+			if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered)
+				cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl;
+		}
+		pinned.host_buffers[i] = nullptr;
+		pinned.capacities[i] = 0;
+		pinned.registered[i] = false;
+		pinned.failed[i] = false;
+	}
+	pinned.buffer_count = 0;
 }

 bool register_gpu_rhs_cleanup()
@@ -837,13 +937,21 @@ void bssn_gpu_register_device_buffer(const double *host_ptr, const double *devic
 	map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
 }

+void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count)
+{
+	if (count > 0)
+		ensure_host_buffer_registered(host_ptr, static_cast<size_t>(count) * sizeof(double));
+}
+
 int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count)
 {
+	bssn_gpu_prepare_host_buffer(host_ptr, count);
 	return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), false) ? 0 : 1;
 }

 int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count)
 {
+	bssn_gpu_prepare_host_buffer(host_ptr, count);
 	return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), true) ? 0 : 1;
 }

@@ -861,6 +969,11 @@ int bssn_gpu_stage_upload_region(const double *host_ptr,
 	if (!device_ptr)
 		return 1;

+	int full_count = 1;
+	for (int i = 0; i < 3; ++i)
+		full_count *= full_shape[i];
+	bssn_gpu_prepare_host_buffer(host_ptr, full_count);
+
 	int start[3] = {0, 0, 0};
 	for (int i = 0; i < 3; ++i)
 	{
@@ -928,6 +1041,11 @@ int bssn_gpu_stage_download_region(double *host_ptr,
 	if (!device_ptr)
 		return 1;

+	int full_count = 1;
+	for (int i = 0; i < 3; ++i)
+		full_count *= full_shape[i];
+	bssn_gpu_prepare_host_buffer(host_ptr, full_count);
+
 	int start[3] = {0, 0, 0};
 	for (int i = 0; i < 3; ++i)
 	{
--- a/AMSS_NCKU_source/bssn_gpu.h
+++ b/AMSS_NCKU_source/bssn_gpu.h
@@ -69,6 +69,7 @@ int bssn_gpu_bind_process_device(int mpi_rank);
 void bssn_gpu_clear_cached_device_buffers();
 const double *bssn_gpu_find_device_buffer(const double *host_ptr);
 void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
+void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count);
 int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count);
 int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count);
 int bssn_gpu_stage_upload_region(const double *host_ptr,