Stabilize GPU buffer lifecycle around regrid

2026-04-09 20:48:06 +08:00
parent 46e94d1248
commit cf3c6d6218
8 changed files with 186 additions and 81 deletions
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -135,7 +135,7 @@ struct GpuRhsCache
 	const double *last_y = nullptr;
 	const double *last_z = nullptr;
 	bool meta_uploaded = false;
-	static const int max_mapped_buffers = 128;
+	static const int max_mapped_buffers = 512;
 	const double *host_buffers[max_mapped_buffers] = {nullptr};
 	const double *device_buffers[max_mapped_buffers] = {nullptr};
 	int mapped_buffer_count = 0;
@@ -143,7 +143,7 @@ struct GpuRhsCache

 struct ExternalBufferRegistry
 {
-	static const int max_mapped_buffers = 256;
+	static const int max_mapped_buffers = 4096;
 	const double *host_buffers[max_mapped_buffers] = {nullptr};
 	const double *device_buffers[max_mapped_buffers] = {nullptr};
 	int mapped_buffer_count = 0;
@@ -151,7 +151,7 @@ struct ExternalBufferRegistry

 struct OwnedBufferRegistry
 {
-	static const int max_mapped_buffers = 256;
+	static const int max_mapped_buffers = 4096;
 	const double *host_buffers[max_mapped_buffers] = {nullptr};
 	double *device_buffers[max_mapped_buffers] = {nullptr};
 	size_t capacities[max_mapped_buffers] = {0};
@@ -223,7 +223,11 @@ void map_buffer(GpuRhsCache &cache, const double *host_ptr, const double *device
 	}

 	if (cache.mapped_buffer_count >= GpuRhsCache::max_mapped_buffers)
+	{
+		cerr << "gpu RHS buffer registry exhausted at " << GpuRhsCache::max_mapped_buffers
+		     << " entries" << endl;
 		return;
+	}

 	cache.host_buffers[cache.mapped_buffer_count] = host_ptr;
 	cache.device_buffers[cache.mapped_buffer_count] = device_ptr;
@@ -255,7 +259,11 @@ void map_external_buffer(ExternalBufferRegistry &registry, const double *host_pt
 	}

 	if (registry.mapped_buffer_count >= ExternalBufferRegistry::max_mapped_buffers)
+	{
+		cerr << "external CUDA buffer registry exhausted at "
+		     << ExternalBufferRegistry::max_mapped_buffers << " entries" << endl;
 		return;
+	}

 	registry.host_buffers[registry.mapped_buffer_count] = host_ptr;
 	registry.device_buffers[registry.mapped_buffer_count] = device_ptr;
@@ -421,6 +429,7 @@ void ensure_host_buffer_registered(const double *host_ptr, size_t bytes)
 		return;
 	}

+	cerr << "cudaHostRegister failed: " << cudaGetErrorString(err) << endl;
 	registry.failed[slot] = true;
 	registry.capacities[slot] = bytes;
 }
@@ -932,6 +941,36 @@ void bssn_gpu_clear_cached_device_buffers()
 	invalidate_owned_buffer_map(owned_buffer_registry());
 }

+// Calls cudaHostUnregister on every slot of the pinned-host registry that is
+// marked registered, then resets the registry to empty. Each slot is cleared
+// whether or not its unregister call succeeds.
+void bssn_gpu_release_pinned_host_buffers()
+{
+	PinnedHostRegistry &pinned = pinned_host_registry();
+	for (int i = 0; i < pinned.buffer_count; ++i)
+	{
+		if (pinned.registered[i] && pinned.host_buffers[i])
+		{
+			cudaError_t unreg_err = cudaHostUnregister(const_cast<double *>(pinned.host_buffers[i]));
+			if (unreg_err != cudaSuccess)
+			{
+				// A failed runtime call is also recorded as the thread's last error;
+				// clear it so a later cudaGetLastError() check does not report this
+				// already-handled failure.
+				(void)cudaGetLastError();
+				// A buffer that is no longer registered is tolerated silently.
+				if (unreg_err != cudaErrorHostMemoryNotRegistered)
+					cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl;
+			}
+		}
+		pinned.host_buffers[i] = nullptr;
+		pinned.capacities[i] = 0;
+		pinned.registered[i] = false;
+		pinned.failed[i] = false;
+	}
+	pinned.buffer_count = 0;
+}
+
 void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr)
 {
 	map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);