Download staged GPU sync regions incrementally

2026-04-09 18:23:05 +08:00
parent 3b16795e78
commit 5bc67ded06
3 changed files with 147 additions and 5 deletions
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -914,6 +914,73 @@ int bssn_gpu_stage_upload_region(const double *host_ptr,
 	return 0;
 }

+// Copy one rectangular sub-region of a grid function from its device buffer
+// back into the same position of the host array (device -> host).
+//
+// Both buffers are addressed as dense full_shape[0] x full_shape[1] x
+// full_shape[2] arrays of double with index 0 contiguous; only the
+// region_shape[0] x region_shape[1] x region_shape[2] box whose lower corner
+// sits at region_llb is transferred.
+//
+// Parameters:
+//   host_ptr     destination host array; also the key passed to
+//                bssn_gpu_find_device_buffer() to obtain the device source
+//                (presumably the device mirror registered for this host
+//                array -- confirm against that helper).
+//   full_shape   point counts of the full array, 3 entries, all > 0.
+//   full_llb     lower physical bounds of the full array, 3 entries.
+//   full_uub     upper physical bounds of the full array, 3 entries.
+//   region_shape point counts of the region to copy, 3 entries, all > 0.
+//   region_llb   lower physical bounds of the region, 3 entries.
+//
+// Returns 0 on success and 1 on any failure: null argument, no device buffer
+// found, non-positive shape, a region that does not fit inside the full
+// array (including degenerate/non-finite geometry), or a CUDA copy error.
+int bssn_gpu_stage_download_region(double *host_ptr,
+                                   const int *full_shape,
+                                   const double *full_llb,
+                                   const double *full_uub,
+                                   const int *region_shape,
+                                   const double *region_llb)
+{
+	if (!host_ptr || !full_shape || !full_llb || !full_uub || !region_shape || !region_llb)
+		return 1;
+
+	const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
+	if (!device_ptr)
+		return 1;
+
+	// Index of the region's lower corner inside the full array, per axis.
+	int start[3] = {0, 0, 0};
+	for (int i = 0; i < 3; ++i)
+	{
+		if (full_shape[i] <= 0 || region_shape[i] <= 0)
+			return 1;
+
+		// Number of grid intervals spanning [full_llb, full_uub] on this axis.
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+		const int intervals = full_shape[i] - 1;
+#else
+#ifdef Cell
+		const int intervals = full_shape[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+
+		// A single-point vertex axis has no spacing; the only index it can
+		// hold is 0, so skip the division (it would be x/0 and may yield NaN).
+		if (intervals > 0)
+		{
+			const double dx = (full_uub[i] - full_llb[i]) / static_cast<double>(intervals);
+			// +0.4 absorbs floating-point noise before truncating to an index.
+			const double rounded = (region_llb[i] - full_llb[i]) / dx + 0.4;
+
+			// Reject NaN/inf and anything that cannot be a valid start index
+			// while still in floating point: converting such a value to int
+			// is undefined behaviour. Values in (-1, 0) truncate to 0.
+			if (!(rounded > -1.0 && rounded < static_cast<double>(full_shape[i])))
+				return 1;
+
+			start[i] = static_cast<int>(rounded);
+		}
+
+		// Here 0 <= start[i] < full_shape[i]; the subtraction form cannot
+		// overflow the way start[i] + region_shape[i] could.
+		if (region_shape[i] > full_shape[i] - start[i])
+			return 1;
+	}
+
+	// Source and destination share the same dense layout, so they use the
+	// same pitch, logical sizes and position.
+	const size_t row_bytes = static_cast<size_t>(full_shape[0]) * sizeof(double);
+	const size_t row_elems = static_cast<size_t>(full_shape[0]);
+	const size_t rows = static_cast<size_t>(full_shape[1]);
+
+	cudaMemcpy3DParms parms = {};
+	parms.srcPtr = make_cudaPitchedPtr(const_cast<double *>(device_ptr), row_bytes, row_elems, rows);
+	parms.dstPtr = make_cudaPitchedPtr(host_ptr, row_bytes, row_elems, rows);
+	// The x components of position and extent are given in bytes here.
+	parms.srcPos = make_cudaPos(static_cast<size_t>(start[0]) * sizeof(double),
+	                            static_cast<size_t>(start[1]),
+	                            static_cast<size_t>(start[2]));
+	parms.dstPos = parms.srcPos;
+	parms.extent = make_cudaExtent(static_cast<size_t>(region_shape[0]) * sizeof(double),
+	                               static_cast<size_t>(region_shape[1]),
+	                               static_cast<size_t>(region_shape[2]));
+	parms.kind = cudaMemcpyDeviceToHost;
+
+	cudaError_t err = cudaMemcpy3D(&parms);
+	if (err != cudaSuccess)
+	{
+		cerr << "cudaMemcpy3D(D2H region) failed: " << cudaGetErrorString(err) << endl;
+		return 1;
+	}
+
+	return 0;
+}
+
 __global__ void test_const_address(double * testd){
 	int _t = blockIdx.x*blockDim.x+threadIdx.x;
 	if(_t == 0)