Stabilize GPU output path and MPI sync

2026-04-09 10:57:49 +08:00
parent 4e3946a4f0
commit 49409645c0
8 changed files with 748 additions and 334 deletions
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -1,4 +1,5 @@
 #include "bssn_cuda_ops.h"
+#include "bssn_gpu.h"

 #include <cmath>
 #include <cstdio>
@@ -79,6 +80,29 @@ inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes)
  return true;
 }

+// Fill `dst` with `bytes` bytes of `src`, reading from the device buffer that
+// bssn_gpu_find_device_buffer() reports for `src` when there is one (D2D) and
+// from the host pointer otherwise (H2D). Returns false if the buffer cannot be
+// sized or the copy fails; a copy failure is reported via report_cuda_error().
+inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *src, size_t bytes)
+{
+  if (!ensure_capacity(dst, bytes))
+    return false;
+
+  const double *device_src = bssn_gpu_find_device_buffer(src);
+  // The reported device buffer may be `dst` itself; the data is then already
+  // in place, so skip the redundant copy with identical source and destination.
+  if (device_src != nullptr && device_src == dst.ptr)
+    return true;
+
+  const bool on_device = device_src != nullptr;
+  cudaError_t err = cudaMemcpy(dst.ptr, on_device ? device_src : src, bytes,
+                               on_device ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice);
+  if (err != cudaSuccess)
+    report_cuda_error(on_device ? "cudaMemcpy(D2D)" : "cudaMemcpy(H2D)", err);
+  return err == cudaSuccess;
+}
+
 __global__ void enforce_ga_kernel(int n,
                                  double *dxx, double *gxy, double *gxz,
                                  double *dyy, double *gyz, double *dzz,
@@ -374,7 +398,7 @@ __global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
  }
 }

-inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **args)
+inline bool launch_kernel(dim3 grid, dim3 block, const void *kernel, void **args)
 {
  cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
  if (err != cudaSuccess)
@@ -382,10 +406,10 @@ inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **ar
    report_cuda_error("cudaLaunchKernel", err);
    return false;
  }
-  err = cudaDeviceSynchronize();
+  err = cudaPeekAtLastError();
  if (err != cudaSuccess)
  {
-    report_cuda_error("cudaDeviceSynchronize", err);
+    report_cuda_error("cudaPeekAtLastError", err);
    return false;
  }
  return true;
@@ -411,18 +435,18 @@ int bssn_cuda_enforce_ga(int *ex,
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

-  bool ok = copy_to_device(cache.dxx, dxx, bytes) &&
-            copy_to_device(cache.gxy, gxy, bytes) &&
-            copy_to_device(cache.gxz, gxz, bytes) &&
-            copy_to_device(cache.dyy, dyy, bytes) &&
-            copy_to_device(cache.gyz, gyz, bytes) &&
-            copy_to_device(cache.dzz, dzz, bytes) &&
-            copy_to_device(cache.Axx, Axx, bytes) &&
-            copy_to_device(cache.Axy, Axy, bytes) &&
-            copy_to_device(cache.Axz, Axz, bytes) &&
-            copy_to_device(cache.Ayy, Ayy, bytes) &&
-            copy_to_device(cache.Ayz, Ayz, bytes) &&
-            copy_to_device(cache.Azz, Azz, bytes);
+  bool ok = copy_to_device_preferring_device(cache.dxx, dxx, bytes) &&
+            copy_to_device_preferring_device(cache.gxy, gxy, bytes) &&
+            copy_to_device_preferring_device(cache.gxz, gxz, bytes) &&
+            copy_to_device_preferring_device(cache.dyy, dyy, bytes) &&
+            copy_to_device_preferring_device(cache.gyz, gyz, bytes) &&
+            copy_to_device_preferring_device(cache.dzz, dzz, bytes) &&
+            copy_to_device_preferring_device(cache.Axx, Axx, bytes) &&
+            copy_to_device_preferring_device(cache.Axy, Axy, bytes) &&
+            copy_to_device_preferring_device(cache.Axz, Axz, bytes) &&
+            copy_to_device_preferring_device(cache.Ayy, Ayy, bytes) &&
+            copy_to_device_preferring_device(cache.Ayz, Ayz, bytes) &&
+            copy_to_device_preferring_device(cache.Azz, Azz, bytes);

  if (ok)
  {
@@ -432,7 +456,7 @@ int bssn_cuda_enforce_ga(int *ex,
    double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr;
    void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
                    &d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
-    ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args);
+    ok = launch_kernel(grid, block, (const void *)enforce_ga_kernel, args);
  }

  if (ok)
@@ -527,10 +551,10 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
      (rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;

  ok = ok &&
-       (!refresh_state0 || copy_to_device(cache.state0, state0, bytes)) &&
+       (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
       (!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
-       (!need_stage_input || copy_to_device(cache.stage, stage_data, bytes)) &&
-       (!refresh_rhs || copy_to_device(cache.rhs, rhs_accum, bytes));
+       (!need_stage_input || copy_to_device_preferring_device(cache.stage, stage_data, bytes)) &&
+       (!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));

  if (ok && !need_stage_input)
    ok = ensure_capacity(cache.stage, bytes);
@@ -539,11 +563,15 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
    return 1;

  if (refresh_state0)
+  {
    cache.host_state0 = state0;
+    bssn_gpu_register_device_buffer(state0, cache.state0.ptr);
+  }
  if (refresh_rhs)
  {
    cache.host_rhs = rhs_accum;
    cache.rhs_resident = true;
+    bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr);
  }

  double dX = X[1] - X[0];
@@ -582,14 +610,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                    &bam_source, &bam_target,
                    &soa0, &soa1, &soa2,
                    &symmetry};
-    ok = launch_and_sync(grid, block, (const void *)sommerfeld_bam_kernel, args);
+    ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
  }

  if (ok)
  {
    double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr;
    void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
-    ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args);
+    ok = launch_kernel(grid, block, (const void *)rk4_kernel, args);
  }

  if (ok && lev > 0)
@@ -599,11 +627,13 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                      &has_xmin, &has_ymin, &has_zmin,
                      &has_xmax, &has_ymax, &has_zmax,
                      &d_state0, &d_stage};
-    ok = launch_and_sync(grid, block, (const void *)copy_physical_boundary_kernel, args);
+    ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
  }

  if (ok)
  {
+    bssn_gpu_register_device_buffer(stage_data, cache.stage.ptr);
+
    cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
    ok = err == cudaSuccess;
@@ -621,13 +651,13 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

-  bool ok = copy_to_device(d_chi, chi, bytes);
+  bool ok = copy_to_device_preferring_device(d_chi, chi, bytes);

  if (ok)
  {
    double *ptr = d_chi.ptr;
    void *args[] = {&n, &ptr, &tinny};
-    ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args);
+    ok = launch_kernel(grid, block, (const void *)lowerbound_kernel, args);
  }

  if (ok)