Fix GPU RK4 boundary and sync correctness

2026-04-12 12:13:47 +08:00
parent b78874ef21
commit d9287ea530
4 changed files with 134 additions and 30 deletions
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -1,5 +1,7 @@
 #include "bssn_cuda_ops.h"
 #include "bssn_gpu.h"
+#include "rungekutta4_rout.h"
+#include "sommerfeld_rout.h"

 #include <cmath>
 #include <cstdio>
@@ -262,6 +264,23 @@ inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *sr
  return true;
 }

+inline bool sync_host_from_mapped_device(double *host_ptr, int count, const char *label) // Copies `count` doubles from the device buffer looked up for host_ptr back into host_ptr.
+{ // Returns true on success or when no device buffer is found for host_ptr; returns false after reporting a cudaMemcpy failure under `label`.
+  const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr); // NOTE(review): presumably the device mirror registered for this host array -- confirm in bssn_gpu.h
+  if (!device_ptr) // no device buffer found for this host pointer: nothing to copy back, treated as success
+    return true;
+
+  bssn_gpu_prepare_host_buffer(host_ptr, count); // NOTE(review): opaque helper; presumably readies the host array for the transfer -- confirm
+  const size_t bytes = static_cast<size_t>(count) * sizeof(double); // widen before multiplying; NOTE(review): a negative count would wrap to a huge size here, callers are assumed to pass count >= 0
+  cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost); // device -> host copy of the whole array
+  if (err != cudaSuccess)
+  {
+    report_cuda_error(label, err); // `label` identifies which transfer failed in the error report
+    return false;
+  }
+  return true;
+}
+
 inline bool copy_region_to_padded_stage(CachedBuffer &dst,
                                        const double *src,
                                        const int src_shape[3],
@@ -1003,6 +1022,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                               double xmin, double ymin, double zmin,
                               double xmax, double ymax, double zmax,
                               const double *state0,
+                               const double *phi_field,
+                               const double *lap_field,
                               const double *boundary_src,
                               double *stage_data,
                               double *rhs_accum,
@@ -1145,7 +1166,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
    ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
  }

-  if (ok)
+  if (ok && lev == 0)
  {
    double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
    void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
@@ -1154,12 +1175,47 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,

  if (ok && lev > 0)
  {
-      double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
-      void *args[] = {&nx, &ny, &nz,
-                      &has_xmin, &has_ymin, &has_zmin,
-                      &has_xmax, &has_ymax, &has_zmax,
-                      &d_state0, &d_stage};
-    ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
+    double *host_state0 = const_cast<double *>(state0);
+    double *host_phi = const_cast<double *>(phi_field);
+    double *host_lap = const_cast<double *>(lap_field);
+    double *host_rhs = rhs_accum;
+
+    ok = sync_host_from_mapped_device(host_state0, n, "cudaMemcpy(D2H) state0") &&
+         sync_host_from_mapped_device(host_phi, n, "cudaMemcpy(D2H) phi_field") &&
+         sync_host_from_mapped_device(host_lap, n, "cudaMemcpy(D2H) lap_field") &&
+         sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum");
+    if (ok)
+    {
+      bssn_gpu_prepare_host_buffer(stage_data, n);
+      cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
+      if (err != cudaSuccess)
+      {
+        report_cuda_error("cudaMemcpy(D2H) stage_data", err);
+        ok = false;
+      }
+    }
+
+    if (ok)
+    {
+      int rk_stage_host = rk_stage;
+      int cor = 1;
+      f_rungekutta4_rout(ex, dT, host_state0, stage_data, host_rhs, rk_stage_host);
+      f_sommerfeld_rout(ex,
+                        const_cast<double *>(X), const_cast<double *>(Y), const_cast<double *>(Z),
+                        xmin, ymin, zmin, xmax, ymax, zmax,
+                        dT,
+                        host_phi, host_lap,
+                        host_state0, stage_data,
+                        const_cast<double *>(SoA),
+                        symmetry, cor);
+
+      cudaError_t err = cudaMemcpy(stage_ptr, stage_data, bytes, cudaMemcpyHostToDevice);
+      if (err != cudaSuccess)
+      {
+        report_cuda_error("cudaMemcpy(H2D) stage_data sommerfeld", err);
+        ok = false;
+      }
+    }
  }

  if (ok)