Optimize GPU RK4 stage sync path

2026-04-12 18:36:05 +08:00
parent d9287ea530
commit c8977d8356
4 changed files with 698 additions and 33 deletions
@@ -23,7 +23,10 @@ using namespace std;
 #include <mpi.h>

 #include "misc.h"
-#include "macrodef.h"
+#include "macrodef.h"
+#ifdef USE_GPU
+extern void bssn_cuda_dump_stage_profile();
+#endif

 #ifndef ABEtype
 #error "not define ABEtype"
@@ -469,10 +472,13 @@ int main(int argc, char *argv[])
            cout << endl;
      }

-      ADM->Evolve(Steps);
-
-      if (myrank == 0)
-      {
+	      ADM->Evolve(Steps);
+#ifdef USE_GPU
+	      bssn_cuda_dump_stage_profile();
+#endif
+	
+	      if (myrank == 0)
+	      {
            cout << endl;
            cout << " Total Evolve Time: "  << MPI_Wtime() - End_clock   << " seconds!" << endl;
            cout << " Total Running Time: " << MPI_Wtime() - Begin_clock << " seconds!" << endl;
@@ -50,6 +50,12 @@ struct CachedIntBuffer
  size_t capacity = 0;
 };

+struct CachedPtrBuffer
+{
+  void *ptr = nullptr;
+  size_t capacity = 0;
+};
+
 inline void release_buffer(CachedBuffer &buffer)
 {
  if (buffer.ptr)
@@ -74,6 +80,18 @@ inline void release_buffer(CachedIntBuffer &buffer)
  buffer.capacity = 0;
 }

+inline void release_buffer(CachedPtrBuffer &buffer)
+{
+  if (buffer.ptr)
+  {
+    cudaError_t free_err = cudaFree(buffer.ptr);
+    if (free_err != cudaSuccess)
+      report_cuda_error("cudaFree", free_err);
+    buffer.ptr = nullptr;
+  }
+  buffer.capacity = 0;
+}
+
 inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
 {
  if (bytes <= buffer.capacity && buffer.ptr)
@@ -124,6 +142,31 @@ inline bool ensure_capacity(CachedIntBuffer &buffer, size_t bytes)
  return true;
 }

+inline bool ensure_capacity(CachedPtrBuffer &buffer, size_t bytes)
+{
+  if (bytes <= buffer.capacity && buffer.ptr)
+    return true;
+
+  if (buffer.ptr)
+  {
+    cudaError_t free_err = cudaFree(buffer.ptr);
+    if (free_err != cudaSuccess)
+      report_cuda_error("cudaFree", free_err);
+    buffer.ptr = nullptr;
+    buffer.capacity = 0;
+  }
+
+  cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaMalloc", err);
+    return false;
+  }
+
+  buffer.capacity = bytes;
+  return true;
+}
+
 struct Rk4VarCache
 {
  CachedBuffer X, Y, Z;
@@ -169,6 +212,13 @@ struct InterpBatchCache
  InterpStencilCacheEntry stencil_entry;
 };

+struct Rk4BatchCache
+{
+  CachedPtrBuffer state0_ptrs;
+  CachedPtrBuffer stage_ptrs;
+  CachedPtrBuffer rhs_ptrs;
+};
+
 std::unordered_map<const double *, Rk4VarCache> &rk4_var_cache_map()
 {
  static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map;
@@ -181,6 +231,12 @@ InterpBatchCache &interp_batch_cache()
  return cache;
 }

+Rk4BatchCache &rk4_batch_cache()
+{
+  static thread_local Rk4BatchCache cache;
+  return cache;
+}
+
 inline void release_interp_stencil_cache(InterpStencilCacheEntry &entry)
 {
  release_buffer(entry.weights);
@@ -791,6 +847,63 @@ __global__ void copy_physical_boundary_kernel(int nx, int ny, int nz,
  }
 }

+__global__ void rk4_boundary_batch_kernel(int n, int nx, int ny, int nz,
+                                          int has_xmin, int has_ymin, int has_zmin,
+                                          int has_xmax, int has_ymax, int has_zmax,
+                                          int num_var, double dT,
+                                          const double *const *state0_list,
+                                          double *const *stage_list,
+                                          double *const *rhs_list,
+                                          int stage)
+{
+  const double half = 0.5;
+  const double one_sixth = 1.0 / 6.0;
+
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
+  {
+    const int plane = nx * ny;
+    const int k = idx / plane;
+    const int rem = idx - k * plane;
+    const int j = rem / nx;
+    const int i = rem - j * nx;
+    const bool is_boundary =
+        (has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
+        (has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
+        (has_zmin && k == 0) || (has_zmax && k == nz - 1);
+
+    for (int v = 0; v < num_var; ++v)
+    {
+      const double *f0 = state0_list[v];
+      double *f1 = stage_list[v];
+      double *rhs = rhs_list[v];
+
+      double out;
+      if (stage == 0)
+      {
+        out = f0[idx] + half * dT * rhs[idx];
+      }
+      else if (stage == 1)
+      {
+        rhs[idx] += 2.0 * f1[idx];
+        out = f0[idx] + half * dT * f1[idx];
+      }
+      else if (stage == 2)
+      {
+        rhs[idx] += 2.0 * f1[idx];
+        out = f0[idx] + dT * f1[idx];
+      }
+      else
+      {
+        out = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]);
+      }
+
+      if (is_boundary)
+        out = f0[idx];
+      f1[idx] = out;
+    }
+  }
+}
+
 __global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
                                      const double *X, const double *Y, const double *Z,
                                      double xmin, double ymin, double zmin,
@@ -1032,6 +1145,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                               int symmetry,
                               int lev,
                               int rk_stage,
+                               bool force_host_boundary_fix,
                               bool download_to_host)
 {
  Rk4VarCache &cache = rk4_var_cache_map()[state0];
@@ -1166,24 +1280,34 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
    ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
  }

-  if (ok && lev == 0)
+  if (ok && (lev == 0 || !force_host_boundary_fix))
  {
    double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
    void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
    ok = launch_kernel(grid, block, (const void *)rk4_kernel, args);
  }

-  if (ok && lev > 0)
+  if (ok && lev > 0 && !force_host_boundary_fix)
+  {
+    double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
+    void *args[] = {&nx, &ny, &nz,
+                    &has_xmin, &has_ymin, &has_zmin,
+                    &has_xmax, &has_ymax, &has_zmax,
+                    &d_state0, &d_stage};
+    ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
+  }
+
+  if (ok && lev > 0 && force_host_boundary_fix)
  {
    double *host_state0 = const_cast<double *>(state0);
    double *host_phi = const_cast<double *>(phi_field);
    double *host_lap = const_cast<double *>(lap_field);
    double *host_rhs = rhs_accum;

-    ok = sync_host_from_mapped_device(host_state0, n, "cudaMemcpy(D2H) state0") &&
-         sync_host_from_mapped_device(host_phi, n, "cudaMemcpy(D2H) phi_field") &&
-         sync_host_from_mapped_device(host_lap, n, "cudaMemcpy(D2H) lap_field") &&
-         sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum");
+    // state0/phi/lap are read-only during the current RK step, so the host copies
+    // remain valid even if cached device mirrors exist. Only the RHS accumulator
+    // is updated on device and must be synchronized back for the CPU fallback.
+    ok = sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum");
    if (ok)
    {
      bssn_gpu_prepare_host_buffer(stage_data, n);
@@ -1232,6 +1356,176 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
  return ok ? 0 : 1;
 }

+int bssn_cuda_rk4_boundary_batch(int *ex, double dT,
+                                 const double *X, const double *Y, const double *Z,
+                                 double xmin, double ymin, double zmin,
+                                 double xmax, double ymax, double zmax,
+                                 int symmetry,
+                                 const double *const *state0_list,
+                                 double *const *stage_data_list,
+                                 double *const *rhs_accum_list,
+                                 int num_var,
+                                 int rk_stage,
+                                 bool download_to_host)
+{
+  if (!state0_list || !stage_data_list || !rhs_accum_list || num_var <= 0)
+    return 1;
+
+  const int nx = ex[0];
+  const int ny = ex[1];
+  const int nz = ex[2];
+  const int n = count_points(ex);
+  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
+  const size_t ptr_bytes = static_cast<size_t>(num_var) * sizeof(double *);
+  dim3 block(256);
+  dim3 grid(div_up(n, static_cast<int>(block.x)));
+
+  std::vector<const double *> host_state0_ptrs(num_var);
+  std::vector<double *> host_stage_ptrs(num_var);
+  std::vector<double *> host_rhs_ptrs(num_var);
+
+  bool ok = true;
+  for (int v = 0; v < num_var && ok; ++v)
+  {
+    const double *state0 = state0_list[v];
+    double *stage_data = stage_data_list[v];
+    double *rhs_accum = rhs_accum_list[v];
+    Rk4VarCache &cache = rk4_var_cache_map()[state0];
+
+    const bool refresh_state0 =
+        (rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz;
+    const bool refresh_rhs =
+        (rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
+    const bool need_stage_input = (rk_stage != 0);
+    double *stage_ptr = nullptr;
+    const double *mapped_state0_ptr = refresh_state0 ? bssn_gpu_find_device_buffer(state0) : cache.state0.ptr;
+    const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr;
+    const double *mapped_rhs_ptr = refresh_rhs ? bssn_gpu_find_device_buffer(rhs_accum) : cache.rhs.ptr;
+
+    if (refresh_state0 && !mapped_state0_ptr)
+      bssn_gpu_prepare_host_buffer(state0, n);
+    if (need_stage_input && !mapped_stage_ptr)
+      bssn_gpu_prepare_host_buffer(stage_data, n);
+    if (refresh_rhs && !mapped_rhs_ptr)
+      bssn_gpu_prepare_host_buffer(rhs_accum, n);
+
+    ok = (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
+         (!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
+    if (!ok)
+      break;
+
+    if (need_stage_input)
+    {
+      if (mapped_stage_ptr)
+      {
+        stage_ptr = const_cast<double *>(mapped_stage_ptr);
+      }
+      else
+      {
+        ok = copy_to_device_preferring_device(cache.stage, stage_data, bytes);
+        stage_ptr = cache.stage.ptr;
+      }
+    }
+    else
+    {
+      ok = ensure_capacity(cache.stage, bytes);
+      stage_ptr = cache.stage.ptr;
+    }
+    if (!ok)
+      break;
+
+    if (refresh_state0)
+    {
+      cache.host_state0 = state0;
+      cache.nx = nx;
+      cache.ny = ny;
+      cache.nz = nz;
+      bssn_gpu_register_device_buffer(state0, cache.state0.ptr);
+    }
+    if (refresh_rhs)
+    {
+      cache.host_rhs = rhs_accum;
+      cache.rhs_resident = true;
+      bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr);
+    }
+
+    host_state0_ptrs[v] = cache.state0.ptr;
+    host_stage_ptrs[v] = stage_ptr;
+    host_rhs_ptrs[v] = cache.rhs.ptr;
+  }
+
+  if (!ok)
+    return 1;
+
+  Rk4BatchCache &batch_cache = rk4_batch_cache();
+  ok = ensure_capacity(batch_cache.state0_ptrs, ptr_bytes) &&
+       ensure_capacity(batch_cache.stage_ptrs, ptr_bytes) &&
+       ensure_capacity(batch_cache.rhs_ptrs, ptr_bytes);
+  if (!ok)
+    return 1;
+
+  cudaError_t err = cudaMemcpy(batch_cache.state0_ptrs.ptr, &host_state0_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaMemcpy(H2D) batch state0 ptrs", err);
+    return 1;
+  }
+  err = cudaMemcpy(batch_cache.stage_ptrs.ptr, &host_stage_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaMemcpy(H2D) batch stage ptrs", err);
+    return 1;
+  }
+  err = cudaMemcpy(batch_cache.rhs_ptrs.ptr, &host_rhs_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error("cudaMemcpy(H2D) batch rhs ptrs", err);
+    return 1;
+  }
+
+  double dX = X[1] - X[0];
+  double dY = Y[1] - Y[0];
+  double dZ = Z[1] - Z[0];
+  const int no_symm = 0, octant = 2;
+  int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
+  int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
+  int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
+  int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
+  int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
+  int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);
+
+  int n_arg = n, nx_arg = nx, ny_arg = ny, nz_arg = nz;
+  int num_var_arg = num_var, rk_stage_arg = rk_stage;
+  void *args[] = {&n_arg, &nx_arg, &ny_arg, &nz_arg,
+                  &has_xmin, &has_ymin, &has_zmin,
+                  &has_xmax, &has_ymax, &has_zmax,
+                  &num_var_arg, &dT,
+                  &batch_cache.state0_ptrs.ptr,
+                  &batch_cache.stage_ptrs.ptr,
+                  &batch_cache.rhs_ptrs.ptr,
+                  &rk_stage_arg};
+  ok = launch_kernel(grid, block, (const void *)rk4_boundary_batch_kernel, args);
+
+  if (!ok)
+    return 1;
+
+  for (int v = 0; v < num_var; ++v)
+  {
+    bssn_gpu_register_device_buffer(stage_data_list[v], host_stage_ptrs[v]);
+    if (download_to_host)
+    {
+      err = cudaMemcpy(stage_data_list[v], host_stage_ptrs[v], bytes, cudaMemcpyDeviceToHost);
+      if (err != cudaSuccess)
+      {
+        report_cuda_error("cudaMemcpy(D2H) batch stage_data", err);
+        return 1;
+      }
+    }
+  }
+
+  return 0;
+}
+
 void bssn_cuda_release_rk4_caches()
 {
  std::unordered_map<const double *, Rk4VarCache> &cache_map = rk4_var_cache_map();
@@ -1248,6 +1542,9 @@ void bssn_cuda_release_rk4_caches()
    release_buffer(cache.rhs);
  }
  cache_map.clear();
+  release_buffer(rk4_batch_cache().state0_ptrs);
+  release_buffer(rk4_batch_cache().stage_ptrs);
+  release_buffer(rk4_batch_cache().rhs_ptrs);
 }

 void bssn_cuda_release_interp_caches()
@@ -22,8 +22,21 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                               int symmetry,
                               int lev,
                               int rk_stage,
+                               bool force_host_boundary_fix,
                               bool download_to_host = true);

+int bssn_cuda_rk4_boundary_batch(int *ex, double dT,
+                                 const double *X, const double *Y, const double *Z,
+                                 double xmin, double ymin, double zmin,
+                                 double xmax, double ymax, double zmax,
+                                 int symmetry,
+                                 const double *const *state0_list,
+                                 double *const *stage_data_list,
+                                 double *const *rhs_accum_list,
+                                 int num_var,
+                                 int rk_stage,
+                                 bool download_to_host = false);
+
 int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true);
 int bssn_cuda_download_buffer(int *ex, double *host_ptr);
 void bssn_cuda_release_rk4_caches();
@@ -4,7 +4,9 @@

 #include <algorithm>
 #include <cmath>
+#include <cstring>
 #include <cstdlib>
+#include <iomanip>
 #include <vector>

 #include "bssn_class.h"
@@ -12,18 +14,172 @@
 #include "bssn_gpu.h"
 #include "bssn_macro.h"

+namespace
+{
+enum StageProfileMetric
+{
+  STAGE_PROFILE_TOTAL = 0,
+  STAGE_PROFILE_RHS,
+  STAGE_PROFILE_RUN_STAGE,
+  STAGE_PROFILE_RUN_STAGE_DEVICE,
+  STAGE_PROFILE_RUN_STAGE_HOST_FIX,
+  STAGE_PROFILE_LOWERBOUND,
+  STAGE_PROFILE_ENSURE,
+  STAGE_PROFILE_DOWNLOAD,
+  STAGE_PROFILE_CLEAR_CACHE,
+  STAGE_PROFILE_SYNC_START,
+  STAGE_PROFILE_SYNC_FINISH,
+  STAGE_PROFILE_REFRESH,
+  STAGE_PROFILE_COUNT
+};
+
+static const int kStageProfileMaxLevels = 32;
+
+struct StageProfileStore
+{
+  bool env_checked;
+  bool enabled;
+  int calls[kStageProfileMaxLevels];
+  double metric[kStageProfileMaxLevels][STAGE_PROFILE_COUNT];
+};
+
+StageProfileStore &stage_profile_store()
+{
+  static StageProfileStore store = {};
+  return store;
+}
+
+bool stage_profile_enabled()
+{
+  StageProfileStore &store = stage_profile_store();
+  if (!store.env_checked)
+  {
+    const char *env = getenv("AMSS_GPU_STAGE_TIMING");
+    store.enabled = (env && env[0] && strcmp(env, "0") != 0);
+    store.env_checked = true;
+  }
+  return store.enabled;
+}
+
+void stage_profile_note_call(int lev)
+{
+  if (lev >= 0 && lev < kStageProfileMaxLevels)
+    stage_profile_store().calls[lev]++;
+}
+
+void stage_profile_add(int lev, StageProfileMetric metric, double seconds)
+{
+  if (lev >= 0 && lev < kStageProfileMaxLevels)
+    stage_profile_store().metric[lev][metric] += seconds;
+}
+
+const char *stage_profile_metric_name(StageProfileMetric metric)
+{
+  switch (metric)
+  {
+  case STAGE_PROFILE_TOTAL:
+    return "total";
+  case STAGE_PROFILE_RHS:
+    return "rhs";
+  case STAGE_PROFILE_RUN_STAGE:
+    return "run_stage";
+  case STAGE_PROFILE_RUN_STAGE_DEVICE:
+    return "run_stage_dev";
+  case STAGE_PROFILE_RUN_STAGE_HOST_FIX:
+    return "run_stage_host";
+  case STAGE_PROFILE_LOWERBOUND:
+    return "lower";
+  case STAGE_PROFILE_ENSURE:
+    return "ensure";
+  case STAGE_PROFILE_DOWNLOAD:
+    return "download";
+  case STAGE_PROFILE_CLEAR_CACHE:
+    return "clear_cache";
+  case STAGE_PROFILE_SYNC_START:
+    return "sync_start";
+  case STAGE_PROFILE_SYNC_FINISH:
+    return "sync_finish";
+  case STAGE_PROFILE_REFRESH:
+    return "refresh";
+  default:
+    return "unknown";
+  }
+}
+} // namespace
+
+void bssn_cuda_dump_stage_profile()
+{
+  if (!stage_profile_enabled())
+    return;
+
+  int myrank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+  StageProfileStore &store = stage_profile_store();
+  int global_calls_sum[kStageProfileMaxLevels] = {};
+  double global_metric_sum[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
+  double global_metric_max[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
+
+  MPI_Reduce(store.calls, global_calls_sum, kStageProfileMaxLevels, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+  MPI_Reduce(store.metric[0], global_metric_sum[0],
+             kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
+             MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+  MPI_Reduce(store.metric[0], global_metric_max[0],
+             kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
+             MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+
+  if (myrank != 0)
+    return;
+
+  cout << endl;
+  cout << " GPU stage timing summary (sum/max over MPI ranks) " << endl;
+  cout << " lev  calls";
+  for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
+    cout << "  " << setw(22) << stage_profile_metric_name(static_cast<StageProfileMetric>(metric));
+  cout << endl;
+
+  for (int lev = 0; lev < kStageProfileMaxLevels; ++lev)
+  {
+    if (global_calls_sum[lev] == 0)
+      continue;
+
+    cout << setw(4) << lev << "  " << setw(5) << global_calls_sum[lev];
+    for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
+    {
+      cout << "  "
+           << setw(10) << setprecision(6) << fixed << global_metric_sum[lev][metric]
+           << "/"
+           << setw(10) << setprecision(6) << fixed << global_metric_max[lev][metric];
+    }
+    cout << endl;
+  }
+  cout << endl;
+}
+
 void bssn_class::Step_MainPath_GPU(int lev, int YN)
 {
 #ifdef WithShell
 #error "Step_MainPath_GPU currently supports Patch grids only."
 #endif

+  const bool profile_enabled = stage_profile_enabled();
+  const double step_total_begin = profile_enabled ? MPI_Wtime() : 0.0;
+  if (profile_enabled)
+    stage_profile_note_call(lev);
+
  if (bssn_gpu_bind_process_device(myrank))
  {
    cerr << "GPU device bind failure on MPI rank " << myrank << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
-  bssn_gpu_clear_cached_device_buffers();
+  if (profile_enabled)
+  {
+    const double t0 = MPI_Wtime();
+    bssn_gpu_clear_cached_device_buffers();
+    stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
+  }
+  else
+    bssn_gpu_clear_cached_device_buffers();

  setpbh(BH_num, Porg0, Mass, BH_num_input);

@@ -62,6 +218,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
  int iter_count = 0;
  int pre = 0, cor = 1;
  int ERROR = 0;
+  const bool keep_stage_sync_on_device = (RPS == 1) && (MAPBH == 1) && (REGLEV == 0);

  auto run_stage_on_block =
      [&](Block *cg, Patch *patch, MyList<var> *state0_list,
@@ -71,9 +228,27 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
        MyList<var> *varlb = boundary_src_list;
        MyList<var> *varls = stage_data_list;
        MyList<var> *varlr = rhs_list;
+        std::vector<const double *> batch_state0;
+        std::vector<double *> batch_stage;
+        std::vector<double *> batch_rhs;

        while (varl0)
        {
+          const bool force_host_boundary_fix = false;
+          const bool can_batch_device_path = (lev > 0) && !force_host_boundary_fix;
+          if (can_batch_device_path)
+          {
+            batch_state0.push_back(cg->fgfs[varl0->data->sgfn]);
+            batch_stage.push_back(cg->fgfs[varls->data->sgfn]);
+            batch_rhs.push_back(cg->fgfs[varlr->data->sgfn]);
+            varl0 = varl0->next;
+            varlb = varlb->next;
+            varls = varls->next;
+            varlr = varlr->next;
+            continue;
+          }
+
+          const double var_begin = profile_enabled ? MPI_Wtime() : 0.0;
          if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev,
                                         cg->X[0], cg->X[1], cg->X[2],
                                         patch->bbox[0], patch->bbox[1], patch->bbox[2],
@@ -86,7 +261,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
                                         cg->fgfs[varlr->data->sgfn],
                                         varl0->data->propspeed,
                                         varl0->data->SoA,
-                                         Symmetry, lev, rk_stage, false))
+                                         Symmetry, lev, rk_stage,
+                                         force_host_boundary_fix, false))
          {
            cerr << "GPU rk4/boundary failure: lev=" << lev
                 << " rk_stage=" << rk_stage
@@ -97,18 +273,59 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
            ERROR = 1;
            break;
          }
+          if (profile_enabled)
+          {
+            stage_profile_add(lev,
+                              force_host_boundary_fix ? STAGE_PROFILE_RUN_STAGE_HOST_FIX
+                                                      : STAGE_PROFILE_RUN_STAGE_DEVICE,
+                              MPI_Wtime() - var_begin);
+          }
          varl0 = varl0->next;
          varlb = varlb->next;
          varls = varls->next;
          varlr = varlr->next;
        }
+
+        if (!ERROR && !batch_state0.empty())
+        {
+          const double batch_begin = profile_enabled ? MPI_Wtime() : 0.0;
+          if (bssn_cuda_rk4_boundary_batch(cg->shape, dT_lev,
+                                           cg->X[0], cg->X[1], cg->X[2],
+                                           patch->bbox[0], patch->bbox[1], patch->bbox[2],
+                                           patch->bbox[3], patch->bbox[4], patch->bbox[5],
+                                           Symmetry,
+                                           &batch_state0[0],
+                                           &batch_stage[0],
+                                           &batch_rhs[0],
+                                           static_cast<int>(batch_state0.size()),
+                                           rk_stage, false))
+          {
+            cerr << "GPU rk4/boundary batch failure: lev=" << lev
+                 << " rk_stage=" << rk_stage
+                 << " vars=" << batch_state0.size()
+                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+            ERROR = 1;
+          }
+          else if (profile_enabled)
+          {
+            stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE_DEVICE, MPI_Wtime() - batch_begin);
+          }
+        }
      };

  auto stage_download_var_list =
-      [&](Block *cg, MyList<var> *var_list) {
+      [&](Block *cg, MyList<var> *var_list, bool skip_unmapped) {
        while (var_list)
        {
-          if (bssn_cuda_download_buffer(cg->shape, cg->fgfs[var_list->data->sgfn]))
+          double *host_ptr = cg->fgfs[var_list->data->sgfn];
+          if (skip_unmapped && !bssn_gpu_find_device_buffer(host_ptr))
+          {
+            var_list = var_list->next;
+            continue;
+          }
+          if (bssn_cuda_download_buffer(cg->shape, host_ptr))
          {
            cerr << "GPU stage download failure: lev=" << lev
                 << " var=" << var_list->data->name
@@ -123,7 +340,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
      };

  auto stage_download_patch_list =
-      [&](MyList<var> *var_list) {
+      [&](MyList<var> *var_list, bool skip_unmapped) {
        MyList<Patch> *patch_it = GH->PatL[lev];
        while (patch_it)
        {
@@ -132,7 +349,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
          {
            Block *cg = block_it->data;
            if (myrank == cg->rank)
-              stage_download_var_list(cg, var_list);
+              stage_download_var_list(cg, var_list, skip_unmapped);

            if (block_it == patch_it->data->ble)
              break;
@@ -341,11 +558,22 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
      Block *cg = BP->data;
      if (myrank == cg->rank)
      {
+        double t0 = 0.0;
+        if (profile_enabled)
+          t0 = MPI_Wtime();
        if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
          ERROR = 1;
+        if (profile_enabled)
+          stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);

+        if (profile_enabled)
+          t0 = MPI_Wtime();
        run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
+        if (profile_enabled)
+          stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);

+        if (profile_enabled)
+          t0 = MPI_Wtime();
        if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny, false))
        {
          cerr << "GPU lowerbound failure: lev=" << lev
@@ -356,6 +584,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
               << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
          ERROR = 1;
        }
+        if (profile_enabled)
+          stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
      }
      if (BP == Pp->data->ble)
        break;
@@ -366,9 +596,23 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)

  if (!ERROR)
  {
-    stage_download_patch_list(SynchList_pre);
-    if (!ERROR)
-      bssn_gpu_clear_cached_device_buffers();
+    if (!keep_stage_sync_on_device)
+    {
+      double t0 = 0.0;
+      if (profile_enabled)
+        t0 = MPI_Wtime();
+      stage_download_patch_list(SynchList_pre, false);
+      if (profile_enabled)
+        stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
+      if (!ERROR)
+      {
+        if (profile_enabled)
+          t0 = MPI_Wtime();
+        bssn_gpu_clear_cached_device_buffers();
+        if (profile_enabled)
+          stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
+      }
+    }
  }

  MPI_Request err_req_pre;
@@ -378,10 +622,35 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
  }

  Parallel::AsyncSyncState async_pre;
-  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
-  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry, true);
-  if (!ERROR)
-    refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
+  if (profile_enabled)
+  {
+    const double t0 = MPI_Wtime();
+    Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
+    stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
+  }
+  else
+    Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
+  if (profile_enabled)
+  {
+    const double t0 = MPI_Wtime();
+    Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
+                          !keep_stage_sync_on_device);
+    stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
+  }
+  else
+    Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
+                          !keep_stage_sync_on_device);
+  if (!ERROR && !keep_stage_sync_on_device)
+  {
+    if (profile_enabled)
+    {
+      const double t0 = MPI_Wtime();
+      refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
+      stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
+    }
+    else
+      refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
+  }

  MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
  if (ERROR)
@@ -433,12 +702,28 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
        Block *cg = BP->data;
        if (myrank == cg->rank)
        {
+          double t0 = 0.0;
+          if (profile_enabled)
+            t0 = MPI_Wtime();
          ensure_stage_device_var_list(cg, SynchList_pre);
+          if (profile_enabled)
+            stage_profile_add(lev, STAGE_PROFILE_ENSURE, MPI_Wtime() - t0);
+
+          if (profile_enabled)
+            t0 = MPI_Wtime();
          if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
            ERROR = 1;
+          if (profile_enabled)
+            stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);

+          if (profile_enabled)
+            t0 = MPI_Wtime();
          run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
+          if (profile_enabled)
+            stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);

+          if (profile_enabled)
+            t0 = MPI_Wtime();
          if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny, false))
          {
          cerr << "GPU lowerbound failure: lev=" << lev
@@ -449,6 +734,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
+          if (profile_enabled)
+            stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
        }

        if (BP == Pp->data->ble)
@@ -460,9 +747,23 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)

    if (!ERROR)
    {
-      stage_download_patch_list(SynchList_cor);
-      if (!ERROR)
-        bssn_gpu_clear_cached_device_buffers();
+      if (!keep_stage_sync_on_device)
+      {
+        double t0 = 0.0;
+        if (profile_enabled)
+          t0 = MPI_Wtime();
+        stage_download_patch_list(SynchList_cor, false);
+        if (profile_enabled)
+          stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
+        if (!ERROR)
+        {
+          if (profile_enabled)
+            t0 = MPI_Wtime();
+          bssn_gpu_clear_cached_device_buffers();
+          if (profile_enabled)
+            stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
+        }
+      }
    }

    MPI_Request err_req_cor;
@@ -472,10 +773,35 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
    }

    Parallel::AsyncSyncState async_cor;
-    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
-    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry, true);
-    if (!ERROR && iter_count < 3)
-      refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
+    if (profile_enabled)
+    {
+      const double t0 = MPI_Wtime();
+      Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
+      stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
+    }
+    else
+      Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
+    if (profile_enabled)
+    {
+      const double t0 = MPI_Wtime();
+      Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
+                            !keep_stage_sync_on_device);
+      stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
+    }
+    else
+      Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
+                            !keep_stage_sync_on_device);
+    if (!ERROR && !keep_stage_sync_on_device && iter_count < 3)
+    {
+      if (profile_enabled)
+      {
+        const double t0 = MPI_Wtime();
+        refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
+        stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
+      }
+      else
+        refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
+    }

    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
@@ -545,8 +871,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
  RestrictProlong(lev, YN, BB);
 #endif

-  bssn_gpu_clear_cached_device_buffers();
-
  Pp = GH->PatL[lev];
  while (Pp)
  {
@@ -563,6 +887,28 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
    Pp = Pp->next;
  }

+  if (!ERROR && keep_stage_sync_on_device)
+  {
+    MyList<var> *final_host_lists[] = {StateList, OldStateList, SynchList_cor, SynchList_pre};
+    const int final_host_list_count = sizeof(final_host_lists) / sizeof(final_host_lists[0]);
+    for (int list_i = 0; list_i < final_host_list_count && !ERROR; ++list_i)
+    {
+      const double t0 = profile_enabled ? MPI_Wtime() : 0.0;
+      stage_download_patch_list(final_host_lists[list_i], true);
+      if (profile_enabled)
+        stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
+    }
+  }
+
+  if (profile_enabled)
+  {
+    const double t0 = MPI_Wtime();
+    bssn_gpu_clear_cached_device_buffers();
+    stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
+  }
+  else
+    bssn_gpu_clear_cached_device_buffers();
+
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
@@ -572,6 +918,9 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
      Porg0[ithBH][2] = Porg1[ithBH][2];
    }
  }
+
+  if (profile_enabled)
+    stage_profile_add(lev, STAGE_PROFILE_TOTAL, MPI_Wtime() - step_total_begin);
 }

 #endif