Fix GPU RK4 boundary and sync correctness

This commit is contained in:
2026-04-12 12:13:47 +08:00
parent b78874ef21
commit d9287ea530
4 changed files with 134 additions and 30 deletions

View File

@@ -1,5 +1,7 @@
#include "bssn_cuda_ops.h"
#include "bssn_gpu.h"
#include "rungekutta4_rout.h"
#include "sommerfeld_rout.h"
#include <cmath>
#include <cstdio>
@@ -262,6 +264,23 @@ inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *sr
return true;
}
inline bool sync_host_from_mapped_device(double *host_ptr, int count, const char *label)
{
const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
if (!device_ptr)
return true;
bssn_gpu_prepare_host_buffer(host_ptr, count);
const size_t bytes = static_cast<size_t>(count) * sizeof(double);
cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
report_cuda_error(label, err);
return false;
}
return true;
}
inline bool copy_region_to_padded_stage(CachedBuffer &dst,
const double *src,
const int src_shape[3],
@@ -1003,6 +1022,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
double xmin, double ymin, double zmin,
double xmax, double ymax, double zmax,
const double *state0,
const double *phi_field,
const double *lap_field,
const double *boundary_src,
double *stage_data,
double *rhs_accum,
@@ -1145,7 +1166,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
}
if (ok)
if (ok && lev == 0)
{
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
@@ -1154,12 +1175,47 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
if (ok && lev > 0)
{
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
void *args[] = {&nx, &ny, &nz,
&has_xmin, &has_ymin, &has_zmin,
&has_xmax, &has_ymax, &has_zmax,
&d_state0, &d_stage};
ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
double *host_state0 = const_cast<double *>(state0);
double *host_phi = const_cast<double *>(phi_field);
double *host_lap = const_cast<double *>(lap_field);
double *host_rhs = rhs_accum;
ok = sync_host_from_mapped_device(host_state0, n, "cudaMemcpy(D2H) state0") &&
sync_host_from_mapped_device(host_phi, n, "cudaMemcpy(D2H) phi_field") &&
sync_host_from_mapped_device(host_lap, n, "cudaMemcpy(D2H) lap_field") &&
sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum");
if (ok)
{
bssn_gpu_prepare_host_buffer(stage_data, n);
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
report_cuda_error("cudaMemcpy(D2H) stage_data", err);
ok = false;
}
}
if (ok)
{
int rk_stage_host = rk_stage;
int cor = 1;
f_rungekutta4_rout(ex, dT, host_state0, stage_data, host_rhs, rk_stage_host);
f_sommerfeld_rout(ex,
const_cast<double *>(X), const_cast<double *>(Y), const_cast<double *>(Z),
xmin, ymin, zmin, xmax, ymax, zmax,
dT,
host_phi, host_lap,
host_state0, stage_data,
const_cast<double *>(SoA),
symmetry, cor);
cudaError_t err = cudaMemcpy(stage_ptr, stage_data, bytes, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
report_cuda_error("cudaMemcpy(H2D) stage_data sommerfeld", err);
ok = false;
}
}
}
if (ok)