Fix GPU RK4 boundary and sync correctness
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
#include "bssn_cuda_ops.h"
|
||||
#include "bssn_gpu.h"
|
||||
#include "rungekutta4_rout.h"
|
||||
#include "sommerfeld_rout.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
@@ -262,6 +264,23 @@ inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *sr
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool sync_host_from_mapped_device(double *host_ptr, int count, const char *label)
|
||||
{
|
||||
const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
|
||||
if (!device_ptr)
|
||||
return true;
|
||||
|
||||
bssn_gpu_prepare_host_buffer(host_ptr, count);
|
||||
const size_t bytes = static_cast<size_t>(count) * sizeof(double);
|
||||
cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error(label, err);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool copy_region_to_padded_stage(CachedBuffer &dst,
|
||||
const double *src,
|
||||
const int src_shape[3],
|
||||
@@ -1003,6 +1022,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
double xmin, double ymin, double zmin,
|
||||
double xmax, double ymax, double zmax,
|
||||
const double *state0,
|
||||
const double *phi_field,
|
||||
const double *lap_field,
|
||||
const double *boundary_src,
|
||||
double *stage_data,
|
||||
double *rhs_accum,
|
||||
@@ -1145,7 +1166,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
|
||||
}
|
||||
|
||||
if (ok)
|
||||
if (ok && lev == 0)
|
||||
{
|
||||
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
|
||||
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
|
||||
@@ -1154,12 +1175,47 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
|
||||
if (ok && lev > 0)
|
||||
{
|
||||
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
|
||||
void *args[] = {&nx, &ny, &nz,
|
||||
&has_xmin, &has_ymin, &has_zmin,
|
||||
&has_xmax, &has_ymax, &has_zmax,
|
||||
&d_state0, &d_stage};
|
||||
ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
|
||||
double *host_state0 = const_cast<double *>(state0);
|
||||
double *host_phi = const_cast<double *>(phi_field);
|
||||
double *host_lap = const_cast<double *>(lap_field);
|
||||
double *host_rhs = rhs_accum;
|
||||
|
||||
ok = sync_host_from_mapped_device(host_state0, n, "cudaMemcpy(D2H) state0") &&
|
||||
sync_host_from_mapped_device(host_phi, n, "cudaMemcpy(D2H) phi_field") &&
|
||||
sync_host_from_mapped_device(host_lap, n, "cudaMemcpy(D2H) lap_field") &&
|
||||
sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum");
|
||||
if (ok)
|
||||
{
|
||||
bssn_gpu_prepare_host_buffer(stage_data, n);
|
||||
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (ok)
|
||||
{
|
||||
int rk_stage_host = rk_stage;
|
||||
int cor = 1;
|
||||
f_rungekutta4_rout(ex, dT, host_state0, stage_data, host_rhs, rk_stage_host);
|
||||
f_sommerfeld_rout(ex,
|
||||
const_cast<double *>(X), const_cast<double *>(Y), const_cast<double *>(Z),
|
||||
xmin, ymin, zmin, xmax, ymax, zmax,
|
||||
dT,
|
||||
host_phi, host_lap,
|
||||
host_state0, stage_data,
|
||||
const_cast<double *>(SoA),
|
||||
symmetry, cor);
|
||||
|
||||
cudaError_t err = cudaMemcpy(stage_ptr, stage_data, bytes, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(H2D) stage_data sommerfeld", err);
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ok)
|
||||
|
||||
Reference in New Issue
Block a user