Stabilize GPU output path and MPI sync

This commit is contained in:
2026-04-09 10:57:49 +08:00
parent 4e3946a4f0
commit 49409645c0
8 changed files with 748 additions and 334 deletions

View File

@@ -1,4 +1,5 @@
#include "bssn_cuda_ops.h"
#include "bssn_gpu.h"
#include <cmath>
#include <cstdio>
@@ -79,6 +80,29 @@ inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes)
return true;
}
// Fill the cached device buffer `dst` with `bytes` bytes belonging to the host
// array `src`, preferring a device-resident copy of that array when one is
// registered.
//
// `src` is looked up via bssn_gpu_find_device_buffer(); a non-null result is
// used as the copy source with a device-to-device transfer, otherwise the
// bytes are uploaded from `src` itself with a host-to-device transfer.
//
// Returns true on success. Returns false when ensure_capacity() fails or when
// cudaMemcpy reports an error (the error is passed to report_cuda_error()).
inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *src, size_t bytes)
{
    if (!ensure_capacity(dst, bytes))
        return false;

    const double *device_src = bssn_gpu_find_device_buffer(src);

    // The registered device mirror may be this very cache buffer (callers
    // register cache buffers for their host arrays after a step). The data is
    // then already in place: skip the copy instead of issuing a D2D memcpy
    // whose source and destination ranges coincide.
    if (device_src != nullptr && device_src == dst.ptr)
        return true;

    // NOTE(review): if ensure_capacity() reallocated `dst` and the registered
    // mirror pointed at dst's previous allocation, `device_src` would be stale
    // here -- confirm how ensure_capacity and the registry interact.

    cudaMemcpyKind kind = cudaMemcpyHostToDevice;
    const void *copy_src = src;
    if (device_src != nullptr)
    {
        copy_src = device_src;
        kind = cudaMemcpyDeviceToDevice;
    }

    cudaError_t err = cudaMemcpy(dst.ptr, copy_src, bytes, kind);
    if (err != cudaSuccess)
    {
        report_cuda_error(kind == cudaMemcpyDeviceToDevice ? "cudaMemcpy(D2D)" : "cudaMemcpy(H2D)", err);
        return false;
    }
    return true;
}
__global__ void enforce_ga_kernel(int n,
double *dxx, double *gxy, double *gxz,
double *dyy, double *gyz, double *dzz,
@@ -374,7 +398,7 @@ __global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
}
}
inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **args)
inline bool launch_kernel(dim3 grid, dim3 block, const void *kernel, void **args)
{
cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
if (err != cudaSuccess)
@@ -382,10 +406,10 @@ inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **ar
report_cuda_error("cudaLaunchKernel", err);
return false;
}
err = cudaDeviceSynchronize();
err = cudaPeekAtLastError();
if (err != cudaSuccess)
{
report_cuda_error("cudaDeviceSynchronize", err);
report_cuda_error("cudaPeekAtLastError", err);
return false;
}
return true;
@@ -411,18 +435,18 @@ int bssn_cuda_enforce_ga(int *ex,
dim3 block(256);
dim3 grid(div_up(n, static_cast<int>(block.x)));
bool ok = copy_to_device(cache.dxx, dxx, bytes) &&
copy_to_device(cache.gxy, gxy, bytes) &&
copy_to_device(cache.gxz, gxz, bytes) &&
copy_to_device(cache.dyy, dyy, bytes) &&
copy_to_device(cache.gyz, gyz, bytes) &&
copy_to_device(cache.dzz, dzz, bytes) &&
copy_to_device(cache.Axx, Axx, bytes) &&
copy_to_device(cache.Axy, Axy, bytes) &&
copy_to_device(cache.Axz, Axz, bytes) &&
copy_to_device(cache.Ayy, Ayy, bytes) &&
copy_to_device(cache.Ayz, Ayz, bytes) &&
copy_to_device(cache.Azz, Azz, bytes);
bool ok = copy_to_device_preferring_device(cache.dxx, dxx, bytes) &&
copy_to_device_preferring_device(cache.gxy, gxy, bytes) &&
copy_to_device_preferring_device(cache.gxz, gxz, bytes) &&
copy_to_device_preferring_device(cache.dyy, dyy, bytes) &&
copy_to_device_preferring_device(cache.gyz, gyz, bytes) &&
copy_to_device_preferring_device(cache.dzz, dzz, bytes) &&
copy_to_device_preferring_device(cache.Axx, Axx, bytes) &&
copy_to_device_preferring_device(cache.Axy, Axy, bytes) &&
copy_to_device_preferring_device(cache.Axz, Axz, bytes) &&
copy_to_device_preferring_device(cache.Ayy, Ayy, bytes) &&
copy_to_device_preferring_device(cache.Ayz, Ayz, bytes) &&
copy_to_device_preferring_device(cache.Azz, Azz, bytes);
if (ok)
{
@@ -432,7 +456,7 @@ int bssn_cuda_enforce_ga(int *ex,
double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr;
void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
&d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args);
ok = launch_kernel(grid, block, (const void *)enforce_ga_kernel, args);
}
if (ok)
@@ -527,10 +551,10 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
(rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
ok = ok &&
(!refresh_state0 || copy_to_device(cache.state0, state0, bytes)) &&
(!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
(!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
(!need_stage_input || copy_to_device(cache.stage, stage_data, bytes)) &&
(!refresh_rhs || copy_to_device(cache.rhs, rhs_accum, bytes));
(!need_stage_input || copy_to_device_preferring_device(cache.stage, stage_data, bytes)) &&
(!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
if (ok && !need_stage_input)
ok = ensure_capacity(cache.stage, bytes);
@@ -539,11 +563,15 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
return 1;
if (refresh_state0)
{
cache.host_state0 = state0;
bssn_gpu_register_device_buffer(state0, cache.state0.ptr);
}
if (refresh_rhs)
{
cache.host_rhs = rhs_accum;
cache.rhs_resident = true;
bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr);
}
double dX = X[1] - X[0];
@@ -582,14 +610,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
&bam_source, &bam_target,
&soa0, &soa1, &soa2,
&symmetry};
ok = launch_and_sync(grid, block, (const void *)sommerfeld_bam_kernel, args);
ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
}
if (ok)
{
double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr;
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args);
ok = launch_kernel(grid, block, (const void *)rk4_kernel, args);
}
if (ok && lev > 0)
@@ -599,11 +627,13 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
&has_xmin, &has_ymin, &has_zmin,
&has_xmax, &has_ymax, &has_zmax,
&d_state0, &d_stage};
ok = launch_and_sync(grid, block, (const void *)copy_physical_boundary_kernel, args);
ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
}
if (ok)
{
bssn_gpu_register_device_buffer(stage_data, cache.stage.ptr);
cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
ok = err == cudaSuccess;
@@ -621,13 +651,13 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
dim3 block(256);
dim3 grid(div_up(n, static_cast<int>(block.x)));
bool ok = copy_to_device(d_chi, chi, bytes);
bool ok = copy_to_device_preferring_device(d_chi, chi, bytes);
if (ok)
{
double *ptr = d_chi.ptr;
void *args[] = {&n, &ptr, &tinny};
ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args);
ok = launch_kernel(grid, block, (const void *)lowerbound_kernel, args);
}
if (ok)