#include "bssn_cuda_ops.h" #include #include #include namespace { inline void report_cuda_error(const char *where, cudaError_t err) { if (err != cudaSuccess) std::fprintf(stderr, "CUDA error at %s: %s\n", where, cudaGetErrorString(err)); } inline int count_points(const int ex[3]) { return ex[0] * ex[1] * ex[2]; } inline int div_up(int a, int b) { return (a + b - 1) / b; } struct DeviceArrays { double *x = nullptr; double *y = nullptr; double *z = nullptr; double *a = nullptr; double *b = nullptr; double *c = nullptr; double *d = nullptr; }; struct CachedBuffer { double *ptr = nullptr; size_t capacity = 0; }; inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes) { if (bytes <= buffer.capacity && buffer.ptr) return true; if (buffer.ptr) { cudaError_t free_err = cudaFree(buffer.ptr); if (free_err != cudaSuccess) report_cuda_error("cudaFree", free_err); buffer.ptr = nullptr; buffer.capacity = 0; } cudaError_t err = cudaMalloc(&buffer.ptr, bytes); if (err != cudaSuccess) { report_cuda_error("cudaMalloc", err); return false; } buffer.capacity = bytes; return true; } inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes) { if (!ensure_capacity(dst, bytes)) return false; cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice); if (err != cudaSuccess) { report_cuda_error("cudaMemcpy(H2D)", err); return false; } return true; } __global__ void enforce_ga_kernel(int n, double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz) { const double one = 1.0; const double two = 2.0; const double one_third = 1.0 / 3.0; for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) { double lgxx = dxx[idx] + one; double lgyy = dyy[idx] + one; double lgzz = dzz[idx] + one; double lgxy = gxy[idx]; double lgxz = gxz[idx]; double lgyz = gyz[idx]; double det = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz - lgxz * lgyy * lgxz - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz; double scale = 1.0 / cbrt(det); lgxx *= scale; lgxy *= scale; lgxz *= scale; lgyy *= scale; lgyz *= scale; lgzz *= scale; dxx[idx] = lgxx - one; gxy[idx] = lgxy; gxz[idx] = lgxz; dyy[idx] = lgyy - one; gyz[idx] = lgyz; dzz[idx] = lgzz - one; double gupxx = (lgyy * lgzz - lgyz * lgyz); double gupxy = -(lgxy * lgzz - lgyz * lgxz); double gupxz = (lgxy * lgyz - lgyy * lgxz); double gupyy = (lgxx * lgzz - lgxz * lgxz); double gupyz = -(lgxx * lgyz - lgxy * lgxz); double gupzz = (lgxx * lgyy - lgxy * lgxy); double trA = gupxx * Axx[idx] + gupyy * Ayy[idx] + gupzz * Azz[idx] + two * (gupxy * Axy[idx] + gupxz * Axz[idx] + gupyz * Ayz[idx]); Axx[idx] -= one_third * lgxx * trA; Axy[idx] -= one_third * lgxy * trA; Axz[idx] -= one_third * lgxz * trA; Ayy[idx] -= one_third * lgyy * trA; Ayz[idx] -= one_third * lgyz * trA; Azz[idx] -= one_third * lgzz * trA; } } __device__ inline int index3(int i, int j, int k, int nx, int ny) { return i + j * nx + k * nx * ny; } __device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k, int nx, int ny, int nz, const double soa[3], int symmetry) { double sign = 1.0; if (i < 0) { i = -i - 1; sign *= soa[0]; } if (j < 0) { j = -j - 1; sign *= soa[1]; } if (k < 0) { k = -k - 1; sign *= soa[2]; } if (i >= nx) i = nx - 1; if (j >= ny) j = ny - 1; if (k >= nz) k = nz - 1; (void)symmetry; return sign * f[index3(i, j, k, nx, ny)]; } __global__ void rk4_kernel(int n, double dT, const double *f0, double *f1, double *rhs, int stage) { const double half = 0.5; const double one_sixth = 1.0 / 6.0; for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) { if (stage == 0) { f1[idx] = f0[idx] + half * dT * rhs[idx]; } else if (stage == 1) { rhs[idx] += 2.0 * f1[idx]; f1[idx] = f0[idx] + half * dT * f1[idx]; } else if (stage == 2) { rhs[idx] += 2.0 * f1[idx]; f1[idx] = f0[idx] + dT * f1[idx]; } else { f1[idx] = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]); } } } __global__ void lowerbound_kernel(int n, double *chi, double tinny) { for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) { if (chi[idx] < tinny) chi[idx] = tinny; } } __global__ void copy_physical_boundary_kernel(int nx, int ny, int nz, int has_xmin, int has_ymin, int has_zmin, int has_xmax, int has_ymax, int has_zmax, const double *src, double *dst) { const int n = nx * ny * nz; for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) { const int plane = nx * ny; const int k = idx / plane; const int rem = idx - k * plane; const int j = rem / nx; const int i = rem - j * nx; if ((has_xmin && i == 0) || (has_xmax && i == nx - 1) || (has_ymin && j == 0) || (has_ymax && j == ny - 1) || (has_zmin && k == 0) || (has_zmax && k == nz - 1)) { dst[idx] = src[idx]; } } } __global__ void sommerfeld_bam_kernel(int nx, int ny, int nz, const double *X, const double *Y, const double *Z, double xmin, double ymin, double zmin, double xmax, double ymax, double zmax, int has_xmin, int has_ymin, int has_zmin, int has_xmax, int has_ymax, int has_zmax, int imin, int jmin, int kmin, double propspeed, const double *f0, double *target, double soa0, double soa1, double soa2, int symmetry) { const double one = 1.0; const double two = 2.0; const int n = nx * ny * nz; const double dX = X[1] - X[0]; const double dY = Y[1] - Y[0]; const double dZ = Z[1] - Z[0]; const double d2dx = one / (two * dX); const double d2dy = one / (two * dY); const double d2dz = one / (two * dZ); const double soa[3] = {soa0, soa1, soa2}; for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) { const int plane = nx * ny; const int k = idx / plane; const int rem = idx - k * plane; const int j = rem / nx; const int i = rem - j * nx; const bool on_boundary = (has_xmin && i == 0) || (has_xmax && i == nx - 1) || (has_ymin && j == 0) || (has_ymax && j == ny - 1) || (has_zmin && k == 0) || (has_zmax && k == nz - 1); if (!on_boundary) continue; const double radius = sqrt(X[i] * X[i] + Y[j] * Y[j] + Z[k] * Z[k]); if (radius == 0.0) continue; const double wx = propspeed * X[i] / radius; const double wy = propspeed * Y[j] / radius; const double wz = propspeed * Z[k] / radius; double fx = 0.0; double fy = 0.0; double fz = 0.0; if (wx > 0.0) { if (i - 2 >= imin) fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry) - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry) + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry)); else if (i - 1 >= imin) fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry) + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)); else fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry) + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry) - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)); } else if (wx < 0.0) { if (i + 2 <= nx - 1) fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry) + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry) - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)); else if (i + 1 <= nx - 1) fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry) + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)); else fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry) - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry) + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry)); } if (wy > 0.0) { if (j - 2 >= jmin) fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry) - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry) + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry)); else if (j - 1 >= jmin) fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry) + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)); else fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry) + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry) - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)); } else if (wy < 0.0) { if (j + 2 <= ny - 1) fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry) + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry) - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)); else if (j + 1 <= ny - 1) fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry) + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)); else fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry) - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry) + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry)); } if (wz > 0.0) { if (k - 2 >= kmin) fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry) - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry) + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry)); else if (k - 1 >= kmin) fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry) + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)); else fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry) + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry) - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)); } else if (wz < 0.0) { if (k + 2 <= nz - 1) fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry) + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry) - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)); else if (k + 1 <= nz - 1) fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry) + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)); else fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry) - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry) + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry)); } target[idx] = -propspeed * (fx * X[i] + fy * Y[j] + fz * Z[k] + f0[idx]) / radius; } } inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **args) { cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr); if (err != cudaSuccess) { report_cuda_error("cudaLaunchKernel", err); return false; } err = cudaDeviceSynchronize(); if (err != cudaSuccess) { report_cuda_error("cudaDeviceSynchronize", err); return false; } return true; } } // namespace int bssn_cuda_enforce_ga(int *ex, double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz) { struct EnforceGaCache { CachedBuffer dxx, gxy, gxz, dyy, gyz, dzz; CachedBuffer Axx, Axy, Axz, Ayy, Ayz, Azz; }; static thread_local EnforceGaCache cache; int n = count_points(ex); const size_t bytes = static_cast(n) * sizeof(double); dim3 block(256); dim3 grid(div_up(n, static_cast(block.x))); bool ok = copy_to_device(cache.dxx, dxx, bytes) && copy_to_device(cache.gxy, gxy, bytes) && copy_to_device(cache.gxz, gxz, bytes) && copy_to_device(cache.dyy, dyy, bytes) && copy_to_device(cache.gyz, gyz, bytes) && copy_to_device(cache.dzz, dzz, bytes) && copy_to_device(cache.Axx, Axx, bytes) && copy_to_device(cache.Axy, Axy, bytes) && copy_to_device(cache.Axz, Axz, bytes) && copy_to_device(cache.Ayy, Ayy, bytes) && copy_to_device(cache.Ayz, Ayz, bytes) && copy_to_device(cache.Azz, Azz, bytes); if (ok) { double *d_dxx = cache.dxx.ptr, *d_gxy = cache.gxy.ptr, *d_gxz = cache.gxz.ptr; double *d_dyy = cache.dyy.ptr, *d_gyz = cache.gyz.ptr, *d_dzz = cache.dzz.ptr; double *d_Axx = cache.Axx.ptr, *d_Axy = cache.Axy.ptr, *d_Axz = cache.Axz.ptr; double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr; void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz, &d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz}; ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args); } if (ok) { cudaError_t err = cudaMemcpy(dxx, cache.dxx.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dxx", err); ok = err == cudaSuccess; if (ok) { err = cudaMemcpy(gxy, cache.gxy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; } if (ok) { err = cudaMemcpy(gxz, cache.gxz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; } if (ok) { err = cudaMemcpy(dyy, cache.dyy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; } if (ok) { err = cudaMemcpy(gyz, cache.gyz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; } if (ok) { err = cudaMemcpy(dzz, cache.dzz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; } if (ok) { err = cudaMemcpy(Axx, cache.Axx.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; } if (ok) { err = cudaMemcpy(Axy, cache.Axy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; } if (ok) { err = cudaMemcpy(Axz, cache.Axz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; } if (ok) { err = cudaMemcpy(Ayy, cache.Ayy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; } if (ok) { err = cudaMemcpy(Ayz, cache.Ayz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; } if (ok) { err = cudaMemcpy(Azz, cache.Azz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; } } return ok ? 0 : 1; } int bssn_cuda_rk4_boundary_var(int *ex, double dT, const double *X, const double *Y, const double *Z, double xmin, double ymin, double zmin, double xmax, double ymax, double zmax, const double *state0, const double *boundary_src, double *stage_data, double *rhs_accum, double propspeed, const double SoA[3], int symmetry, int lev, int rk_stage) { struct Rk4BoundaryCache { CachedBuffer X, Y, Z; CachedBuffer state0, boundary, stage, rhs; const double *host_X = nullptr; const double *host_Y = nullptr; const double *host_Z = nullptr; int nx = 0; int ny = 0; int nz = 0; }; static thread_local Rk4BoundaryCache cache; int nx = ex[0]; int ny = ex[1]; int nz = ex[2]; int n = count_points(ex); const size_t bytes = static_cast(n) * sizeof(double); const size_t bytes_x = static_cast(nx) * sizeof(double); const size_t bytes_y = static_cast(ny) * sizeof(double); const size_t bytes_z = static_cast(nz) * sizeof(double); dim3 block(256); dim3 grid(div_up(n, static_cast(block.x))); bool ok = true; if (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z || cache.nx != nx || cache.ny != ny || cache.nz != nz) { ok = copy_to_device(cache.X, X, bytes_x) && copy_to_device(cache.Y, Y, bytes_y) && copy_to_device(cache.Z, Z, bytes_z); if (ok) { cache.host_X = X; cache.host_Y = Y; cache.host_Z = Z; cache.nx = nx; cache.ny = ny; cache.nz = nz; } } ok = ok && copy_to_device(cache.state0, state0, bytes) && copy_to_device(cache.boundary, boundary_src, bytes) && copy_to_device(cache.stage, stage_data, bytes) && copy_to_device(cache.rhs, rhs_accum, bytes); if (!ok) return 1; double dX = X[1] - X[0]; double dY = Y[1] - Y[0]; double dZ = Z[1] - Z[0]; const int no_symm = 0, eq_symm = 1, octant = 2; int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX); int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY); int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ); int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0); int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0); int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0); double soa0 = SoA[0]; double soa1 = SoA[1]; double soa2 = SoA[2]; if (lev == 0) { int imin = 1; int jmin = 1; int kmin = 1; if (symmetry > no_symm && std::fabs(Z[0]) < dZ) kmin = 0; if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0; if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0; double *d_X = cache.X.ptr, *d_Y = cache.Y.ptr, *d_Z = cache.Z.ptr; double *d_state0 = cache.state0.ptr, *d_boundary = cache.boundary.ptr; double *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr; double *bam_target = (rk_stage == 0) ? d_rhs : d_stage; const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary; void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z, &xmin, &ymin, &zmin, &xmax, &ymax, &zmax, &has_xmin, &has_ymin, &has_zmin, &has_xmax, &has_ymax, &has_zmax, &imin, &jmin, &kmin, &propspeed, &bam_source, &bam_target, &soa0, &soa1, &soa2, &symmetry}; ok = launch_and_sync(grid, block, (const void *)sommerfeld_bam_kernel, args); } if (ok) { double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr; void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage}; ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args); } if (ok && lev > 0) { double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr; void *args[] = {&nx, &ny, &nz, &has_xmin, &has_ymin, &has_zmin, &has_xmax, &has_ymax, &has_zmax, &d_state0, &d_stage}; ok = launch_and_sync(grid, block, (const void *)copy_physical_boundary_kernel, args); } if (ok) { cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err); ok = err == cudaSuccess; if (ok) { err = cudaMemcpy(rhs_accum, cache.rhs.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err); ok = err == cudaSuccess; } } return ok ? 0 : 1; } int bssn_cuda_lowerbound(int *ex, double *chi, double tinny) { static thread_local CachedBuffer d_chi; int n = count_points(ex); const size_t bytes = static_cast(n) * sizeof(double); dim3 block(256); dim3 grid(div_up(n, static_cast(block.x))); bool ok = copy_to_device(d_chi, chi, bytes); if (ok) { double *ptr = d_chi.ptr; void *args[] = {&n, &ptr, &tinny}; ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args); } if (ok) { cudaError_t err = cudaMemcpy(chi, d_chi.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err); ok = err == cudaSuccess; } return ok ? 0 : 1; }