573 lines
21 KiB
Plaintext
573 lines
21 KiB
Plaintext
#include "bssn_cuda_ops.h"
|
|
|
|
#include <cmath>
|
|
#include <cstdio>
|
|
#include <cuda_runtime.h>
|
|
|
|
namespace {
|
|
|
|
inline void report_cuda_error(const char *where, cudaError_t err)
|
|
{
|
|
if (err != cudaSuccess)
|
|
std::fprintf(stderr, "CUDA error at %s: %s\n", where, cudaGetErrorString(err));
|
|
}
|
|
|
|
inline int count_points(const int ex[3])
|
|
{
|
|
return ex[0] * ex[1] * ex[2];
|
|
}
|
|
|
|
inline int div_up(int a, int b)
|
|
{
|
|
return (a + b - 1) / b;
|
|
}
|
|
|
|
struct DeviceArrays
|
|
{
|
|
double *x = nullptr;
|
|
double *y = nullptr;
|
|
double *z = nullptr;
|
|
double *a = nullptr;
|
|
double *b = nullptr;
|
|
double *c = nullptr;
|
|
double *d = nullptr;
|
|
};
|
|
|
|
inline bool copy_to_device(double *&dst, const double *src, size_t bytes)
|
|
{
|
|
cudaError_t err = cudaMalloc(&dst, bytes);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMalloc", err);
|
|
return false;
|
|
}
|
|
err = cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D)", err);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
inline void free_device(double *ptr)
|
|
{
|
|
if (ptr)
|
|
cudaFree(ptr);
|
|
}
|
|
|
|
__global__ void enforce_ga_kernel(int n,
|
|
double *dxx, double *gxy, double *gxz,
|
|
double *dyy, double *gyz, double *dzz,
|
|
double *Axx, double *Axy, double *Axz,
|
|
double *Ayy, double *Ayz, double *Azz)
|
|
{
|
|
const double one = 1.0;
|
|
const double two = 2.0;
|
|
const double one_third = 1.0 / 3.0;
|
|
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
double lgxx = dxx[idx] + one;
|
|
double lgyy = dyy[idx] + one;
|
|
double lgzz = dzz[idx] + one;
|
|
double lgxy = gxy[idx];
|
|
double lgxz = gxz[idx];
|
|
double lgyz = gyz[idx];
|
|
|
|
double det = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz
|
|
- lgxz * lgyy * lgxz - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz;
|
|
double scale = 1.0 / cbrt(det);
|
|
|
|
lgxx *= scale;
|
|
lgxy *= scale;
|
|
lgxz *= scale;
|
|
lgyy *= scale;
|
|
lgyz *= scale;
|
|
lgzz *= scale;
|
|
|
|
dxx[idx] = lgxx - one;
|
|
gxy[idx] = lgxy;
|
|
gxz[idx] = lgxz;
|
|
dyy[idx] = lgyy - one;
|
|
gyz[idx] = lgyz;
|
|
dzz[idx] = lgzz - one;
|
|
|
|
double gupxx = (lgyy * lgzz - lgyz * lgyz);
|
|
double gupxy = -(lgxy * lgzz - lgyz * lgxz);
|
|
double gupxz = (lgxy * lgyz - lgyy * lgxz);
|
|
double gupyy = (lgxx * lgzz - lgxz * lgxz);
|
|
double gupyz = -(lgxx * lgyz - lgxy * lgxz);
|
|
double gupzz = (lgxx * lgyy - lgxy * lgxy);
|
|
|
|
double trA = gupxx * Axx[idx] + gupyy * Ayy[idx] + gupzz * Azz[idx]
|
|
+ two * (gupxy * Axy[idx] + gupxz * Axz[idx] + gupyz * Ayz[idx]);
|
|
|
|
Axx[idx] -= one_third * lgxx * trA;
|
|
Axy[idx] -= one_third * lgxy * trA;
|
|
Axz[idx] -= one_third * lgxz * trA;
|
|
Ayy[idx] -= one_third * lgyy * trA;
|
|
Ayz[idx] -= one_third * lgyz * trA;
|
|
Azz[idx] -= one_third * lgzz * trA;
|
|
}
|
|
}
|
|
|
|
__device__ inline int index3(int i, int j, int k, int nx, int ny)
|
|
{
|
|
return i + j * nx + k * nx * ny;
|
|
}
|
|
|
|
__device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k,
|
|
int nx, int ny, int nz,
|
|
const double soa[3], int symmetry)
|
|
{
|
|
double sign = 1.0;
|
|
if (i < 0)
|
|
{
|
|
i = -i - 1;
|
|
sign *= soa[0];
|
|
}
|
|
if (j < 0)
|
|
{
|
|
j = -j - 1;
|
|
sign *= soa[1];
|
|
}
|
|
if (k < 0)
|
|
{
|
|
k = -k - 1;
|
|
sign *= soa[2];
|
|
}
|
|
if (i >= nx) i = nx - 1;
|
|
if (j >= ny) j = ny - 1;
|
|
if (k >= nz) k = nz - 1;
|
|
(void)symmetry;
|
|
return sign * f[index3(i, j, k, nx, ny)];
|
|
}
|
|
|
|
__global__ void rk4_kernel(int n, double dT,
|
|
const double *f0,
|
|
double *f1,
|
|
double *rhs,
|
|
int stage)
|
|
{
|
|
const double half = 0.5;
|
|
const double one_sixth = 1.0 / 6.0;
|
|
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
if (stage == 0)
|
|
{
|
|
f1[idx] = f0[idx] + half * dT * rhs[idx];
|
|
}
|
|
else if (stage == 1)
|
|
{
|
|
rhs[idx] += 2.0 * f1[idx];
|
|
f1[idx] = f0[idx] + half * dT * f1[idx];
|
|
}
|
|
else if (stage == 2)
|
|
{
|
|
rhs[idx] += 2.0 * f1[idx];
|
|
f1[idx] = f0[idx] + dT * f1[idx];
|
|
}
|
|
else
|
|
{
|
|
f1[idx] = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]);
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void lowerbound_kernel(int n, double *chi, double tinny)
|
|
{
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
if (chi[idx] < tinny)
|
|
chi[idx] = tinny;
|
|
}
|
|
}
|
|
|
|
__global__ void copy_physical_boundary_kernel(int nx, int ny, int nz,
|
|
int has_xmin, int has_ymin, int has_zmin,
|
|
int has_xmax, int has_ymax, int has_zmax,
|
|
const double *src, double *dst)
|
|
{
|
|
const int n = nx * ny * nz;
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
const int plane = nx * ny;
|
|
const int k = idx / plane;
|
|
const int rem = idx - k * plane;
|
|
const int j = rem / nx;
|
|
const int i = rem - j * nx;
|
|
|
|
if ((has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
|
|
(has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
|
|
(has_zmin && k == 0) || (has_zmax && k == nz - 1))
|
|
{
|
|
dst[idx] = src[idx];
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
|
|
const double *X, const double *Y, const double *Z,
|
|
double xmin, double ymin, double zmin,
|
|
double xmax, double ymax, double zmax,
|
|
int has_xmin, int has_ymin, int has_zmin,
|
|
int has_xmax, int has_ymax, int has_zmax,
|
|
int imin, int jmin, int kmin,
|
|
double propspeed,
|
|
const double *f0,
|
|
double *target,
|
|
double soa0, double soa1, double soa2,
|
|
int symmetry)
|
|
{
|
|
const double one = 1.0;
|
|
const double two = 2.0;
|
|
const int n = nx * ny * nz;
|
|
const double dX = X[1] - X[0];
|
|
const double dY = Y[1] - Y[0];
|
|
const double dZ = Z[1] - Z[0];
|
|
const double d2dx = one / (two * dX);
|
|
const double d2dy = one / (two * dY);
|
|
const double d2dz = one / (two * dZ);
|
|
const double soa[3] = {soa0, soa1, soa2};
|
|
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
const int plane = nx * ny;
|
|
const int k = idx / plane;
|
|
const int rem = idx - k * plane;
|
|
const int j = rem / nx;
|
|
const int i = rem - j * nx;
|
|
|
|
const bool on_boundary =
|
|
(has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
|
|
(has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
|
|
(has_zmin && k == 0) || (has_zmax && k == nz - 1);
|
|
|
|
if (!on_boundary)
|
|
continue;
|
|
|
|
const double radius = sqrt(X[i] * X[i] + Y[j] * Y[j] + Z[k] * Z[k]);
|
|
if (radius == 0.0)
|
|
continue;
|
|
|
|
const double wx = propspeed * X[i] / radius;
|
|
const double wy = propspeed * Y[j] / radius;
|
|
const double wz = propspeed * Z[k] / radius;
|
|
|
|
double fx = 0.0;
|
|
double fy = 0.0;
|
|
double fz = 0.0;
|
|
|
|
if (wx > 0.0)
|
|
{
|
|
if (i - 2 >= imin)
|
|
fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
|
|
else if (i - 1 >= imin)
|
|
fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
else if (wx < 0.0)
|
|
{
|
|
if (i + 2 <= nx - 1)
|
|
fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
else if (i + 1 <= nx - 1)
|
|
fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
|
|
if (wy > 0.0)
|
|
{
|
|
if (j - 2 >= jmin)
|
|
fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
|
|
else if (j - 1 >= jmin)
|
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
else if (wy < 0.0)
|
|
{
|
|
if (j + 2 <= ny - 1)
|
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
else if (j + 1 <= ny - 1)
|
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
|
|
if (wz > 0.0)
|
|
{
|
|
if (k - 2 >= kmin)
|
|
fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
|
|
else if (k - 1 >= kmin)
|
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
else if (wz < 0.0)
|
|
{
|
|
if (k + 2 <= nz - 1)
|
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
else if (k + 1 <= nz - 1)
|
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
|
|
}
|
|
|
|
target[idx] = -propspeed * (fx * X[i] + fy * Y[j] + fz * Z[k] + f0[idx]) / radius;
|
|
}
|
|
}
|
|
|
|
inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **args)
|
|
{
|
|
cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaLaunchKernel", err);
|
|
return false;
|
|
}
|
|
err = cudaDeviceSynchronize();
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaDeviceSynchronize", err);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
} // namespace
|
|
|
|
int bssn_cuda_enforce_ga(int *ex,
|
|
double *dxx, double *gxy, double *gxz,
|
|
double *dyy, double *gyz, double *dzz,
|
|
double *Axx, double *Axy, double *Axz,
|
|
double *Ayy, double *Ayz, double *Azz)
|
|
{
|
|
int n = count_points(ex);
|
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
|
dim3 block(256);
|
|
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
|
|
|
double *d_dxx = nullptr, *d_gxy = nullptr, *d_gxz = nullptr;
|
|
double *d_dyy = nullptr, *d_gyz = nullptr, *d_dzz = nullptr;
|
|
double *d_Axx = nullptr, *d_Axy = nullptr, *d_Axz = nullptr;
|
|
double *d_Ayy = nullptr, *d_Ayz = nullptr, *d_Azz = nullptr;
|
|
|
|
bool ok = copy_to_device(d_dxx, dxx, bytes) &&
|
|
copy_to_device(d_gxy, gxy, bytes) &&
|
|
copy_to_device(d_gxz, gxz, bytes) &&
|
|
copy_to_device(d_dyy, dyy, bytes) &&
|
|
copy_to_device(d_gyz, gyz, bytes) &&
|
|
copy_to_device(d_dzz, dzz, bytes) &&
|
|
copy_to_device(d_Axx, Axx, bytes) &&
|
|
copy_to_device(d_Axy, Axy, bytes) &&
|
|
copy_to_device(d_Axz, Axz, bytes) &&
|
|
copy_to_device(d_Ayy, Ayy, bytes) &&
|
|
copy_to_device(d_Ayz, Ayz, bytes) &&
|
|
copy_to_device(d_Azz, Azz, bytes);
|
|
|
|
if (ok)
|
|
{
|
|
void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
|
|
&d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
|
|
ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args);
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
cudaError_t err = cudaMemcpy(dxx, d_dxx, bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dxx", err);
|
|
ok = err == cudaSuccess;
|
|
if (ok) { err = cudaMemcpy(gxy, d_gxy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; }
|
|
if (ok) { err = cudaMemcpy(gxz, d_gxz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; }
|
|
if (ok) { err = cudaMemcpy(dyy, d_dyy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; }
|
|
if (ok) { err = cudaMemcpy(gyz, d_gyz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; }
|
|
if (ok) { err = cudaMemcpy(dzz, d_dzz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; }
|
|
if (ok) { err = cudaMemcpy(Axx, d_Axx, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; }
|
|
if (ok) { err = cudaMemcpy(Axy, d_Axy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; }
|
|
if (ok) { err = cudaMemcpy(Axz, d_Axz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; }
|
|
if (ok) { err = cudaMemcpy(Ayy, d_Ayy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; }
|
|
if (ok) { err = cudaMemcpy(Ayz, d_Ayz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; }
|
|
if (ok) { err = cudaMemcpy(Azz, d_Azz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; }
|
|
}
|
|
|
|
free_device(d_dxx); free_device(d_gxy); free_device(d_gxz);
|
|
free_device(d_dyy); free_device(d_gyz); free_device(d_dzz);
|
|
free_device(d_Axx); free_device(d_Axy); free_device(d_Axz);
|
|
free_device(d_Ayy); free_device(d_Ayz); free_device(d_Azz);
|
|
|
|
return ok ? 0 : 1;
|
|
}
|
|
|
|
int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|
const double *X, const double *Y, const double *Z,
|
|
double xmin, double ymin, double zmin,
|
|
double xmax, double ymax, double zmax,
|
|
const double *state0,
|
|
const double *boundary_src,
|
|
double *stage_data,
|
|
double *rhs_accum,
|
|
double propspeed,
|
|
const double SoA[3],
|
|
int symmetry,
|
|
int lev,
|
|
int rk_stage)
|
|
{
|
|
int nx = ex[0];
|
|
int ny = ex[1];
|
|
int nz = ex[2];
|
|
int n = count_points(ex);
|
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
|
const size_t bytes_x = static_cast<size_t>(nx) * sizeof(double);
|
|
const size_t bytes_y = static_cast<size_t>(ny) * sizeof(double);
|
|
const size_t bytes_z = static_cast<size_t>(nz) * sizeof(double);
|
|
dim3 block(256);
|
|
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
|
|
|
double *d_X = nullptr, *d_Y = nullptr, *d_Z = nullptr;
|
|
double *d_state0 = nullptr, *d_boundary = nullptr, *d_stage = nullptr, *d_rhs = nullptr;
|
|
|
|
bool ok = copy_to_device(d_X, X, bytes_x) &&
|
|
copy_to_device(d_Y, Y, bytes_y) &&
|
|
copy_to_device(d_Z, Z, bytes_z) &&
|
|
copy_to_device(d_state0, state0, bytes) &&
|
|
copy_to_device(d_boundary, boundary_src, bytes) &&
|
|
copy_to_device(d_stage, stage_data, bytes) &&
|
|
copy_to_device(d_rhs, rhs_accum, bytes);
|
|
|
|
if (!ok)
|
|
{
|
|
free_device(d_X); free_device(d_Y); free_device(d_Z);
|
|
free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
|
|
return 1;
|
|
}
|
|
|
|
double dX = X[1] - X[0];
|
|
double dY = Y[1] - Y[0];
|
|
double dZ = Z[1] - Z[0];
|
|
const int no_symm = 0, eq_symm = 1, octant = 2;
|
|
int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
|
|
int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
|
|
int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
|
|
int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
|
|
int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
|
|
int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);
|
|
double soa0 = SoA[0];
|
|
double soa1 = SoA[1];
|
|
double soa2 = SoA[2];
|
|
|
|
if (lev == 0)
|
|
{
|
|
int imin = 1;
|
|
int jmin = 1;
|
|
int kmin = 1;
|
|
if (symmetry > no_symm && std::fabs(Z[0]) < dZ) kmin = 0;
|
|
if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0;
|
|
if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0;
|
|
|
|
double *bam_target = (rk_stage == 0) ? d_rhs : d_stage;
|
|
const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
|
|
void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
|
|
&xmin, &ymin, &zmin, &xmax, &ymax, &zmax,
|
|
&has_xmin, &has_ymin, &has_zmin,
|
|
&has_xmax, &has_ymax, &has_zmax,
|
|
&imin, &jmin, &kmin, &propspeed,
|
|
&bam_source, &bam_target,
|
|
&soa0, &soa1, &soa2,
|
|
&symmetry};
|
|
ok = launch_and_sync(grid, block, (const void *)sommerfeld_bam_kernel, args);
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
|
|
ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args);
|
|
}
|
|
|
|
if (ok && lev > 0)
|
|
{
|
|
void *args[] = {&nx, &ny, &nz,
|
|
&has_xmin, &has_ymin, &has_zmin,
|
|
&has_xmax, &has_ymax, &has_zmax,
|
|
&d_state0, &d_stage};
|
|
ok = launch_and_sync(grid, block, (const void *)copy_physical_boundary_kernel, args);
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
cudaError_t err = cudaMemcpy(stage_data, d_stage, bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
|
ok = err == cudaSuccess;
|
|
if (ok)
|
|
{
|
|
err = cudaMemcpy(rhs_accum, d_rhs, bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err);
|
|
ok = err == cudaSuccess;
|
|
}
|
|
}
|
|
|
|
free_device(d_X); free_device(d_Y); free_device(d_Z);
|
|
free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
|
|
return ok ? 0 : 1;
|
|
}
|
|
|
|
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
|
|
{
|
|
int n = count_points(ex);
|
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
|
dim3 block(256);
|
|
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
|
|
|
double *d_chi = nullptr;
|
|
bool ok = copy_to_device(d_chi, chi, bytes);
|
|
|
|
if (ok)
|
|
{
|
|
void *args[] = {&n, &d_chi, &tinny};
|
|
ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args);
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
cudaError_t err = cudaMemcpy(chi, d_chi, bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
|
|
ok = err == cudaSuccess;
|
|
}
|
|
|
|
free_device(d_chi);
|
|
return ok ? 0 : 1;
|
|
}
|