Add runnable GPU main-path prototype
This commit is contained in:
@@ -29,16 +29,11 @@ using namespace std;
|
|||||||
#error "not define ABEtype"
|
#error "not define ABEtype"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (ABEtype == 0)
|
#if (ABEtype == 0)
|
||||||
|
#include "bssn_class.h"
|
||||||
#ifdef USE_GPU
|
|
||||||
#include "bssn_gpu_class.h"
|
#elif (ABEtype == 1)
|
||||||
#else
|
#include "bssnEScalar_class.h"
|
||||||
#include "bssn_class.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#elif (ABEtype == 1)
|
|
||||||
#include "bssnEScalar_class.h"
|
|
||||||
|
|
||||||
#elif (ABEtype == 2)
|
#elif (ABEtype == 2)
|
||||||
#include "Z4c_class.h"
|
#include "Z4c_class.h"
|
||||||
|
|||||||
@@ -3026,9 +3026,14 @@ void bssn_class::RecursiveStep(int lev, int num) // in all 2^(lev+1)-1 steps
|
|||||||
|
|
||||||
#if (PSTR == 0)
|
#if (PSTR == 0)
|
||||||
#if 1
|
#if 1
|
||||||
void bssn_class::Step(int lev, int YN)
|
void bssn_class::Step(int lev, int YN)
|
||||||
{
|
{
|
||||||
setpbh(BH_num, Porg0, Mass, BH_num_input);
|
#ifdef USE_GPU
|
||||||
|
Step_MainPath_GPU(lev, YN);
|
||||||
|
return;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
setpbh(BH_num, Porg0, Mass, BH_num_input);
|
||||||
|
|
||||||
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
||||||
|
|
||||||
|
|||||||
@@ -171,16 +171,19 @@ public:
|
|||||||
|
|
||||||
bool check_Stdin_Abort();
|
bool check_Stdin_Abort();
|
||||||
|
|
||||||
virtual void Setup_Initial_Data_Cao();
|
virtual void Setup_Initial_Data_Cao();
|
||||||
virtual void Setup_Initial_Data_Lousto();
|
virtual void Setup_Initial_Data_Lousto();
|
||||||
virtual void Initialize();
|
virtual void Initialize();
|
||||||
virtual void Read_Ansorg();
|
virtual void Read_Ansorg();
|
||||||
virtual void Read_Pablo() {};
|
virtual void Read_Pablo() {};
|
||||||
virtual void Compute_Psi4(int lev);
|
virtual void Compute_Psi4(int lev);
|
||||||
virtual void Step(int lev, int YN);
|
virtual void Step(int lev, int YN);
|
||||||
virtual void Interp_Constraint(bool infg);
|
#ifdef USE_GPU
|
||||||
virtual void Constraint_Out();
|
void Step_MainPath_GPU(int lev, int YN);
|
||||||
virtual void Compute_Constraint();
|
#endif
|
||||||
|
virtual void Interp_Constraint(bool infg);
|
||||||
|
virtual void Constraint_Out();
|
||||||
|
virtual void Compute_Constraint();
|
||||||
|
|
||||||
#ifdef With_AHF
|
#ifdef With_AHF
|
||||||
protected:
|
protected:
|
||||||
|
|||||||
572
AMSS_NCKU_source/bssn_cuda_ops.cu
Normal file
572
AMSS_NCKU_source/bssn_cuda_ops.cu
Normal file
@@ -0,0 +1,572 @@
|
|||||||
|
#include "bssn_cuda_ops.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
inline void report_cuda_error(const char *where, cudaError_t err)
|
||||||
|
{
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
std::fprintf(stderr, "CUDA error at %s: %s\n", where, cudaGetErrorString(err));
|
||||||
|
}
|
||||||
|
|
||||||
|
inline int count_points(const int ex[3])
|
||||||
|
{
|
||||||
|
return ex[0] * ex[1] * ex[2];
|
||||||
|
}
|
||||||
|
|
||||||
|
inline int div_up(int a, int b)
|
||||||
|
{
|
||||||
|
return (a + b - 1) / b;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct DeviceArrays
|
||||||
|
{
|
||||||
|
double *x = nullptr;
|
||||||
|
double *y = nullptr;
|
||||||
|
double *z = nullptr;
|
||||||
|
double *a = nullptr;
|
||||||
|
double *b = nullptr;
|
||||||
|
double *c = nullptr;
|
||||||
|
double *d = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
inline bool copy_to_device(double *&dst, const double *src, size_t bytes)
|
||||||
|
{
|
||||||
|
cudaError_t err = cudaMalloc(&dst, bytes);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
report_cuda_error("cudaMalloc", err);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
err = cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
report_cuda_error("cudaMemcpy(H2D)", err);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void free_device(double *ptr)
|
||||||
|
{
|
||||||
|
if (ptr)
|
||||||
|
cudaFree(ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void enforce_ga_kernel(int n,
|
||||||
|
double *dxx, double *gxy, double *gxz,
|
||||||
|
double *dyy, double *gyz, double *dzz,
|
||||||
|
double *Axx, double *Axy, double *Axz,
|
||||||
|
double *Ayy, double *Ayz, double *Azz)
|
||||||
|
{
|
||||||
|
const double one = 1.0;
|
||||||
|
const double two = 2.0;
|
||||||
|
const double one_third = 1.0 / 3.0;
|
||||||
|
|
||||||
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
||||||
|
{
|
||||||
|
double lgxx = dxx[idx] + one;
|
||||||
|
double lgyy = dyy[idx] + one;
|
||||||
|
double lgzz = dzz[idx] + one;
|
||||||
|
double lgxy = gxy[idx];
|
||||||
|
double lgxz = gxz[idx];
|
||||||
|
double lgyz = gyz[idx];
|
||||||
|
|
||||||
|
double det = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz
|
||||||
|
- lgxz * lgyy * lgxz - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz;
|
||||||
|
double scale = 1.0 / cbrt(det);
|
||||||
|
|
||||||
|
lgxx *= scale;
|
||||||
|
lgxy *= scale;
|
||||||
|
lgxz *= scale;
|
||||||
|
lgyy *= scale;
|
||||||
|
lgyz *= scale;
|
||||||
|
lgzz *= scale;
|
||||||
|
|
||||||
|
dxx[idx] = lgxx - one;
|
||||||
|
gxy[idx] = lgxy;
|
||||||
|
gxz[idx] = lgxz;
|
||||||
|
dyy[idx] = lgyy - one;
|
||||||
|
gyz[idx] = lgyz;
|
||||||
|
dzz[idx] = lgzz - one;
|
||||||
|
|
||||||
|
double gupxx = (lgyy * lgzz - lgyz * lgyz);
|
||||||
|
double gupxy = -(lgxy * lgzz - lgyz * lgxz);
|
||||||
|
double gupxz = (lgxy * lgyz - lgyy * lgxz);
|
||||||
|
double gupyy = (lgxx * lgzz - lgxz * lgxz);
|
||||||
|
double gupyz = -(lgxx * lgyz - lgxy * lgxz);
|
||||||
|
double gupzz = (lgxx * lgyy - lgxy * lgxy);
|
||||||
|
|
||||||
|
double trA = gupxx * Axx[idx] + gupyy * Ayy[idx] + gupzz * Azz[idx]
|
||||||
|
+ two * (gupxy * Axy[idx] + gupxz * Axz[idx] + gupyz * Ayz[idx]);
|
||||||
|
|
||||||
|
Axx[idx] -= one_third * lgxx * trA;
|
||||||
|
Axy[idx] -= one_third * lgxy * trA;
|
||||||
|
Axz[idx] -= one_third * lgxz * trA;
|
||||||
|
Ayy[idx] -= one_third * lgyy * trA;
|
||||||
|
Ayz[idx] -= one_third * lgyz * trA;
|
||||||
|
Azz[idx] -= one_third * lgzz * trA;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ inline int index3(int i, int j, int k, int nx, int ny)
|
||||||
|
{
|
||||||
|
return i + j * nx + k * nx * ny;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k,
|
||||||
|
int nx, int ny, int nz,
|
||||||
|
const double soa[3], int symmetry)
|
||||||
|
{
|
||||||
|
double sign = 1.0;
|
||||||
|
if (i < 0)
|
||||||
|
{
|
||||||
|
i = -i - 1;
|
||||||
|
sign *= soa[0];
|
||||||
|
}
|
||||||
|
if (j < 0)
|
||||||
|
{
|
||||||
|
j = -j - 1;
|
||||||
|
sign *= soa[1];
|
||||||
|
}
|
||||||
|
if (k < 0)
|
||||||
|
{
|
||||||
|
k = -k - 1;
|
||||||
|
sign *= soa[2];
|
||||||
|
}
|
||||||
|
if (i >= nx) i = nx - 1;
|
||||||
|
if (j >= ny) j = ny - 1;
|
||||||
|
if (k >= nz) k = nz - 1;
|
||||||
|
(void)symmetry;
|
||||||
|
return sign * f[index3(i, j, k, nx, ny)];
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void rk4_kernel(int n, double dT,
|
||||||
|
const double *f0,
|
||||||
|
double *f1,
|
||||||
|
double *rhs,
|
||||||
|
int stage)
|
||||||
|
{
|
||||||
|
const double half = 0.5;
|
||||||
|
const double one_sixth = 1.0 / 6.0;
|
||||||
|
|
||||||
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
||||||
|
{
|
||||||
|
if (stage == 0)
|
||||||
|
{
|
||||||
|
f1[idx] = f0[idx] + half * dT * rhs[idx];
|
||||||
|
}
|
||||||
|
else if (stage == 1)
|
||||||
|
{
|
||||||
|
rhs[idx] += 2.0 * f1[idx];
|
||||||
|
f1[idx] = f0[idx] + half * dT * f1[idx];
|
||||||
|
}
|
||||||
|
else if (stage == 2)
|
||||||
|
{
|
||||||
|
rhs[idx] += 2.0 * f1[idx];
|
||||||
|
f1[idx] = f0[idx] + dT * f1[idx];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
f1[idx] = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void lowerbound_kernel(int n, double *chi, double tinny)
|
||||||
|
{
|
||||||
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
||||||
|
{
|
||||||
|
if (chi[idx] < tinny)
|
||||||
|
chi[idx] = tinny;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void copy_physical_boundary_kernel(int nx, int ny, int nz,
|
||||||
|
int has_xmin, int has_ymin, int has_zmin,
|
||||||
|
int has_xmax, int has_ymax, int has_zmax,
|
||||||
|
const double *src, double *dst)
|
||||||
|
{
|
||||||
|
const int n = nx * ny * nz;
|
||||||
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
||||||
|
{
|
||||||
|
const int plane = nx * ny;
|
||||||
|
const int k = idx / plane;
|
||||||
|
const int rem = idx - k * plane;
|
||||||
|
const int j = rem / nx;
|
||||||
|
const int i = rem - j * nx;
|
||||||
|
|
||||||
|
if ((has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
|
||||||
|
(has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
|
||||||
|
(has_zmin && k == 0) || (has_zmax && k == nz - 1))
|
||||||
|
{
|
||||||
|
dst[idx] = src[idx];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
|
||||||
|
const double *X, const double *Y, const double *Z,
|
||||||
|
double xmin, double ymin, double zmin,
|
||||||
|
double xmax, double ymax, double zmax,
|
||||||
|
int has_xmin, int has_ymin, int has_zmin,
|
||||||
|
int has_xmax, int has_ymax, int has_zmax,
|
||||||
|
int imin, int jmin, int kmin,
|
||||||
|
double propspeed,
|
||||||
|
const double *f0,
|
||||||
|
double *target,
|
||||||
|
double soa0, double soa1, double soa2,
|
||||||
|
int symmetry)
|
||||||
|
{
|
||||||
|
const double one = 1.0;
|
||||||
|
const double two = 2.0;
|
||||||
|
const int n = nx * ny * nz;
|
||||||
|
const double dX = X[1] - X[0];
|
||||||
|
const double dY = Y[1] - Y[0];
|
||||||
|
const double dZ = Z[1] - Z[0];
|
||||||
|
const double d2dx = one / (two * dX);
|
||||||
|
const double d2dy = one / (two * dY);
|
||||||
|
const double d2dz = one / (two * dZ);
|
||||||
|
const double soa[3] = {soa0, soa1, soa2};
|
||||||
|
|
||||||
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
||||||
|
{
|
||||||
|
const int plane = nx * ny;
|
||||||
|
const int k = idx / plane;
|
||||||
|
const int rem = idx - k * plane;
|
||||||
|
const int j = rem / nx;
|
||||||
|
const int i = rem - j * nx;
|
||||||
|
|
||||||
|
const bool on_boundary =
|
||||||
|
(has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
|
||||||
|
(has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
|
||||||
|
(has_zmin && k == 0) || (has_zmax && k == nz - 1);
|
||||||
|
|
||||||
|
if (!on_boundary)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
const double radius = sqrt(X[i] * X[i] + Y[j] * Y[j] + Z[k] * Z[k]);
|
||||||
|
if (radius == 0.0)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
const double wx = propspeed * X[i] / radius;
|
||||||
|
const double wy = propspeed * Y[j] / radius;
|
||||||
|
const double wz = propspeed * Z[k] / radius;
|
||||||
|
|
||||||
|
double fx = 0.0;
|
||||||
|
double fy = 0.0;
|
||||||
|
double fz = 0.0;
|
||||||
|
|
||||||
|
if (wx > 0.0)
|
||||||
|
{
|
||||||
|
if (i - 2 >= imin)
|
||||||
|
fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
- 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
+ load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
|
||||||
|
else if (i - 1 >= imin)
|
||||||
|
fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
+ load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
|
||||||
|
else
|
||||||
|
fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
+ 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
||||||
|
}
|
||||||
|
else if (wx < 0.0)
|
||||||
|
{
|
||||||
|
if (i + 2 <= nx - 1)
|
||||||
|
fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
+ 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
||||||
|
else if (i + 1 <= nx - 1)
|
||||||
|
fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
+ load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
|
||||||
|
else
|
||||||
|
fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
- 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
+ load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wy > 0.0)
|
||||||
|
{
|
||||||
|
if (j - 2 >= jmin)
|
||||||
|
fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
- 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
||||||
|
+ load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
|
||||||
|
else if (j - 1 >= jmin)
|
||||||
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
||||||
|
+ load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
|
||||||
|
else
|
||||||
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
|
||||||
|
+ 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
|
||||||
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
||||||
|
}
|
||||||
|
else if (wy < 0.0)
|
||||||
|
{
|
||||||
|
if (j + 2 <= ny - 1)
|
||||||
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
|
||||||
|
+ 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
|
||||||
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
||||||
|
else if (j + 1 <= ny - 1)
|
||||||
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
||||||
|
+ load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
|
||||||
|
else
|
||||||
|
fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
- 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
||||||
|
+ load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wz > 0.0)
|
||||||
|
{
|
||||||
|
if (k - 2 >= kmin)
|
||||||
|
fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
- 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
||||||
|
+ load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
|
||||||
|
else if (k - 1 >= kmin)
|
||||||
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
||||||
|
+ load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
|
||||||
|
else
|
||||||
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
|
||||||
|
+ 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
|
||||||
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
||||||
|
}
|
||||||
|
else if (wz < 0.0)
|
||||||
|
{
|
||||||
|
if (k + 2 <= nz - 1)
|
||||||
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
|
||||||
|
+ 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
|
||||||
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
||||||
|
else if (k + 1 <= nz - 1)
|
||||||
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
||||||
|
+ load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
|
||||||
|
else
|
||||||
|
fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
||||||
|
- 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
||||||
|
+ load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
|
||||||
|
}
|
||||||
|
|
||||||
|
target[idx] = -propspeed * (fx * X[i] + fy * Y[j] + fz * Z[k] + f0[idx]) / radius;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **args)
|
||||||
|
{
|
||||||
|
cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
report_cuda_error("cudaLaunchKernel", err);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
err = cudaDeviceSynchronize();
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
report_cuda_error("cudaDeviceSynchronize", err);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
int bssn_cuda_enforce_ga(int *ex,
|
||||||
|
double *dxx, double *gxy, double *gxz,
|
||||||
|
double *dyy, double *gyz, double *dzz,
|
||||||
|
double *Axx, double *Axy, double *Axz,
|
||||||
|
double *Ayy, double *Ayz, double *Azz)
|
||||||
|
{
|
||||||
|
int n = count_points(ex);
|
||||||
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
||||||
|
dim3 block(256);
|
||||||
|
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
||||||
|
|
||||||
|
double *d_dxx = nullptr, *d_gxy = nullptr, *d_gxz = nullptr;
|
||||||
|
double *d_dyy = nullptr, *d_gyz = nullptr, *d_dzz = nullptr;
|
||||||
|
double *d_Axx = nullptr, *d_Axy = nullptr, *d_Axz = nullptr;
|
||||||
|
double *d_Ayy = nullptr, *d_Ayz = nullptr, *d_Azz = nullptr;
|
||||||
|
|
||||||
|
bool ok = copy_to_device(d_dxx, dxx, bytes) &&
|
||||||
|
copy_to_device(d_gxy, gxy, bytes) &&
|
||||||
|
copy_to_device(d_gxz, gxz, bytes) &&
|
||||||
|
copy_to_device(d_dyy, dyy, bytes) &&
|
||||||
|
copy_to_device(d_gyz, gyz, bytes) &&
|
||||||
|
copy_to_device(d_dzz, dzz, bytes) &&
|
||||||
|
copy_to_device(d_Axx, Axx, bytes) &&
|
||||||
|
copy_to_device(d_Axy, Axy, bytes) &&
|
||||||
|
copy_to_device(d_Axz, Axz, bytes) &&
|
||||||
|
copy_to_device(d_Ayy, Ayy, bytes) &&
|
||||||
|
copy_to_device(d_Ayz, Ayz, bytes) &&
|
||||||
|
copy_to_device(d_Azz, Azz, bytes);
|
||||||
|
|
||||||
|
if (ok)
|
||||||
|
{
|
||||||
|
void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
|
||||||
|
&d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
|
||||||
|
ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ok)
|
||||||
|
{
|
||||||
|
cudaError_t err = cudaMemcpy(dxx, d_dxx, bytes, cudaMemcpyDeviceToHost);
|
||||||
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dxx", err);
|
||||||
|
ok = err == cudaSuccess;
|
||||||
|
if (ok) { err = cudaMemcpy(gxy, d_gxy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; }
|
||||||
|
if (ok) { err = cudaMemcpy(gxz, d_gxz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; }
|
||||||
|
if (ok) { err = cudaMemcpy(dyy, d_dyy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; }
|
||||||
|
if (ok) { err = cudaMemcpy(gyz, d_gyz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; }
|
||||||
|
if (ok) { err = cudaMemcpy(dzz, d_dzz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; }
|
||||||
|
if (ok) { err = cudaMemcpy(Axx, d_Axx, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; }
|
||||||
|
if (ok) { err = cudaMemcpy(Axy, d_Axy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; }
|
||||||
|
if (ok) { err = cudaMemcpy(Axz, d_Axz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; }
|
||||||
|
if (ok) { err = cudaMemcpy(Ayy, d_Ayy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; }
|
||||||
|
if (ok) { err = cudaMemcpy(Ayz, d_Ayz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; }
|
||||||
|
if (ok) { err = cudaMemcpy(Azz, d_Azz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; }
|
||||||
|
}
|
||||||
|
|
||||||
|
free_device(d_dxx); free_device(d_gxy); free_device(d_gxz);
|
||||||
|
free_device(d_dyy); free_device(d_gyz); free_device(d_dzz);
|
||||||
|
free_device(d_Axx); free_device(d_Axy); free_device(d_Axz);
|
||||||
|
free_device(d_Ayy); free_device(d_Ayz); free_device(d_Azz);
|
||||||
|
|
||||||
|
return ok ? 0 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||||
|
const double *X, const double *Y, const double *Z,
|
||||||
|
double xmin, double ymin, double zmin,
|
||||||
|
double xmax, double ymax, double zmax,
|
||||||
|
const double *state0,
|
||||||
|
const double *boundary_src,
|
||||||
|
double *stage_data,
|
||||||
|
double *rhs_accum,
|
||||||
|
double propspeed,
|
||||||
|
const double SoA[3],
|
||||||
|
int symmetry,
|
||||||
|
int lev,
|
||||||
|
int rk_stage)
|
||||||
|
{
|
||||||
|
int nx = ex[0];
|
||||||
|
int ny = ex[1];
|
||||||
|
int nz = ex[2];
|
||||||
|
int n = count_points(ex);
|
||||||
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
||||||
|
const size_t bytes_x = static_cast<size_t>(nx) * sizeof(double);
|
||||||
|
const size_t bytes_y = static_cast<size_t>(ny) * sizeof(double);
|
||||||
|
const size_t bytes_z = static_cast<size_t>(nz) * sizeof(double);
|
||||||
|
dim3 block(256);
|
||||||
|
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
||||||
|
|
||||||
|
double *d_X = nullptr, *d_Y = nullptr, *d_Z = nullptr;
|
||||||
|
double *d_state0 = nullptr, *d_boundary = nullptr, *d_stage = nullptr, *d_rhs = nullptr;
|
||||||
|
|
||||||
|
bool ok = copy_to_device(d_X, X, bytes_x) &&
|
||||||
|
copy_to_device(d_Y, Y, bytes_y) &&
|
||||||
|
copy_to_device(d_Z, Z, bytes_z) &&
|
||||||
|
copy_to_device(d_state0, state0, bytes) &&
|
||||||
|
copy_to_device(d_boundary, boundary_src, bytes) &&
|
||||||
|
copy_to_device(d_stage, stage_data, bytes) &&
|
||||||
|
copy_to_device(d_rhs, rhs_accum, bytes);
|
||||||
|
|
||||||
|
if (!ok)
|
||||||
|
{
|
||||||
|
free_device(d_X); free_device(d_Y); free_device(d_Z);
|
||||||
|
free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
double dX = X[1] - X[0];
|
||||||
|
double dY = Y[1] - Y[0];
|
||||||
|
double dZ = Z[1] - Z[0];
|
||||||
|
const int no_symm = 0, eq_symm = 1, octant = 2;
|
||||||
|
int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
|
||||||
|
int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
|
||||||
|
int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
|
||||||
|
int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
|
||||||
|
int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
|
||||||
|
int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);
|
||||||
|
double soa0 = SoA[0];
|
||||||
|
double soa1 = SoA[1];
|
||||||
|
double soa2 = SoA[2];
|
||||||
|
|
||||||
|
if (lev == 0)
|
||||||
|
{
|
||||||
|
int imin = 1;
|
||||||
|
int jmin = 1;
|
||||||
|
int kmin = 1;
|
||||||
|
if (symmetry > no_symm && std::fabs(Z[0]) < dZ) kmin = 0;
|
||||||
|
if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0;
|
||||||
|
if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0;
|
||||||
|
|
||||||
|
double *bam_target = (rk_stage == 0) ? d_rhs : d_stage;
|
||||||
|
const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
|
||||||
|
void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
|
||||||
|
&xmin, &ymin, &zmin, &xmax, &ymax, &zmax,
|
||||||
|
&has_xmin, &has_ymin, &has_zmin,
|
||||||
|
&has_xmax, &has_ymax, &has_zmax,
|
||||||
|
&imin, &jmin, &kmin, &propspeed,
|
||||||
|
&bam_source, &bam_target,
|
||||||
|
&soa0, &soa1, &soa2,
|
||||||
|
&symmetry};
|
||||||
|
ok = launch_and_sync(grid, block, (const void *)sommerfeld_bam_kernel, args);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ok)
|
||||||
|
{
|
||||||
|
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
|
||||||
|
ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ok && lev > 0)
|
||||||
|
{
|
||||||
|
void *args[] = {&nx, &ny, &nz,
|
||||||
|
&has_xmin, &has_ymin, &has_zmin,
|
||||||
|
&has_xmax, &has_ymax, &has_zmax,
|
||||||
|
&d_state0, &d_stage};
|
||||||
|
ok = launch_and_sync(grid, block, (const void *)copy_physical_boundary_kernel, args);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ok)
|
||||||
|
{
|
||||||
|
cudaError_t err = cudaMemcpy(stage_data, d_stage, bytes, cudaMemcpyDeviceToHost);
|
||||||
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
||||||
|
ok = err == cudaSuccess;
|
||||||
|
if (ok)
|
||||||
|
{
|
||||||
|
err = cudaMemcpy(rhs_accum, d_rhs, bytes, cudaMemcpyDeviceToHost);
|
||||||
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err);
|
||||||
|
ok = err == cudaSuccess;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
free_device(d_X); free_device(d_Y); free_device(d_Z);
|
||||||
|
free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
|
||||||
|
return ok ? 0 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
|
||||||
|
{
|
||||||
|
int n = count_points(ex);
|
||||||
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
||||||
|
dim3 block(256);
|
||||||
|
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
||||||
|
|
||||||
|
double *d_chi = nullptr;
|
||||||
|
bool ok = copy_to_device(d_chi, chi, bytes);
|
||||||
|
|
||||||
|
if (ok)
|
||||||
|
{
|
||||||
|
void *args[] = {&n, &d_chi, &tinny};
|
||||||
|
ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ok)
|
||||||
|
{
|
||||||
|
cudaError_t err = cudaMemcpy(chi, d_chi, bytes, cudaMemcpyDeviceToHost);
|
||||||
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
|
||||||
|
ok = err == cudaSuccess;
|
||||||
|
}
|
||||||
|
|
||||||
|
free_device(d_chi);
|
||||||
|
return ok ? 0 : 1;
|
||||||
|
}
|
||||||
26
AMSS_NCKU_source/bssn_cuda_ops.h
Normal file
26
AMSS_NCKU_source/bssn_cuda_ops.h
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
#ifndef BSSN_CUDA_OPS_H
|
||||||
|
#define BSSN_CUDA_OPS_H
|
||||||
|
|
||||||
|
int bssn_cuda_enforce_ga(int *ex,
|
||||||
|
double *dxx, double *gxy, double *gxz,
|
||||||
|
double *dyy, double *gyz, double *dzz,
|
||||||
|
double *Axx, double *Axy, double *Axz,
|
||||||
|
double *Ayy, double *Ayz, double *Azz);
|
||||||
|
|
||||||
|
int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||||
|
const double *X, const double *Y, const double *Z,
|
||||||
|
double xmin, double ymin, double zmin,
|
||||||
|
double xmax, double ymax, double zmax,
|
||||||
|
const double *state0,
|
||||||
|
const double *boundary_src,
|
||||||
|
double *stage_data,
|
||||||
|
double *rhs_accum,
|
||||||
|
double propspeed,
|
||||||
|
const double SoA[3],
|
||||||
|
int symmetry,
|
||||||
|
int lev,
|
||||||
|
int rk_stage);
|
||||||
|
|
||||||
|
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny);
|
||||||
|
|
||||||
|
#endif
|
||||||
364
AMSS_NCKU_source/bssn_cuda_step.C
Normal file
364
AMSS_NCKU_source/bssn_cuda_step.C
Normal file
@@ -0,0 +1,364 @@
|
|||||||
|
#include "macrodef.h"
|
||||||
|
|
||||||
|
#ifdef USE_GPU
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "bssn_class.h"
|
||||||
|
#include "bssn_cuda_ops.h"
|
||||||
|
#include "bssn_gpu.h"
|
||||||
|
#include "bssn_macro.h"
|
||||||
|
#include "rungekutta4_rout.h"
|
||||||
|
|
||||||
|
void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||||
|
{
|
||||||
|
#ifdef WithShell
|
||||||
|
#error "Step_MainPath_GPU currently supports Patch grids only."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
setpbh(BH_num, Porg0, Mass, BH_num_input);
|
||||||
|
|
||||||
|
const double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
||||||
|
|
||||||
|
#if (MAPBH == 1)
|
||||||
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
|
{
|
||||||
|
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
|
||||||
|
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
||||||
|
{
|
||||||
|
for (int ith = 0; ith < 3; ith++)
|
||||||
|
Porg1[ithBH][ith] = Porg0[ithBH][ith] + Porg_rhs[ithBH][ith] * dT_lev;
|
||||||
|
if (Symmetry > 0)
|
||||||
|
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
|
||||||
|
if (Symmetry == 2)
|
||||||
|
{
|
||||||
|
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
|
||||||
|
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lev == a_lev)
|
||||||
|
AnalysisStuff(lev, dT_lev);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef With_AHF
|
||||||
|
AH_Step_Find(lev, dT_lev);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
|
||||||
|
(void)BB;
|
||||||
|
double ndeps = (lev < GH->movls) ? numepsb : numepss;
|
||||||
|
double TRK4 = PhysTime;
|
||||||
|
int iter_count = 0;
|
||||||
|
int pre = 0, cor = 1;
|
||||||
|
int ERROR = 0;
|
||||||
|
|
||||||
|
auto run_stage_on_block =
|
||||||
|
[&](Block *cg, Patch *patch, MyList<var> *state0_list,
|
||||||
|
MyList<var> *boundary_src_list, MyList<var> *stage_data_list,
|
||||||
|
MyList<var> *rhs_list, int rk_stage) {
|
||||||
|
MyList<var> *varl0 = state0_list;
|
||||||
|
MyList<var> *varlb = boundary_src_list;
|
||||||
|
MyList<var> *varls = stage_data_list;
|
||||||
|
MyList<var> *varlr = rhs_list;
|
||||||
|
|
||||||
|
while (varl0)
|
||||||
|
{
|
||||||
|
if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev,
|
||||||
|
cg->X[0], cg->X[1], cg->X[2],
|
||||||
|
patch->bbox[0], patch->bbox[1], patch->bbox[2],
|
||||||
|
patch->bbox[3], patch->bbox[4], patch->bbox[5],
|
||||||
|
cg->fgfs[varl0->data->sgfn],
|
||||||
|
cg->fgfs[varlb->data->sgfn],
|
||||||
|
cg->fgfs[varls->data->sgfn],
|
||||||
|
cg->fgfs[varlr->data->sgfn],
|
||||||
|
varl0->data->propspeed,
|
||||||
|
varl0->data->SoA,
|
||||||
|
Symmetry, lev, rk_stage))
|
||||||
|
{
|
||||||
|
cerr << "GPU rk4/boundary failure: lev=" << lev
|
||||||
|
<< " rk_stage=" << rk_stage
|
||||||
|
<< " var=" << varl0->data->name
|
||||||
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||||
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
||||||
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||||
|
ERROR = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
varl0 = varl0->next;
|
||||||
|
varlb = varlb->next;
|
||||||
|
varls = varls->next;
|
||||||
|
varlr = varlr->next;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
MyList<Patch> *Pp = GH->PatL[lev];
|
||||||
|
while (Pp)
|
||||||
|
{
|
||||||
|
MyList<Block> *BP = Pp->data->blb;
|
||||||
|
while (BP)
|
||||||
|
{
|
||||||
|
Block *cg = BP->data;
|
||||||
|
if (myrank == cg->rank)
|
||||||
|
{
|
||||||
|
#if (AGM == 0)
|
||||||
|
if (bssn_cuda_enforce_ga(cg->shape,
|
||||||
|
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
||||||
|
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
|
||||||
|
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
|
||||||
|
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]))
|
||||||
|
{
|
||||||
|
cerr << "GPU enforce_ga failure: lev=" << lev
|
||||||
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||||
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
||||||
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||||
|
ERROR = 1;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
|
||||||
|
ERROR = 1;
|
||||||
|
|
||||||
|
run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
|
||||||
|
|
||||||
|
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny))
|
||||||
|
{
|
||||||
|
cerr << "GPU lowerbound failure: lev=" << lev
|
||||||
|
<< " rk_stage=" << iter_count
|
||||||
|
<< " var=" << phi->name
|
||||||
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||||
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
||||||
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||||
|
ERROR = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (BP == Pp->data->ble)
|
||||||
|
break;
|
||||||
|
BP = BP->next;
|
||||||
|
}
|
||||||
|
Pp = Pp->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
MPI_Request err_req_pre;
|
||||||
|
{
|
||||||
|
int erh = ERROR;
|
||||||
|
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
|
||||||
|
}
|
||||||
|
|
||||||
|
Parallel::AsyncSyncState async_pre;
|
||||||
|
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
|
||||||
|
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
|
||||||
|
|
||||||
|
MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
|
||||||
|
if (ERROR)
|
||||||
|
{
|
||||||
|
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
|
||||||
|
if (myrank == 0)
|
||||||
|
{
|
||||||
|
if (ErrorMonitor->outfile)
|
||||||
|
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
|
||||||
|
<< ", lev = " << lev << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if (MAPBH == 0)
|
||||||
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
|
{
|
||||||
|
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
|
||||||
|
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
||||||
|
{
|
||||||
|
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
|
||||||
|
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
|
||||||
|
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
|
||||||
|
if (Symmetry > 0)
|
||||||
|
Porg[ithBH][2] = fabs(Porg[ithBH][2]);
|
||||||
|
if (Symmetry == 2)
|
||||||
|
{
|
||||||
|
Porg[ithBH][0] = fabs(Porg[ithBH][0]);
|
||||||
|
Porg[ithBH][1] = fabs(Porg[ithBH][1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lev == a_lev)
|
||||||
|
AnalysisStuff(lev, dT_lev);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (iter_count = 1; iter_count < 4; iter_count++)
|
||||||
|
{
|
||||||
|
if (iter_count == 1 || iter_count == 3)
|
||||||
|
TRK4 += dT_lev / 2;
|
||||||
|
|
||||||
|
Pp = GH->PatL[lev];
|
||||||
|
while (Pp)
|
||||||
|
{
|
||||||
|
MyList<Block> *BP = Pp->data->blb;
|
||||||
|
while (BP)
|
||||||
|
{
|
||||||
|
Block *cg = BP->data;
|
||||||
|
if (myrank == cg->rank)
|
||||||
|
{
|
||||||
|
#if (AGM == 0)
|
||||||
|
if (bssn_cuda_enforce_ga(cg->shape,
|
||||||
|
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
||||||
|
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
|
||||||
|
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
|
||||||
|
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]))
|
||||||
|
{
|
||||||
|
cerr << "GPU enforce_ga failure: lev=" << lev
|
||||||
|
<< " rk_stage=" << iter_count
|
||||||
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||||
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
||||||
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||||
|
ERROR = 1;
|
||||||
|
}
|
||||||
|
#elif (AGM == 1)
|
||||||
|
if (iter_count == 3 &&
|
||||||
|
bssn_cuda_enforce_ga(cg->shape,
|
||||||
|
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
||||||
|
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
|
||||||
|
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
|
||||||
|
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]))
|
||||||
|
{
|
||||||
|
cerr << "GPU enforce_ga failure: lev=" << lev
|
||||||
|
<< " rk_stage=" << iter_count
|
||||||
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||||
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
||||||
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||||
|
ERROR = 1;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
|
||||||
|
ERROR = 1;
|
||||||
|
|
||||||
|
run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
|
||||||
|
|
||||||
|
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny))
|
||||||
|
{
|
||||||
|
cerr << "GPU lowerbound failure: lev=" << lev
|
||||||
|
<< " rk_stage=" << iter_count
|
||||||
|
<< " var=" << phi1->name
|
||||||
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||||
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
||||||
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||||
|
ERROR = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (BP == Pp->data->ble)
|
||||||
|
break;
|
||||||
|
BP = BP->next;
|
||||||
|
}
|
||||||
|
Pp = Pp->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
MPI_Request err_req_cor;
|
||||||
|
{
|
||||||
|
int erh = ERROR;
|
||||||
|
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
|
||||||
|
}
|
||||||
|
|
||||||
|
Parallel::AsyncSyncState async_cor;
|
||||||
|
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
|
||||||
|
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
|
||||||
|
|
||||||
|
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
|
||||||
|
if (ERROR)
|
||||||
|
{
|
||||||
|
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
|
||||||
|
if (myrank == 0)
|
||||||
|
{
|
||||||
|
if (ErrorMonitor->outfile)
|
||||||
|
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
|
||||||
|
<< " variables at t = " << PhysTime
|
||||||
|
<< ", lev = " << lev << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if (MAPBH == 0)
|
||||||
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
|
{
|
||||||
|
compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
|
||||||
|
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
||||||
|
{
|
||||||
|
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
|
||||||
|
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
|
||||||
|
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
|
||||||
|
if (Symmetry > 0)
|
||||||
|
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
|
||||||
|
if (Symmetry == 2)
|
||||||
|
{
|
||||||
|
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
|
||||||
|
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (iter_count < 3)
|
||||||
|
{
|
||||||
|
Pp = GH->PatL[lev];
|
||||||
|
while (Pp)
|
||||||
|
{
|
||||||
|
MyList<Block> *BP = Pp->data->blb;
|
||||||
|
while (BP)
|
||||||
|
{
|
||||||
|
BP->data->swapList(SynchList_pre, SynchList_cor, myrank);
|
||||||
|
if (BP == Pp->data->ble)
|
||||||
|
break;
|
||||||
|
BP = BP->next;
|
||||||
|
}
|
||||||
|
Pp = Pp->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if (MAPBH == 0)
|
||||||
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
|
{
|
||||||
|
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
||||||
|
{
|
||||||
|
Porg[ithBH][0] = Porg1[ithBH][0];
|
||||||
|
Porg[ithBH][1] = Porg1[ithBH][1];
|
||||||
|
Porg[ithBH][2] = Porg1[ithBH][2];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if (RPS == 0)
|
||||||
|
RestrictProlong(lev, YN, BB);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
Pp = GH->PatL[lev];
|
||||||
|
while (Pp)
|
||||||
|
{
|
||||||
|
MyList<Block> *BP = Pp->data->blb;
|
||||||
|
while (BP)
|
||||||
|
{
|
||||||
|
Block *cg = BP->data;
|
||||||
|
cg->swapList(StateList, SynchList_cor, myrank);
|
||||||
|
cg->swapList(OldStateList, SynchList_cor, myrank);
|
||||||
|
if (BP == Pp->data->ble)
|
||||||
|
break;
|
||||||
|
BP = BP->next;
|
||||||
|
}
|
||||||
|
Pp = Pp->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
|
{
|
||||||
|
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
||||||
|
{
|
||||||
|
Porg0[ithBH][0] = Porg1[ithBH][0];
|
||||||
|
Porg0[ithBH][1] = Porg1[ithBH][1];
|
||||||
|
Porg0[ithBH][2] = Porg1[ithBH][2];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -18,12 +18,18 @@ using namespace std;
|
|||||||
#include <fstream>
|
#include <fstream>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void compare_result_gpu(int ftag1,double * datac,int data_num){
|
void compare_result_gpu(int ftag1,double * datac,int data_num){
|
||||||
double * data = (double*)malloc(sizeof(double)*data_num);
|
#ifdef RESULT_CHECK
|
||||||
cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
|
double * data = (double*)malloc(sizeof(double)*data_num);
|
||||||
compare_result(ftag1,data,data_num);
|
cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
|
||||||
free(data);
|
compare_result(ftag1,data,data_num);
|
||||||
}
|
free(data);
|
||||||
|
#else
|
||||||
|
(void)ftag1;
|
||||||
|
(void)datac;
|
||||||
|
(void)data_num;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
__global__ void test_const_address(double * testd){
|
__global__ void test_const_address(double * testd){
|
||||||
int _t = blockIdx.x*blockDim.x+threadIdx.x;
|
int _t = blockIdx.x*blockDim.x+threadIdx.x;
|
||||||
|
|||||||
@@ -105,13 +105,12 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
|||||||
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
|
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
|
||||||
NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
|
NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
|
||||||
|
|
||||||
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||||
cgh.o surface_integral.o ShellPatch.o\
|
cgh.o bssn_class.o surface_integral.o ShellPatch.o\
|
||||||
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
|
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
|
||||||
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
|
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
|
||||||
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
|
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
|
||||||
NullShellPatch2_Evo.o \
|
NullShellPatch2_Evo.o bssn_cuda_step.o writefile_f.o
|
||||||
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
|
|
||||||
|
|
||||||
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
||||||
prolongrestrict_cell.o prolongrestrict_vertex.o\
|
prolongrestrict_cell.o prolongrestrict_vertex.o\
|
||||||
@@ -143,7 +142,7 @@ initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o
|
|||||||
|
|
||||||
TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
|
TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
|
||||||
|
|
||||||
CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
|
CUDAFILES = bssn_gpu.o bssn_cuda_ops.o
|
||||||
|
|
||||||
# file dependences
|
# file dependences
|
||||||
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
|
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ filein = -I/usr/include/ -I${MKLROOT}/include
|
|||||||
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
|
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
|
||||||
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
|
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
|
||||||
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
|
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
|
||||||
|
CUDA_LDLIBS = -L/usr/local/cuda-12.9/targets/x86_64-linux/lib -lcudart
|
||||||
|
|
||||||
## Memory allocator switch
|
## Memory allocator switch
|
||||||
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
|
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
|
||||||
@@ -24,6 +25,8 @@ ifeq ($(USE_TBBMALLOC),1)
|
|||||||
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
|
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
LDLIBS := $(CUDA_LDLIBS) $(LDLIBS)
|
||||||
|
|
||||||
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
|
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
|
||||||
## opt : (default) maximum performance with PGO profile-guided optimization
|
## opt : (default) maximum performance with PGO profile-guided optimization
|
||||||
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
|
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
|
||||||
|
|||||||
@@ -9,6 +9,7 @@
|
|||||||
|
|
||||||
|
|
||||||
import AMSS_NCKU_Input as input_data
|
import AMSS_NCKU_Input as input_data
|
||||||
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@@ -57,6 +58,48 @@ BUILD_JOBS = 64
|
|||||||
##################################################################
|
##################################################################
|
||||||
|
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
|
||||||
|
def prepare_gpu_runtime_env():
|
||||||
|
"""
|
||||||
|
Create a user-private CUDA MPS environment for GPU runs.
|
||||||
|
|
||||||
|
On shared machines another user's daemon may already occupy the default
|
||||||
|
/tmp/nvidia-mps pipe directory, which makes plain cudaSetDevice/cudaMalloc
|
||||||
|
fail with cudaErrorMpsConnectionFailed. Binding AMSS-NCKU to a private
|
||||||
|
pipe directory avoids cross-user interference.
|
||||||
|
"""
|
||||||
|
env = os.environ.copy()
|
||||||
|
|
||||||
|
pipe_dir = env.get("CUDA_MPS_PIPE_DIRECTORY", f"/tmp/amss-ncku-mps-{os.getuid()}")
|
||||||
|
log_dir = env.get("CUDA_MPS_LOG_DIRECTORY", f"/tmp/amss-ncku-mps-log-{os.getuid()}")
|
||||||
|
|
||||||
|
os.makedirs(pipe_dir, exist_ok=True)
|
||||||
|
os.makedirs(log_dir, exist_ok=True)
|
||||||
|
|
||||||
|
env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir
|
||||||
|
env["CUDA_MPS_LOG_DIRECTORY"] = log_dir
|
||||||
|
|
||||||
|
control_socket = os.path.join(pipe_dir, "control")
|
||||||
|
if not os.path.exists(control_socket):
|
||||||
|
start = subprocess.run(
|
||||||
|
["nvidia-cuda-mps-control", "-d"],
|
||||||
|
env=env,
|
||||||
|
stdout=subprocess.DEVNULL,
|
||||||
|
stderr=subprocess.DEVNULL,
|
||||||
|
)
|
||||||
|
if start.returncode != 0:
|
||||||
|
print(f" Warning: failed to start private CUDA MPS daemon in {pipe_dir}")
|
||||||
|
else:
|
||||||
|
print(f" Using private CUDA MPS pipe directory: {pipe_dir}")
|
||||||
|
else:
|
||||||
|
print(f" Using existing private CUDA MPS pipe directory: {pipe_dir}")
|
||||||
|
|
||||||
|
return env
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
##################################################################
|
##################################################################
|
||||||
|
|
||||||
@@ -146,16 +189,29 @@ def run_ABE():
|
|||||||
|
|
||||||
## Define the command to run; cast other values to strings as needed
|
## Define the command to run; cast other values to strings as needed
|
||||||
|
|
||||||
|
run_env = None
|
||||||
|
|
||||||
if (input_data.GPU_Calculation == "no"):
|
if (input_data.GPU_Calculation == "no"):
|
||||||
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
|
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
|
||||||
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
|
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
|
||||||
mpi_command_outfile = "ABE_out.log"
|
mpi_command_outfile = "ABE_out.log"
|
||||||
elif (input_data.GPU_Calculation == "yes"):
|
elif (input_data.GPU_Calculation == "yes"):
|
||||||
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
|
run_env = prepare_gpu_runtime_env()
|
||||||
|
if int(input_data.MPI_processes) == 1:
|
||||||
|
mpi_command = "./ABEGPU"
|
||||||
|
else:
|
||||||
|
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
|
||||||
mpi_command_outfile = "ABEGPU_out.log"
|
mpi_command_outfile = "ABEGPU_out.log"
|
||||||
|
|
||||||
## Execute the MPI command and stream output
|
## Execute the MPI command and stream output
|
||||||
mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
|
mpi_process = subprocess.Popen(
|
||||||
|
mpi_command,
|
||||||
|
shell=True,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.STDOUT,
|
||||||
|
text=True,
|
||||||
|
env=run_env,
|
||||||
|
)
|
||||||
|
|
||||||
## Write ABE run output to file while printing to stdout
|
## Write ABE run output to file while printing to stdout
|
||||||
with open(mpi_command_outfile, 'w') as file0:
|
with open(mpi_command_outfile, 'w') as file0:
|
||||||
|
|||||||
Reference in New Issue
Block a user