Add runnable GPU main-path prototype

This commit is contained in:
2026-04-08 19:14:37 +08:00
parent 8c1f4d8108
commit ea470737db
10 changed files with 1068 additions and 39 deletions

View File

@@ -29,16 +29,11 @@ using namespace std;
#error "not define ABEtype" #error "not define ABEtype"
#endif #endif
#if (ABEtype == 0) #if (ABEtype == 0)
#include "bssn_class.h"
#ifdef USE_GPU
#include "bssn_gpu_class.h" #elif (ABEtype == 1)
#else #include "bssnEScalar_class.h"
#include "bssn_class.h"
#endif
#elif (ABEtype == 1)
#include "bssnEScalar_class.h"
#elif (ABEtype == 2) #elif (ABEtype == 2)
#include "Z4c_class.h" #include "Z4c_class.h"

View File

@@ -3026,9 +3026,14 @@ void bssn_class::RecursiveStep(int lev, int num) // in all 2^(lev+1)-1 steps
#if (PSTR == 0) #if (PSTR == 0)
#if 1 #if 1
void bssn_class::Step(int lev, int YN) void bssn_class::Step(int lev, int YN)
{ {
setpbh(BH_num, Porg0, Mass, BH_num_input); #ifdef USE_GPU
Step_MainPath_GPU(lev, YN);
return;
#endif
setpbh(BH_num, Porg0, Mass, BH_num_input);
double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); double dT_lev = dT * pow(0.5, Mymax(lev, trfls));

View File

@@ -171,16 +171,19 @@ public:
bool check_Stdin_Abort(); bool check_Stdin_Abort();
virtual void Setup_Initial_Data_Cao(); virtual void Setup_Initial_Data_Cao();
virtual void Setup_Initial_Data_Lousto(); virtual void Setup_Initial_Data_Lousto();
virtual void Initialize(); virtual void Initialize();
virtual void Read_Ansorg(); virtual void Read_Ansorg();
virtual void Read_Pablo() {}; virtual void Read_Pablo() {};
virtual void Compute_Psi4(int lev); virtual void Compute_Psi4(int lev);
virtual void Step(int lev, int YN); virtual void Step(int lev, int YN);
virtual void Interp_Constraint(bool infg); #ifdef USE_GPU
virtual void Constraint_Out(); void Step_MainPath_GPU(int lev, int YN);
virtual void Compute_Constraint(); #endif
virtual void Interp_Constraint(bool infg);
virtual void Constraint_Out();
virtual void Compute_Constraint();
#ifdef With_AHF #ifdef With_AHF
protected: protected:

View File

@@ -0,0 +1,572 @@
#include "bssn_cuda_ops.h"
#include <cmath>
#include <cstdio>
#include <cuda_runtime.h>
namespace {
// Print a one-line diagnostic on stderr when `err` is not cudaSuccess.
//   where : caller-chosen label identifying the call that produced `err`
//   err   : status code to inspect
// Does nothing on success; never aborts and never clears the CUDA error state.
inline void report_cuda_error(const char *where, cudaError_t err)
{
    if (err == cudaSuccess)
        return;
    const char *description = cudaGetErrorString(err);
    std::fprintf(stderr, "CUDA error at %s: %s\n", where, description);
}
// Number of grid points in a block whose per-axis extents are ex[0..2].
// The product is formed in plain `int`, exactly as the extents are stored.
inline int count_points(const int ex[3])
{
    const int nx = ex[0];
    const int ny = ex[1];
    const int nz = ex[2];
    return nx * ny * nz;
}
// Integer ceiling division: smallest q with q * b >= a, for a >= 0 and b > 0.
// Used to size a 1-D launch grid from a point count and a block size.
inline int div_up(int a, int b)
{
    const int padded = a + b - 1;
    return padded / b;
}
// Plain bundle of seven raw `double*` slots, all null after default
// construction. The struct does not own or free anything it points at.
// NOTE(review): no code in the visible part of this file uses it.
struct DeviceArrays
{
    double *x{nullptr};
    double *y{nullptr};
    double *z{nullptr};
    double *a{nullptr};
    double *b{nullptr};
    double *c{nullptr};
    double *d{nullptr};
};
// Allocate `bytes` of device memory into `dst` and fill it from host `src`.
//   dst   : receives the device pointer from cudaMalloc
//   src   : host buffer holding at least `bytes` bytes
//   bytes : size of the allocation and of the copy
// Returns true when both the allocation and the host-to-device copy succeed.
// On failure the error is reported on stderr and false is returned. If only
// the copy failed, `dst` still holds the allocation, so the caller remains
// responsible for releasing it.
inline bool copy_to_device(double *&dst, const double *src, size_t bytes)
{
    const cudaError_t alloc_status = cudaMalloc(&dst, bytes);
    if (alloc_status != cudaSuccess)
    {
        report_cuda_error("cudaMalloc", alloc_status);
        return false;
    }

    const cudaError_t upload_status = cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
    if (upload_status != cudaSuccess)
    {
        report_cuda_error("cudaMemcpy(H2D)", upload_status);
        return false;
    }

    return true;
}
// Release a device allocation; a null pointer is accepted and ignored.
// The status returned by cudaFree is intentionally not inspected.
inline void free_device(double *ptr)
{
    if (ptr == nullptr)
        return;
    cudaFree(ptr);
}
// Per-point algebraic clean-up of the metric and A-tensor arrays.
//
// For each of the n points the kernel
//   1. builds the symmetric 3x3 matrix (dxx+1, gxy, gxz, dyy+1, gyz, dzz+1),
//   2. divides every component by the cube root of its determinant and
//      writes the result back (diagonals stored again with 1 subtracted),
//   3. forms the cofactor matrix of the rescaled matrix, contracts it with
//      the A components to get a trace, and subtracts one third of
//      (rescaled matrix * trace) from each A component.
//
// All twelve arrays are read and written in place and must hold n doubles.
// Launch: 1-D grid and block; every thread walks the index range with a
// stride of blockDim.x * gridDim.x, so any grid size covers all n points.
__global__ void enforce_ga_kernel(int n,
                                  double *dxx, double *gxy, double *gxz,
                                  double *dyy, double *gyz, double *dzz,
                                  double *Axx, double *Axy, double *Axz,
                                  double *Ayy, double *Ayz, double *Azz)
{
    const double third = 1.0 / 3.0;
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < n; p += stride)
    {
        // Diagonal arrays store the deviation from the identity.
        double hxx = dxx[p] + 1.0;
        double hyy = dyy[p] + 1.0;
        double hzz = dzz[p] + 1.0;
        double hxy = gxy[p];
        double hxz = gxz[p];
        double hyz = gyz[p];

        const double det = hxx * hyy * hzz + hxy * hyz * hxz + hxz * hxy * hyz
                         - hxz * hyy * hxz - hxy * hxy * hzz - hxx * hyz * hyz;
        const double rescale = 1.0 / cbrt(det);

        hxx *= rescale;
        hxy *= rescale;
        hxz *= rescale;
        hyy *= rescale;
        hyz *= rescale;
        hzz *= rescale;

        dxx[p] = hxx - 1.0;
        gxy[p] = hxy;
        gxz[p] = hxz;
        dyy[p] = hyy - 1.0;
        gyz[p] = hyz;
        dzz[p] = hzz - 1.0;

        // Cofactors of the rescaled matrix (its inverse when det == 1).
        const double ixx = (hyy * hzz - hyz * hyz);
        const double ixy = -(hxy * hzz - hyz * hxz);
        const double ixz = (hxy * hyz - hyy * hxz);
        const double iyy = (hxx * hzz - hxz * hxz);
        const double iyz = -(hxx * hyz - hxy * hxz);
        const double izz = (hxx * hyy - hxy * hxy);

        const double trace = ixx * Axx[p] + iyy * Ayy[p] + izz * Azz[p]
                           + 2.0 * (ixy * Axy[p] + ixz * Axz[p] + iyz * Ayz[p]);

        Axx[p] -= third * hxx * trace;
        Axy[p] -= third * hxy * trace;
        Axz[p] -= third * hxz * trace;
        Ayy[p] -= third * hyy * trace;
        Ayz[p] -= third * hyz * trace;
        Azz[p] -= third * hzz * trace;
    }
}
// Flatten (i, j, k) into a linear offset for an array whose x index varies
// fastest: offset = i + nx * (j + ny * k).
__device__ inline int index3(int i, int j, int k, int nx, int ny)
{
    const int row_offset = j * nx;
    const int slab_offset = k * nx * ny;
    return i + row_offset + slab_offset;
}
// Read f at (i, j, k), tolerating indices outside [0, n) on each axis.
//   - A negative index m is mirrored to -m - 1 and the result is multiplied
//     by soa[axis] for that axis.
//   - An index >= the extent is clamped to the last valid index.
// `symmetry` is accepted for signature compatibility but is not consulted:
// mirroring is applied whenever an index is negative.
__device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k,
                                            int nx, int ny, int nz,
                                            const double soa[3], int symmetry)
{
    (void)symmetry;

    double parity = 1.0;

    // x axis: mirror, then clamp.
    if (i < 0)
    {
        i = -i - 1;
        parity *= soa[0];
    }
    // y axis.
    if (j < 0)
    {
        j = -j - 1;
        parity *= soa[1];
    }
    // z axis.
    if (k < 0)
    {
        k = -k - 1;
        parity *= soa[2];
    }

    i = (i >= nx) ? nx - 1 : i;
    j = (j >= ny) ? ny - 1 : j;
    k = (k >= nz) ? nz - 1 : k;

    const double value = f[index3(i, j, k, nx, ny)];
    return parity * value;
}
// One sub-step of a four-stage Runge-Kutta update, applied point by point.
//   f0    : state at the start of the full step (read only)
//   f1    : stage buffer, overwritten with the new stage value
//   rhs   : right-hand-side accumulator
//   stage : 0, 1, 2 select the first three sub-steps; any other value runs
//           the final combination
//
//   stage 0 : f1 = f0 + (dT/2) * rhs
//   stage 1 : rhs += 2*f1 ; f1 = f0 + (dT/2) * f1
//   stage 2 : rhs += 2*f1 ; f1 = f0 + dT * f1
//   else    : f1 = f0 + (dT/6) * (f1 + rhs)
//
// In stages 1, 2 and the final one the incoming f1 is used as a slope.
// Launch: 1-D grid and block; threads stride by blockDim.x * gridDim.x.
__global__ void rk4_kernel(int n, double dT,
                           const double *f0,
                           double *f1,
                           double *rhs,
                           int stage)
{
    const double half_dt = 0.5 * dT;
    const double sixth_dt = (1.0 / 6.0) * dT;
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < n; p += stride)
    {
        switch (stage)
        {
        case 0:
            f1[p] = f0[p] + half_dt * rhs[p];
            break;
        case 1:
            rhs[p] += 2.0 * f1[p];
            f1[p] = f0[p] + half_dt * f1[p];
            break;
        case 2:
            rhs[p] += 2.0 * f1[p];
            f1[p] = f0[p] + dT * f1[p];
            break;
        default:
            f1[p] = f0[p] + sixth_dt * (f1[p] + rhs[p]);
            break;
        }
    }
}
// Raise every entry of chi[0..n) that is strictly below `tinny` up to
// `tinny`. Entries that are not below the floor are left untouched.
// Launch: 1-D grid and block; threads stride by blockDim.x * gridDim.x.
__global__ void lowerbound_kernel(int n, double *chi, double tinny)
{
    const unsigned int stride = blockDim.x * gridDim.x;
    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < n; p += stride)
    {
        if (!(chi[p] < tinny))
            continue;
        chi[p] = tinny;
    }
}
// Copy src into dst on the outermost layer of every face that is flagged.
//   nx, ny, nz : block extents (x index fastest in memory)
//   has_*      : non-zero when the corresponding face of the block is
//                selected; only those faces are copied
//   src, dst   : arrays of nx*ny*nz doubles
// Points that are not on a flagged face are left unchanged in dst.
// Launch: 1-D grid and block; threads stride by blockDim.x * gridDim.x.
__global__ void copy_physical_boundary_kernel(int nx, int ny, int nz,
                                              int has_xmin, int has_ymin, int has_zmin,
                                              int has_xmax, int has_ymax, int has_zmax,
                                              const double *src, double *dst)
{
    const int total = nx * ny * nz;
    const int slab = nx * ny;
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < total; p += stride)
    {
        // Recover (i, j, k) from the linear index.
        const int k = p / slab;
        const int within_slab = p - k * slab;
        const int j = within_slab / nx;
        const int i = within_slab - j * nx;

        const bool on_x_face = (has_xmin && i == 0) || (has_xmax && i == nx - 1);
        const bool on_y_face = (has_ymin && j == 0) || (has_ymax && j == ny - 1);
        const bool on_z_face = (has_zmin && k == 0) || (has_zmax && k == nz - 1);

        if (!(on_x_face || on_y_face || on_z_face))
            continue;

        dst[p] = src[p];
    }
}
// First difference of f along one axis at (i, j, k), choosing the stencil
// from the sign of the characteristic speed `w` on that axis.
//   axis   : 0 = x, 1 = y, 2 = z
//   pos    : the index of the point along `axis`
//   lo, hi : lowest / highest index the stencil may reach along `axis`
//   inv_2h : 1 / (2 * grid spacing) along `axis`
// Stencils (all scaled by inv_2h):
//   backward :  3 f(0) - 4 f(-1) + f(-2)
//   centred  : -f(-1) + f(+1)
//   forward  : -f(+2) + 4 f(+1) - 3 f(0)
// w > 0 prefers backward, then centred, then forward as `lo` gets closer;
// w < 0 prefers forward, then centred, then backward as `hi` gets closer.
// Returns 0 when w is neither positive nor negative.
__device__ inline double sommerfeld_axis_diff(const double *f, int i, int j, int k,
                                              int axis, int pos, int lo, int hi,
                                              double w, double inv_2h,
                                              int nx, int ny, int nz,
                                              const double soa[3], int symmetry)
{
    enum Stencil
    {
        BACKWARD,
        CENTRED,
        FORWARD
    };

    Stencil pick;
    if (w > 0.0)
        pick = (pos - 2 >= lo) ? BACKWARD : ((pos - 1 >= lo) ? CENTRED : FORWARD);
    else if (w < 0.0)
        pick = (pos + 2 <= hi) ? FORWARD : ((pos + 1 <= hi) ? CENTRED : BACKWARD);
    else
        return 0.0;

    // Value of f displaced by `shift` points along the chosen axis.
    auto at = [&](int shift) -> double {
        const int ii = (axis == 0) ? i + shift : i;
        const int jj = (axis == 1) ? j + shift : j;
        const int kk = (axis == 2) ? k + shift : k;
        return load_symmetry_ord1(f, ii, jj, kk, nx, ny, nz, soa, symmetry);
    };

    switch (pick)
    {
    case BACKWARD:
        return inv_2h * (3.0 * at(0) - 4.0 * at(-1) + at(-2));
    case CENTRED:
        return inv_2h * (-at(-1) + at(1));
    default: // FORWARD
        return inv_2h * (-at(2) + 4.0 * at(1) - 3.0 * at(0));
    }
}

// Radiative (Sommerfeld-type) boundary right-hand side on flagged faces.
//
// For every point on a face whose has_* flag is non-zero the kernel writes
//   target = -propspeed * (fx*X + fy*Y + fz*Z + f0) / r,   r = |(X, Y, Z)|
// where fx, fy, fz are one-sided or centred first differences of f0 picked
// by sommerfeld_axis_diff from the sign of propspeed * coordinate / r.
// Points with r == 0 and points off the flagged faces are not written.
//
//   X, Y, Z          : per-axis coordinate arrays of length nx, ny, nz; the
//                      spacing is taken from the first two entries of each
//   xmin .. zmax     : accepted but not used by the kernel body
//   imin, jmin, kmin : lowest index the backward stencil may touch
//   soa0..2, symmetry: forwarded to load_symmetry_ord1
//
// NOTE(review): imin/jmin/kmin are compared against zero-based indices
// here; confirm the host passes zero-based lower bounds.
// Launch: 1-D grid and block; threads stride by blockDim.x * gridDim.x.
__global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
                                      const double *X, const double *Y, const double *Z,
                                      double xmin, double ymin, double zmin,
                                      double xmax, double ymax, double zmax,
                                      int has_xmin, int has_ymin, int has_zmin,
                                      int has_xmax, int has_ymax, int has_zmax,
                                      int imin, int jmin, int kmin,
                                      double propspeed,
                                      const double *f0,
                                      double *target,
                                      double soa0, double soa1, double soa2,
                                      int symmetry)
{
    const int total = nx * ny * nz;
    const int slab = nx * ny;

    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    const double inv_2dx = 1.0 / (2.0 * dX);
    const double inv_2dy = 1.0 / (2.0 * dY);
    const double inv_2dz = 1.0 / (2.0 * dZ);

    const double soa[3] = {soa0, soa1, soa2};
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += stride)
    {
        // Recover (i, j, k) from the linear index.
        const int k = idx / slab;
        const int within_slab = idx - k * slab;
        const int j = within_slab / nx;
        const int i = within_slab - j * nx;

        const bool on_x_face = (has_xmin && i == 0) || (has_xmax && i == nx - 1);
        const bool on_y_face = (has_ymin && j == 0) || (has_ymax && j == ny - 1);
        const bool on_z_face = (has_zmin && k == 0) || (has_zmax && k == nz - 1);
        if (!(on_x_face || on_y_face || on_z_face))
            continue;

        const double radius = sqrt(X[i] * X[i] + Y[j] * Y[j] + Z[k] * Z[k]);
        if (radius == 0.0)
            continue;

        // Characteristic speed components along each axis.
        const double wx = propspeed * X[i] / radius;
        const double wy = propspeed * Y[j] / radius;
        const double wz = propspeed * Z[k] / radius;

        const double fx = sommerfeld_axis_diff(f0, i, j, k, 0, i, imin, nx - 1, wx, inv_2dx,
                                               nx, ny, nz, soa, symmetry);
        const double fy = sommerfeld_axis_diff(f0, i, j, k, 1, j, jmin, ny - 1, wy, inv_2dy,
                                               nx, ny, nz, soa, symmetry);
        const double fz = sommerfeld_axis_diff(f0, i, j, k, 2, k, kmin, nz - 1, wz, inv_2dz,
                                               nx, ny, nz, soa, symmetry);

        target[idx] = -propspeed * (fx * X[i] + fy * Y[j] + fz * Z[k] + f0[idx]) / radius;
    }
}
// Launch `kernel` on the default stream with no dynamic shared memory, then
// block until the device is idle.
//   grid, block : launch configuration
//   kernel      : address of the __global__ function
//   args        : array of pointers to the kernel arguments, in order
// Returns true only if both the launch and the synchronisation succeed;
// otherwise the failing call is reported on stderr and false is returned.
inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **args)
{
    const cudaError_t launch_status = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
    if (launch_status != cudaSuccess)
    {
        report_cuda_error("cudaLaunchKernel", launch_status);
        return false;
    }

    const cudaError_t sync_status = cudaDeviceSynchronize();
    const bool completed = (sync_status == cudaSuccess);
    if (!completed)
        report_cuda_error("cudaDeviceSynchronize", sync_status);
    return completed;
}
} // namespace
// Host entry point: run enforce_ga_kernel on one block of host data.
//
//   ex            : block extents {nx, ny, nz}
//   dxx .. dzz    : host arrays of nx*ny*nz doubles, metric components
//                   (diagonals stored as deviation from 1), updated in place
//   Axx .. Azz    : host arrays of the same size, updated in place
//
// The twelve arrays are uploaded to freshly allocated device buffers, the
// kernel is run to completion, and the results are copied back. Device
// buffers are always released before returning.
//
// Returns 0 on success and 1 on any CUDA failure (details on stderr). Host
// arrays are only overwritten after the kernel has finished successfully; a
// failing download stops the remaining downloads.
// A block with zero points returns 0 immediately: there is nothing to do and
// a zero-sized grid is not a valid launch configuration.
int bssn_cuda_enforce_ga(int *ex,
                         double *dxx, double *gxy, double *gxz,
                         double *dyy, double *gyz, double *dzz,
                         double *Axx, double *Axy, double *Axz,
                         double *Ayy, double *Ayz, double *Azz)
{
    int n = count_points(ex);
    if (n == 0)
        return 0;

    const size_t bytes = static_cast<size_t>(n) * sizeof(double);
    dim3 block(256);
    dim3 grid(div_up(n, static_cast<int>(block.x)));

    constexpr int kFields = 12;
    // Same order as the kernel's parameter list.
    double *host[kFields] = {dxx, gxy, gxz, dyy, gyz, dzz,
                             Axx, Axy, Axz, Ayy, Ayz, Azz};
    static const char *const download_label[kFields] = {
        "cudaMemcpy(D2H) dxx", "cudaMemcpy(D2H) gxy", "cudaMemcpy(D2H) gxz",
        "cudaMemcpy(D2H) dyy", "cudaMemcpy(D2H) gyz", "cudaMemcpy(D2H) dzz",
        "cudaMemcpy(D2H) Axx", "cudaMemcpy(D2H) Axy", "cudaMemcpy(D2H) Axz",
        "cudaMemcpy(D2H) Ayy", "cudaMemcpy(D2H) Ayz", "cudaMemcpy(D2H) Azz"};
    double *dev[kFields] = {}; // all null so the cleanup loop is always safe

    // Upload; stop at the first failure.
    bool ok = true;
    for (int f = 0; ok && f < kFields; ++f)
        ok = copy_to_device(dev[f], host[f], bytes);

    if (ok)
    {
        void *args[1 + kFields];
        args[0] = &n;
        for (int f = 0; f < kFields; ++f)
            args[1 + f] = &dev[f];
        ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args);
    }

    // Download; stop at the first failure so later arrays stay untouched.
    for (int f = 0; ok && f < kFields; ++f)
    {
        const cudaError_t err = cudaMemcpy(host[f], dev[f], bytes, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess)
            report_cuda_error(download_label[f], err);
        ok = (err == cudaSuccess);
    }

    for (int f = 0; f < kFields; ++f)
        free_device(dev[f]);

    return ok ? 0 : 1;
}
// Host entry point: advance one variable on one block by one Runge-Kutta
// sub-step, including its outer-boundary treatment.
//
//   ex                 : block extents {nx, ny, nz}
//   dT                 : time step
//   X, Y, Z            : host coordinate arrays of length nx, ny, nz
//   xmin .. zmax       : bounds of the enclosing patch, used to decide which
//                        faces of this block are outer boundaries
//   state0             : variable at the start of the step (nx*ny*nz)
//   boundary_src       : field differentiated by the boundary kernel when
//                        lev == 0 and rk_stage != 0; not read otherwise
//   stage_data         : stage buffer, updated in place
//   rhs_accum          : right-hand-side accumulator, updated in place
//   propspeed, SoA     : forwarded to sommerfeld_bam_kernel
//   symmetry           : 0 none, 1 equatorial, 2 octant (per the local
//                        constants below)
//   lev                : refinement level; 0 selects the radiative boundary
//   rk_stage           : 0..3, forwarded to rk4_kernel
//
// Sequence:
//   lev == 0 : sommerfeld_bam_kernel writes the boundary right-hand side
//              into rhs_accum (stage 0, source state0) or into stage_data
//              (later stages, source boundary_src); then rk4_kernel runs.
//   lev  > 0 : rk4_kernel runs, then state0 is copied onto the flagged
//              boundary faces of stage_data.
//
// Returns 0 on success, 1 on failure (details on stderr). stage_data and
// rhs_accum are only overwritten when every kernel succeeded.
// A block with zero points returns 0 without touching the device; a block
// with fewer than two points along any axis is rejected because the grid
// spacing is taken from the first two coordinates of each axis.
int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                               const double *X, const double *Y, const double *Z,
                               double xmin, double ymin, double zmin,
                               double xmax, double ymax, double zmax,
                               const double *state0,
                               const double *boundary_src,
                               double *stage_data,
                               double *rhs_accum,
                               double propspeed,
                               const double SoA[3],
                               int symmetry,
                               int lev,
                               int rk_stage)
{
    int nx = ex[0];
    int ny = ex[1];
    int nz = ex[2];
    int n = count_points(ex);
    if (n == 0)
        return 0; // nothing to update; a zero-sized grid cannot be launched
    if (nx < 2 || ny < 2 || nz < 2)
    {
        std::fprintf(stderr,
                     "bssn_cuda_rk4_boundary_var: need at least 2 points per axis, got %d x %d x %d\n",
                     nx, ny, nz);
        return 1;
    }

    const size_t bytes = static_cast<size_t>(n) * sizeof(double);
    const size_t bytes_x = static_cast<size_t>(nx) * sizeof(double);
    const size_t bytes_y = static_cast<size_t>(ny) * sizeof(double);
    const size_t bytes_z = static_cast<size_t>(nz) * sizeof(double);
    dim3 block(256);
    dim3 grid(div_up(n, static_cast<int>(block.x)));

    double *d_X = nullptr, *d_Y = nullptr, *d_Z = nullptr;
    double *d_state0 = nullptr, *d_boundary = nullptr, *d_stage = nullptr, *d_rhs = nullptr;

    // Single place that releases every device buffer (null entries are skipped).
    auto release_all = [&]() {
        free_device(d_X);
        free_device(d_Y);
        free_device(d_Z);
        free_device(d_state0);
        free_device(d_boundary);
        free_device(d_stage);
        free_device(d_rhs);
    };

    // boundary_src is only consumed by the radiative boundary kernel on
    // stages after the first; skip the upload when no kernel will read it.
    const bool uses_boundary_src = (lev == 0 && rk_stage != 0);

    bool ok = copy_to_device(d_X, X, bytes_x) &&
              copy_to_device(d_Y, Y, bytes_y) &&
              copy_to_device(d_Z, Z, bytes_z) &&
              copy_to_device(d_state0, state0, bytes) &&
              (!uses_boundary_src || copy_to_device(d_boundary, boundary_src, bytes)) &&
              copy_to_device(d_stage, stage_data, bytes) &&
              copy_to_device(d_rhs, rhs_accum, bytes);
    if (!ok)
    {
        release_all();
        return 1;
    }

    double dX = X[1] - X[0];
    double dY = Y[1] - Y[0];
    double dZ = Z[1] - Z[0];
    const int no_symm = 0, eq_symm = 1, octant = 2;

    // A face is an outer boundary when the block edge coincides with the
    // patch edge, except where that edge is a symmetry plane at the origin.
    int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
    int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
    int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
    int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
    int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
    int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);

    double soa0 = SoA[0];
    double soa1 = SoA[1];
    double soa2 = SoA[2];

    if (lev == 0)
    {
        // Lowest index the backward stencil may reach on each axis.
        int imin = 1;
        int jmin = 1;
        int kmin = 1;
        if (symmetry > no_symm && std::fabs(Z[0]) < dZ) kmin = 0;
        if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0;
        if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0;

        double *bam_target = (rk_stage == 0) ? d_rhs : d_stage;
        const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
        void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
                        &xmin, &ymin, &zmin, &xmax, &ymax, &zmax,
                        &has_xmin, &has_ymin, &has_zmin,
                        &has_xmax, &has_ymax, &has_zmax,
                        &imin, &jmin, &kmin, &propspeed,
                        &bam_source, &bam_target,
                        &soa0, &soa1, &soa2,
                        &symmetry};
        ok = launch_and_sync(grid, block, (const void *)sommerfeld_bam_kernel, args);
    }

    if (ok)
    {
        void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
        ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args);
    }

    if (ok && lev > 0)
    {
        void *args[] = {&nx, &ny, &nz,
                        &has_xmin, &has_ymin, &has_zmin,
                        &has_xmax, &has_ymax, &has_zmax,
                        &d_state0, &d_stage};
        ok = launch_and_sync(grid, block, (const void *)copy_physical_boundary_kernel, args);
    }

    if (ok)
    {
        cudaError_t err = cudaMemcpy(stage_data, d_stage, bytes, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
        ok = err == cudaSuccess;
        if (ok)
        {
            err = cudaMemcpy(rhs_accum, d_rhs, bytes, cudaMemcpyDeviceToHost);
            if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err);
            ok = err == cudaSuccess;
        }
    }

    release_all();
    return ok ? 0 : 1;
}
// Host entry point: clamp every entry of `chi` from below at `tinny`.
//
//   ex    : block extents {nx, ny, nz}
//   chi   : host array of nx*ny*nz doubles, updated in place
//   tinny : floor value
//
// The array is uploaded, lowerbound_kernel is run to completion, and the
// result is copied back. Returns 0 on success, 1 on any CUDA failure
// (details on stderr); `chi` is only overwritten after a successful kernel.
// A block with zero points returns 0 immediately, because a zero-sized grid
// is not a valid launch configuration.
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
{
    int n = count_points(ex);
    if (n == 0)
        return 0;

    const size_t bytes = static_cast<size_t>(n) * sizeof(double);
    dim3 block(256);
    dim3 grid(div_up(n, static_cast<int>(block.x)));

    double *d_chi = nullptr;
    bool ok = copy_to_device(d_chi, chi, bytes);
    if (ok)
    {
        void *args[] = {&n, &d_chi, &tinny};
        ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args);
    }
    if (ok)
    {
        const cudaError_t err = cudaMemcpy(chi, d_chi, bytes, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess)
            report_cuda_error("cudaMemcpy(D2H) chi", err);
        ok = (err == cudaSuccess);
    }

    free_device(d_chi);
    return ok ? 0 : 1;
}

View File

@@ -0,0 +1,26 @@
// Host-callable wrappers around the CUDA kernels used by the GPU step path.
// Every function takes HOST pointers, performs its own upload / kernel /
// download, and returns 0 on success or 1 on failure.
#ifndef BSSN_CUDA_OPS_H
#define BSSN_CUDA_OPS_H
// Rescale the metric (dxx..dzz, diagonals stored minus 1) by the inverse cube
// root of its determinant and remove the trace from Axx..Azz, in place.
// ex = {nx, ny, nz}; every array holds nx*ny*nz doubles.
int bssn_cuda_enforce_ga(int *ex,
double *dxx, double *gxy, double *gxz,
double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz,
double *Ayy, double *Ayz, double *Azz);
// Advance one variable by one Runge-Kutta sub-step (rk_stage 0..3) and apply
// its outer-boundary treatment: on lev == 0 a radiative boundary right-hand
// side is computed first; on lev > 0 state0 is copied onto the outer faces
// of stage_data afterwards. stage_data and rhs_accum are updated in place.
// X, Y, Z have lengths ex[0], ex[1], ex[2]; the field arrays hold
// ex[0]*ex[1]*ex[2] doubles. xmin..zmax are the bounds used to detect which
// faces of the block are outer boundaries.
int bssn_cuda_rk4_boundary_var(int *ex, double dT,
const double *X, const double *Y, const double *Z,
double xmin, double ymin, double zmin,
double xmax, double ymax, double zmax,
const double *state0,
const double *boundary_src,
double *stage_data,
double *rhs_accum,
double propspeed,
const double SoA[3],
int symmetry,
int lev,
int rk_stage);
// Replace every entry of chi that is below tinny by tinny, in place.
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny);
#endif

View File

@@ -0,0 +1,364 @@
#include "macrodef.h"
#ifdef USE_GPU
#include <cmath>
#include <vector>
#include "bssn_class.h"
#include "bssn_cuda_ops.h"
#include "bssn_gpu.h"
#include "bssn_macro.h"
#include "rungekutta4_rout.h"
void bssn_class::Step_MainPath_GPU(int lev, int YN)
{
#ifdef WithShell
#error "Step_MainPath_GPU currently supports Patch grids only."
#endif
setpbh(BH_num, Porg0, Mass, BH_num_input);
const double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
#if (MAPBH == 1)
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
for (int ith = 0; ith < 3; ith++)
Porg1[ithBH][ith] = Porg0[ithBH][ith] + Porg_rhs[ithBH][ith] * dT_lev;
if (Symmetry > 0)
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
if (Symmetry == 2)
{
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
}
}
}
if (lev == a_lev)
AnalysisStuff(lev, dT_lev);
#endif
#ifdef With_AHF
AH_Step_Find(lev, dT_lev);
#endif
const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
(void)BB;
double ndeps = (lev < GH->movls) ? numepsb : numepss;
double TRK4 = PhysTime;
int iter_count = 0;
int pre = 0, cor = 1;
int ERROR = 0;
auto run_stage_on_block =
[&](Block *cg, Patch *patch, MyList<var> *state0_list,
MyList<var> *boundary_src_list, MyList<var> *stage_data_list,
MyList<var> *rhs_list, int rk_stage) {
MyList<var> *varl0 = state0_list;
MyList<var> *varlb = boundary_src_list;
MyList<var> *varls = stage_data_list;
MyList<var> *varlr = rhs_list;
while (varl0)
{
if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev,
cg->X[0], cg->X[1], cg->X[2],
patch->bbox[0], patch->bbox[1], patch->bbox[2],
patch->bbox[3], patch->bbox[4], patch->bbox[5],
cg->fgfs[varl0->data->sgfn],
cg->fgfs[varlb->data->sgfn],
cg->fgfs[varls->data->sgfn],
cg->fgfs[varlr->data->sgfn],
varl0->data->propspeed,
varl0->data->SoA,
Symmetry, lev, rk_stage))
{
cerr << "GPU rk4/boundary failure: lev=" << lev
<< " rk_stage=" << rk_stage
<< " var=" << varl0->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
varl0 = varl0->next;
varlb = varlb->next;
varls = varls->next;
varlr = varlr->next;
}
};
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
#if (AGM == 0)
if (bssn_cuda_enforce_ga(cg->shape,
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]))
{
cerr << "GPU enforce_ga failure: lev=" << lev
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
#endif
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
ERROR = 1;
run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny))
{
cerr << "GPU lowerbound failure: lev=" << lev
<< " rk_stage=" << iter_count
<< " var=" << phi->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
MPI_Request err_req_pre;
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
}
Parallel::AsyncSyncState async_pre;
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#if (MAPBH == 0)
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
if (Symmetry > 0)
Porg[ithBH][2] = fabs(Porg[ithBH][2]);
if (Symmetry == 2)
{
Porg[ithBH][0] = fabs(Porg[ithBH][0]);
Porg[ithBH][1] = fabs(Porg[ithBH][1]);
}
}
}
if (lev == a_lev)
AnalysisStuff(lev, dT_lev);
#endif
for (iter_count = 1; iter_count < 4; iter_count++)
{
if (iter_count == 1 || iter_count == 3)
TRK4 += dT_lev / 2;
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
#if (AGM == 0)
if (bssn_cuda_enforce_ga(cg->shape,
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]))
{
cerr << "GPU enforce_ga failure: lev=" << lev
<< " rk_stage=" << iter_count
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
#elif (AGM == 1)
if (iter_count == 3 &&
bssn_cuda_enforce_ga(cg->shape,
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]))
{
cerr << "GPU enforce_ga failure: lev=" << lev
<< " rk_stage=" << iter_count
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
#endif
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
ERROR = 1;
run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny))
{
cerr << "GPU lowerbound failure: lev=" << lev
<< " rk_stage=" << iter_count
<< " var=" << phi1->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
MPI_Request err_req_cor;
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
}
Parallel::AsyncSyncState async_cor;
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#if (MAPBH == 0)
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
if (Symmetry > 0)
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
if (Symmetry == 2)
{
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
}
}
}
#endif
if (iter_count < 3)
{
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
BP->data->swapList(SynchList_pre, SynchList_cor, myrank);
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
#if (MAPBH == 0)
if (BH_num > 0 && lev == GH->levels - 1)
{
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
Porg[ithBH][0] = Porg1[ithBH][0];
Porg[ithBH][1] = Porg1[ithBH][1];
Porg[ithBH][2] = Porg1[ithBH][2];
}
}
#endif
}
}
#if (RPS == 0)
RestrictProlong(lev, YN, BB);
#endif
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
cg->swapList(StateList, SynchList_cor, myrank);
cg->swapList(OldStateList, SynchList_cor, myrank);
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
if (BH_num > 0 && lev == GH->levels - 1)
{
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
Porg0[ithBH][0] = Porg1[ithBH][0];
Porg0[ithBH][1] = Porg1[ithBH][1];
Porg0[ithBH][2] = Porg1[ithBH][2];
}
}
}
#endif

View File

@@ -18,12 +18,18 @@ using namespace std;
#include <fstream> #include <fstream>
#endif #endif
void compare_result_gpu(int ftag1,double * datac,int data_num){ void compare_result_gpu(int ftag1,double * datac,int data_num){
double * data = (double*)malloc(sizeof(double)*data_num); #ifdef RESULT_CHECK
cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost); double * data = (double*)malloc(sizeof(double)*data_num);
compare_result(ftag1,data,data_num); cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
free(data); compare_result(ftag1,data,data_num);
} free(data);
#else
(void)ftag1;
(void)datac;
(void)data_num;
#endif
}
__global__ void test_const_address(double * testd){ __global__ void test_const_address(double * testd){
int _t = blockIdx.x*blockDim.x+threadIdx.x; int _t = blockIdx.x*blockDim.x+threadIdx.x;

View File

@@ -105,13 +105,12 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o surface_integral.o ShellPatch.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
NullShellPatch2_Evo.o \ NullShellPatch2_Evo.o bssn_cuda_step.o writefile_f.o
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
prolongrestrict_cell.o prolongrestrict_vertex.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\
@@ -143,7 +142,7 @@ initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o
TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o CUDAFILES = bssn_gpu.o bssn_cuda_ops.o
# file dependences # file dependences
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh $(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh

View File

@@ -9,6 +9,7 @@ filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
CUDA_LDLIBS = -L/usr/local/cuda-12.9/targets/x86_64-linux/lib -lcudart
## Memory allocator switch ## Memory allocator switch
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc) ## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
@@ -24,6 +25,8 @@ ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS) LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif endif
LDLIBS := $(CUDA_LDLIBS) $(LDLIBS)
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags) ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
## opt : (default) maximum performance with PGO profile-guided optimization ## opt : (default) maximum performance with PGO profile-guided optimization
## instrument : PGO Phase 1 instrumentation to collect fresh profile data ## instrument : PGO Phase 1 instrumentation to collect fresh profile data

View File

@@ -9,6 +9,7 @@
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import os
import subprocess import subprocess
import time import time
@@ -57,6 +58,48 @@ BUILD_JOBS = 64
################################################################## ##################################################################
##################################################################
def prepare_gpu_runtime_env():
    """
    Create a user-private CUDA MPS environment for GPU runs.

    On shared machines another user's daemon may already occupy the default
    /tmp/nvidia-mps pipe directory, which makes plain cudaSetDevice/cudaMalloc
    fail with cudaErrorMpsConnectionFailed. Binding AMSS-NCKU to a private
    pipe directory avoids cross-user interference.

    The pipe and log directories are taken from CUDA_MPS_PIPE_DIRECTORY and
    CUDA_MPS_LOG_DIRECTORY when those are already set; otherwise per-uid
    directories under /tmp are used. Both directories are created if missing.
    If no "control" entry exists in the pipe directory, an attempt is made to
    start nvidia-cuda-mps-control in daemon mode. A failure to start it
    (including the executable not being installed) only prints a warning.

    Returns:
        dict: a copy of os.environ with CUDA_MPS_PIPE_DIRECTORY and
        CUDA_MPS_LOG_DIRECTORY set, suitable for passing as ``env=`` to
        subprocess calls.
    """
    env = os.environ.copy()
    uid = os.getuid()
    pipe_dir = env.get("CUDA_MPS_PIPE_DIRECTORY", f"/tmp/amss-ncku-mps-{uid}")
    log_dir = env.get("CUDA_MPS_LOG_DIRECTORY", f"/tmp/amss-ncku-mps-log-{uid}")

    # Newly created directories are owner-only so they really are private;
    # the mode is not applied to directories that already exist.
    os.makedirs(pipe_dir, mode=0o700, exist_ok=True)
    os.makedirs(log_dir, mode=0o700, exist_ok=True)

    env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir
    env["CUDA_MPS_LOG_DIRECTORY"] = log_dir

    control_socket = os.path.join(pipe_dir, "control")
    if os.path.exists(control_socket):
        print(f" Using existing private CUDA MPS pipe directory: {pipe_dir}")
        return env

    try:
        start = subprocess.run(
            ["nvidia-cuda-mps-control", "-d"],
            env=env,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        started = (start.returncode == 0)
    except OSError:
        # The executable is missing or not runnable: treat it like any other
        # start-up failure instead of crashing the launcher.
        started = False

    if started:
        print(f" Using private CUDA MPS pipe directory: {pipe_dir}")
    else:
        print(f" Warning: failed to start private CUDA MPS daemon in {pipe_dir}")
    return env
##################################################################
################################################################## ##################################################################
@@ -146,16 +189,29 @@ def run_ABE():
## Define the command to run; cast other values to strings as needed ## Define the command to run; cast other values to strings as needed
run_env = None
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" #mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" run_env = prepare_gpu_runtime_env()
if int(input_data.MPI_processes) == 1:
mpi_command = "./ABEGPU"
else:
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log" mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output ## Execute the MPI command and stream output
mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) mpi_process = subprocess.Popen(
mpi_command,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
env=run_env,
)
## Write ABE run output to file while printing to stdout ## Write ABE run output to file while printing to stdout
with open(mpi_command_outfile, 'w') as file0: with open(mpi_command_outfile, 'w') as file0: