diff --git a/AMSS_NCKU_source/ABE.C b/AMSS_NCKU_source/ABE.C index 9a4874e..83d2032 100644 --- a/AMSS_NCKU_source/ABE.C +++ b/AMSS_NCKU_source/ABE.C @@ -29,16 +29,11 @@ using namespace std; #error "not define ABEtype" #endif -#if (ABEtype == 0) - -#ifdef USE_GPU -#include "bssn_gpu_class.h" -#else -#include "bssn_class.h" -#endif - -#elif (ABEtype == 1) -#include "bssnEScalar_class.h" +#if (ABEtype == 0) +#include "bssn_class.h" + +#elif (ABEtype == 1) +#include "bssnEScalar_class.h" #elif (ABEtype == 2) #include "Z4c_class.h" diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index bcd9117..179233e 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -3026,9 +3026,14 @@ void bssn_class::RecursiveStep(int lev, int num) // in all 2^(lev+1)-1 steps #if (PSTR == 0) #if 1 -void bssn_class::Step(int lev, int YN) -{ - setpbh(BH_num, Porg0, Mass, BH_num_input); +void bssn_class::Step(int lev, int YN) +{ +#ifdef USE_GPU + Step_MainPath_GPU(lev, YN); + return; +#endif + + setpbh(BH_num, Porg0, Mass, BH_num_input); double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h index 5a8eb2a..94fd306 100644 --- a/AMSS_NCKU_source/bssn_class.h +++ b/AMSS_NCKU_source/bssn_class.h @@ -171,16 +171,19 @@ public: bool check_Stdin_Abort(); - virtual void Setup_Initial_Data_Cao(); - virtual void Setup_Initial_Data_Lousto(); - virtual void Initialize(); - virtual void Read_Ansorg(); - virtual void Read_Pablo() {}; - virtual void Compute_Psi4(int lev); - virtual void Step(int lev, int YN); - virtual void Interp_Constraint(bool infg); - virtual void Constraint_Out(); - virtual void Compute_Constraint(); + virtual void Setup_Initial_Data_Cao(); + virtual void Setup_Initial_Data_Lousto(); + virtual void Initialize(); + virtual void Read_Ansorg(); + virtual void Read_Pablo() {}; + virtual void Compute_Psi4(int lev); + virtual void Step(int lev, int YN); +#ifdef USE_GPU + void Step_MainPath_GPU(int lev, int YN); +#endif + virtual void Interp_Constraint(bool infg); + virtual void Constraint_Out(); + virtual void Compute_Constraint(); #ifdef With_AHF protected: diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu new file mode 100644 index 0000000..6ef14d0 --- /dev/null +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -0,0 +1,572 @@ +#include "bssn_cuda_ops.h" + +#include +#include +#include + +namespace { + +inline void report_cuda_error(const char *where, cudaError_t err) +{ + if (err != cudaSuccess) + std::fprintf(stderr, "CUDA error at %s: %s\n", where, cudaGetErrorString(err)); +} + +inline int count_points(const int ex[3]) +{ + return ex[0] * ex[1] * ex[2]; +} + +inline int div_up(int a, int b) +{ + return (a + b - 1) / b; +} + +struct DeviceArrays +{ + double *x = nullptr; + double *y = nullptr; + double *z = nullptr; + double *a = nullptr; + double *b = nullptr; + double *c = nullptr; + double *d = nullptr; +}; + +inline bool copy_to_device(double *&dst, const double *src, size_t bytes) +{ + cudaError_t err = cudaMalloc(&dst, bytes); + if (err != cudaSuccess) + { + report_cuda_error("cudaMalloc", err); + return false; + } + err = cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + report_cuda_error("cudaMemcpy(H2D)", err); + return false; + } + return true; +} + +inline void free_device(double *ptr) +{ + if (ptr) + cudaFree(ptr); +} + +__global__ void enforce_ga_kernel(int n, + double *dxx, double *gxy, double *gxz, + double *dyy, double *gyz, double *dzz, + double *Axx, double *Axy, double *Axz, + double *Ayy, double *Ayz, double *Azz) +{ + const double one = 1.0; + const double two = 2.0; + const double one_third = 1.0 / 3.0; + + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) + { + double lgxx = dxx[idx] + one; + double lgyy = dyy[idx] + one; + double lgzz = dzz[idx] + one; + double lgxy = gxy[idx]; + double lgxz = gxz[idx]; + double lgyz = gyz[idx]; + + double det = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz + - lgxz * lgyy * lgxz - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz; + double scale = 1.0 / cbrt(det); + + lgxx *= scale; + lgxy *= scale; + lgxz *= scale; + lgyy *= scale; + lgyz *= scale; + lgzz *= scale; + + dxx[idx] = lgxx - one; + gxy[idx] = lgxy; + gxz[idx] = lgxz; + dyy[idx] = lgyy - one; + gyz[idx] = lgyz; + dzz[idx] = lgzz - one; + + double gupxx = (lgyy * lgzz - lgyz * lgyz); + double gupxy = -(lgxy * lgzz - lgyz * lgxz); + double gupxz = (lgxy * lgyz - lgyy * lgxz); + double gupyy = (lgxx * lgzz - lgxz * lgxz); + double gupyz = -(lgxx * lgyz - lgxy * lgxz); + double gupzz = (lgxx * lgyy - lgxy * lgxy); + + double trA = gupxx * Axx[idx] + gupyy * Ayy[idx] + gupzz * Azz[idx] + + two * (gupxy * Axy[idx] + gupxz * Axz[idx] + gupyz * Ayz[idx]); + + Axx[idx] -= one_third * lgxx * trA; + Axy[idx] -= one_third * lgxy * trA; + Axz[idx] -= one_third * lgxz * trA; + Ayy[idx] -= one_third * lgyy * trA; + Ayz[idx] -= one_third * lgyz * trA; + Azz[idx] -= one_third * lgzz * trA; + } +} + +__device__ inline int index3(int i, int j, int k, int nx, int ny) +{ + return i + j * nx + k * nx * ny; +} + +__device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k, + int nx, int ny, int nz, + const double soa[3], int symmetry) +{ + double sign = 1.0; + if (i < 0) + { + i = -i - 1; + sign *= soa[0]; + } + if (j < 0) + { + j = -j - 1; + sign *= soa[1]; + } + if (k < 0) + { + k = -k - 1; + sign *= soa[2]; + } + if (i >= nx) i = nx - 1; + if (j >= ny) j = ny - 1; + if (k >= nz) k = nz - 1; + (void)symmetry; + return sign * f[index3(i, j, k, nx, ny)]; +} + +__global__ void rk4_kernel(int n, double dT, + const double *f0, + double *f1, + double *rhs, + int stage) +{ + const double half = 0.5; + const double one_sixth = 1.0 / 6.0; + + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) + { + if (stage == 0) + { + f1[idx] = f0[idx] + half * dT * rhs[idx]; + } + else if (stage == 1) + { + rhs[idx] += 2.0 * f1[idx]; + f1[idx] = f0[idx] + half * dT * f1[idx]; + } + else if (stage == 2) + { + rhs[idx] += 2.0 * f1[idx]; + f1[idx] = f0[idx] + dT * f1[idx]; + } + else + { + f1[idx] = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]); + } + } +} + +__global__ void lowerbound_kernel(int n, double *chi, double tinny) +{ + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) + { + if (chi[idx] < tinny) + chi[idx] = tinny; + } +} + +__global__ void copy_physical_boundary_kernel(int nx, int ny, int nz, + int has_xmin, int has_ymin, int has_zmin, + int has_xmax, int has_ymax, int has_zmax, + const double *src, double *dst) +{ + const int n = nx * ny * nz; + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) + { + const int plane = nx * ny; + const int k = idx / plane; + const int rem = idx - k * plane; + const int j = rem / nx; + const int i = rem - j * nx; + + if ((has_xmin && i == 0) || (has_xmax && i == nx - 1) || + (has_ymin && j == 0) || (has_ymax && j == ny - 1) || + (has_zmin && k == 0) || (has_zmax && k == nz - 1)) + { + dst[idx] = src[idx]; + } + } +} + +__global__ void sommerfeld_bam_kernel(int nx, int ny, int nz, + const double *X, const double *Y, const double *Z, + double xmin, double ymin, double zmin, + double xmax, double ymax, double zmax, + int has_xmin, int has_ymin, int has_zmin, + int has_xmax, int has_ymax, int has_zmax, + int imin, int jmin, int kmin, + double propspeed, + const double *f0, + double *target, + double soa0, double soa1, double soa2, + int symmetry) +{ + const double one = 1.0; + const double two = 2.0; + const int n = nx * ny * nz; + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + const double d2dx = one / (two * dX); + const double d2dy = one / (two * dY); + const double d2dz = one / (two * dZ); + const double soa[3] = {soa0, soa1, soa2}; + + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) + { + const int plane = nx * ny; + const int k = idx / plane; + const int rem = idx - k * plane; + const int j = rem / nx; + const int i = rem - j * nx; + + const bool on_boundary = + (has_xmin && i == 0) || (has_xmax && i == nx - 1) || + (has_ymin && j == 0) || (has_ymax && j == ny - 1) || + (has_zmin && k == 0) || (has_zmax && k == nz - 1); + + if (!on_boundary) + continue; + + const double radius = sqrt(X[i] * X[i] + Y[j] * Y[j] + Z[k] * Z[k]); + if (radius == 0.0) + continue; + + const double wx = propspeed * X[i] / radius; + const double wy = propspeed * Y[j] / radius; + const double wz = propspeed * Z[k] / radius; + + double fx = 0.0; + double fy = 0.0; + double fz = 0.0; + + if (wx > 0.0) + { + if (i - 2 >= imin) + fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry) + - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry) + + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry)); + else if (i - 1 >= imin) + fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry) + + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)); + else + fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry) + + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry) + - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)); + } + else if (wx < 0.0) + { + if (i + 2 <= nx - 1) + fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry) + + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry) + - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)); + else if (i + 1 <= nx - 1) + fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry) + + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)); + else + fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry) + - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry) + + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry)); + } + + if (wy > 0.0) + { + if (j - 2 >= jmin) + fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry) + - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry) + + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry)); + else if (j - 1 >= jmin) + fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry) + + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)); + else + fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry) + + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry) + - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)); + } + else if (wy < 0.0) + { + if (j + 2 <= ny - 1) + fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry) + + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry) + - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)); + else if (j + 1 <= ny - 1) + fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry) + + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)); + else + fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry) + - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry) + + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry)); + } + + if (wz > 0.0) + { + if (k - 2 >= kmin) + fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry) + - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry) + + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry)); + else if (k - 1 >= kmin) + fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry) + + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)); + else + fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry) + + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry) + - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)); + } + else if (wz < 0.0) + { + if (k + 2 <= nz - 1) + fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry) + + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry) + - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)); + else if (k + 1 <= nz - 1) + fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry) + + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)); + else + fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry) + - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry) + + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry)); + } + + target[idx] = -propspeed * (fx * X[i] + fy * Y[j] + fz * Z[k] + f0[idx]) / radius; + } +} + +inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **args) +{ + cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr); + if (err != cudaSuccess) + { + report_cuda_error("cudaLaunchKernel", err); + return false; + } + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) + { + report_cuda_error("cudaDeviceSynchronize", err); + return false; + } + return true; +} + +} // namespace + +int bssn_cuda_enforce_ga(int *ex, + double *dxx, double *gxy, double *gxz, + double *dyy, double *gyz, double *dzz, + double *Axx, double *Axy, double *Axz, + double *Ayy, double *Ayz, double *Azz) +{ + int n = count_points(ex); + const size_t bytes = static_cast(n) * sizeof(double); + dim3 block(256); + dim3 grid(div_up(n, static_cast(block.x))); + + double *d_dxx = nullptr, *d_gxy = nullptr, *d_gxz = nullptr; + double *d_dyy = nullptr, *d_gyz = nullptr, *d_dzz = nullptr; + double *d_Axx = nullptr, *d_Axy = nullptr, *d_Axz = nullptr; + double *d_Ayy = nullptr, *d_Ayz = nullptr, *d_Azz = nullptr; + + bool ok = copy_to_device(d_dxx, dxx, bytes) && + copy_to_device(d_gxy, gxy, bytes) && + copy_to_device(d_gxz, gxz, bytes) && + copy_to_device(d_dyy, dyy, bytes) && + copy_to_device(d_gyz, gyz, bytes) && + copy_to_device(d_dzz, dzz, bytes) && + copy_to_device(d_Axx, Axx, bytes) && + copy_to_device(d_Axy, Axy, bytes) && + copy_to_device(d_Axz, Axz, bytes) && + copy_to_device(d_Ayy, Ayy, bytes) && + copy_to_device(d_Ayz, Ayz, bytes) && + copy_to_device(d_Azz, Azz, bytes); + + if (ok) + { + void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz, + &d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz}; + ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args); + } + + if (ok) + { + cudaError_t err = cudaMemcpy(dxx, d_dxx, bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dxx", err); + ok = err == cudaSuccess; + if (ok) { err = cudaMemcpy(gxy, d_gxy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(gxz, d_gxz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(dyy, d_dyy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(gyz, d_gyz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(dzz, d_dzz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(Axx, d_Axx, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(Axy, d_Axy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(Axz, d_Axz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(Ayy, d_Ayy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(Ayz, d_Ayz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(Azz, d_Azz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; } + } + + free_device(d_dxx); free_device(d_gxy); free_device(d_gxz); + free_device(d_dyy); free_device(d_gyz); free_device(d_dzz); + free_device(d_Axx); free_device(d_Axy); free_device(d_Axz); + free_device(d_Ayy); free_device(d_Ayz); free_device(d_Azz); + + return ok ? 0 : 1; +} + +int bssn_cuda_rk4_boundary_var(int *ex, double dT, + const double *X, const double *Y, const double *Z, + double xmin, double ymin, double zmin, + double xmax, double ymax, double zmax, + const double *state0, + const double *boundary_src, + double *stage_data, + double *rhs_accum, + double propspeed, + const double SoA[3], + int symmetry, + int lev, + int rk_stage) +{ + int nx = ex[0]; + int ny = ex[1]; + int nz = ex[2]; + int n = count_points(ex); + const size_t bytes = static_cast(n) * sizeof(double); + const size_t bytes_x = static_cast(nx) * sizeof(double); + const size_t bytes_y = static_cast(ny) * sizeof(double); + const size_t bytes_z = static_cast(nz) * sizeof(double); + dim3 block(256); + dim3 grid(div_up(n, static_cast(block.x))); + + double *d_X = nullptr, *d_Y = nullptr, *d_Z = nullptr; + double *d_state0 = nullptr, *d_boundary = nullptr, *d_stage = nullptr, *d_rhs = nullptr; + + bool ok = copy_to_device(d_X, X, bytes_x) && + copy_to_device(d_Y, Y, bytes_y) && + copy_to_device(d_Z, Z, bytes_z) && + copy_to_device(d_state0, state0, bytes) && + copy_to_device(d_boundary, boundary_src, bytes) && + copy_to_device(d_stage, stage_data, bytes) && + copy_to_device(d_rhs, rhs_accum, bytes); + + if (!ok) + { + free_device(d_X); free_device(d_Y); free_device(d_Z); + free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs); + return 1; + } + + double dX = X[1] - X[0]; + double dY = Y[1] - Y[0]; + double dZ = Z[1] - Z[0]; + const int no_symm = 0, eq_symm = 1, octant = 2; + int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX); + int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY); + int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ); + int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0); + int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0); + int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0); + double soa0 = SoA[0]; + double soa1 = SoA[1]; + double soa2 = SoA[2]; + + if (lev == 0) + { + int imin = 1; + int jmin = 1; + int kmin = 1; + if (symmetry > no_symm && std::fabs(Z[0]) < dZ) kmin = 0; + if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0; + if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0; + + double *bam_target = (rk_stage == 0) ? d_rhs : d_stage; + const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary; + void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z, + &xmin, &ymin, &zmin, &xmax, &ymax, &zmax, + &has_xmin, &has_ymin, &has_zmin, + &has_xmax, &has_ymax, &has_zmax, + &imin, &jmin, &kmin, &propspeed, + &bam_source, &bam_target, + &soa0, &soa1, &soa2, + &symmetry}; + ok = launch_and_sync(grid, block, (const void *)sommerfeld_bam_kernel, args); + } + + if (ok) + { + void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage}; + ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args); + } + + if (ok && lev > 0) + { + void *args[] = {&nx, &ny, &nz, + &has_xmin, &has_ymin, &has_zmin, + &has_xmax, &has_ymax, &has_zmax, + &d_state0, &d_stage}; + ok = launch_and_sync(grid, block, (const void *)copy_physical_boundary_kernel, args); + } + + if (ok) + { + cudaError_t err = cudaMemcpy(stage_data, d_stage, bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err); + ok = err == cudaSuccess; + if (ok) + { + err = cudaMemcpy(rhs_accum, d_rhs, bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err); + ok = err == cudaSuccess; + } + } + + free_device(d_X); free_device(d_Y); free_device(d_Z); + free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs); + return ok ? 0 : 1; +} + +int bssn_cuda_lowerbound(int *ex, double *chi, double tinny) +{ + int n = count_points(ex); + const size_t bytes = static_cast(n) * sizeof(double); + dim3 block(256); + dim3 grid(div_up(n, static_cast(block.x))); + + double *d_chi = nullptr; + bool ok = copy_to_device(d_chi, chi, bytes); + + if (ok) + { + void *args[] = {&n, &d_chi, &tinny}; + ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args); + } + + if (ok) + { + cudaError_t err = cudaMemcpy(chi, d_chi, bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err); + ok = err == cudaSuccess; + } + + free_device(d_chi); + return ok ? 0 : 1; +} diff --git a/AMSS_NCKU_source/bssn_cuda_ops.h b/AMSS_NCKU_source/bssn_cuda_ops.h new file mode 100644 index 0000000..e45bcaa --- /dev/null +++ b/AMSS_NCKU_source/bssn_cuda_ops.h @@ -0,0 +1,26 @@ +#ifndef BSSN_CUDA_OPS_H +#define BSSN_CUDA_OPS_H + +int bssn_cuda_enforce_ga(int *ex, + double *dxx, double *gxy, double *gxz, + double *dyy, double *gyz, double *dzz, + double *Axx, double *Axy, double *Axz, + double *Ayy, double *Ayz, double *Azz); + +int bssn_cuda_rk4_boundary_var(int *ex, double dT, + const double *X, const double *Y, const double *Z, + double xmin, double ymin, double zmin, + double xmax, double ymax, double zmax, + const double *state0, + const double *boundary_src, + double *stage_data, + double *rhs_accum, + double propspeed, + const double SoA[3], + int symmetry, + int lev, + int rk_stage); + +int bssn_cuda_lowerbound(int *ex, double *chi, double tinny); + +#endif diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C new file mode 100644 index 0000000..e713a83 --- /dev/null +++ b/AMSS_NCKU_source/bssn_cuda_step.C @@ -0,0 +1,364 @@ +#include "macrodef.h" + +#ifdef USE_GPU + +#include +#include + +#include "bssn_class.h" +#include "bssn_cuda_ops.h" +#include "bssn_gpu.h" +#include "bssn_macro.h" +#include "rungekutta4_rout.h" + +void bssn_class::Step_MainPath_GPU(int lev, int YN) +{ +#ifdef WithShell +#error "Step_MainPath_GPU currently supports Patch grids only." +#endif + + setpbh(BH_num, Porg0, Mass, BH_num_input); + + const double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); + +#if (MAPBH == 1) + if (BH_num > 0 && lev == GH->levels - 1) + { + compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + for (int ith = 0; ith < 3; ith++) + Porg1[ithBH][ith] = Porg0[ithBH][ith] + Porg_rhs[ithBH][ith] * dT_lev; + if (Symmetry > 0) + Porg1[ithBH][2] = fabs(Porg1[ithBH][2]); + if (Symmetry == 2) + { + Porg1[ithBH][0] = fabs(Porg1[ithBH][0]); + Porg1[ithBH][1] = fabs(Porg1[ithBH][1]); + } + } + } + + if (lev == a_lev) + AnalysisStuff(lev, dT_lev); +#endif + +#ifdef With_AHF + AH_Step_Find(lev, dT_lev); +#endif + + const bool BB = fgt(PhysTime, StartTime, dT_lev / 2); + (void)BB; + double ndeps = (lev < GH->movls) ? numepsb : numepss; + double TRK4 = PhysTime; + int iter_count = 0; + int pre = 0, cor = 1; + int ERROR = 0; + + auto run_stage_on_block = + [&](Block *cg, Patch *patch, MyList *state0_list, + MyList *boundary_src_list, MyList *stage_data_list, + MyList *rhs_list, int rk_stage) { + MyList *varl0 = state0_list; + MyList *varlb = boundary_src_list; + MyList *varls = stage_data_list; + MyList *varlr = rhs_list; + + while (varl0) + { + if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev, + cg->X[0], cg->X[1], cg->X[2], + patch->bbox[0], patch->bbox[1], patch->bbox[2], + patch->bbox[3], patch->bbox[4], patch->bbox[5], + cg->fgfs[varl0->data->sgfn], + cg->fgfs[varlb->data->sgfn], + cg->fgfs[varls->data->sgfn], + cg->fgfs[varlr->data->sgfn], + varl0->data->propspeed, + varl0->data->SoA, + Symmetry, lev, rk_stage)) + { + cerr << "GPU rk4/boundary failure: lev=" << lev + << " rk_stage=" << rk_stage + << " var=" << varl0->data->name + << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + break; + } + varl0 = varl0->next; + varlb = varlb->next; + varls = varls->next; + varlr = varlr->next; + } + }; + + MyList *Pp = GH->PatL[lev]; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { +#if (AGM == 0) + if (bssn_cuda_enforce_ga(cg->shape, + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], + cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], + cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn])) + { + cerr << "GPU enforce_ga failure: lev=" << lev + << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } +#endif + + if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME)) + ERROR = 1; + + run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count); + + if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny)) + { + cerr << "GPU lowerbound failure: lev=" << lev + << " rk_stage=" << iter_count + << " var=" << phi->name + << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + + MPI_Request err_req_pre; + { + int erh = ERROR; + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre); + } + + Parallel::AsyncSyncState async_pre; + Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); + Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); + + MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE); + if (ERROR) + { + Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); + if (myrank == 0) + { + if (ErrorMonitor->outfile) + ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime + << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + +#if (MAPBH == 0) + if (BH_num > 0 && lev == GH->levels - 1) + { + compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count); + if (Symmetry > 0) + Porg[ithBH][2] = fabs(Porg[ithBH][2]); + if (Symmetry == 2) + { + Porg[ithBH][0] = fabs(Porg[ithBH][0]); + Porg[ithBH][1] = fabs(Porg[ithBH][1]); + } + } + } + + if (lev == a_lev) + AnalysisStuff(lev, dT_lev); +#endif + + for (iter_count = 1; iter_count < 4; iter_count++) + { + if (iter_count == 1 || iter_count == 3) + TRK4 += dT_lev / 2; + + Pp = GH->PatL[lev]; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { +#if (AGM == 0) + if (bssn_cuda_enforce_ga(cg->shape, + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], + cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn])) + { + cerr << "GPU enforce_ga failure: lev=" << lev + << " rk_stage=" << iter_count + << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } +#elif (AGM == 1) + if (iter_count == 3 && + bssn_cuda_enforce_ga(cg->shape, + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], + cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn])) + { + cerr << "GPU enforce_ga failure: lev=" << lev + << " rk_stage=" << iter_count + << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } +#endif + + if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN)) + ERROR = 1; + + run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count); + + if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny)) + { + cerr << "GPU lowerbound failure: lev=" << lev + << " rk_stage=" << iter_count + << " var=" << phi1->name + << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + } + + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + + MPI_Request err_req_cor; + { + int erh = ERROR; + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); + } + + Parallel::AsyncSyncState async_cor; + Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); + Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); + + MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE); + if (ERROR) + { + Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); + if (myrank == 0) + { + if (ErrorMonitor->outfile) + ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count + << " variables at t = " << PhysTime + << ", lev = " << lev << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + +#if (MAPBH == 0) + if (BH_num > 0 && lev == GH->levels - 1) + { + compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev); + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count); + f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count); + if (Symmetry > 0) + Porg1[ithBH][2] = fabs(Porg1[ithBH][2]); + if (Symmetry == 2) + { + Porg1[ithBH][0] = fabs(Porg1[ithBH][0]); + Porg1[ithBH][1] = fabs(Porg1[ithBH][1]); + } + } + } +#endif + + if (iter_count < 3) + { + Pp = GH->PatL[lev]; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + BP->data->swapList(SynchList_pre, SynchList_cor, myrank); + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + +#if (MAPBH == 0) + if (BH_num > 0 && lev == GH->levels - 1) + { + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + Porg[ithBH][0] = Porg1[ithBH][0]; + Porg[ithBH][1] = Porg1[ithBH][1]; + Porg[ithBH][2] = Porg1[ithBH][2]; + } + } +#endif + } + } + +#if (RPS == 0) + RestrictProlong(lev, YN, BB); +#endif + + Pp = GH->PatL[lev]; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + cg->swapList(StateList, SynchList_cor, myrank); + cg->swapList(OldStateList, SynchList_cor, myrank); + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } + + if (BH_num > 0 && lev == GH->levels - 1) + { + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + Porg0[ithBH][0] = Porg1[ithBH][0]; + Porg0[ithBH][1] = Porg1[ithBH][1]; + Porg0[ithBH][2] = Porg1[ithBH][2]; + } + } +} + +#endif diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index e67ae18..bf6920c 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -18,12 +18,18 @@ using namespace std; #include #endif -void compare_result_gpu(int ftag1,double * datac,int data_num){ - double * data = (double*)malloc(sizeof(double)*data_num); - cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost); - compare_result(ftag1,data,data_num); - free(data); -} +void compare_result_gpu(int ftag1,double * datac,int data_num){ +#ifdef RESULT_CHECK + double * data = (double*)malloc(sizeof(double)*data_num); + cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost); + compare_result(ftag1,data,data_num); + free(data); +#else + (void)ftag1; + (void)datac; + (void)data_num; +#endif +} __global__ void test_const_address(double * testd){ int _t = blockIdx.x*blockDim.x+threadIdx.x; diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index 72b9cbd..14bfb32 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -105,13 +105,12 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o -C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ - cgh.o surface_integral.o ShellPatch.o\ - bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ - bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ - Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ - NullShellPatch2_Evo.o \ - bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o +C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ + cgh.o bssn_class.o surface_integral.o ShellPatch.o\ + bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ + bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ + Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ + NullShellPatch2_Evo.o bssn_cuda_step.o writefile_f.o F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\ @@ -143,7 +142,7 @@ initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o -CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o +CUDAFILES = bssn_gpu.o bssn_cuda_ops.o # file dependences $(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index 331cff1..09dfe1d 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -9,6 +9,7 @@ filein = -I/usr/include/ -I${MKLROOT}/include ## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 +CUDA_LDLIBS = -L/usr/local/cuda-12.9/targets/x86_64-linux/lib -lcudart ## Memory allocator switch ## 1 (default) : link Intel oneTBB allocator (libtbbmalloc) @@ -24,6 +25,8 @@ ifeq ($(USE_TBBMALLOC),1) LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS) endif +LDLIBS := $(CUDA_LDLIBS) $(LDLIBS) + ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags) ## opt : (default) maximum performance with PGO profile-guided optimization ## instrument : PGO Phase 1 instrumentation to collect fresh profile data diff --git a/makefile_and_run.py b/makefile_and_run.py index 5682476..ac86036 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -9,6 +9,7 @@ import AMSS_NCKU_Input as input_data +import os import subprocess import time @@ -57,6 +58,48 @@ BUILD_JOBS = 64 ################################################################## +################################################################## + +def prepare_gpu_runtime_env(): + """ + Create a user-private CUDA MPS environment for GPU runs. + + On shared machines another user's daemon may already occupy the default + /tmp/nvidia-mps pipe directory, which makes plain cudaSetDevice/cudaMalloc + fail with cudaErrorMpsConnectionFailed. Binding AMSS-NCKU to a private + pipe directory avoids cross-user interference. + """ + env = os.environ.copy() + + pipe_dir = env.get("CUDA_MPS_PIPE_DIRECTORY", f"/tmp/amss-ncku-mps-{os.getuid()}") + log_dir = env.get("CUDA_MPS_LOG_DIRECTORY", f"/tmp/amss-ncku-mps-log-{os.getuid()}") + + os.makedirs(pipe_dir, exist_ok=True) + os.makedirs(log_dir, exist_ok=True) + + env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir + env["CUDA_MPS_LOG_DIRECTORY"] = log_dir + + control_socket = os.path.join(pipe_dir, "control") + if not os.path.exists(control_socket): + start = subprocess.run( + ["nvidia-cuda-mps-control", "-d"], + env=env, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + if start.returncode != 0: + print(f" Warning: failed to start private CUDA MPS daemon in {pipe_dir}") + else: + print(f" Using private CUDA MPS pipe directory: {pipe_dir}") + else: + print(f" Using existing private CUDA MPS pipe directory: {pipe_dir}") + + return env + +################################################################## + + ################################################################## @@ -146,16 +189,29 @@ def run_ABE(): ## Define the command to run; cast other values to strings as needed + run_env = None + if (input_data.GPU_Calculation == "no"): mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" #mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command_outfile = "ABE_out.log" elif (input_data.GPU_Calculation == "yes"): - mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" + run_env = prepare_gpu_runtime_env() + if int(input_data.MPI_processes) == 1: + mpi_command = "./ABEGPU" + else: + mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" mpi_command_outfile = "ABEGPU_out.log" ## Execute the MPI command and stream output - mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + mpi_process = subprocess.Popen( + mpi_command, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + env=run_env, + ) ## Write ABE run output to file while printing to stdout with open(mpi_command_outfile, 'w') as file0: