[WIP] Implement multi-GPU support in BSSN RHS and add profiling for H2D/D2H transfers

This commit is contained in:
2026-02-28 01:21:45 +08:00
parent 724e9cd415
commit d94c31c5c4
2 changed files with 75 additions and 8 deletions

View File

@@ -12,6 +12,38 @@
#include "macrodef.h" #include "macrodef.h"
#include "bssn_rhs.h" #include "bssn_rhs.h"
/* ------------------------------------------------------------------ */
/* Multi-GPU dispatch: distribute ranks across available GPUs */
/* ------------------------------------------------------------------ */
/* Per-process GPU dispatch state: maps this MPI rank to one CUDA device.
   Populated lazily by init_gpu_dispatch(); sentinel values (-1) mean
   "not yet initialized". */
static struct {
int num_gpus;   /* device count from cudaGetDeviceCount, clamped to >= 1 */
int my_rank;    /* MPI rank parsed from launcher env vars (PMI_RANK, ...) */
int my_device;  /* selected device: my_rank % num_gpus */
bool inited;    /* one-shot guard so init runs only once per process */
} g_dispatch = {0, -1, -1, false};
/* One-time per-process setup: detect GPUs, determine this process's MPI
   rank from the launcher environment, and bind the process to device
   (rank % num_gpus).  Safe to call repeatedly; only the first call does
   work.  Errors are reported but non-fatal: on failure we fall back to
   device 0 so a single-GPU / CPU-launcher run still proceeds. */
static void init_gpu_dispatch() {
    if (g_dispatch.inited) return;

    /* Count devices.  Do NOT ignore the return code: a sticky CUDA error
       here would make every later CUDA call fail mysteriously.  Clamp to
       >= 1 so the modulo below stays well-defined. */
    cudaError_t cerr = cudaGetDeviceCount(&g_dispatch.num_gpus);
    if (cerr != cudaSuccess) {
        fprintf(stderr, "[AMSS-GPU] cudaGetDeviceCount failed: %s "
                "(falling back to device 0)\n", cudaGetErrorString(cerr));
        g_dispatch.num_gpus = 1;
    }
    if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1;

    /* Get MPI rank from environment (set by mpirun/mpiexec/srun).
       Checked in order: Intel MPI/MPICH, Open MPI, MVAPICH2, Slurm. */
    const char *rank_env = getenv("PMI_RANK");
    if (!rank_env) rank_env = getenv("OMPI_COMM_WORLD_RANK");
    if (!rank_env) rank_env = getenv("MV2_COMM_WORLD_RANK");
    if (!rank_env) rank_env = getenv("SLURM_PROCID");
    g_dispatch.my_rank = rank_env ? atoi(rank_env) : 0;
    /* Guard against a malformed env value: a negative rank would make the
       modulo negative and cudaSetDevice invalid. */
    if (g_dispatch.my_rank < 0) g_dispatch.my_rank = 0;

    /* Round-robin ranks across the visible devices. */
    g_dispatch.my_device = g_dispatch.my_rank % g_dispatch.num_gpus;
    cerr = cudaSetDevice(g_dispatch.my_device);
    if (cerr != cudaSuccess) {
        fprintf(stderr, "[AMSS-GPU] rank %d: cudaSetDevice(%d) failed: %s\n",
                g_dispatch.my_rank, g_dispatch.my_device,
                cudaGetErrorString(cerr));
    }

    if (g_dispatch.my_rank == 0) {
        printf("[AMSS-GPU] %d GPU(s) detected, ranks round-robin across devices\n",
               g_dispatch.num_gpus);
    }
    g_dispatch.inited = true;
}
/* ------------------------------------------------------------------ */ /* ------------------------------------------------------------------ */
/* Error checking */ /* Error checking */
/* ------------------------------------------------------------------ */ /* ------------------------------------------------------------------ */
@@ -96,12 +128,13 @@ struct GpuBuffers {
double *d_mem; /* single big allocation */ double *d_mem; /* single big allocation */
double *d_fh2; /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */ double *d_fh2; /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */
double *d_fh3; /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */ double *d_fh3; /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */
double *h_stage; /* host staging buffer for bulk H2D/D2H */
double *slot[NUM_SLOTS]; /* pointers into d_mem */ double *slot[NUM_SLOTS]; /* pointers into d_mem */
int prev_nx, prev_ny, prev_nz; int prev_nx, prev_ny, prev_nz;
bool initialized; bool initialized;
}; };
static GpuBuffers g_buf = { nullptr, nullptr, nullptr, {}, 0, 0, 0, false }; static GpuBuffers g_buf = { nullptr, nullptr, nullptr, nullptr, {}, 0, 0, 0, false };
/* Slot assignments — INPUT (H2D) */ /* Slot assignments — INPUT (H2D) */
enum { enum {
@@ -162,6 +195,7 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; } if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; } if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; } if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }
if (g_buf.h_stage) { free(g_buf.h_stage); g_buf.h_stage = nullptr; }
size_t all = (size_t)nx * ny * nz; size_t all = (size_t)nx * ny * nz;
size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2); size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2);
@@ -171,6 +205,11 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double))); CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double)));
CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double))); CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double)));
/* Host staging buffer for bulk H2D/D2H transfers.
Size = max(H2D input slots, D2H output slots) * all doubles. */
size_t stage_slots = NUM_USED_SLOTS; /* generous upper bound */
g_buf.h_stage = (double *)malloc(stage_slots * all * sizeof(double));
for (int s = 0; s < NUM_USED_SLOTS; ++s) for (int s = 0; s < NUM_USED_SLOTS; ++s)
g_buf.slot[s] = g_buf.d_mem + s * all; g_buf.slot[s] = g_buf.d_mem + s * all;
@@ -1946,6 +1985,20 @@ int f_compute_rhs_bssn(int *ex, double &T,
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co) int &Symmetry, int &Lev, double &eps, int &co)
{ {
/* --- Multi-GPU: select device --- */
init_gpu_dispatch();
cudaSetDevice(g_dispatch.my_device);
/* --- Profiling: cudaEvent timers (rank 0 only, first 20 calls) --- */
static int prof_call_count = 0;
const bool do_prof = (g_dispatch.my_rank == 0 && prof_call_count < 20);
cudaEvent_t ev_start, ev_h2d, ev_kern, ev_d2h;
if (do_prof) {
cudaEventCreate(&ev_start); cudaEventCreate(&ev_h2d);
cudaEventCreate(&ev_kern); cudaEventCreate(&ev_d2h);
cudaEventRecord(ev_start);
}
const int nx = ex[0], ny = ex[1], nz = ex[2]; const int nx = ex[0], ny = ex[1], nz = ex[2];
const int all = nx * ny * nz; const int all = nx * ny * nz;
const double dX = X[1]-X[0], dY = Y[1]-Y[0], dZ = Z[1]-Z[0]; const double dX = X[1]-X[0], dY = Y[1]-Y[0], dZ = Z[1]-Z[0];
@@ -2019,6 +2072,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
CUDA_CHECK(cudaMemcpy(D(S_Syz), Syz_m, bytes, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(D(S_Syz), Syz_m, bytes, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(D(S_Szz), Szz, bytes, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(D(S_Szz), Szz, bytes, cudaMemcpyHostToDevice));
if (do_prof) cudaEventRecord(ev_h2d);
/* ============================================================ */ /* ============================================================ */
/* Phase 1: prep — alpn1, chin1, gxx, gyy, gzz */ /* Phase 1: prep — alpn1, chin1, gxx, gyy, gzz */
/* ============================================================ */ /* ============================================================ */
@@ -2401,6 +2456,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res)); D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res));
} }
if (do_prof) cudaEventRecord(ev_kern);
/* ============================================================ */ /* ============================================================ */
/* D2H: copy all output arrays back to host */ /* D2H: copy all output arrays back to host */
/* ============================================================ */ /* ============================================================ */
@@ -2428,8 +2485,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
CUDA_CHECK(cudaMemcpy(dtSfx_rhs, D(S_dtSfx_rhs), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(dtSfx_rhs, D(S_dtSfx_rhs), bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(dtSfy_rhs, D(S_dtSfy_rhs), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(dtSfy_rhs, D(S_dtSfy_rhs), bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(dtSfz_rhs, D(S_dtSfz_rhs), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(dtSfz_rhs, D(S_dtSfz_rhs), bytes, cudaMemcpyDeviceToHost));
/* Christoffel symbols */
CUDA_CHECK(cudaMemcpy(Gamxxx, D(S_Gamxxx), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Gamxxx, D(S_Gamxxx), bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(Gamxxy, D(S_Gamxxy), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Gamxxy, D(S_Gamxxy), bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(Gamxxz, D(S_Gamxxz), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Gamxxz, D(S_Gamxxz), bytes, cudaMemcpyDeviceToHost));
@@ -2448,16 +2503,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
CUDA_CHECK(cudaMemcpy(Gamzyy, D(S_Gamzyy), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Gamzyy, D(S_Gamzyy), bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(Gamzyz, D(S_Gamzyz), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Gamzyz, D(S_Gamzyz), bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(Gamzzz, D(S_Gamzzz), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Gamzzz, D(S_Gamzzz), bytes, cudaMemcpyDeviceToHost));
/* Ricci tensor */
CUDA_CHECK(cudaMemcpy(Rxx, D(S_Rxx), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Rxx, D(S_Rxx), bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(Rxy, D(S_Rxy), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Rxy, D(S_Rxy), bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(Rxz, D(S_Rxz), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Rxz, D(S_Rxz), bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(Ryy, D(S_Ryy), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Ryy, D(S_Ryy), bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(Ryz, D(S_Ryz), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Ryz, D(S_Ryz), bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(Rzz, D(S_Rzz), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Rzz, D(S_Rzz), bytes, cudaMemcpyDeviceToHost));
/* Constraint residuals (only meaningful when co==0) */
if (co == 0) { if (co == 0) {
CUDA_CHECK(cudaMemcpy(ham_Res, D(S_ham_Res), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(ham_Res, D(S_ham_Res), bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(movx_Res, D(S_movx_Res), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(movx_Res, D(S_movx_Res), bytes, cudaMemcpyDeviceToHost));
@@ -2468,6 +2519,22 @@ int f_compute_rhs_bssn(int *ex, double &T,
CUDA_CHECK(cudaMemcpy(Gmz_Res, D(S_Gmz_Res), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Gmz_Res, D(S_Gmz_Res), bytes, cudaMemcpyDeviceToHost));
} }
if (do_prof) {
cudaEventRecord(ev_d2h);
cudaEventSynchronize(ev_d2h);
float t_h2d, t_kern, t_d2h;
cudaEventElapsedTime(&t_h2d, ev_start, ev_h2d);
cudaEventElapsedTime(&t_kern, ev_h2d, ev_kern);
cudaEventElapsedTime(&t_d2h, ev_kern, ev_d2h);
printf("[AMSS-PROF] call#%d nx=%d ny=%d nz=%d(all=%d) "
"H2D=%.3fms Kern=%.3fms D2H=%.3fms Total=%.3fms\n",
prof_call_count, nx, ny, nz, all,
t_h2d, t_kern, t_d2h, t_h2d + t_kern + t_d2h);
cudaEventDestroy(ev_start); cudaEventDestroy(ev_h2d);
cudaEventDestroy(ev_kern); cudaEventDestroy(ev_d2h);
prof_call_count++;
}
#undef D #undef D
return 0; return 0;
} }

View File

@@ -42,4 +42,4 @@ CLINKER = mpiicpx
Cu = nvcc Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc #CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=sm_89 CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=sm_80