[WIP] Implement multi-GPU support in BSSN RHS and add profiling for H2D/D2H transfers
This commit is contained in:
@@ -12,6 +12,38 @@
|
||||
#include "macrodef.h"
|
||||
#include "bssn_rhs.h"
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Multi-GPU dispatch: distribute ranks across available GPUs */
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* ------------------------------------------------------------------ */
/* Multi-GPU dispatch: distribute ranks across available GPUs         */
/* ------------------------------------------------------------------ */
static struct {
    int num_gpus;    /* device count from the CUDA runtime; clamped to >= 1 */
    int my_rank;     /* MPI rank parsed from the launcher environment, 0 if unknown */
    int my_device;   /* CUDA device this rank binds to: my_rank % num_gpus */
    bool inited;     /* one-shot guard: dispatch is computed only on the first call */
} g_dispatch = {0, -1, -1, false};

/* Bind the calling process to a GPU, round-robin by MPI rank.
 *
 * The rank is read from the environment rather than via MPI calls, so this
 * works regardless of whether MPI_Init has run yet.  Variables are probed
 * in order: PMI_RANK (Intel MPI / MPICH), OMPI_COMM_WORLD_RANK (Open MPI),
 * MV2_COMM_WORLD_RANK (MVAPICH2), SLURM_PROCID (srun).  If none is set
 * (e.g. a serial run) the process is treated as rank 0.
 *
 * Idempotent: only the first call does any work.
 * NOTE(review): not thread-safe; assumed to be called from a single host
 * thread, consistent with the rest of this file.
 */
static void init_gpu_dispatch() {
    if (g_dispatch.inited) return;

    /* Treat a failed device query the same as "no devices": the clamp
       below falls back to a single nominal device. */
    if (cudaGetDeviceCount(&g_dispatch.num_gpus) != cudaSuccess)
        g_dispatch.num_gpus = 0;
    if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1;

    /* Get MPI rank from environment (set by mpirun/mpiexec/srun). */
    const char *rank_env = getenv("PMI_RANK");
    if (!rank_env) rank_env = getenv("OMPI_COMM_WORLD_RANK");
    if (!rank_env) rank_env = getenv("MV2_COMM_WORLD_RANK");
    if (!rank_env) rank_env = getenv("SLURM_PROCID");
    g_dispatch.my_rank = rank_env ? atoi(rank_env) : 0;

    /* Round-robin ranks across devices.
       NOTE(review): assumes ranks are packed per node, so that
       rank % num_gpus balances GPUs locally — confirm against the
       launcher's rank-to-node mapping policy. */
    g_dispatch.my_device = g_dispatch.my_rank % g_dispatch.num_gpus;
    if (cudaSetDevice(g_dispatch.my_device) != cudaSuccess) {
        /* Do not continue silently on a wrong device: warn and fall
           back to device 0, which always exists when any GPU does. */
        fprintf(stderr,
                "[AMSS-GPU] rank %d: cudaSetDevice(%d) failed, falling back to device 0\n",
                g_dispatch.my_rank, g_dispatch.my_device);
        g_dispatch.my_device = 0;
        cudaSetDevice(0);
    }

    if (g_dispatch.my_rank == 0) {
        printf("[AMSS-GPU] %d GPU(s) detected, ranks round-robin across devices\n",
               g_dispatch.num_gpus);
    }
    g_dispatch.inited = true;
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Error checking */
|
||||
/* ------------------------------------------------------------------ */
|
||||
@@ -96,12 +128,13 @@ struct GpuBuffers {
|
||||
double *d_mem; /* single big allocation */
|
||||
double *d_fh2; /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */
|
||||
double *d_fh3; /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */
|
||||
double *h_stage; /* host staging buffer for bulk H2D/D2H */
|
||||
double *slot[NUM_SLOTS]; /* pointers into d_mem */
|
||||
int prev_nx, prev_ny, prev_nz;
|
||||
bool initialized;
|
||||
};
|
||||
|
||||
static GpuBuffers g_buf = { nullptr, nullptr, nullptr, {}, 0, 0, 0, false };
|
||||
static GpuBuffers g_buf = { nullptr, nullptr, nullptr, nullptr, {}, 0, 0, 0, false };
|
||||
|
||||
/* Slot assignments — INPUT (H2D) */
|
||||
enum {
|
||||
@@ -162,6 +195,7 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
|
||||
if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
|
||||
if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
|
||||
if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }
|
||||
if (g_buf.h_stage) { free(g_buf.h_stage); g_buf.h_stage = nullptr; }
|
||||
|
||||
size_t all = (size_t)nx * ny * nz;
|
||||
size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2);
|
||||
@@ -171,6 +205,11 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
|
||||
CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double)));
|
||||
CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double)));
|
||||
|
||||
/* Host staging buffer for bulk H2D/D2H transfers.
|
||||
Size = max(H2D input slots, D2H output slots) * all doubles. */
|
||||
size_t stage_slots = NUM_USED_SLOTS; /* generous upper bound */
|
||||
g_buf.h_stage = (double *)malloc(stage_slots * all * sizeof(double));
|
||||
|
||||
for (int s = 0; s < NUM_USED_SLOTS; ++s)
|
||||
g_buf.slot[s] = g_buf.d_mem + s * all;
|
||||
|
||||
@@ -1946,6 +1985,20 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
|
||||
int &Symmetry, int &Lev, double &eps, int &co)
|
||||
{
|
||||
/* --- Multi-GPU: select device --- */
|
||||
init_gpu_dispatch();
|
||||
cudaSetDevice(g_dispatch.my_device);
|
||||
|
||||
/* --- Profiling: cudaEvent timers (rank 0 only, first 20 calls) --- */
|
||||
static int prof_call_count = 0;
|
||||
const bool do_prof = (g_dispatch.my_rank == 0 && prof_call_count < 20);
|
||||
cudaEvent_t ev_start, ev_h2d, ev_kern, ev_d2h;
|
||||
if (do_prof) {
|
||||
cudaEventCreate(&ev_start); cudaEventCreate(&ev_h2d);
|
||||
cudaEventCreate(&ev_kern); cudaEventCreate(&ev_d2h);
|
||||
cudaEventRecord(ev_start);
|
||||
}
|
||||
|
||||
const int nx = ex[0], ny = ex[1], nz = ex[2];
|
||||
const int all = nx * ny * nz;
|
||||
const double dX = X[1]-X[0], dY = Y[1]-Y[0], dZ = Z[1]-Z[0];
|
||||
@@ -2019,6 +2072,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
CUDA_CHECK(cudaMemcpy(D(S_Syz), Syz_m, bytes, cudaMemcpyHostToDevice));
|
||||
CUDA_CHECK(cudaMemcpy(D(S_Szz), Szz, bytes, cudaMemcpyHostToDevice));
|
||||
|
||||
if (do_prof) cudaEventRecord(ev_h2d);
|
||||
|
||||
/* ============================================================ */
|
||||
/* Phase 1: prep — alpn1, chin1, gxx, gyy, gzz */
|
||||
/* ============================================================ */
|
||||
@@ -2401,6 +2456,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res));
|
||||
}
|
||||
|
||||
if (do_prof) cudaEventRecord(ev_kern);
|
||||
|
||||
/* ============================================================ */
|
||||
/* D2H: copy all output arrays back to host */
|
||||
/* ============================================================ */
|
||||
@@ -2428,8 +2485,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
CUDA_CHECK(cudaMemcpy(dtSfx_rhs, D(S_dtSfx_rhs), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(dtSfy_rhs, D(S_dtSfy_rhs), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(dtSfz_rhs, D(S_dtSfz_rhs), bytes, cudaMemcpyDeviceToHost));
|
||||
|
||||
/* Christoffel symbols */
|
||||
CUDA_CHECK(cudaMemcpy(Gamxxx, D(S_Gamxxx), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Gamxxy, D(S_Gamxxy), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Gamxxz, D(S_Gamxxz), bytes, cudaMemcpyDeviceToHost));
|
||||
@@ -2448,16 +2503,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
CUDA_CHECK(cudaMemcpy(Gamzyy, D(S_Gamzyy), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Gamzyz, D(S_Gamzyz), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Gamzzz, D(S_Gamzzz), bytes, cudaMemcpyDeviceToHost));
|
||||
|
||||
/* Ricci tensor */
|
||||
CUDA_CHECK(cudaMemcpy(Rxx, D(S_Rxx), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Rxy, D(S_Rxy), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Rxz, D(S_Rxz), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Ryy, D(S_Ryy), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Ryz, D(S_Ryz), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Rzz, D(S_Rzz), bytes, cudaMemcpyDeviceToHost));
|
||||
|
||||
/* Constraint residuals (only meaningful when co==0) */
|
||||
if (co == 0) {
|
||||
CUDA_CHECK(cudaMemcpy(ham_Res, D(S_ham_Res), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(movx_Res, D(S_movx_Res), bytes, cudaMemcpyDeviceToHost));
|
||||
@@ -2468,6 +2519,22 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
CUDA_CHECK(cudaMemcpy(Gmz_Res, D(S_Gmz_Res), bytes, cudaMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
if (do_prof) {
|
||||
cudaEventRecord(ev_d2h);
|
||||
cudaEventSynchronize(ev_d2h);
|
||||
float t_h2d, t_kern, t_d2h;
|
||||
cudaEventElapsedTime(&t_h2d, ev_start, ev_h2d);
|
||||
cudaEventElapsedTime(&t_kern, ev_h2d, ev_kern);
|
||||
cudaEventElapsedTime(&t_d2h, ev_kern, ev_d2h);
|
||||
printf("[AMSS-PROF] call#%d nx=%d ny=%d nz=%d(all=%d) "
|
||||
"H2D=%.3fms Kern=%.3fms D2H=%.3fms Total=%.3fms\n",
|
||||
prof_call_count, nx, ny, nz, all,
|
||||
t_h2d, t_kern, t_d2h, t_h2d + t_kern + t_d2h);
|
||||
cudaEventDestroy(ev_start); cudaEventDestroy(ev_h2d);
|
||||
cudaEventDestroy(ev_kern); cudaEventDestroy(ev_d2h);
|
||||
prof_call_count++;
|
||||
}
|
||||
|
||||
#undef D
|
||||
return 0;
|
||||
}
|
||||
@@ -42,4 +42,4 @@ CLINKER = mpiicpx
|
||||
Cu = nvcc
|
||||
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
||||
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
|
||||
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=sm_89
|
||||
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=sm_80
|
||||
|
||||
Reference in New Issue
Block a user