[WIP] Implement multi-GPU support in BSSN RHS and add profiling for H2D/D2H transfers
This commit is contained in:
@@ -12,6 +12,38 @@
|
||||
#include "macrodef.h"
|
||||
#include "bssn_rhs.h"
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Multi-GPU dispatch: distribute ranks across available GPUs */
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* ------------------------------------------------------------------ */
/* Multi-GPU dispatch: distribute ranks across available GPUs         */
/* ------------------------------------------------------------------ */
static struct {
    int num_gpus;    /* device count from the CUDA runtime; clamped to >= 1 */
    int my_rank;     /* MPI rank parsed from the launcher environment, 0 if unknown */
    int my_device;   /* CUDA device this rank binds to: my_rank % num_gpus */
    bool inited;     /* one-shot guard: dispatch is computed only on the first call */
} g_dispatch = {0, -1, -1, false};

/* Bind the calling process to a GPU, round-robin by MPI rank.
 *
 * The rank is read from the environment rather than via MPI calls, so this
 * works regardless of whether MPI_Init has run yet.  Variables are probed
 * in order: PMI_RANK (Intel MPI / MPICH), OMPI_COMM_WORLD_RANK (Open MPI),
 * MV2_COMM_WORLD_RANK (MVAPICH2), SLURM_PROCID (srun).  If none is set
 * (e.g. a serial run) the process is treated as rank 0.
 *
 * Idempotent: only the first call does any work.
 * NOTE(review): not thread-safe; assumed to be called from a single host
 * thread, consistent with the rest of this file.
 */
static void init_gpu_dispatch() {
    if (g_dispatch.inited) return;

    /* Treat a failed device query the same as "no devices": the clamp
       below falls back to a single nominal device. */
    if (cudaGetDeviceCount(&g_dispatch.num_gpus) != cudaSuccess)
        g_dispatch.num_gpus = 0;
    if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1;

    /* Get MPI rank from environment (set by mpirun/mpiexec/srun). */
    const char *rank_env = getenv("PMI_RANK");
    if (!rank_env) rank_env = getenv("OMPI_COMM_WORLD_RANK");
    if (!rank_env) rank_env = getenv("MV2_COMM_WORLD_RANK");
    if (!rank_env) rank_env = getenv("SLURM_PROCID");
    g_dispatch.my_rank = rank_env ? atoi(rank_env) : 0;

    /* Round-robin ranks across devices.
       NOTE(review): assumes ranks are packed per node, so that
       rank % num_gpus balances GPUs locally — confirm against the
       launcher's rank-to-node mapping policy. */
    g_dispatch.my_device = g_dispatch.my_rank % g_dispatch.num_gpus;
    if (cudaSetDevice(g_dispatch.my_device) != cudaSuccess) {
        /* Do not continue silently on a wrong device: warn and fall
           back to device 0, which always exists when any GPU does. */
        fprintf(stderr,
                "[AMSS-GPU] rank %d: cudaSetDevice(%d) failed, falling back to device 0\n",
                g_dispatch.my_rank, g_dispatch.my_device);
        g_dispatch.my_device = 0;
        cudaSetDevice(0);
    }

    if (g_dispatch.my_rank == 0) {
        printf("[AMSS-GPU] %d GPU(s) detected, ranks round-robin across devices\n",
               g_dispatch.num_gpus);
    }
    g_dispatch.inited = true;
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Error checking */
|
||||
/* ------------------------------------------------------------------ */
|
||||
@@ -96,12 +128,13 @@ struct GpuBuffers {
|
||||
double *d_mem; /* single big allocation */
|
||||
double *d_fh2; /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */
|
||||
double *d_fh3; /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */
|
||||
double *h_stage; /* host staging buffer for bulk H2D/D2H */
|
||||
double *slot[NUM_SLOTS]; /* pointers into d_mem */
|
||||
int prev_nx, prev_ny, prev_nz;
|
||||
bool initialized;
|
||||
};
|
||||
|
||||
static GpuBuffers g_buf = { nullptr, nullptr, nullptr, {}, 0, 0, 0, false };
|
||||
static GpuBuffers g_buf = { nullptr, nullptr, nullptr, nullptr, {}, 0, 0, 0, false };
|
||||
|
||||
/* Slot assignments — INPUT (H2D) */
|
||||
enum {
|
||||
@@ -162,6 +195,7 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
|
||||
if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
|
||||
if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
|
||||
if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }
|
||||
if (g_buf.h_stage) { free(g_buf.h_stage); g_buf.h_stage = nullptr; }
|
||||
|
||||
size_t all = (size_t)nx * ny * nz;
|
||||
size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2);
|
||||
@@ -171,6 +205,11 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
|
||||
CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double)));
|
||||
CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double)));
|
||||
|
||||
/* Host staging buffer for bulk H2D/D2H transfers.
|
||||
Size = max(H2D input slots, D2H output slots) * all doubles. */
|
||||
size_t stage_slots = NUM_USED_SLOTS; /* generous upper bound */
|
||||
g_buf.h_stage = (double *)malloc(stage_slots * all * sizeof(double));
|
||||
|
||||
for (int s = 0; s < NUM_USED_SLOTS; ++s)
|
||||
g_buf.slot[s] = g_buf.d_mem + s * all;
|
||||
|
||||
@@ -1946,6 +1985,20 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
|
||||
int &Symmetry, int &Lev, double &eps, int &co)
|
||||
{
|
||||
/* --- Multi-GPU: select device --- */
|
||||
init_gpu_dispatch();
|
||||
cudaSetDevice(g_dispatch.my_device);
|
||||
|
||||
/* --- Profiling: cudaEvent timers (rank 0 only, first 20 calls) --- */
|
||||
static int prof_call_count = 0;
|
||||
const bool do_prof = (g_dispatch.my_rank == 0 && prof_call_count < 20);
|
||||
cudaEvent_t ev_start, ev_h2d, ev_kern, ev_d2h;
|
||||
if (do_prof) {
|
||||
cudaEventCreate(&ev_start); cudaEventCreate(&ev_h2d);
|
||||
cudaEventCreate(&ev_kern); cudaEventCreate(&ev_d2h);
|
||||
cudaEventRecord(ev_start);
|
||||
}
|
||||
|
||||
const int nx = ex[0], ny = ex[1], nz = ex[2];
|
||||
const int all = nx * ny * nz;
|
||||
const double dX = X[1]-X[0], dY = Y[1]-Y[0], dZ = Z[1]-Z[0];
|
||||
@@ -2019,6 +2072,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
CUDA_CHECK(cudaMemcpy(D(S_Syz), Syz_m, bytes, cudaMemcpyHostToDevice));
|
||||
CUDA_CHECK(cudaMemcpy(D(S_Szz), Szz, bytes, cudaMemcpyHostToDevice));
|
||||
|
||||
if (do_prof) cudaEventRecord(ev_h2d);
|
||||
|
||||
/* ============================================================ */
|
||||
/* Phase 1: prep — alpn1, chin1, gxx, gyy, gzz */
|
||||
/* ============================================================ */
|
||||
@@ -2401,6 +2456,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res));
|
||||
}
|
||||
|
||||
if (do_prof) cudaEventRecord(ev_kern);
|
||||
|
||||
/* ============================================================ */
|
||||
/* D2H: copy all output arrays back to host */
|
||||
/* ============================================================ */
|
||||
@@ -2428,8 +2485,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
CUDA_CHECK(cudaMemcpy(dtSfx_rhs, D(S_dtSfx_rhs), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(dtSfy_rhs, D(S_dtSfy_rhs), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(dtSfz_rhs, D(S_dtSfz_rhs), bytes, cudaMemcpyDeviceToHost));
|
||||
|
||||
/* Christoffel symbols */
|
||||
CUDA_CHECK(cudaMemcpy(Gamxxx, D(S_Gamxxx), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Gamxxy, D(S_Gamxxy), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Gamxxz, D(S_Gamxxz), bytes, cudaMemcpyDeviceToHost));
|
||||
@@ -2448,16 +2503,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
CUDA_CHECK(cudaMemcpy(Gamzyy, D(S_Gamzyy), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Gamzyz, D(S_Gamzyz), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Gamzzz, D(S_Gamzzz), bytes, cudaMemcpyDeviceToHost));
|
||||
|
||||
/* Ricci tensor */
|
||||
CUDA_CHECK(cudaMemcpy(Rxx, D(S_Rxx), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Rxy, D(S_Rxy), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Rxz, D(S_Rxz), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Ryy, D(S_Ryy), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Ryz, D(S_Ryz), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(Rzz, D(S_Rzz), bytes, cudaMemcpyDeviceToHost));
|
||||
|
||||
/* Constraint residuals (only meaningful when co==0) */
|
||||
if (co == 0) {
|
||||
CUDA_CHECK(cudaMemcpy(ham_Res, D(S_ham_Res), bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpy(movx_Res, D(S_movx_Res), bytes, cudaMemcpyDeviceToHost));
|
||||
@@ -2468,6 +2519,22 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
CUDA_CHECK(cudaMemcpy(Gmz_Res, D(S_Gmz_Res), bytes, cudaMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
if (do_prof) {
|
||||
cudaEventRecord(ev_d2h);
|
||||
cudaEventSynchronize(ev_d2h);
|
||||
float t_h2d, t_kern, t_d2h;
|
||||
cudaEventElapsedTime(&t_h2d, ev_start, ev_h2d);
|
||||
cudaEventElapsedTime(&t_kern, ev_h2d, ev_kern);
|
||||
cudaEventElapsedTime(&t_d2h, ev_kern, ev_d2h);
|
||||
printf("[AMSS-PROF] call#%d nx=%d ny=%d nz=%d(all=%d) "
|
||||
"H2D=%.3fms Kern=%.3fms D2H=%.3fms Total=%.3fms\n",
|
||||
prof_call_count, nx, ny, nz, all,
|
||||
t_h2d, t_kern, t_d2h, t_h2d + t_kern + t_d2h);
|
||||
cudaEventDestroy(ev_start); cudaEventDestroy(ev_h2d);
|
||||
cudaEventDestroy(ev_kern); cudaEventDestroy(ev_d2h);
|
||||
prof_call_count++;
|
||||
}
|
||||
|
||||
#undef D
|
||||
return 0;
|
||||
}
|
||||
@@ -42,4 +42,4 @@ CLINKER = mpiicpx
|
||||
Cu = nvcc
|
||||
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
||||
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
|
||||
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=sm_89
|
||||
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=sm_80
|
||||
|
||||
Reference in New Issue
Block a user