diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index ca6aab8..1a13b41 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -12,6 +12,38 @@ #include "macrodef.h" #include "bssn_rhs.h" +/* ------------------------------------------------------------------ */ +/* Multi-GPU dispatch: distribute ranks across available GPUs */ +/* ------------------------------------------------------------------ */ +static struct { + int num_gpus; + int my_rank; + int my_device; + bool inited; +} g_dispatch = {0, -1, -1, false}; + +static void init_gpu_dispatch() { + if (g_dispatch.inited) return; + cudaGetDeviceCount(&g_dispatch.num_gpus); + if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1; + + /* Get MPI rank from environment (set by mpirun/mpiexec). */ + const char *rank_env = getenv("PMI_RANK"); + if (!rank_env) rank_env = getenv("OMPI_COMM_WORLD_RANK"); + if (!rank_env) rank_env = getenv("MV2_COMM_WORLD_RANK"); + if (!rank_env) rank_env = getenv("SLURM_PROCID"); + g_dispatch.my_rank = rank_env ? atoi(rank_env) : 0; + + g_dispatch.my_device = g_dispatch.my_rank % g_dispatch.num_gpus; + cudaSetDevice(g_dispatch.my_device); + + if (g_dispatch.my_rank == 0) { + printf("[AMSS-GPU] %d GPU(s) detected, ranks round-robin across devices\n", + g_dispatch.num_gpus); + } + g_dispatch.inited = true; +} + /* ------------------------------------------------------------------ */ /* Error checking */ /* ------------------------------------------------------------------ */ @@ -96,12 +128,13 @@ struct GpuBuffers { double *d_mem; /* single big allocation */ double *d_fh2; /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */ double *d_fh3; /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */ + double *h_stage; /* host staging buffer for bulk H2D/D2H */ double *slot[NUM_SLOTS]; /* pointers into d_mem */ int prev_nx, prev_ny, prev_nz; bool initialized; }; -static GpuBuffers g_buf = { nullptr, nullptr, nullptr, {}, 0, 0, 0, false }; +static GpuBuffers g_buf = { nullptr, nullptr, nullptr, nullptr, {}, 0, 0, 0, false }; /* Slot assignments — INPUT (H2D) */ enum { @@ -162,6 +195,7 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) { if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; } if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; } if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; } + if (g_buf.h_stage) { free(g_buf.h_stage); g_buf.h_stage = nullptr; } size_t all = (size_t)nx * ny * nz; size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2); @@ -171,6 +205,11 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) { CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double))); CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double))); + /* Host staging buffer for bulk H2D/D2H transfers. + Size = max(H2D input slots, D2H output slots) * all doubles. */ + size_t stage_slots = NUM_USED_SLOTS; /* generous upper bound */ + g_buf.h_stage = (double *)malloc(stage_slots * all * sizeof(double)); + for (int s = 0; s < NUM_USED_SLOTS; ++s) g_buf.slot[s] = g_buf.d_mem + s * all; @@ -1946,6 +1985,20 @@ int f_compute_rhs_bssn(int *ex, double &T, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, int &Symmetry, int &Lev, double &eps, int &co) { + /* --- Multi-GPU: select device --- */ + init_gpu_dispatch(); + cudaSetDevice(g_dispatch.my_device); + + /* --- Profiling: cudaEvent timers (rank 0 only, first 20 calls) --- */ + static int prof_call_count = 0; + const bool do_prof = (g_dispatch.my_rank == 0 && prof_call_count < 20); + cudaEvent_t ev_start, ev_h2d, ev_kern, ev_d2h; + if (do_prof) { + cudaEventCreate(&ev_start); cudaEventCreate(&ev_h2d); + cudaEventCreate(&ev_kern); cudaEventCreate(&ev_d2h); + cudaEventRecord(ev_start); + } + const int nx = ex[0], ny = ex[1], nz = ex[2]; const int all = nx * ny * nz; const double dX = X[1]-X[0], dY = Y[1]-Y[0], dZ = Z[1]-Z[0]; @@ -2019,6 +2072,8 @@ int f_compute_rhs_bssn(int *ex, double &T, CUDA_CHECK(cudaMemcpy(D(S_Syz), Syz_m, bytes, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(D(S_Szz), Szz, bytes, cudaMemcpyHostToDevice)); + if (do_prof) cudaEventRecord(ev_h2d); + /* ============================================================ */ /* Phase 1: prep — alpn1, chin1, gxx, gyy, gzz */ /* ============================================================ */ @@ -2401,6 +2456,8 @@ int f_compute_rhs_bssn(int *ex, double &T, D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res)); } + if (do_prof) cudaEventRecord(ev_kern); + /* ============================================================ */ /* D2H: copy all output arrays back to host */ /* ============================================================ */ @@ -2428,8 +2485,6 @@ int f_compute_rhs_bssn(int *ex, double &T, CUDA_CHECK(cudaMemcpy(dtSfx_rhs, D(S_dtSfx_rhs), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(dtSfy_rhs, D(S_dtSfy_rhs), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(dtSfz_rhs, D(S_dtSfz_rhs), bytes, cudaMemcpyDeviceToHost)); - - /* Christoffel symbols */ CUDA_CHECK(cudaMemcpy(Gamxxx, D(S_Gamxxx), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Gamxxy, D(S_Gamxxy), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Gamxxz, D(S_Gamxxz), bytes, cudaMemcpyDeviceToHost)); @@ -2448,16 +2503,12 @@ int f_compute_rhs_bssn(int *ex, double &T, CUDA_CHECK(cudaMemcpy(Gamzyy, D(S_Gamzyy), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Gamzyz, D(S_Gamzyz), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Gamzzz, D(S_Gamzzz), bytes, cudaMemcpyDeviceToHost)); - - /* Ricci tensor */ CUDA_CHECK(cudaMemcpy(Rxx, D(S_Rxx), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Rxy, D(S_Rxy), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Rxz, D(S_Rxz), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Ryy, D(S_Ryy), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Ryz, D(S_Ryz), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(Rzz, D(S_Rzz), bytes, cudaMemcpyDeviceToHost)); - - /* Constraint residuals (only meaningful when co==0) */ if (co == 0) { CUDA_CHECK(cudaMemcpy(ham_Res, D(S_ham_Res), bytes, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(movx_Res, D(S_movx_Res), bytes, cudaMemcpyDeviceToHost)); @@ -2468,6 +2519,22 @@ int f_compute_rhs_bssn(int *ex, double &T, CUDA_CHECK(cudaMemcpy(Gmz_Res, D(S_Gmz_Res), bytes, cudaMemcpyDeviceToHost)); } + if (do_prof) { + cudaEventRecord(ev_d2h); + cudaEventSynchronize(ev_d2h); + float t_h2d, t_kern, t_d2h; + cudaEventElapsedTime(&t_h2d, ev_start, ev_h2d); + cudaEventElapsedTime(&t_kern, ev_h2d, ev_kern); + cudaEventElapsedTime(&t_d2h, ev_kern, ev_d2h); + printf("[AMSS-PROF] call#%d nx=%d ny=%d nz=%d(all=%d) " + "H2D=%.3fms Kern=%.3fms D2H=%.3fms Total=%.3fms\n", + prof_call_count, nx, ny, nz, all, + t_h2d, t_kern, t_d2h, t_h2d + t_kern + t_d2h); + cudaEventDestroy(ev_start); cudaEventDestroy(ev_h2d); + cudaEventDestroy(ev_kern); cudaEventDestroy(ev_d2h); + prof_call_count++; + } + #undef D return 0; } \ No newline at end of file diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index 5f13b89..b72ee4d 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -42,4 +42,4 @@ CLINKER = mpiicpx Cu = nvcc CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include #CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc -CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=sm_89 +CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=sm_80