diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu
index 1a13b41..94b073a 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -5,44 +5,61 @@
  * Compile with nvcc, link bssn_rhs_cuda.o in place of bssn_rhs_c.o.
  */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "macrodef.h"
-#include "bssn_rhs.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+#include <cuda_runtime.h>
+#include "macrodef.h"
+#include "bssn_rhs.h"
 
 /* ------------------------------------------------------------------ */
 /* Multi-GPU dispatch: distribute ranks across available GPUs         */
 /* ------------------------------------------------------------------ */
-static struct {
-    int num_gpus;
-    int my_rank;
-    int my_device;
-    bool inited;
-} g_dispatch = {0, -1, -1, false};
-
-static void init_gpu_dispatch() {
-    if (g_dispatch.inited) return;
-    cudaGetDeviceCount(&g_dispatch.num_gpus);
-    if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1;
-
-    /* Get MPI rank from environment (set by mpirun/mpiexec). */
-    const char *rank_env = getenv("PMI_RANK");
-    if (!rank_env) rank_env = getenv("OMPI_COMM_WORLD_RANK");
-    if (!rank_env) rank_env = getenv("MV2_COMM_WORLD_RANK");
-    if (!rank_env) rank_env = getenv("SLURM_PROCID");
-    g_dispatch.my_rank = rank_env ? atoi(rank_env) : 0;
-
-    g_dispatch.my_device = g_dispatch.my_rank % g_dispatch.num_gpus;
-    cudaSetDevice(g_dispatch.my_device);
-
-    if (g_dispatch.my_rank == 0) {
-        printf("[AMSS-GPU] %d GPU(s) detected, ranks round-robin across devices\n",
-               g_dispatch.num_gpus);
-    }
-    g_dispatch.inited = true;
-}
+static struct {
+    int num_gpus;
+    int my_rank;
+    int my_local_rank;
+    int my_device;
+    bool inited;
+} g_dispatch = {0, -1, -1, -1, false};
+
+static int env_to_int(const char *name, int fallback = -1) {
+    const char *v = getenv(name);
+    if (!v || !*v) return fallback;
+    return atoi(v);
+}
+
+static void init_gpu_dispatch() {
+    if (g_dispatch.inited) return;
+    cudaError_t err = cudaGetDeviceCount(&g_dispatch.num_gpus);
+    if (err != cudaSuccess) g_dispatch.num_gpus = 1;
+    if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1;
+
+    /* Get the global MPI rank from the environment (set by mpirun/mpiexec). */
+    g_dispatch.my_rank = env_to_int("PMI_RANK",
+                         env_to_int("OMPI_COMM_WORLD_RANK",
+                         env_to_int("MV2_COMM_WORLD_RANK",
+                         env_to_int("SLURM_PROCID", 0))));
+
+    /* Prefer the local rank for per-node GPU mapping (avoids cross-node skew). */
+    g_dispatch.my_local_rank = env_to_int("OMPI_COMM_WORLD_LOCAL_RANK",
+                               env_to_int("MV2_COMM_WORLD_LOCAL_RANK",
+                               env_to_int("MPI_LOCALRANKID",
+                               env_to_int("SLURM_LOCALID", -1))));
+
+    const int rank_for_map = (g_dispatch.my_local_rank >= 0)
+                                 ? g_dispatch.my_local_rank : g_dispatch.my_rank;
+    g_dispatch.my_device = rank_for_map % g_dispatch.num_gpus;
+    cudaSetDevice(g_dispatch.my_device);
+
+    if (g_dispatch.my_rank == 0) {
+        printf("[AMSS-GPU] %d GPU(s) detected, device map uses %s rank\n",
+               g_dispatch.num_gpus,
+               (g_dispatch.my_local_rank >= 0) ? "local" : "global");
+    }
+    g_dispatch.inited = true;
+}
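Review note: the mapping above is self-contained and can be exercised outside the solver. Below is a minimal standalone sketch, not part of the patch: a hypothetical `main` probing the same environment variables, assuming one process per MPI rank and compiling with nvcc.

```cuda
// Standalone sketch of the rank-to-device mapping used in the patch.
// Assumption: one process per MPI rank; env var names as probed above.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static int env_to_int(const char *name, int fallback) {
    const char *v = getenv(name);
    return (v && *v) ? atoi(v) : fallback;
}

int main() {
    int num_gpus = 0;
    if (cudaGetDeviceCount(&num_gpus) != cudaSuccess || num_gpus < 1)
        num_gpus = 1;                       // degrade gracefully with no GPU
    int local  = env_to_int("OMPI_COMM_WORLD_LOCAL_RANK",
                 env_to_int("SLURM_LOCALID", -1));
    int global = env_to_int("OMPI_COMM_WORLD_RANK",
                 env_to_int("SLURM_PROCID", 0));
    int dev = ((local >= 0) ? local : global) % num_gpus;
    cudaSetDevice(dev);
    printf("rank %d (local %d) -> device %d of %d\n",
           global, local, dev, num_gpus);
    return 0;
}
```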
"local" : "global"); + } + g_dispatch.inited = true; +} /* ------------------------------------------------------------------ */ /* Error checking */ @@ -124,17 +141,24 @@ __device__ __forceinline__ int idx_fh3(int iF, int jF, int kF) { /* Total number of "all"-sized slots */ #define NUM_SLOTS 160 -struct GpuBuffers { - double *d_mem; /* single big allocation */ - double *d_fh2; /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */ - double *d_fh3; /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */ - double *h_stage; /* host staging buffer for bulk H2D/D2H */ - double *slot[NUM_SLOTS]; /* pointers into d_mem */ - int prev_nx, prev_ny, prev_nz; - bool initialized; -}; - -static GpuBuffers g_buf = { nullptr, nullptr, nullptr, nullptr, {}, 0, 0, 0, false }; +struct GpuBuffers { + double *d_mem; /* single big allocation */ + double *d_fh2; /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */ + double *d_fh3; /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */ + double *h_stage; /* host staging buffer for bulk H2D/D2H */ + bool h_stage_pinned; /* true if allocated by cudaMallocHost */ + double *slot[NUM_SLOTS]; /* pointers into d_mem */ + size_t cap_all; + size_t cap_fh2_size; + size_t cap_fh3_size; + int prev_nx, prev_ny, prev_nz; + bool initialized; +}; + +static GpuBuffers g_buf = { + nullptr, nullptr, nullptr, nullptr, false, {}, + 0, 0, 0, 0, 0, 0, false +}; /* Slot assignments — INPUT (H2D) */ enum { @@ -185,39 +209,66 @@ enum { NUM_USED_SLOTS }; -static_assert(NUM_USED_SLOTS <= NUM_SLOTS, "Increase NUM_SLOTS"); - -static void ensure_gpu_buffers(int nx, int ny, int nz) { - if (g_buf.initialized && g_buf.prev_nx == nx && - g_buf.prev_ny == ny && g_buf.prev_nz == nz) - return; - - if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; } - if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; } - if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; } - if (g_buf.h_stage) { free(g_buf.h_stage); g_buf.h_stage = nullptr; } - - size_t all = (size_t)nx * ny * nz; - size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2); - size_t fh3_size = (size_t)(nx+3) * (ny+3) * (nz+3); - - CUDA_CHECK(cudaMalloc(&g_buf.d_mem, NUM_USED_SLOTS * all * sizeof(double))); - CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double))); - CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double))); - - /* Host staging buffer for bulk H2D/D2H transfers. - Size = max(H2D input slots, D2H output slots) * all doubles. */ - size_t stage_slots = NUM_USED_SLOTS; /* generous upper bound */ - g_buf.h_stage = (double *)malloc(stage_slots * all * sizeof(double)); - - for (int s = 0; s < NUM_USED_SLOTS; ++s) - g_buf.slot[s] = g_buf.d_mem + s * all; - - g_buf.prev_nx = nx; - g_buf.prev_ny = ny; - g_buf.prev_nz = nz; - g_buf.initialized = true; -} +static_assert(NUM_USED_SLOTS <= NUM_SLOTS, "Increase NUM_SLOTS"); + +static const int H2D_INPUT_SLOT_COUNT = (S_Szz - S_chi + 1); +static const int D2H_BASE_SLOT_COUNT = (S_Rzz - S_chi_rhs + 1); +static const int D2H_CONSTRAINT_SLOT_COUNT = (S_Gmz_Res - S_ham_Res + 1); +static const int STAGE_SLOT_COUNT = + (H2D_INPUT_SLOT_COUNT > (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT)) + ? 
+
+static void ensure_gpu_buffers(int nx, int ny, int nz) {
+    size_t all      = (size_t)nx * ny * nz;
+    size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2);
+    size_t fh3_size = (size_t)(nx+3) * (ny+3) * (nz+3);
+    const bool need_grow = (!g_buf.initialized)
+                        || (all      > g_buf.cap_all)
+                        || (fh2_size > g_buf.cap_fh2_size)
+                        || (fh3_size > g_buf.cap_fh3_size);
+
+    if (need_grow) {
+        if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
+        if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
+        if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }
+        if (g_buf.h_stage) {
+            if (g_buf.h_stage_pinned) cudaFreeHost(g_buf.h_stage);
+            else                      free(g_buf.h_stage);
+            g_buf.h_stage = nullptr;
+            g_buf.h_stage_pinned = false;
+        }
+
+        CUDA_CHECK(cudaMalloc(&g_buf.d_mem, NUM_USED_SLOTS * all * sizeof(double)));
+        CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double)));
+        CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double)));
+
+        /* Prefer pinned host memory for the staging buffer (full-rate DMA);
+           fall back to pageable malloc if pinning fails. */
+        const size_t stage_bytes = (size_t)STAGE_SLOT_COUNT * all * sizeof(double);
+        cudaError_t stage_err = cudaMallocHost((void**)&g_buf.h_stage, stage_bytes);
+        if (stage_err == cudaSuccess) {
+            g_buf.h_stage_pinned = true;
+        } else {
+            g_buf.h_stage = (double *)malloc(stage_bytes);
+            g_buf.h_stage_pinned = false;
+            if (!g_buf.h_stage) {
+                fprintf(stderr, "Host stage allocation failed (%zu bytes)\n",
+                        stage_bytes);
+                exit(EXIT_FAILURE);
+            }
+        }
+
+        g_buf.cap_all      = all;
+        g_buf.cap_fh2_size = fh2_size;
+        g_buf.cap_fh3_size = fh3_size;
+        g_buf.initialized  = true;
+    }
+
+    /* Recompute slot pointers on every call: `all` may shrink within the
+       grown capacity, and the slots must stay packed for the bulk copies. */
+    for (int s = 0; s < NUM_USED_SLOTS; ++s)
+        g_buf.slot[s] = g_buf.d_mem + s * all;
+
+    g_buf.prev_nx = nx;
+    g_buf.prev_ny = ny;
+    g_buf.prev_nz = nz;
+}
 
 /* ================================================================== */
 /* A. Symmetry boundary kernels (ord=2 and ord=3)                     */
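Review note: the pinned-or-pageable fallback above is a standard pattern. A minimal sketch of just that pattern, with an illustrative size, not the patch's staging buffer:

```cuda
// Pinned (page-locked) memory lets cudaMemcpy run at full DMA rate; if
// pinning fails (e.g. locked-memory limits), plain malloc still works.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main() {
    const size_t bytes = 64u << 20;   // 64 MiB staging area (illustrative)
    double *stage = nullptr;
    bool pinned = (cudaMallocHost((void**)&stage, bytes) == cudaSuccess);
    if (!pinned) {
        stage = (double *)malloc(bytes);
        if (!stage) { fprintf(stderr, "stage alloc failed\n"); return 1; }
    }
    printf("staging buffer is %s\n", pinned ? "pinned" : "pageable");
    if (pinned) cudaFreeHost(stage); else free(stage);
    return 0;
}
```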
@@ -753,60 +804,93 @@ void kern_kodis(const double * __restrict__ fh,
 /* ================================================================== */
 /* Host wrapper helpers                                               */
 /* ================================================================== */
-static const int BLK = 128;
-static inline int grid(int n) { return (n + BLK - 1) / BLK; }
+static const int BLK = 128;
+static inline int grid(size_t n) {
+    if (n == 0) return 1;
+    size_t g = (n + BLK - 1) / BLK;
+    if (g > 2147483647u) g = 2147483647u;  /* clamp to INT_MAX */
+    return (int)g;
+}
 
 /* symmetry_bd on GPU for ord=2, then launch fderivs kernel */
 static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
                         double SoA0, double SoA1, double SoA2, int all)
-{
-    double *fh = g_buf.d_fh2;
-    kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
-    kern_symbd_ighost_ord2<<<grid(all), BLK>>>(fh, SoA0);
-    kern_symbd_jghost_ord2<<<grid(all), BLK>>>(fh, SoA1);
-    kern_symbd_kghost_ord2<<<grid(all), BLK>>>(fh, SoA2);
-    kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
-}
+{
+    double *fh = g_buf.d_fh2;
+    const size_t nx = (size_t)g_buf.prev_nx;
+    const size_t ny = (size_t)g_buf.prev_ny;
+    const size_t nz = (size_t)g_buf.prev_nz;
+    /* Exact thread counts for the three ghost faces of the padded box. */
+    const size_t w_ighost = 2ull * ny * nz;
+    const size_t w_jghost = 2ull * (nx + 2ull) * nz;
+    const size_t w_kghost = 2ull * (nx + 2ull) * (ny + 2ull);
+
+    kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
+    kern_symbd_ighost_ord2<<<grid(w_ighost), BLK>>>(fh, SoA0);
+    kern_symbd_jghost_ord2<<<grid(w_jghost), BLK>>>(fh, SoA1);
+    kern_symbd_kghost_ord2<<<grid(w_kghost), BLK>>>(fh, SoA2);
+    kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
+}
 
 /* symmetry_bd on GPU for ord=2, then launch fdderivs kernel */
 static void gpu_fdderivs(double *d_f, double *d_fxx, double *d_fxy, double *d_fxz,
                          double *d_fyy, double *d_fyz, double *d_fzz,
                          double SoA0, double SoA1, double SoA2, int all)
-{
-    double *fh = g_buf.d_fh2;
-    kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
-    kern_symbd_ighost_ord2<<<grid(all), BLK>>>(fh, SoA0);
-    kern_symbd_jghost_ord2<<<grid(all), BLK>>>(fh, SoA1);
-    kern_symbd_kghost_ord2<<<grid(all), BLK>>>(fh, SoA2);
-    kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
-}
+{
+    double *fh = g_buf.d_fh2;
+    const size_t nx = (size_t)g_buf.prev_nx;
+    const size_t ny = (size_t)g_buf.prev_ny;
+    const size_t nz = (size_t)g_buf.prev_nz;
+    const size_t w_ighost = 2ull * ny * nz;
+    const size_t w_jghost = 2ull * (nx + 2ull) * nz;
+    const size_t w_kghost = 2ull * (nx + 2ull) * (ny + 2ull);
+
+    kern_symbd_copy_interior_ord2<<<grid(all), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
+    kern_symbd_ighost_ord2<<<grid(w_ighost), BLK>>>(fh, SoA0);
+    kern_symbd_jghost_ord2<<<grid(w_jghost), BLK>>>(fh, SoA1);
+    kern_symbd_kghost_ord2<<<grid(w_kghost), BLK>>>(fh, SoA2);
+    kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
+}
 
 /* symmetry_bd on GPU for ord=3, then launch lopsided kernel */
 static void gpu_lopsided(double *d_f, double *d_f_rhs,
                          double *d_Sfx, double *d_Sfy, double *d_Sfz,
                          double SoA0, double SoA1, double SoA2, int all)
-{
-    double *fh = g_buf.d_fh3;
-    kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
-    kern_symbd_ighost_ord3<<<grid(all), BLK>>>(fh, SoA0);
-    kern_symbd_jghost_ord3<<<grid(all), BLK>>>(fh, SoA1);
-    kern_symbd_kghost_ord3<<<grid(all), BLK>>>(fh, SoA2);
-    kern_lopsided<<<grid(all), BLK>>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz);
-}
+{
+    double *fh = g_buf.d_fh3;
+    const size_t nx = (size_t)g_buf.prev_nx;
+    const size_t ny = (size_t)g_buf.prev_ny;
+    const size_t nz = (size_t)g_buf.prev_nz;
+    const size_t w_ighost = 3ull * ny * nz;
+    const size_t w_jghost = 3ull * (nx + 3ull) * nz;
+    const size_t w_kghost = 3ull * (nx + 3ull) * (ny + 3ull);
+
+    kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
+    kern_symbd_ighost_ord3<<<grid(w_ighost), BLK>>>(fh, SoA0);
+    kern_symbd_jghost_ord3<<<grid(w_jghost), BLK>>>(fh, SoA1);
+    kern_symbd_kghost_ord3<<<grid(w_kghost), BLK>>>(fh, SoA2);
+    kern_lopsided<<<grid(all), BLK>>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz);
+}
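Review note: under the convention used here (i-faces span interior y/z, j-faces span padded x, k-faces span padded x and y), the three ghost slabs tile the padded box exactly, with no overlap and no gaps. A quick host-only check of that identity for ord=2, with arbitrary illustrative sizes:

```cuda
// interior + w_i + w_j + w_k == (nx+2)*(ny+2)*(nz+2), exactly.
#include <cassert>
#include <cstdio>

int main() {
    const long long nx = 7, ny = 5, nz = 3;        // arbitrary sizes
    long long interior = nx * ny * nz;
    long long w_i = 2 * ny * nz;                   // 2 ghost layers in x
    long long w_j = 2 * (nx + 2) * nz;             // x already padded
    long long w_k = 2 * (nx + 2) * (ny + 2);       // x and y already padded
    long long padded = (nx + 2) * (ny + 2) * (nz + 2);
    assert(interior + w_i + w_j + w_k == padded);  // exact tiling
    printf("tiling OK: %lld + %lld + %lld + %lld == %lld\n",
           interior, w_i, w_j, w_k, padded);
    return 0;
}
```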
 
 /* symmetry_bd on GPU for ord=3, then launch kodis kernel */
 static void gpu_kodis(double *d_f, double *d_f_rhs,
                       double SoA0, double SoA1, double SoA2,
                       double eps_val, int all)
-{
-    double *fh = g_buf.d_fh3;
-    kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
-    kern_symbd_ighost_ord3<<<grid(all), BLK>>>(fh, SoA0);
-    kern_symbd_jghost_ord3<<<grid(all), BLK>>>(fh, SoA1);
-    kern_symbd_kghost_ord3<<<grid(all), BLK>>>(fh, SoA2);
-    kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
-}
+{
+    double *fh = g_buf.d_fh3;
+    const size_t nx = (size_t)g_buf.prev_nx;
+    const size_t ny = (size_t)g_buf.prev_ny;
+    const size_t nz = (size_t)g_buf.prev_nz;
+    const size_t w_ighost = 3ull * ny * nz;
+    const size_t w_jghost = 3ull * (nx + 3ull) * nz;
+    const size_t w_kghost = 3ull * (nx + 3ull) * (ny + 3ull);
+
+    kern_symbd_copy_interior_ord3<<<grid(all), BLK>>>(d_f, fh);
+    kern_symbd_ighost_ord3<<<grid(w_ighost), BLK>>>(fh, SoA0);
+    kern_symbd_jghost_ord3<<<grid(w_jghost), BLK>>>(fh, SoA1);
+    kern_symbd_kghost_ord3<<<grid(w_kghost), BLK>>>(fh, SoA2);
+    kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
+}
 
 /* ================================================================== */
 /* C. Point-wise computation kernels                                  */
@@ -1984,10 +2068,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
                        double *ham_Res, double *movx_Res, double *movy_Res,
                        double *movz_Res, double *Gmx_Res, double *Gmy_Res,
                        double *Gmz_Res, int &Symmetry, int &Lev, double &eps, int &co)
-{
-    /* --- Multi-GPU: select device --- */
-    init_gpu_dispatch();
-    cudaSetDevice(g_dispatch.my_device);
+{
+    /* --- Multi-GPU: select device --- */
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
 
     /* --- Profiling: cudaEvent timers (rank 0 only, first 20 calls) --- */
     static int prof_call_count = 0;
@@ -2033,44 +2117,27 @@
     CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
 
     /* --- Shorthand for device slot pointers --- */
-    #define D(s) g_buf.slot[s]
-    const size_t bytes = (size_t)all * sizeof(double);
-
-    /* --- H2D: copy all input arrays --- */
-    CUDA_CHECK(cudaMemcpy(D(S_chi), chi, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_trK), trK, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_dxx), dxx, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_gxy), gxy, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_gxz), gxz, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_dyy), dyy, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_gyz), gyz, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_dzz), dzz, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Axx), Axx, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Axy), Axy, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Axz), Axz, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Ayy), Ayy, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Ayz), Ayz, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Azz), Azz, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Gamx), Gamx, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Gamy), Gamy, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Gamz), Gamz, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Lap), Lap, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_betax), betax, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_betay), betay, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_betaz), betaz, bytes, cudaMemcpyHostToDevice));
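Review note: the profiling block referenced above uses cudaEvent timers; the pattern is worth seeing in isolation. A minimal sketch timing an H2D transfer, with illustrative event and variable names rather than the patch's `ev_h2d`/`ev_kern`:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    const size_t bytes = 64u << 20;          // 64 MiB (illustrative)
    double *h = nullptr, *d = nullptr;
    cudaMallocHost((void**)&h, bytes);       // pinned source
    cudaMalloc((void**)&d, bytes);

    cudaEvent_t t0, t1;
    cudaEventCreate(&t0);
    cudaEventCreate(&t1);
    cudaEventRecord(t0);                     // enqueue start marker
    cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(t1);                     // enqueue stop marker
    cudaEventSynchronize(t1);                // wait for the stop marker

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, t0, t1);       // elapsed GPU time in ms
    printf("H2D: %.3f ms (%.2f GB/s)\n", ms, bytes / ms / 1.0e6);

    cudaEventDestroy(t0); cudaEventDestroy(t1);
    cudaFree(d); cudaFreeHost(h);
    return 0;
}
```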
-    CUDA_CHECK(cudaMemcpy(D(S_dtSfx), dtSfx, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_dtSfy), dtSfy, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_dtSfz), dtSfz, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_rho), rho, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Sx), Sx, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Sy), Sy, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Sz), Sz, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Sxx), Sxx, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Sxy), Sxy_m, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Sxz), Sxz, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Syy), Syy, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Syz), Syz_m, bytes, cudaMemcpyHostToDevice));
-    CUDA_CHECK(cudaMemcpy(D(S_Szz), Szz, bytes, cudaMemcpyHostToDevice));
+    #define D(s) g_buf.slot[s]
+    const size_t bytes = (size_t)all * sizeof(double);
+
+    /* --- H2D: stage all inputs, then one bulk copy --- */
+    double *h2d_src[] = {
+        chi, trK, dxx, gxy, gxz, dyy, gyz, dzz,
+        Axx, Axy, Axz, Ayy, Ayz, Azz,
+        Gamx, Gamy, Gamz,
+        Lap, betax, betay, betaz,
+        dtSfx, dtSfy, dtSfz,
+        rho, Sx, Sy, Sz,
+        Sxx, Sxy_m, Sxz, Syy, Syz_m, Szz
+    };
+    static_assert((int)(sizeof(h2d_src) / sizeof(h2d_src[0])) == H2D_INPUT_SLOT_COUNT,
+                  "h2d_src list must match H2D_INPUT_SLOT_COUNT");
+    for (int s = 0; s < H2D_INPUT_SLOT_COUNT; ++s) {
+        std::memcpy(g_buf.h_stage + (size_t)s * all, h2d_src[s], bytes);
+    }
+    /* S_chi is the first input slot and the inputs are contiguous in d_mem,
+       so a single transfer covers all of them. */
+    CUDA_CHECK(cudaMemcpy(D(S_chi), g_buf.h_stage,
+                          (size_t)H2D_INPUT_SLOT_COUNT * bytes,
+                          cudaMemcpyHostToDevice));
 
     if (do_prof) cudaEventRecord(ev_h2d);
 
@@ -2458,66 +2525,45 @@ int f_compute_rhs_bssn(int *ex, double &T,
 
     if (do_prof) cudaEventRecord(ev_kern);
 
-    /* ============================================================ */
-    /* D2H: copy all output arrays back to host                     */
-    /* ============================================================ */
-    CUDA_CHECK(cudaMemcpy(chi_rhs, D(S_chi_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(trK_rhs, D(S_trK_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(gxx_rhs, D(S_gxx_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(gxy_rhs, D(S_gxy_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(gxz_rhs, D(S_gxz_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(gyy_rhs, D(S_gyy_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(gyz_rhs, D(S_gyz_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(gzz_rhs, D(S_gzz_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Axx_rhs, D(S_Axx_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Axy_rhs, D(S_Axy_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Axz_rhs, D(S_Axz_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Ayy_rhs, D(S_Ayy_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Ayz_rhs, D(S_Ayz_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Azz_rhs, D(S_Azz_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamx_rhs, D(S_Gamx_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamy_rhs, D(S_Gamy_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamz_rhs, D(S_Gamz_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Lap_rhs, D(S_Lap_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(betax_rhs, D(S_betax_rhs), bytes, cudaMemcpyDeviceToHost));
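Review note: the staging above trades one cheap host-side memcpy per array for a single DMA transfer instead of 34. A cut-down sketch of the same gather-then-copy idea, with illustrative sizes and three arrays standing in for the patch's input list:

```cuda
#include <cstring>
#include <cuda_runtime.h>

int main() {
    const size_t all = 1 << 16;                 // points per array
    const int nslots = 3;
    double *a = new double[all], *b = new double[all], *c = new double[all];
    double *src[] = { a, b, c };

    double *stage = nullptr, *d_mem = nullptr;
    cudaMallocHost((void**)&stage, nslots * all * sizeof(double));
    cudaMalloc((void**)&d_mem, nslots * all * sizeof(double));

    for (int s = 0; s < nslots; ++s)            // host-side gather (cheap)
        std::memcpy(stage + (size_t)s * all, src[s], all * sizeof(double));
    cudaMemcpy(d_mem, stage, nslots * all * sizeof(double),
               cudaMemcpyHostToDevice);         // one transfer, not three

    cudaFree(d_mem); cudaFreeHost(stage);
    delete[] a; delete[] b; delete[] c;
    return 0;
}
```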
-    CUDA_CHECK(cudaMemcpy(betay_rhs, D(S_betay_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(betaz_rhs, D(S_betaz_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(dtSfx_rhs, D(S_dtSfx_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(dtSfy_rhs, D(S_dtSfy_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(dtSfz_rhs, D(S_dtSfz_rhs), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamxxx, D(S_Gamxxx), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamxxy, D(S_Gamxxy), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamxxz, D(S_Gamxxz), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamxyy, D(S_Gamxyy), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamxyz, D(S_Gamxyz), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamxzz, D(S_Gamxzz), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamyxx, D(S_Gamyxx), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamyxy, D(S_Gamyxy), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamyxz, D(S_Gamyxz), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamyyy, D(S_Gamyyy), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamyyz, D(S_Gamyyz), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamyzz, D(S_Gamyzz), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamzxx, D(S_Gamzxx), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamzxy, D(S_Gamzxy), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamzxz, D(S_Gamzxz), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamzyy, D(S_Gamzyy), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamzyz, D(S_Gamzyz), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Gamzzz, D(S_Gamzzz), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Rxx, D(S_Rxx), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Rxy, D(S_Rxy), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Rxz, D(S_Rxz), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Ryy, D(S_Ryy), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Ryz, D(S_Ryz), bytes, cudaMemcpyDeviceToHost));
-    CUDA_CHECK(cudaMemcpy(Rzz, D(S_Rzz), bytes, cudaMemcpyDeviceToHost));
-    if (co == 0) {
-        CUDA_CHECK(cudaMemcpy(ham_Res, D(S_ham_Res), bytes, cudaMemcpyDeviceToHost));
-        CUDA_CHECK(cudaMemcpy(movx_Res, D(S_movx_Res), bytes, cudaMemcpyDeviceToHost));
-        CUDA_CHECK(cudaMemcpy(movy_Res, D(S_movy_Res), bytes, cudaMemcpyDeviceToHost));
-        CUDA_CHECK(cudaMemcpy(movz_Res, D(S_movz_Res), bytes, cudaMemcpyDeviceToHost));
-        CUDA_CHECK(cudaMemcpy(Gmx_Res, D(S_Gmx_Res), bytes, cudaMemcpyDeviceToHost));
-        CUDA_CHECK(cudaMemcpy(Gmy_Res, D(S_Gmy_Res), bytes, cudaMemcpyDeviceToHost));
-        CUDA_CHECK(cudaMemcpy(Gmz_Res, D(S_Gmz_Res), bytes, cudaMemcpyDeviceToHost));
-    }
+    /* ============================================================ */
+    /* D2H: copy all output arrays back to host                     */
+    /* ============================================================ */
+    /* The constraint slots sit directly behind the base outputs in d_mem,
+       so enabling them just lengthens the single transfer. */
+    const int d2h_slot_count = D2H_BASE_SLOT_COUNT
+                             + ((co == 0) ? D2H_CONSTRAINT_SLOT_COUNT : 0);
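Review note: the D2H path that follows mirrors the H2D staging in reverse, and the conditional constraint outputs only extend the one transfer. A cut-down sketch with hypothetical slot counts:

```cuda
#include <cstring>
#include <cuda_runtime.h>

int main() {
    const size_t all = 1 << 16;
    const int base = 2, extra = 1;              // "always" and "conditional" slots
    bool want_extra = true;

    double *d_out = nullptr, *stage = nullptr;
    cudaMalloc((void**)&d_out, (base + extra) * all * sizeof(double));
    cudaMemset(d_out, 0, (base + extra) * all * sizeof(double));
    cudaMallocHost((void**)&stage, (base + extra) * all * sizeof(double));

    int nslots = base + (want_extra ? extra : 0);
    cudaMemcpy(stage, d_out, (size_t)nslots * all * sizeof(double),
               cudaMemcpyDeviceToHost);         // one transfer

    double *x = new double[all], *y = new double[all], *z = new double[all];
    double *dst[] = { x, y, z };
    for (int s = 0; s < nslots; ++s)            // host-side scatter (cheap)
        std::memcpy(dst[s], stage + (size_t)s * all, all * sizeof(double));

    delete[] x; delete[] y; delete[] z;
    cudaFreeHost(stage); cudaFree(d_out);
    return 0;
}
```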
+    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, D(S_chi_rhs),
+                          (size_t)d2h_slot_count * bytes,
+                          cudaMemcpyDeviceToHost));
+
+    double *d2h_dst[] = {
+        chi_rhs, trK_rhs,
+        gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
+        Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
+        Gamx_rhs, Gamy_rhs, Gamz_rhs,
+        Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
+        dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
+        Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
+        Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
+        Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
+        Rxx, Rxy, Rxz, Ryy, Ryz, Rzz
+    };
+    static_assert((int)(sizeof(d2h_dst) / sizeof(d2h_dst[0])) == D2H_BASE_SLOT_COUNT,
+                  "d2h_dst list must match D2H_BASE_SLOT_COUNT");
+    for (int s = 0; s < D2H_BASE_SLOT_COUNT; ++s) {
+        std::memcpy(d2h_dst[s], g_buf.h_stage + (size_t)s * all, bytes);
+    }
+    if (co == 0) {
+        double *d2h_dst_co[] = {
+            ham_Res, movx_Res, movy_Res, movz_Res, Gmx_Res, Gmy_Res, Gmz_Res
+        };
+        static_assert((int)(sizeof(d2h_dst_co) / sizeof(d2h_dst_co[0])) ==
+                          D2H_CONSTRAINT_SLOT_COUNT,
+                      "d2h_dst_co list must match D2H_CONSTRAINT_SLOT_COUNT");
+        for (int s = 0; s < D2H_CONSTRAINT_SLOT_COUNT; ++s) {
+            std::memcpy(d2h_dst_co[s],
+                        g_buf.h_stage + (size_t)(D2H_BASE_SLOT_COUNT + s) * all,
+                        bytes);
+        }
+    }
 
     if (do_prof) {
         cudaEventRecord(ev_d2h);
@@ -2537,4 +2583,4 @@ int f_compute_rhs_bssn(int *ex, double &T,
 #undef D
 
     return 0;
-}
\ No newline at end of file
+}