From 5839755c2f71a433d6852051b7db5b6360635151 Mon Sep 17 00:00:00 2001 From: ianchb Date: Mon, 2 Mar 2026 12:12:58 +0800 Subject: [PATCH] compute div_beta on-the-fly to remove temp array --- AMSS_NCKU_source/bssn_rhs_cuda.cu | 855 +++++++++++++++--------------- 1 file changed, 424 insertions(+), 431 deletions(-) diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 1488073..640b1c1 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -5,61 +5,61 @@ * Compile with nvcc, link bssn_rhs_cuda.o in place of bssn_rhs_c.o. */ -#include -#include -#include -#include -#include -#include "macrodef.h" -#include "bssn_rhs.h" +#include +#include +#include +#include +#include +#include "macrodef.h" +#include "bssn_rhs.h" /* ------------------------------------------------------------------ */ /* Multi-GPU dispatch: distribute ranks across available GPUs */ /* ------------------------------------------------------------------ */ -static struct { - int num_gpus; - int my_rank; - int my_local_rank; - int my_device; - bool inited; -} g_dispatch = {0, -1, -1, -1, false}; - -static int env_to_int(const char *name, int fallback = -1) { - const char *v = getenv(name); - if (!v || !*v) return fallback; - return atoi(v); -} - -static void init_gpu_dispatch() { - if (g_dispatch.inited) return; - cudaError_t err = cudaGetDeviceCount(&g_dispatch.num_gpus); - if (err != cudaSuccess) g_dispatch.num_gpus = 1; - if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1; - - /* Get MPI rank from environment (set by mpirun/mpiexec). */ - g_dispatch.my_rank = env_to_int("PMI_RANK", - env_to_int("OMPI_COMM_WORLD_RANK", - env_to_int("MV2_COMM_WORLD_RANK", - env_to_int("SLURM_PROCID", 0)))); - - /* Prefer local rank for per-node GPU mapping (avoids cross-node skew). */ - g_dispatch.my_local_rank = env_to_int("OMPI_COMM_WORLD_LOCAL_RANK", - env_to_int("MV2_COMM_WORLD_LOCAL_RANK", - env_to_int("MPI_LOCALRANKID", - env_to_int("SLURM_LOCALID", -1)))); - - const int rank_for_map = (g_dispatch.my_local_rank >= 0) - ? g_dispatch.my_local_rank : g_dispatch.my_rank; - g_dispatch.my_device = rank_for_map % g_dispatch.num_gpus; - cudaSetDevice(g_dispatch.my_device); - - if (g_dispatch.my_rank == 0) { - printf("[AMSS-GPU] %d GPU(s) detected, device map uses %s rank\n", - g_dispatch.num_gpus, - (g_dispatch.my_local_rank >= 0) ? "local" : "global"); - } - g_dispatch.inited = true; -} +static struct { + int num_gpus; + int my_rank; + int my_local_rank; + int my_device; + bool inited; +} g_dispatch = {0, -1, -1, -1, false}; + +static int env_to_int(const char *name, int fallback = -1) { + const char *v = getenv(name); + if (!v || !*v) return fallback; + return atoi(v); +} + +static void init_gpu_dispatch() { + if (g_dispatch.inited) return; + cudaError_t err = cudaGetDeviceCount(&g_dispatch.num_gpus); + if (err != cudaSuccess) g_dispatch.num_gpus = 1; + if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1; + + /* Get MPI rank from environment (set by mpirun/mpiexec). */ + g_dispatch.my_rank = env_to_int("PMI_RANK", + env_to_int("OMPI_COMM_WORLD_RANK", + env_to_int("MV2_COMM_WORLD_RANK", + env_to_int("SLURM_PROCID", 0)))); + + /* Prefer local rank for per-node GPU mapping (avoids cross-node skew). */ + g_dispatch.my_local_rank = env_to_int("OMPI_COMM_WORLD_LOCAL_RANK", + env_to_int("MV2_COMM_WORLD_LOCAL_RANK", + env_to_int("MPI_LOCALRANKID", + env_to_int("SLURM_LOCALID", -1)))); + + const int rank_for_map = (g_dispatch.my_local_rank >= 0) + ? g_dispatch.my_local_rank : g_dispatch.my_rank; + g_dispatch.my_device = rank_for_map % g_dispatch.num_gpus; + cudaSetDevice(g_dispatch.my_device); + + if (g_dispatch.my_rank == 0) { + printf("[AMSS-GPU] %d GPU(s) detected, device map uses %s rank\n", + g_dispatch.num_gpus, + (g_dispatch.my_local_rank >= 0) ? "local" : "global"); + } + g_dispatch.inited = true; +} /* ------------------------------------------------------------------ */ /* Error checking */ @@ -141,24 +141,24 @@ __device__ __forceinline__ int idx_fh3(int iF, int jF, int kF) { /* Total number of "all"-sized slots */ #define NUM_SLOTS 160 -struct GpuBuffers { - double *d_mem; /* single big allocation */ - double *d_fh2; /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */ - double *d_fh3; /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */ - double *h_stage; /* host staging buffer for bulk H2D/D2H */ - bool h_stage_pinned; /* true if allocated by cudaMallocHost */ - double *slot[NUM_SLOTS]; /* pointers into d_mem */ - size_t cap_all; - size_t cap_fh2_size; - size_t cap_fh3_size; - int prev_nx, prev_ny, prev_nz; - bool initialized; -}; - -static GpuBuffers g_buf = { - nullptr, nullptr, nullptr, nullptr, false, {}, - 0, 0, 0, 0, 0, 0, false -}; +struct GpuBuffers { + double *d_mem; /* single big allocation */ + double *d_fh2; /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */ + double *d_fh3; /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */ + double *h_stage; /* host staging buffer for bulk H2D/D2H */ + bool h_stage_pinned; /* true if allocated by cudaMallocHost */ + double *slot[NUM_SLOTS]; /* pointers into d_mem */ + size_t cap_all; + size_t cap_fh2_size; + size_t cap_fh3_size; + int prev_nx, prev_ny, prev_nz; + bool initialized; +}; + +static GpuBuffers g_buf = { + nullptr, nullptr, nullptr, nullptr, false, {}, + 0, 0, 0, 0, 0, 0, false +}; /* Slot assignments — INPUT (H2D) */ enum { @@ -200,8 +200,8 @@ enum { S_Gamxx, S_Gamxy, S_Gamxz, S_Gamyx, S_Gamyy_t, S_Gamyz_t, S_Gamzx, S_Gamzy, S_Gamzz_t, - S_Kx, S_Ky, S_Kz, - S_div_beta, S_S_arr, S_f_arr, + S_Kx, S_Ky, S_Kz, + S_S_arr, S_f_arr, S_fxx, S_fxy, S_fxz, S_fyy, S_fyz, S_fzz, S_Gamxa, S_Gamya, S_Gamza, S_gupxx, S_gupxy, S_gupxz, @@ -209,75 +209,75 @@ enum { NUM_USED_SLOTS }; -static_assert(NUM_USED_SLOTS <= NUM_SLOTS, "Increase NUM_SLOTS"); - -static const int H2D_INPUT_SLOT_COUNT = (S_Szz - S_chi + 1); -static const int D2H_BASE_SLOT_COUNT = (S_Rzz - S_chi_rhs + 1); -static const int D2H_CONSTRAINT_SLOT_COUNT = (S_Gmz_Res - S_ham_Res + 1); -static const int STAGE_SLOT_COUNT = - (H2D_INPUT_SLOT_COUNT > (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT)) - ? H2D_INPUT_SLOT_COUNT - : (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT); - -static void ensure_gpu_buffers(int nx, int ny, int nz) { - size_t all = (size_t)nx * ny * nz; - size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2); - size_t fh3_size = (size_t)(nx+3) * (ny+3) * (nz+3); - const bool need_grow = (!g_buf.initialized) - || (all > g_buf.cap_all) - || (fh2_size > g_buf.cap_fh2_size) - || (fh3_size > g_buf.cap_fh3_size); - - if (need_grow) { - if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; } - if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; } - if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; } - if (g_buf.h_stage) { - if (g_buf.h_stage_pinned) cudaFreeHost(g_buf.h_stage); - else free(g_buf.h_stage); - g_buf.h_stage = nullptr; - g_buf.h_stage_pinned = false; - } - - CUDA_CHECK(cudaMalloc(&g_buf.d_mem, NUM_USED_SLOTS * all * sizeof(double))); - CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double))); - CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double))); - - const size_t stage_bytes = (size_t)STAGE_SLOT_COUNT * all * sizeof(double); - cudaError_t stage_err = cudaMallocHost((void**)&g_buf.h_stage, stage_bytes); - if (stage_err == cudaSuccess) { - g_buf.h_stage_pinned = true; - } else { - g_buf.h_stage = (double *)malloc(stage_bytes); - g_buf.h_stage_pinned = false; - if (!g_buf.h_stage) { - fprintf(stderr, "Host stage allocation failed (%zu bytes)\n", stage_bytes); - exit(EXIT_FAILURE); - } - } - - g_buf.cap_all = all; - g_buf.cap_fh2_size = fh2_size; - g_buf.cap_fh3_size = fh3_size; - g_buf.initialized = true; - } - - for (int s = 0; s < NUM_USED_SLOTS; ++s) - g_buf.slot[s] = g_buf.d_mem + s * all; - - g_buf.prev_nx = nx; - g_buf.prev_ny = ny; - g_buf.prev_nz = nz; -} +static_assert(NUM_USED_SLOTS <= NUM_SLOTS, "Increase NUM_SLOTS"); + +static const int H2D_INPUT_SLOT_COUNT = (S_Szz - S_chi + 1); +static const int D2H_BASE_SLOT_COUNT = (S_Rzz - S_chi_rhs + 1); +static const int D2H_CONSTRAINT_SLOT_COUNT = (S_Gmz_Res - S_ham_Res + 1); +static const int STAGE_SLOT_COUNT = + (H2D_INPUT_SLOT_COUNT > (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT)) + ? H2D_INPUT_SLOT_COUNT + : (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT); + +static void ensure_gpu_buffers(int nx, int ny, int nz) { + size_t all = (size_t)nx * ny * nz; + size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2); + size_t fh3_size = (size_t)(nx+3) * (ny+3) * (nz+3); + const bool need_grow = (!g_buf.initialized) + || (all > g_buf.cap_all) + || (fh2_size > g_buf.cap_fh2_size) + || (fh3_size > g_buf.cap_fh3_size); + + if (need_grow) { + if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; } + if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; } + if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; } + if (g_buf.h_stage) { + if (g_buf.h_stage_pinned) cudaFreeHost(g_buf.h_stage); + else free(g_buf.h_stage); + g_buf.h_stage = nullptr; + g_buf.h_stage_pinned = false; + } + + CUDA_CHECK(cudaMalloc(&g_buf.d_mem, NUM_USED_SLOTS * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double))); + CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double))); + + const size_t stage_bytes = (size_t)STAGE_SLOT_COUNT * all * sizeof(double); + cudaError_t stage_err = cudaMallocHost((void**)&g_buf.h_stage, stage_bytes); + if (stage_err == cudaSuccess) { + g_buf.h_stage_pinned = true; + } else { + g_buf.h_stage = (double *)malloc(stage_bytes); + g_buf.h_stage_pinned = false; + if (!g_buf.h_stage) { + fprintf(stderr, "Host stage allocation failed (%zu bytes)\n", stage_bytes); + exit(EXIT_FAILURE); + } + } + + g_buf.cap_all = all; + g_buf.cap_fh2_size = fh2_size; + g_buf.cap_fh3_size = fh3_size; + g_buf.initialized = true; + } + + for (int s = 0; s < NUM_USED_SLOTS; ++s) + g_buf.slot[s] = g_buf.d_mem + s * all; + + g_buf.prev_nx = nx; + g_buf.prev_ny = ny; + g_buf.prev_nz = nz; +} /* ================================================================== */ /* A. Symmetry boundary kernels (ord=2 and ord=3) */ /* ================================================================== */ -/* Step 1: Copy interior into ghost-padded array */ -__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func, - double * __restrict__ fh, - double SoA0, double SoA1, double SoA2) +/* Step 1: Copy interior into ghost-padded array */ +__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func, + double * __restrict__ fh, + double SoA0, double SoA1, double SoA2) { const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny; @@ -290,47 +290,47 @@ __global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func, int k0 = tid / (nx * ny); int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; fh[(iF+1) + (jF+1)*fnx + (kF+1)*fnx*fny] = func[tid]; - } -} - -/* Fused symmetry pack (ord=2): fill full fh from interior func in one pass. */ -__global__ void kern_symbd_pack_ord2(const double * __restrict__ func, - double * __restrict__ fh, - double SoA0, double SoA1, double SoA2) -{ - const int nx = d_gp.ex[0], ny = d_gp.ex[1]; - const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny, fnz = d_gp.fh2_nz; - const int total = fnx * fny * fnz; - - for (int tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < total; - tid += blockDim.x * gridDim.x) - { - int ii = tid % fnx; - int jj = (tid / fnx) % fny; - int kk = tid / (fnx * fny); - - int iF = ii - 1; /* -1 .. nx */ - int jF = jj - 1; /* -1 .. ny */ - int kF = kk - 1; /* -1 .. nz */ - - int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */ - int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */ - int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */ - - double sign = 1.0; - if (iF <= 0) sign *= SoA0; - if (jF <= 0) sign *= SoA1; - if (kF <= 0) sign *= SoA2; - - int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny; - fh[tid] = sign * func[src]; - } -} - -/* Step 2: Fill i-ghosts (x-direction symmetry) */ -__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0) -{ + } +} + +/* Fused symmetry pack (ord=2): fill full fh from interior func in one pass. */ +__global__ void kern_symbd_pack_ord2(const double * __restrict__ func, + double * __restrict__ fh, + double SoA0, double SoA1, double SoA2) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1]; + const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny, fnz = d_gp.fh2_nz; + const int total = fnx * fny * fnz; + + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < total; + tid += blockDim.x * gridDim.x) + { + int ii = tid % fnx; + int jj = (tid / fnx) % fny; + int kk = tid / (fnx * fny); + + int iF = ii - 1; /* -1 .. nx */ + int jF = jj - 1; /* -1 .. ny */ + int kF = kk - 1; /* -1 .. nz */ + + int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */ + int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */ + int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */ + + double sign = 1.0; + if (iF <= 0) sign *= SoA0; + if (jF <= 0) sign *= SoA1; + if (kF <= 0) sign *= SoA2; + + int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny; + fh[tid] = sign * func[src]; + } +} + +/* Step 2: Fill i-ghosts (x-direction symmetry) */ +__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0) +{ const int ny = d_gp.ex[1], nz = d_gp.ex[2]; const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny; /* ord=2: fill iF=0 and iF=-1, i.e. ghost layers ii=0 from ii=2, ii=1 from ii=1 */ @@ -404,9 +404,9 @@ __global__ void kern_symbd_kghost_ord2(double * __restrict__ fh, double SoA2) /* ---- ord=3 variants (for lopsided / kodis) ---- */ -__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func, - double * __restrict__ fh) -{ +__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func, + double * __restrict__ fh) +{ const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny; for (int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -418,46 +418,46 @@ __global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func, int k0 = tid / (nx * ny); int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; fh[(iF+2) + (jF+2)*fnx + (kF+2)*fnx*fny] = func[tid]; - } -} - -/* Fused symmetry pack (ord=3): fill full fh from interior func in one pass. */ -__global__ void kern_symbd_pack_ord3(const double * __restrict__ func, - double * __restrict__ fh, - double SoA0, double SoA1, double SoA2) -{ - const int nx = d_gp.ex[0], ny = d_gp.ex[1]; - const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny, fnz = d_gp.fh3_nz; - const int total = fnx * fny * fnz; - - for (int tid = blockIdx.x * blockDim.x + threadIdx.x; - tid < total; - tid += blockDim.x * gridDim.x) - { - int ii = tid % fnx; - int jj = (tid / fnx) % fny; - int kk = tid / (fnx * fny); - - int iF = ii - 2; /* -2 .. nx */ - int jF = jj - 2; /* -2 .. ny */ - int kF = kk - 2; /* -2 .. nz */ - - int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */ - int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */ - int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */ - - double sign = 1.0; - if (iF <= 0) sign *= SoA0; - if (jF <= 0) sign *= SoA1; - if (kF <= 0) sign *= SoA2; - - int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny; - fh[tid] = sign * func[src]; - } -} - -__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0) -{ + } +} + +/* Fused symmetry pack (ord=3): fill full fh from interior func in one pass. */ +__global__ void kern_symbd_pack_ord3(const double * __restrict__ func, + double * __restrict__ fh, + double SoA0, double SoA1, double SoA2) +{ + const int nx = d_gp.ex[0], ny = d_gp.ex[1]; + const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny, fnz = d_gp.fh3_nz; + const int total = fnx * fny * fnz; + + for (int tid = blockIdx.x * blockDim.x + threadIdx.x; + tid < total; + tid += blockDim.x * gridDim.x) + { + int ii = tid % fnx; + int jj = (tid / fnx) % fny; + int kk = tid / (fnx * fny); + + int iF = ii - 2; /* -2 .. nx */ + int jF = jj - 2; /* -2 .. ny */ + int kF = kk - 2; /* -2 .. nz */ + + int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */ + int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */ + int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */ + + double sign = 1.0; + if (iF <= 0) sign *= SoA0; + if (jF <= 0) sign *= SoA1; + if (kF <= 0) sign *= SoA2; + + int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny; + fh[tid] = sign * func[src]; + } +} + +__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0) +{ const int ny = d_gp.ex[1], nz = d_gp.ex[2]; const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny; int total = ny * nz; @@ -874,69 +874,69 @@ void kern_kodis(const double * __restrict__ fh, /* ================================================================== */ /* Host wrapper helpers */ /* ================================================================== */ -static const int BLK = 128; -static inline int grid(size_t n) { - if (n == 0) return 1; - size_t g = (n + BLK - 1) / BLK; - if (g > 2147483647u) g = 2147483647u; - return (int)g; -} +static const int BLK = 128; +static inline int grid(size_t n) { + if (n == 0) return 1; + size_t g = (n + BLK - 1) / BLK; + if (g > 2147483647u) g = 2147483647u; + return (int)g; +} /* symmetry_bd on GPU for ord=2, then launch fderivs kernel */ -static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz, - double SoA0, double SoA1, double SoA2, int all) -{ - double *fh = g_buf.d_fh2; - const size_t nx = (size_t)g_buf.prev_nx; - const size_t ny = (size_t)g_buf.prev_ny; - const size_t nz = (size_t)g_buf.prev_nz; - const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull); - - kern_symbd_pack_ord2<<>>(d_f, fh, SoA0, SoA1, SoA2); - kern_fderivs<<>>(fh, d_fx, d_fy, d_fz); -} +static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz, + double SoA0, double SoA1, double SoA2, int all) +{ + double *fh = g_buf.d_fh2; + const size_t nx = (size_t)g_buf.prev_nx; + const size_t ny = (size_t)g_buf.prev_ny; + const size_t nz = (size_t)g_buf.prev_nz; + const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull); + + kern_symbd_pack_ord2<<>>(d_f, fh, SoA0, SoA1, SoA2); + kern_fderivs<<>>(fh, d_fx, d_fy, d_fz); +} /* symmetry_bd on GPU for ord=2, then launch fdderivs kernel */ static void gpu_fdderivs(double *d_f, double *d_fxx, double *d_fxy, double *d_fxz, double *d_fyy, double *d_fyz, double *d_fzz, double SoA0, double SoA1, double SoA2, int all) -{ - double *fh = g_buf.d_fh2; - const size_t nx = (size_t)g_buf.prev_nx; - const size_t ny = (size_t)g_buf.prev_ny; - const size_t nz = (size_t)g_buf.prev_nz; - const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull); +{ + double *fh = g_buf.d_fh2; + const size_t nx = (size_t)g_buf.prev_nx; + const size_t ny = (size_t)g_buf.prev_ny; + const size_t nz = (size_t)g_buf.prev_nz; + const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull); + + kern_symbd_pack_ord2<<>>(d_f, fh, SoA0, SoA1, SoA2); + kern_fdderivs<<>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz); +} - kern_symbd_pack_ord2<<>>(d_f, fh, SoA0, SoA1, SoA2); - kern_fdderivs<<>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz); -} - -/* Combined ord=3 advection + KO dissipation. - * When advection and KO use the same source field, symmetry packing is shared. - * If they differ (e.g. gxx advection + dxx KO), only KO repacks. - */ -static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs, - double *d_Sfx, double *d_Sfy, double *d_Sfz, - double SoA0, double SoA1, double SoA2, - double eps_val, int all) -{ - double *fh = g_buf.d_fh3; - const size_t nx = (size_t)g_buf.prev_nx; - const size_t ny = (size_t)g_buf.prev_ny; - const size_t nz = (size_t)g_buf.prev_nz; - const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull); - - kern_symbd_pack_ord3<<>>(d_f_adv, fh, SoA0, SoA1, SoA2); - kern_lopsided<<>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz); - - if (eps_val > 0.0) { - if (d_f_ko != d_f_adv) { - kern_symbd_pack_ord3<<>>(d_f_ko, fh, SoA0, SoA1, SoA2); - } - kern_kodis<<>>(fh, d_f_rhs, eps_val); - } -} +/* Combined ord=3 advection + KO dissipation. + * When advection and KO use the same source field, symmetry packing is shared. + * If they differ (e.g. gxx advection + dxx KO), only KO repacks. + */ +static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs, + double *d_Sfx, double *d_Sfy, double *d_Sfz, + double SoA0, double SoA1, double SoA2, + double eps_val, int all) +{ + double *fh = g_buf.d_fh3; + const size_t nx = (size_t)g_buf.prev_nx; + const size_t ny = (size_t)g_buf.prev_ny; + const size_t nz = (size_t)g_buf.prev_nz; + const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull); + + kern_symbd_pack_ord3<<>>(d_f_adv, fh, SoA0, SoA1, SoA2); + kern_lopsided<<>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz); + + if (eps_val > 0.0) { + if (d_f_ko != d_f_adv) { + kern_symbd_pack_ord3<<>>(d_f_ko, fh, SoA0, SoA1, SoA2); + } + kern_kodis<<>>(fh, d_f_rhs, eps_val); + } +} /* ================================================================== */ /* C. Point-wise computation kernels */ @@ -959,8 +959,8 @@ __global__ void kern_phase1_prep( } } -/* Phase 2a: chi_rhs, gij_rhs, div_beta */ -__global__ void kern_phase2_metric_rhs( +/* Phase 2a: chi_rhs, gij_rhs */ +__global__ void kern_phase2_metric_rhs( const double* __restrict__ alpn1, const double* __restrict__ chin1, const double* __restrict__ gxx, const double* __restrict__ gxy, const double* __restrict__ gxz, const double* __restrict__ gyy, @@ -972,19 +972,17 @@ __global__ void kern_phase2_metric_rhs( const double* __restrict__ betaxx, const double* __restrict__ betaxy, const double* __restrict__ betaxz, const double* __restrict__ betayx, const double* __restrict__ betayy, const double* __restrict__ betayz, - const double* __restrict__ betazx, const double* __restrict__ betazy, - const double* __restrict__ betazz, - double* __restrict__ div_beta, - double* __restrict__ chi_rhs, double* __restrict__ gxx_rhs, + const double* __restrict__ betazx, const double* __restrict__ betazy, + const double* __restrict__ betazz, + double* __restrict__ chi_rhs, double* __restrict__ gxx_rhs, double* __restrict__ gyy_rhs, double* __restrict__ gzz_rhs, double* __restrict__ gxy_rhs, double* __restrict__ gyz_rhs, double* __restrict__ gxz_rhs) { - const double F2o3 = 2.0/3.0, F1o3 = 1.0/3.0, TWO = 2.0; - for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { - double db = betaxx[i] + betayy[i] + betazz[i]; - div_beta[i] = db; - chi_rhs[i] = F2o3 * chin1[i] * (alpn1[i] * trK[i] - db); + const double F2o3 = 2.0/3.0, F1o3 = 1.0/3.0, TWO = 2.0; + for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) { + double db = betaxx[i] + betayy[i] + betazz[i]; + chi_rhs[i] = F2o3 * chin1[i] * (alpn1[i] * trK[i] - db); gxx_rhs[i] = -TWO*alpn1[i]*Axx[i] - F2o3*gxx[i]*db + TWO*(gxx[i]*betaxx[i] + gxy[i]*betayx[i] + gxz[i]*betazx[i]); gyy_rhs[i] = -TWO*alpn1[i]*Ayy[i] - F2o3*gyy[i]*db @@ -1240,11 +1238,10 @@ void kern_phase6_gamma_rhs_part1( * Gamxx..Gamzz hold fderivs(Gamma) results. */ __global__ __launch_bounds__(128, 4) -void kern_phase8_gamma_rhs_part2( - const double* __restrict__ gupxx, const double* __restrict__ gupxy, - const double* __restrict__ gupxz, const double* __restrict__ gupyy, - const double* __restrict__ gupyz, const double* __restrict__ gupzz, - const double* __restrict__ div_beta, +void kern_phase8_gamma_rhs_part2( + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, /* fdderivs(betax) -> gxxx,gxyx,gxzx,gyyx,gyzx,gzzx */ const double* __restrict__ bxx_xx, const double* __restrict__ bxx_xy, const double* __restrict__ bxx_xz, const double* __restrict__ bxx_yy, @@ -1298,13 +1295,13 @@ void kern_phase8_gamma_rhs_part2( /* Gamma^a contracted */ double Ga_x = uxx*Gxxx[i]+uyy*Gxyy[i]+uzz*Gxzz[i] +TWO*(uxy*Gxxy[i]+uxz*Gxxz[i]+uyz*Gxyz[i]); - double Ga_y = uxx*Gyxx[i]+uyy*Gyyy[i]+uzz*Gyzz[i] - +TWO*(uxy*Gyxy[i]+uxz*Gyxz[i]+uyz*Gyyz[i]); - double Ga_z = uxx*Gzxx[i]+uyy*Gzyy[i]+uzz*Gzzz[i] - +TWO*(uxy*Gzxy[i]+uxz*Gzxz[i]+uyz*Gzyz[i]); - Gamxa_out[i]=Ga_x; Gamya_out[i]=Ga_y; Gamza_out[i]=Ga_z; - double db = div_beta[i]; - Gamx_rhs[i] += F2o3*Ga_x*db + double Ga_y = uxx*Gyxx[i]+uyy*Gyyy[i]+uzz*Gyzz[i] + +TWO*(uxy*Gyxy[i]+uxz*Gyxz[i]+uyz*Gyyz[i]); + double Ga_z = uxx*Gzxx[i]+uyy*Gzyy[i]+uzz*Gzzz[i] + +TWO*(uxy*Gzxy[i]+uxz*Gzxz[i]+uyz*Gzyz[i]); + Gamxa_out[i]=Ga_x; Gamya_out[i]=Ga_y; Gamza_out[i]=Ga_z; + double db = betaxx[i] + betayy[i] + betazz[i]; + Gamx_rhs[i] += F2o3*Ga_x*db - Ga_x*betaxx[i] - Ga_y*betaxy[i] - Ga_z*betaxz[i] + F1o3*(uxx*fxx_v+uxy*fxy_v+uxz*fxz_v) + uxx*bxx_xx[i]+uyy*bxx_yy[i]+uzz*bxx_zz[i] @@ -1733,13 +1730,12 @@ void kern_phase15_trK_Aij_gauge( const double* __restrict__ gupxz, const double* __restrict__ gupyy, const double* __restrict__ gupyz, const double* __restrict__ gupzz, const double* __restrict__ trK, - const double* __restrict__ Axx, const double* __restrict__ Axy, - const double* __restrict__ Axz, const double* __restrict__ Ayy, - const double* __restrict__ Ayz, const double* __restrict__ Azz, - const double* __restrict__ Lapx, const double* __restrict__ Lapy, - const double* __restrict__ Lapz, - const double* __restrict__ div_beta, - const double* __restrict__ betaxx, const double* __restrict__ betaxy, + const double* __restrict__ Axx, const double* __restrict__ Axy, + const double* __restrict__ Axz, const double* __restrict__ Ayy, + const double* __restrict__ Ayz, const double* __restrict__ Azz, + const double* __restrict__ Lapx, const double* __restrict__ Lapy, + const double* __restrict__ Lapz, + const double* __restrict__ betaxx, const double* __restrict__ betaxy, const double* __restrict__ betaxz, const double* __restrict__ betayx, const double* __restrict__ betayy, const double* __restrict__ betayz, const double* __restrict__ betazx, const double* __restrict__ betazy, @@ -1856,8 +1852,8 @@ void kern_phase15_trK_Aij_gauge( +uxz*(Axy[i]*Azz[i]+Ayz[i]*Axz[i]) +uyz*(Ayy[i]*Azz[i]+Ayz[i]*Ayz[i]))); - double trK_v = trK[i]; - double db = div_beta[i]; + double trK_v = trK[i]; + double db = betaxx[i] + betayy[i] + betazz[i]; /* trK_rhs step 1: store D^iD_i alpha * chin1 */ trK_rhs[i] = c1 * DDA; @@ -2114,12 +2110,12 @@ int f_compute_rhs_bssn(int *ex, double &T, double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, int &Symmetry, int &Lev, double &eps, int &co) -{ - /* --- Multi-GPU: select device --- */ - init_gpu_dispatch(); - CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); +{ + /* --- Multi-GPU: select device --- */ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); - const int nx = ex[0], ny = ex[1], nz = ex[2]; + const int nx = ex[0], ny = ex[1], nz = ex[2]; const int all = nx * ny * nz; const double dX = X[1]-X[0], dY = Y[1]-Y[0], dZ = Z[1]-Z[0]; const int NO_SYMM = 0, EQ_SYMM = 1; @@ -2153,30 +2149,30 @@ int f_compute_rhs_bssn(int *ex, double &T, CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams))); /* --- Shorthand for device slot pointers --- */ - #define D(s) g_buf.slot[s] - const size_t bytes = (size_t)all * sizeof(double); + #define D(s) g_buf.slot[s] + const size_t bytes = (size_t)all * sizeof(double); + + /* --- H2D: stage all inputs, then one bulk copy --- */ + double *h2d_src[] = { + chi, trK, dxx, gxy, gxz, dyy, gyz, dzz, + Axx, Axy, Axz, Ayy, Ayz, Azz, + Gamx, Gamy, Gamz, + Lap, betax, betay, betaz, + dtSfx, dtSfy, dtSfz, + rho, Sx, Sy, Sz, + Sxx, Sxy_m, Sxz, Syy, Syz_m, Szz + }; + static_assert((int)(sizeof(h2d_src) / sizeof(h2d_src[0])) == H2D_INPUT_SLOT_COUNT, + "h2d_src list must match H2D_INPUT_SLOT_COUNT"); + for (int s = 0; s < H2D_INPUT_SLOT_COUNT; ++s) { + std::memcpy(g_buf.h_stage + (size_t)s * all, h2d_src[s], bytes); + } + CUDA_CHECK(cudaMemcpy(D(S_chi), g_buf.h_stage, + (size_t)H2D_INPUT_SLOT_COUNT * bytes, + cudaMemcpyHostToDevice)); - /* --- H2D: stage all inputs, then one bulk copy --- */ - double *h2d_src[] = { - chi, trK, dxx, gxy, gxz, dyy, gyz, dzz, - Axx, Axy, Axz, Ayy, Ayz, Azz, - Gamx, Gamy, Gamz, - Lap, betax, betay, betaz, - dtSfx, dtSfy, dtSfz, - rho, Sx, Sy, Sz, - Sxx, Sxy_m, Sxz, Syy, Syz_m, Szz - }; - static_assert((int)(sizeof(h2d_src) / sizeof(h2d_src[0])) == H2D_INPUT_SLOT_COUNT, - "h2d_src list must match H2D_INPUT_SLOT_COUNT"); - for (int s = 0; s < H2D_INPUT_SLOT_COUNT; ++s) { - std::memcpy(g_buf.h_stage + (size_t)s * all, h2d_src[s], bytes); - } - CUDA_CHECK(cudaMemcpy(D(S_chi), g_buf.h_stage, - (size_t)H2D_INPUT_SLOT_COUNT * bytes, - cudaMemcpyHostToDevice)); - - /* ============================================================ */ - /* Phase 1: prep — alpn1, chin1, gxx, gyy, gzz */ + /* ============================================================ */ + /* Phase 1: prep — alpn1, chin1, gxx, gyy, gzz */ /* ============================================================ */ kern_phase1_prep<<>>( D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz), @@ -2199,17 +2195,16 @@ int f_compute_rhs_bssn(int *ex, double &T, /* ============================================================ */ /* Phase 2: metric RHS + inverse */ /* ============================================================ */ - kern_phase2_metric_rhs<<>>( - D(S_alpn1), D(S_chin1), - D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), - D(S_trK), - D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), - D(S_betaxx), D(S_betaxy), D(S_betaxz), - D(S_betayx), D(S_betayy), D(S_betayz), - D(S_betazx), D(S_betazy), D(S_betazz), - D(S_div_beta), - D(S_chi_rhs), D(S_gxx_rhs), D(S_gyy_rhs), D(S_gzz_rhs), - D(S_gxy_rhs), D(S_gyz_rhs), D(S_gxz_rhs)); + kern_phase2_metric_rhs<<>>( + D(S_alpn1), D(S_chin1), + D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), + D(S_trK), + D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), + D(S_betaxx), D(S_betaxy), D(S_betaxz), + D(S_betayx), D(S_betayy), D(S_betayz), + D(S_betazx), D(S_betazy), D(S_betazz), + D(S_chi_rhs), D(S_gxx_rhs), D(S_gyy_rhs), D(S_gzz_rhs), + D(S_gxy_rhs), D(S_gyz_rhs), D(S_gxz_rhs)); kern_phase2_inverse<<>>( D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), @@ -2280,13 +2275,12 @@ int f_compute_rhs_bssn(int *ex, double &T, gpu_fderivs(D(S_Gamz), D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), SYM,SYM,ANTI, all); /* Phase 8: Gamma_rhs part 2 */ - kern_phase8_gamma_rhs_part2<<>>( - D(S_gupxx), D(S_gupxy), D(S_gupxz), - D(S_gupyy), D(S_gupyz), D(S_gupzz), - D(S_div_beta), - D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), - D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), - D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz), + kern_phase8_gamma_rhs_part2<<>>( + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx), + D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy), + D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz), D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), @@ -2426,17 +2420,16 @@ int f_compute_rhs_bssn(int *ex, double &T, /* ============================================================ */ /* Phase 15: trK_rhs, Aij_rhs, gauge */ /* ============================================================ */ - kern_phase15_trK_Aij_gauge<<>>( - D(S_alpn1), D(S_chin1), - D(S_chix), D(S_chiy), D(S_chiz), - D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), - D(S_gupxx), D(S_gupxy), D(S_gupxz), - D(S_gupyy), D(S_gupyz), D(S_gupzz), - D(S_trK), - D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), - D(S_Lapx), D(S_Lapy), D(S_Lapz), - D(S_div_beta), - D(S_betaxx), D(S_betaxy), D(S_betaxz), + kern_phase15_trK_Aij_gauge<<>>( + D(S_alpn1), D(S_chin1), + D(S_chix), D(S_chiy), D(S_chiz), + D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), + D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_trK), + D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), + D(S_Lapx), D(S_Lapy), D(S_Lapz), + D(S_betaxx), D(S_betaxy), D(S_betaxz), D(S_betayx), D(S_betayy), D(S_betayz), D(S_betazx), D(S_betazy), D(S_betazz), D(S_rho), @@ -2461,33 +2454,33 @@ int f_compute_rhs_bssn(int *ex, double &T, D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs), D(S_f_arr), D(S_S_arr)); - /* ============================================================ */ - /* Phase 16/17: advection + KO dissipation (shared ord=3 pack) */ - /* ============================================================ */ - gpu_lopsided_kodis(D(S_gxx), D(S_dxx), D(S_gxx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Gamz), D(S_Gamz), D(S_Gamz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_gxy), D(S_gxy), D(S_gxy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all); - gpu_lopsided_kodis(D(S_Lap), D(S_Lap), D(S_Lap_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_gxz), D(S_gxz), D(S_gxz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_betax), D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_gyy), D(S_dyy), D(S_gyy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_betay), D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); - gpu_lopsided_kodis(D(S_gyz), D(S_gyz), D(S_gyz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all); - gpu_lopsided_kodis(D(S_betaz), D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_gzz), D(S_dzz), D(S_gzz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_dtSfx), D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Axx), D(S_Axx), D(S_Axx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_dtSfy), D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); - gpu_lopsided_kodis(D(S_Axy), D(S_Axy), D(S_Axy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all); - gpu_lopsided_kodis(D(S_dtSfz), D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_Axz), D(S_Axz), D(S_Axz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_Ayy), D(S_Ayy), D(S_Ayy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Ayz), D(S_Ayz), D(S_Ayz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all); - gpu_lopsided_kodis(D(S_Azz), D(S_Azz), D(S_Azz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_chi), D(S_chi), D(S_chi_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_trK), D(S_trK), D(S_trK_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Gamx), D(S_Gamx), D(S_Gamx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Gamy), D(S_Gamy), D(S_Gamy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); + /* ============================================================ */ + /* Phase 16/17: advection + KO dissipation (shared ord=3 pack) */ + /* ============================================================ */ + gpu_lopsided_kodis(D(S_gxx), D(S_dxx), D(S_gxx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_Gamz), D(S_Gamz), D(S_Gamz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); + gpu_lopsided_kodis(D(S_gxy), D(S_gxy), D(S_gxy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all); + gpu_lopsided_kodis(D(S_Lap), D(S_Lap), D(S_Lap_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_gxz), D(S_gxz), D(S_gxz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all); + gpu_lopsided_kodis(D(S_betax), D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_gyy), D(S_dyy), D(S_gyy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_betay), D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); + gpu_lopsided_kodis(D(S_gyz), D(S_gyz), D(S_gyz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all); + gpu_lopsided_kodis(D(S_betaz), D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); + gpu_lopsided_kodis(D(S_gzz), D(S_dzz), D(S_gzz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_dtSfx), D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_Axx), D(S_Axx), D(S_Axx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_dtSfy), D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); + gpu_lopsided_kodis(D(S_Axy), D(S_Axy), D(S_Axy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all); + gpu_lopsided_kodis(D(S_dtSfz), D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); + gpu_lopsided_kodis(D(S_Axz), D(S_Axz), D(S_Axz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all); + gpu_lopsided_kodis(D(S_Ayy), D(S_Ayy), D(S_Ayy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_Ayz), D(S_Ayz), D(S_Ayz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all); + gpu_lopsided_kodis(D(S_Azz), D(S_Azz), D(S_Azz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_chi), D(S_chi), D(S_chi_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_trK), D(S_trK), D(S_trK_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_Gamx), D(S_Gamx), D(S_Gamx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); + gpu_lopsided_kodis(D(S_Gamy), D(S_Gamy), D(S_Gamy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); /* ============================================================ */ /* Phase 18: Hamilton & momentum constraints (co==0) */ @@ -2527,46 +2520,46 @@ int f_compute_rhs_bssn(int *ex, double &T, D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res)); } - /* ============================================================ */ - /* D2H: copy all output arrays back to host */ - /* ============================================================ */ - const int d2h_slot_count = D2H_BASE_SLOT_COUNT + - ((co == 0) ? D2H_CONSTRAINT_SLOT_COUNT : 0); - CUDA_CHECK(cudaMemcpy(g_buf.h_stage, D(S_chi_rhs), - (size_t)d2h_slot_count * bytes, - cudaMemcpyDeviceToHost)); + /* ============================================================ */ + /* D2H: copy all output arrays back to host */ + /* ============================================================ */ + const int d2h_slot_count = D2H_BASE_SLOT_COUNT + + ((co == 0) ? D2H_CONSTRAINT_SLOT_COUNT : 0); + CUDA_CHECK(cudaMemcpy(g_buf.h_stage, D(S_chi_rhs), + (size_t)d2h_slot_count * bytes, + cudaMemcpyDeviceToHost)); + + double *d2h_dst[] = { + chi_rhs, trK_rhs, + gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs, + Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs, + Gamx_rhs, Gamy_rhs, Gamz_rhs, + Lap_rhs, betax_rhs, betay_rhs, betaz_rhs, + dtSfx_rhs, dtSfy_rhs, dtSfz_rhs, + Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz, + Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz, + Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz, + Rxx, Rxy, Rxz, Ryy, Ryz, Rzz + }; + static_assert((int)(sizeof(d2h_dst) / sizeof(d2h_dst[0])) == D2H_BASE_SLOT_COUNT, + "d2h_dst list must match D2H_BASE_SLOT_COUNT"); + for (int s = 0; s < D2H_BASE_SLOT_COUNT; ++s) { + std::memcpy(d2h_dst[s], g_buf.h_stage + (size_t)s * all, bytes); + } + if (co == 0) { + double *d2h_dst_co[] = { + ham_Res, movx_Res, movy_Res, movz_Res, Gmx_Res, Gmy_Res, Gmz_Res + }; + static_assert((int)(sizeof(d2h_dst_co) / sizeof(d2h_dst_co[0])) == + D2H_CONSTRAINT_SLOT_COUNT, + "d2h_dst_co list must match D2H_CONSTRAINT_SLOT_COUNT"); + for (int s = 0; s < D2H_CONSTRAINT_SLOT_COUNT; ++s) { + std::memcpy(d2h_dst_co[s], + g_buf.h_stage + (size_t)(D2H_BASE_SLOT_COUNT + s) * all, + bytes); + } + } - double *d2h_dst[] = { - chi_rhs, trK_rhs, - gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs, - Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs, - Gamx_rhs, Gamy_rhs, Gamz_rhs, - Lap_rhs, betax_rhs, betay_rhs, betaz_rhs, - dtSfx_rhs, dtSfy_rhs, dtSfz_rhs, - Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz, - Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz, - Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz, - Rxx, Rxy, Rxz, Ryy, Ryz, Rzz - }; - static_assert((int)(sizeof(d2h_dst) / sizeof(d2h_dst[0])) == D2H_BASE_SLOT_COUNT, - "d2h_dst list must match D2H_BASE_SLOT_COUNT"); - for (int s = 0; s < D2H_BASE_SLOT_COUNT; ++s) { - std::memcpy(d2h_dst[s], g_buf.h_stage + (size_t)s * all, bytes); - } - if (co == 0) { - double *d2h_dst_co[] = { - ham_Res, movx_Res, movy_Res, movz_Res, Gmx_Res, Gmy_Res, Gmz_Res - }; - static_assert((int)(sizeof(d2h_dst_co) / sizeof(d2h_dst_co[0])) == - D2H_CONSTRAINT_SLOT_COUNT, - "d2h_dst_co list must match D2H_CONSTRAINT_SLOT_COUNT"); - for (int s = 0; s < D2H_CONSTRAINT_SLOT_COUNT; ++s) { - std::memcpy(d2h_dst_co[s], - g_buf.h_stage + (size_t)(D2H_BASE_SLOT_COUNT + s) * all, - bytes); - } - } - - #undef D - return 0; -} + #undef D + return 0; +}