merge lopsided+kodis
This commit is contained in:
@@ -5,61 +5,61 @@
|
||||
* Compile with nvcc, link bssn_rhs_cuda.o in place of bssn_rhs_c.o.
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <cuda_runtime.h>
|
||||
#include "macrodef.h"
|
||||
#include "bssn_rhs.h"
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <cuda_runtime.h>
|
||||
#include "macrodef.h"
|
||||
#include "bssn_rhs.h"
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Multi-GPU dispatch: distribute ranks across available GPUs */
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Per-process GPU dispatch state, filled in once by init_gpu_dispatch(). */
static struct GpuDispatchState {
    int  num_gpus;       /* visible CUDA devices (forced to >= 1 after init) */
    int  my_rank;        /* global MPI rank read from launcher env vars */
    int  my_local_rank;  /* node-local rank, or -1 when unknown */
    int  my_device;      /* CUDA device selected for this rank */
    bool inited;         /* guards the one-time initialization */
} g_dispatch = {0, -1, -1, -1, false};
|
||||
|
||||
/* Parse an integer environment variable.
 *
 * Returns `fallback` when the variable is unset, empty, or contains no
 * leading integer.  Uses strtol instead of atoi: atoi has undefined
 * behavior on out-of-range input and cannot distinguish a stored "0"
 * from unparseable text.  Out-of-range values are clamped to int range.
 */
static int env_to_int(const char *name, int fallback = -1) {
    const char *v = getenv(name);
    if (!v || !*v) return fallback;
    char *end = nullptr;
    long parsed = strtol(v, &end, 10);
    if (end == v) return fallback;            /* no digits consumed */
    if (parsed > 2147483647L)       parsed = 2147483647L;
    if (parsed < -2147483647L - 1L) parsed = -2147483647L - 1L;
    return (int)parsed;
}
|
||||
|
||||
/* One-time per-process GPU selection.
 *
 * Maps this MPI rank onto one of the visible CUDA devices round-robin.
 * The node-local rank (when a launcher exposes it) is preferred over the
 * global rank so multi-node jobs map evenly per node.  With no launcher
 * environment at all, rank 0 / device 0 is assumed.  Idempotent: only the
 * first call does any work.
 */
static void init_gpu_dispatch() {
    if (g_dispatch.inited) return;

    cudaError_t err = cudaGetDeviceCount(&g_dispatch.num_gpus);
    if (err != cudaSuccess) g_dispatch.num_gpus = 1;
    if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1;

    /* Global MPI rank from environment (set by mpirun/mpiexec/srun). */
    g_dispatch.my_rank = env_to_int("PMI_RANK",
                          env_to_int("OMPI_COMM_WORLD_RANK",
                           env_to_int("MV2_COMM_WORLD_RANK",
                            env_to_int("SLURM_PROCID", 0))));

    /* Prefer local rank for per-node GPU mapping (avoids cross-node skew). */
    g_dispatch.my_local_rank = env_to_int("OMPI_COMM_WORLD_LOCAL_RANK",
                                env_to_int("MV2_COMM_WORLD_LOCAL_RANK",
                                 env_to_int("MPI_LOCALRANKID",
                                  env_to_int("SLURM_LOCALID", -1))));

    const int rank_for_map = (g_dispatch.my_local_rank >= 0)
                               ? g_dispatch.my_local_rank : g_dispatch.my_rank;
    g_dispatch.my_device = rank_for_map % g_dispatch.num_gpus;

    /* Fix: the original ignored cudaSetDevice failures, silently leaving the
     * process on the default device.  Warn and fall back to device 0. */
    err = cudaSetDevice(g_dispatch.my_device);
    if (err != cudaSuccess) {
        fprintf(stderr,
                "[AMSS-GPU] cudaSetDevice(%d) failed (%s); falling back to device 0\n",
                g_dispatch.my_device, cudaGetErrorString(err));
        g_dispatch.my_device = 0;
        cudaSetDevice(0);
    }

    if (g_dispatch.my_rank == 0) {
        printf("[AMSS-GPU] %d GPU(s) detected, device map uses %s rank\n",
               g_dispatch.num_gpus,
               (g_dispatch.my_local_rank >= 0) ? "local" : "global");
    }
    g_dispatch.inited = true;
}
|
||||
static struct {
|
||||
int num_gpus;
|
||||
int my_rank;
|
||||
int my_local_rank;
|
||||
int my_device;
|
||||
bool inited;
|
||||
} g_dispatch = {0, -1, -1, -1, false};
|
||||
|
||||
static int env_to_int(const char *name, int fallback = -1) {
|
||||
const char *v = getenv(name);
|
||||
if (!v || !*v) return fallback;
|
||||
return atoi(v);
|
||||
}
|
||||
|
||||
static void init_gpu_dispatch() {
|
||||
if (g_dispatch.inited) return;
|
||||
cudaError_t err = cudaGetDeviceCount(&g_dispatch.num_gpus);
|
||||
if (err != cudaSuccess) g_dispatch.num_gpus = 1;
|
||||
if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1;
|
||||
|
||||
/* Get MPI rank from environment (set by mpirun/mpiexec). */
|
||||
g_dispatch.my_rank = env_to_int("PMI_RANK",
|
||||
env_to_int("OMPI_COMM_WORLD_RANK",
|
||||
env_to_int("MV2_COMM_WORLD_RANK",
|
||||
env_to_int("SLURM_PROCID", 0))));
|
||||
|
||||
/* Prefer local rank for per-node GPU mapping (avoids cross-node skew). */
|
||||
g_dispatch.my_local_rank = env_to_int("OMPI_COMM_WORLD_LOCAL_RANK",
|
||||
env_to_int("MV2_COMM_WORLD_LOCAL_RANK",
|
||||
env_to_int("MPI_LOCALRANKID",
|
||||
env_to_int("SLURM_LOCALID", -1))));
|
||||
|
||||
const int rank_for_map = (g_dispatch.my_local_rank >= 0)
|
||||
? g_dispatch.my_local_rank : g_dispatch.my_rank;
|
||||
g_dispatch.my_device = rank_for_map % g_dispatch.num_gpus;
|
||||
cudaSetDevice(g_dispatch.my_device);
|
||||
|
||||
if (g_dispatch.my_rank == 0) {
|
||||
printf("[AMSS-GPU] %d GPU(s) detected, device map uses %s rank\n",
|
||||
g_dispatch.num_gpus,
|
||||
(g_dispatch.my_local_rank >= 0) ? "local" : "global");
|
||||
}
|
||||
g_dispatch.inited = true;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Error checking */
|
||||
@@ -141,24 +141,24 @@ __device__ __forceinline__ int idx_fh3(int iF, int jF, int kF) {
|
||||
/* Total number of "all"-sized slots */
|
||||
#define NUM_SLOTS 160
|
||||
|
||||
/* Buffer bundle shared by all RHS launches.  One large device allocation
 * (d_mem) is carved into equally sized "all"-shaped slots; d_fh2/d_fh3 are
 * ghost-padded scratch fields and h_stage is a host bounce buffer used for
 * the bulk H2D/D2H transfers. */
struct GpuBuffers {
    double *d_mem;           /* single big device allocation */
    double *d_fh2;           /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */
    double *d_fh3;           /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */
    double *h_stage;         /* host staging buffer for bulk H2D/D2H */
    bool h_stage_pinned;     /* true if allocated via cudaMallocHost */
    double *slot[NUM_SLOTS]; /* per-field views carved out of d_mem */
    size_t cap_all;          /* current capacity of one slot, in elements */
    size_t cap_fh2_size;     /* current d_fh2 capacity, in elements */
    size_t cap_fh3_size;     /* current d_fh3 capacity, in elements */
    int prev_nx, prev_ny, prev_nz; /* grid extents of the last slot layout */
    bool initialized;        /* true once the first allocation succeeded */
};

static GpuBuffers g_buf = {
    nullptr, nullptr, nullptr, nullptr, false, {},
    0, 0, 0, 0, 0, 0, false
};
|
||||
struct GpuBuffers {
|
||||
double *d_mem; /* single big allocation */
|
||||
double *d_fh2; /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */
|
||||
double *d_fh3; /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */
|
||||
double *h_stage; /* host staging buffer for bulk H2D/D2H */
|
||||
bool h_stage_pinned; /* true if allocated by cudaMallocHost */
|
||||
double *slot[NUM_SLOTS]; /* pointers into d_mem */
|
||||
size_t cap_all;
|
||||
size_t cap_fh2_size;
|
||||
size_t cap_fh3_size;
|
||||
int prev_nx, prev_ny, prev_nz;
|
||||
bool initialized;
|
||||
};
|
||||
|
||||
static GpuBuffers g_buf = {
|
||||
nullptr, nullptr, nullptr, nullptr, false, {},
|
||||
0, 0, 0, 0, 0, 0, false
|
||||
};
|
||||
|
||||
/* Slot assignments — INPUT (H2D) */
|
||||
enum {
|
||||
@@ -209,75 +209,75 @@ enum {
|
||||
NUM_USED_SLOTS
|
||||
};
|
||||
|
||||
static_assert(NUM_USED_SLOTS <= NUM_SLOTS, "Increase NUM_SLOTS");
|
||||
|
||||
static const int H2D_INPUT_SLOT_COUNT = (S_Szz - S_chi + 1);
|
||||
static const int D2H_BASE_SLOT_COUNT = (S_Rzz - S_chi_rhs + 1);
|
||||
static const int D2H_CONSTRAINT_SLOT_COUNT = (S_Gmz_Res - S_ham_Res + 1);
|
||||
static const int STAGE_SLOT_COUNT =
|
||||
(H2D_INPUT_SLOT_COUNT > (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT))
|
||||
? H2D_INPUT_SLOT_COUNT
|
||||
: (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT);
|
||||
|
||||
/* (Re)size the shared GPU buffers for an nx*ny*nz grid.
 * Allocations only grow: existing buffers are reused whenever they are
 * already large enough.  Slot pointers and the cached grid extents are
 * refreshed on every call so subsequent launches see the current layout. */
static void ensure_gpu_buffers(int nx, int ny, int nz) {
    const size_t all      = (size_t)nx * ny * nz;
    const size_t fh2_size = (size_t)(nx + 2) * (ny + 2) * (nz + 2);
    const size_t fh3_size = (size_t)(nx + 3) * (ny + 3) * (nz + 3);

    const bool fits = g_buf.initialized
                   && all      <= g_buf.cap_all
                   && fh2_size <= g_buf.cap_fh2_size
                   && fh3_size <= g_buf.cap_fh3_size;

    if (!fits) {
        /* Release whatever is currently held before growing. */
        if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
        if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
        if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }
        if (g_buf.h_stage) {
            if (g_buf.h_stage_pinned) cudaFreeHost(g_buf.h_stage);
            else                      free(g_buf.h_stage);
            g_buf.h_stage = nullptr;
            g_buf.h_stage_pinned = false;
        }

        CUDA_CHECK(cudaMalloc(&g_buf.d_mem, NUM_USED_SLOTS * all * sizeof(double)));
        CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double)));
        CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double)));

        /* Pinned host memory gives faster async-capable copies; fall back
           to ordinary malloc when pinning is refused. */
        const size_t stage_bytes = (size_t)STAGE_SLOT_COUNT * all * sizeof(double);
        if (cudaMallocHost((void**)&g_buf.h_stage, stage_bytes) == cudaSuccess) {
            g_buf.h_stage_pinned = true;
        } else {
            g_buf.h_stage_pinned = false;
            g_buf.h_stage = (double *)malloc(stage_bytes);
            if (!g_buf.h_stage) {
                fprintf(stderr, "Host stage allocation failed (%zu bytes)\n", stage_bytes);
                exit(EXIT_FAILURE);
            }
        }

        g_buf.cap_all      = all;
        g_buf.cap_fh2_size = fh2_size;
        g_buf.cap_fh3_size = fh3_size;
        g_buf.initialized  = true;
    }

    /* Carve d_mem into contiguous "all"-sized slots for the current grid. */
    for (int s = 0; s < NUM_USED_SLOTS; ++s)
        g_buf.slot[s] = g_buf.d_mem + s * all;

    g_buf.prev_nx = nx;
    g_buf.prev_ny = ny;
    g_buf.prev_nz = nz;
}
|
||||
static_assert(NUM_USED_SLOTS <= NUM_SLOTS, "Increase NUM_SLOTS");
|
||||
|
||||
static const int H2D_INPUT_SLOT_COUNT = (S_Szz - S_chi + 1);
|
||||
static const int D2H_BASE_SLOT_COUNT = (S_Rzz - S_chi_rhs + 1);
|
||||
static const int D2H_CONSTRAINT_SLOT_COUNT = (S_Gmz_Res - S_ham_Res + 1);
|
||||
static const int STAGE_SLOT_COUNT =
|
||||
(H2D_INPUT_SLOT_COUNT > (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT))
|
||||
? H2D_INPUT_SLOT_COUNT
|
||||
: (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT);
|
||||
|
||||
static void ensure_gpu_buffers(int nx, int ny, int nz) {
|
||||
size_t all = (size_t)nx * ny * nz;
|
||||
size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2);
|
||||
size_t fh3_size = (size_t)(nx+3) * (ny+3) * (nz+3);
|
||||
const bool need_grow = (!g_buf.initialized)
|
||||
|| (all > g_buf.cap_all)
|
||||
|| (fh2_size > g_buf.cap_fh2_size)
|
||||
|| (fh3_size > g_buf.cap_fh3_size);
|
||||
|
||||
if (need_grow) {
|
||||
if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
|
||||
if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
|
||||
if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }
|
||||
if (g_buf.h_stage) {
|
||||
if (g_buf.h_stage_pinned) cudaFreeHost(g_buf.h_stage);
|
||||
else free(g_buf.h_stage);
|
||||
g_buf.h_stage = nullptr;
|
||||
g_buf.h_stage_pinned = false;
|
||||
}
|
||||
|
||||
CUDA_CHECK(cudaMalloc(&g_buf.d_mem, NUM_USED_SLOTS * all * sizeof(double)));
|
||||
CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double)));
|
||||
CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double)));
|
||||
|
||||
const size_t stage_bytes = (size_t)STAGE_SLOT_COUNT * all * sizeof(double);
|
||||
cudaError_t stage_err = cudaMallocHost((void**)&g_buf.h_stage, stage_bytes);
|
||||
if (stage_err == cudaSuccess) {
|
||||
g_buf.h_stage_pinned = true;
|
||||
} else {
|
||||
g_buf.h_stage = (double *)malloc(stage_bytes);
|
||||
g_buf.h_stage_pinned = false;
|
||||
if (!g_buf.h_stage) {
|
||||
fprintf(stderr, "Host stage allocation failed (%zu bytes)\n", stage_bytes);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
g_buf.cap_all = all;
|
||||
g_buf.cap_fh2_size = fh2_size;
|
||||
g_buf.cap_fh3_size = fh3_size;
|
||||
g_buf.initialized = true;
|
||||
}
|
||||
|
||||
for (int s = 0; s < NUM_USED_SLOTS; ++s)
|
||||
g_buf.slot[s] = g_buf.d_mem + s * all;
|
||||
|
||||
g_buf.prev_nx = nx;
|
||||
g_buf.prev_ny = ny;
|
||||
g_buf.prev_nz = nz;
|
||||
}
|
||||
|
||||
/* ================================================================== */
|
||||
/* A. Symmetry boundary kernels (ord=2 and ord=3) */
|
||||
/* ================================================================== */
|
||||
|
||||
/* Step 1: Copy interior into ghost-padded array */
|
||||
__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
|
||||
double * __restrict__ fh,
|
||||
double SoA0, double SoA1, double SoA2)
|
||||
/* Step 1: Copy interior into ghost-padded array */
|
||||
__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
|
||||
double * __restrict__ fh,
|
||||
double SoA0, double SoA1, double SoA2)
|
||||
{
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny;
|
||||
@@ -290,47 +290,47 @@ __global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
|
||||
int k0 = tid / (nx * ny);
|
||||
int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
|
||||
fh[(iF+1) + (jF+1)*fnx + (kF+1)*fnx*fny] = func[tid];
|
||||
}
|
||||
}
|
||||
|
||||
/* Fused symmetry pack (ord=2): fill the full ghost-padded field fh from the
 * interior field func in a single grid-stride pass.  SoA0/1/2 are the
 * per-axis symmetry sign factors applied on reflected ghost layers. */
__global__ void kern_symbd_pack_ord2(const double * __restrict__ func,
                                     double * __restrict__ fh,
                                     double SoA0, double SoA1, double SoA2)
{
    const int nx  = d_gp.ex[0], ny = d_gp.ex[1];
    const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny, fnz = d_gp.fh2_nz;
    const int npts   = fnx * fny * fnz;
    const int stride = blockDim.x * gridDim.x;

    for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < npts; t += stride) {
        /* Decode padded coordinates, shifted to 1-based field indices that
           run from -1 on the symmetry faces up to nx/ny/nz. */
        const int iF = (t % fnx) - 1;
        const int jF = ((t / fnx) % fny) - 1;
        const int kF = (t / (fnx * fny)) - 1;

        /* Mirror ghost indices (<= 0) back into the interior 1..n range,
           collecting one symmetry sign factor per reflected axis. */
        double sgn = 1.0;
        int si = iF, sj = jF, sk = kF;
        if (iF <= 0) { si = 1 - iF; sgn *= SoA0; }
        if (jF <= 0) { sj = 1 - jF; sgn *= SoA1; }
        if (kF <= 0) { sk = 1 - kF; sgn *= SoA2; }

        const int src = (si - 1) + (sj - 1) * nx + (sk - 1) * nx * ny;
        fh[t] = sgn * func[src];
    }
}
|
||||
|
||||
/* Step 2: Fill i-ghosts (x-direction symmetry) */
|
||||
__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
/* Fused symmetry pack (ord=2): fill full fh from interior func in one pass. */
|
||||
__global__ void kern_symbd_pack_ord2(const double * __restrict__ func,
|
||||
double * __restrict__ fh,
|
||||
double SoA0, double SoA1, double SoA2)
|
||||
{
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1];
|
||||
const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny, fnz = d_gp.fh2_nz;
|
||||
const int total = fnx * fny * fnz;
|
||||
|
||||
for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
tid < total;
|
||||
tid += blockDim.x * gridDim.x)
|
||||
{
|
||||
int ii = tid % fnx;
|
||||
int jj = (tid / fnx) % fny;
|
||||
int kk = tid / (fnx * fny);
|
||||
|
||||
int iF = ii - 1; /* -1 .. nx */
|
||||
int jF = jj - 1; /* -1 .. ny */
|
||||
int kF = kk - 1; /* -1 .. nz */
|
||||
|
||||
int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */
|
||||
int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */
|
||||
int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */
|
||||
|
||||
double sign = 1.0;
|
||||
if (iF <= 0) sign *= SoA0;
|
||||
if (jF <= 0) sign *= SoA1;
|
||||
if (kF <= 0) sign *= SoA2;
|
||||
|
||||
int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny;
|
||||
fh[tid] = sign * func[src];
|
||||
}
|
||||
}
|
||||
|
||||
/* Step 2: Fill i-ghosts (x-direction symmetry) */
|
||||
__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0)
|
||||
{
|
||||
const int ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny;
|
||||
/* ord=2: fill iF=0 and iF=-1, i.e. ghost layers ii=0 from ii=2, ii=1 from ii=1 */
|
||||
@@ -404,9 +404,9 @@ __global__ void kern_symbd_kghost_ord2(double * __restrict__ fh, double SoA2)
|
||||
|
||||
/* ---- ord=3 variants (for lopsided / kodis) ---- */
|
||||
|
||||
__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
|
||||
double * __restrict__ fh)
|
||||
{
|
||||
__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
|
||||
double * __restrict__ fh)
|
||||
{
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny;
|
||||
for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
@@ -418,46 +418,46 @@ __global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
|
||||
int k0 = tid / (nx * ny);
|
||||
int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
|
||||
fh[(iF+2) + (jF+2)*fnx + (kF+2)*fnx*fny] = func[tid];
|
||||
}
|
||||
}
|
||||
|
||||
/* Fused symmetry pack (ord=3): fill the full ghost-padded field fh from the
 * interior field func in a single grid-stride pass.  Two ghost layers per
 * symmetry face (indices down to -2); SoA0/1/2 are the per-axis sign
 * factors applied on reflected layers. */
__global__ void kern_symbd_pack_ord3(const double * __restrict__ func,
                                     double * __restrict__ fh,
                                     double SoA0, double SoA1, double SoA2)
{
    const int nx  = d_gp.ex[0], ny = d_gp.ex[1];
    const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny, fnz = d_gp.fh3_nz;
    const int npts   = fnx * fny * fnz;
    const int stride = blockDim.x * gridDim.x;

    for (int t = blockIdx.x * blockDim.x + threadIdx.x; t < npts; t += stride) {
        /* Decode padded coordinates, shifted to 1-based field indices that
           run from -2 on the symmetry faces up to nx/ny/nz. */
        const int iF = (t % fnx) - 2;
        const int jF = ((t / fnx) % fny) - 2;
        const int kF = (t / (fnx * fny)) - 2;

        /* Mirror ghost indices (<= 0) back into the interior 1..n range,
           collecting one symmetry sign factor per reflected axis. */
        double sgn = 1.0;
        int si = iF, sj = jF, sk = kF;
        if (iF <= 0) { si = 1 - iF; sgn *= SoA0; }
        if (jF <= 0) { sj = 1 - jF; sgn *= SoA1; }
        if (kF <= 0) { sk = 1 - kF; sgn *= SoA2; }

        const int src = (si - 1) + (sj - 1) * nx + (sk - 1) * nx * ny;
        fh[t] = sgn * func[src];
    }
}
|
||||
|
||||
__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
/* Fused symmetry pack (ord=3): fill full fh from interior func in one pass. */
|
||||
__global__ void kern_symbd_pack_ord3(const double * __restrict__ func,
|
||||
double * __restrict__ fh,
|
||||
double SoA0, double SoA1, double SoA2)
|
||||
{
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1];
|
||||
const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny, fnz = d_gp.fh3_nz;
|
||||
const int total = fnx * fny * fnz;
|
||||
|
||||
for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
tid < total;
|
||||
tid += blockDim.x * gridDim.x)
|
||||
{
|
||||
int ii = tid % fnx;
|
||||
int jj = (tid / fnx) % fny;
|
||||
int kk = tid / (fnx * fny);
|
||||
|
||||
int iF = ii - 2; /* -2 .. nx */
|
||||
int jF = jj - 2; /* -2 .. ny */
|
||||
int kF = kk - 2; /* -2 .. nz */
|
||||
|
||||
int siF = (iF <= 0) ? (1 - iF) : iF; /* 1..nx */
|
||||
int sjF = (jF <= 0) ? (1 - jF) : jF; /* 1..ny */
|
||||
int skF = (kF <= 0) ? (1 - kF) : kF; /* 1..nz */
|
||||
|
||||
double sign = 1.0;
|
||||
if (iF <= 0) sign *= SoA0;
|
||||
if (jF <= 0) sign *= SoA1;
|
||||
if (kF <= 0) sign *= SoA2;
|
||||
|
||||
int src = (siF - 1) + (sjF - 1) * nx + (skF - 1) * nx * ny;
|
||||
fh[tid] = sign * func[src];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0)
|
||||
{
|
||||
const int ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny;
|
||||
int total = ny * nz;
|
||||
@@ -874,73 +874,69 @@ void kern_kodis(const double * __restrict__ fh,
|
||||
/* ================================================================== */
|
||||
/* Host wrapper helpers */
|
||||
/* ================================================================== */
|
||||
/* Launch geometry: fixed thread-block size plus a grid-size helper. */
static const int BLK = 128;

/* Number of BLK-sized blocks needed to cover n work items, clamped to the
 * gridDim.x limit (2^31 - 1).  The kernels in this file use grid-stride
 * loops, so clamping never drops work.  Always returns at least 1. */
static inline int grid(size_t n) {
    if (n == 0) return 1;
    const size_t blocks = (n + BLK - 1) / BLK;
    return (blocks > 2147483647u) ? 2147483647 : (int)blocks;
}
|
||||
static const int BLK = 128;
|
||||
static inline int grid(size_t n) {
|
||||
if (n == 0) return 1;
|
||||
size_t g = (n + BLK - 1) / BLK;
|
||||
if (g > 2147483647u) g = 2147483647u;
|
||||
return (int)g;
|
||||
}
|
||||
|
||||
/* Run the ord=2 symmetry pack on the GPU, then launch the first-derivative
 * kernel over the "all" interior points.  Uses the shared fh2 scratch field
 * and the grid extents cached by ensure_gpu_buffers. */
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
                        double SoA0, double SoA1, double SoA2, int all)
{
    const size_t ex0 = (size_t)g_buf.prev_nx;
    const size_t ex1 = (size_t)g_buf.prev_ny;
    const size_t ex2 = (size_t)g_buf.prev_nz;
    const size_t padded = (ex0 + 2ull) * (ex1 + 2ull) * (ex2 + 2ull);
    double *fh = g_buf.d_fh2;

    kern_symbd_pack_ord2<<<grid(padded), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
    kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
}
|
||||
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
|
||||
double SoA0, double SoA1, double SoA2, int all)
|
||||
{
|
||||
double *fh = g_buf.d_fh2;
|
||||
const size_t nx = (size_t)g_buf.prev_nx;
|
||||
const size_t ny = (size_t)g_buf.prev_ny;
|
||||
const size_t nz = (size_t)g_buf.prev_nz;
|
||||
const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull);
|
||||
|
||||
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||
kern_fderivs<<<grid(all), BLK>>>(fh, d_fx, d_fy, d_fz);
|
||||
}
|
||||
|
||||
/* Run the ord=2 symmetry pack on the GPU, then launch the second-derivative
 * kernel over the "all" interior points.  Uses the shared fh2 scratch field
 * and the grid extents cached by ensure_gpu_buffers. */
static void gpu_fdderivs(double *d_f,
                         double *d_fxx, double *d_fxy, double *d_fxz,
                         double *d_fyy, double *d_fyz, double *d_fzz,
                         double SoA0, double SoA1, double SoA2, int all)
{
    const size_t ex0 = (size_t)g_buf.prev_nx;
    const size_t ex1 = (size_t)g_buf.prev_ny;
    const size_t ex2 = (size_t)g_buf.prev_nz;
    const size_t padded = (ex0 + 2ull) * (ex1 + 2ull) * (ex2 + 2ull);
    double *fh = g_buf.d_fh2;

    kern_symbd_pack_ord2<<<grid(padded), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
    kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
}
|
||||
{
|
||||
double *fh = g_buf.d_fh2;
|
||||
const size_t nx = (size_t)g_buf.prev_nx;
|
||||
const size_t ny = (size_t)g_buf.prev_ny;
|
||||
const size_t nz = (size_t)g_buf.prev_nz;
|
||||
const size_t w_pack = (nx + 2ull) * (ny + 2ull) * (nz + 2ull);
|
||||
|
||||
/* Run the ord=3 symmetry pack on the GPU, then launch the lopsided
 * (advection) kernel over the "all" interior points.  Uses the shared fh3
 * scratch field and the grid extents cached by ensure_gpu_buffers. */
static void gpu_lopsided(double *d_f, double *d_f_rhs,
                         double *d_Sfx, double *d_Sfy, double *d_Sfz,
                         double SoA0, double SoA1, double SoA2, int all)
{
    const size_t ex0 = (size_t)g_buf.prev_nx;
    const size_t ex1 = (size_t)g_buf.prev_ny;
    const size_t ex2 = (size_t)g_buf.prev_nz;
    const size_t padded = (ex0 + 3ull) * (ex1 + 3ull) * (ex2 + 3ull);
    double *fh = g_buf.d_fh3;

    kern_symbd_pack_ord3<<<grid(padded), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
    kern_lopsided<<<grid(all), BLK>>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz);
}
|
||||
kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
|
||||
kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
|
||||
}
|
||||
|
||||
/* Run the ord=3 symmetry pack on the GPU, then launch the Kreiss-Oliger
 * dissipation kernel with strength eps_val over the "all" interior points.
 * Uses the shared fh3 scratch field and the cached grid extents. */
static void gpu_kodis(double *d_f, double *d_f_rhs,
                      double SoA0, double SoA1, double SoA2,
                      double eps_val, int all)
{
    const size_t ex0 = (size_t)g_buf.prev_nx;
    const size_t ex1 = (size_t)g_buf.prev_ny;
    const size_t ex2 = (size_t)g_buf.prev_nz;
    const size_t padded = (ex0 + 3ull) * (ex1 + 3ull) * (ex2 + 3ull);
    double *fh = g_buf.d_fh3;

    kern_symbd_pack_ord3<<<grid(padded), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
    kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
}
|
||||
/* Combined ord=3 advection + KO dissipation.
 * When advection and KO read the same source field, the symmetry packing
 * is done once and shared by both kernels.  If they differ (e.g. gxx
 * advection + dxx KO), only the KO pass repacks.  KO is skipped entirely
 * when eps_val <= 0. */
static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
                               double *d_Sfx, double *d_Sfy, double *d_Sfz,
                               double SoA0, double SoA1, double SoA2,
                               double eps_val, int all)
{
    const size_t ex0 = (size_t)g_buf.prev_nx;
    const size_t ex1 = (size_t)g_buf.prev_ny;
    const size_t ex2 = (size_t)g_buf.prev_nz;
    const size_t padded = (ex0 + 3ull) * (ex1 + 3ull) * (ex2 + 3ull);
    double *fh = g_buf.d_fh3;

    /* Advection: pack the source into the ghost-padded field, then run
       the lopsided kernel. */
    kern_symbd_pack_ord3<<<grid(padded), BLK>>>(d_f_adv, fh, SoA0, SoA1, SoA2);
    kern_lopsided<<<grid(all), BLK>>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz);

    if (eps_val > 0.0) {
        /* Reuse the already-packed field when KO reads the same array. */
        if (d_f_ko != d_f_adv)
            kern_symbd_pack_ord3<<<grid(padded), BLK>>>(d_f_ko, fh, SoA0, SoA1, SoA2);
        kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
    }
}
|
||||
|
||||
/* ================================================================== */
|
||||
/* C. Point-wise computation kernels */
|
||||
@@ -2118,12 +2114,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
|
||||
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
|
||||
int &Symmetry, int &Lev, double &eps, int &co)
|
||||
{
|
||||
/* --- Multi-GPU: select device --- */
|
||||
init_gpu_dispatch();
|
||||
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||
{
|
||||
/* --- Multi-GPU: select device --- */
|
||||
init_gpu_dispatch();
|
||||
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||
|
||||
const int nx = ex[0], ny = ex[1], nz = ex[2];
|
||||
const int nx = ex[0], ny = ex[1], nz = ex[2];
|
||||
const int all = nx * ny * nz;
|
||||
const double dX = X[1]-X[0], dY = Y[1]-Y[0], dZ = Z[1]-Z[0];
|
||||
const int NO_SYMM = 0, EQ_SYMM = 1;
|
||||
@@ -2157,30 +2153,30 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
|
||||
|
||||
/* --- Shorthand for device slot pointers --- */
|
||||
#define D(s) g_buf.slot[s]
|
||||
const size_t bytes = (size_t)all * sizeof(double);
|
||||
|
||||
/* --- H2D: stage all inputs, then one bulk copy --- */
|
||||
double *h2d_src[] = {
|
||||
chi, trK, dxx, gxy, gxz, dyy, gyz, dzz,
|
||||
Axx, Axy, Axz, Ayy, Ayz, Azz,
|
||||
Gamx, Gamy, Gamz,
|
||||
Lap, betax, betay, betaz,
|
||||
dtSfx, dtSfy, dtSfz,
|
||||
rho, Sx, Sy, Sz,
|
||||
Sxx, Sxy_m, Sxz, Syy, Syz_m, Szz
|
||||
};
|
||||
static_assert((int)(sizeof(h2d_src) / sizeof(h2d_src[0])) == H2D_INPUT_SLOT_COUNT,
|
||||
"h2d_src list must match H2D_INPUT_SLOT_COUNT");
|
||||
for (int s = 0; s < H2D_INPUT_SLOT_COUNT; ++s) {
|
||||
std::memcpy(g_buf.h_stage + (size_t)s * all, h2d_src[s], bytes);
|
||||
}
|
||||
CUDA_CHECK(cudaMemcpy(D(S_chi), g_buf.h_stage,
|
||||
(size_t)H2D_INPUT_SLOT_COUNT * bytes,
|
||||
cudaMemcpyHostToDevice));
|
||||
#define D(s) g_buf.slot[s]
|
||||
const size_t bytes = (size_t)all * sizeof(double);
|
||||
|
||||
/* ============================================================ */
|
||||
/* Phase 1: prep — alpn1, chin1, gxx, gyy, gzz */
|
||||
/* --- H2D: stage all inputs, then one bulk copy --- */
|
||||
double *h2d_src[] = {
|
||||
chi, trK, dxx, gxy, gxz, dyy, gyz, dzz,
|
||||
Axx, Axy, Axz, Ayy, Ayz, Azz,
|
||||
Gamx, Gamy, Gamz,
|
||||
Lap, betax, betay, betaz,
|
||||
dtSfx, dtSfy, dtSfz,
|
||||
rho, Sx, Sy, Sz,
|
||||
Sxx, Sxy_m, Sxz, Syy, Syz_m, Szz
|
||||
};
|
||||
static_assert((int)(sizeof(h2d_src) / sizeof(h2d_src[0])) == H2D_INPUT_SLOT_COUNT,
|
||||
"h2d_src list must match H2D_INPUT_SLOT_COUNT");
|
||||
for (int s = 0; s < H2D_INPUT_SLOT_COUNT; ++s) {
|
||||
std::memcpy(g_buf.h_stage + (size_t)s * all, h2d_src[s], bytes);
|
||||
}
|
||||
CUDA_CHECK(cudaMemcpy(D(S_chi), g_buf.h_stage,
|
||||
(size_t)H2D_INPUT_SLOT_COUNT * bytes,
|
||||
cudaMemcpyHostToDevice));
|
||||
|
||||
/* ============================================================ */
|
||||
/* Phase 1: prep — alpn1, chin1, gxx, gyy, gzz */
|
||||
/* ============================================================ */
|
||||
kern_phase1_prep<<<grid(all),BLK>>>(
|
||||
D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz),
|
||||
@@ -2466,62 +2462,32 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
D(S_f_arr), D(S_S_arr));
|
||||
|
||||
/* ============================================================ */
|
||||
/* Phase 16: 23x lopsided (advection) */
|
||||
/* Phase 16/17: advection + KO dissipation (shared ord=3 pack) */
|
||||
/* ============================================================ */
|
||||
gpu_lopsided(D(S_gxx), D(S_gxx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
||||
gpu_lopsided(D(S_Gamz), D(S_Gamz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, all);
|
||||
gpu_lopsided(D(S_gxy), D(S_gxy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, all);
|
||||
gpu_lopsided(D(S_Lap), D(S_Lap_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
||||
gpu_lopsided(D(S_gxz), D(S_gxz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, all);
|
||||
gpu_lopsided(D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, all);
|
||||
gpu_lopsided(D(S_gyy), D(S_gyy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
||||
gpu_lopsided(D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, all);
|
||||
gpu_lopsided(D(S_gyz), D(S_gyz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, all);
|
||||
gpu_lopsided(D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, all);
|
||||
gpu_lopsided(D(S_gzz), D(S_gzz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
||||
gpu_lopsided(D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, all);
|
||||
gpu_lopsided(D(S_Axx), D(S_Axx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
||||
gpu_lopsided(D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, all);
|
||||
gpu_lopsided(D(S_Axy), D(S_Axy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, all);
|
||||
gpu_lopsided(D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, all);
|
||||
gpu_lopsided(D(S_Axz), D(S_Axz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, all);
|
||||
gpu_lopsided(D(S_Ayy), D(S_Ayy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
||||
gpu_lopsided(D(S_Ayz), D(S_Ayz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, all);
|
||||
gpu_lopsided(D(S_Azz), D(S_Azz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
||||
gpu_lopsided(D(S_chi), D(S_chi_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
||||
gpu_lopsided(D(S_trK), D(S_trK_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all);
|
||||
gpu_lopsided(D(S_Gamx), D(S_Gamx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, all);
|
||||
gpu_lopsided(D(S_Gamy), D(S_Gamy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, all);
|
||||
|
||||
/* ============================================================ */
|
||||
/* Phase 17: 24x KO dissipation (eps > 0) */
|
||||
/* ============================================================ */
|
||||
if (eps > 0) {
|
||||
gpu_kodis(D(S_chi), D(S_chi_rhs), SYM,SYM,SYM, eps, all);
|
||||
gpu_kodis(D(S_trK), D(S_trK_rhs), SYM,SYM,SYM, eps, all);
|
||||
gpu_kodis(D(S_dxx), D(S_gxx_rhs), SYM,SYM,SYM, eps, all);
|
||||
gpu_kodis(D(S_gxy), D(S_gxy_rhs), ANTI,ANTI,SYM, eps, all);
|
||||
gpu_kodis(D(S_gxz), D(S_gxz_rhs), ANTI,SYM,ANTI, eps, all);
|
||||
gpu_kodis(D(S_dyy), D(S_gyy_rhs), SYM,SYM,SYM, eps, all);
|
||||
gpu_kodis(D(S_gyz), D(S_gyz_rhs), SYM,ANTI,ANTI, eps, all);
|
||||
gpu_kodis(D(S_dzz), D(S_gzz_rhs), SYM,SYM,SYM, eps, all);
|
||||
gpu_kodis(D(S_Axx), D(S_Axx_rhs), SYM,SYM,SYM, eps, all);
|
||||
gpu_kodis(D(S_dtSfz), D(S_dtSfz_rhs), SYM,SYM,ANTI, eps, all);
|
||||
gpu_kodis(D(S_Axy), D(S_Axy_rhs), ANTI,ANTI,SYM, eps, all);
|
||||
gpu_kodis(D(S_dtSfy), D(S_dtSfy_rhs), SYM,ANTI,SYM, eps, all);
|
||||
gpu_kodis(D(S_Axz), D(S_Axz_rhs), ANTI,SYM,ANTI, eps, all);
|
||||
gpu_kodis(D(S_dtSfx), D(S_dtSfx_rhs), ANTI,SYM,SYM, eps, all);
|
||||
gpu_kodis(D(S_Ayy), D(S_Ayy_rhs), SYM,SYM,SYM, eps, all);
|
||||
gpu_kodis(D(S_betaz), D(S_betaz_rhs), SYM,SYM,ANTI, eps, all);
|
||||
gpu_kodis(D(S_Ayz), D(S_Ayz_rhs), SYM,ANTI,ANTI, eps, all);
|
||||
gpu_kodis(D(S_betay), D(S_betay_rhs), SYM,ANTI,SYM, eps, all);
|
||||
gpu_kodis(D(S_Azz), D(S_Azz_rhs), SYM,SYM,SYM, eps, all);
|
||||
gpu_kodis(D(S_betax), D(S_betax_rhs), ANTI,SYM,SYM, eps, all);
|
||||
gpu_kodis(D(S_Gamx), D(S_Gamx_rhs), ANTI,SYM,SYM, eps, all);
|
||||
gpu_kodis(D(S_Lap), D(S_Lap_rhs), SYM,SYM,SYM, eps, all);
|
||||
gpu_kodis(D(S_Gamy), D(S_Gamy_rhs), SYM,ANTI,SYM, eps, all);
|
||||
gpu_kodis(D(S_Gamz), D(S_Gamz_rhs), SYM,SYM,ANTI, eps, all);
|
||||
}
|
||||
gpu_lopsided_kodis(D(S_gxx), D(S_dxx), D(S_gxx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_Gamz), D(S_Gamz), D(S_Gamz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all);
|
||||
gpu_lopsided_kodis(D(S_gxy), D(S_gxy), D(S_gxy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_Lap), D(S_Lap), D(S_Lap_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_gxz), D(S_gxz), D(S_gxz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all);
|
||||
gpu_lopsided_kodis(D(S_betax), D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_gyy), D(S_dyy), D(S_gyy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_betay), D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_gyz), D(S_gyz), D(S_gyz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all);
|
||||
gpu_lopsided_kodis(D(S_betaz), D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all);
|
||||
gpu_lopsided_kodis(D(S_gzz), D(S_dzz), D(S_gzz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_dtSfx), D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_Axx), D(S_Axx), D(S_Axx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_dtSfy), D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_Axy), D(S_Axy), D(S_Axy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_dtSfz), D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all);
|
||||
gpu_lopsided_kodis(D(S_Axz), D(S_Axz), D(S_Axz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all);
|
||||
gpu_lopsided_kodis(D(S_Ayy), D(S_Ayy), D(S_Ayy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_Ayz), D(S_Ayz), D(S_Ayz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all);
|
||||
gpu_lopsided_kodis(D(S_Azz), D(S_Azz), D(S_Azz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_chi), D(S_chi), D(S_chi_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_trK), D(S_trK), D(S_trK_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_Gamx), D(S_Gamx), D(S_Gamx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all);
|
||||
gpu_lopsided_kodis(D(S_Gamy), D(S_Gamy), D(S_Gamy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all);
|
||||
|
||||
/* ============================================================ */
|
||||
/* Phase 18: Hamilton & momentum constraints (co==0) */
|
||||
@@ -2561,46 +2527,46 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res));
|
||||
}
|
||||
|
||||
/* ============================================================ */
|
||||
/* D2H: copy all output arrays back to host */
|
||||
/* ============================================================ */
|
||||
const int d2h_slot_count = D2H_BASE_SLOT_COUNT +
|
||||
((co == 0) ? D2H_CONSTRAINT_SLOT_COUNT : 0);
|
||||
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, D(S_chi_rhs),
|
||||
(size_t)d2h_slot_count * bytes,
|
||||
cudaMemcpyDeviceToHost));
|
||||
|
||||
double *d2h_dst[] = {
|
||||
chi_rhs, trK_rhs,
|
||||
gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
|
||||
Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
|
||||
Gamx_rhs, Gamy_rhs, Gamz_rhs,
|
||||
Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
|
||||
dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
|
||||
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
|
||||
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
|
||||
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
|
||||
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz
|
||||
};
|
||||
static_assert((int)(sizeof(d2h_dst) / sizeof(d2h_dst[0])) == D2H_BASE_SLOT_COUNT,
|
||||
"d2h_dst list must match D2H_BASE_SLOT_COUNT");
|
||||
for (int s = 0; s < D2H_BASE_SLOT_COUNT; ++s) {
|
||||
std::memcpy(d2h_dst[s], g_buf.h_stage + (size_t)s * all, bytes);
|
||||
}
|
||||
if (co == 0) {
|
||||
double *d2h_dst_co[] = {
|
||||
ham_Res, movx_Res, movy_Res, movz_Res, Gmx_Res, Gmy_Res, Gmz_Res
|
||||
};
|
||||
static_assert((int)(sizeof(d2h_dst_co) / sizeof(d2h_dst_co[0])) ==
|
||||
D2H_CONSTRAINT_SLOT_COUNT,
|
||||
"d2h_dst_co list must match D2H_CONSTRAINT_SLOT_COUNT");
|
||||
for (int s = 0; s < D2H_CONSTRAINT_SLOT_COUNT; ++s) {
|
||||
std::memcpy(d2h_dst_co[s],
|
||||
g_buf.h_stage + (size_t)(D2H_BASE_SLOT_COUNT + s) * all,
|
||||
bytes);
|
||||
}
|
||||
}
|
||||
/* ============================================================ */
|
||||
/* D2H: copy all output arrays back to host */
|
||||
/* ============================================================ */
|
||||
const int d2h_slot_count = D2H_BASE_SLOT_COUNT +
|
||||
((co == 0) ? D2H_CONSTRAINT_SLOT_COUNT : 0);
|
||||
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, D(S_chi_rhs),
|
||||
(size_t)d2h_slot_count * bytes,
|
||||
cudaMemcpyDeviceToHost));
|
||||
|
||||
#undef D
|
||||
return 0;
|
||||
}
|
||||
double *d2h_dst[] = {
|
||||
chi_rhs, trK_rhs,
|
||||
gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
|
||||
Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
|
||||
Gamx_rhs, Gamy_rhs, Gamz_rhs,
|
||||
Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
|
||||
dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
|
||||
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
|
||||
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
|
||||
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
|
||||
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz
|
||||
};
|
||||
static_assert((int)(sizeof(d2h_dst) / sizeof(d2h_dst[0])) == D2H_BASE_SLOT_COUNT,
|
||||
"d2h_dst list must match D2H_BASE_SLOT_COUNT");
|
||||
for (int s = 0; s < D2H_BASE_SLOT_COUNT; ++s) {
|
||||
std::memcpy(d2h_dst[s], g_buf.h_stage + (size_t)s * all, bytes);
|
||||
}
|
||||
if (co == 0) {
|
||||
double *d2h_dst_co[] = {
|
||||
ham_Res, movx_Res, movy_Res, movz_Res, Gmx_Res, Gmy_Res, Gmz_Res
|
||||
};
|
||||
static_assert((int)(sizeof(d2h_dst_co) / sizeof(d2h_dst_co[0])) ==
|
||||
D2H_CONSTRAINT_SLOT_COUNT,
|
||||
"d2h_dst_co list must match D2H_CONSTRAINT_SLOT_COUNT");
|
||||
for (int s = 0; s < D2H_CONSTRAINT_SLOT_COUNT; ++s) {
|
||||
std::memcpy(d2h_dst_co[s],
|
||||
g_buf.h_stage + (size_t)(D2H_BASE_SLOT_COUNT + s) * all,
|
||||
bytes);
|
||||
}
|
||||
}
|
||||
|
||||
#undef D
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user