Stabilize EScalar CUDA sync defaults
This commit is contained in:
@@ -203,7 +203,16 @@ static bool escalar_host_pin_enabled() {
|
||||
static int enabled = -1;
|
||||
if (enabled < 0) {
|
||||
const char *env = getenv("AMSS_CUDA_PIN_ESCALAR_TRANSFERS");
|
||||
enabled = (!env || atoi(env) != 0) ? 1 : 0;
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
// Reports whether the GPU-side EScalar RK path is switched on.
// The AMSS_ESCALAR_GPU_RK environment variable is read on the first call
// only; the answer is cached in a function-local static afterwards.
// The path is off unless the variable is set to a value that atoi()
// parses as non-zero.
static bool escalar_gpu_rk_enabled() {
  // Tri-state cache: -1 = not evaluated yet, 0 = disabled, 1 = enabled.
  static int cached_flag = -1;
  if (cached_flag >= 0)
    return cached_flag != 0;

  const char *raw = getenv("AMSS_ESCALAR_GPU_RK");
  if (raw != nullptr && atoi(raw) != 0)
    cached_flag = 1;
  else
    cached_flag = 0;
  return cached_flag != 0;
}
|
||||
@@ -588,6 +597,8 @@ static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = {
|
||||
struct StepContext {
|
||||
double *d_state0_mem;
|
||||
double *d_accum_mem;
|
||||
double *d_escalar0_mem;
|
||||
double *d_escalar_accum_mem;
|
||||
double *d_state_curr_mem;
|
||||
double *d_state_next_mem;
|
||||
std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
|
||||
@@ -596,6 +607,8 @@ struct StepContext {
|
||||
double *h_comm_mem;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_state0;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_accum;
|
||||
std::array<double *, 2> d_escalar0;
|
||||
std::array<double *, 2> d_escalar_accum;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_state_curr;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_state_next;
|
||||
std::array<std::array<double *, BSSN_STATE_COUNT>, BSSN_RESIDENT_BANK_COUNT> d_resident;
|
||||
@@ -615,6 +628,7 @@ struct StepContext {
|
||||
|
||||
StepContext()
|
||||
: d_state0_mem(nullptr), d_accum_mem(nullptr),
|
||||
d_escalar0_mem(nullptr), d_escalar_accum_mem(nullptr),
|
||||
d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
|
||||
d_resident_mem{},
|
||||
d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr),
|
||||
@@ -625,6 +639,8 @@ struct StepContext {
|
||||
d_resident_mem.fill(nullptr);
|
||||
d_state0.fill(nullptr);
|
||||
d_accum.fill(nullptr);
|
||||
d_escalar0.fill(nullptr);
|
||||
d_escalar_accum.fill(nullptr);
|
||||
d_state_curr.fill(nullptr);
|
||||
d_state_next.fill(nullptr);
|
||||
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
|
||||
@@ -641,6 +657,8 @@ struct StepContext {
|
||||
struct StepAllocation {
|
||||
double *d_state0_mem;
|
||||
double *d_accum_mem;
|
||||
double *d_escalar0_mem;
|
||||
double *d_escalar_accum_mem;
|
||||
std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
|
||||
double *d_matter_mem;
|
||||
double *d_comm_mem;
|
||||
@@ -661,6 +679,8 @@ static StepAllocation empty_step_allocation()
|
||||
StepAllocation alloc = {};
|
||||
alloc.d_state0_mem = nullptr;
|
||||
alloc.d_accum_mem = nullptr;
|
||||
alloc.d_escalar0_mem = nullptr;
|
||||
alloc.d_escalar_accum_mem = nullptr;
|
||||
alloc.d_resident_mem.fill(nullptr);
|
||||
alloc.d_matter_mem = nullptr;
|
||||
alloc.d_comm_mem = nullptr;
|
||||
@@ -682,6 +702,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
|
||||
StepAllocation alloc = {};
|
||||
alloc.d_state0_mem = ctx.d_state0_mem;
|
||||
alloc.d_accum_mem = ctx.d_accum_mem;
|
||||
alloc.d_escalar0_mem = ctx.d_escalar0_mem;
|
||||
alloc.d_escalar_accum_mem = ctx.d_escalar_accum_mem;
|
||||
alloc.d_resident_mem = ctx.d_resident_mem;
|
||||
alloc.d_matter_mem = ctx.d_matter_mem;
|
||||
alloc.d_comm_mem = ctx.d_comm_mem;
|
||||
@@ -692,6 +714,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
|
||||
alloc.cap_h_comm = ctx.cap_h_comm;
|
||||
ctx.d_state0_mem = nullptr;
|
||||
ctx.d_accum_mem = nullptr;
|
||||
ctx.d_escalar0_mem = nullptr;
|
||||
ctx.d_escalar_accum_mem = nullptr;
|
||||
ctx.d_state_curr_mem = nullptr;
|
||||
ctx.d_state_next_mem = nullptr;
|
||||
ctx.d_resident_mem.fill(nullptr);
|
||||
@@ -708,6 +732,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
|
||||
ctx.resident_clock = 0;
|
||||
ctx.d_state0.fill(nullptr);
|
||||
ctx.d_accum.fill(nullptr);
|
||||
ctx.d_escalar0.fill(nullptr);
|
||||
ctx.d_escalar_accum.fill(nullptr);
|
||||
ctx.d_state_curr.fill(nullptr);
|
||||
ctx.d_state_next.fill(nullptr);
|
||||
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
|
||||
@@ -725,6 +751,8 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc
|
||||
{
|
||||
ctx.d_state0_mem = alloc.d_state0_mem;
|
||||
ctx.d_accum_mem = alloc.d_accum_mem;
|
||||
ctx.d_escalar0_mem = alloc.d_escalar0_mem;
|
||||
ctx.d_escalar_accum_mem = alloc.d_escalar_accum_mem;
|
||||
ctx.d_resident_mem = alloc.d_resident_mem;
|
||||
ctx.d_state_curr_mem = nullptr;
|
||||
ctx.d_state_next_mem = nullptr;
|
||||
@@ -849,6 +877,12 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
|
||||
ctx.d_resident[b][i] = ctx.d_resident_mem[b] + (size_t)i * all;
|
||||
}
|
||||
}
|
||||
if (ctx.d_escalar0_mem && ctx.d_escalar_accum_mem) {
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
ctx.d_escalar0[i] = ctx.d_escalar0_mem + (size_t)i * all;
|
||||
ctx.d_escalar_accum[i] = ctx.d_escalar_accum_mem + (size_t)i * all;
|
||||
}
|
||||
}
|
||||
if (ctx.current_bank >= 0) {
|
||||
ctx.d_state_curr_mem = ctx.d_resident_mem[ctx.current_bank];
|
||||
ctx.d_state_curr = ctx.d_resident[ctx.current_bank];
|
||||
@@ -859,6 +893,18 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
|
||||
return ctx;
|
||||
}
|
||||
|
||||
// Lazily allocates the two EScalar device buffers owned by `ctx` and
// (re)computes the per-field pointers into them.
//
//   ctx - step context whose d_escalar0 / d_escalar_accum storage is set up
//   all - element stride between field 0 and field 1 inside each buffer
//
// Each backing allocation is sized for two fields of ctx.cap_all doubles.
// An allocation that already exists is reused as-is.
// NOTE(review): the buffers are sized from ctx.cap_all but sliced with
// `all`; this presumably relies on all <= ctx.cap_all, and an existing
// buffer is not regrown here if cap_all has increased since it was
// allocated - confirm against ensure_step_ctx / release_step_ctx.
static void ensure_escalar_buffers(StepContext &ctx, size_t all)
{
  const size_t pair_bytes = 2 * ctx.cap_all * sizeof(double);

  if (ctx.d_escalar0_mem == nullptr)
    CUDA_CHECK(cudaMalloc(&ctx.d_escalar0_mem, pair_bytes));
  if (ctx.d_escalar_accum_mem == nullptr)
    CUDA_CHECK(cudaMalloc(&ctx.d_escalar_accum_mem, pair_bytes));

  // Field 0 starts at the base of each allocation, field 1 `all` elements in.
  ctx.d_escalar0[0] = ctx.d_escalar0_mem;
  ctx.d_escalar_accum[0] = ctx.d_escalar_accum_mem;
  ctx.d_escalar0[1] = ctx.d_escalar0_mem + all;
  ctx.d_escalar_accum[1] = ctx.d_escalar_accum_mem + all;
}
|
||||
|
||||
static void release_step_ctx(void *block_tag)
|
||||
{
|
||||
auto it = g_step_ctx.find(block_tag);
|
||||
@@ -7113,14 +7159,78 @@ int bssn_cuda_compute_escalar_matter(void *block_tag,
|
||||
ctx.d_matter[4], ctx.d_matter[5], ctx.d_matter[6],
|
||||
ctx.d_matter[7], ctx.d_matter[8], ctx.d_matter[9],
|
||||
a2);
|
||||
CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
if (!escalar_gpu_rk_enabled()) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
ctx.matter_ready = true;
|
||||
(void)Lev;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Runs the RK4 finalize step for the two EScalar fields on the device and
// copies the contents of the S_Gamxa / S_Gamya slots back to the host.
//
// Parameters (as used in this function):
//   block_tag            - key passed to ensure_step_ctx to find the context
//   ex                   - three extents; their product is the point count
//   X, Y, Z              - forwarded to setup_grid_params and the BAM boundary helper
//   Sphi_out_host,
//   Spi_out_host         - host destinations for the S_Gamxa / S_Gamya slots
//   propspeed            - [0] is used for the S_S_arr field, [1] for S_f_arr
//   soa_flat             - six values: [0..2] for the first field, [3..5] for the second
//   bbox, Symmetry       - forwarded to gpu_sommerfeld_routbam
//   dT, RK4              - passed to kern_rk4_finalize; RK4 must be 0..3
//   apply_bam_bc         - non-zero runs gpu_sommerfeld_routbam before the kernels
//   eps, precor          - forwarded to setup_grid_params
//   Lev                  - unused
//
// Returns 1 without touching the device when escalar_gpu_rk_enabled() is
// false or RK4 is outside 0..3; returns 0 after the final device
// synchronization otherwise.
//
// NOTE(review): no cudaGetLastError() follows the kernel launches; whether
// adding one is safe depends on how try_pin_escalar_host_buffer treats
// failed registrations - confirm before adding.
extern "C"
int bssn_cuda_escalar_finalize_scalar_fields(void *block_tag,
                                             int *ex, double *X, double *Y, double *Z,
                                             double *Sphi_out_host,
                                             double *Spi_out_host,
                                             const double *propspeed,
                                             const double *soa_flat,
                                             const double *bbox,
                                             double &dT,
                                             int &RK4,
                                             int &apply_bam_bc,
                                             int &Symmetry,
                                             int &Lev,
                                             double &eps,
                                             int &precor)
{
  (void)Lev;

  // Bail out early: feature switch first, then the substep index range.
  if (!escalar_gpu_rk_enabled() || RK4 < 0 || RK4 > 3)
    return 1;

  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const size_t npts = (size_t)ex[0] * ex[1] * ex[2];
  const size_t nbytes = npts * sizeof(double);

  setup_grid_params(ex, X, Y, Z, Symmetry, eps, precor);
  StepContext &ctx = ensure_step_ctx(block_tag, npts);
  ensure_escalar_buffers(ctx, npts);

  // First substep: snapshot the current field slots into d_escalar0.
  if (RK4 == 0) {
    CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar0[0], g_buf.slot[S_S_arr],
                               nbytes, cudaMemcpyDeviceToDevice));
    CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar0[1], g_buf.slot[S_f_arr],
                               nbytes, cudaMemcpyDeviceToDevice));
  }

  // Optional boundary treatment, one call per field with its own
  // propagation speed and its own triple of soa_flat entries.
  if (apply_bam_bc != 0) {
    gpu_sommerfeld_routbam(g_buf.slot[S_S_arr], g_buf.slot[S_Gamxa],
                           propspeed[0],
                           soa_flat[0], soa_flat[1], soa_flat[2],
                           X, Y, Z, bbox, Symmetry);
    gpu_sommerfeld_routbam(g_buf.slot[S_f_arr], g_buf.slot[S_Gamya],
                           propspeed[1],
                           soa_flat[3], soa_flat[4], soa_flat[5],
                           X, Y, Z, bbox, Symmetry);
  }

  // One finalize launch per field, neither with an explicit stream argument.
  kern_rk4_finalize<<<grid(npts), BLK>>>(ctx.d_escalar0[0], g_buf.slot[S_Gamxa],
                                         ctx.d_escalar_accum[0], dT, RK4);
  kern_rk4_finalize<<<grid(npts), BLK>>>(ctx.d_escalar0[1], g_buf.slot[S_Gamya],
                                         ctx.d_escalar_accum[1], dT, RK4);

  // Hand both host destinations to the pinning helper before the copies.
  try_pin_escalar_host_buffer(Sphi_out_host, nbytes);
  try_pin_escalar_host_buffer(Spi_out_host, nbytes);

  CUDA_CHECK(cudaMemcpyAsync(Sphi_out_host, g_buf.slot[S_Gamxa], nbytes, cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpyAsync(Spi_out_host, g_buf.slot[S_Gamya], nbytes, cudaMemcpyDeviceToHost));

  // Block until the copies above have completed before returning to the caller.
  CUDA_CHECK(cudaDeviceSynchronize());
  return 0;
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_rk4_substep(void *block_tag,
|
||||
int *ex, double *X, double *Y, double *Z,
|
||||
|
||||
Reference in New Issue
Block a user