Stabilize EScalar CUDA sync defaults

This commit is contained in:
2026-05-03 00:24:50 +08:00
parent 74ba5feb86
commit 4430d04ee7
6 changed files with 243 additions and 20 deletions

View File

@@ -203,7 +203,16 @@ static bool escalar_host_pin_enabled() {
static int enabled = -1;
if (enabled < 0) {
const char *env = getenv("AMSS_CUDA_PIN_ESCALAR_TRANSFERS");
enabled = (!env || atoi(env) != 0) ? 1 : 0;
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
// Returns true when the environment variable AMSS_ESCALAR_GPU_RK is set to a
// value that atoi() parses as non-zero; returns false when it is unset or
// parses as 0. The environment is consulted only on the first call and the
// answer is cached in a function-local static for all later calls.
static bool escalar_gpu_rk_enabled() {
  static int cached_flag = -1;  // -1 means "not resolved yet"
  if (cached_flag >= 0) {
    return cached_flag == 1;
  }
  const char *raw = getenv("AMSS_ESCALAR_GPU_RK");
  if (raw != nullptr && atoi(raw) != 0) {
    cached_flag = 1;
  } else {
    cached_flag = 0;
  }
  return cached_flag == 1;
}
@@ -588,6 +597,8 @@ static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = {
struct StepContext {
double *d_state0_mem;
double *d_accum_mem;
double *d_escalar0_mem;
double *d_escalar_accum_mem;
double *d_state_curr_mem;
double *d_state_next_mem;
std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
@@ -596,6 +607,8 @@ struct StepContext {
double *h_comm_mem;
std::array<double *, BSSN_STATE_COUNT> d_state0;
std::array<double *, BSSN_STATE_COUNT> d_accum;
std::array<double *, 2> d_escalar0;
std::array<double *, 2> d_escalar_accum;
std::array<double *, BSSN_STATE_COUNT> d_state_curr;
std::array<double *, BSSN_STATE_COUNT> d_state_next;
std::array<std::array<double *, BSSN_STATE_COUNT>, BSSN_RESIDENT_BANK_COUNT> d_resident;
@@ -615,6 +628,7 @@ struct StepContext {
StepContext()
: d_state0_mem(nullptr), d_accum_mem(nullptr),
d_escalar0_mem(nullptr), d_escalar_accum_mem(nullptr),
d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
d_resident_mem{},
d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr),
@@ -625,6 +639,8 @@ struct StepContext {
d_resident_mem.fill(nullptr);
d_state0.fill(nullptr);
d_accum.fill(nullptr);
d_escalar0.fill(nullptr);
d_escalar_accum.fill(nullptr);
d_state_curr.fill(nullptr);
d_state_next.fill(nullptr);
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
@@ -641,6 +657,8 @@ struct StepContext {
struct StepAllocation {
double *d_state0_mem;
double *d_accum_mem;
double *d_escalar0_mem;
double *d_escalar_accum_mem;
std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
double *d_matter_mem;
double *d_comm_mem;
@@ -661,6 +679,8 @@ static StepAllocation empty_step_allocation()
StepAllocation alloc = {};
alloc.d_state0_mem = nullptr;
alloc.d_accum_mem = nullptr;
alloc.d_escalar0_mem = nullptr;
alloc.d_escalar_accum_mem = nullptr;
alloc.d_resident_mem.fill(nullptr);
alloc.d_matter_mem = nullptr;
alloc.d_comm_mem = nullptr;
@@ -682,6 +702,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
StepAllocation alloc = {};
alloc.d_state0_mem = ctx.d_state0_mem;
alloc.d_accum_mem = ctx.d_accum_mem;
alloc.d_escalar0_mem = ctx.d_escalar0_mem;
alloc.d_escalar_accum_mem = ctx.d_escalar_accum_mem;
alloc.d_resident_mem = ctx.d_resident_mem;
alloc.d_matter_mem = ctx.d_matter_mem;
alloc.d_comm_mem = ctx.d_comm_mem;
@@ -692,6 +714,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
alloc.cap_h_comm = ctx.cap_h_comm;
ctx.d_state0_mem = nullptr;
ctx.d_accum_mem = nullptr;
ctx.d_escalar0_mem = nullptr;
ctx.d_escalar_accum_mem = nullptr;
ctx.d_state_curr_mem = nullptr;
ctx.d_state_next_mem = nullptr;
ctx.d_resident_mem.fill(nullptr);
@@ -708,6 +732,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
ctx.resident_clock = 0;
ctx.d_state0.fill(nullptr);
ctx.d_accum.fill(nullptr);
ctx.d_escalar0.fill(nullptr);
ctx.d_escalar_accum.fill(nullptr);
ctx.d_state_curr.fill(nullptr);
ctx.d_state_next.fill(nullptr);
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
@@ -725,6 +751,8 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc
{
ctx.d_state0_mem = alloc.d_state0_mem;
ctx.d_accum_mem = alloc.d_accum_mem;
ctx.d_escalar0_mem = alloc.d_escalar0_mem;
ctx.d_escalar_accum_mem = alloc.d_escalar_accum_mem;
ctx.d_resident_mem = alloc.d_resident_mem;
ctx.d_state_curr_mem = nullptr;
ctx.d_state_next_mem = nullptr;
@@ -849,6 +877,12 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
ctx.d_resident[b][i] = ctx.d_resident_mem[b] + (size_t)i * all;
}
}
if (ctx.d_escalar0_mem && ctx.d_escalar_accum_mem) {
for (int i = 0; i < 2; ++i) {
ctx.d_escalar0[i] = ctx.d_escalar0_mem + (size_t)i * all;
ctx.d_escalar_accum[i] = ctx.d_escalar_accum_mem + (size_t)i * all;
}
}
if (ctx.current_bank >= 0) {
ctx.d_state_curr_mem = ctx.d_resident_mem[ctx.current_bank];
ctx.d_state_curr = ctx.d_resident[ctx.current_bank];
@@ -859,6 +893,18 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
return ctx;
}
// Makes sure the two EScalar device buffers on `ctx` exist and refreshes the
// per-field pointers into them.
//
//   ctx  step context; d_escalar0_mem / d_escalar_accum_mem are allocated
//        here when still null, and d_escalar0[] / d_escalar_accum[] are
//        rewritten on every call.
//   all  element offset between field 0 and field 1 inside each buffer.
//
// Each buffer is requested with room for 2 * ctx.cap_all doubles. A buffer
// whose pointer is already non-null is reused untouched.
// NOTE(review): the field slices are spaced by `all` while the allocation is
// sized from ctx.cap_all, so this relies on all <= ctx.cap_all at allocation
// time and afterwards — confirm the caller guarantees that.
static void ensure_escalar_buffers(StepContext &ctx, size_t all)
{
  if (ctx.d_escalar0_mem == nullptr) {
    CUDA_CHECK(cudaMalloc(&ctx.d_escalar0_mem, 2 * ctx.cap_all * sizeof(double)));
  }
  if (ctx.d_escalar_accum_mem == nullptr) {
    CUDA_CHECK(cudaMalloc(&ctx.d_escalar_accum_mem, 2 * ctx.cap_all * sizeof(double)));
  }

  // Field 0 starts at the base of each buffer, field 1 follows `all` elements later.
  ctx.d_escalar0[0] = ctx.d_escalar0_mem;
  ctx.d_escalar_accum[0] = ctx.d_escalar_accum_mem;
  ctx.d_escalar0[1] = ctx.d_escalar0_mem + all;
  ctx.d_escalar_accum[1] = ctx.d_escalar_accum_mem + all;
}
static void release_step_ctx(void *block_tag)
{
auto it = g_step_ctx.find(block_tag);
@@ -7113,14 +7159,78 @@ int bssn_cuda_compute_escalar_matter(void *block_tag,
ctx.d_matter[4], ctx.d_matter[5], ctx.d_matter[6],
ctx.d_matter[7], ctx.d_matter[8], ctx.d_matter[9],
a2);
CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaDeviceSynchronize());
if (!escalar_gpu_rk_enabled()) {
CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaDeviceSynchronize());
}
ctx.matter_ready = true;
(void)Lev;
return 0;
}
extern "C"
int bssn_cuda_escalar_finalize_scalar_fields(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double *Sphi_out_host,
double *Spi_out_host,
const double *propspeed,
const double *soa_flat,
const double *bbox,
double &dT,
int &RK4,
int &apply_bam_bc,
int &Symmetry,
int &Lev,
double &eps,
int &precor)
{
if (!escalar_gpu_rk_enabled())
return 1;
if (RK4 < 0 || RK4 > 3)
return 1;
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const size_t bytes = all * sizeof(double);
setup_grid_params(ex, X, Y, Z, Symmetry, eps, precor);
StepContext &ctx = ensure_step_ctx(block_tag, all);
ensure_escalar_buffers(ctx, all);
if (RK4 == 0) {
CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar0[0], g_buf.slot[S_S_arr],
bytes, cudaMemcpyDeviceToDevice));
CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar0[1], g_buf.slot[S_f_arr],
bytes, cudaMemcpyDeviceToDevice));
}
if (apply_bam_bc) {
gpu_sommerfeld_routbam(g_buf.slot[S_S_arr], g_buf.slot[S_Gamxa],
propspeed[0],
soa_flat[0], soa_flat[1], soa_flat[2],
X, Y, Z, bbox, Symmetry);
gpu_sommerfeld_routbam(g_buf.slot[S_f_arr], g_buf.slot[S_Gamya],
propspeed[1],
soa_flat[3], soa_flat[4], soa_flat[5],
X, Y, Z, bbox, Symmetry);
}
kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_escalar0[0], g_buf.slot[S_Gamxa],
ctx.d_escalar_accum[0], dT, RK4);
kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_escalar0[1], g_buf.slot[S_Gamya],
ctx.d_escalar_accum[1], dT, RK4);
try_pin_escalar_host_buffer(Sphi_out_host, bytes);
try_pin_escalar_host_buffer(Spi_out_host, bytes);
CUDA_CHECK(cudaMemcpyAsync(Sphi_out_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpyAsync(Spi_out_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaDeviceSynchronize());
(void)Lev;
return 0;
}
extern "C"
int bssn_cuda_rk4_substep(void *block_tag,
int *ex, double *X, double *Y, double *Z,