Cache matter fields in StepContext across RK4 substeps

This commit is contained in:
2026-04-12 22:19:45 +08:00
parent 4fa12a2009
commit 7f2a391dd2

View File

@@ -248,9 +248,12 @@ static const int k_matter_slots[BSSN_MATTER_COUNT] = {
// Device-side scratch storage kept for one block between RK4 substeps.
// Each *_mem pointer is a single cudaMalloc'd slab; the matching std::array
// holds one pointer per field into that slab, at offset i * all elements.
struct StepContext {
// Slab of BSSN_STATE_COUNT * cap_all doubles; receives a device-to-device
// copy of the state inputs on the first substep (RK4 == 0).
double *d_state0_mem;
// Slab of BSSN_STATE_COUNT * cap_all doubles.
// NOTE(review): presumably the RK4 accumulator; its use is outside this view.
double *d_accum_mem;
// Slab of BSSN_MATTER_COUNT * cap_all doubles holding the cached matter fields.
double *d_matter_mem;
// Per-field pointers into d_state0_mem.
std::array<double *, BSSN_STATE_COUNT> d_state0;
// Per-field pointers into d_accum_mem.
std::array<double *, BSSN_STATE_COUNT> d_accum;
// Per-field pointers into d_matter_mem; these are what get bound into g_buf.slot.
std::array<double *, BSSN_MATTER_COUNT> d_matter;
// Element count per field that the slabs were last allocated for.
size_t cap_all;
// False after the slabs are (re)allocated, true once the matter fields have
// been uploaded into d_matter_mem.
bool matter_ready;
};
// Step contexts keyed by the caller-supplied block tag; entries are erased by
// release_step_ctx after their device slabs are freed.
static std::unordered_map<void *, StepContext> g_step_ctx;
@@ -318,14 +321,23 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
cudaFree(ctx.d_accum_mem);
ctx.d_accum_mem = nullptr;
}
if (ctx.d_matter_mem) {
cudaFree(ctx.d_matter_mem);
ctx.d_matter_mem = nullptr;
}
CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
CUDA_CHECK(cudaMalloc(&ctx.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
ctx.cap_all = all;
ctx.matter_ready = false;
}
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
}
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all;
}
return ctx;
}
@@ -335,6 +347,7 @@ static void release_step_ctx(void *block_tag)
if (it == g_step_ctx.end()) return;
if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem);
if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem);
if (it->second.d_matter_mem) cudaFree(it->second.d_matter_mem);
g_step_ctx.erase(it);
}
@@ -2384,25 +2397,38 @@ static void setup_grid_params(int *ex,
CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
}
static void upload_state_and_matter(double **state_host,
double **matter_host,
size_t all)
static void upload_state_inputs(double **state_host, size_t all)
{
static_assert(BSSN_STATE_COUNT + BSSN_MATTER_COUNT == H2D_INPUT_SLOT_COUNT,
"state + matter upload must match contiguous input slots");
const size_t bytes = all * sizeof(double);
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
}
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
std::memcpy(g_buf.h_stage + (size_t)(BSSN_STATE_COUNT + i) * all,
matter_host[i], bytes);
}
CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
(size_t)H2D_INPUT_SLOT_COUNT * bytes,
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyHostToDevice));
}
// Fills the per-block matter cache on the device.
//
// The BSSN_MATTER_COUNT host arrays (each `all` doubles) are first gathered
// into g_buf.h_stage, then transferred into ctx.d_matter_mem with one
// cudaMemcpy. On return ctx.matter_ready is set, so later substeps can skip
// the upload.
//
//   ctx          step context whose d_matter_mem slab receives the data
//   matter_host  array of BSSN_MATTER_COUNT host pointers
//   all          number of grid points per field
static void upload_matter_cache(StepContext &ctx,
                                double **matter_host,
                                size_t all)
{
    const size_t field_bytes = all * sizeof(double);

    int field = 0;
    while (field < BSSN_MATTER_COUNT) {
        // Field `field` lands at element offset field * all in the staging buffer.
        std::memcpy(g_buf.h_stage + (size_t)field * all,
                    matter_host[field],
                    field_bytes);
        ++field;
    }

    const size_t total_bytes = (size_t)BSSN_MATTER_COUNT * field_bytes;
    CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage, total_bytes,
                          cudaMemcpyHostToDevice));

    ctx.matter_ready = true;
}
// Redirects the matter entries of g_buf.slot to the cached device arrays.
//
// For each matter field i, the slot index k_matter_slots[i] is set to
// ctx.d_matter[i]. No data is copied. The bound pointers alias
// ctx.d_matter_mem, so they stay valid only until that slab is reallocated
// or freed.
static void bind_matter_slots(const StepContext &ctx)
{
    int field = 0;
    for (double *cached : ctx.d_matter) {
        g_buf.slot[k_matter_slots[field]] = cached;
        ++field;
    }
}
static void launch_rhs_pipeline(int all, double eps, int co)
{
const double SYM = 1.0;
@@ -3254,14 +3280,17 @@ int bssn_cuda_rk4_substep(void *block_tag,
const size_t bytes = all * sizeof(double);
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
upload_state_and_matter(state_host_in, matter_host, all);
StepContext &ctx = ensure_step_ctx(block_tag, all);
upload_state_inputs(state_host_in, all);
if (RK4 == 0) {
upload_matter_cache(ctx, matter_host, all);
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToDevice));
} else if (!ctx.matter_ready) {
upload_matter_cache(ctx, matter_host, all);
}
bind_matter_slots(ctx);
launch_rhs_pipeline((int)all, eps, co);
@@ -3286,6 +3315,9 @@ int bssn_cuda_rk4_substep(void *block_tag,
}
download_state_outputs(state_host_out, all);
if (RK4 == 3) {
release_step_ctx(block_tag);
}
return 0;
}