Cache matter fields in StepContext across RK4 substeps
This commit is contained in:
@@ -248,9 +248,12 @@ static const int k_matter_slots[BSSN_MATTER_COUNT] = {
|
||||
struct StepContext {
|
||||
double *d_state0_mem;
|
||||
double *d_accum_mem;
|
||||
double *d_matter_mem;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_state0;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_accum;
|
||||
std::array<double *, BSSN_MATTER_COUNT> d_matter;
|
||||
size_t cap_all;
|
||||
bool matter_ready;
|
||||
};
|
||||
|
||||
static std::unordered_map<void *, StepContext> g_step_ctx;
|
||||
@@ -318,14 +321,23 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
|
||||
cudaFree(ctx.d_accum_mem);
|
||||
ctx.d_accum_mem = nullptr;
|
||||
}
|
||||
if (ctx.d_matter_mem) {
|
||||
cudaFree(ctx.d_matter_mem);
|
||||
ctx.d_matter_mem = nullptr;
|
||||
}
|
||||
CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
|
||||
CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
|
||||
CUDA_CHECK(cudaMalloc(&ctx.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
|
||||
ctx.cap_all = all;
|
||||
ctx.matter_ready = false;
|
||||
}
|
||||
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||
ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
|
||||
ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
|
||||
}
|
||||
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
|
||||
ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all;
|
||||
}
|
||||
return ctx;
|
||||
}
|
||||
|
||||
@@ -335,6 +347,7 @@ static void release_step_ctx(void *block_tag)
|
||||
if (it == g_step_ctx.end()) return;
|
||||
if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem);
|
||||
if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem);
|
||||
if (it->second.d_matter_mem) cudaFree(it->second.d_matter_mem);
|
||||
g_step_ctx.erase(it);
|
||||
}
|
||||
|
||||
@@ -2384,25 +2397,38 @@ static void setup_grid_params(int *ex,
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
|
||||
}
|
||||
|
||||
static void upload_state_and_matter(double **state_host,
|
||||
double **matter_host,
|
||||
size_t all)
|
||||
static void upload_state_inputs(double **state_host, size_t all)
|
||||
{
|
||||
static_assert(BSSN_STATE_COUNT + BSSN_MATTER_COUNT == H2D_INPUT_SLOT_COUNT,
|
||||
"state + matter upload must match contiguous input slots");
|
||||
const size_t bytes = all * sizeof(double);
|
||||
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||
std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
|
||||
}
|
||||
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
|
||||
std::memcpy(g_buf.h_stage + (size_t)(BSSN_STATE_COUNT + i) * all,
|
||||
matter_host[i], bytes);
|
||||
}
|
||||
CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
|
||||
(size_t)H2D_INPUT_SLOT_COUNT * bytes,
|
||||
(size_t)BSSN_STATE_COUNT * bytes,
|
||||
cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
static void upload_matter_cache(StepContext &ctx,
|
||||
double **matter_host,
|
||||
size_t all)
|
||||
{
|
||||
const size_t bytes = all * sizeof(double);
|
||||
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
|
||||
std::memcpy(g_buf.h_stage + (size_t)i * all, matter_host[i], bytes);
|
||||
}
|
||||
CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage,
|
||||
(size_t)BSSN_MATTER_COUNT * bytes,
|
||||
cudaMemcpyHostToDevice));
|
||||
ctx.matter_ready = true;
|
||||
}
|
||||
|
||||
static void bind_matter_slots(const StepContext &ctx)
|
||||
{
|
||||
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
|
||||
g_buf.slot[k_matter_slots[i]] = ctx.d_matter[i];
|
||||
}
|
||||
}
|
||||
|
||||
static void launch_rhs_pipeline(int all, double eps, int co)
|
||||
{
|
||||
const double SYM = 1.0;
|
||||
@@ -3254,14 +3280,17 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
||||
const size_t bytes = all * sizeof(double);
|
||||
|
||||
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
|
||||
upload_state_and_matter(state_host_in, matter_host, all);
|
||||
|
||||
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
||||
upload_state_inputs(state_host_in, all);
|
||||
if (RK4 == 0) {
|
||||
upload_matter_cache(ctx, matter_host, all);
|
||||
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
|
||||
(size_t)BSSN_STATE_COUNT * bytes,
|
||||
cudaMemcpyDeviceToDevice));
|
||||
} else if (!ctx.matter_ready) {
|
||||
upload_matter_cache(ctx, matter_host, all);
|
||||
}
|
||||
bind_matter_slots(ctx);
|
||||
|
||||
launch_rhs_pipeline((int)all, eps, co);
|
||||
|
||||
@@ -3286,6 +3315,9 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
||||
}
|
||||
|
||||
download_state_outputs(state_host_out, all);
|
||||
if (RK4 == 3) {
|
||||
release_step_ctx(block_tag);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user