Cache matter fields in StepContext across RK4 substeps
This commit is contained in:
@@ -248,9 +248,12 @@ static const int k_matter_slots[BSSN_MATTER_COUNT] = {
|
|||||||
struct StepContext {
|
struct StepContext {
|
||||||
double *d_state0_mem;
|
double *d_state0_mem;
|
||||||
double *d_accum_mem;
|
double *d_accum_mem;
|
||||||
|
double *d_matter_mem;
|
||||||
std::array<double *, BSSN_STATE_COUNT> d_state0;
|
std::array<double *, BSSN_STATE_COUNT> d_state0;
|
||||||
std::array<double *, BSSN_STATE_COUNT> d_accum;
|
std::array<double *, BSSN_STATE_COUNT> d_accum;
|
||||||
|
std::array<double *, BSSN_MATTER_COUNT> d_matter;
|
||||||
size_t cap_all;
|
size_t cap_all;
|
||||||
|
bool matter_ready;
|
||||||
};
|
};
|
||||||
|
|
||||||
static std::unordered_map<void *, StepContext> g_step_ctx;
|
static std::unordered_map<void *, StepContext> g_step_ctx;
|
||||||
@@ -318,14 +321,23 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
|
|||||||
cudaFree(ctx.d_accum_mem);
|
cudaFree(ctx.d_accum_mem);
|
||||||
ctx.d_accum_mem = nullptr;
|
ctx.d_accum_mem = nullptr;
|
||||||
}
|
}
|
||||||
|
if (ctx.d_matter_mem) {
|
||||||
|
cudaFree(ctx.d_matter_mem);
|
||||||
|
ctx.d_matter_mem = nullptr;
|
||||||
|
}
|
||||||
CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
|
CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
|
||||||
CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
|
CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
|
||||||
|
CUDA_CHECK(cudaMalloc(&ctx.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
|
||||||
ctx.cap_all = all;
|
ctx.cap_all = all;
|
||||||
|
ctx.matter_ready = false;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
|
ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
|
||||||
ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
|
ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
|
||||||
}
|
}
|
||||||
|
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
|
||||||
|
ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all;
|
||||||
|
}
|
||||||
return ctx;
|
return ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -335,6 +347,7 @@ static void release_step_ctx(void *block_tag)
|
|||||||
if (it == g_step_ctx.end()) return;
|
if (it == g_step_ctx.end()) return;
|
||||||
if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem);
|
if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem);
|
||||||
if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem);
|
if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem);
|
||||||
|
if (it->second.d_matter_mem) cudaFree(it->second.d_matter_mem);
|
||||||
g_step_ctx.erase(it);
|
g_step_ctx.erase(it);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2384,25 +2397,38 @@ static void setup_grid_params(int *ex,
|
|||||||
CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
|
CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void upload_state_and_matter(double **state_host,
|
static void upload_state_inputs(double **state_host, size_t all)
|
||||||
double **matter_host,
|
|
||||||
size_t all)
|
|
||||||
{
|
{
|
||||||
static_assert(BSSN_STATE_COUNT + BSSN_MATTER_COUNT == H2D_INPUT_SLOT_COUNT,
|
|
||||||
"state + matter upload must match contiguous input slots");
|
|
||||||
const size_t bytes = all * sizeof(double);
|
const size_t bytes = all * sizeof(double);
|
||||||
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
|
std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
|
||||||
}
|
}
|
||||||
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
|
|
||||||
std::memcpy(g_buf.h_stage + (size_t)(BSSN_STATE_COUNT + i) * all,
|
|
||||||
matter_host[i], bytes);
|
|
||||||
}
|
|
||||||
CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
|
CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
|
||||||
(size_t)H2D_INPUT_SLOT_COUNT * bytes,
|
(size_t)BSSN_STATE_COUNT * bytes,
|
||||||
cudaMemcpyHostToDevice));
|
cudaMemcpyHostToDevice));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void upload_matter_cache(StepContext &ctx,
|
||||||
|
double **matter_host,
|
||||||
|
size_t all)
|
||||||
|
{
|
||||||
|
const size_t bytes = all * sizeof(double);
|
||||||
|
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
|
||||||
|
std::memcpy(g_buf.h_stage + (size_t)i * all, matter_host[i], bytes);
|
||||||
|
}
|
||||||
|
CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage,
|
||||||
|
(size_t)BSSN_MATTER_COUNT * bytes,
|
||||||
|
cudaMemcpyHostToDevice));
|
||||||
|
ctx.matter_ready = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bind_matter_slots(const StepContext &ctx)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
|
||||||
|
g_buf.slot[k_matter_slots[i]] = ctx.d_matter[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void launch_rhs_pipeline(int all, double eps, int co)
|
static void launch_rhs_pipeline(int all, double eps, int co)
|
||||||
{
|
{
|
||||||
const double SYM = 1.0;
|
const double SYM = 1.0;
|
||||||
@@ -3254,14 +3280,17 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
|||||||
const size_t bytes = all * sizeof(double);
|
const size_t bytes = all * sizeof(double);
|
||||||
|
|
||||||
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
|
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
|
||||||
upload_state_and_matter(state_host_in, matter_host, all);
|
|
||||||
|
|
||||||
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
||||||
|
upload_state_inputs(state_host_in, all);
|
||||||
if (RK4 == 0) {
|
if (RK4 == 0) {
|
||||||
|
upload_matter_cache(ctx, matter_host, all);
|
||||||
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
|
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
|
||||||
(size_t)BSSN_STATE_COUNT * bytes,
|
(size_t)BSSN_STATE_COUNT * bytes,
|
||||||
cudaMemcpyDeviceToDevice));
|
cudaMemcpyDeviceToDevice));
|
||||||
|
} else if (!ctx.matter_ready) {
|
||||||
|
upload_matter_cache(ctx, matter_host, all);
|
||||||
}
|
}
|
||||||
|
bind_matter_slots(ctx);
|
||||||
|
|
||||||
launch_rhs_pipeline((int)all, eps, co);
|
launch_rhs_pipeline((int)all, eps, co);
|
||||||
|
|
||||||
@@ -3286,6 +3315,9 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
|||||||
}
|
}
|
||||||
|
|
||||||
download_state_outputs(state_host_out, all);
|
download_state_outputs(state_host_out, all);
|
||||||
|
if (RK4 == 3) {
|
||||||
|
release_step_ctx(block_tag);
|
||||||
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user