diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 15b5ba5..04f2dd0 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -248,9 +248,12 @@ static const int k_matter_slots[BSSN_MATTER_COUNT] = { struct StepContext { double *d_state0_mem; double *d_accum_mem; + double *d_matter_mem; std::array d_state0; std::array d_accum; + std::array d_matter; size_t cap_all; + bool matter_ready; }; static std::unordered_map g_step_ctx; @@ -318,14 +321,23 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all) cudaFree(ctx.d_accum_mem); ctx.d_accum_mem = nullptr; } + if (ctx.d_matter_mem) { + cudaFree(ctx.d_matter_mem); + ctx.d_matter_mem = nullptr; + } CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double))); CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&ctx.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double))); ctx.cap_all = all; + ctx.matter_ready = false; } for (int i = 0; i < BSSN_STATE_COUNT; ++i) { ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all; ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all; } + for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { + ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all; + } return ctx; } @@ -335,6 +347,7 @@ static void release_step_ctx(void *block_tag) if (it == g_step_ctx.end()) return; if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem); if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem); + if (it->second.d_matter_mem) cudaFree(it->second.d_matter_mem); g_step_ctx.erase(it); } @@ -2384,25 +2397,38 @@ static void setup_grid_params(int *ex, CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams))); } -static void upload_state_and_matter(double **state_host, - double **matter_host, - size_t all) +static void upload_state_inputs(double **state_host, size_t all) { - static_assert(BSSN_STATE_COUNT + BSSN_MATTER_COUNT == H2D_INPUT_SLOT_COUNT, - "state + matter upload must match contiguous input slots"); const size_t bytes = all * sizeof(double); for (int i = 0; i < BSSN_STATE_COUNT; ++i) { std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes); } - for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { - std::memcpy(g_buf.h_stage + (size_t)(BSSN_STATE_COUNT + i) * all, - matter_host[i], bytes); - } CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage, - (size_t)H2D_INPUT_SLOT_COUNT * bytes, + (size_t)BSSN_STATE_COUNT * bytes, cudaMemcpyHostToDevice)); } +static void upload_matter_cache(StepContext &ctx, + double **matter_host, + size_t all) +{ + const size_t bytes = all * sizeof(double); + for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { + std::memcpy(g_buf.h_stage + (size_t)i * all, matter_host[i], bytes); + } + CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage, + (size_t)BSSN_MATTER_COUNT * bytes, + cudaMemcpyHostToDevice)); + ctx.matter_ready = true; +} + +static void bind_matter_slots(const StepContext &ctx) +{ + for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { + g_buf.slot[k_matter_slots[i]] = ctx.d_matter[i]; + } +} + static void launch_rhs_pipeline(int all, double eps, int co) { const double SYM = 1.0; @@ -3254,14 +3280,17 @@ int bssn_cuda_rk4_substep(void *block_tag, const size_t bytes = all * sizeof(double); setup_grid_params(ex, X, Y, Z, Symmetry, eps, co); - upload_state_and_matter(state_host_in, matter_host, all); - StepContext &ctx = ensure_step_ctx(block_tag, all); + upload_state_inputs(state_host_in, all); if (RK4 == 0) { + upload_matter_cache(ctx, matter_host, all); CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi], (size_t)BSSN_STATE_COUNT * bytes, cudaMemcpyDeviceToDevice)); + } else if (!ctx.matter_ready) { + upload_matter_cache(ctx, matter_host, all); } + bind_matter_slots(ctx); launch_rhs_pipeline((int)all, eps, co); @@ -3286,6 +3315,9 @@ int bssn_cuda_rk4_substep(void *block_tag, } download_state_outputs(state_host_out, all); + if (RK4 == 3) { + release_step_ctx(block_tag); + } return 0; }