Add direct CUDA resident-state sync path and profiling hooks

This commit is contained in:
2026-04-13 00:57:05 +08:00
parent 7f2a391dd2
commit 636e35bfd8
5 changed files with 1188 additions and 527 deletions

View File

@@ -6,6 +6,7 @@
*/
#include <array>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cmath>
@@ -62,6 +63,72 @@ static void init_gpu_dispatch() {
}
g_dispatch.inited = true;
}
struct CudaProfileStats {
long long calls;
double total_ms;
double state_ms;
double matter_ms;
double rhs_ms;
double bc_ms;
double finalize_ms;
double output_ms;
};
static CudaProfileStats &cuda_profile_stats() {
static CudaProfileStats stats = {0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
return stats;
}
static bool cuda_profile_enabled() {
static int enabled = -1;
if (enabled < 0) {
const char *env = getenv("AMSS_PROFILE_CUDA");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
static int cuda_profile_every() {
static int every = -1;
if (every < 0) {
const char *env = getenv("AMSS_PROFILE_CUDA_EVERY");
every = (env && atoi(env) > 0) ? atoi(env) : 100;
}
return every;
}
static double cuda_profile_now_ms() {
using clock = std::chrono::steady_clock;
return std::chrono::duration<double, std::milli>(
clock::now().time_since_epoch()).count();
}
static void cuda_profile_sync() {
cudaError_t err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error %s:%d: %s\n",
__FILE__, __LINE__, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
static void cuda_profile_maybe_log() {
if (!cuda_profile_enabled()) return;
CudaProfileStats &stats = cuda_profile_stats();
if (stats.calls <= 0 || stats.calls % cuda_profile_every() != 0) return;
fprintf(stderr,
"[AMSS-CUDA][rank %d][dev %d] calls=%lld avg_total=%.3f ms avg_state=%.3f ms avg_matter=%.3f ms avg_rhs=%.3f ms avg_bc=%.3f ms avg_finalize=%.3f ms avg_output=%.3f ms\n",
g_dispatch.my_rank, g_dispatch.my_device, stats.calls,
stats.total_ms / (double)stats.calls,
stats.state_ms / (double)stats.calls,
stats.matter_ms / (double)stats.calls,
stats.rhs_ms / (double)stats.calls,
stats.bc_ms / (double)stats.calls,
stats.finalize_ms / (double)stats.calls,
stats.output_ms / (double)stats.calls);
fflush(stderr);
}
/* ------------------------------------------------------------------ */
/* Error checking */
@@ -248,12 +315,17 @@ static const int k_matter_slots[BSSN_MATTER_COUNT] = {
struct StepContext {
double *d_state0_mem;
double *d_accum_mem;
double *d_state_curr_mem;
double *d_state_next_mem;
double *d_matter_mem;
std::array<double *, BSSN_STATE_COUNT> d_state0;
std::array<double *, BSSN_STATE_COUNT> d_accum;
std::array<double *, BSSN_STATE_COUNT> d_state_curr;
std::array<double *, BSSN_STATE_COUNT> d_state_next;
std::array<double *, BSSN_MATTER_COUNT> d_matter;
size_t cap_all;
bool matter_ready;
bool state_ready;
};
static std::unordered_map<void *, StepContext> g_step_ctx;
@@ -321,19 +393,32 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
cudaFree(ctx.d_accum_mem);
ctx.d_accum_mem = nullptr;
}
if (ctx.d_state_curr_mem) {
cudaFree(ctx.d_state_curr_mem);
ctx.d_state_curr_mem = nullptr;
}
if (ctx.d_state_next_mem) {
cudaFree(ctx.d_state_next_mem);
ctx.d_state_next_mem = nullptr;
}
if (ctx.d_matter_mem) {
cudaFree(ctx.d_matter_mem);
ctx.d_matter_mem = nullptr;
}
CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
CUDA_CHECK(cudaMalloc(&ctx.d_state_curr_mem, BSSN_STATE_COUNT * all * sizeof(double)));
CUDA_CHECK(cudaMalloc(&ctx.d_state_next_mem, BSSN_STATE_COUNT * all * sizeof(double)));
CUDA_CHECK(cudaMalloc(&ctx.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
ctx.cap_all = all;
ctx.matter_ready = false;
ctx.state_ready = false;
}
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
ctx.d_state_curr[i] = ctx.d_state_curr_mem + (size_t)i * all;
ctx.d_state_next[i] = ctx.d_state_next_mem + (size_t)i * all;
}
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all;
@@ -347,6 +432,8 @@ static void release_step_ctx(void *block_tag)
if (it == g_step_ctx.end()) return;
if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem);
if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem);
if (it->second.d_state_curr_mem) cudaFree(it->second.d_state_curr_mem);
if (it->second.d_state_next_mem) cudaFree(it->second.d_state_next_mem);
if (it->second.d_matter_mem) cudaFree(it->second.d_matter_mem);
g_step_ctx.erase(it);
}
@@ -1050,6 +1137,76 @@ __global__ void kern_rk4_finalize(const double * __restrict__ f0,
}
}
__global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
double * __restrict__ gxy,
double * __restrict__ gxz,
double * __restrict__ dyy,
double * __restrict__ gyz,
double * __restrict__ dzz,
double * __restrict__ Axx,
double * __restrict__ Axy,
double * __restrict__ Axz,
double * __restrict__ Ayy,
double * __restrict__ Ayz,
double * __restrict__ Azz)
{
constexpr double F1O3 = 1.0 / 3.0;
constexpr double ONE = 1.0;
constexpr double TWO = 2.0;
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < d_gp.all;
i += blockDim.x * gridDim.x)
{
double lgxx = dxx[i] + ONE;
double lgyy = dyy[i] + ONE;
double lgzz = dzz[i] + ONE;
double lgxy = gxy[i];
double lgxz = gxz[i];
double lgyz = gyz[i];
double lscale = lgxx * lgyy * lgzz
+ lgxy * lgyz * lgxz
+ lgxz * lgxy * lgyz
- lgxz * lgyy * lgxz
- lgxy * lgxy * lgzz
- lgxx * lgyz * lgyz;
lscale = ONE / cbrt(lscale);
lgxx *= lscale;
lgxy *= lscale;
lgxz *= lscale;
lgyy *= lscale;
lgyz *= lscale;
lgzz *= lscale;
dxx[i] = lgxx - ONE;
gxy[i] = lgxy;
gxz[i] = lgxz;
dyy[i] = lgyy - ONE;
gyz[i] = lgyz;
dzz[i] = lgzz - ONE;
const double lgupxx = (lgyy * lgzz - lgyz * lgyz);
const double lgupxy = - (lgxy * lgzz - lgyz * lgxz);
const double lgupxz = (lgxy * lgyz - lgyy * lgxz);
const double lgupyy = (lgxx * lgzz - lgxz * lgxz);
const double lgupyz = - (lgxx * lgyz - lgxy * lgxz);
const double lgupzz = (lgxx * lgyy - lgxy * lgxy);
const double ltrA = lgupxx * Axx[i] + lgupyy * Ayy[i] + lgupzz * Azz[i]
+ TWO * (lgupxy * Axy[i] + lgupxz * Axz[i] + lgupyz * Ayz[i]);
Axx[i] -= F1O3 * lgxx * ltrA;
Axy[i] -= F1O3 * lgxy * ltrA;
Axz[i] -= F1O3 * lgxz * ltrA;
Ayy[i] -= F1O3 * lgyy * ltrA;
Ayz[i] -= F1O3 * lgyz * ltrA;
Azz[i] -= F1O3 * lgzz * ltrA;
}
}
__global__ void kern_lowerboundset_cuda(double * __restrict__ chi, double tinny)
{
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -2429,6 +2586,20 @@ static void bind_matter_slots(const StepContext &ctx)
}
}
static void bind_state_input_slots(const std::array<double *, BSSN_STATE_COUNT> &state)
{
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
g_buf.slot[k_state_input_slots[i]] = state[i];
}
}
static void bind_state_output_slots(const std::array<double *, BSSN_STATE_COUNT> &state)
{
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
g_buf.slot[k_state_rhs_slots[i]] = state[i];
}
}
static void launch_rhs_pipeline(int all, double eps, int co)
{
const double SYM = 1.0;
@@ -2763,6 +2934,87 @@ static void download_state_outputs(double **state_host_out, size_t all)
}
}
static void copy_state_region_cuda(void *block_tag,
int state_index,
double *host_state,
const int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz,
cudaMemcpyKind kind)
{
if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return;
if (sx <= 0 || sy <= 0 || sz <= 0) return;
const size_t pitch = (size_t)ex[0] * sizeof(double);
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
cudaMemcpy3DParms p = {};
p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz);
p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
if (kind == cudaMemcpyDeviceToHost) {
p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]);
p.dstPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]);
} else {
p.srcPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]);
p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]);
}
CUDA_CHECK(cudaMemcpy3D(&p));
}
static void copy_state_region_packed_cuda(void *block_tag,
int state_index,
double *host_buffer,
const int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz,
cudaMemcpyKind kind)
{
if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return;
if (sx <= 0 || sy <= 0 || sz <= 0) return;
const size_t src_pitch = (size_t)ex[0] * sizeof(double);
const size_t dst_pitch = (size_t)sx * sizeof(double);
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
cudaMemcpy3DParms p = {};
p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz);
if (kind == cudaMemcpyDeviceToHost) {
p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]);
p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
p.dstPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy);
p.dstPos = make_cudaPos(0, 0, 0);
} else {
p.srcPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy);
p.srcPos = make_cudaPos(0, 0, 0);
p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]);
p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
}
CUDA_CHECK(cudaMemcpy3D(&p));
}
static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
{
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const size_t bytes = all * sizeof(double);
StepContext &ctx = ensure_step_ctx(block_tag, all);
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem,
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToHost));
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
}
}
static bool has_resident_state(void *block_tag)
{
auto it = g_step_ctx.find(block_tag);
return it != g_step_ctx.end() && it->second.state_ready;
}
/* ================================================================== */
/* Main host function — drop-in replacement for bssn_rhs_c.C */
/* ================================================================== */
@@ -3266,22 +3518,53 @@ int bssn_cuda_rk4_substep(void *block_tag,
int &Symmetry,
int &Lev,
double &eps,
int &co)
int &co,
int &keep_resident_state,
int &apply_enforce_ga,
double &chitiny)
{
(void)T;
(void)Lev;
(void)state_host_out;
if (RK4 < 0 || RK4 > 3) return 1;
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
const bool profile = cuda_profile_enabled();
const double t_total0 = profile ? cuda_profile_now_ms() : 0.0;
double state_ms = 0.0;
double matter_ms = 0.0;
double rhs_ms = 0.0;
double bc_ms = 0.0;
double finalize_ms = 0.0;
double output_ms = 0.0;
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const size_t bytes = all * sizeof(double);
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
StepContext &ctx = ensure_step_ctx(block_tag, all);
upload_state_inputs(state_host_in, all);
const bool use_resident_state = (keep_resident_state != 0);
if (use_resident_state) {
bind_state_input_slots(ctx.d_state_curr);
bind_state_output_slots(ctx.d_state_next);
}
double t0 = profile ? cuda_profile_now_ms() : 0.0;
if (!use_resident_state || RK4 == 0 || !ctx.state_ready) {
upload_state_inputs(state_host_in, all);
}
if (apply_enforce_ga) {
kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
}
if (profile) {
cuda_profile_sync();
state_ms += cuda_profile_now_ms() - t0;
}
t0 = profile ? cuda_profile_now_ms() : 0.0;
if (RK4 == 0) {
upload_matter_cache(ctx, matter_host, all);
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
@@ -3291,9 +3574,19 @@ int bssn_cuda_rk4_substep(void *block_tag,
upload_matter_cache(ctx, matter_host, all);
}
bind_matter_slots(ctx);
if (profile) {
cuda_profile_sync();
matter_ms += cuda_profile_now_ms() - t0;
}
t0 = profile ? cuda_profile_now_ms() : 0.0;
launch_rhs_pipeline((int)all, eps, co);
if (profile) {
cuda_profile_sync();
rhs_ms += cuda_profile_now_ms() - t0;
}
t0 = profile ? cuda_profile_now_ms() : 0.0;
if (apply_bam_bc) {
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
gpu_sommerfeld_routbam(g_buf.slot[k_state_input_slots[i]],
@@ -3305,7 +3598,12 @@ int bssn_cuda_rk4_substep(void *block_tag,
X, Y, Z, bbox, Symmetry);
}
}
if (profile) {
cuda_profile_sync();
bc_ms += cuda_profile_now_ms() - t0;
}
t0 = profile ? cuda_profile_now_ms() : 0.0;
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_state0[i],
g_buf.slot[k_state_rhs_slots[i]],
@@ -3314,13 +3612,119 @@ int bssn_cuda_rk4_substep(void *block_tag,
RK4);
}
download_state_outputs(state_host_out, all);
if (RK4 == 3) {
kern_lowerboundset_cuda<<<grid(all), BLK>>>(g_buf.slot[S_chi_rhs], chitiny);
if (profile) {
cuda_profile_sync();
finalize_ms += cuda_profile_now_ms() - t0;
}
t0 = profile ? cuda_profile_now_ms() : 0.0;
if (use_resident_state) {
std::swap(ctx.d_state_curr_mem, ctx.d_state_next_mem);
ctx.d_state_curr.swap(ctx.d_state_next);
ctx.state_ready = true;
} else {
download_state_outputs(state_host_out, all);
}
if (RK4 == 3 && !use_resident_state) {
release_step_ctx(block_tag);
}
if (profile) {
cuda_profile_sync();
output_ms += cuda_profile_now_ms() - t0;
CudaProfileStats &stats = cuda_profile_stats();
stats.calls++;
stats.total_ms += cuda_profile_now_ms() - t_total0;
stats.state_ms += state_ms;
stats.matter_ms += matter_ms;
stats.rhs_ms += rhs_ms;
stats.bc_ms += bc_ms;
stats.finalize_ms += finalize_ms;
stats.output_ms += output_ms;
cuda_profile_maybe_log();
}
return 0;
}
extern "C"
int bssn_cuda_copy_state_region_to_host(void *block_tag,
int state_index,
double *host_state,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
copy_state_region_cuda(block_tag, state_index, host_state, ex,
i0, j0, k0, sx, sy, sz, cudaMemcpyDeviceToHost);
return 0;
}
extern "C"
int bssn_cuda_copy_state_region_from_host(void *block_tag,
int state_index,
double *host_state,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
copy_state_region_cuda(block_tag, state_index, host_state, ex,
i0, j0, k0, sx, sy, sz, cudaMemcpyHostToDevice);
return 0;
}
extern "C"
int bssn_cuda_download_resident_state(void *block_tag,
int *ex,
double **state_host_out)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
download_resident_state(block_tag, ex, state_host_out);
return 0;
}
extern "C"
int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
int state_index,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
i0, j0, k0, sx, sy, sz, cudaMemcpyDeviceToHost);
return 0;
}
extern "C"
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
int state_index,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
i0, j0, k0, sx, sy, sz, cudaMemcpyHostToDevice);
return 0;
}
extern "C"
int bssn_cuda_has_resident_state(void *block_tag)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
return has_resident_state(block_tag) ? 1 : 0;
}
extern "C"
void bssn_cuda_release_step_ctx(void *block_tag)
{