Optimize BSSN EScalar GPU path baseline

This commit is contained in:
2026-05-02 18:19:15 +08:00
parent 52beb4d153
commit 59a216ad93
13 changed files with 1366 additions and 177 deletions

View File

@@ -4947,6 +4947,118 @@ static void bind_state_output_slots(const std::array<double *, BSSN_STATE_COUNT>
}
}
__global__ __launch_bounds__(128, 4)
void kern_escalar_matter_rhs(
const double * __restrict__ chi,
const double * __restrict__ trK,
const double * __restrict__ dxx,
const double * __restrict__ gxy,
const double * __restrict__ gxz,
const double * __restrict__ dyy,
const double * __restrict__ gyz,
const double * __restrict__ dzz,
const double * __restrict__ Gamx,
const double * __restrict__ Gamy,
const double * __restrict__ Gamz,
const double * __restrict__ Lap,
const double * __restrict__ Sphi,
const double * __restrict__ Spi,
const double * __restrict__ chix,
const double * __restrict__ chiy,
const double * __restrict__ chiz,
const double * __restrict__ Lapx,
const double * __restrict__ Lapy,
const double * __restrict__ Lapz,
const double * __restrict__ Kx,
const double * __restrict__ Ky,
const double * __restrict__ Kz,
const double * __restrict__ fxx,
const double * __restrict__ fxy,
const double * __restrict__ fxz,
const double * __restrict__ fyy,
const double * __restrict__ fyz,
const double * __restrict__ fzz,
double * __restrict__ Sphi_rhs,
double * __restrict__ Spi_rhs,
double * __restrict__ rho,
double * __restrict__ Sx,
double * __restrict__ Sy,
double * __restrict__ Sz,
double * __restrict__ Sxx,
double * __restrict__ Sxy,
double * __restrict__ Sxz,
double * __restrict__ Syy,
double * __restrict__ Syz,
double * __restrict__ Szz,
double a2)
{
const double TWO = 2.0;
const double HALF = 0.5;
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < d_gp.all;
i += blockDim.x * gridDim.x)
{
const double chin1 = chi[i] + 1.0;
const double alpn1 = Lap[i] + 1.0;
const double gxx = dxx[i] + 1.0;
const double gyy = dyy[i] + 1.0;
const double gzz = dzz[i] + 1.0;
const double lgxy = gxy[i];
const double lgxz = gxz[i];
const double lgyz = gyz[i];
double det = gxx * gyy * gzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz
- lgxz * gyy * lgxz - lgxy * lgxy * gzz - gxx * lgyz * lgyz;
const double gupxx = (gyy * gzz - lgyz * lgyz) / det;
const double gupxy = -(lgxy * gzz - lgyz * lgxz) / det;
const double gupxz = (lgxy * lgyz - gyy * lgxz) / det;
const double gupyy = (gxx * gzz - lgxz * lgxz) / det;
const double gupyz = -(gxx * lgyz - lgxy * lgxz) / det;
const double gupzz = (gxx * gyy - lgxy * lgxy) / det;
double V = 0.0;
double dVdSphi = 0.0;
#if (EScalar_CC == 2 || EScalar_CC == 3)
const double sqrt_pi_over_3 = sqrt(PI_VAL / 3.0);
const double e4 = exp(4.0 * sqrt_pi_over_3 * Sphi[i]);
const double e8n = exp(-8.0 * sqrt_pi_over_3 * Sphi[i]);
const double inv_a2 = 1.0 / a2;
V = e8n * (1.0 - e4) * (1.0 - e4) / (32.0 * PI_VAL) * inv_a2;
dVdSphi = inv_a2 / 12.0 * sqrt(3.0 / PI_VAL) * e8n * (-1.0 + e4);
#else
(void)a2;
#endif
Sphi_rhs[i] = alpn1 * Spi[i];
double srhs = gupxx * fxx[i] + gupyy * fyy[i] + gupzz * fzz[i]
+ TWO * (gupxy * fxy[i] + gupxz * fxz[i] + gupyz * fyz[i])
- ((Gamx[i] + (gupxx * chix[i] + gupxy * chiy[i] + gupxz * chiz[i]) * HALF / chin1) * Kx[i]
+ (Gamy[i] + (gupxy * chix[i] + gupyy * chiy[i] + gupyz * chiz[i]) * HALF / chin1) * Ky[i]
+ (Gamz[i] + (gupxz * chix[i] + gupyz * chiy[i] + gupzz * chiz[i]) * HALF / chin1) * Kz[i]);
srhs = srhs * alpn1
+ (gupxx * Lapx[i] * Kx[i] + gupxy * Lapx[i] * Ky[i] + gupxz * Lapx[i] * Kz[i]
+ gupxy * Lapy[i] * Kx[i] + gupyy * Lapy[i] * Ky[i] + gupyz * Lapy[i] * Kz[i]
+ gupxz * Lapz[i] * Kx[i] + gupyz * Lapz[i] * Ky[i] + gupzz * Lapz[i] * Kz[i]);
Spi_rhs[i] = srhs * chin1 + alpn1 * (trK[i] * Spi[i] - dVdSphi);
rho[i] = chin1 * ((gupxx * Kx[i] * Kx[i] + gupyy * Ky[i] * Ky[i] + gupzz * Kz[i] * Kz[i]) * HALF
+ gupxy * Kx[i] * Ky[i] + gupxz * Kx[i] * Kz[i] + gupyz * Ky[i] * Kz[i])
+ HALF * Spi[i] * Spi[i] + V;
Sx[i] = -Spi[i] * Kx[i];
Sy[i] = -Spi[i] * Ky[i];
Sz[i] = -Spi[i] * Kz[i];
const double f = (rho[i] - Spi[i] * Spi[i]) / chin1;
Sxx[i] = Kx[i] * Kx[i] - f * gxx;
Sxy[i] = Kx[i] * Ky[i] - f * lgxy;
Sxz[i] = Kx[i] * Kz[i] - f * lgxz;
Syy[i] = Ky[i] * Ky[i] - f * gyy;
Syz[i] = Ky[i] * Kz[i] - f * lgyz;
Szz[i] = Kz[i] * Kz[i] - f * gzz;
}
}
static bool resident_key_matches(const StepContext &ctx, int bank, double **host_key)
{
if (!host_key || bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT)
@@ -6885,6 +6997,97 @@ int f_compute_rhs_bssn(int *ex, double &T,
return 0;
}
extern "C"
int bssn_cuda_compute_escalar_matter(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double *Sphi_host,
double *Spi_host,
double *Sphi_rhs_host,
double *Spi_rhs_host,
double a2,
int &Symmetry,
int &Lev,
double &eps,
int &co,
int &apply_enforce_ga)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const size_t bytes = all * sizeof(double);
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
StepContext &ctx = ensure_step_ctx(block_tag, all);
const int input_bank = ensure_resident_bank(ctx, state_host_in, all, true);
mark_resident_current_bank(ctx, input_bank);
bind_state_input_slots(ctx.d_resident[input_bank]);
if (apply_enforce_ga) {
kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
set_resident_host_clean(ctx, input_bank, false);
}
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_S_arr], Sphi_host, bytes, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_f_arr], Spi_host, bytes, cudaMemcpyHostToDevice));
double *src_fields[3] = {
g_buf.slot[S_chi], g_buf.slot[S_Lap], g_buf.slot[S_S_arr]
};
double *fx_fields[3] = {
g_buf.slot[S_chix], g_buf.slot[S_Lapx], g_buf.slot[S_Kx]
};
double *fy_fields[3] = {
g_buf.slot[S_chiy], g_buf.slot[S_Lapy], g_buf.slot[S_Ky]
};
double *fz_fields[3] = {
g_buf.slot[S_chiz], g_buf.slot[S_Lapz], g_buf.slot[S_Kz]
};
int soa_signs[9] = {
1, 1, 1,
1, 1, 1,
1, 1, 1
};
gpu_fderivs_batch(3, src_fields, fx_fields, fy_fields, fz_fields,
soa_signs, (int)all);
double *fd_src[1] = { g_buf.slot[S_S_arr] };
double *fxx_fields[1] = { g_buf.slot[S_fxx] };
double *fxy_fields[1] = { g_buf.slot[S_fxy] };
double *fxz_fields[1] = { g_buf.slot[S_fxz] };
double *fyy_fields[1] = { g_buf.slot[S_fyy] };
double *fyz_fields[1] = { g_buf.slot[S_fyz] };
double *fzz_fields[1] = { g_buf.slot[S_fzz] };
gpu_fdderivs_batch(1, fd_src, fxx_fields, fxy_fields, fxz_fields,
fyy_fields, fyz_fields, fzz_fields, soa_signs, (int)all);
kern_escalar_matter_rhs<<<grid(all), BLK>>>(
g_buf.slot[S_chi], g_buf.slot[S_trK],
g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
g_buf.slot[S_Gamx], g_buf.slot[S_Gamy], g_buf.slot[S_Gamz],
g_buf.slot[S_Lap],
g_buf.slot[S_S_arr], g_buf.slot[S_f_arr],
g_buf.slot[S_chix], g_buf.slot[S_chiy], g_buf.slot[S_chiz],
g_buf.slot[S_Lapx], g_buf.slot[S_Lapy], g_buf.slot[S_Lapz],
g_buf.slot[S_Kx], g_buf.slot[S_Ky], g_buf.slot[S_Kz],
g_buf.slot[S_fxx], g_buf.slot[S_fxy], g_buf.slot[S_fxz],
g_buf.slot[S_fyy], g_buf.slot[S_fyz], g_buf.slot[S_fzz],
g_buf.slot[S_Gamxa], g_buf.slot[S_Gamya],
ctx.d_matter[0], ctx.d_matter[1], ctx.d_matter[2], ctx.d_matter[3],
ctx.d_matter[4], ctx.d_matter[5], ctx.d_matter[6],
ctx.d_matter[7], ctx.d_matter[8], ctx.d_matter[9],
a2);
CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaDeviceSynchronize());
ctx.matter_ready = true;
(void)Lev;
return 0;
}
extern "C"
int bssn_cuda_rk4_substep(void *block_tag,
int *ex, double *X, double *Y, double *Z,
@@ -6968,6 +7171,8 @@ int bssn_cuda_rk4_substep(void *block_tag,
if (RK4 == 0) {
if (use_zero_matter) {
if (!ctx.matter_ready) zero_matter_cache(ctx, all);
} else if (!matter_host) {
if (!ctx.matter_ready) return 1;
} else {
upload_matter_cache(ctx, matter_host, all);
}
@@ -6979,7 +7184,8 @@ int bssn_cuda_rk4_substep(void *block_tag,
cudaMemcpyDeviceToDevice));
} else if (!ctx.matter_ready) {
if (use_zero_matter) zero_matter_cache(ctx, all);
else upload_matter_cache(ctx, matter_host, all);
else if (matter_host) upload_matter_cache(ctx, matter_host, all);
else return 1;
}
bind_matter_slots(ctx);
if (profile) {
@@ -7107,6 +7313,20 @@ int bssn_cuda_download_resident_state_if_present(void *block_tag,
return 0;
}
extern "C"
int bssn_cuda_resident_state_matches(void *block_tag,
double **state_host_key)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
auto it = g_step_ctx.find(block_tag);
if (it == g_step_ctx.end() || !resident_key_usable(state_host_key))
return 0;
StepContext &ctx = it->second;
const int bank = find_resident_bank(ctx, state_host_key);
return (bank >= 0 && ctx.resident_valid[bank]) ? 1 : 0;
}
extern "C"
int bssn_cuda_download_constraint_outputs(int *ex,
double **constraint_host_out)