Optimize BSSN EScalar GPU path baseline
This commit is contained in:
@@ -4947,6 +4947,118 @@ static void bind_state_output_slots(const std::array<double *, BSSN_STATE_COUNT>
|
||||
}
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(128, 4)
|
||||
void kern_escalar_matter_rhs(
|
||||
const double * __restrict__ chi,
|
||||
const double * __restrict__ trK,
|
||||
const double * __restrict__ dxx,
|
||||
const double * __restrict__ gxy,
|
||||
const double * __restrict__ gxz,
|
||||
const double * __restrict__ dyy,
|
||||
const double * __restrict__ gyz,
|
||||
const double * __restrict__ dzz,
|
||||
const double * __restrict__ Gamx,
|
||||
const double * __restrict__ Gamy,
|
||||
const double * __restrict__ Gamz,
|
||||
const double * __restrict__ Lap,
|
||||
const double * __restrict__ Sphi,
|
||||
const double * __restrict__ Spi,
|
||||
const double * __restrict__ chix,
|
||||
const double * __restrict__ chiy,
|
||||
const double * __restrict__ chiz,
|
||||
const double * __restrict__ Lapx,
|
||||
const double * __restrict__ Lapy,
|
||||
const double * __restrict__ Lapz,
|
||||
const double * __restrict__ Kx,
|
||||
const double * __restrict__ Ky,
|
||||
const double * __restrict__ Kz,
|
||||
const double * __restrict__ fxx,
|
||||
const double * __restrict__ fxy,
|
||||
const double * __restrict__ fxz,
|
||||
const double * __restrict__ fyy,
|
||||
const double * __restrict__ fyz,
|
||||
const double * __restrict__ fzz,
|
||||
double * __restrict__ Sphi_rhs,
|
||||
double * __restrict__ Spi_rhs,
|
||||
double * __restrict__ rho,
|
||||
double * __restrict__ Sx,
|
||||
double * __restrict__ Sy,
|
||||
double * __restrict__ Sz,
|
||||
double * __restrict__ Sxx,
|
||||
double * __restrict__ Sxy,
|
||||
double * __restrict__ Sxz,
|
||||
double * __restrict__ Syy,
|
||||
double * __restrict__ Syz,
|
||||
double * __restrict__ Szz,
|
||||
double a2)
|
||||
{
|
||||
const double TWO = 2.0;
|
||||
const double HALF = 0.5;
|
||||
|
||||
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
i < d_gp.all;
|
||||
i += blockDim.x * gridDim.x)
|
||||
{
|
||||
const double chin1 = chi[i] + 1.0;
|
||||
const double alpn1 = Lap[i] + 1.0;
|
||||
const double gxx = dxx[i] + 1.0;
|
||||
const double gyy = dyy[i] + 1.0;
|
||||
const double gzz = dzz[i] + 1.0;
|
||||
const double lgxy = gxy[i];
|
||||
const double lgxz = gxz[i];
|
||||
const double lgyz = gyz[i];
|
||||
|
||||
double det = gxx * gyy * gzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz
|
||||
- lgxz * gyy * lgxz - lgxy * lgxy * gzz - gxx * lgyz * lgyz;
|
||||
const double gupxx = (gyy * gzz - lgyz * lgyz) / det;
|
||||
const double gupxy = -(lgxy * gzz - lgyz * lgxz) / det;
|
||||
const double gupxz = (lgxy * lgyz - gyy * lgxz) / det;
|
||||
const double gupyy = (gxx * gzz - lgxz * lgxz) / det;
|
||||
const double gupyz = -(gxx * lgyz - lgxy * lgxz) / det;
|
||||
const double gupzz = (gxx * gyy - lgxy * lgxy) / det;
|
||||
|
||||
double V = 0.0;
|
||||
double dVdSphi = 0.0;
|
||||
#if (EScalar_CC == 2 || EScalar_CC == 3)
|
||||
const double sqrt_pi_over_3 = sqrt(PI_VAL / 3.0);
|
||||
const double e4 = exp(4.0 * sqrt_pi_over_3 * Sphi[i]);
|
||||
const double e8n = exp(-8.0 * sqrt_pi_over_3 * Sphi[i]);
|
||||
const double inv_a2 = 1.0 / a2;
|
||||
V = e8n * (1.0 - e4) * (1.0 - e4) / (32.0 * PI_VAL) * inv_a2;
|
||||
dVdSphi = inv_a2 / 12.0 * sqrt(3.0 / PI_VAL) * e8n * (-1.0 + e4);
|
||||
#else
|
||||
(void)a2;
|
||||
#endif
|
||||
|
||||
Sphi_rhs[i] = alpn1 * Spi[i];
|
||||
|
||||
double srhs = gupxx * fxx[i] + gupyy * fyy[i] + gupzz * fzz[i]
|
||||
+ TWO * (gupxy * fxy[i] + gupxz * fxz[i] + gupyz * fyz[i])
|
||||
- ((Gamx[i] + (gupxx * chix[i] + gupxy * chiy[i] + gupxz * chiz[i]) * HALF / chin1) * Kx[i]
|
||||
+ (Gamy[i] + (gupxy * chix[i] + gupyy * chiy[i] + gupyz * chiz[i]) * HALF / chin1) * Ky[i]
|
||||
+ (Gamz[i] + (gupxz * chix[i] + gupyz * chiy[i] + gupzz * chiz[i]) * HALF / chin1) * Kz[i]);
|
||||
srhs = srhs * alpn1
|
||||
+ (gupxx * Lapx[i] * Kx[i] + gupxy * Lapx[i] * Ky[i] + gupxz * Lapx[i] * Kz[i]
|
||||
+ gupxy * Lapy[i] * Kx[i] + gupyy * Lapy[i] * Ky[i] + gupyz * Lapy[i] * Kz[i]
|
||||
+ gupxz * Lapz[i] * Kx[i] + gupyz * Lapz[i] * Ky[i] + gupzz * Lapz[i] * Kz[i]);
|
||||
Spi_rhs[i] = srhs * chin1 + alpn1 * (trK[i] * Spi[i] - dVdSphi);
|
||||
|
||||
rho[i] = chin1 * ((gupxx * Kx[i] * Kx[i] + gupyy * Ky[i] * Ky[i] + gupzz * Kz[i] * Kz[i]) * HALF
|
||||
+ gupxy * Kx[i] * Ky[i] + gupxz * Kx[i] * Kz[i] + gupyz * Ky[i] * Kz[i])
|
||||
+ HALF * Spi[i] * Spi[i] + V;
|
||||
Sx[i] = -Spi[i] * Kx[i];
|
||||
Sy[i] = -Spi[i] * Ky[i];
|
||||
Sz[i] = -Spi[i] * Kz[i];
|
||||
const double f = (rho[i] - Spi[i] * Spi[i]) / chin1;
|
||||
Sxx[i] = Kx[i] * Kx[i] - f * gxx;
|
||||
Sxy[i] = Kx[i] * Ky[i] - f * lgxy;
|
||||
Sxz[i] = Kx[i] * Kz[i] - f * lgxz;
|
||||
Syy[i] = Ky[i] * Ky[i] - f * gyy;
|
||||
Syz[i] = Ky[i] * Kz[i] - f * lgyz;
|
||||
Szz[i] = Kz[i] * Kz[i] - f * gzz;
|
||||
}
|
||||
}
|
||||
|
||||
static bool resident_key_matches(const StepContext &ctx, int bank, double **host_key)
|
||||
{
|
||||
if (!host_key || bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT)
|
||||
@@ -6885,6 +6997,97 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_compute_escalar_matter(void *block_tag,
|
||||
int *ex, double *X, double *Y, double *Z,
|
||||
double **state_host_in,
|
||||
double *Sphi_host,
|
||||
double *Spi_host,
|
||||
double *Sphi_rhs_host,
|
||||
double *Spi_rhs_host,
|
||||
double a2,
|
||||
int &Symmetry,
|
||||
int &Lev,
|
||||
double &eps,
|
||||
int &co,
|
||||
int &apply_enforce_ga)
|
||||
{
|
||||
init_gpu_dispatch();
|
||||
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||
|
||||
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||
const size_t bytes = all * sizeof(double);
|
||||
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
|
||||
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
||||
|
||||
const int input_bank = ensure_resident_bank(ctx, state_host_in, all, true);
|
||||
mark_resident_current_bank(ctx, input_bank);
|
||||
bind_state_input_slots(ctx.d_resident[input_bank]);
|
||||
if (apply_enforce_ga) {
|
||||
kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
|
||||
g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
|
||||
g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
|
||||
g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
|
||||
set_resident_host_clean(ctx, input_bank, false);
|
||||
}
|
||||
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_S_arr], Sphi_host, bytes, cudaMemcpyHostToDevice));
|
||||
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_f_arr], Spi_host, bytes, cudaMemcpyHostToDevice));
|
||||
|
||||
double *src_fields[3] = {
|
||||
g_buf.slot[S_chi], g_buf.slot[S_Lap], g_buf.slot[S_S_arr]
|
||||
};
|
||||
double *fx_fields[3] = {
|
||||
g_buf.slot[S_chix], g_buf.slot[S_Lapx], g_buf.slot[S_Kx]
|
||||
};
|
||||
double *fy_fields[3] = {
|
||||
g_buf.slot[S_chiy], g_buf.slot[S_Lapy], g_buf.slot[S_Ky]
|
||||
};
|
||||
double *fz_fields[3] = {
|
||||
g_buf.slot[S_chiz], g_buf.slot[S_Lapz], g_buf.slot[S_Kz]
|
||||
};
|
||||
int soa_signs[9] = {
|
||||
1, 1, 1,
|
||||
1, 1, 1,
|
||||
1, 1, 1
|
||||
};
|
||||
gpu_fderivs_batch(3, src_fields, fx_fields, fy_fields, fz_fields,
|
||||
soa_signs, (int)all);
|
||||
|
||||
double *fd_src[1] = { g_buf.slot[S_S_arr] };
|
||||
double *fxx_fields[1] = { g_buf.slot[S_fxx] };
|
||||
double *fxy_fields[1] = { g_buf.slot[S_fxy] };
|
||||
double *fxz_fields[1] = { g_buf.slot[S_fxz] };
|
||||
double *fyy_fields[1] = { g_buf.slot[S_fyy] };
|
||||
double *fyz_fields[1] = { g_buf.slot[S_fyz] };
|
||||
double *fzz_fields[1] = { g_buf.slot[S_fzz] };
|
||||
gpu_fdderivs_batch(1, fd_src, fxx_fields, fxy_fields, fxz_fields,
|
||||
fyy_fields, fyz_fields, fzz_fields, soa_signs, (int)all);
|
||||
|
||||
kern_escalar_matter_rhs<<<grid(all), BLK>>>(
|
||||
g_buf.slot[S_chi], g_buf.slot[S_trK],
|
||||
g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
|
||||
g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
|
||||
g_buf.slot[S_Gamx], g_buf.slot[S_Gamy], g_buf.slot[S_Gamz],
|
||||
g_buf.slot[S_Lap],
|
||||
g_buf.slot[S_S_arr], g_buf.slot[S_f_arr],
|
||||
g_buf.slot[S_chix], g_buf.slot[S_chiy], g_buf.slot[S_chiz],
|
||||
g_buf.slot[S_Lapx], g_buf.slot[S_Lapy], g_buf.slot[S_Lapz],
|
||||
g_buf.slot[S_Kx], g_buf.slot[S_Ky], g_buf.slot[S_Kz],
|
||||
g_buf.slot[S_fxx], g_buf.slot[S_fxy], g_buf.slot[S_fxz],
|
||||
g_buf.slot[S_fyy], g_buf.slot[S_fyz], g_buf.slot[S_fzz],
|
||||
g_buf.slot[S_Gamxa], g_buf.slot[S_Gamya],
|
||||
ctx.d_matter[0], ctx.d_matter[1], ctx.d_matter[2], ctx.d_matter[3],
|
||||
ctx.d_matter[4], ctx.d_matter[5], ctx.d_matter[6],
|
||||
ctx.d_matter[7], ctx.d_matter[8], ctx.d_matter[9],
|
||||
a2);
|
||||
CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
ctx.matter_ready = true;
|
||||
(void)Lev;
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_rk4_substep(void *block_tag,
|
||||
int *ex, double *X, double *Y, double *Z,
|
||||
@@ -6968,6 +7171,8 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
||||
if (RK4 == 0) {
|
||||
if (use_zero_matter) {
|
||||
if (!ctx.matter_ready) zero_matter_cache(ctx, all);
|
||||
} else if (!matter_host) {
|
||||
if (!ctx.matter_ready) return 1;
|
||||
} else {
|
||||
upload_matter_cache(ctx, matter_host, all);
|
||||
}
|
||||
@@ -6979,7 +7184,8 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
||||
cudaMemcpyDeviceToDevice));
|
||||
} else if (!ctx.matter_ready) {
|
||||
if (use_zero_matter) zero_matter_cache(ctx, all);
|
||||
else upload_matter_cache(ctx, matter_host, all);
|
||||
else if (matter_host) upload_matter_cache(ctx, matter_host, all);
|
||||
else return 1;
|
||||
}
|
||||
bind_matter_slots(ctx);
|
||||
if (profile) {
|
||||
@@ -7107,6 +7313,20 @@ int bssn_cuda_download_resident_state_if_present(void *block_tag,
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_resident_state_matches(void *block_tag,
|
||||
double **state_host_key)
|
||||
{
|
||||
init_gpu_dispatch();
|
||||
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||
auto it = g_step_ctx.find(block_tag);
|
||||
if (it == g_step_ctx.end() || !resident_key_usable(state_host_key))
|
||||
return 0;
|
||||
StepContext &ctx = it->second;
|
||||
const int bank = find_resident_bank(ctx, state_host_key);
|
||||
return (bank >= 0 && ctx.resident_valid[bank]) ? 1 : 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_download_constraint_outputs(int *ex,
|
||||
double **constraint_host_out)
|
||||
|
||||
Reference in New Issue
Block a user