merge lopsided+kodis

This commit is contained in:
2026-03-02 12:12:26 +08:00
parent ad5ff03615
commit a893b4007c

View File

@@ -912,23 +912,12 @@ static void gpu_fdderivs(double *d_f,
kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz); kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
} }
/* symmetry_bd on GPU for ord=3, then launch lopsided kernel */ /* Combined ord=3 advection + KO dissipation.
static void gpu_lopsided(double *d_f, double *d_f_rhs, * When advection and KO use the same source field, symmetry packing is shared.
* If they differ (e.g. gxx advection + dxx KO), the pack buffer is refilled from the KO field before kern_kodis runs (only when eps_val > 0).
*/
static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
double *d_Sfx, double *d_Sfy, double *d_Sfz, double *d_Sfx, double *d_Sfy, double *d_Sfz,
double SoA0, double SoA1, double SoA2, int all)
{
double *fh = g_buf.d_fh3;
const size_t nx = (size_t)g_buf.prev_nx;
const size_t ny = (size_t)g_buf.prev_ny;
const size_t nz = (size_t)g_buf.prev_nz;
const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull);
kern_symbd_pack_ord3<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2);
kern_lopsided<<<grid(all), BLK>>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz);
}
/* symmetry_bd on GPU for ord=3, then launch kodis kernel */
static void gpu_kodis(double *d_f, double *d_f_rhs,
double SoA0, double SoA1, double SoA2, double SoA0, double SoA1, double SoA2,
double eps_val, int all) double eps_val, int all)
{ {
@@ -938,8 +927,15 @@ static void gpu_kodis(double *d_f, double *d_f_rhs,
const size_t nz = (size_t)g_buf.prev_nz; const size_t nz = (size_t)g_buf.prev_nz;
const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull); const size_t w_pack = (nx + 3ull) * (ny + 3ull) * (nz + 3ull);
kern_symbd_pack_ord3<<<grid(w_pack), BLK>>>(d_f, fh, SoA0, SoA1, SoA2); kern_symbd_pack_ord3<<<grid(w_pack), BLK>>>(d_f_adv, fh, SoA0, SoA1, SoA2);
kern_lopsided<<<grid(all), BLK>>>(fh, d_f_rhs, d_Sfx, d_Sfy, d_Sfz);
if (eps_val > 0.0) {
if (d_f_ko != d_f_adv) {
kern_symbd_pack_ord3<<<grid(w_pack), BLK>>>(d_f_ko, fh, SoA0, SoA1, SoA2);
}
kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val); kern_kodis<<<grid(all), BLK>>>(fh, d_f_rhs, eps_val);
}
} }
/* ================================================================== */ /* ================================================================== */
@@ -2466,62 +2462,32 @@ int f_compute_rhs_bssn(int *ex, double &T,
D(S_f_arr), D(S_S_arr)); D(S_f_arr), D(S_S_arr));
/* ============================================================ */ /* ============================================================ */
/* Phase 16: 23x lopsided (advection) */ /* Phase 16/17: advection + KO dissipation (shared ord=3 pack) */
/* ============================================================ */ /* ============================================================ */
gpu_lopsided(D(S_gxx), D(S_gxx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all); gpu_lopsided_kodis(D(S_gxx), D(S_dxx), D(S_gxx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided(D(S_Gamz), D(S_Gamz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, all); gpu_lopsided_kodis(D(S_Gamz), D(S_Gamz), D(S_Gamz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all);
gpu_lopsided(D(S_gxy), D(S_gxy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, all); gpu_lopsided_kodis(D(S_gxy), D(S_gxy), D(S_gxy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all);
gpu_lopsided(D(S_Lap), D(S_Lap_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all); gpu_lopsided_kodis(D(S_Lap), D(S_Lap), D(S_Lap_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided(D(S_gxz), D(S_gxz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, all); gpu_lopsided_kodis(D(S_gxz), D(S_gxz), D(S_gxz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all);
gpu_lopsided(D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, all); gpu_lopsided_kodis(D(S_betax), D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all);
gpu_lopsided(D(S_gyy), D(S_gyy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all); gpu_lopsided_kodis(D(S_gyy), D(S_dyy), D(S_gyy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided(D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, all); gpu_lopsided_kodis(D(S_betay), D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all);
gpu_lopsided(D(S_gyz), D(S_gyz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, all); gpu_lopsided_kodis(D(S_gyz), D(S_gyz), D(S_gyz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all);
gpu_lopsided(D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, all); gpu_lopsided_kodis(D(S_betaz), D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all);
gpu_lopsided(D(S_gzz), D(S_gzz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all); gpu_lopsided_kodis(D(S_gzz), D(S_dzz), D(S_gzz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided(D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, all); gpu_lopsided_kodis(D(S_dtSfx), D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all);
gpu_lopsided(D(S_Axx), D(S_Axx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all); gpu_lopsided_kodis(D(S_Axx), D(S_Axx), D(S_Axx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided(D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, all); gpu_lopsided_kodis(D(S_dtSfy), D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all);
gpu_lopsided(D(S_Axy), D(S_Axy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, all); gpu_lopsided_kodis(D(S_Axy), D(S_Axy), D(S_Axy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all);
gpu_lopsided(D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, all); gpu_lopsided_kodis(D(S_dtSfz), D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all);
gpu_lopsided(D(S_Axz), D(S_Axz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, all); gpu_lopsided_kodis(D(S_Axz), D(S_Axz), D(S_Axz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all);
gpu_lopsided(D(S_Ayy), D(S_Ayy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all); gpu_lopsided_kodis(D(S_Ayy), D(S_Ayy), D(S_Ayy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided(D(S_Ayz), D(S_Ayz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, all); gpu_lopsided_kodis(D(S_Ayz), D(S_Ayz), D(S_Ayz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all);
gpu_lopsided(D(S_Azz), D(S_Azz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all); gpu_lopsided_kodis(D(S_Azz), D(S_Azz), D(S_Azz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided(D(S_chi), D(S_chi_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all); gpu_lopsided_kodis(D(S_chi), D(S_chi), D(S_chi_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided(D(S_trK), D(S_trK_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, all); gpu_lopsided_kodis(D(S_trK), D(S_trK), D(S_trK_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all);
gpu_lopsided(D(S_Gamx), D(S_Gamx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, all); gpu_lopsided_kodis(D(S_Gamx), D(S_Gamx), D(S_Gamx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all);
gpu_lopsided(D(S_Gamy), D(S_Gamy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, all); gpu_lopsided_kodis(D(S_Gamy), D(S_Gamy), D(S_Gamy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all);
/* ============================================================ */
/* Phase 17: 24x KO dissipation (eps > 0) */
/* ============================================================ */
if (eps > 0) {
gpu_kodis(D(S_chi), D(S_chi_rhs), SYM,SYM,SYM, eps, all);
gpu_kodis(D(S_trK), D(S_trK_rhs), SYM,SYM,SYM, eps, all);
gpu_kodis(D(S_dxx), D(S_gxx_rhs), SYM,SYM,SYM, eps, all);
gpu_kodis(D(S_gxy), D(S_gxy_rhs), ANTI,ANTI,SYM, eps, all);
gpu_kodis(D(S_gxz), D(S_gxz_rhs), ANTI,SYM,ANTI, eps, all);
gpu_kodis(D(S_dyy), D(S_gyy_rhs), SYM,SYM,SYM, eps, all);
gpu_kodis(D(S_gyz), D(S_gyz_rhs), SYM,ANTI,ANTI, eps, all);
gpu_kodis(D(S_dzz), D(S_gzz_rhs), SYM,SYM,SYM, eps, all);
gpu_kodis(D(S_Axx), D(S_Axx_rhs), SYM,SYM,SYM, eps, all);
gpu_kodis(D(S_dtSfz), D(S_dtSfz_rhs), SYM,SYM,ANTI, eps, all);
gpu_kodis(D(S_Axy), D(S_Axy_rhs), ANTI,ANTI,SYM, eps, all);
gpu_kodis(D(S_dtSfy), D(S_dtSfy_rhs), SYM,ANTI,SYM, eps, all);
gpu_kodis(D(S_Axz), D(S_Axz_rhs), ANTI,SYM,ANTI, eps, all);
gpu_kodis(D(S_dtSfx), D(S_dtSfx_rhs), ANTI,SYM,SYM, eps, all);
gpu_kodis(D(S_Ayy), D(S_Ayy_rhs), SYM,SYM,SYM, eps, all);
gpu_kodis(D(S_betaz), D(S_betaz_rhs), SYM,SYM,ANTI, eps, all);
gpu_kodis(D(S_Ayz), D(S_Ayz_rhs), SYM,ANTI,ANTI, eps, all);
gpu_kodis(D(S_betay), D(S_betay_rhs), SYM,ANTI,SYM, eps, all);
gpu_kodis(D(S_Azz), D(S_Azz_rhs), SYM,SYM,SYM, eps, all);
gpu_kodis(D(S_betax), D(S_betax_rhs), ANTI,SYM,SYM, eps, all);
gpu_kodis(D(S_Gamx), D(S_Gamx_rhs), ANTI,SYM,SYM, eps, all);
gpu_kodis(D(S_Lap), D(S_Lap_rhs), SYM,SYM,SYM, eps, all);
gpu_kodis(D(S_Gamy), D(S_Gamy_rhs), SYM,ANTI,SYM, eps, all);
gpu_kodis(D(S_Gamz), D(S_Gamz_rhs), SYM,SYM,ANTI, eps, all);
}
/* ============================================================ */ /* ============================================================ */
/* Phase 18: Hamilton & momentum constraints (co==0) */ /* Phase 18: Hamilton & momentum constraints (co==0) */