Add zero matter handling and interpolation for resident state in CUDA BSSN

2026-04-15 00:25:53 +08:00
parent 9ab7e7c7f9
commit 8fe60ea703
3 changed files with 209 additions and 11 deletions
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -112,6 +112,182 @@ bool bssn_cuda_sync_bh_fields(Block *cg, var *forx, var *fory, var *forz, bool u
  return bssn_cuda_sync_subset(cg, 3, k_bssn_cuda_bh_state_indices, bh_fields, upload);
 }
 bool bssn_cuda_patch_contains_point(Patch *patch, const double *point)
 {
  if (!patch)
    return false;
  for (int d = 0; d < dim; d++)
  {
    const double h = patch->getdX(d);
    const double lo = patch->bbox[d] + patch->lli[d] * h;
    const double hi = patch->bbox[dim + d] - patch->uui[d] * h;
    if (point[d] < lo || point[d] > hi)
      return false;
  }
  return true;
 }
 bool bssn_cuda_point_in_block(Patch *patch, Block *block,
                              const double *point, const double *DH)
 {
  if (!patch || !block)
    return false;
  for (int d = 0; d < dim; d++)
  {
    double llb;
    double uub;
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
    llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
              ? block->bbox[d] + patch->lli[d] * DH[d]
              : block->bbox[d] + (ghost_width - 0.5) * DH[d];
    uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
              ? block->bbox[dim + d] - patch->uui[d] * DH[d]
              : block->bbox[dim + d] - (ghost_width - 0.5) * DH[d];
 #else
 #ifdef Cell
    llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
              ? block->bbox[d] + patch->lli[d] * DH[d]
              : block->bbox[d] + ghost_width * DH[d];
    uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
              ? block->bbox[dim + d] - patch->uui[d] * DH[d]
              : block->bbox[dim + d] - ghost_width * DH[d];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
    if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2)
      return false;
  }
  return true;
 }
 int bssn_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn)
 {
  if (!coords || n <= ordn)
    return 0;
  int cxi = int((x - coords[0]) / dx + 0.4) + 1;
  int start = cxi - ordn / 2;
  if (start < 0)
    start = 0;
  const int max_start = n - ordn;
  if (start > max_start)
    start = max_start;
  return start;
 }
 bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
                                        int myrank,
                                        const double *point,
                                        var *forx, var *fory, var *forz,
                                        int Symmetry,
                                        double *shellf)
 {
  const int ordn = 2 * ghost_width;
  int owner_rank = -1;
  shellf[0] = shellf[1] = shellf[2] = 0.0;
  MyList<Patch> *PL = PatL;
  while (PL)
  {
    Patch *patch = PL->data;
    if (!bssn_cuda_patch_contains_point(patch, point))
    {
      PL = PL->next;
      continue;
    }
    double DH[dim];
    for (int d = 0; d < dim; d++)
      DH[d] = patch->getdX(d);
    MyList<Block> *BP = patch->blb;
    while (BP)
    {
      Block *block = BP->data;
      if (bssn_cuda_point_in_block(patch, block, point, DH))
      {
        owner_rank = block->rank;
        if (myrank == owner_rank)
        {
          int interp_ordn = ordn;
          int interp_sym = Symmetry;
          double x = point[0];
          double y = point[1];
          double z = point[2];
          if (bssn_cuda_has_resident_state(block) &&
              block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
          {
            const int sx = ordn;
            const int sy = ordn;
            const int sz = ordn;
            const int region_all = sx * sy * sz;
            const int i0 = bssn_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
            const int j0 = bssn_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
            const int k0 = bssn_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
            double packed_fields[3 * region_all];
            var *vars[3] = {forx, fory, forz};
            for (int f = 0; f < 3; f++)
            {
              if (bssn_cuda_pack_state_region_to_host_buffer(block,
                                                             k_bssn_cuda_bh_state_indices[f],
                                                             packed_fields + f * region_all,
                                                             block->shape,
                                                             i0, j0, k0,
                                                             sx, sy, sz) != 0)
              {
                cout << "CUDA BH tile download failed" << endl;
                MPI_Abort(MPI_COMM_WORLD, 1);
              }
              int tile_shape[3] = {sx, sy, sz};
              f_global_interp(tile_shape,
                              block->X[0] + i0,
                              block->X[1] + j0,
                              block->X[2] + k0,
                              packed_fields + f * region_all,
                              shellf[f],
                              x, y, z,
                              interp_ordn,
                              vars[f]->SoA,
                              interp_sym);
            }
          }
          else
          {
            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
                            block->fgfs[forx->sgfn], shellf[0],
                            x, y, z, interp_ordn, forx->SoA, interp_sym);
            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
                            block->fgfs[fory->sgfn], shellf[1],
                            x, y, z, interp_ordn, fory->SoA, interp_sym);
            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
                            block->fgfs[forz->sgfn], shellf[2],
                            x, y, z, interp_ordn, forz->SoA, interp_sym);
          }
        }
        break;
      }
      if (BP == patch->ble)
        break;
      BP = BP->next;
    }
    if (owner_rank >= 0)
      break;
    PL = PL->next;
  }
  if (owner_rank < 0)
    return false;
  MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD);
  return true;
 }
 void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
 {
  MyList<Patch> *Pp = PatL;
@@ -3254,6 +3430,8 @@ void bssn_class::Step(int lev, int YN)
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          int apply_bam_bc = 0;
          // bssn_class does not evolve matter source fields; they remain zero-initialized.
          int use_zero_matter = 1;
          int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
          int apply_enforce_ga = 0;
 #if (AGM == 0)
@@ -3270,6 +3448,7 @@ void bssn_class::Step(int lev, int YN)
                                    propspeed, soa_flat, Pp->data->bbox,
                                    dT_lev, TRK4, iter_count, apply_bam_bc,
                                    Symmetry, lev, ndeps, pre,
                                    use_zero_matter,
                                    keep_resident_state, apply_enforce_ga, chitiny))
          {
            cout << "CUDA predictor substep failed in domain: ("
@@ -3573,12 +3752,8 @@ void bssn_class::Step(int lev, int YN)
 #if USE_CUDA_BSSN
  const bool need_analysis_state_after_predictor =
      (lev == a_lev) && (LastAnas + dT_lev >= AnasTime);
  const bool need_bh_state_after_predictor =
      (BH_num > 0) && (lev == GH->levels - 1);
  if (use_cuda_resident_sync && need_analysis_state_after_predictor)
    bssn_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
  else if (use_cuda_resident_sync && need_bh_state_after_predictor)
    bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx, Sfy, Sfz);
 #endif
 #ifdef WithShell
@@ -3687,6 +3862,8 @@ void bssn_class::Step(int lev, int YN)
              MPI_Abort(MPI_COMM_WORLD, 1);
            }
            int apply_bam_bc = 0;
            // bssn_class does not evolve matter source fields; they remain zero-initialized.
            int use_zero_matter = 1;
            int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
            int apply_enforce_ga = 0;
 #if (AGM == 0)
@@ -3705,6 +3882,7 @@ void bssn_class::Step(int lev, int YN)
                                      propspeed, soa_flat, Pp->data->bbox,
                                      dT_lev, TRK4, iter_count, apply_bam_bc,
                                      Symmetry, lev, ndeps, cor,
                                      use_zero_matter,
                                      keep_resident_state, apply_enforce_ga, chitiny))
            {
              cout << "CUDA corrector substep failed in domain: ("
@@ -4055,11 +4233,6 @@ void bssn_class::Step(int lev, int YN)
    }
 #endif
 #if USE_CUDA_BSSN
    if (use_cuda_resident_sync && BH_num > 0 && lev == GH->levels - 1 && iter_count < 3)
      bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx1, Sfy1, Sfz1);
 #endif
    // swap time level
    if (iter_count < 3)
    {
@@ -7145,6 +7318,17 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
    int lev = ilev;
 #if USE_CUDA_BSSN
    if (bssn_cuda_use_resident_sync(lev) &&
        bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf))
    {
      BH_RHS[n][0] = -shellf[0];
      BH_RHS[n][1] = -shellf[1];
      BH_RHS[n][2] = -shellf[2];
      continue;
    }
 #endif
 #if (PSTR == 0)
    while (!Parallel::PatList_Interp_Points(GH->PatL[lev], DG_List, 1, pox, shellf, Symmetry))
 #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -4665,6 +4665,13 @@ static void upload_matter_cache(StepContext &ctx,
    ctx.matter_ready = true;
 }
 static void zero_matter_cache(StepContext &ctx, size_t all)
 {
    CUDA_CHECK(cudaMemset(ctx.d_matter_mem, 0,
                          (size_t)BSSN_MATTER_COUNT * all * sizeof(double)));
    ctx.matter_ready = true;
 }
 static void bind_matter_slots(const StepContext &ctx)
 {
    for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
@@ -5630,6 +5637,7 @@ int bssn_cuda_rk4_substep(void *block_tag,
                          int &Lev,
                          double &eps,
                          int &co,
                          int &use_zero_matter,
                          int &keep_resident_state,
                          int &apply_enforce_ga,
                          double &chitiny)
@@ -5686,12 +5694,17 @@ int bssn_cuda_rk4_substep(void *block_tag,
    t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (RK4 == 0) {
-        upload_matter_cache(ctx, matter_host, all);
+        if (use_zero_matter) {
            if (!ctx.matter_ready) zero_matter_cache(ctx, all);
        } else {
            upload_matter_cache(ctx, matter_host, all);
        }
        CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
                              (size_t)BSSN_STATE_COUNT * bytes,
                              cudaMemcpyDeviceToDevice));
    } else if (!ctx.matter_ready) {
-        upload_matter_cache(ctx, matter_host, all);
+        if (use_zero_matter) zero_matter_cache(ctx, all);
        else upload_matter_cache(ctx, matter_host, all);
    }
    bind_matter_slots(ctx);
    if (profile) {
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -50,6 +50,7 @@ int bssn_cuda_rk4_substep(void *block_tag,
                          int &Lev,
                          double &eps,
                          int &co,
                          int &use_zero_matter,
                          int &keep_resident_state,
                          int &apply_enforce_ga,
                          double &chitiny);