Optimize BSSN-EM 8th-order AMR transfers

2026-05-07 21:38:16 +08:00
parent dcc83bafcb
commit 1064a68d16
2 changed files with 241 additions and 0 deletions
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -1426,6 +1426,231 @@ int cuda_data_packer_device_batched(double *data,
  }
  return size_out;
 }
 bool cuda_ensure_bssn_block_resident_for_pack(Block *block,
                                              MyList<var> *vars,
                                              int state_count,
                                              std::vector<Block *> &uploaded)
 {
  if (!block)
    return false;
  if (bssn_cuda_has_resident_state(block) != 0)
    return true;
  for (size_t i = 0; i < uploaded.size(); ++i)
  {
    if (uploaded[i] == block)
      return bssn_cuda_has_resident_state(block) != 0;
  }
  double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
  if (!cuda_build_bssn_host_views(block, vars, state_count, views))
    return false;
  if (bssn_cuda_upload_resident_state_count(block, block->shape, views, state_count) != 0)
    return false;
  uploaded.push_back(block);
  return bssn_cuda_has_resident_state(block) != 0;
 }
 void cuda_host_batch_diag(const char *reason, int state_count, int type)
 {
  static int reported = 0;
  const char *env = getenv("AMSS_CUDA_HOST_BATCH_DIAG");
  if (!env || atoi(env) == 0 || reported >= 32)
    return;
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  fprintf(stderr,
          "[AMSS-CUDA-HOST-BATCH][rank %d] disabled reason=%s state_count=%d type=%d\n",
          rank, reason ? reason : "unknown", state_count, type);
  fflush(stderr);
  reported++;
 }
 int cuda_transfer_active_length_if_batched_eligible(MyList<Parallel::gridseg> *src,
                                                    MyList<Parallel::gridseg> *dst,
                                                    int rank_in,
                                                    int dir,
                                                    MyList<var> *VarLists,
                                                    int state_count,
                                                    int myrank,
                                                    std::vector<Block *> &uploaded)
 {
  if (dir != PACK && dir != UNPACK)
  {
    cuda_host_batch_diag("bad_dir", state_count, -1);
    return -1;
  }
  if (!cuda_device_segment_batch_enabled())
  {
    cuda_host_batch_diag("segment_batch_off", state_count, -1);
    return -1;
  }
  if (!cuda_device_state_count_supported(state_count))
  {
    cuda_host_batch_diag("unsupported_state_count", state_count, -1);
    return -1;
  }
  if (cuda_amr_restrict_compare_enabled())
  {
    cuda_host_batch_diag("compare_enabled", state_count, -1);
    return -1;
  }
  int total = 0;
  bool has_work = false;
  bool has_amr = false;
  while (src && dst)
  {
    const bool active =
        (dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank);
    if (active)
    {
      has_work = true;
      if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
      {
        cuda_host_batch_diag("null_segment", state_count, -1);
        return -1;
      }
      int type;
      if (src->data->Bg->lev == dst->data->Bg->lev)
        type = 1;
      else if (src->data->Bg->lev > dst->data->Bg->lev)
        type = 2;
      else
        type = 3;
      if (type == 2 || type == 3)
        has_amr = true;
      if (dir == PACK && type == 2 && !cuda_amr_restrict_batch_enabled())
      {
        cuda_host_batch_diag("restrict_batch_off", state_count, type);
        return -1;
      }
      if (dir == PACK)
      {
        if ((type == 2 || type == 3) &&
            !cuda_ensure_bssn_block_resident_for_pack(src->data->Bg, VarLists,
                                                      state_count, uploaded))
        {
          cuda_host_batch_diag("resident_upload_failed", state_count, type);
          return -1;
        }
        if (!cuda_can_direct_pack(src->data, dst->data, type))
        {
          cuda_host_batch_diag("direct_pack_ineligible", state_count, type);
          return -1;
        }
      }
      else
      {
        if (!cuda_can_direct_unpack(dst->data, type))
        {
          cuda_host_batch_diag("direct_unpack_ineligible", state_count, type);
          return -1;
        }
      }
      total += state_count * dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
    }
    src = src->next;
    dst = dst->next;
  }
  if (!has_work)
    return 0;
  if (!has_amr)
  {
    cuda_host_batch_diag("no_amr_segment", state_count, -1);
    return -1;
  }
  return total;
 }
 int cuda_data_packer_host_staged_batched(double *host_data,
                                         MyList<Parallel::gridseg> *src,
                                         MyList<Parallel::gridseg> *dst,
                                         int rank_in,
                                         int dir,
                                         MyList<var> *VarLists,
                                         MyList<var> *VarListd,
                                         int Symmetry)
 {
  if (!host_data || !cuda_amr_host_staged_enabled())
  {
    cuda_host_batch_diag(!host_data ? "null_host_data" : "host_staged_off", -1, -1);
    return -1;
  }
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  const int state_count = cuda_state_var_count(VarLists, VarListd);
  if (state_count < 0)
  {
    cuda_host_batch_diag("var_list_mismatch", state_count, -1);
    return -1;
  }
  std::vector<Block *> uploaded;
  const int total = cuda_transfer_active_length_if_batched_eligible(src, dst, rank_in,
                                                                    dir, VarLists,
                                                                    state_count, myrank,
                                                                    uploaded);
  if (total <= 0)
    return total;
  static double *stage_dev = 0;
  static int stage_cap = 0;
  if (total > stage_cap)
  {
    free_device_comm_buffer(stage_dev);
    stage_dev = alloc_device_comm_buffer(total);
    stage_cap = total;
  }
  if (dir == UNPACK)
  {
    cudaError_t h2d = cudaMemcpy(stage_dev, host_data, (size_t)total * sizeof(double),
                                 cudaMemcpyHostToDevice);
    if (h2d != cudaSuccess)
    {
      fprintf(stderr, "Parallel: CUDA host-staged batched unpack cudaMemcpy failed, err=%d\n",
              (int)h2d);
      return -1;
    }
  }
  const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
  const int packed = cuda_data_packer_device_batched(stage_dev, src, dst, rank_in, dir,
                                                     VarLists, VarListd, Symmetry);
  if (packed != total)
  {
    cuda_host_batch_diag("device_batched_failed", state_count, -1);
    return -1;
  }
  if (dir == PACK)
  {
    cudaError_t d2h = cudaMemcpy(host_data, stage_dev, (size_t)total * sizeof(double),
                                 cudaMemcpyDeviceToHost);
    if (d2h != cudaSuccess)
    {
      fprintf(stderr, "Parallel: CUDA host-staged batched pack cudaMemcpy failed, err=%d\n",
              (int)d2h);
      return -1;
    }
  }
  if (sync_profile_enabled())
  {
    const double dt = MPI_Wtime() - t0;
    if (dir == PACK)
      sync_profile_stats().direct_pack_sec += dt;
    else
      sync_profile_stats().direct_unpack_sec += dt;
  }
  return total;
 }
 #endif
 bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
@@ -5347,6 +5572,16 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
 #if USE_CUDA_BSSN
  if (data && (dir == PACK || dir == UNPACK))
  {
    const int batched = cuda_data_packer_host_staged_batched(data, src, dst, rank_in, dir,
                                                             VarLists, VarListd, Symmetry);
    if (batched >= 0)
      return batched;
  }
 #endif
  int type; /* 1 copy, 2 restrict, 3 prolong */
  if (src->data->Bg->lev == dst->data->Bg->lev)
    type = 1;
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -167,6 +167,12 @@ def _gpu_runtime_env():
            "AMSS_INTERP_GPU": "0",
            "AMSS_CUDA_AWARE_MPI": "0",
        })
    if finite_difference == "8th-order" and getattr(input_data, "Equation_Class", "") == "BSSN-EM":
        defaults.update({
            "AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
            "AMSS_CUDA_AMR_RESTRICT_BATCH": "1",
            "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "1",
        })
    if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"):
        defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1"
    if getattr(input_data, "Equation_Class", "") == "Z4C":