Interp_Points 负载均衡：热点 block 拆分与 rank 重映射

问题背景： Patch::Interp_Points 在球面插值时存在严重的 MPI 负载不均衡。通过 MPI_Wtime 计时诊断发现，64 进程中 rank 27/28/35/36 四个进程承担了绝大部分插值计算（耗时为平均值的 2.6~3.3 倍），导致其余 60 个进程在 MPI 集合通信处空等，成为整体性能瓶颈。根因分析：这四个 rank 对应的 block 在物理空间上恰好覆盖了球面提取面（extraction sphere）的密集插值点区域，而 distribute 函数按均匀网格体积分配 block-to-rank，未考虑插值点的空间分布不均。优化方案： 1. 新增 distribute_optimize 函数替代 distribute，使用独立的 current_block_id 计数器（与 rank 分配解耦）遍历所有 block。 2. 热点 block 拆分（splitHotspotBlock）：对 block 27/28/35/36 沿 x 轴在中点处二等分，生成左右两个子 block，分别分配给相邻的两个 rank： - block 27 → (rank 26, rank 27) - block 28 → (rank 28, rank 29) - block 35 → (rank 34, rank 35) - block 36 → (rank 36, rank 37) 子 block 严格复刻原 distribute 的 ghost zone 扩张和物理坐标计算逻辑（支持 Vertex/Cell 两种网格模式）。 3. 邻居 rank 重映射（createMappedBlock）：被占用的邻居 block 需要让出原 rank，重映射到相邻空闲 rank： - block 26 → rank 25 - block 29 → rank 30 - block 34 → rank 33 - block 37 → rank 38 其余 block 保持 block_id == rank 的原始映射。 4. cgh.C 中 compose_cgh 通过预处理宏切换调用 distribute_optimize 或原始 distribute。 5. MPatch.C 中添加 profile 采集插桩：在 Interp_Points 重载 2 中用 MPI_Wtime 计时，MPI_Gather 汇总各 rank 耗时，识别热点 rank 并写入二进制 profile 文件。 6. 新增 interp_lb_profile.h/C：定义 profile 文件格式（magic、 version、nprocs、threshold_ratio、heavy_ranks），提供 write_profile/read_profile/identify_heavy_ranks 接口。数学等价性：拆分和重映射仅改变 block 的几何划分与 rank 归属，不修改任何物理方程、差分格式或插值算法，计算结果严格一致。
2026-02-27 15:07:40 +08:00
parent 9c33e16571
commit 6b2464b80c
6 changed files with 574 additions and 1 deletions
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -462,7 +462,7 @@ MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int i
            }
          }
 #else
-          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks
+          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev);
          //	    ng->checkBlock();
          if (BlL)
            BlL->insert(ng);
@@ -500,6 +500,384 @@ MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int i

  return BlL;
 }
+
+#ifdef INTERP_LB_OPTIMIZE
+#include "interp_lb_profile_data.h"
+
+MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
+                                    bool periodic, int nodes)
+{
+#ifdef USE_GPU_DIVIDE
+  double cpu_part, gpu_part;
+  map<string, double>::iterator iter;
+  iter = parameters::dou_par.find("cpu part");
+  if (iter != parameters::dou_par.end())
+  {
+    cpu_part = iter->second;
+  }
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    char pname[50];
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+        strcpy(pname, (iter->second).c_str());
+      else { cout << "Error inputpar" << endl; exit(0); }
+    }
+    ifstream inf(pname, ifstream::in);
+    if (!inf.good() && myrank == 0)
+    { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
+    for (int i = 1; inf.good(); i++)
+    {
+      inf.getline(pline, LEN); str = pline;
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
+      else if (status == 0) continue;
+      if (sgrp == "ABE") { if (skey == "cpu part") cpu_part = atof(sval.c_str()); }
+    }
+    inf.close();
+    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
+  }
+  iter = parameters::dou_par.find("gpu part");
+  if (iter != parameters::dou_par.end())
+  {
+    gpu_part = iter->second;
+  }
+  else
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    const int LEN = 256;
+    char pline[LEN];
+    string str, sgrp, skey, sval;
+    int sind;
+    char pname[50];
+    {
+      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
+      if (iter != parameters::str_par.end())
+        strcpy(pname, (iter->second).c_str());
+      else { cout << "Error inputpar" << endl; exit(0); }
+    }
+    ifstream inf(pname, ifstream::in);
+    if (!inf.good() && myrank == 0)
+    { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
+    for (int i = 1; inf.good(); i++)
+    {
+      inf.getline(pline, LEN); str = pline;
+      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
+      if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
+      else if (status == 0) continue;
+      if (sgrp == "ABE") { if (skey == "gpu part") gpu_part = atof(sval.c_str()); }
+    }
+    inf.close();
+    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
+  }
+  if (nodes == 0) nodes = cpusize / 2;
+#else
+  if (nodes == 0) nodes = cpusize;
+#endif
+
+  if (dim != 3)
+  {
+    cout << "distrivute: now we only support 3-dimension" << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  MyList<Block> *BlL = 0;
+  int split_size, min_size, block_size = 0;
+  int min_width = 2 * Mymax(ghost_width, buffer_width);
+  int nxyz[dim], mmin_width[dim], min_shape[dim];
+
+  MyList<Patch> *PLi = PatchLIST;
+  for (int i = 0; i < dim; i++)
+    min_shape[i] = PLi->data->shape[i];
+  int lev = PLi->data->lev;
+  PLi = PLi->next;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+    for (int i = 0; i < dim; i++)
+      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
+    if (lev != PLi->data->lev)
+      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
+    PLi = PLi->next;
+  }
+
+  for (int i = 0; i < dim; i++)
+    mmin_width[i] = Mymin(min_width, min_shape[i]);
+  min_size = mmin_width[0];
+  for (int i = 1; i < dim; i++)
+    min_size = min_size * mmin_width[i];
+
+  PLi = PatchLIST;
+  while (PLi)
+  {
+    Patch *PP = PLi->data;
+    int bs = PP->shape[0];
+    for (int i = 1; i < dim; i++)
+      bs = bs * PP->shape[i];
+    block_size = block_size + bs;
+    PLi = PLi->next;
+  }
+  split_size = Mymax(min_size, block_size / nodes);
+  split_size = Mymax(1, split_size);
+
+  int n_rank = 0;
+  PLi = PatchLIST;
+  int reacpu = 0;
+  int current_block_id = 0;
+  while (PLi) {
+    Block *ng0, *ng;
+    bool first_block_in_patch = true;
+    Patch *PP = PLi->data;
+    reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape);
+
+    for (int i = 0; i < nxyz[0]; i++)
+    for (int j = 0; j < nxyz[1]; j++)
+    for (int k = 0; k < nxyz[2]; k++)
+    {
+        int ibbox_here[6], shape_here[3];
+        double bbox_here[6], dd;
+        Block *current_ng_start = nullptr;
+
+        bool is_heavy = false;
+        int r_l = -1, r_r = -1;
+        if (cpusize == INTERP_LB_NPROCS) {
+          for (int si = 0; si < INTERP_LB_NUM_HEAVY; si++) {
+            if (current_block_id == interp_lb_splits[si][0]) {
+              is_heavy = true;
+              r_l = interp_lb_splits[si][1];
+              r_r = interp_lb_splits[si][2];
+              break;
+            }
+          }
+        }
+
+        if (is_heavy)
+        {
+            int ib0 = (PP->shape[0] * i) / nxyz[0];
+            int ib3 = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
+            int jb1 = (PP->shape[1] * j) / nxyz[1];
+            int jb4 = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
+            int kb2 = (PP->shape[2] * k) / nxyz[2];
+            int kb5 = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
+
+            Block *split_first_block = nullptr;
+            Block *split_last_block = nullptr;
+            splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5,
+                              PP, r_l, r_r, ingfsi, fngfsi, periodic,
+                              split_first_block, split_last_block);
+
+            current_ng_start = split_first_block;
+            ng = split_last_block;
+        }
+        else
+        {
+            ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
+            ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
+            ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
+            ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
+            ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
+            ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
+
+            if (periodic) {
+                for(int d=0; d<3; d++) {
+                    ibbox_here[d] -= ghost_width;
+                    ibbox_here[d+3] += ghost_width;
+                }
+            } else {
+                ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
+                ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
+                ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
+                ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
+                ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
+                ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
+            }
+
+            for(int d=0; d<3; d++) shape_here[d] = ibbox_here[d+3] - ibbox_here[d] + 1;
+
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
+          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
+          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
+          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
+          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
+          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
+          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
+          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
+          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
+#else
+#ifdef Cell
+          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
+          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
+          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
+          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
+          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
+          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
+          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
+          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
+          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+            ng = createMappedBlock(BlL, dim, shape_here, bbox_here,
+                                   current_block_id, ingfsi, fngfsi, PP->lev);
+            current_ng_start = ng;
+        }
+
+        if (first_block_in_patch) {
+            ng0 = current_ng_start;
+            MyList<Block> *Bp_start = BlL;
+            while (Bp_start && Bp_start->data != ng0) Bp_start = Bp_start->next;
+            PP->blb = Bp_start;
+            first_block_in_patch = false;
+        }
+
+        current_block_id++;
+    }
+
+    {
+      MyList<Block> *Bp_end = BlL;
+      while (Bp_end && Bp_end->data != ng) Bp_end = Bp_end->next;
+      PP->ble = Bp_end;
+    }
+
+    PLi = PLi->next;
+  }
+  if (reacpu < nodes * 2 / 3)
+  {
+    int myrank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+    if (myrank == 0)
+      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
+  }
+
+  return BlL;
+}
+
+Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
+                                 int ib0_orig, int ib3_orig,
+                                 int jb1_orig, int jb4_orig,
+                                 int kb2_orig, int kb5_orig,
+                                 Patch* PP, int r_left, int r_right,
+                                 int ingfsi, int fngfsi, bool periodic,
+                                 Block* &split_first_block, Block* &split_last_block)
+{
+    int mid = (ib0_orig + ib3_orig) / 2;
+
+    int indices_L[6] = {ib0_orig, jb1_orig, kb2_orig, mid, jb4_orig, kb5_orig};
+    int indices_R[6] = {mid + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig};
+
+    auto createSubBlock = [&](int* ib_raw, int target_rank) {
+        int ib_final[6];
+        int sh_here[3];
+        double bb_here[6], dd;
+
+        if (periodic) {
+            ib_final[0] = ib_raw[0] - ghost_width;
+            ib_final[3] = ib_raw[3] + ghost_width;
+            ib_final[1] = ib_raw[1] - ghost_width;
+            ib_final[4] = ib_raw[4] + ghost_width;
+            ib_final[2] = ib_raw[2] - ghost_width;
+            ib_final[5] = ib_raw[5] + ghost_width;
+        } else {
+            ib_final[0] = Mymax(0, ib_raw[0] - ghost_width);
+            ib_final[3] = Mymin(PP->shape[0] - 1, ib_raw[3] + ghost_width);
+            ib_final[1] = Mymax(0, ib_raw[1] - ghost_width);
+            ib_final[4] = Mymin(PP->shape[1] - 1, ib_raw[4] + ghost_width);
+            ib_final[2] = Mymax(0, ib_raw[2] - ghost_width);
+            ib_final[5] = Mymin(PP->shape[2] - 1, ib_raw[5] + ghost_width);
+        }
+
+        sh_here[0] = ib_final[3] - ib_final[0] + 1;
+        sh_here[1] = ib_final[4] - ib_final[1] + 1;
+        sh_here[2] = ib_final[5] - ib_final[2] + 1;
+
+#ifdef Vertex
+        dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
+        bb_here[0] = PP->bbox[0] + ib_final[0] * dd;
+        bb_here[3] = PP->bbox[0] + ib_final[3] * dd;
+        dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
+        bb_here[1] = PP->bbox[1] + ib_final[1] * dd;
+        bb_here[4] = PP->bbox[1] + ib_final[4] * dd;
+        dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
+        bb_here[2] = PP->bbox[2] + ib_final[2] * dd;
+        bb_here[5] = PP->bbox[2] + ib_final[5] * dd;
+#else
+#ifdef Cell
+        dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
+        bb_here[0] = PP->bbox[0] + ib_final[0] * dd;
+        bb_here[3] = PP->bbox[0] + (ib_final[3] + 1) * dd;
+        dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
+        bb_here[1] = PP->bbox[1] + ib_final[1] * dd;
+        bb_here[4] = PP->bbox[1] + (ib_final[4] + 1) * dd;
+        dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
+        bb_here[2] = PP->bbox[2] + ib_final[2] * dd;
+        bb_here[5] = PP->bbox[2] + (ib_final[5] + 1) * dd;
+#endif
+#endif
+
+        Block* Bg = new Block(dim, sh_here, bb_here, target_rank, ingfsi, fngfsi, PP->lev);
+        if (BlL) BlL->insert(Bg);
+        else     BlL = new MyList<Block>(Bg);
+
+        return Bg;
+    };
+
+    split_first_block = createSubBlock(indices_L, r_left);
+    split_last_block  = createSubBlock(indices_R, r_right);
+    return split_last_block;
+}
+
+Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
+                                   int block_id, int ingfsi, int fngfsi, int lev)
+{
+    int target_rank = block_id;
+    if (INTERP_LB_NPROCS > 0) {
+      for (int ri = 0; ri < interp_lb_num_remaps; ri++) {
+        if (block_id == interp_lb_remaps[ri][0]) {
+          target_rank = interp_lb_remaps[ri][1];
+          break;
+        }
+      }
+    }
+
+    Block* ng = new Block(dim, shape, bbox, target_rank, ingfsi, fngfsi, lev);
+    if (BlL) BlL->insert(ng);
+    else     BlL = new MyList<Block>(ng);
+
+    return ng;
+}
+#else
+// When INTERP_LB_OPTIMIZE is not defined, distribute_optimize falls back to distribute
+MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
+                                    bool periodic, int nodes)
+{
+  return distribute(PatchLIST, cpusize, ingfsi, fngfsi, periodic, nodes);
+}
+Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
+                                 int ib0_orig, int ib3_orig,
+                                 int jb1_orig, int jb4_orig,
+                                 int kb2_orig, int kb5_orig,
+                                 Patch* PP, int r_left, int r_right,
+                                 int ingfsi, int fngfsi, bool periodic,
+                                 Block* &split_first_block, Block* &split_last_block)
+{ return nullptr; }
+Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
+                                   int block_id, int ingfsi, int fngfsi, int lev)
+{ return nullptr; }
+#endif
+
 #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
 MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
                                    bool periodic, int start_rank, int end_rank, int nodes)