对各 rank 的运行时间进行统计。两个重载的 Interp_Points 函数分别在不同的计算中被调用，因此我对这两个重载函数分别统计了 MPI 实际计算时间。

对于第一个（PatList_Interp_Points 调用 Interp_Points），我取耗时排名前三的 rank，发现每次只有一个 rank 耗时较长：
Rank [ 52]: Calc 0.000012 s，Rank [ 20]: Calc 0.000003 s，Rank [ 35]: Calc 0.000003 s；
Rank [ 10]: Calc 0.000010 s，Rank [ 17]: Calc 0.000005 s，Rank [ 32]: Calc 0.000003 s。
而且耗时最长的 rank 不固定，一般就是 rank 10 和 rank 52；但尽管有很多，比前者时间还是少很多。

对于第二个（Surf_Wave 调用 Interp_Points），我发现前四个 rank 耗时最长，而且比较固定，就是下面四个 rank：
Rank [ 27]: Calc 0.331978 s，Rank [ 35]: Calc 0.242219 s，Rank [ 28]: Calc 0.242132 s，Rank [ 36]: Calc 0.197024 s。
因此下面的 Surf_Wave 是核心。
2026-02-24 14:33:04 +08:00
parent 82339f5282
commit 8abac8dd88
2 changed files with 96 additions and 4 deletions
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -341,6 +341,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
                          double *Shellf, int Symmetry)
 {
  // NOTE: we do not Synchnize variables here, make sure of that before calling this routine
+  double t_calc_end, t_calc_total = 0;
+  double t_calc_start = MPI_Wtime();
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
@@ -442,7 +444,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
      Bp = Bp->next;
    }
  }
-
+        t_calc_end = MPI_Wtime();
+      t_calc_total = t_calc_end - t_calc_start;
  // Replace MPI_Allreduce with per-owner MPI_Bcast:
  // Group consecutive points by owner rank and broadcast each group.
  // Since each point's data is non-zero only on the owner rank,
@@ -498,6 +501,49 @@ void Patch::Interp_Points(MyList<var> *VarList,
  }

  delete[] owner_rank;
+
+
+  
+  // 4. 汇总并输出真正干活最慢的 Top 10
+  struct RankStats { // per-rank timing record; gathered to rank 0 as raw bytes via MPI_Gather
+    int rank;
+    double calc_time; // net computation time in seconds (difference of two MPI_Wtime() readings)
+    double comm_time; // wait time; this function does not track communication, so it is always set to 0
+  };
+
+  // 创建当前进程的统计数据
+  RankStats local_stat;
+  local_stat.rank = myrank;
+  local_stat.calc_time = t_calc_total;
+  local_stat.comm_time = 0; // 此函数中未跟踪通信时间
+
+  // 为所有进程的统计数据分配内存
+  RankStats *all_stats = nullptr;
+  if (myrank == 0) {
+    all_stats = new RankStats[nprocs];
+  }
+
+  // 使用MPI_Gather收集所有进程的数据到rank 0
+  MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
+             all_stats, sizeof(RankStats), MPI_BYTE,
+             0, MPI_COMM_WORLD);
+
+  if (myrank == 0) {
+    // 按 calc_time（净计算时间）排序
+    std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
+        return a.calc_time > b.calc_time;
+    });
+
+    printf("\n--- Top 10 Ranks by ACTIVE COMPUTATION (CPU Time) ---\n");
+    int display_count = (nprocs < 10) ? nprocs : 10;
+    for (int i = 0; i < display_count; i++) {
+        printf("Rank [%4d]: Calc %.6f s\n", 
+                all_stats[i].rank, all_stats[i].calc_time);
+    }
+    
+    // 清理分配的内存
+    delete[] all_stats;
+  }
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
@@ -507,6 +553,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
  // Targeted point-to-point overload: each owner sends each point only to
  // the one rank that needs it for integration (consumer), reducing
  // communication volume by ~nprocs times compared to the Bcast version.
+  double t_calc_end, t_calc_total = 0;
+  double t_calc_start = MPI_Wtime();
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
@@ -607,7 +655,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
      Bp = Bp->next;
    }
  }
-
+      t_calc_end = MPI_Wtime();
+      t_calc_total = t_calc_end - t_calc_start;
  // --- Error check for unfound points ---
  for (int j = 0; j < NN; j++)
  {
@@ -764,6 +813,48 @@ void Patch::Interp_Points(MyList<var> *VarList,
  delete[] recv_count;
  delete[] consumer_rank;
  delete[] owner_rank;
+
+  // 4. 汇总并输出真正干活最慢的 Top 10
+  struct RankStats { // per-rank timing record; gathered to rank 0 as raw bytes via MPI_Gather
+    int rank;
+    double calc_time; // net computation time in seconds (difference of two MPI_Wtime() readings)
+    double comm_time; // wait time; this function does not track communication, so it is always set to 0
+  };
+
+  // 创建当前进程的统计数据
+  RankStats local_stat;
+  local_stat.rank = myrank;
+  local_stat.calc_time = t_calc_total;
+  local_stat.comm_time = 0; // 此函数中未跟踪通信时间
+
+  // 为所有进程的统计数据分配内存
+  RankStats *all_stats = nullptr;
+  if (myrank == 0) {
+    all_stats = new RankStats[nprocs];
+  }
+
+  // 使用MPI_Gather收集所有进程的数据到rank 0
+  MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
+             all_stats, sizeof(RankStats), MPI_BYTE,
+             0, MPI_COMM_WORLD);
+  
+  if (myrank == 0) {
+    // 按 calc_time（净计算时间）排序
+    std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
+        return a.calc_time > b.calc_time;
+    });
+/*
+    printf("\n--- Top 10 Ranks by ACTIVE COMPUTATION (CPU Time) ---\n");
+    int display_count = (nprocs < 10) ? nprocs : 10;
+    for (int i = 0; i < display_count; i++) {
+        printf("Rank [%4d]: Calc %.6f s\n", 
+                all_stats[i].rank, all_stats[i].calc_time);
+    }*/
+    
+    // 清理分配的内存
+    delete[] all_stats;
+  }
+
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
@@ -1668,4 +1759,4 @@ bool Patch::Find_Point(double *XX)
  delete[] DH;

  return true;
-}
+}