统计各 rank 的运行时间。Interp_Points 的两个重载函数分别在不同的计算中被调用，因此我对这两个重载函数分别统计了 MPI 实际计算时间。

对于第一个（PatList_Interp_Points 调用 Interp_Points），我取耗时排名前三的 rank，发现每次只有一个 rank 耗时较长，例如：
Rank [ 52]: Calc 0.000012 s
Rank [ 20]: Calc 0.000003 s
Rank [ 35]: Calc 0.000003 s
以及：
Rank [ 10]: Calc 0.000010 s
Rank [ 17]: Calc 0.000005 s
Rank [ 32]: Calc 0.000003 s
而且耗时最长的 rank 并不固定，一般是 rank 10 或 rank 52；但尽管调用很多，其耗时仍比第二个（Surf_Wave）少很多。

对于第二个（Surf_Wave 调用 Interp_Points），我发现耗时最长的前四个 rank 比较固定，就是下面这四个：
Rank [ 27]: Calc 0.331978 s
Rank [ 35]: Calc 0.242219 s
Rank [ 28]: Calc 0.242132 s
Rank [ 36]: Calc 0.197024 s
因此下面的 Surf_Wave 才是核心。
2026-02-24 14:33:04 +08:00
parent 82339f5282
commit 8abac8dd88
2 changed files with 96 additions and 4 deletions
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -66,7 +66,8 @@ if os.path.exists(File_directory):
    ## Prompt whether to overwrite the existing directory
    while True:
        try:
-            inputvalue = input()
+            ## inputvalue = input()
            inputvalue = "continue"
            ## If the user agrees to overwrite, proceed and remove the existing directory
            if ( inputvalue == "continue" ):
                print( " Continue the calculation !!! " )
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -341,6 +341,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
                          double *Shellf, int Symmetry)
 {
  // NOTE: we do not Synchnize variables here, make sure of that before calling this routine
  double t_calc_end, t_calc_total = 0;
  double t_calc_start = MPI_Wtime();
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
@@ -442,7 +444,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
      Bp = Bp->next;
    }
  }
-
+        t_calc_end = MPI_Wtime();
      t_calc_total = t_calc_end - t_calc_start;
  // Replace MPI_Allreduce with per-owner MPI_Bcast:
  // Group consecutive points by owner rank and broadcast each group.
  // Since each point's data is non-zero only on the owner rank,
@@ -498,6 +501,49 @@ void Patch::Interp_Points(MyList<var> *VarList,
  }
  delete[] owner_rank;
  // 4. 汇总并输出真正干活最慢的 Top 10
  struct RankStats {
    int rank;
    double calc_time; // 净计算时间
    double comm_time; // 等待时间
  };
  // 创建当前进程的统计数据
  RankStats local_stat;
  local_stat.rank = myrank;
  local_stat.calc_time = t_calc_total;
  local_stat.comm_time = 0; // 此函数中未跟踪通信时间
  // 为所有进程的统计数据分配内存
  RankStats *all_stats = nullptr;
  if (myrank == 0) {
    all_stats = new RankStats[nprocs];
  }
  // 使用MPI_Gather收集所有进程的数据到rank 0
  MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
             all_stats, sizeof(RankStats), MPI_BYTE,
             0, MPI_COMM_WORLD);
  if (myrank == 0) {
    // 按 calc_time（净计算时间）排序
    std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
        return a.calc_time > b.calc_time;
    });
    printf("\n--- Top 10 Ranks by ACTIVE COMPUTATION (CPU Time) ---\n");
    int display_count = (nprocs < 10) ? nprocs : 10;
    for (int i = 0; i < display_count; i++) {
        printf("Rank [%4d]: Calc %.6f s\n", 
                all_stats[i].rank, all_stats[i].calc_time);
    }
    // 清理分配的内存
    delete[] all_stats;
  }
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
@@ -507,6 +553,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
  // Targeted point-to-point overload: each owner sends each point only to
  // the one rank that needs it for integration (consumer), reducing
  // communication volume by ~nprocs times compared to the Bcast version.
  double t_calc_end, t_calc_total = 0;
  double t_calc_start = MPI_Wtime();
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
@@ -607,7 +655,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
      Bp = Bp->next;
    }
  }
-
+      t_calc_end = MPI_Wtime();
      t_calc_total = t_calc_end - t_calc_start;
  // --- Error check for unfound points ---
  for (int j = 0; j < NN; j++)
  {
@@ -764,6 +813,48 @@ void Patch::Interp_Points(MyList<var> *VarList,
  delete[] recv_count;
  delete[] consumer_rank;
  delete[] owner_rank;
  // 4. 汇总并输出真正干活最慢的 Top 10
  struct RankStats {
    int rank;
    double calc_time; // 净计算时间
    double comm_time; // 等待时间
  };
  // 创建当前进程的统计数据
  RankStats local_stat;
  local_stat.rank = myrank;
  local_stat.calc_time = t_calc_total;
  local_stat.comm_time = 0; // 此函数中未跟踪通信时间
  // 为所有进程的统计数据分配内存
  RankStats *all_stats = nullptr;
  if (myrank == 0) {
    all_stats = new RankStats[nprocs];
  }
  // 使用MPI_Gather收集所有进程的数据到rank 0
  MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
             all_stats, sizeof(RankStats), MPI_BYTE,
             0, MPI_COMM_WORLD);
  if (myrank == 0) {
    // 按 calc_time（净计算时间）排序
    std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
        return a.calc_time > b.calc_time;
    });
 /*
    printf("\n--- Top 10 Ranks by ACTIVE COMPUTATION (CPU Time) ---\n");
    int display_count = (nprocs < 10) ? nprocs : 10;
    for (int i = 0; i < display_count; i++) {
        printf("Rank [%4d]: Calc %.6f s\n", 
                all_stats[i].rank, all_stats[i].calc_time);
    }*/
    // 清理分配的内存
    delete[] all_stats;
  }
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,