From 8abac8dd8884804d0b1817d6652693185fc9afdc Mon Sep 17 00:00:00 2001 From: jaunatisblue Date: Tue, 24 Feb 2026 14:33:04 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AF=B9rank=E8=BF=90=E8=A1=8C=E6=97=B6?= =?UTF-8?q?=E9=97=B4=E7=BB=9F=E8=AE=A1=EF=BC=8C=E4=B8=A4=E4=B8=AA=E5=87=BD?= =?UTF-8?q?=E6=95=B0=E5=88=86=E5=88=AB=E5=9C=A8=E4=B8=8D=E5=90=8C=E7=9A=84?= =?UTF-8?q?=E8=AE=A1=E7=AE=97=E4=B8=AD=E8=A2=AB=E8=B0=83=E7=94=A8=EF=BC=8C?= =?UTF-8?q?=E5=9B=A0=E6=AD=A4=E6=88=91=E5=AF=B9=E4=B8=A4=E4=B8=AA=E9=87=8D?= =?UTF-8?q?=E8=BD=BD=E7=9A=84=E5=87=BD=E6=95=B0=E5=88=86=E5=88=AB=E8=BF=9B?= =?UTF-8?q?=E8=A1=8C=E4=BA=86mpi=E5=AE=9E=E9=99=85=E8=AE=A1=E7=AE=97?= =?UTF-8?q?=E6=97=B6=E9=97=B4=E7=9A=84=E7=BB=9F=E8=AE=A1=EF=BC=8C=E5=AF=B9?= =?UTF-8?q?=E4=BA=8E=E7=AC=AC=E4=B8=80=E4=B8=AAPatList=5FInterp=5FPoints?= =?UTF-8?q?=20=E8=B0=83=E7=94=A8=20Interp=5Fpoints=EF=BC=8C=E6=88=91?= =?UTF-8?q?=E5=8F=96=E6=8E=92=E5=90=8D=E5=89=8D=E4=B8=89=E7=9A=84rank?= =?UTF-8?q?=E6=97=B6=E9=97=B4=EF=BC=8C=E5=8F=91=E7=8E=B0=E6=AF=8F=E6=AC=A1?= =?UTF-8?q?=E5=8F=AA=E6=9C=89=E4=B8=80=E4=B8=AArank=E6=97=B6=E9=97=B4?= =?UTF-8?q?=E8=BE=83=E9=95=BF=EF=BC=8CRank=20[=20=2052]:=20Calc=200.000012?= =?UTF-8?q?=20s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rank [ 20]: Calc 0.000003 s Rank [ 35]: Calc 0.000003 s Rank [ 10]: Calc 0.000010 s Rank [ 17]: Calc 0.000005 s Rank [ 32]: Calc 0.000003 s,而且rank不固定,一般就是rank 10 和 rank 52; 但尽管有很多,比前者时间还是少很多 对于第二个Surf_Wave 调用 Interp_points,我发现前四个rank时间最长,比较固定,就是下面四个rank Rank [ 27]: Calc 0.331978 s Rank [ 35]: Calc 0.242219 s Rank [ 28]: Calc 0.242132 s Rank [ 36]: Calc 0.197024 s 因此下面surf_wave是核心 --- AMSS_NCKU_Program.py | 3 +- AMSS_NCKU_source/MPatch.C | 97 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 96 insertions(+), 4 deletions(-) diff --git a/AMSS_NCKU_Program.py b/AMSS_NCKU_Program.py index 6a7952a..4d5adfa 100755 --- a/AMSS_NCKU_Program.py +++ b/AMSS_NCKU_Program.py @@ -66,7 +66,8 @@ if os.path.exists(File_directory): ## Prompt whether to overwrite the existing directory while True: try: - inputvalue = input() + ## inputvalue = input() + inputvalue = "continue" ## If the user agrees to overwrite, proceed and remove the existing directory if ( inputvalue == "continue" ): print( " Continue the calculation !!! " ) diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C index 91ead8a..e712a74 100644 --- a/AMSS_NCKU_source/MPatch.C +++ b/AMSS_NCKU_source/MPatch.C @@ -341,6 +341,8 @@ void Patch::Interp_Points(MyList *VarList, double *Shellf, int Symmetry) { // NOTE: we do not Synchnize variables here, make sure of that before calling this routine + double t_calc_end, t_calc_total = 0; + double t_calc_start = MPI_Wtime(); int myrank, nprocs; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); @@ -442,7 +444,8 @@ void Patch::Interp_Points(MyList *VarList, Bp = Bp->next; } } - + t_calc_end = MPI_Wtime(); + t_calc_total = t_calc_end - t_calc_start; // Replace MPI_Allreduce with per-owner MPI_Bcast: // Group consecutive points by owner rank and broadcast each group. // Since each point's data is non-zero only on the owner rank, @@ -498,6 +501,49 @@ void Patch::Interp_Points(MyList *VarList, } delete[] owner_rank; + + + + // 4. 汇总并输出真正干活最慢的 Top 10 + struct RankStats { + int rank; + double calc_time; // 净计算时间 + double comm_time; // 等待时间 + }; + + // 创建当前进程的统计数据 + RankStats local_stat; + local_stat.rank = myrank; + local_stat.calc_time = t_calc_total; + local_stat.comm_time = 0; // 此函数中未跟踪通信时间 + + // 为所有进程的统计数据分配内存 + RankStats *all_stats = nullptr; + if (myrank == 0) { + all_stats = new RankStats[nprocs]; + } + + // 使用MPI_Gather收集所有进程的数据到rank 0 + MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE, + all_stats, sizeof(RankStats), MPI_BYTE, + 0, MPI_COMM_WORLD); + + if (myrank == 0) { + // 按 calc_time(净计算时间)排序 + std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) { + return a.calc_time > b.calc_time; + }); + + printf("\n--- Top 10 Ranks by ACTIVE COMPUTATION (CPU Time) ---\n"); + int display_count = (nprocs < 10) ? nprocs : 10; + for (int i = 0; i < display_count; i++) { + printf("Rank [%4d]: Calc %.6f s\n", + all_stats[i].rank, all_stats[i].calc_time); + } + + // 清理分配的内存 + delete[] all_stats; + } } void Patch::Interp_Points(MyList *VarList, int NN, double **XX, @@ -507,6 +553,8 @@ void Patch::Interp_Points(MyList *VarList, // Targeted point-to-point overload: each owner sends each point only to // the one rank that needs it for integration (consumer), reducing // communication volume by ~nprocs times compared to the Bcast version. + double t_calc_end, t_calc_total = 0; + double t_calc_start = MPI_Wtime(); int myrank, nprocs; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); @@ -607,7 +655,8 @@ void Patch::Interp_Points(MyList *VarList, Bp = Bp->next; } } - + t_calc_end = MPI_Wtime(); + t_calc_total = t_calc_end - t_calc_start; // --- Error check for unfound points --- for (int j = 0; j < NN; j++) { @@ -764,6 +813,48 @@ void Patch::Interp_Points(MyList *VarList, delete[] recv_count; delete[] consumer_rank; delete[] owner_rank; + + // 4. 汇总并输出真正干活最慢的 Top 10 + struct RankStats { + int rank; + double calc_time; // 净计算时间 + double comm_time; // 等待时间 + }; + + // 创建当前进程的统计数据 + RankStats local_stat; + local_stat.rank = myrank; + local_stat.calc_time = t_calc_total; + local_stat.comm_time = 0; // 此函数中未跟踪通信时间 + + // 为所有进程的统计数据分配内存 + RankStats *all_stats = nullptr; + if (myrank == 0) { + all_stats = new RankStats[nprocs]; + } + + // 使用MPI_Gather收集所有进程的数据到rank 0 + MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE, + all_stats, sizeof(RankStats), MPI_BYTE, + 0, MPI_COMM_WORLD); + + if (myrank == 0) { + // 按 calc_time(净计算时间)排序 + std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) { + return a.calc_time > b.calc_time; + }); +/* + printf("\n--- Top 10 Ranks by ACTIVE COMPUTATION (CPU Time) ---\n"); + int display_count = (nprocs < 10) ? nprocs : 10; + for (int i = 0; i < display_count; i++) { + printf("Rank [%4d]: Calc %.6f s\n", + all_stats[i].rank, all_stats[i].calc_time); + }*/ + + // 清理分配的内存 + delete[] all_stats; + } + } void Patch::Interp_Points(MyList *VarList, int NN, double **XX, @@ -1668,4 +1759,4 @@ bool Patch::Find_Point(double *XX) delete[] DH; return true; -} +} \ No newline at end of file