对各 rank 的运行时间进行统计:两个重载函数分别在不同的计算中被调用,因此我分别统计了这两个重载函数的 MPI 实际计算时间。对于第一个(PatList_Interp_Points 调用 Interp_Points),我取耗时排名前三的 rank,发现每次只有一个 rank 耗时较长:Rank [ 52]: Calc 0.000012 s

Rank [  20]: Calc 0.000003 s

Rank [  35]: Calc 0.000003 s

Rank [  10]: Calc 0.000010 s

Rank [  17]: Calc 0.000005 s

Rank [  32]: Calc 0.000003 s;而且耗时最长的 rank 并不固定,一般是 rank 10 和 rank 52;
但尽管有很多,比前者时间还是少很多
对于第二个(Surf_Wave 调用 Interp_Points),我发现耗时最长的前四个 rank 比较固定,就是下面这四个 rank:

Rank [  27]: Calc 0.331978 s

Rank [  35]: Calc 0.242219 s

Rank [  28]: Calc 0.242132 s

Rank [  36]: Calc 0.197024 s
因此下面 Surf_Wave 调用的这一部分是核心。
This commit is contained in:
jaunatisblue
2026-02-24 14:33:04 +08:00
parent 82339f5282
commit 8abac8dd88
2 changed files with 96 additions and 4 deletions

View File

@@ -341,6 +341,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
double *Shellf, int Symmetry)
{
// NOTE: we do not synchronize variables here, make sure of that before calling this routine
double t_calc_end, t_calc_total = 0;
double t_calc_start = MPI_Wtime();
int myrank, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
@@ -442,7 +444,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
Bp = Bp->next;
}
}
t_calc_end = MPI_Wtime();
t_calc_total = t_calc_end - t_calc_start;
// Replace MPI_Allreduce with per-owner MPI_Bcast:
// Group consecutive points by owner rank and broadcast each group.
// Since each point's data is non-zero only on the owner rank,
@@ -498,6 +501,49 @@ void Patch::Interp_Points(MyList<var> *VarList,
}
delete[] owner_rank;
// 4. 汇总并输出真正干活最慢的 Top 10
// Per-rank timing record. One instance is filled in on every rank and the
// records are gathered onto rank 0 as raw bytes (MPI_BYTE, sizeof(RankStats)).
struct RankStats {
int rank; // MPI rank that produced this record
double calc_time; // net computation time, in seconds (difference of two MPI_Wtime readings)
double comm_time; // wait time; set to 0 by this function, which does not track communication time
};
// 创建当前进程的统计数据
RankStats local_stat;
local_stat.rank = myrank;
local_stat.calc_time = t_calc_total;
local_stat.comm_time = 0; // 此函数中未跟踪通信时间
// 为所有进程的统计数据分配内存
RankStats *all_stats = nullptr;
if (myrank == 0) {
all_stats = new RankStats[nprocs];
}
// 使用MPI_Gather收集所有进程的数据到rank 0
MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
all_stats, sizeof(RankStats), MPI_BYTE,
0, MPI_COMM_WORLD);
if (myrank == 0) {
// 按 calc_time净计算时间排序
std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
return a.calc_time > b.calc_time;
});
printf("\n--- Top 10 Ranks by ACTIVE COMPUTATION (CPU Time) ---\n");
int display_count = (nprocs < 10) ? nprocs : 10;
for (int i = 0; i < display_count; i++) {
printf("Rank [%4d]: Calc %.6f s\n",
all_stats[i].rank, all_stats[i].calc_time);
}
// 清理分配的内存
delete[] all_stats;
}
}
void Patch::Interp_Points(MyList<var> *VarList,
int NN, double **XX,
@@ -507,6 +553,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
// Targeted point-to-point overload: each owner sends each point only to
// the one rank that needs it for integration (consumer), reducing
// communication volume by ~nprocs times compared to the Bcast version.
double t_calc_end, t_calc_total = 0;
double t_calc_start = MPI_Wtime();
int myrank, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
@@ -607,7 +655,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
Bp = Bp->next;
}
}
t_calc_end = MPI_Wtime();
t_calc_total = t_calc_end - t_calc_start;
// --- Error check for unfound points ---
for (int j = 0; j < NN; j++)
{
@@ -764,6 +813,48 @@ void Patch::Interp_Points(MyList<var> *VarList,
delete[] recv_count;
delete[] consumer_rank;
delete[] owner_rank;
// 4. 汇总并输出真正干活最慢的 Top 10
// Per-rank timing record. One instance is filled in on every rank and the
// records are gathered onto rank 0 as raw bytes (MPI_BYTE, sizeof(RankStats)).
struct RankStats {
int rank; // MPI rank that produced this record
double calc_time; // net computation time, in seconds (difference of two MPI_Wtime readings)
double comm_time; // wait time; set to 0 by this function, which does not track communication time
};
// 创建当前进程的统计数据
RankStats local_stat;
local_stat.rank = myrank;
local_stat.calc_time = t_calc_total;
local_stat.comm_time = 0; // 此函数中未跟踪通信时间
// 为所有进程的统计数据分配内存
RankStats *all_stats = nullptr;
if (myrank == 0) {
all_stats = new RankStats[nprocs];
}
// 使用MPI_Gather收集所有进程的数据到rank 0
MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
all_stats, sizeof(RankStats), MPI_BYTE,
0, MPI_COMM_WORLD);
if (myrank == 0) {
// 按 calc_time净计算时间排序
std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
return a.calc_time > b.calc_time;
});
/*
printf("\n--- Top 10 Ranks by ACTIVE COMPUTATION (CPU Time) ---\n");
int display_count = (nprocs < 10) ? nprocs : 10;
for (int i = 0; i < display_count; i++) {
printf("Rank [%4d]: Calc %.6f s\n",
all_stats[i].rank, all_stats[i].calc_time);
}*/
// 清理分配的内存
delete[] all_stats;
}
}
void Patch::Interp_Points(MyList<var> *VarList,
int NN, double **XX,
@@ -1668,4 +1759,4 @@ bool Patch::Find_Point(double *XX)
delete[] DH;
return true;
}
}