问题背景:
Patch::Interp_Points 在球面插值时存在严重的 MPI 负载不均衡。通过 MPI_Wtime 计时诊断发现,64 进程中 rank 27/28/35/36 四个进程承担了绝大部分插值计算(耗时为平均值的 2.6~3.3 倍),导致其余 60 个进程在 MPI 集合通信处空等,成为整体性能瓶颈。

根因分析:
这四个 rank 对应的 block 在物理空间上恰好覆盖了球面提取面(extraction sphere)的密集插值点区域,而 distribute 函数按均匀网格体积分配 block-to-rank,未考虑插值点的空间分布不均。

优化方案:
1. 新增 distribute_optimize 函数替代 distribute,使用独立的 current_block_id 计数器(与 rank 分配解耦)遍历所有 block。
2. 热点 block 拆分(splitHotspotBlock):对 block 27/28/35/36 沿 x 轴在中点处二等分,生成左右两个子 block,分别分配给相邻的两个 rank:
   - block 27 → (rank 26, rank 27)
   - block 28 → (rank 28, rank 29)
   - block 35 → (rank 34, rank 35)
   - block 36 → (rank 36, rank 37)
   子 block 严格复刻原 distribute 的 ghost zone 扩张和物理坐标计算逻辑(支持 Vertex/Cell 两种网格模式)。
3. 邻居 rank 重映射(createMappedBlock):被占用的邻居 block 需要让出原 rank,重映射到相邻空闲 rank:
   - block 26 → rank 25
   - block 29 → rank 30
   - block 34 → rank 33
   - block 37 → rank 38
   其余 block 保持 block_id == rank 的原始映射。
4. cgh.C 中 compose_cgh 通过预处理宏切换调用 distribute_optimize 或原始 distribute。
5. MPatch.C 中添加 profile 采集插桩:在 Interp_Points 重载 2 中用 MPI_Wtime 计时,MPI_Gather 汇总各 rank 耗时,识别热点 rank 并写入二进制 profile 文件。
6. 新增 interp_lb_profile.h/C:定义 profile 文件格式(magic、version、nprocs、threshold_ratio、heavy_ranks),提供 write_profile/read_profile/identify_heavy_ranks 接口。

数学等价性:
拆分和重映射仅改变 block 的几何划分与 rank 归属,不修改任何物理方程、差分格式或插值算法,计算结果严格一致。
108 lines
3.3 KiB
C
108 lines
3.3 KiB
C
#include "interp_lb_profile.h"

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <vector>
|
|
|
|
namespace InterpLBProfile {
|
|
|
|
bool write_profile(const char *filepath, int nprocs,
|
|
const double *rank_times,
|
|
const int *heavy_ranks, int num_heavy,
|
|
double threshold_ratio)
|
|
{
|
|
FILE *fp = fopen(filepath, "wb");
|
|
if (!fp) return false;
|
|
|
|
ProfileHeader hdr;
|
|
hdr.magic = MAGIC;
|
|
hdr.version = VERSION;
|
|
hdr.nprocs = nprocs;
|
|
hdr.num_heavy = num_heavy;
|
|
hdr.threshold_ratio = threshold_ratio;
|
|
|
|
fwrite(&hdr, sizeof(hdr), 1, fp);
|
|
fwrite(rank_times, sizeof(double), nprocs, fp);
|
|
fwrite(heavy_ranks, sizeof(int), num_heavy, fp);
|
|
fclose(fp);
|
|
return true;
|
|
}
|
|
|
|
bool read_profile(const char *filepath, int current_nprocs,
|
|
int *heavy_ranks, int &num_heavy,
|
|
double *rank_times, MPI_Comm comm)
|
|
{
|
|
int myrank;
|
|
MPI_Comm_rank(comm, &myrank);
|
|
|
|
int valid = 0;
|
|
ProfileHeader hdr;
|
|
memset(&hdr, 0, sizeof(hdr));
|
|
|
|
if (myrank == 0) {
|
|
FILE *fp = fopen(filepath, "rb");
|
|
if (fp) {
|
|
if (fread(&hdr, sizeof(hdr), 1, fp) == 1 &&
|
|
hdr.magic == MAGIC && hdr.version == VERSION &&
|
|
hdr.nprocs == current_nprocs)
|
|
{
|
|
if (fread(rank_times, sizeof(double), current_nprocs, fp)
|
|
== (size_t)current_nprocs &&
|
|
fread(heavy_ranks, sizeof(int), hdr.num_heavy, fp)
|
|
== (size_t)hdr.num_heavy)
|
|
{
|
|
num_heavy = hdr.num_heavy;
|
|
valid = 1;
|
|
}
|
|
} else if (fp) {
|
|
printf("[InterpLB] Profile rejected: magic=0x%X version=%u "
|
|
"nprocs=%d (current=%d)\n",
|
|
hdr.magic, hdr.version, hdr.nprocs, current_nprocs);
|
|
}
|
|
fclose(fp);
|
|
}
|
|
}
|
|
|
|
MPI_Bcast(&valid, 1, MPI_INT, 0, comm);
|
|
if (!valid) return false;
|
|
|
|
MPI_Bcast(&num_heavy, 1, MPI_INT, 0, comm);
|
|
MPI_Bcast(heavy_ranks, num_heavy, MPI_INT, 0, comm);
|
|
MPI_Bcast(rank_times, current_nprocs, MPI_DOUBLE, 0, comm);
|
|
return true;
|
|
}
|
|
|
|
/// Select the ranks whose time is strictly greater than
/// `threshold_ratio * mean(rank_times)`, slowest first.
///
/// @param rank_times       Array of `nprocs` per-rank timings.
/// @param nprocs           Number of entries in `rank_times`.
/// @param threshold_ratio  Multiplier applied to the mean to form the cutoff.
/// @param heavy_ranks      Output buffer with room for `max_heavy` ints;
///                         receives the selected rank indices.
/// @param max_heavy        Maximum number of ranks to report.
/// @return Number of entries written to `heavy_ranks`, in [0, max_heavy].
///         Returns 0 without touching `heavy_ranks` when `nprocs` or
///         `max_heavy` is not positive.
///
/// Ranks with equal times are ordered by ascending rank index, so the
/// selection is reproducible when the candidate list is truncated.
int identify_heavy_ranks(const double *rank_times, int nprocs,
                         double threshold_ratio,
                         int *heavy_ranks, int max_heavy)
{
    // No samples or no output capacity: nothing to select. This also avoids
    // dividing by zero and returning a negative count.
    if (nprocs <= 0 || max_heavy <= 0) return 0;

    double sum = 0;
    for (int i = 0; i < nprocs; i++) sum += rank_times[i];
    const double mean = sum / nprocs;
    const double threshold = threshold_ratio * mean;

    struct RankTime { int rank; double time; };

    // Collect every rank above the cutoff.
    std::vector<RankTime> candidates;
    for (int i = 0; i < nprocs; i++) {
        if (rank_times[i] > threshold)
            candidates.push_back(RankTime{i, rank_times[i]});
    }

    // Slowest first; the rank index breaks ties deterministically.
    std::sort(candidates.begin(), candidates.end(),
              [](const RankTime &a, const RankTime &b) {
                  if (a.time != b.time) return a.time > b.time;
                  return a.rank < b.rank;
              });

    const int ncand = static_cast<int>(candidates.size());
    const int count = (ncand < max_heavy) ? ncand : max_heavy;
    for (int i = 0; i < count; i++)
        heavy_ranks[i] = candidates[i].rank;

    return count;
}
|
|
|
|
} // namespace InterpLBProfile
|