diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C index 91ead8a..563bfcc 100644 --- a/AMSS_NCKU_source/MPatch.C +++ b/AMSS_NCKU_source/MPatch.C @@ -13,6 +13,9 @@ using namespace std; #include "MPatch.h" #include "Parallel.h" #include "fmisc.h" +#ifdef INTERP_LB_PROFILE +#include "interp_lb_profile.h" +#endif Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi) { @@ -507,6 +510,9 @@ void Patch::Interp_Points(MyList *VarList, // Targeted point-to-point overload: each owner sends each point only to // the one rank that needs it for integration (consumer), reducing // communication volume by ~nprocs times compared to the Bcast version. +#ifdef INTERP_LB_PROFILE + double t_interp_start = MPI_Wtime(); +#endif int myrank, nprocs; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); @@ -608,6 +614,11 @@ void Patch::Interp_Points(MyList *VarList, } } +#ifdef INTERP_LB_PROFILE + double t_interp_end = MPI_Wtime(); + double t_interp_local = t_interp_end - t_interp_start; +#endif + // --- Error check for unfound points --- for (int j = 0; j < NN; j++) { @@ -764,6 +775,31 @@ void Patch::Interp_Points(MyList *VarList, delete[] recv_count; delete[] consumer_rank; delete[] owner_rank; + +#ifdef INTERP_LB_PROFILE + { + static bool profile_written = false; + if (!profile_written) { + double *all_times = nullptr; + if (myrank == 0) all_times = new double[nprocs]; + MPI_Gather(&t_interp_local, 1, MPI_DOUBLE, + all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + if (myrank == 0) { + int heavy[64]; + int nh = InterpLBProfile::identify_heavy_ranks( + all_times, nprocs, 2.5, heavy, 64); + InterpLBProfile::write_profile( + "interp_lb_profile.bin", nprocs, + all_times, heavy, nh, 2.5); + printf("[InterpLB] Profile written: %d heavy ranks\n", nh); + for (int i = 0; i < nh; i++) + printf(" Heavy rank %d: %.6f s\n", heavy[i], all_times[heavy[i]]); + delete[] all_times; + } + profile_written = true; + } + } +#endif } void Patch::Interp_Points(MyList *VarList, int NN, double **XX, diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index a9fb3cd..bd591c4 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -462,7 +462,7 @@ MyList *Parallel::distribute(MyList *PatchLIST, int cpusize, int i } } #else - ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks + ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // ng->checkBlock(); if (BlL) BlL->insert(ng); @@ -500,6 +500,384 @@ MyList *Parallel::distribute(MyList *PatchLIST, int cpusize, int i return BlL; } + +#ifdef INTERP_LB_OPTIMIZE +#include "interp_lb_profile_data.h" + +MyList *Parallel::distribute_optimize(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, + bool periodic, int nodes) +{ +#ifdef USE_GPU_DIVIDE + double cpu_part, gpu_part; + map::iterator iter; + iter = parameters::dou_par.find("cpu part"); + if (iter != parameters::dou_par.end()) + { + cpu_part = iter->second; + } + else + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + strcpy(pname, (iter->second).c_str()); + else { cout << "Error inputpar" << endl; exit(0); } + } + ifstream inf(pname, ifstream::in); + if (!inf.good() && myrank == 0) + { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); } + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); str = pline; + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); } + else if (status == 0) continue; + if (sgrp == "ABE") { if (skey == "cpu part") cpu_part = atof(sval.c_str()); } + } + inf.close(); + parameters::dou_par.insert(map::value_type("cpu part", cpu_part)); + } + iter = parameters::dou_par.find("gpu part"); + if (iter != parameters::dou_par.end()) + { + gpu_part = iter->second; + } + else + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + strcpy(pname, (iter->second).c_str()); + else { cout << "Error inputpar" << endl; exit(0); } + } + ifstream inf(pname, ifstream::in); + if (!inf.good() && myrank == 0) + { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); } + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); str = pline; + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); } + else if (status == 0) continue; + if (sgrp == "ABE") { if (skey == "gpu part") gpu_part = atof(sval.c_str()); } + } + inf.close(); + parameters::dou_par.insert(map::value_type("gpu part", gpu_part)); + } + if (nodes == 0) nodes = cpusize / 2; +#else + if (nodes == 0) nodes = cpusize; +#endif + + if (dim != 3) + { + cout << "distrivute: now we only support 3-dimension" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + MyList *BlL = 0; + int split_size, min_size, block_size = 0; + int min_width = 2 * Mymax(ghost_width, buffer_width); + int nxyz[dim], mmin_width[dim], min_shape[dim]; + + MyList *PLi = PatchLIST; + for (int i = 0; i < dim; i++) + min_shape[i] = PLi->data->shape[i]; + int lev = PLi->data->lev; + PLi = PLi->next; + while (PLi) + { + Patch *PP = PLi->data; + for (int i = 0; i < dim; i++) + min_shape[i] = Mymin(min_shape[i], PP->shape[i]); + if (lev != PLi->data->lev) + cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl; + PLi = PLi->next; + } + + for (int i = 0; i < dim; i++) + mmin_width[i] = Mymin(min_width, min_shape[i]); + min_size = mmin_width[0]; + for (int i = 1; i < dim; i++) + min_size = min_size * mmin_width[i]; + + PLi = PatchLIST; + while (PLi) + { + Patch *PP = PLi->data; + int bs = PP->shape[0]; + for (int i = 1; i < dim; i++) + bs = bs * PP->shape[i]; + block_size = block_size + bs; + PLi = PLi->next; + } + split_size = Mymax(min_size, block_size / nodes); + split_size = Mymax(1, split_size); + + int n_rank = 0; + PLi = PatchLIST; + int reacpu = 0; + int current_block_id = 0; + while (PLi) { + Block *ng0, *ng; + bool first_block_in_patch = true; + Patch *PP = PLi->data; + reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape); + + for (int i = 0; i < nxyz[0]; i++) + for (int j = 0; j < nxyz[1]; j++) + for (int k = 0; k < nxyz[2]; k++) + { + int ibbox_here[6], shape_here[3]; + double bbox_here[6], dd; + Block *current_ng_start = nullptr; + + bool is_heavy = false; + int r_l = -1, r_r = -1; + if (cpusize == INTERP_LB_NPROCS) { + for (int si = 0; si < INTERP_LB_NUM_HEAVY; si++) { + if (current_block_id == interp_lb_splits[si][0]) { + is_heavy = true; + r_l = interp_lb_splits[si][1]; + r_r = interp_lb_splits[si][2]; + break; + } + } + } + + if (is_heavy) + { + int ib0 = (PP->shape[0] * i) / nxyz[0]; + int ib3 = (PP->shape[0] * (i + 1)) / nxyz[0] - 1; + int jb1 = (PP->shape[1] * j) / nxyz[1]; + int jb4 = (PP->shape[1] * (j + 1)) / nxyz[1] - 1; + int kb2 = (PP->shape[2] * k) / nxyz[2]; + int kb5 = (PP->shape[2] * (k + 1)) / nxyz[2] - 1; + + Block *split_first_block = nullptr; + Block *split_last_block = nullptr; + splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5, + PP, r_l, r_r, ingfsi, fngfsi, periodic, + split_first_block, split_last_block); + + current_ng_start = split_first_block; + ng = split_last_block; + } + else + { + ibbox_here[0] = (PP->shape[0] * i) / nxyz[0]; + ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1; + ibbox_here[1] = (PP->shape[1] * j) / nxyz[1]; + ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1; + ibbox_here[2] = (PP->shape[2] * k) / nxyz[2]; + ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1; + + if (periodic) { + for(int d=0; d<3; d++) { + ibbox_here[d] -= ghost_width; + ibbox_here[d+3] += ghost_width; + } + } else { + ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width); + ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width); + ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width); + ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width); + ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width); + ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width); + } + + for(int d=0; d<3; d++) shape_here[d] = ibbox_here[d+3] - ibbox_here[d] + 1; + +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1); + bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd; + bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd; + dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1); + bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd; + bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd; + dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1); + bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd; + bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd; +#else +#ifdef Cell + dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0]; + bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd; + bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd; + dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1]; + bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd; + bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd; + dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2]; + bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd; + bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd; +#else +#error Not define Vertex nor Cell +#endif +#endif + ng = createMappedBlock(BlL, dim, shape_here, bbox_here, + current_block_id, ingfsi, fngfsi, PP->lev); + current_ng_start = ng; + } + + if (first_block_in_patch) { + ng0 = current_ng_start; + MyList *Bp_start = BlL; + while (Bp_start && Bp_start->data != ng0) Bp_start = Bp_start->next; + PP->blb = Bp_start; + first_block_in_patch = false; + } + + current_block_id++; + } + + { + MyList *Bp_end = BlL; + while (Bp_end && Bp_end->data != ng) Bp_end = Bp_end->next; + PP->ble = Bp_end; + } + + PLi = PLi->next; + } + if (reacpu < nodes * 2 / 3) + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (myrank == 0) + cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl; + } + + return BlL; +} + +Block* Parallel::splitHotspotBlock(MyList* &BlL, int _dim, + int ib0_orig, int ib3_orig, + int jb1_orig, int jb4_orig, + int kb2_orig, int kb5_orig, + Patch* PP, int r_left, int r_right, + int ingfsi, int fngfsi, bool periodic, + Block* &split_first_block, Block* &split_last_block) +{ + int mid = (ib0_orig + ib3_orig) / 2; + + int indices_L[6] = {ib0_orig, jb1_orig, kb2_orig, mid, jb4_orig, kb5_orig}; + int indices_R[6] = {mid + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig}; + + auto createSubBlock = [&](int* ib_raw, int target_rank) { + int ib_final[6]; + int sh_here[3]; + double bb_here[6], dd; + + if (periodic) { + ib_final[0] = ib_raw[0] - ghost_width; + ib_final[3] = ib_raw[3] + ghost_width; + ib_final[1] = ib_raw[1] - ghost_width; + ib_final[4] = ib_raw[4] + ghost_width; + ib_final[2] = ib_raw[2] - ghost_width; + ib_final[5] = ib_raw[5] + ghost_width; + } else { + ib_final[0] = Mymax(0, ib_raw[0] - ghost_width); + ib_final[3] = Mymin(PP->shape[0] - 1, ib_raw[3] + ghost_width); + ib_final[1] = Mymax(0, ib_raw[1] - ghost_width); + ib_final[4] = Mymin(PP->shape[1] - 1, ib_raw[4] + ghost_width); + ib_final[2] = Mymax(0, ib_raw[2] - ghost_width); + ib_final[5] = Mymin(PP->shape[2] - 1, ib_raw[5] + ghost_width); + } + + sh_here[0] = ib_final[3] - ib_final[0] + 1; + sh_here[1] = ib_final[4] - ib_final[1] + 1; + sh_here[2] = ib_final[5] - ib_final[2] + 1; + +#ifdef Vertex + dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1); + bb_here[0] = PP->bbox[0] + ib_final[0] * dd; + bb_here[3] = PP->bbox[0] + ib_final[3] * dd; + dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1); + bb_here[1] = PP->bbox[1] + ib_final[1] * dd; + bb_here[4] = PP->bbox[1] + ib_final[4] * dd; + dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1); + bb_here[2] = PP->bbox[2] + ib_final[2] * dd; + bb_here[5] = PP->bbox[2] + ib_final[5] * dd; +#else +#ifdef Cell + dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0]; + bb_here[0] = PP->bbox[0] + ib_final[0] * dd; + bb_here[3] = PP->bbox[0] + (ib_final[3] + 1) * dd; + dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1]; + bb_here[1] = PP->bbox[1] + ib_final[1] * dd; + bb_here[4] = PP->bbox[1] + (ib_final[4] + 1) * dd; + dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2]; + bb_here[2] = PP->bbox[2] + ib_final[2] * dd; + bb_here[5] = PP->bbox[2] + (ib_final[5] + 1) * dd; +#endif +#endif + + Block* Bg = new Block(dim, sh_here, bb_here, target_rank, ingfsi, fngfsi, PP->lev); + if (BlL) BlL->insert(Bg); + else BlL = new MyList(Bg); + + return Bg; + }; + + split_first_block = createSubBlock(indices_L, r_left); + split_last_block = createSubBlock(indices_R, r_right); + return split_last_block; +} + +Block* Parallel::createMappedBlock(MyList* &BlL, int _dim, int* shape, double* bbox, + int block_id, int ingfsi, int fngfsi, int lev) +{ + int target_rank = block_id; + if (INTERP_LB_NPROCS > 0) { + for (int ri = 0; ri < interp_lb_num_remaps; ri++) { + if (block_id == interp_lb_remaps[ri][0]) { + target_rank = interp_lb_remaps[ri][1]; + break; + } + } + } + + Block* ng = new Block(dim, shape, bbox, target_rank, ingfsi, fngfsi, lev); + if (BlL) BlL->insert(ng); + else BlL = new MyList(ng); + + return ng; +} +#else +// When INTERP_LB_OPTIMIZE is not defined, distribute_optimize falls back to distribute +MyList *Parallel::distribute_optimize(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, + bool periodic, int nodes) +{ + return distribute(PatchLIST, cpusize, ingfsi, fngfsi, periodic, nodes); +} +Block* Parallel::splitHotspotBlock(MyList* &BlL, int _dim, + int ib0_orig, int ib3_orig, + int jb1_orig, int jb4_orig, + int kb2_orig, int kb5_orig, + Patch* PP, int r_left, int r_right, + int ingfsi, int fngfsi, bool periodic, + Block* &split_first_block, Block* &split_last_block) +{ return nullptr; } +Block* Parallel::createMappedBlock(MyList* &BlL, int _dim, int* shape, double* bbox, + int block_id, int ingfsi, int fngfsi, int lev) +{ return nullptr; } +#endif + #elif (PSTR == 1 || PSTR == 2 || PSTR == 3) MyList *Parallel::distribute(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, bool periodic, int start_rank, int end_rank, int nodes) diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h index a6ef351..e17f365 100644 --- a/AMSS_NCKU_source/Parallel.h +++ b/AMSS_NCKU_source/Parallel.h @@ -32,6 +32,16 @@ namespace Parallel int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape); MyList *distribute(MyList *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks + MyList *distribute_optimize(MyList *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); + Block* splitHotspotBlock(MyList* &BlL, int _dim, + int ib0_orig, int ib3_orig, + int jb1_orig, int jb4_orig, + int kb2_orig, int kb5_orig, + Patch* PP, int r_left, int r_right, + int ingfsi, int fngfsi, bool periodic, + Block* &split_first_block, Block* &split_last_block); + Block* createMappedBlock(MyList* &BlL, int _dim, int* shape, double* bbox, + int block_id, int ingfsi, int fngfsi, int lev); void KillBlocks(MyList *PatchLIST); void setfunction(MyList *BlL, var *vn, double func(double x, double y, double z)); diff --git a/AMSS_NCKU_source/cgh.C b/AMSS_NCKU_source/cgh.C index 2f7f2a0..6e60f68 100644 --- a/AMSS_NCKU_source/cgh.C +++ b/AMSS_NCKU_source/cgh.C @@ -130,7 +130,11 @@ void cgh::compose_cgh(int nprocs) for (int lev = 0; lev < levels; lev++) { checkPatchList(PatL[lev], false); +#ifdef INTERP_LB_OPTIMIZE + Parallel::distribute_optimize(PatL[lev], nprocs, ingfs, fngfs, false); +#else Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false); +#endif #if (RPB == 1) // we need distributed box of PatL[lev] and PatL[lev-1] if (lev > 0) diff --git a/AMSS_NCKU_source/interp_lb_profile.C b/AMSS_NCKU_source/interp_lb_profile.C new file mode 100644 index 0000000..52d4ff6 --- /dev/null +++ b/AMSS_NCKU_source/interp_lb_profile.C @@ -0,0 +1,107 @@ +#include "interp_lb_profile.h" +#include +#include +#include + +namespace InterpLBProfile { + +bool write_profile(const char *filepath, int nprocs, + const double *rank_times, + const int *heavy_ranks, int num_heavy, + double threshold_ratio) +{ + FILE *fp = fopen(filepath, "wb"); + if (!fp) return false; + + ProfileHeader hdr; + hdr.magic = MAGIC; + hdr.version = VERSION; + hdr.nprocs = nprocs; + hdr.num_heavy = num_heavy; + hdr.threshold_ratio = threshold_ratio; + + fwrite(&hdr, sizeof(hdr), 1, fp); + fwrite(rank_times, sizeof(double), nprocs, fp); + fwrite(heavy_ranks, sizeof(int), num_heavy, fp); + fclose(fp); + return true; +} + +bool read_profile(const char *filepath, int current_nprocs, + int *heavy_ranks, int &num_heavy, + double *rank_times, MPI_Comm comm) +{ + int myrank; + MPI_Comm_rank(comm, &myrank); + + int valid = 0; + ProfileHeader hdr; + memset(&hdr, 0, sizeof(hdr)); + + if (myrank == 0) { + FILE *fp = fopen(filepath, "rb"); + if (fp) { + if (fread(&hdr, sizeof(hdr), 1, fp) == 1 && + hdr.magic == MAGIC && hdr.version == VERSION && + hdr.nprocs == current_nprocs) + { + if (fread(rank_times, sizeof(double), current_nprocs, fp) + == (size_t)current_nprocs && + fread(heavy_ranks, sizeof(int), hdr.num_heavy, fp) + == (size_t)hdr.num_heavy) + { + num_heavy = hdr.num_heavy; + valid = 1; + } + } else if (fp) { + printf("[InterpLB] Profile rejected: magic=0x%X version=%u " + "nprocs=%d (current=%d)\n", + hdr.magic, hdr.version, hdr.nprocs, current_nprocs); + } + fclose(fp); + } + } + + MPI_Bcast(&valid, 1, MPI_INT, 0, comm); + if (!valid) return false; + + MPI_Bcast(&num_heavy, 1, MPI_INT, 0, comm); + MPI_Bcast(heavy_ranks, num_heavy, MPI_INT, 0, comm); + MPI_Bcast(rank_times, current_nprocs, MPI_DOUBLE, 0, comm); + return true; +} + +int identify_heavy_ranks(const double *rank_times, int nprocs, + double threshold_ratio, + int *heavy_ranks, int max_heavy) +{ + double sum = 0; + for (int i = 0; i < nprocs; i++) sum += rank_times[i]; + double mean = sum / nprocs; + double threshold = threshold_ratio * mean; + + // Collect candidates + struct RankTime { int rank; double time; }; + RankTime *candidates = new RankTime[nprocs]; + int ncand = 0; + + for (int i = 0; i < nprocs; i++) { + if (rank_times[i] > threshold) + candidates[ncand++] = {i, rank_times[i]}; + } + + // Sort descending by time + std::sort(candidates, candidates + ncand, + [](const RankTime &a, const RankTime &b) { + return a.time > b.time; + }); + + int count = (ncand < max_heavy) ? ncand : max_heavy; + for (int i = 0; i < count; i++) + heavy_ranks[i] = candidates[i].rank; + + delete[] candidates; + return count; +} + +} // namespace InterpLBProfile diff --git a/AMSS_NCKU_source/interp_lb_profile.h b/AMSS_NCKU_source/interp_lb_profile.h new file mode 100644 index 0000000..10b9dac --- /dev/null +++ b/AMSS_NCKU_source/interp_lb_profile.h @@ -0,0 +1,38 @@ +#ifndef INTERP_LB_PROFILE_H +#define INTERP_LB_PROFILE_H + +#include + +namespace InterpLBProfile { + +static const unsigned int MAGIC = 0x494C4250; // "ILBP" +static const unsigned int VERSION = 1; + +struct ProfileHeader { + unsigned int magic; + unsigned int version; + int nprocs; + int num_heavy; + double threshold_ratio; +}; + +// Write profile file (rank 0 only) +bool write_profile(const char *filepath, int nprocs, + const double *rank_times, + const int *heavy_ranks, int num_heavy, + double threshold_ratio); + +// Read profile file (rank 0 reads, then broadcasts to all) +// Returns true if file found and valid for current nprocs +bool read_profile(const char *filepath, int current_nprocs, + int *heavy_ranks, int &num_heavy, + double *rank_times, MPI_Comm comm); + +// Identify heavy ranks: those with time > threshold_ratio * mean +int identify_heavy_ranks(const double *rank_times, int nprocs, + double threshold_ratio, + int *heavy_ranks, int max_heavy); + +} // namespace InterpLBProfile + +#endif /* INTERP_LB_PROFILE_H */