diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index c0df582..33c0458 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -4,6 +4,8 @@ #include "prolongrestrict.h" #include "misc.h" #include "parameters.h" +#include +#include int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion { @@ -352,14 +354,73 @@ MyList *Parallel::distribute(MyList *PatchLIST, int cpusize, int i split_size = Mymax(min_size, block_size / nodes); split_size = Mymax(1, split_size); - int n_rank = 0; + // Pass 1: compute block volumes for greedy rank assignment + std::vector block_volumes; PLi = PatchLIST; int reacpu = 0; while (PLi) { Patch *PP = PLi->data; - reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape); + int ibbox_here[2 * dim]; + for (int i = 0; i < nxyz[0]; i++) + for (int j = 0; j < nxyz[1]; j++) + for (int k = 0; k < nxyz[2]; k++) + { + ibbox_here[0] = (PP->shape[0] * i) / nxyz[0]; + ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1; + ibbox_here[1] = (PP->shape[1] * j) / nxyz[1]; + ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1; + ibbox_here[2] = (PP->shape[2] * k) / nxyz[2]; + ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1; + if (periodic) + { + for (int d = 0; d < dim; d++) { ibbox_here[d] -= ghost_width; ibbox_here[dim + d] += ghost_width; } + } + else + { + ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width); + ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width); + ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width); + ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width); + ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width); + ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width); + } + long vol = 1; + for (int d = 0; d < dim; d++) + vol *= (ibbox_here[dim + d] - ibbox_here[d] + 1); + block_volumes.push_back(vol); + } + PLi = PLi->next; + } + + // Greedy LPT: sort by volume descending, assign each to least-loaded rank + std::vector assigned_ranks(block_volumes.size()); + { + std::vector order(block_volumes.size()); + for (int i = 0; i < (int)order.size(); i++) order[i] = i; + std::sort(order.begin(), order.end(), [&](int a, int b) { + return block_volumes[a] > block_volumes[b]; + }); + std::vector load(cpusize, 0); + for (int idx : order) + { + int min_r = 0; + for (int r = 1; r < cpusize; r++) + if (load[r] < load[min_r]) min_r = r; + assigned_ranks[idx] = min_r; + load[min_r] += block_volumes[idx]; + } + } + + // Pass 2: create blocks with pre-assigned ranks + int block_idx = 0; + PLi = PatchLIST; + while (PLi) + { + Patch *PP = PLi->data; + + partition3(nxyz, split_size, mmin_width, nodes, PP->shape); Block *ng0, *ng; int shape_here[dim], ibbox_here[2 * dim]; @@ -443,10 +504,7 @@ MyList *Parallel::distribute(MyList *PatchLIST, int cpusize, int i int shape_res[dim * pices]; double bbox_res[2 * dim * pices]; misc::dividBlock(dim, shape_here, bbox_here, pices, picef, shape_res, bbox_res, min_width); - ng = ng0 = new Block(dim, shape_res, bbox_res, n_rank++, ingfsi, fngfsi, PP->lev, 0); // delete through KillBlocks - - // if(n_rank==cpusize) {n_rank=0; cerr<<"place one!!"<lev, 0); // delete through KillBlocks // ng->checkBlock(); if (BlL) BlL->insert(ng); @@ -455,22 +513,19 @@ MyList *Parallel::distribute(MyList *PatchLIST, int cpusize, int i for (int i = 1; i < pices; i++) { - ng = new Block(dim, shape_res + i * dim, bbox_res + i * 2 * dim, n_rank++, ingfsi, fngfsi, PP->lev, i); // delete through KillBlocks - // if(n_rank==cpusize) {n_rank=0; cerr<<"place two!! "<lev, i); // delete through KillBlocks // ng->checkBlock(); BlL->insert(ng); } } #else - ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks + ng = ng0 = new Block(dim, shape_here, bbox_here, assigned_ranks[block_idx++], ingfsi, fngfsi, PP->lev); // delete through KillBlocks // ng->checkBlock(); if (BlL) BlL->insert(ng); else BlL = new MyList(ng); // delete through KillBlocks #endif - if (n_rank == cpusize) - n_rank = 0; // set PP->blb if (i == 0 && j == 0 && k == 0) @@ -3524,10 +3579,8 @@ void Parallel::transfer(MyList **src, MyList **src, MyList **src, MyList **src, MyList **src, MyList