#include #include #include #include #include #include #include #include #include #include using namespace std; #include "misc.h" #include "MPatch.h" #include "Parallel.h" #include "fmisc.h" #include "bssn_cuda_ops.h" #ifdef INTERP_LB_PROFILE #include "interp_lb_profile.h" #endif #if defined(__GNUC__) || defined(__clang__) extern int bssn_cuda_interp_points_batch(const int *ex, const double *X, const double *Y, const double *Z, const double *const *fields, const double *soa_flat, int num_var, const double *px, const double *py, const double *pz, int num_points, int ordn, int symmetry, double *out) __attribute__((weak)); #endif namespace { struct InterpVarDesc { int sgfn; double soa[dim]; }; struct InterpPlanKey { const Patch *patch; const double *x; const double *y; const double *z; int NN; int Symmetry; int myrank; }; struct InterpPlanKeyLess { bool operator()(const InterpPlanKey &lhs, const InterpPlanKey &rhs) const { if (lhs.patch != rhs.patch) return lhs.patch < rhs.patch; if (lhs.x != rhs.x) return lhs.x < rhs.x; if (lhs.y != rhs.y) return lhs.y < rhs.y; if (lhs.z != rhs.z) return lhs.z < rhs.z; if (lhs.NN != rhs.NN) return lhs.NN < rhs.NN; if (lhs.Symmetry != rhs.Symmetry) return lhs.Symmetry < rhs.Symmetry; return lhs.myrank < rhs.myrank; } }; struct CachedInterpPlan { int nblocks; vector owner_rank; vector owner_block; vector > block_points; vector > block_px; vector > block_py; vector > block_pz; CachedInterpPlan() : nblocks(0) {} }; struct CachedInterpPlanEntry { bool valid; InterpPlanKey key; CachedInterpPlan plan; CachedInterpPlanEntry() : valid(false) {} }; struct InterpBlockView { Block *bp; double llb[dim]; double uub[dim]; }; struct BlockBinIndex { int bins[dim]; double lo[dim]; double inv[dim]; vector views; vector> bin_to_blocks; bool valid; BlockBinIndex() : valid(false) { for (int i = 0; i < dim; i++) { bins[i] = 1; lo[i] = 0.0; inv[i] = 0.0; } } }; inline int clamp_int(int v, int lo, int hi) { return (v < lo) ? lo : ((v > hi) ? hi : v); } inline int coord_to_bin(double x, double lo, double inv, int nb) { if (nb <= 1 || inv <= 0.0) return 0; int b = int(floor((x - lo) * inv)); return clamp_int(b, 0, nb - 1); } inline int bin_loc(const BlockBinIndex &index, int b0, int b1, int b2) { return b0 + index.bins[0] * (b1 + index.bins[1] * b2); } inline bool point_in_block_view(const InterpBlockView &view, const double *pox, const double *DH) { for (int i = 0; i < dim; i++) { if (pox[i] - view.llb[i] < -DH[i] / 2 || pox[i] - view.uub[i] > DH[i] / 2) return false; } return true; } void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index) { index = BlockBinIndex(); MyList *Bp = patch->blb; while (Bp) { Block *BP = Bp->data; InterpBlockView view; view.bp = BP; for (int i = 0; i < dim; i++) { #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined #endif view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i]; view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i]; #else #ifdef Cell view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i]; view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i]; #else #error Not define Vertex nor Cell #endif #endif } index.views.push_back(view); if (Bp == patch->ble) break; Bp = Bp->next; } const int nblocks = int(index.views.size()); if (nblocks <= 0) return; int bins_1d = int(ceil(pow(double(nblocks), 1.0 / 3.0))); bins_1d = clamp_int(bins_1d, 1, 32); for (int i = 0; i < dim; i++) { index.bins[i] = bins_1d; index.lo[i] = patch->bbox[i] + patch->lli[i] * DH[i]; const double hi = patch->bbox[dim + i] - patch->uui[i] * DH[i]; if (hi > index.lo[i] && bins_1d > 1) index.inv[i] = bins_1d / (hi - index.lo[i]); else index.inv[i] = 0.0; } index.bin_to_blocks.resize(index.bins[0] * index.bins[1] * index.bins[2]); for (int bi = 0; bi < nblocks; bi++) { const InterpBlockView &view = index.views[bi]; int bmin[dim], bmax[dim]; for (int d = 0; d < dim; d++) { const double low = view.llb[d] - DH[d] / 2; const double up = view.uub[d] + DH[d] / 2; bmin[d] = coord_to_bin(low, index.lo[d], index.inv[d], index.bins[d]); bmax[d] = coord_to_bin(up, index.lo[d], index.inv[d], index.bins[d]); if (bmax[d] < bmin[d]) { int t = bmin[d]; bmin[d] = bmax[d]; bmax[d] = t; } } for (int bz = bmin[2]; bz <= bmax[2]; bz++) for (int by = bmin[1]; by <= bmax[1]; by++) for (int bx = bmin[0]; bx <= bmax[0]; bx++) index.bin_to_blocks[bin_loc(index, bx, by, bz)].push_back(bi); } index.valid = true; } int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH) { if (!index.valid) return -1; const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]); const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]); const int bz = coord_to_bin(pox[2], index.lo[2], index.inv[2], index.bins[2]); const vector &cand = index.bin_to_blocks[bin_loc(index, bx, by, bz)]; for (size_t ci = 0; ci < cand.size(); ci++) { const int bi = cand[ci]; if (point_in_block_view(index.views[bi], pox, DH)) return bi; } // Fallback to full scan for numerical edge cases around bin boundaries. for (size_t bi = 0; bi < index.views.size(); bi++) if (point_in_block_view(index.views[bi], pox, DH)) return int(bi); return -1; } void collect_interp_vars(MyList *VarList, vector &vars) { vars.clear(); MyList *varl = VarList; while (varl) { InterpVarDesc desc; desc.sgfn = varl->data->sgfn; for (int d = 0; d < dim; ++d) desc.soa[d] = varl->data->SoA[d]; vars.push_back(desc); varl = varl->next; } } bool should_try_cuda_interp(int ordn, int num_points, int num_var) { #if defined(__GNUC__) || defined(__clang__) if (!bssn_cuda_interp_points_batch) return false; #else return false; #endif if (ordn != 6) return false; if (num_points < 32) return false; return num_points * num_var >= 256; } CachedInterpPlanEntry &interp_plan_cache_entry() { static CachedInterpPlanEntry cache; return cache; } bool same_interp_plan_key(const InterpPlanKey &lhs, const InterpPlanKey &rhs) { return lhs.patch == rhs.patch && lhs.x == rhs.x && lhs.y == rhs.y && lhs.z == rhs.z && lhs.NN == rhs.NN && lhs.Symmetry == rhs.Symmetry && lhs.myrank == rhs.myrank; } CachedInterpPlan &get_cached_interp_plan(Patch *patch, int NN, double **XX, int Symmetry, int myrank, const double *DH, const BlockBinIndex &block_index, bool report_bounds_here, bool allow_missing_points) { InterpPlanKey key; key.patch = patch; key.x = XX[0]; key.y = XX[1]; key.z = XX[2]; key.NN = NN; key.Symmetry = Symmetry; key.myrank = myrank; CachedInterpPlanEntry &cache = interp_plan_cache_entry(); if (cache.valid && same_interp_plan_key(cache.key, key) && cache.plan.nblocks == static_cast(block_index.views.size())) return cache.plan; cache.valid = true; cache.key = key; cache.plan = CachedInterpPlan(); CachedInterpPlan &plan = cache.plan; plan.nblocks = static_cast(block_index.views.size()); plan.owner_rank.assign(NN, -1); plan.owner_block.assign(NN, -1); plan.block_points.resize(plan.nblocks); plan.block_px.resize(plan.nblocks); plan.block_py.resize(plan.nblocks); plan.block_pz.resize(plan.nblocks); for (int j = 0; j < NN; ++j) { double pox[dim]; for (int i = 0; i < dim; ++i) { pox[i] = XX[i][j]; if (report_bounds_here && (XX[i][j] < patch->bbox[i] + patch->lli[i] * DH[i] || XX[i][j] > patch->bbox[dim + i] - patch->uui[i] * DH[i])) { cout << "Patch::Interp_Points: point ("; for (int k = 0; k < dim; ++k) { cout << XX[k][j]; if (k < dim - 1) cout << ","; else cout << ") is out of current Patch." << endl; } MPI_Abort(MPI_COMM_WORLD, 1); } } const int block_i = find_block_index_for_point(block_index, pox, DH); if (block_i >= 0) { Block *BP = block_index.views[block_i].bp; plan.owner_rank[j] = BP->rank; plan.owner_block[j] = block_i; if (BP->rank == myrank) { plan.block_points[block_i].push_back(j); plan.block_px[block_i].push_back(XX[0][j]); plan.block_py[block_i].push_back(XX[1][j]); plan.block_pz[block_i].push_back(XX[2][j]); } } } if (!allow_missing_points && report_bounds_here) { for (int j = 0; j < NN; ++j) { if (plan.owner_rank[j] >= 0) continue; cout << "ERROR: Patch::Interp_Points fails to find point ("; for (int d = 0; d < dim; ++d) { cout << XX[d][j]; if (d < dim - 1) cout << ","; else cout << ")"; } cout << " on Patch ("; for (int d = 0; d < dim; ++d) { cout << patch->bbox[d] << "+" << patch->lli[d] * DH[d]; if (d < dim - 1) cout << ","; else cout << ")--"; } cout << "("; for (int d = 0; d < dim; ++d) { cout << patch->bbox[dim + d] << "-" << patch->uui[d] * DH[d]; if (d < dim - 1) cout << ","; else cout << ")" << endl; } MPI_Abort(MPI_COMM_WORLD, 1); } } return plan; } void release_interp_plan_cache_internal() { CachedInterpPlanEntry &cache = interp_plan_cache_entry(); cache.valid = false; cache.plan = CachedInterpPlan(); } bool run_cuda_interp_for_block(Block *BP, const vector &vars, const vector &point_ids, const vector &px, const vector &py, const vector &pz, double *Shellf, int num_var, int ordn, int Symmetry) { if (!should_try_cuda_interp(ordn, static_cast(point_ids.size()), num_var)) return false; vector field_ptrs(num_var); vector soa_flat(3 * num_var); for (int v = 0; v < num_var; ++v) { field_ptrs[v] = BP->fgfs[vars[v].sgfn]; for (int d = 0; d < dim; ++d) soa_flat[3 * v + d] = vars[v].soa[d]; } const int npts = static_cast(point_ids.size()); vector out(static_cast(npts) * static_cast(num_var)); if (bssn_cuda_interp_points_batch(BP->shape, BP->X[0], BP->X[1], BP->X[2], field_ptrs.data(), soa_flat.data(), num_var, px.data(), py.data(), pz.data(), npts, ordn, Symmetry, out.data()) != 0) { return false; } for (int p = 0; p < npts; ++p) { const int j = point_ids[p]; memcpy(Shellf + j * num_var, out.data() + p * num_var, sizeof(double) * num_var); } return true; } void run_cpu_interp_for_block(Block *BP, const vector &vars, const vector &point_ids, const vector &px, const vector &py, const vector &pz, double *Shellf, int num_var, int ordn, int Symmetry) { for (size_t p = 0; p < point_ids.size(); ++p) { const int j = point_ids[p]; double x = px[p]; double y = py[p]; double z = pz[p]; int ordn_local = ordn; int symmetry_local = Symmetry; for (int v = 0; v < num_var; ++v) { f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[vars[v].sgfn], Shellf[j * num_var + v], x, y, z, ordn_local, const_cast(vars[v].soa), symmetry_local); } } } void interpolate_owned_points(MyList *VarList, double *Shellf, int Symmetry, int ordn, const BlockBinIndex &block_index, const CachedInterpPlan &plan) { vector vars; collect_interp_vars(VarList, vars); const int num_var = static_cast(vars.size()); for (size_t bi = 0; bi < plan.block_points.size(); ++bi) { if (plan.block_points[bi].empty()) continue; Block *BP = block_index.views[bi].bp; bool done = run_cuda_interp_for_block(BP, vars, plan.block_points[bi], plan.block_px[bi], plan.block_py[bi], plan.block_pz[bi], Shellf, num_var, ordn, Symmetry); if (!done) run_cpu_interp_for_block(BP, vars, plan.block_points[bi], plan.block_px[bi], plan.block_py[bi], plan.block_pz[bi], Shellf, num_var, ordn, Symmetry); } } } // namespace void patch_release_interp_plan_cache() { release_interp_plan_cache_internal(); } Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi) { int hbuffer_width = buffer_width; if (lev == 0) hbuffer_width = CS_width; // specific for shell-box coulping if (DIM != dim) { cout << "dimension is not consistent in Patch construction" << endl; MPI_Abort(MPI_COMM_WORLD, 1); } for (int i = 0; i < dim; i++) { shape[i] = shapei[i]; bbox[i] = bboxi[i]; bbox[dim + i] = bboxi[dim + i]; lli[i] = uui[i] = 0; if (buflog) { double DH; #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined #endif DH = (bbox[dim + i] - bbox[i]) / (shape[i] - 1); #else #ifdef Cell DH = (bbox[dim + i] - bbox[i]) / shape[i]; #else #error Not define Vertex nor Cell #endif #endif uui[i] = hbuffer_width; bbox[dim + i] = bbox[dim + i] + uui[i] * DH; shape[i] = shape[i] + uui[i]; } } if (buflog) { if (DIM != 3) { cout << "Symmetry in Patch construction only support 3 yet but dim = " << DIM << endl; MPI_Abort(MPI_COMM_WORLD, 1); } double tmpb, DH; if (Symmetry > 0) { #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined #endif DH = (bbox[5] - bbox[2]) / (shape[2] - 1); #else #ifdef Cell DH = (bbox[5] - bbox[2]) / shape[2]; #else #error Not define Vertex nor Cell #endif #endif tmpb = Mymax(0, bbox[2] - hbuffer_width * DH); lli[2] = int((bbox[2] - tmpb) / DH + 0.4); bbox[2] = bbox[2] - lli[2] * DH; shape[2] = shape[2] + lli[2]; if (lli[2] < hbuffer_width) { if (feq(bbox[2], 0, DH / 2)) lli[2] = 0; else { cout << "Code mistake for lli[2] = " << lli[2] << ", bbox[2] = " << bbox[2] << endl; MPI_Abort(MPI_COMM_WORLD, 1); } } if (Symmetry > 1) { #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined #endif DH = (bbox[3] - bbox[0]) / (shape[0] - 1); #else #ifdef Cell DH = (bbox[3] - bbox[0]) / shape[0]; #else #error Not define Vertex nor Cell #endif #endif tmpb = Mymax(0, bbox[0] - hbuffer_width * DH); lli[0] = int((bbox[0] - tmpb) / DH + 0.4); bbox[0] = bbox[0] - lli[0] * DH; shape[0] = shape[0] + lli[0]; if (lli[0] < hbuffer_width) { if (feq(bbox[0], 0, DH / 2)) lli[0] = 0; else { cout << "Code mistake for lli[0] = " << lli[0] << ", bbox[0] = " << bbox[0] << endl; MPI_Abort(MPI_COMM_WORLD, 1); } } #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined #endif DH = (bbox[4] - bbox[1]) / (shape[1] - 1); #else #ifdef Cell DH = (bbox[4] - bbox[1]) / shape[1]; #else #error Not define Vertex nor Cell #endif #endif tmpb = Mymax(0, bbox[1] - hbuffer_width * DH); lli[1] = int((bbox[1] - tmpb) / DH + 0.4); bbox[1] = bbox[1] - lli[1] * DH; shape[1] = shape[1] + lli[1]; if (lli[1] < hbuffer_width) { if (feq(bbox[1], 0, DH / 2)) lli[1] = 0; else { cout << "Code mistake for lli[1] = " << lli[1] << ", bbox[1] = " << bbox[1] << endl; MPI_Abort(MPI_COMM_WORLD, 1); } } } else { for (int i = 0; i < 2; i++) { #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined #endif DH = (bbox[dim + i] - bbox[i]) / (shape[i] - 1); #else #ifdef Cell DH = (bbox[dim + i] - bbox[i]) / shape[i]; #else #error Not define Vertex nor Cell #endif #endif lli[i] = hbuffer_width; bbox[i] = bbox[i] - lli[i] * DH; shape[i] = shape[i] + lli[i]; } } } else { for (int i = 0; i < dim; i++) { #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined #endif DH = (bbox[dim + i] - bbox[i]) / (shape[i] - 1); #else #ifdef Cell DH = (bbox[dim + i] - bbox[i]) / shape[i]; #else #error Not define Vertex nor Cell #endif #endif lli[i] = hbuffer_width; bbox[i] = bbox[i] - lli[i] * DH; shape[i] = shape[i] + lli[i]; } } } blb = ble = 0; } Patch::~Patch() { } // buflog 1: with buffer points; 0 without void Patch::checkPatch(bool buflog) { int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); if (myrank == 0) { if (buflog) { cout << " belong to level " << lev << endl; cout << " shape: ["; for (int i = 0; i < dim; i++) { cout << shape[i]; if (i < dim - 1) cout << ","; else cout << "]"; } cout << " resolution: ["; for (int i = 0; i < dim; i++) { cout << getdX(i); if (i < dim - 1) cout << ","; else cout << "]" << endl; } cout << " range:" << "("; for (int i = 0; i < dim; i++) { cout << bbox[i] << ":" << bbox[dim + i]; if (i < dim - 1) cout << ","; else cout << ")" << endl; } } else { cout << " belong to level " << lev << endl; cout << " shape: ["; for (int i = 0; i < dim; i++) { cout << shape[i] - lli[i] - uui[i]; if (i < dim - 1) cout << ","; else cout << "]"; } cout << " resolution: ["; for (int i = 0; i < dim; i++) { cout << getdX(i); if (i < dim - 1) cout << ","; else cout << "]" << endl; } cout << " range:" << "("; for (int i = 0; i < dim; i++) { cout << bbox[i] + lli[i] * getdX(i) << ":" << bbox[dim + i] - uui[i] * getdX(i); if (i < dim - 1) cout << ","; else cout << ")" << endl; } } } } void Patch::checkPatch(bool buflog, const int out_rank) { int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); if (myrank == out_rank) { cout << " out_rank = " << out_rank << endl; if (buflog) { cout << " belong to level " << lev << endl; cout << " shape: ["; for (int i = 0; i < dim; i++) { cout << shape[i]; if (i < dim - 1) cout << ","; else cout << "]"; } cout << " resolution: ["; for (int i = 0; i < dim; i++) { cout << getdX(i); if (i < dim - 1) cout << ","; else cout << "]" << endl; } cout << " range:" << "("; for (int i = 0; i < dim; i++) { cout << bbox[i] << ":" << bbox[dim + i]; if (i < dim - 1) cout << ","; else cout << ")" << endl; } } else { cout << " belong to level " << lev << endl; cout << " shape: ["; for (int i = 0; i < dim; i++) { cout << shape[i] - lli[i] - uui[i]; if (i < dim - 1) cout << ","; else cout << "]"; } cout << " resolution: ["; for (int i = 0; i < dim; i++) { cout << getdX(i); if (i < dim - 1) cout << ","; else cout << "]" << endl; } cout << " range:" << "("; for (int i = 0; i < dim; i++) { cout << bbox[i] + lli[i] * getdX(i) << ":" << bbox[dim + i] - uui[i] * getdX(i); if (i < dim - 1) cout << ","; else cout << ")" << endl; } } } } void Patch::Interp_Points(MyList *VarList, int NN, double **XX, double *Shellf, int Symmetry) { // NOTE: we do not Synchnize variables here, make sure of that before calling this routine int myrank, nprocs; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); int ordn = 2 * ghost_width; MyList *varl; int num_var = 0; varl = VarList; while (varl) { num_var++; varl = varl->next; } memset(Shellf, 0, sizeof(double) * NN * num_var); double DH[dim]; for (int i = 0; i < dim; i++) DH[i] = getdX(i); BlockBinIndex block_index; build_block_bin_index(this, DH, block_index); CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false); const int *owner_rank = plan.owner_rank.data(); interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan); // Replace MPI_Allreduce with per-owner MPI_Bcast: // Group consecutive points by owner rank and broadcast each group. // Since each point's data is non-zero only on the owner rank, // Bcast from owner is equivalent to Allreduce(MPI_SUM) but much cheaper. { int j = 0; while (j < NN) { int cur_owner = owner_rank[j]; if (cur_owner < 0) { if (myrank == 0) { cout << "ERROR: Patch::Interp_Points fails to find point ("; for (int d = 0; d < dim; d++) { cout << XX[d][j]; if (d < dim - 1) cout << ","; else cout << ")"; } cout << " on Patch ("; for (int d = 0; d < dim; d++) { cout << bbox[d] << "+" << lli[d] * DH[d]; if (d < dim - 1) cout << ","; else cout << ")--"; } cout << "("; for (int d = 0; d < dim; d++) { cout << bbox[dim + d] << "-" << uui[d] * DH[d]; if (d < dim - 1) cout << ","; else cout << ")" << endl; } MPI_Abort(MPI_COMM_WORLD, 1); } j++; continue; } // Find contiguous run of points with the same owner int jstart = j; while (j < NN && owner_rank[j] == cur_owner) j++; int count = (j - jstart) * num_var; MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD); } } } void Patch::Interp_Points(MyList *VarList, int NN, double **XX, double *Shellf, int Symmetry, int Nmin_consumer, int Nmax_consumer) { // Targeted point-to-point overload: each owner sends each point only to // the one rank that needs it for integration (consumer), reducing // communication volume by ~nprocs times compared to the Bcast version. #ifdef INTERP_LB_PROFILE double t_interp_start = MPI_Wtime(); #endif int myrank, nprocs; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); int ordn = 2 * ghost_width; MyList *varl; int num_var = 0; varl = VarList; while (varl) { num_var++; varl = varl->next; } memset(Shellf, 0, sizeof(double) * NN * num_var); double DH[dim]; for (int i = 0; i < dim; i++) DH[i] = getdX(i); BlockBinIndex block_index; build_block_bin_index(this, DH, block_index); CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false); const int *owner_rank = plan.owner_rank.data(); interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan); #ifdef INTERP_LB_PROFILE double t_interp_end = MPI_Wtime(); double t_interp_local = t_interp_end - t_interp_start; #endif // --- Targeted point-to-point communication phase --- // Compute consumer_rank[j] using the same deterministic formula as surface_integral int *consumer_rank = new int[NN]; { int mp = NN / nprocs; int Lp = NN - nprocs * mp; for (int j = 0; j < NN; j++) { if (j < Lp * (mp + 1)) consumer_rank[j] = j / (mp + 1); else consumer_rank[j] = Lp + (j - Lp * (mp + 1)) / mp; } } // Count sends and recvs per rank int *send_count = new int[nprocs]; int *recv_count = new int[nprocs]; memset(send_count, 0, sizeof(int) * nprocs); memset(recv_count, 0, sizeof(int) * nprocs); for (int j = 0; j < NN; j++) { int own = owner_rank[j]; int con = consumer_rank[j]; if (own == con) continue; // local — no communication needed if (own == myrank) send_count[con]++; if (con == myrank) recv_count[own]++; } // Build send buffers: for each destination rank, pack (index, data) pairs // Each entry: 1 int (point index j) + num_var doubles int total_send = 0, total_recv = 0; int *send_offset = new int[nprocs]; int *recv_offset = new int[nprocs]; for (int r = 0; r < nprocs; r++) { send_offset[r] = total_send; total_send += send_count[r]; recv_offset[r] = total_recv; total_recv += recv_count[r]; } // Pack send buffers: each message contains (j, data[0..num_var-1]) per point int stride = 1 + num_var; // 1 double for index + num_var doubles for data double *sendbuf = new double[total_send * stride]; double *recvbuf = new double[total_recv * stride]; // Temporary counters for packing int *pack_pos = new int[nprocs]; memset(pack_pos, 0, sizeof(int) * nprocs); for (int j = 0; j < NN; j++) { int own = owner_rank[j]; int con = consumer_rank[j]; if (own != myrank || con == myrank) continue; int pos = (send_offset[con] + pack_pos[con]) * stride; sendbuf[pos] = (double)j; // point index for (int v = 0; v < num_var; v++) sendbuf[pos + 1 + v] = Shellf[j * num_var + v]; pack_pos[con]++; } // Post non-blocking recvs and sends int n_req = 0; for (int r = 0; r < nprocs; r++) { if (recv_count[r] > 0) n_req++; if (send_count[r] > 0) n_req++; } MPI_Request *reqs = new MPI_Request[n_req]; int req_idx = 0; for (int r = 0; r < nprocs; r++) { if (recv_count[r] > 0) { MPI_Irecv(recvbuf + recv_offset[r] * stride, recv_count[r] * stride, MPI_DOUBLE, r, 0, MPI_COMM_WORLD, &reqs[req_idx++]); } } for (int r = 0; r < nprocs; r++) { if (send_count[r] > 0) { MPI_Isend(sendbuf + send_offset[r] * stride, send_count[r] * stride, MPI_DOUBLE, r, 0, MPI_COMM_WORLD, &reqs[req_idx++]); } } if (n_req > 0) MPI_Waitall(n_req, reqs, MPI_STATUSES_IGNORE); // Unpack recv buffers into Shellf for (int i = 0; i < total_recv; i++) { int pos = i * stride; int j = (int)recvbuf[pos]; for (int v = 0; v < num_var; v++) Shellf[j * num_var + v] = recvbuf[pos + 1 + v]; } delete[] reqs; delete[] sendbuf; delete[] recvbuf; delete[] pack_pos; delete[] send_offset; delete[] recv_offset; delete[] send_count; delete[] recv_count; delete[] consumer_rank; #ifdef INTERP_LB_PROFILE { static bool profile_written = false; if (!profile_written) { double *all_times = nullptr; if (myrank == 0) all_times = new double[nprocs]; MPI_Gather(&t_interp_local, 1, MPI_DOUBLE, all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); if (myrank == 0) { int heavy[64]; int nh = InterpLBProfile::identify_heavy_ranks( all_times, nprocs, 2.5, heavy, 64); InterpLBProfile::write_profile( "interp_lb_profile.bin", nprocs, all_times, heavy, nh, 2.5); printf("[InterpLB] Profile written: %d heavy ranks\n", nh); for (int i = 0; i < nh; i++) printf(" Heavy rank %d: %.6f s\n", heavy[i], all_times[heavy[i]]); delete[] all_times; } profile_written = true; } } #endif } void Patch::Interp_Points(MyList *VarList, int NN, double **XX, double *Shellf, int Symmetry, MPI_Comm Comm_here) { // NOTE: we do not Synchnize variables here, make sure of that before calling this routine int myrank, lmyrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(Comm_here, &lmyrank); int ordn = 2 * ghost_width; MyList *varl; int num_var = 0; varl = VarList; while (varl) { num_var++; varl = varl->next; } memset(Shellf, 0, sizeof(double) * NN * num_var); // Build global-to-local rank translation for Comm_here MPI_Group world_group, local_group; MPI_Comm_group(MPI_COMM_WORLD, &world_group); MPI_Comm_group(Comm_here, &local_group); double DH[dim]; for (int i = 0; i < dim; i++) DH[i] = getdX(i); BlockBinIndex block_index; build_block_bin_index(this, DH, block_index); CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, lmyrank == 0, true); const int *owner_rank = plan.owner_rank.data(); interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan); // Collect unique global owner ranks and translate to local ranks in Comm_here // Then broadcast each owner's points via MPI_Bcast on Comm_here { int j = 0; while (j < NN) { int cur_owner_global = owner_rank[j]; if (cur_owner_global < 0) { // Point not found — skip (error check disabled for sub-communicator levels) j++; continue; } // Translate global rank to local rank in Comm_here int cur_owner_local; MPI_Group_translate_ranks(world_group, 1, &cur_owner_global, local_group, &cur_owner_local); // Find contiguous run of points with the same owner int jstart = j; while (j < NN && owner_rank[j] == cur_owner_global) j++; int count = (j - jstart) * num_var; MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner_local, Comm_here); } } MPI_Group_free(&world_group); MPI_Group_free(&local_group); } void Patch::checkBlock() { int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); if (myrank == 0) { MyList *BP = blb; while (BP) { BP->data->checkBlock(); if (BP == ble) break; BP = BP->next; } } } double Patch::getdX(int dir) { if (dir < 0 || dir >= dim) { cout << "Patch::getdX: error input dir = " << dir << ", this Patch has direction (0," << dim - 1 << ")" << endl; MPI_Abort(MPI_COMM_WORLD, 1); } double h; #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined #endif if (shape[dir] == 1) { cout << "Patch::getdX: for direction " << dir << ", this Patch has only one point. Can not determine dX for vertex center grid." << endl; MPI_Abort(MPI_COMM_WORLD, 1); } h = (bbox[dim + dir] - bbox[dir]) / (shape[dir] - 1); #else #ifdef Cell h = (bbox[dim + dir] - bbox[dir]) / shape[dir]; #else #error Not define Vertex nor Cell #endif #endif return h; } bool Patch::Interp_ONE_Point(MyList *VarList, double *XX, double *Shellf, int Symmetry) { // NOTE: we do not Synchnize variables here, make sure of that before calling this routine int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); int ordn = 2 * ghost_width; MyList *varl; int num_var = 0; varl = VarList; while (varl) { num_var++; varl = varl->next; } double *shellf; shellf = new double[num_var]; memset(shellf, 0, sizeof(double) * num_var); double *DH, *llb, *uub; DH = new double[dim]; for (int i = 0; i < dim; i++) { DH[i] = getdX(i); } llb = new double[dim]; uub = new double[dim]; double pox[dim]; for (int i = 0; i < dim; i++) { pox[i] = XX[i]; // has excluded the buffer points if (XX[i] < bbox[i] + lli[i] * DH[i] - DH[i] / 100 || XX[i] > bbox[dim + i] - uui[i] * DH[i] + DH[i] / 100) { delete[] shellf; delete[] DH; delete[] llb; delete[] uub; return false; // out of current patch, // remember to delete the allocated arrays before return!!! } } MyList *Bp = blb; bool notfind = true; while (notfind && Bp) // run along Blocks { Block *BP = Bp->data; bool flag = true; for (int i = 0; i < dim; i++) { // NOTE: our dividing structure is (exclude ghost) // -1 0 // 1 2 // so (0,1) does not belong to any part for vertex structure // here we put (0,0.5) to left part and (0.5,1) to right part // BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined #endif llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i]; uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i]; #else #ifdef Cell llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i]; uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i]; #else #error Not define Vertex nor Cell #endif #endif if (XX[i] - llb[i] < -DH[i] / 2 || XX[i] - uub[i] > DH[i] / 2) { flag = false; break; } } if (flag) { notfind = false; if (myrank == BP->rank) { // test old code #if 0 #define floorint(a) ((a) < 0 ? int(a) - 1 : int(a)) //---> interpolation int ixl,iyl,izl,ixu,iyu,izu; double Delx,Dely,Delz; ixl = 1+floorint((pox[0]-BP->X[0][0])/DH[0]); iyl = 1+floorint((pox[1]-BP->X[1][0])/DH[1]); izl = 1+floorint((pox[2]-BP->X[2][0])/DH[2]); int nn=ordn/2; ixl = ixl-nn; iyl = iyl-nn; izl = izl-nn; int tmi; tmi = (Symmetry==2)?-1:0; if(ixl0)?-1:0; if(izlBP->shape[0]) ixl=BP->shape[0]-ordn; if(iyl+ordn>BP->shape[1]) iyl=BP->shape[1]-ordn; if(izl+ordn>BP->shape[2]) izl=BP->shape[2]-ordn; // support cell center if(ixl>=0) Delx = ( pox[0] - BP->X[0][ixl] )/ DH[0]; else Delx = ( pox[0] + BP->X[0][0] )/ DH[0]; if(iyl>=0) Dely = ( pox[1] - BP->X[1][iyl] )/ DH[1]; else Dely = ( pox[1] + BP->X[1][0] )/ DH[1]; if(izl>=0) Delz = ( pox[2] - BP->X[2][izl] )/ DH[2]; else Delz = ( pox[2] + BP->X[2][0] )/ DH[2]; //change to fortran index ixl++; iyl++; izl++; ixu = ixl + ordn - 1; iyu = iyl + ordn - 1; izu = izl + ordn - 1; varl=VarList; int j=0; while(varl) { f_interp_2(BP->shape,BP->fgfs[varl->data->sgfn],shellf[j],ixl,ixu,iyl,iyu,izl,izu,Delx,Dely,Delz, ordn,varl->data->SoA,Symmetry); varl=varl->next; j++; } //varl #else //---> interpolation varl = VarList; int k = 0; while (varl) // run along variables { // shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn], // pox,ordn,varl->data->SoA,Symmetry); f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k], pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); varl = varl->next; k++; } #endif } } if (Bp == ble) break; Bp = Bp->next; } if (notfind && myrank == 0) { cout << "ERROR: Patch::Interp_Points fails to find point ("; for (int j = 0; j < dim; j++) { cout << XX[j]; if (j < dim - 1) cout << ","; else cout << ")"; } cout << " on Patch ("; for (int j = 0; j < dim; j++) { cout << bbox[j] << "+" << lli[j] * getdX(j); if (j < dim - 1) cout << ","; else cout << ")--"; } cout << "("; for (int j = 0; j < dim; j++) { cout << bbox[dim + j] << "-" << uui[j] * getdX(j); if (j < dim - 1) cout << ","; else cout << ")" << endl; } #if 0 checkBlock(); #else cout << "splited domains:" << endl; { MyList *Bp = blb; while (Bp) { Block *BP = Bp->data; for (int i = 0; i < dim; i++) { #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined #endif llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i]; uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i]; #else #ifdef Cell llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i]; uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i]; #else #error Not define Vertex nor Cell #endif #endif } cout << "("; for (int j = 0; j < dim; j++) { cout << llb[j] << ":" << uub[j]; if (j < dim - 1) cout << ","; else cout << ")" << endl; } if (Bp == ble) break; Bp = Bp->next; } } #endif MPI_Abort(MPI_COMM_WORLD, 1); } MPI_Allreduce(shellf, Shellf, num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); delete[] shellf; delete[] DH; delete[] llb; delete[] uub; return true; } bool Patch::Interp_ONE_Point(MyList *VarList, double *XX, double *Shellf, int Symmetry, MPI_Comm Comm_here) { // NOTE: we do not Synchnize variables here, make sure of that before calling this routine int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); int ordn = 2 * ghost_width; MyList *varl; int num_var = 0; varl = VarList; while (varl) { num_var++; varl = varl->next; } double *shellf; shellf = new double[num_var]; memset(shellf, 0, sizeof(double) * num_var); double *DH, *llb, *uub; DH = new double[dim]; for (int i = 0; i < dim; i++) { DH[i] = getdX(i); } llb = new double[dim]; uub = new double[dim]; double pox[dim]; for (int i = 0; i < dim; i++) { pox[i] = XX[i]; // has excluded the buffer points if (XX[i] < bbox[i] + lli[i] * DH[i] - DH[i] / 100 || XX[i] > bbox[dim + i] - uui[i] * DH[i] + DH[i] / 100) { delete[] shellf; delete[] DH; delete[] llb; delete[] uub; return false; // out of current patch, // remember to delete the allocated arrays before return!!! } } MyList *Bp = blb; bool notfind = true; while (notfind && Bp) // run along Blocks { Block *BP = Bp->data; bool flag = true; for (int i = 0; i < dim; i++) { // NOTE: our dividing structure is (exclude ghost) // -1 0 // 1 2 // so (0,1) does not belong to any part for vertex structure // here we put (0,0.5) to left part and (0.5,1) to right part // BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined #endif llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i]; uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i]; #else #ifdef Cell llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i]; uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i]; #else #error Not define Vertex nor Cell #endif #endif if (XX[i] - llb[i] < -DH[i] / 2 || XX[i] - uub[i] > DH[i] / 2) { flag = false; break; } } if (flag) { notfind = false; if (myrank == BP->rank) { // test old code #if 0 #define floorint(a) ((a) < 0 ? int(a) - 1 : int(a)) //---> interpolation int ixl,iyl,izl,ixu,iyu,izu; double Delx,Dely,Delz; ixl = 1+floorint((pox[0]-BP->X[0][0])/DH[0]); iyl = 1+floorint((pox[1]-BP->X[1][0])/DH[1]); izl = 1+floorint((pox[2]-BP->X[2][0])/DH[2]); int nn=ordn/2; ixl = ixl-nn; iyl = iyl-nn; izl = izl-nn; int tmi; tmi = (Symmetry==2)?-1:0; if(ixl0)?-1:0; if(izlBP->shape[0]) ixl=BP->shape[0]-ordn; if(iyl+ordn>BP->shape[1]) iyl=BP->shape[1]-ordn; if(izl+ordn>BP->shape[2]) izl=BP->shape[2]-ordn; // support cell center if(ixl>=0) Delx = ( pox[0] - BP->X[0][ixl] )/ DH[0]; else Delx = ( pox[0] + BP->X[0][0] )/ DH[0]; if(iyl>=0) Dely = ( pox[1] - BP->X[1][iyl] )/ DH[1]; else Dely = ( pox[1] + BP->X[1][0] )/ DH[1]; if(izl>=0) Delz = ( pox[2] - BP->X[2][izl] )/ DH[2]; else Delz = ( pox[2] + BP->X[2][0] )/ DH[2]; //change to fortran index ixl++; iyl++; izl++; ixu = ixl + ordn - 1; iyu = iyl + ordn - 1; izu = izl + ordn - 1; varl=VarList; int j=0; while(varl) { f_interp_2(BP->shape,BP->fgfs[varl->data->sgfn],shellf[j],ixl,ixu,iyl,iyu,izl,izu,Delx,Dely,Delz, ordn,varl->data->SoA,Symmetry); varl=varl->next; j++; } //varl #else //---> interpolation varl = VarList; int k = 0; while (varl) // run along variables { // shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn], // pox,ordn,varl->data->SoA,Symmetry); f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k], pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); varl = varl->next; k++; } #endif } } if (Bp == ble) break; Bp = Bp->next; } if (notfind && myrank == 0) { cout << "ERROR: Patch::Interp_Points fails to find point ("; for (int j = 0; j < dim; j++) { cout << XX[j]; if (j < dim - 1) cout << ","; else cout << ")"; } cout << " on Patch ("; for (int j = 0; j < dim; j++) { cout << bbox[j] << "+" << lli[j] * getdX(j); if (j < dim - 1) cout << ","; else cout << ")--"; } cout << "("; for (int j = 0; j < dim; j++) { cout << bbox[dim + j] << "-" << uui[j] * getdX(j); if (j < dim - 1) cout << ","; else cout << ")" << endl; } #if 0 checkBlock(); #else cout << "splited domains:" << endl; { MyList *Bp = blb; while (Bp) { Block *BP = Bp->data; for (int i = 0; i < dim; i++) { #ifdef Vertex #ifdef Cell #error Both Cell and Vertex are defined #endif llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i]; uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i]; #else #ifdef Cell llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i]; uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i]; #else #error Not define Vertex nor Cell #endif #endif } cout << "("; for (int j = 0; j < dim; j++) { cout << llb[j] << ":" << uub[j]; if (j < dim - 1) cout << ","; else cout << ")" << endl; } if (Bp == ble) break; Bp = Bp->next; } } #endif MPI_Abort(MPI_COMM_WORLD, 1); } MPI_Allreduce(shellf, Shellf, num_var, MPI_DOUBLE, MPI_SUM, Comm_here); delete[] shellf; delete[] DH; delete[] llb; delete[] uub; return true; } // find maximum of abstract value, XX store position for maximum, Shellf store maximum themselvs void Patch::Find_Maximum(MyList *VarList, double *XX, double *Shellf) { int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MyList *varl; int num_var = 0; varl = VarList; while (varl) { num_var++; varl = varl->next; } double *shellf, *xx; shellf = new double[num_var]; xx = new double[dim * num_var]; memset(shellf, 0, sizeof(double) * num_var); memset(xx, 0, sizeof(double) * dim * num_var); double *DH; int *llb, *uub; DH = new double[dim]; for (int i = 0; i < dim; i++) { DH[i] = getdX(i); } llb = new int[dim]; uub = new int[dim]; MyList *Bp = blb; while (Bp) // run along Blocks { Block *BP = Bp->data; if (myrank == BP->rank) { for (int i = 0; i < dim; i++) { llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? lli[i] : ghost_width; uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? uui[i] : ghost_width; } varl = VarList; int k = 0; double tmp, tmpx[dim]; while (varl) // run along variables { f_find_maximum(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], tmp, tmpx, llb, uub); if (tmp > shellf[k]) { shellf[k] = tmp; for (int i = 0; i < dim; i++) xx[dim * k + i] = tmpx[i]; } varl = varl->next; k++; } } if (Bp == ble) break; Bp = Bp->next; } struct mloc { double val; int rank; }; mloc *IN, *OUT; IN = new mloc[num_var]; OUT = new mloc[num_var]; for (int i = 0; i < num_var; i++) { IN[i].val = shellf[i]; IN[i].rank = myrank; } MPI_Allreduce(IN, OUT, num_var, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD); for (int i = 0; i < num_var; i++) { Shellf[i] = OUT[i].val; if (myrank != OUT[i].rank) for (int k = 0; k < 3; k++) xx[3 * i + k] = 0; } MPI_Allreduce(xx, XX, dim * num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); delete[] IN; delete[] OUT; delete[] shellf; delete[] xx; delete[] DH; delete[] llb; delete[] uub; } void Patch::Find_Maximum(MyList *VarList, double *XX, double *Shellf, MPI_Comm Comm_here) { int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MyList *varl; int num_var = 0; varl = VarList; while (varl) { num_var++; varl = varl->next; } double *shellf, *xx; shellf = new double[num_var]; xx = new double[dim * num_var]; memset(shellf, 0, sizeof(double) * num_var); memset(xx, 0, sizeof(double) * dim * num_var); double *DH; int *llb, *uub; DH = new double[dim]; for (int i = 0; i < dim; i++) { DH[i] = getdX(i); } llb = new int[dim]; uub = new int[dim]; MyList *Bp = blb; while (Bp) // run along Blocks { Block *BP = Bp->data; if (myrank == BP->rank) { for (int i = 0; i < dim; i++) { llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? lli[i] : ghost_width; uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? uui[i] : ghost_width; } varl = VarList; int k = 0; double tmp, tmpx[dim]; while (varl) // run along variables { f_find_maximum(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], tmp, tmpx, llb, uub); if (tmp > shellf[k]) { shellf[k] = tmp; for (int i = 0; i < dim; i++) xx[dim * k + i] = tmpx[i]; } varl = varl->next; k++; } } if (Bp == ble) break; Bp = Bp->next; } struct mloc { double val; int rank; }; mloc *IN, *OUT; IN = new mloc[num_var]; OUT = new mloc[num_var]; for (int i = 0; i < num_var; i++) { IN[i].val = shellf[i]; IN[i].rank = myrank; } MPI_Allreduce(IN, OUT, num_var, MPI_DOUBLE_INT, MPI_MAXLOC, Comm_here); for (int i = 0; i < num_var; i++) { Shellf[i] = OUT[i].val; if (myrank != OUT[i].rank) for (int k = 0; k < 3; k++) xx[3 * i + k] = 0; } MPI_Allreduce(xx, XX, dim * num_var, MPI_DOUBLE, MPI_SUM, Comm_here); delete[] IN; delete[] OUT; delete[] shellf; delete[] xx; delete[] DH; delete[] llb; delete[] uub; } // if the given point locates in the present Patch return true // otherwise return false bool Patch::Find_Point(double *XX) { double *DH; DH = new double[dim]; for (int i = 0; i < dim; i++) { DH[i] = getdX(i); } for (int i = 0; i < dim; i++) { // has excluded the buffer points if (XX[i] < bbox[i] + lli[i] * DH[i] - DH[i] / 100 || XX[i] > bbox[dim + i] - uui[i] * DH[i] + DH[i] / 100) { delete[] DH; return false; // out of current patch, // remember to delete the allocated arrays before return!!! } } delete[] DH; return true; }