diff --git a/AMSS_NCKU_source/ShellPatch.C b/AMSS_NCKU_source/ShellPatch.C index b32fd8c..927f997 100644 --- a/AMSS_NCKU_source/ShellPatch.C +++ b/AMSS_NCKU_source/ShellPatch.C @@ -18,9 +18,26 @@ using namespace std; #include "fmisc.h" #include "misc.h" #include "shellfunctions.h" -#include "parameters.h" - -#define PI M_PI +#include "parameters.h" + +#define PI M_PI + +#if USE_CUDA_BSSN +extern "C" int bssn_cuda_shell_pack_host_fields(int npoints, + int nvars, + int nblocks, + int ordn, + double **block_var_fields, + int *block_shapes, + int *point_block, + int *point_dimh, + int *point_dumyd, + int *point_sind, + double *point_coef, + double *out); +extern "C" void bssn_cuda_shell_pack_cache_begin(); +extern "C" void bssn_cuda_shell_pack_cache_end(); +#endif // x x x x x o * // * o x x x x x @@ -52,6 +69,21 @@ bool shell_parallel_interp_enabled() return enabled != 0; } +bool shell_cuda_interp_enabled() +{ +#if USE_CUDA_BSSN + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_SHELL_CUDA_INTERP"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +#else + return false; +#endif +} + bool shell_interp_stats_enabled() { static int enabled = -1; @@ -63,6 +95,17 @@ bool shell_interp_stats_enabled() return enabled != 0; } +bool shell_transfer_timing_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_SHELL_TRANSFER_TIMING"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + int shell_interp_threads() { static int threads = -1; @@ -81,6 +124,17 @@ int shell_interp_threads() return threads; } +int shell_var_count(MyList *VarList) +{ + int count = 0; + while (VarList) + { + ++count; + VarList = VarList->next; + } + return count; +} + struct ShellInterpStats { long long pack_dim3 = 0; @@ -142,6 +196,70 @@ inline int shell_idx3(const int *shape, int i, int j, int k) return i + shape[0] * (j + shape[1] * k); } +bool shell_reflect_index(int idx, int n, int ordn, int &mapped, bool &reflected) +{ + reflected = false; + if (idx >= 0 && idx < n) + { + mapped = idx; + return true; + } + if (idx < 0) + { + if (idx < -ordn) + return false; + mapped = -idx; + reflected = true; + return mapped >= 0 && mapped < n; + } + if (idx <= n + ordn - 1) + { + mapped = 2 * n - 2 - idx; + reflected = true; + return mapped >= 0 && mapped < n; + } + return false; +} + +void shell_symmetry_soa3(const var *vp, int sst, double soa[3]) +{ + if (sst == 2 || sst == 3) + { + soa[0] = vp->SoA[1]; + soa[1] = vp->SoA[2]; + soa[2] = 0.0; + } + else if (sst == 4 || sst == 5) + { + soa[0] = vp->SoA[0]; + soa[1] = vp->SoA[2]; + soa[2] = 0.0; + } + else + { + soa[0] = vp->SoA[0]; + soa[1] = vp->SoA[1]; + soa[2] = (sst == -1) ? vp->SoA[2] : 0.0; + } +} + +double shell_symmetry_soa1(const var *vp, int sst, int dumyd) +{ + if (dumyd == 1) + { + if (sst == 2 || sst == 3) + return vp->SoA[1]; + return vp->SoA[0]; + } + if (dumyd == 0) + { + if (sst == 0 || sst == 1) + return vp->SoA[1]; + return vp->SoA[2]; + } + return 1.0; +} + bool shell_interp3d_fast(const ShellPatch::pointstru *pt, const var *vp, double &out, int ordn) { const int *shape = pt->Bg->shape; @@ -235,19 +353,24 @@ bool shell_interp_fast_possible(const ShellPatch::pointstru *pt, int ordn) return false; if (DIMh == 3) - return s[0] >= 0 && s[1] >= 0 && s[2] >= 0 && - s[0] + ordn <= shape[0] && - s[1] + ordn <= shape[1] && - s[2] + ordn <= shape[2]; + { + if (s[0] < -ordn || s[0] + ordn - 1 > shape[0] + ordn - 1 || + s[1] < -ordn || s[1] + ordn - 1 > shape[1] + ordn - 1 || + s[2] < -ordn || s[2] + ordn - 1 > shape[2] + ordn - 1) + return false; + if (pt->ssst != -1 && (s[2] < 0 || s[2] + ordn > shape[2])) + return false; + return true; + } if (DIMh == 1) { if (pt->dumyd == 1) - return s[0] >= 0 && s[0] + ordn <= shape[0] && + return s[0] >= -ordn && s[0] + ordn - 1 <= shape[0] + ordn - 1 && s[1] >= 0 && s[1] < shape[1] && s[2] >= 0 && s[2] < shape[2]; if (pt->dumyd == 0) - return s[0] >= 0 && s[0] + ordn <= shape[1] && + return s[0] >= -ordn && s[0] + ordn - 1 <= shape[1] + ordn - 1 && s[1] >= 0 && s[1] < shape[0] && s[2] >= 0 && s[2] < shape[2]; } @@ -255,7 +378,18 @@ bool shell_interp_fast_possible(const ShellPatch::pointstru *pt, int ordn) return false; } +bool shell_pointcopy_index(const ShellPatch::pointstru *pt, int &idx); + bool shell_pointcopy_fast(const ShellPatch::pointstru *pt, const var *vp, double value) +{ + int idx = -1; + if (!shell_pointcopy_index(pt, idx)) + return false; + pt->Bg->fgfs[vp->sgfn][idx] = value; + return true; +} + +bool shell_pointcopy_index(const ShellPatch::pointstru *pt, int &idx) { Block *bg = pt->Bg; const int *shape = bg->shape; @@ -268,50 +402,199 @@ bool shell_pointcopy_fast(const ShellPatch::pointstru *pt, const var *vp, double #else h[d] = (bg->bbox[dim + d] - bg->bbox[d]) / shape[d]; #endif - int i = int((pt->lpox[0] - bg->bbox[0]) / h[0] + 0.4); int j = int((pt->lpox[1] - bg->bbox[1]) / h[1] + 0.4); int k = int((pt->lpox[2] - bg->bbox[2]) / h[2] + 0.4); if (i < 0 || i >= shape[0] || j < 0 || j >= shape[1] || k < 0 || k >= shape[2]) return false; - bg->fgfs[vp->sgfn][shell_idx3(shape, i, j, k)] = value; + idx = shell_idx3(shape, i, j, k); return true; } bool shell_pointcopy_possible(const ShellPatch::pointstru *pt) { - Block *bg = pt->Bg; - const int *shape = bg->shape; - if (shape[0] <= 1 || shape[1] <= 1 || shape[2] <= 1) - return false; - double h[3]; - for (int d = 0; d < dim; ++d) -#ifdef Vertex - h[d] = (bg->bbox[dim + d] - bg->bbox[d]) / (shape[d] - 1); -#else - h[d] = (bg->bbox[dim + d] - bg->bbox[d]) / shape[d]; -#endif - int i = int((pt->lpox[0] - bg->bbox[0]) / h[0] + 0.4); - int j = int((pt->lpox[1] - bg->bbox[1]) / h[1] + 0.4); - int k = int((pt->lpox[2] - bg->bbox[2]) / h[2] + 0.4); - return i >= 0 && i < shape[0] && j >= 0 && j < shape[1] && k >= 0 && k < shape[2]; + int idx = -1; + return shell_pointcopy_index(pt, idx); } -bool shell_pack_fast_only(double *data, int out_base, ShellPatch::pointstru *src, - const std::vector &vars, int ordn) +bool shell_pack3d_fast_all(double *data, int out_base, ShellPatch::pointstru *src, + const std::vector &vars, int ordn) { - const int DIMh = (src->dumyd == -1) ? dim : 1; + const int *shape = src->Bg->shape; + const int *s = src->sind; + if (!s || !src->coef) + return false; + if (s[0] < -ordn || s[0] + ordn - 1 > shape[0] + ordn - 1 || + s[1] < -ordn || s[1] + ordn - 1 > shape[1] + ordn - 1 || + s[2] < -ordn || s[2] + ordn - 1 > shape[2] + ordn - 1) + return false; + if (src->ssst != -1 && (s[2] < 0 || s[2] + ordn > shape[2])) + return false; + + const int nst = ordn * ordn * ordn; + if (ordn <= 0 || nst > 1728) + return false; + double weights[1728]; + int offsets[1728]; + unsigned char reflect_mask[1728]; + const double *cx = src->coef; + const double *cy = src->coef + ordn; + const double *cz = src->coef + 2 * ordn; + int q = 0; + for (int k = 0; k < ordn; ++k) + { + int mk = 0; + bool rk = false; + if (!shell_reflect_index(s[2] + k, shape[2], ordn, mk, rk)) + return false; + const double wz = cz[k]; + const int zk = mk * shape[0] * shape[1]; + for (int j = 0; j < ordn; ++j) + { + int mj = 0; + bool rj = false; + if (!shell_reflect_index(s[1] + j, shape[1], ordn, mj, rj)) + return false; + const double wyz = cy[j] * wz; + const int yj = mj * shape[0]; + for (int i = 0; i < ordn; ++i) + { + int mi = 0; + bool ri = false; + if (!shell_reflect_index(s[0] + i, shape[0], ordn, mi, ri)) + return false; + weights[q] = cx[i] * wyz; + offsets[q] = mi + yj + zk; + reflect_mask[q] = (ri ? 1 : 0) | (rj ? 2 : 0) | (rk ? 4 : 0); + ++q; + } + } + } + for (size_t vi = 0; vi < vars.size(); ++vi) { - double &out = data[out_base + (int)vi]; - bool ok = false; - if (DIMh == 3) - ok = shell_interp3d_fast(src, vars[vi], out, ordn); - else if (DIMh == 1) - ok = shell_interp1d_fast(src, vars[vi], out, ordn); - if (!ok) + const double *f = src->Bg->fgfs[vars[vi]->sgfn]; + double soa[3]; + shell_symmetry_soa3(vars[vi], src->ssst, soa); + double sum = 0.0; + for (int p = 0; p < nst; ++p) + { + double fac = 1.0; + if (reflect_mask[p] & 1) fac *= soa[0]; + if (reflect_mask[p] & 2) fac *= soa[1]; + if (reflect_mask[p] & 4) fac *= soa[2]; + sum += weights[p] * fac * f[offsets[p]]; + } + data[out_base + (int)vi] = sum; + shell_note_pack(3, true); + } + return true; +} + +bool shell_pack2d_fast_all(double *data, int out_base, ShellPatch::pointstru *src, + const std::vector &vars, int ordn) +{ + const int *shape = src->Bg->shape; + const int *s = src->sind; + if (!s || !src->coef) + return false; + const int k0 = s[2] - 1; + if (s[0] < 0 || s[1] < 0 || + s[0] + ordn > shape[0] || + s[1] + ordn > shape[1] || + k0 < 0 || k0 >= shape[2]) + return false; + + const int nst = ordn * ordn; + if (ordn <= 0 || nst > 144) + return false; + double weights[144]; + int offsets[144]; + const double *cx = src->coef; + const double *cy = src->coef + ordn; + int q = 0; + const int zk = k0 * shape[0] * shape[1]; + for (int j = 0; j < ordn; ++j) + { + const int yj = (s[1] + j) * shape[0]; + for (int i = 0; i < ordn; ++i) + { + weights[q] = cx[i] * cy[j]; + offsets[q] = s[0] + i + yj + zk; + ++q; + } + } + + for (size_t vi = 0; vi < vars.size(); ++vi) + { + const double *f = src->Bg->fgfs[vars[vi]->sgfn]; + double sum = 0.0; + for (int p = 0; p < nst; ++p) + sum += weights[p] * f[offsets[p]]; + data[out_base + (int)vi] = sum; + shell_note_pack(2, true); + } + return true; +} + +bool shell_pack1d_fast_all(double *data, int out_base, ShellPatch::pointstru *src, + const std::vector &vars, int ordn) +{ + const int *shape = src->Bg->shape; + const int *s = src->sind; + if (!s || !src->coef) + return false; + if (ordn <= 0 || ordn > 12) + return false; + + int offsets[12]; + unsigned char reflected[12]; + if (src->dumyd == 1) + { + if (s[0] < -ordn || s[0] + ordn - 1 > shape[0] + ordn - 1 || + s[1] < 0 || s[1] >= shape[1] || + s[2] < 0 || s[2] >= shape[2]) return false; - shell_note_pack(DIMh, true); + const int base = s[1] * shape[0] + s[2] * shape[0] * shape[1]; + for (int i = 0; i < ordn; ++i) + { + int mi = 0; + bool ri = false; + if (!shell_reflect_index(s[0] + i, shape[0], ordn, mi, ri)) + return false; + offsets[i] = mi + base; + reflected[i] = ri ? 1 : 0; + } + } + else if (src->dumyd == 0) + { + if (s[0] < -ordn || s[0] + ordn - 1 > shape[1] + ordn - 1 || + s[1] < 0 || s[1] >= shape[0] || + s[2] < 0 || s[2] >= shape[2]) + return false; + const int base = s[1] + s[2] * shape[0] * shape[1]; + for (int j = 0; j < ordn; ++j) + { + int mj = 0; + bool rj = false; + if (!shell_reflect_index(s[0] + j, shape[1], ordn, mj, rj)) + return false; + offsets[j] = base + mj * shape[0]; + reflected[j] = rj ? 1 : 0; + } + } + else + return false; + + for (size_t vi = 0; vi < vars.size(); ++vi) + { + const double *f = src->Bg->fgfs[vars[vi]->sgfn]; + const double soa = shell_symmetry_soa1(vars[vi], src->ssst, src->dumyd); + double sum = 0.0; + for (int p = 0; p < ordn; ++p) + sum += src->coef[p] * (reflected[p] ? soa : 1.0) * f[offsets[p]]; + data[out_base + (int)vi] = sum; + shell_note_pack(1, true); } return true; } @@ -322,6 +605,243 @@ struct ShellPointPair ShellPatch::pointstru *dst; }; +struct ShellActiveCache +{ + MyList *src = 0; + MyList *dst = 0; + MyList *var_src = 0; + MyList *var_dst = 0; + int rank_in = -1; + int dir = -1; + int myrank = -1; + bool valid = false; + int ordn = -1; + std::vector src_vars; + std::vector dst_vars; + std::vector active_points; + std::vector fast_points; + std::vector dst_indices; +}; + +std::vector shell_active_caches; + +void shell_prepare_interp_coeffs(ShellPatch::pointstru *pt, int ordn); + +bool shell_active_cache_matches(MyList *src, + MyList *dst, + int rank_in, + int dir, + MyList *var_src, + MyList *var_dst, + int myrank, + int ordn, + const ShellActiveCache &cache) +{ + return cache.valid && + cache.src == src && + cache.dst == dst && + cache.rank_in == rank_in && + cache.dir == dir && + cache.var_src == var_src && + cache.var_dst == var_dst && + cache.myrank == myrank && + cache.ordn == ordn; +} + +ShellActiveCache &shell_get_active_cache(MyList *src, + MyList *dst, + int rank_in, + int dir, + MyList *VarLists, + MyList *VarListd, + int myrank, + int ordn) +{ + for (size_t c = 0; c < shell_active_caches.size(); ++c) + if (shell_active_cache_matches(src, dst, rank_in, dir, VarLists, VarListd, myrank, ordn, shell_active_caches[c])) + return shell_active_caches[c]; + + shell_active_caches.push_back(ShellActiveCache()); + ShellActiveCache &cache = shell_active_caches.back(); + cache.src = src; + cache.dst = dst; + cache.var_src = VarLists; + cache.var_dst = VarListd; + cache.rank_in = rank_in; + cache.dir = dir; + cache.myrank = myrank; + cache.ordn = ordn; + cache.valid = true; + + for (MyList *varls = VarLists, *varld = VarListd; + varls && varld; + varls = varls->next, varld = varld->next) + { + cache.src_vars.push_back(varls->data); + cache.dst_vars.push_back(varld->data); + } + + MyList *src_scan = src; + MyList *dst_scan = dst; + while (src_scan && dst_scan) + { + if ((dir == PACK && dst_scan->data->Bg->rank == rank_in && src_scan->data->Bg->rank == myrank) || + (dir == UNPACK && src_scan->data->Bg->rank == rank_in && dst_scan->data->Bg->rank == myrank)) + { + ShellPointPair pair; + pair.src = src_scan->data; + pair.dst = dst_scan->data; + cache.active_points.push_back(pair); + } + src_scan = src_scan->next; + dst_scan = dst_scan->next; + } + + cache.fast_points.assign(cache.active_points.size(), 0); + cache.dst_indices.assign(cache.active_points.size(), -1); + if (dir == PACK && shell_fast_interp_enabled()) + { + for (size_t p = 0; p < cache.active_points.size(); ++p) + { + shell_prepare_interp_coeffs(cache.active_points[p].src, ordn); + cache.fast_points[p] = shell_interp_fast_possible(cache.active_points[p].src, ordn) ? 1 : 0; + } + } + else if (dir == UNPACK) + { + for (size_t p = 0; p < cache.active_points.size(); ++p) + { + int idx = -1; + if (shell_pointcopy_index(cache.active_points[p].dst, idx)) + { + cache.fast_points[p] = 1; + cache.dst_indices[p] = idx; + } + } + } + + return cache; +} + +bool shell_pack_fast_only(double *data, int out_base, ShellPatch::pointstru *src, + const std::vector &vars, int ordn) +{ + const int DIMh = (src->dumyd == -1) ? dim : 1; + if (DIMh == 3) + return shell_pack3d_fast_all(data, out_base, src, vars, ordn); + if (DIMh == 2) + return shell_pack2d_fast_all(data, out_base, src, vars, ordn); + if (DIMh == 1) + return shell_pack1d_fast_all(data, out_base, src, vars, ordn); + return false; +} + +bool shell_pack_cuda_fast_points(double *data, + const std::vector &active_points, + const std::vector &vars, + const std::vector &fast_points, + int ordn) +{ +#if USE_CUDA_BSSN + if (!shell_cuda_interp_enabled() || active_points.empty() || vars.empty()) + return false; + + std::vector point_to_active; + point_to_active.reserve(active_points.size()); + std::map block_index; + std::vector blocks; + + for (size_t p = 0; p < active_points.size(); ++p) + { + if (!fast_points[p]) + continue; + ShellPatch::pointstru *src = active_points[p].src; + const int DIMh = (src->dumyd == -1) ? dim : 1; + if (DIMh != 3 && DIMh != 1) + continue; + point_to_active.push_back((int)p); + if (block_index.find(src->Bg) == block_index.end()) + { + int next = (int)blocks.size(); + block_index[src->Bg] = next; + blocks.push_back(src->Bg); + } + } + + const int npoints = (int)point_to_active.size(); + const int nvars = (int)vars.size(); + const int nblocks = (int)blocks.size(); + if (npoints <= 0 || nvars <= 0 || nblocks <= 0) + return false; + + std::vector block_var_fields((size_t)nblocks * nvars, 0); + std::vector block_shapes((size_t)nblocks * 3, 0); + for (int b = 0; b < nblocks; ++b) + { + Block *bg = blocks[b]; + block_shapes[3 * b + 0] = bg->shape[0]; + block_shapes[3 * b + 1] = bg->shape[1]; + block_shapes[3 * b + 2] = bg->shape[2]; + for (int v = 0; v < nvars; ++v) + block_var_fields[(size_t)b * nvars + v] = bg->fgfs[vars[v]->sgfn]; + } + + std::vector point_block(npoints, 0); + std::vector point_dimh(npoints, 0); + std::vector point_dumyd(npoints, -1); + std::vector point_sind((size_t)npoints * 3, 0); + std::vector point_coef((size_t)npoints * 3 * ordn, 0.0); + std::vector out((size_t)npoints * nvars, 0.0); + + for (int q = 0; q < npoints; ++q) + { + const int p = point_to_active[q]; + ShellPatch::pointstru *src = active_points[p].src; + const int DIMh = (src->dumyd == -1) ? dim : 1; + point_block[q] = block_index[src->Bg]; + point_dimh[q] = DIMh; + point_dumyd[q] = src->dumyd; + point_sind[(size_t)q * 3 + 0] = src->sind[0]; + point_sind[(size_t)q * 3 + 1] = src->sind[1]; + point_sind[(size_t)q * 3 + 2] = src->sind[2]; + const int coef_count = (DIMh == 3) ? 3 * ordn : ordn; + for (int c = 0; c < coef_count; ++c) + point_coef[(size_t)q * 3 * ordn + c] = src->coef[c]; + } + + int rc = bssn_cuda_shell_pack_host_fields(npoints, nvars, nblocks, ordn, + block_var_fields.data(), + block_shapes.data(), + point_block.data(), + point_dimh.data(), + point_dumyd.data(), + point_sind.data(), + point_coef.data(), + out.data()); + if (rc != 0) + return false; + + for (int q = 0; q < npoints; ++q) + { + const int p = point_to_active[q]; + const int base = p * nvars; + for (int v = 0; v < nvars; ++v) + { + data[base + v] = out[(size_t)q * nvars + v]; + shell_note_pack(point_dimh[q], true); + } + } + return true; +#else + (void)data; + (void)active_points; + (void)vars; + (void)fast_points; + (void)ordn; + return false; +#endif +} + void shell_prepare_interp_coeffs(ShellPatch::pointstru *pt, int ordn) { if (pt->coef) @@ -2971,17 +3491,32 @@ double *ShellPatch::Collect_Data(ss_patch *PP, var *VP) return databuffer; } -void ShellPatch::intertransfer(MyList **src, MyList **dst, - MyList *VarList1 /* source */, MyList *VarList2 /*target */, - int Symmetry) -{ +void ShellPatch::intertransfer(MyList **src, MyList **dst, + MyList *VarList1 /* source */, MyList *VarList2 /*target */, + int Symmetry) +{ int myrank, cpusize; MPI_Comm_size(MPI_COMM_WORLD, &cpusize); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - int node; - - MPI_Request *reqs; + int node; + const bool transfer_timing = shell_transfer_timing_enabled(); + double t_query = 0.0; + double t_pack = 0.0; + double t_post = 0.0; + double t_wait = 0.0; + double t_unpack = 0.0; + long long send_values = 0; + long long recv_values = 0; + double tt = 0.0; + +#if USE_CUDA_BSSN + const bool use_shell_cuda_pack = shell_cuda_interp_enabled(); + if (use_shell_cuda_pack) + bssn_cuda_shell_pack_cache_begin(); +#endif + + MPI_Request *reqs; MPI_Status *stats; reqs = new MPI_Request[2 * cpusize]; stats = new MPI_Status[2 * cpusize]; @@ -2995,55 +3530,109 @@ void ShellPatch::intertransfer(MyList **src, MyList **dst, for (node = 0; node < cpusize; node++) { send_data[node] = rec_data[node] = 0; - if (node == myrank) - { - if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) - { - rec_data[node] = new double[length]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 1" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - interdata_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - } - } - else - { - // send from this cpu to cpu#node - if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) - { - send_data[node] = new double[length]; - if (!send_data[node]) - { - cout << "out of memory when new in short transfer, place 2" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - interdata_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); - } - // receive from cpu#node to this cpu - if (length = interdata_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry)) - { - rec_data[node] = new double[length]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 3" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); - } - } - } - // wait for all requests to complete - MPI_Waitall(req_no, reqs, stats); - - for (node = 0; node < cpusize; node++) - if (rec_data[node]) - interdata_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - - for (node = 0; node < cpusize; node++) - { + if (node == myrank) + { + if (transfer_timing) tt = MPI_Wtime(); + if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) + { + if (transfer_timing) t_query += MPI_Wtime() - tt; + rec_data[node] = new double[length]; + if (!rec_data[node]) + { + cout << "out of memory when new in short transfer, place 1" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + recv_values += length; + if (transfer_timing) tt = MPI_Wtime(); + interdata_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + if (transfer_timing) t_pack += MPI_Wtime() - tt; + } + else if (transfer_timing) + t_query += MPI_Wtime() - tt; + } + else + { + // send from this cpu to cpu#node + if (transfer_timing) tt = MPI_Wtime(); + if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) + { + if (transfer_timing) t_query += MPI_Wtime() - tt; + send_data[node] = new double[length]; + if (!send_data[node]) + { + cout << "out of memory when new in short transfer, place 2" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + send_values += length; + if (transfer_timing) tt = MPI_Wtime(); + interdata_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + if (transfer_timing) t_pack += MPI_Wtime() - tt; + if (transfer_timing) tt = MPI_Wtime(); + MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); + if (transfer_timing) t_post += MPI_Wtime() - tt; + } + else if (transfer_timing) + t_query += MPI_Wtime() - tt; + // receive from cpu#node to this cpu + if (transfer_timing) tt = MPI_Wtime(); + if (length = interdata_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry)) + { + if (transfer_timing) t_query += MPI_Wtime() - tt; + rec_data[node] = new double[length]; + if (!rec_data[node]) + { + cout << "out of memory when new in short transfer, place 3" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + recv_values += length; + if (transfer_timing) tt = MPI_Wtime(); + MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); + if (transfer_timing) t_post += MPI_Wtime() - tt; + } + else if (transfer_timing) + t_query += MPI_Wtime() - tt; + } + } + // wait for all requests to complete + if (transfer_timing) tt = MPI_Wtime(); + MPI_Waitall(req_no, reqs, stats); + if (transfer_timing) t_wait += MPI_Wtime() - tt; + +#if USE_CUDA_BSSN + if (use_shell_cuda_pack) + bssn_cuda_shell_pack_cache_end(); +#endif + + for (node = 0; node < cpusize; node++) + if (rec_data[node]) + { + if (transfer_timing) tt = MPI_Wtime(); + interdata_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + if (transfer_timing) t_unpack += MPI_Wtime() - tt; + } + + if (transfer_timing) + { + double local[5] = {t_query, t_pack, t_post, t_wait, t_unpack}; + double maxv[5] = {}; + long long counts[2] = {send_values, recv_values}; + long long max_counts[2] = {}; + MPI_Reduce(local, maxv, 5, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(counts, max_counts, 2, MPI_LONG_LONG, MPI_MAX, 0, MPI_COMM_WORLD); + if (myrank == 0) + cout << "[AMSS-SHELL-TRANSFER] vars=" << shell_var_count(VarList1) + << " query=" << maxv[0] + << " pack=" << maxv[1] + << " post=" << maxv[2] + << " wait=" << maxv[3] + << " unpack=" << maxv[4] + << " send_values=" << max_counts[0] + << " recv_values=" << max_counts[1] + << endl; + } + + for (node = 0; node < cpusize; node++) + { if (send_data[node]) delete[] send_data[node]; if (rec_data[node]) @@ -3093,32 +3682,14 @@ int ShellPatch::interdata_packer(double *data, MyList *src, MyList src_vars; - std::vector dst_vars; - for (varls = VarLists, varld = VarListd; varls && varld; varls = varls->next, varld = varld->next) - { - src_vars.push_back(varls->data); - dst_vars.push_back(varld->data); - } - - std::vector active_points; - MyList *src_scan = src; - MyList *dst_scan = dst; - while (src_scan && dst_scan) - { - if ((dir == PACK && dst_scan->data->Bg->rank == rank_in && src_scan->data->Bg->rank == myrank) || - (dir == UNPACK && src_scan->data->Bg->rank == rank_in && dst_scan->data->Bg->rank == myrank)) - { - ShellPointPair pair; - pair.src = src_scan->data; - pair.dst = dst_scan->data; - active_points.push_back(pair); - } - src_scan = src_scan->next; - dst_scan = dst_scan->next; - } + ShellActiveCache &active_cache = shell_get_active_cache(src, dst, rank_in, dir, VarLists, VarListd, myrank, ordn); + const std::vector &src_vars = active_cache.src_vars; + const std::vector &dst_vars = active_cache.dst_vars; + const std::vector &active_points = active_cache.active_points; + const std::vector &fast_points = active_cache.fast_points; + const std::vector &dst_indices = active_cache.dst_indices; if (!data) return (int)(active_points.size() * src_vars.size()); @@ -3127,46 +3698,38 @@ int ShellPatch::interdata_packer(double *data, MyList *src, MyList fast_points(active_points.size(), 0); - if (dir == PACK && shell_fast_interp_enabled()) - { - for (size_t p = 0; p < active_points.size(); ++p) - { - shell_prepare_interp_coeffs(active_points[p].src, ordn); - fast_points[p] = shell_interp_fast_possible(active_points[p].src, ordn) ? 1 : 0; - } - } - else if (dir == UNPACK) - { - for (size_t p = 0; p < active_points.size(); ++p) - fast_points[p] = shell_pointcopy_possible(active_points[p].dst) ? 1 : 0; - } + bool cuda_pack_done = false; + if (dir == PACK) + cuda_pack_done = shell_pack_cuda_fast_points(data, active_points, src_vars, fast_points, ordn); std::vector workers; workers.reserve(nthreads); - for (int tid = 0; tid < nthreads; ++tid) + if (!cuda_pack_done) { - const int begin = (int)((long long)active_points.size() * tid / nthreads); - const int end = (int)((long long)active_points.size() * (tid + 1) / nthreads); - workers.push_back(std::thread([&, begin, end]() { - for (int p = begin; p < end; ++p) - { - const int base = p * nvar; - if (!fast_points[p]) - continue; - if (dir == PACK) - shell_pack_fast_only(data, base, active_points[p].src, src_vars, ordn); - else + for (int tid = 0; tid < nthreads; ++tid) + { + const int begin = (int)((long long)active_points.size() * tid / nthreads); + const int end = (int)((long long)active_points.size() * (tid + 1) / nthreads); + workers.push_back(std::thread([&, begin, end]() { + for (int p = begin; p < end; ++p) { - for (int vi = 0; vi < nvar; ++vi) - shell_pointcopy_fast(active_points[p].dst, dst_vars[vi], data[base + vi]); + const int base = p * nvar; + if (!fast_points[p]) + continue; + if (dir == PACK) + shell_pack_fast_only(data, base, active_points[p].src, src_vars, ordn); + else + { + for (int vi = 0; vi < nvar; ++vi) + active_points[p].dst->Bg->fgfs[dst_vars[vi]->sgfn][dst_indices[p]] = data[base + vi]; + } } - } - })); + })); + } + for (size_t t = 0; t < workers.size(); ++t) + workers[t].join(); } - for (size_t t = 0; t < workers.size(); ++t) - workers[t].join(); for (size_t p = 0; p < active_points.size(); ++p) { diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 52b4303..6313e5f 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -9463,6 +9463,197 @@ int bssn_cuda_interp_host_two_fields(void *block_tag, return 0; } +__global__ void kern_shell_pack_host_fields(double **fields, + const int *block_shapes, + const int *point_block, + const int *point_dimh, + const int *point_dumyd, + const int *point_sind, + const double *point_coef, + double *out, + int npoints, + int nvars, + int ordn) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const int total = npoints * nvars; + if (tid >= total) return; + + const int p = tid / nvars; + const int v = tid - p * nvars; + const int b = point_block[p]; + const int *shape = block_shapes + 3 * b; + const int *s = point_sind + 3 * p; + const double *coef = point_coef + 3 * ordn * p; + const double *f = fields[b * nvars + v]; + + const int nx = shape[0]; + const int ny = shape[1]; + const int nz = shape[2]; + const int dimh = point_dimh[p]; + const int dumyd = point_dumyd[p]; + double sum = 0.0; + + if (dimh == 3) { + const double *cx = coef; + const double *cy = coef + ordn; + const double *cz = coef + 2 * ordn; + for (int kk = 0; kk < ordn; ++kk) + for (int jj = 0; jj < ordn; ++jj) + for (int ii = 0; ii < ordn; ++ii) { + const int idx = (s[0] + ii) + nx * ((s[1] + jj) + ny * (s[2] + kk)); + sum += cx[ii] * cy[jj] * cz[kk] * f[idx]; + } + } else if (dimh == 1 && dumyd == 1) { + for (int ii = 0; ii < ordn; ++ii) { + const int idx = (s[0] + ii) + nx * (s[1] + ny * s[2]); + sum += coef[ii] * f[idx]; + } + } else if (dimh == 1 && dumyd == 0) { + for (int jj = 0; jj < ordn; ++jj) { + const int idx = s[1] + nx * ((s[0] + jj) + ny * s[2]); + sum += coef[jj] * f[idx]; + } + } + + out[tid] = sum; +} + +struct ShellPackCachedField { + double *device; + size_t bytes; + int generation; +}; + +static std::unordered_map g_shell_pack_cache; +static int g_shell_pack_generation = 0; + +extern "C" +void bssn_cuda_shell_pack_cache_begin() +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + for (auto &kv : g_shell_pack_cache) + cudaFree(kv.second.device); + g_shell_pack_cache.clear(); + ++g_shell_pack_generation; +} + +extern "C" +void bssn_cuda_shell_pack_cache_end() +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + for (auto &kv : g_shell_pack_cache) + cudaFree(kv.second.device); + g_shell_pack_cache.clear(); +} + +extern "C" +int bssn_cuda_shell_pack_host_fields(int npoints, + int nvars, + int nblocks, + int ordn, + double **block_var_fields, + int *block_shapes, + int *point_block, + int *point_dimh, + int *point_dumyd, + int *point_sind, + double *point_coef, + double *out) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (npoints <= 0 || nvars <= 0 || nblocks <= 0 || ordn <= 0 || ordn > 8 || + !block_var_fields || !block_shapes || !point_block || !point_dimh || + !point_dumyd || !point_sind || !point_coef || !out) + return 1; + + const int field_count = nblocks * nvars; + std::vector h_device_fields((size_t)field_count, nullptr); + double **d_fields = nullptr; + int *d_block_shapes = nullptr; + int *d_point_block = nullptr; + int *d_point_dimh = nullptr; + int *d_point_dumyd = nullptr; + int *d_point_sind = nullptr; + double *d_point_coef = nullptr; + double *d_out = nullptr; + + for (int b = 0; b < nblocks; ++b) { + const size_t all = (size_t)block_shapes[3 * b] * + (size_t)block_shapes[3 * b + 1] * + (size_t)block_shapes[3 * b + 2]; + const size_t bytes = all * sizeof(double); + for (int v = 0; v < nvars; ++v) { + const int idx = b * nvars + v; + double *host_ptr = block_var_fields[idx]; + if (!host_ptr) return 1; + auto it = g_shell_pack_cache.find(host_ptr); + if (it != g_shell_pack_cache.end() && + it->second.bytes == bytes && + it->second.generation == g_shell_pack_generation) { + h_device_fields[idx] = it->second.device; + } else { + double *device_ptr = nullptr; + CUDA_CHECK(cudaMalloc(&device_ptr, bytes)); + CUDA_CHECK(cudaMemcpy(device_ptr, host_ptr, bytes, cudaMemcpyHostToDevice)); + g_shell_pack_cache[host_ptr] = {device_ptr, bytes, g_shell_pack_generation}; + h_device_fields[idx] = device_ptr; + } + } + } + + CUDA_CHECK(cudaMalloc(&d_fields, (size_t)field_count * sizeof(double *))); + CUDA_CHECK(cudaMemcpy(d_fields, h_device_fields.data(), + (size_t)field_count * sizeof(double *), + cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc(&d_block_shapes, (size_t)nblocks * 3 * sizeof(int))); + CUDA_CHECK(cudaMemcpy(d_block_shapes, block_shapes, + (size_t)nblocks * 3 * sizeof(int), + cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc(&d_point_block, (size_t)npoints * sizeof(int))); + CUDA_CHECK(cudaMalloc(&d_point_dimh, (size_t)npoints * sizeof(int))); + CUDA_CHECK(cudaMalloc(&d_point_dumyd, (size_t)npoints * sizeof(int))); + CUDA_CHECK(cudaMalloc(&d_point_sind, (size_t)npoints * 3 * sizeof(int))); + CUDA_CHECK(cudaMalloc(&d_point_coef, (size_t)npoints * 3 * ordn * sizeof(double))); + CUDA_CHECK(cudaMalloc(&d_out, (size_t)npoints * nvars * sizeof(double))); + + CUDA_CHECK(cudaMemcpy(d_point_block, point_block, + (size_t)npoints * sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_point_dimh, point_dimh, + (size_t)npoints * sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_point_dumyd, point_dumyd, + (size_t)npoints * sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_point_sind, point_sind, + (size_t)npoints * 3 * sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_point_coef, point_coef, + (size_t)npoints * 3 * ordn * sizeof(double), + cudaMemcpyHostToDevice)); + + const int total = npoints * nvars; + const int threads = 256; + const int blocks = (total + threads - 1) / threads; + kern_shell_pack_host_fields<<>>( + d_fields, d_block_shapes, d_point_block, d_point_dimh, + d_point_dumyd, d_point_sind, d_point_coef, d_out, + npoints, nvars, ordn); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaMemcpy(out, d_out, (size_t)total * sizeof(double), + cudaMemcpyDeviceToHost)); + + cudaFree(d_out); + cudaFree(d_point_coef); + cudaFree(d_point_sind); + cudaFree(d_point_dumyd); + cudaFree(d_point_dimh); + cudaFree(d_point_block); + cudaFree(d_block_shapes); + cudaFree(d_fields); + return 0; +} + extern "C" int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag, int state_index, diff --git a/makefile_and_run.py b/makefile_and_run.py index d2f8ec4..387c3dd 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -162,6 +162,7 @@ def _gpu_runtime_env(): "AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1", "AMSS_SHELL_FAST_INTERP": "0", "AMSS_SHELL_PARALLEL_INTERP": "0", + "AMSS_SHELL_CUDA_INTERP": "0", } if finite_difference in ("2nd-order", "8th-order"): defaults.update({ @@ -348,6 +349,7 @@ def run_ABE(): print(f" AMSS_CUDA_UNCACHED_DEVICE_BUFFERS={mpi_env.get('AMSS_CUDA_UNCACHED_DEVICE_BUFFERS', '')}") print(f" AMSS_SHELL_FAST_INTERP={mpi_env.get('AMSS_SHELL_FAST_INTERP', '')}") print(f" AMSS_SHELL_PARALLEL_INTERP={mpi_env.get('AMSS_SHELL_PARALLEL_INTERP', '')}") + print(f" AMSS_SHELL_CUDA_INTERP={mpi_env.get('AMSS_SHELL_CUDA_INTERP', '')}") print(f" AMSS_SHELL_INTERP_THREADS={mpi_env.get('AMSS_SHELL_INTERP_THREADS', '')}") print(f" AMSS_Z4C_CUDA_RESIDENT={mpi_env.get('AMSS_Z4C_CUDA_RESIDENT', '')}") print(f" AMSS_CONSTRAINT_OUT_EVERY={mpi_env.get('AMSS_CONSTRAINT_OUT_EVERY', '')}")