Accelerate Shell-Patch interpolation fast paths

This commit is contained in:
2026-05-08 13:26:16 +08:00
parent 063f28b3b4
commit 39450228f5
3 changed files with 906 additions and 150 deletions

View File

@@ -18,9 +18,26 @@ using namespace std;
#include "fmisc.h"
#include "misc.h"
#include "shellfunctions.h"
#include "parameters.h"
#define PI M_PI
#include "parameters.h"
#define PI M_PI
#if USE_CUDA_BSSN
extern "C" int bssn_cuda_shell_pack_host_fields(int npoints,
int nvars,
int nblocks,
int ordn,
double **block_var_fields,
int *block_shapes,
int *point_block,
int *point_dimh,
int *point_dumyd,
int *point_sind,
double *point_coef,
double *out);
extern "C" void bssn_cuda_shell_pack_cache_begin();
extern "C" void bssn_cuda_shell_pack_cache_end();
#endif
// x x x x x o *
// * o x x x x x
@@ -52,6 +69,21 @@ bool shell_parallel_interp_enabled()
return enabled != 0;
}
bool shell_cuda_interp_enabled()
{
#if USE_CUDA_BSSN
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_SHELL_CUDA_INTERP");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
#else
return false;
#endif
}
bool shell_interp_stats_enabled()
{
static int enabled = -1;
@@ -63,6 +95,17 @@ bool shell_interp_stats_enabled()
return enabled != 0;
}
bool shell_transfer_timing_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_SHELL_TRANSFER_TIMING");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
int shell_interp_threads()
{
static int threads = -1;
@@ -81,6 +124,17 @@ int shell_interp_threads()
return threads;
}
int shell_var_count(MyList<var> *VarList)
{
int count = 0;
while (VarList)
{
++count;
VarList = VarList->next;
}
return count;
}
struct ShellInterpStats
{
long long pack_dim3 = 0;
@@ -142,6 +196,70 @@ inline int shell_idx3(const int *shape, int i, int j, int k)
return i + shape[0] * (j + shape[1] * k);
}
bool shell_reflect_index(int idx, int n, int ordn, int &mapped, bool &reflected)
{
reflected = false;
if (idx >= 0 && idx < n)
{
mapped = idx;
return true;
}
if (idx < 0)
{
if (idx < -ordn)
return false;
mapped = -idx;
reflected = true;
return mapped >= 0 && mapped < n;
}
if (idx <= n + ordn - 1)
{
mapped = 2 * n - 2 - idx;
reflected = true;
return mapped >= 0 && mapped < n;
}
return false;
}
void shell_symmetry_soa3(const var *vp, int sst, double soa[3])
{
if (sst == 2 || sst == 3)
{
soa[0] = vp->SoA[1];
soa[1] = vp->SoA[2];
soa[2] = 0.0;
}
else if (sst == 4 || sst == 5)
{
soa[0] = vp->SoA[0];
soa[1] = vp->SoA[2];
soa[2] = 0.0;
}
else
{
soa[0] = vp->SoA[0];
soa[1] = vp->SoA[1];
soa[2] = (sst == -1) ? vp->SoA[2] : 0.0;
}
}
double shell_symmetry_soa1(const var *vp, int sst, int dumyd)
{
if (dumyd == 1)
{
if (sst == 2 || sst == 3)
return vp->SoA[1];
return vp->SoA[0];
}
if (dumyd == 0)
{
if (sst == 0 || sst == 1)
return vp->SoA[1];
return vp->SoA[2];
}
return 1.0;
}
bool shell_interp3d_fast(const ShellPatch::pointstru *pt, const var *vp, double &out, int ordn)
{
const int *shape = pt->Bg->shape;
@@ -235,19 +353,24 @@ bool shell_interp_fast_possible(const ShellPatch::pointstru *pt, int ordn)
return false;
if (DIMh == 3)
return s[0] >= 0 && s[1] >= 0 && s[2] >= 0 &&
s[0] + ordn <= shape[0] &&
s[1] + ordn <= shape[1] &&
s[2] + ordn <= shape[2];
{
if (s[0] < -ordn || s[0] + ordn - 1 > shape[0] + ordn - 1 ||
s[1] < -ordn || s[1] + ordn - 1 > shape[1] + ordn - 1 ||
s[2] < -ordn || s[2] + ordn - 1 > shape[2] + ordn - 1)
return false;
if (pt->ssst != -1 && (s[2] < 0 || s[2] + ordn > shape[2]))
return false;
return true;
}
if (DIMh == 1)
{
if (pt->dumyd == 1)
return s[0] >= 0 && s[0] + ordn <= shape[0] &&
return s[0] >= -ordn && s[0] + ordn - 1 <= shape[0] + ordn - 1 &&
s[1] >= 0 && s[1] < shape[1] &&
s[2] >= 0 && s[2] < shape[2];
if (pt->dumyd == 0)
return s[0] >= 0 && s[0] + ordn <= shape[1] &&
return s[0] >= -ordn && s[0] + ordn - 1 <= shape[1] + ordn - 1 &&
s[1] >= 0 && s[1] < shape[0] &&
s[2] >= 0 && s[2] < shape[2];
}
@@ -255,7 +378,18 @@ bool shell_interp_fast_possible(const ShellPatch::pointstru *pt, int ordn)
return false;
}
bool shell_pointcopy_index(const ShellPatch::pointstru *pt, int &idx);
bool shell_pointcopy_fast(const ShellPatch::pointstru *pt, const var *vp, double value)
{
int idx = -1;
if (!shell_pointcopy_index(pt, idx))
return false;
pt->Bg->fgfs[vp->sgfn][idx] = value;
return true;
}
bool shell_pointcopy_index(const ShellPatch::pointstru *pt, int &idx)
{
Block *bg = pt->Bg;
const int *shape = bg->shape;
@@ -268,50 +402,199 @@ bool shell_pointcopy_fast(const ShellPatch::pointstru *pt, const var *vp, double
#else
h[d] = (bg->bbox[dim + d] - bg->bbox[d]) / shape[d];
#endif
int i = int((pt->lpox[0] - bg->bbox[0]) / h[0] + 0.4);
int j = int((pt->lpox[1] - bg->bbox[1]) / h[1] + 0.4);
int k = int((pt->lpox[2] - bg->bbox[2]) / h[2] + 0.4);
if (i < 0 || i >= shape[0] || j < 0 || j >= shape[1] || k < 0 || k >= shape[2])
return false;
bg->fgfs[vp->sgfn][shell_idx3(shape, i, j, k)] = value;
idx = shell_idx3(shape, i, j, k);
return true;
}
bool shell_pointcopy_possible(const ShellPatch::pointstru *pt)
{
Block *bg = pt->Bg;
const int *shape = bg->shape;
if (shape[0] <= 1 || shape[1] <= 1 || shape[2] <= 1)
return false;
double h[3];
for (int d = 0; d < dim; ++d)
#ifdef Vertex
h[d] = (bg->bbox[dim + d] - bg->bbox[d]) / (shape[d] - 1);
#else
h[d] = (bg->bbox[dim + d] - bg->bbox[d]) / shape[d];
#endif
int i = int((pt->lpox[0] - bg->bbox[0]) / h[0] + 0.4);
int j = int((pt->lpox[1] - bg->bbox[1]) / h[1] + 0.4);
int k = int((pt->lpox[2] - bg->bbox[2]) / h[2] + 0.4);
return i >= 0 && i < shape[0] && j >= 0 && j < shape[1] && k >= 0 && k < shape[2];
int idx = -1;
return shell_pointcopy_index(pt, idx);
}
bool shell_pack_fast_only(double *data, int out_base, ShellPatch::pointstru *src,
const std::vector<var *> &vars, int ordn)
bool shell_pack3d_fast_all(double *data, int out_base, ShellPatch::pointstru *src,
const std::vector<var *> &vars, int ordn)
{
const int DIMh = (src->dumyd == -1) ? dim : 1;
const int *shape = src->Bg->shape;
const int *s = src->sind;
if (!s || !src->coef)
return false;
if (s[0] < -ordn || s[0] + ordn - 1 > shape[0] + ordn - 1 ||
s[1] < -ordn || s[1] + ordn - 1 > shape[1] + ordn - 1 ||
s[2] < -ordn || s[2] + ordn - 1 > shape[2] + ordn - 1)
return false;
if (src->ssst != -1 && (s[2] < 0 || s[2] + ordn > shape[2]))
return false;
const int nst = ordn * ordn * ordn;
if (ordn <= 0 || nst > 1728)
return false;
double weights[1728];
int offsets[1728];
unsigned char reflect_mask[1728];
const double *cx = src->coef;
const double *cy = src->coef + ordn;
const double *cz = src->coef + 2 * ordn;
int q = 0;
for (int k = 0; k < ordn; ++k)
{
int mk = 0;
bool rk = false;
if (!shell_reflect_index(s[2] + k, shape[2], ordn, mk, rk))
return false;
const double wz = cz[k];
const int zk = mk * shape[0] * shape[1];
for (int j = 0; j < ordn; ++j)
{
int mj = 0;
bool rj = false;
if (!shell_reflect_index(s[1] + j, shape[1], ordn, mj, rj))
return false;
const double wyz = cy[j] * wz;
const int yj = mj * shape[0];
for (int i = 0; i < ordn; ++i)
{
int mi = 0;
bool ri = false;
if (!shell_reflect_index(s[0] + i, shape[0], ordn, mi, ri))
return false;
weights[q] = cx[i] * wyz;
offsets[q] = mi + yj + zk;
reflect_mask[q] = (ri ? 1 : 0) | (rj ? 2 : 0) | (rk ? 4 : 0);
++q;
}
}
}
for (size_t vi = 0; vi < vars.size(); ++vi)
{
double &out = data[out_base + (int)vi];
bool ok = false;
if (DIMh == 3)
ok = shell_interp3d_fast(src, vars[vi], out, ordn);
else if (DIMh == 1)
ok = shell_interp1d_fast(src, vars[vi], out, ordn);
if (!ok)
const double *f = src->Bg->fgfs[vars[vi]->sgfn];
double soa[3];
shell_symmetry_soa3(vars[vi], src->ssst, soa);
double sum = 0.0;
for (int p = 0; p < nst; ++p)
{
double fac = 1.0;
if (reflect_mask[p] & 1) fac *= soa[0];
if (reflect_mask[p] & 2) fac *= soa[1];
if (reflect_mask[p] & 4) fac *= soa[2];
sum += weights[p] * fac * f[offsets[p]];
}
data[out_base + (int)vi] = sum;
shell_note_pack(3, true);
}
return true;
}
bool shell_pack2d_fast_all(double *data, int out_base, ShellPatch::pointstru *src,
const std::vector<var *> &vars, int ordn)
{
const int *shape = src->Bg->shape;
const int *s = src->sind;
if (!s || !src->coef)
return false;
const int k0 = s[2] - 1;
if (s[0] < 0 || s[1] < 0 ||
s[0] + ordn > shape[0] ||
s[1] + ordn > shape[1] ||
k0 < 0 || k0 >= shape[2])
return false;
const int nst = ordn * ordn;
if (ordn <= 0 || nst > 144)
return false;
double weights[144];
int offsets[144];
const double *cx = src->coef;
const double *cy = src->coef + ordn;
int q = 0;
const int zk = k0 * shape[0] * shape[1];
for (int j = 0; j < ordn; ++j)
{
const int yj = (s[1] + j) * shape[0];
for (int i = 0; i < ordn; ++i)
{
weights[q] = cx[i] * cy[j];
offsets[q] = s[0] + i + yj + zk;
++q;
}
}
for (size_t vi = 0; vi < vars.size(); ++vi)
{
const double *f = src->Bg->fgfs[vars[vi]->sgfn];
double sum = 0.0;
for (int p = 0; p < nst; ++p)
sum += weights[p] * f[offsets[p]];
data[out_base + (int)vi] = sum;
shell_note_pack(2, true);
}
return true;
}
bool shell_pack1d_fast_all(double *data, int out_base, ShellPatch::pointstru *src,
const std::vector<var *> &vars, int ordn)
{
const int *shape = src->Bg->shape;
const int *s = src->sind;
if (!s || !src->coef)
return false;
if (ordn <= 0 || ordn > 12)
return false;
int offsets[12];
unsigned char reflected[12];
if (src->dumyd == 1)
{
if (s[0] < -ordn || s[0] + ordn - 1 > shape[0] + ordn - 1 ||
s[1] < 0 || s[1] >= shape[1] ||
s[2] < 0 || s[2] >= shape[2])
return false;
shell_note_pack(DIMh, true);
const int base = s[1] * shape[0] + s[2] * shape[0] * shape[1];
for (int i = 0; i < ordn; ++i)
{
int mi = 0;
bool ri = false;
if (!shell_reflect_index(s[0] + i, shape[0], ordn, mi, ri))
return false;
offsets[i] = mi + base;
reflected[i] = ri ? 1 : 0;
}
}
else if (src->dumyd == 0)
{
if (s[0] < -ordn || s[0] + ordn - 1 > shape[1] + ordn - 1 ||
s[1] < 0 || s[1] >= shape[0] ||
s[2] < 0 || s[2] >= shape[2])
return false;
const int base = s[1] + s[2] * shape[0] * shape[1];
for (int j = 0; j < ordn; ++j)
{
int mj = 0;
bool rj = false;
if (!shell_reflect_index(s[0] + j, shape[1], ordn, mj, rj))
return false;
offsets[j] = base + mj * shape[0];
reflected[j] = rj ? 1 : 0;
}
}
else
return false;
for (size_t vi = 0; vi < vars.size(); ++vi)
{
const double *f = src->Bg->fgfs[vars[vi]->sgfn];
const double soa = shell_symmetry_soa1(vars[vi], src->ssst, src->dumyd);
double sum = 0.0;
for (int p = 0; p < ordn; ++p)
sum += src->coef[p] * (reflected[p] ? soa : 1.0) * f[offsets[p]];
data[out_base + (int)vi] = sum;
shell_note_pack(1, true);
}
return true;
}
@@ -322,6 +605,243 @@ struct ShellPointPair
ShellPatch::pointstru *dst;
};
struct ShellActiveCache
{
MyList<ShellPatch::pointstru> *src = 0;
MyList<ShellPatch::pointstru> *dst = 0;
MyList<var> *var_src = 0;
MyList<var> *var_dst = 0;
int rank_in = -1;
int dir = -1;
int myrank = -1;
bool valid = false;
int ordn = -1;
std::vector<var *> src_vars;
std::vector<var *> dst_vars;
std::vector<ShellPointPair> active_points;
std::vector<char> fast_points;
std::vector<int> dst_indices;
};
std::vector<ShellActiveCache> shell_active_caches;
void shell_prepare_interp_coeffs(ShellPatch::pointstru *pt, int ordn);
bool shell_active_cache_matches(MyList<ShellPatch::pointstru> *src,
MyList<ShellPatch::pointstru> *dst,
int rank_in,
int dir,
MyList<var> *var_src,
MyList<var> *var_dst,
int myrank,
int ordn,
const ShellActiveCache &cache)
{
return cache.valid &&
cache.src == src &&
cache.dst == dst &&
cache.rank_in == rank_in &&
cache.dir == dir &&
cache.var_src == var_src &&
cache.var_dst == var_dst &&
cache.myrank == myrank &&
cache.ordn == ordn;
}
ShellActiveCache &shell_get_active_cache(MyList<ShellPatch::pointstru> *src,
MyList<ShellPatch::pointstru> *dst,
int rank_in,
int dir,
MyList<var> *VarLists,
MyList<var> *VarListd,
int myrank,
int ordn)
{
for (size_t c = 0; c < shell_active_caches.size(); ++c)
if (shell_active_cache_matches(src, dst, rank_in, dir, VarLists, VarListd, myrank, ordn, shell_active_caches[c]))
return shell_active_caches[c];
shell_active_caches.push_back(ShellActiveCache());
ShellActiveCache &cache = shell_active_caches.back();
cache.src = src;
cache.dst = dst;
cache.var_src = VarLists;
cache.var_dst = VarListd;
cache.rank_in = rank_in;
cache.dir = dir;
cache.myrank = myrank;
cache.ordn = ordn;
cache.valid = true;
for (MyList<var> *varls = VarLists, *varld = VarListd;
varls && varld;
varls = varls->next, varld = varld->next)
{
cache.src_vars.push_back(varls->data);
cache.dst_vars.push_back(varld->data);
}
MyList<ShellPatch::pointstru> *src_scan = src;
MyList<ShellPatch::pointstru> *dst_scan = dst;
while (src_scan && dst_scan)
{
if ((dir == PACK && dst_scan->data->Bg->rank == rank_in && src_scan->data->Bg->rank == myrank) ||
(dir == UNPACK && src_scan->data->Bg->rank == rank_in && dst_scan->data->Bg->rank == myrank))
{
ShellPointPair pair;
pair.src = src_scan->data;
pair.dst = dst_scan->data;
cache.active_points.push_back(pair);
}
src_scan = src_scan->next;
dst_scan = dst_scan->next;
}
cache.fast_points.assign(cache.active_points.size(), 0);
cache.dst_indices.assign(cache.active_points.size(), -1);
if (dir == PACK && shell_fast_interp_enabled())
{
for (size_t p = 0; p < cache.active_points.size(); ++p)
{
shell_prepare_interp_coeffs(cache.active_points[p].src, ordn);
cache.fast_points[p] = shell_interp_fast_possible(cache.active_points[p].src, ordn) ? 1 : 0;
}
}
else if (dir == UNPACK)
{
for (size_t p = 0; p < cache.active_points.size(); ++p)
{
int idx = -1;
if (shell_pointcopy_index(cache.active_points[p].dst, idx))
{
cache.fast_points[p] = 1;
cache.dst_indices[p] = idx;
}
}
}
return cache;
}
bool shell_pack_fast_only(double *data, int out_base, ShellPatch::pointstru *src,
const std::vector<var *> &vars, int ordn)
{
const int DIMh = (src->dumyd == -1) ? dim : 1;
if (DIMh == 3)
return shell_pack3d_fast_all(data, out_base, src, vars, ordn);
if (DIMh == 2)
return shell_pack2d_fast_all(data, out_base, src, vars, ordn);
if (DIMh == 1)
return shell_pack1d_fast_all(data, out_base, src, vars, ordn);
return false;
}
bool shell_pack_cuda_fast_points(double *data,
const std::vector<ShellPointPair> &active_points,
const std::vector<var *> &vars,
const std::vector<char> &fast_points,
int ordn)
{
#if USE_CUDA_BSSN
if (!shell_cuda_interp_enabled() || active_points.empty() || vars.empty())
return false;
std::vector<int> point_to_active;
point_to_active.reserve(active_points.size());
std::map<Block *, int> block_index;
std::vector<Block *> blocks;
for (size_t p = 0; p < active_points.size(); ++p)
{
if (!fast_points[p])
continue;
ShellPatch::pointstru *src = active_points[p].src;
const int DIMh = (src->dumyd == -1) ? dim : 1;
if (DIMh != 3 && DIMh != 1)
continue;
point_to_active.push_back((int)p);
if (block_index.find(src->Bg) == block_index.end())
{
int next = (int)blocks.size();
block_index[src->Bg] = next;
blocks.push_back(src->Bg);
}
}
const int npoints = (int)point_to_active.size();
const int nvars = (int)vars.size();
const int nblocks = (int)blocks.size();
if (npoints <= 0 || nvars <= 0 || nblocks <= 0)
return false;
std::vector<double *> block_var_fields((size_t)nblocks * nvars, 0);
std::vector<int> block_shapes((size_t)nblocks * 3, 0);
for (int b = 0; b < nblocks; ++b)
{
Block *bg = blocks[b];
block_shapes[3 * b + 0] = bg->shape[0];
block_shapes[3 * b + 1] = bg->shape[1];
block_shapes[3 * b + 2] = bg->shape[2];
for (int v = 0; v < nvars; ++v)
block_var_fields[(size_t)b * nvars + v] = bg->fgfs[vars[v]->sgfn];
}
std::vector<int> point_block(npoints, 0);
std::vector<int> point_dimh(npoints, 0);
std::vector<int> point_dumyd(npoints, -1);
std::vector<int> point_sind((size_t)npoints * 3, 0);
std::vector<double> point_coef((size_t)npoints * 3 * ordn, 0.0);
std::vector<double> out((size_t)npoints * nvars, 0.0);
for (int q = 0; q < npoints; ++q)
{
const int p = point_to_active[q];
ShellPatch::pointstru *src = active_points[p].src;
const int DIMh = (src->dumyd == -1) ? dim : 1;
point_block[q] = block_index[src->Bg];
point_dimh[q] = DIMh;
point_dumyd[q] = src->dumyd;
point_sind[(size_t)q * 3 + 0] = src->sind[0];
point_sind[(size_t)q * 3 + 1] = src->sind[1];
point_sind[(size_t)q * 3 + 2] = src->sind[2];
const int coef_count = (DIMh == 3) ? 3 * ordn : ordn;
for (int c = 0; c < coef_count; ++c)
point_coef[(size_t)q * 3 * ordn + c] = src->coef[c];
}
int rc = bssn_cuda_shell_pack_host_fields(npoints, nvars, nblocks, ordn,
block_var_fields.data(),
block_shapes.data(),
point_block.data(),
point_dimh.data(),
point_dumyd.data(),
point_sind.data(),
point_coef.data(),
out.data());
if (rc != 0)
return false;
for (int q = 0; q < npoints; ++q)
{
const int p = point_to_active[q];
const int base = p * nvars;
for (int v = 0; v < nvars; ++v)
{
data[base + v] = out[(size_t)q * nvars + v];
shell_note_pack(point_dimh[q], true);
}
}
return true;
#else
(void)data;
(void)active_points;
(void)vars;
(void)fast_points;
(void)ordn;
return false;
#endif
}
void shell_prepare_interp_coeffs(ShellPatch::pointstru *pt, int ordn)
{
if (pt->coef)
@@ -2971,17 +3491,32 @@ double *ShellPatch::Collect_Data(ss_patch *PP, var *VP)
return databuffer;
}
void ShellPatch::intertransfer(MyList<pointstru> **src, MyList<pointstru> **dst,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
int Symmetry)
{
void ShellPatch::intertransfer(MyList<pointstru> **src, MyList<pointstru> **dst,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
int Symmetry)
{
int myrank, cpusize;
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
int node;
MPI_Request *reqs;
int node;
const bool transfer_timing = shell_transfer_timing_enabled();
double t_query = 0.0;
double t_pack = 0.0;
double t_post = 0.0;
double t_wait = 0.0;
double t_unpack = 0.0;
long long send_values = 0;
long long recv_values = 0;
double tt = 0.0;
#if USE_CUDA_BSSN
const bool use_shell_cuda_pack = shell_cuda_interp_enabled();
if (use_shell_cuda_pack)
bssn_cuda_shell_pack_cache_begin();
#endif
MPI_Request *reqs;
MPI_Status *stats;
reqs = new MPI_Request[2 * cpusize];
stats = new MPI_Status[2 * cpusize];
@@ -2995,55 +3530,109 @@ void ShellPatch::intertransfer(MyList<pointstru> **src, MyList<pointstru> **dst,
for (node = 0; node < cpusize; node++)
{
send_data[node] = rec_data[node] = 0;
if (node == myrank)
{
if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
{
rec_data[node] = new double[length];
if (!rec_data[node])
{
cout << "out of memory when new in short transfer, place 1" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
interdata_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
}
}
else
{
// send from this cpu to cpu#node
if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
{
send_data[node] = new double[length];
if (!send_data[node])
{
cout << "out of memory when new in short transfer, place 2" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
interdata_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
}
// receive from cpu#node to this cpu
if (length = interdata_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
{
rec_data[node] = new double[length];
if (!rec_data[node])
{
cout << "out of memory when new in short transfer, place 3" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
}
}
}
// wait for all requests to complete
MPI_Waitall(req_no, reqs, stats);
for (node = 0; node < cpusize; node++)
if (rec_data[node])
interdata_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
for (node = 0; node < cpusize; node++)
{
if (node == myrank)
{
if (transfer_timing) tt = MPI_Wtime();
if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
{
if (transfer_timing) t_query += MPI_Wtime() - tt;
rec_data[node] = new double[length];
if (!rec_data[node])
{
cout << "out of memory when new in short transfer, place 1" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
recv_values += length;
if (transfer_timing) tt = MPI_Wtime();
interdata_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (transfer_timing) t_pack += MPI_Wtime() - tt;
}
else if (transfer_timing)
t_query += MPI_Wtime() - tt;
}
else
{
// send from this cpu to cpu#node
if (transfer_timing) tt = MPI_Wtime();
if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
{
if (transfer_timing) t_query += MPI_Wtime() - tt;
send_data[node] = new double[length];
if (!send_data[node])
{
cout << "out of memory when new in short transfer, place 2" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
send_values += length;
if (transfer_timing) tt = MPI_Wtime();
interdata_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (transfer_timing) t_pack += MPI_Wtime() - tt;
if (transfer_timing) tt = MPI_Wtime();
MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
if (transfer_timing) t_post += MPI_Wtime() - tt;
}
else if (transfer_timing)
t_query += MPI_Wtime() - tt;
// receive from cpu#node to this cpu
if (transfer_timing) tt = MPI_Wtime();
if (length = interdata_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
{
if (transfer_timing) t_query += MPI_Wtime() - tt;
rec_data[node] = new double[length];
if (!rec_data[node])
{
cout << "out of memory when new in short transfer, place 3" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
recv_values += length;
if (transfer_timing) tt = MPI_Wtime();
MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
if (transfer_timing) t_post += MPI_Wtime() - tt;
}
else if (transfer_timing)
t_query += MPI_Wtime() - tt;
}
}
// wait for all requests to complete
if (transfer_timing) tt = MPI_Wtime();
MPI_Waitall(req_no, reqs, stats);
if (transfer_timing) t_wait += MPI_Wtime() - tt;
#if USE_CUDA_BSSN
if (use_shell_cuda_pack)
bssn_cuda_shell_pack_cache_end();
#endif
for (node = 0; node < cpusize; node++)
if (rec_data[node])
{
if (transfer_timing) tt = MPI_Wtime();
interdata_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
if (transfer_timing) t_unpack += MPI_Wtime() - tt;
}
if (transfer_timing)
{
double local[5] = {t_query, t_pack, t_post, t_wait, t_unpack};
double maxv[5] = {};
long long counts[2] = {send_values, recv_values};
long long max_counts[2] = {};
MPI_Reduce(local, maxv, 5, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
MPI_Reduce(counts, max_counts, 2, MPI_LONG_LONG, MPI_MAX, 0, MPI_COMM_WORLD);
if (myrank == 0)
cout << "[AMSS-SHELL-TRANSFER] vars=" << shell_var_count(VarList1)
<< " query=" << maxv[0]
<< " pack=" << maxv[1]
<< " post=" << maxv[2]
<< " wait=" << maxv[3]
<< " unpack=" << maxv[4]
<< " send_values=" << max_counts[0]
<< " recv_values=" << max_counts[1]
<< endl;
}
for (node = 0; node < cpusize; node++)
{
if (send_data[node])
delete[] send_data[node];
if (rec_data[node])
@@ -3093,32 +3682,14 @@ int ShellPatch::interdata_packer(double *data, MyList<pointstru> *src, MyList<po
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (!data || shell_parallel_interp_enabled())
if (!data || shell_parallel_interp_enabled() || shell_cuda_interp_enabled())
{
std::vector<var *> src_vars;
std::vector<var *> dst_vars;
for (varls = VarLists, varld = VarListd; varls && varld; varls = varls->next, varld = varld->next)
{
src_vars.push_back(varls->data);
dst_vars.push_back(varld->data);
}
std::vector<ShellPointPair> active_points;
MyList<pointstru> *src_scan = src;
MyList<pointstru> *dst_scan = dst;
while (src_scan && dst_scan)
{
if ((dir == PACK && dst_scan->data->Bg->rank == rank_in && src_scan->data->Bg->rank == myrank) ||
(dir == UNPACK && src_scan->data->Bg->rank == rank_in && dst_scan->data->Bg->rank == myrank))
{
ShellPointPair pair;
pair.src = src_scan->data;
pair.dst = dst_scan->data;
active_points.push_back(pair);
}
src_scan = src_scan->next;
dst_scan = dst_scan->next;
}
ShellActiveCache &active_cache = shell_get_active_cache(src, dst, rank_in, dir, VarLists, VarListd, myrank, ordn);
const std::vector<var *> &src_vars = active_cache.src_vars;
const std::vector<var *> &dst_vars = active_cache.dst_vars;
const std::vector<ShellPointPair> &active_points = active_cache.active_points;
const std::vector<char> &fast_points = active_cache.fast_points;
const std::vector<int> &dst_indices = active_cache.dst_indices;
if (!data)
return (int)(active_points.size() * src_vars.size());
@@ -3127,46 +3698,38 @@ int ShellPatch::interdata_packer(double *data, MyList<pointstru> *src, MyList<po
{
const int nthreads = std::min(shell_interp_threads(), (int)active_points.size());
const int nvar = (int)src_vars.size();
std::vector<char> fast_points(active_points.size(), 0);
if (dir == PACK && shell_fast_interp_enabled())
{
for (size_t p = 0; p < active_points.size(); ++p)
{
shell_prepare_interp_coeffs(active_points[p].src, ordn);
fast_points[p] = shell_interp_fast_possible(active_points[p].src, ordn) ? 1 : 0;
}
}
else if (dir == UNPACK)
{
for (size_t p = 0; p < active_points.size(); ++p)
fast_points[p] = shell_pointcopy_possible(active_points[p].dst) ? 1 : 0;
}
bool cuda_pack_done = false;
if (dir == PACK)
cuda_pack_done = shell_pack_cuda_fast_points(data, active_points, src_vars, fast_points, ordn);
std::vector<std::thread> workers;
workers.reserve(nthreads);
for (int tid = 0; tid < nthreads; ++tid)
if (!cuda_pack_done)
{
const int begin = (int)((long long)active_points.size() * tid / nthreads);
const int end = (int)((long long)active_points.size() * (tid + 1) / nthreads);
workers.push_back(std::thread([&, begin, end]() {
for (int p = begin; p < end; ++p)
{
const int base = p * nvar;
if (!fast_points[p])
continue;
if (dir == PACK)
shell_pack_fast_only(data, base, active_points[p].src, src_vars, ordn);
else
for (int tid = 0; tid < nthreads; ++tid)
{
const int begin = (int)((long long)active_points.size() * tid / nthreads);
const int end = (int)((long long)active_points.size() * (tid + 1) / nthreads);
workers.push_back(std::thread([&, begin, end]() {
for (int p = begin; p < end; ++p)
{
for (int vi = 0; vi < nvar; ++vi)
shell_pointcopy_fast(active_points[p].dst, dst_vars[vi], data[base + vi]);
const int base = p * nvar;
if (!fast_points[p])
continue;
if (dir == PACK)
shell_pack_fast_only(data, base, active_points[p].src, src_vars, ordn);
else
{
for (int vi = 0; vi < nvar; ++vi)
active_points[p].dst->Bg->fgfs[dst_vars[vi]->sgfn][dst_indices[p]] = data[base + vi];
}
}
}
}));
}));
}
for (size_t t = 0; t < workers.size(); ++t)
workers[t].join();
}
for (size_t t = 0; t < workers.size(); ++t)
workers[t].join();
for (size_t p = 0; p < active_points.size(); ++p)
{