Add Shell-Patch GPU runtime fast paths
This commit is contained in:
@@ -7,6 +7,10 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <new>
|
#include <new>
|
||||||
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
#include <thread>
#include <atomic>
|
||||||
|
#include <algorithm>
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
#include "ShellPatch.h"
|
#include "ShellPatch.h"
|
||||||
@@ -24,6 +28,429 @@ using namespace std;
|
|||||||
// so we need half of that
|
// so we need half of that
|
||||||
#define overghost ((ghost_width + 1) / 2 + ghost_width)
|
#define overghost ((ghost_width + 1) / 2 + ghost_width)
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Reports whether the C++ interpolation fast path is switched on.
// The AMSS_SHELL_FAST_INTERP environment variable is consulted on the first call only;
// a value that atoi() parses as non-zero enables the path. The answer is cached, so
// later changes to the environment are not observed.
bool shell_fast_interp_enabled()
{
  static int cached = -1; // negative until the environment has been read
  if (cached >= 0)
    return cached != 0;

  const char *value = getenv("AMSS_SHELL_FAST_INTERP");
  if (value && atoi(value) != 0)
    cached = 1;
  else
    cached = 0;
  return cached != 0;
}
|
||||||
|
|
||||||
|
// Reports whether the threaded pack/unpack path is switched on.
// Reads AMSS_SHELL_PARALLEL_INTERP once (non-zero per atoi() means enabled) and
// returns the cached answer on every later call.
bool shell_parallel_interp_enabled()
{
  static int cached = -1; // negative until the environment has been read
  if (cached < 0)
  {
    const char *value = getenv("AMSS_SHELL_PARALLEL_INTERP");
    const bool on = (value != 0) && (atoi(value) != 0);
    cached = on ? 1 : 0;
  }
  return cached == 1;
}
|
||||||
|
|
||||||
|
// Reports whether interpolation statistics should be collected and printed.
// AMSS_SHELL_INTERP_STATS is read exactly once; a value that atoi() parses as
// non-zero enables statistics, anything else (or an unset variable) disables them.
//
// This function is reached from std::thread workers: shell_note_pack() calls it and is
// itself called by shell_pack_fast_only(), which the threaded packer path runs on worker
// threads. A hand-rolled "static int enabled = -1" cache would therefore be read and
// written concurrently on first use. Initialising a function-local static instead lets
// the language guarantee one-time, race-free initialisation.
bool shell_interp_stats_enabled()
{
  static const bool enabled = []() {
    const char *env = getenv("AMSS_SHELL_INTERP_STATS");
    return env && atoi(env) != 0;
  }();
  return enabled;
}
|
||||||
|
|
||||||
|
// Returns the number of worker threads to use for threaded pack/unpack (always >= 1).
// AMSS_SHELL_INTERP_THREADS is read on the first call. A positive value is used as given.
// Otherwise the count comes from std::thread::hardware_concurrency(), falling back to 8
// when that reports 0, and is limited to 32. The result is cached.
int shell_interp_threads()
{
  static int cached = -1; // negative until the first call has resolved the count
  if (cached >= 0)
    return cached;

  const char *value = getenv("AMSS_SHELL_INTERP_THREADS");
  int count = value ? atoi(value) : 0;
  if (count <= 0)
  {
    // No usable request: derive the count from the hardware, capped at 32.
    const unsigned int detected = std::thread::hardware_concurrency();
    count = (detected > 0) ? (int)detected : 8;
    if (count > 32)
      count = 32;
  }
  cached = (count < 1) ? 1 : count;
  return cached;
}
|
||||||
|
|
||||||
|
// Tallies of interpolation pack calls, grouped by the dimensionality passed to
// shell_note_pack():
//   pack_dimN     - every noted call for dimensionality N
//   fast_dimN     - calls noted as handled by the C++ fast path
//   fallback_dimN - calls noted as not handled by the fast path
//
// The counters are atomic because shell_note_pack() is called from shell_pack_fast_only(),
// which the threaded packer path runs concurrently on std::thread workers. Plain integers
// would be a data race and could lose increments.
struct ShellInterpStats
{
  std::atomic<long long> pack_dim3{0};
  std::atomic<long long> pack_dim2{0};
  std::atomic<long long> pack_dim1{0};
  std::atomic<long long> fast_dim3{0};
  std::atomic<long long> fast_dim2{0};
  std::atomic<long long> fast_dim1{0};
  std::atomic<long long> fallback_dim3{0};
  std::atomic<long long> fallback_dim2{0};
  std::atomic<long long> fallback_dim1{0};
};

// Tally instance for this translation unit.
ShellInterpStats shell_interp_stats;
|
||||||
|
|
||||||
|
void shell_note_pack(int dimh, bool fast_done)
|
||||||
|
{
|
||||||
|
if (!shell_interp_stats_enabled())
|
||||||
|
return;
|
||||||
|
if (dimh == 3)
|
||||||
|
{
|
||||||
|
shell_interp_stats.pack_dim3++;
|
||||||
|
fast_done ? shell_interp_stats.fast_dim3++ : shell_interp_stats.fallback_dim3++;
|
||||||
|
}
|
||||||
|
else if (dimh == 2)
|
||||||
|
{
|
||||||
|
shell_interp_stats.pack_dim2++;
|
||||||
|
fast_done ? shell_interp_stats.fast_dim2++ : shell_interp_stats.fallback_dim2++;
|
||||||
|
}
|
||||||
|
else if (dimh == 1)
|
||||||
|
{
|
||||||
|
shell_interp_stats.pack_dim1++;
|
||||||
|
fast_done ? shell_interp_stats.fast_dim1++ : shell_interp_stats.fallback_dim1++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void shell_print_interp_stats(const char *label)
|
||||||
|
{
|
||||||
|
if (!shell_interp_stats_enabled())
|
||||||
|
return;
|
||||||
|
|
||||||
|
int rank = 0;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||||
|
cout << "[AMSS-SHELL-INTERP-STATS] rank=" << rank << " " << label
|
||||||
|
<< " pack3=" << shell_interp_stats.pack_dim3
|
||||||
|
<< " fast3=" << shell_interp_stats.fast_dim3
|
||||||
|
<< " fallback3=" << shell_interp_stats.fallback_dim3
|
||||||
|
<< " pack2=" << shell_interp_stats.pack_dim2
|
||||||
|
<< " fast2=" << shell_interp_stats.fast_dim2
|
||||||
|
<< " fallback2=" << shell_interp_stats.fallback_dim2
|
||||||
|
<< " pack1=" << shell_interp_stats.pack_dim1
|
||||||
|
<< " fast1=" << shell_interp_stats.fast_dim1
|
||||||
|
<< " fallback1=" << shell_interp_stats.fallback_dim1
|
||||||
|
<< endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Flattens a 3D index (i, j, k) into a linear offset for an array whose first index
// varies fastest: offset = i + shape[0] * (j + shape[1] * k).
inline int shell_idx3(const int *shape, int i, int j, int k)
{
  const int nx = shape[0];
  const int ny = shape[1];
  return (k * ny + j) * nx + i;
}
|
||||||
|
|
||||||
|
// Tensor-product interpolation over an ordn x ordn x ordn stencil using the weights
// already stored on the point.
//   pt   - point carrying the stencil start indices (sind) and weights (coef, laid out as
//          ordn x-weights, then ordn y-weights, then ordn z-weights)
//   vp   - variable whose grid function (fgfs[vp->sgfn]) is sampled
//   out  - receives the interpolated value on success; untouched on failure
//   ordn - number of stencil nodes per axis
// Returns false, without writing `out`, when the weights/indices are missing or the
// stencil does not lie entirely inside the block.
bool shell_interp3d_fast(const ShellPatch::pointstru *pt, const var *vp, double &out, int ordn)
{
  const int *shape = pt->Bg->shape;
  const int *start = pt->sind;
  if (!start || !pt->coef)
    return false;

  // Every axis must hold the full stencil [start, start + ordn).
  for (int axis = 0; axis < 3; ++axis)
    if (start[axis] < 0 || start[axis] + ordn > shape[axis])
      return false;

  const double *field = pt->Bg->fgfs[vp->sgfn];
  const double *wx = pt->coef;
  const double *wy = wx + ordn;
  const double *wz = wy + ordn;

  double acc = 0.0;
  for (int kz = 0; kz < ordn; ++kz)
  {
    for (int jy = 0; jy < ordn; ++jy)
    {
      // Offset of the first node of this x-row; consecutive x nodes are contiguous.
      const int row = shell_idx3(shape, start[0], start[1] + jy, start[2] + kz);
      for (int ix = 0; ix < ordn; ++ix)
        acc += wx[ix] * wy[jy] * wz[kz] * field[row + ix];
    }
  }
  out = acc;
  return true;
}
|
||||||
|
|
||||||
|
// Tensor-product interpolation over an ordn x ordn stencil in the first two axes at a
// fixed third-axis index, using the weights already stored on the point.
//   pt   - point carrying stencil start indices (sind) and weights (coef: x then y)
//   vp   - variable whose grid function (fgfs[vp->sgfn]) is sampled
//   out  - receives the interpolated value on success; untouched on failure
//   ordn - number of stencil nodes per interpolated axis
// Returns false when weights/indices are missing or the stencil leaves the block.
bool shell_interp2d_fast(const ShellPatch::pointstru *pt, const var *vp, double &out, int ordn)
{
  const int *shape = pt->Bg->shape;
  const int *start = pt->sind;
  if (!start || !pt->coef)
    return false;

  const int plane = start[2] - 1; // Match global_interpind2d's Fortran fixed-index convention.

  const bool x_inside = start[0] >= 0 && start[0] + ordn <= shape[0];
  const bool y_inside = start[1] >= 0 && start[1] + ordn <= shape[1];
  const bool z_inside = plane >= 0 && plane < shape[2];
  if (!(x_inside && y_inside && z_inside))
    return false;

  const double *field = pt->Bg->fgfs[vp->sgfn];
  const double *wx = pt->coef;
  const double *wy = wx + ordn;

  double acc = 0.0;
  for (int jy = 0; jy < ordn; ++jy)
  {
    // Offset of the first node of this x-row within the fixed plane.
    const int row = shell_idx3(shape, start[0], start[1] + jy, plane);
    for (int ix = 0; ix < ordn; ++ix)
      acc += wx[ix] * wy[jy] * field[row + ix];
  }
  out = acc;
  return true;
}
|
||||||
|
|
||||||
|
// One-dimensional interpolation along a single axis of the block, using the ordn weights
// stored in pt->coef.
//   pt->dumyd selects the axis held fixed among the first two (0 or 1); the other one is
//   interpolated. sind[0] is the stencil start along the interpolated axis, sind[1] the
//   index along the fixed axis, sind[2] the index along the third axis.
//   out receives the result on success and is untouched on failure.
// Returns false when weights/indices are missing, dumyd is neither 0 nor 1, or any index
// falls outside the block.
bool shell_interp1d_fast(const ShellPatch::pointstru *pt, const var *vp, double &out, int ordn)
{
  const int *shape = pt->Bg->shape;
  const int *start = pt->sind;
  if (!start || !pt->coef)
    return false;

  const double *field = pt->Bg->fgfs[vp->sgfn];

  const int fixed = pt->dumyd;
  if (fixed != 0 && fixed != 1)
    return false;
  const int active = 1 - fixed;

  if (start[0] < 0 || start[0] + ordn > shape[active] ||
      start[1] < 0 || start[1] >= shape[fixed] ||
      start[2] < 0 || start[2] >= shape[2])
    return false;

  // Linear offset of the first stencil node and the stride between consecutive nodes.
  int first;
  int step;
  if (active == 0)
  {
    first = shell_idx3(shape, start[0], start[1], start[2]);
    step = 1;
  }
  else
  {
    first = shell_idx3(shape, start[1], start[0], start[2]);
    step = shape[0];
  }

  double acc = 0.0;
  for (int n = 0; n < ordn; ++n)
    acc += pt->coef[n] * field[first + n * step];

  out = acc;
  return true;
}
|
||||||
|
|
||||||
|
// Predicts whether the fast interpolation routines would accept this point, without
// reading any field data. Applies the same index-range tests as shell_interp3d_fast
// (when dumyd == -1 and dim == 3) and shell_interp1d_fast (when dumyd is 0 or 1).
// Returns false for every other combination, or when sind/coef are not set.
bool shell_interp_fast_possible(const ShellPatch::pointstru *pt, int ordn)
{
  const int DIMh = (pt->dumyd == -1) ? dim : 1;
  const int *shape = pt->Bg->shape;
  const int *start = pt->sind;
  if (!start || !pt->coef)
    return false;

  if (DIMh == 3)
  {
    // Full stencil must fit on all three axes.
    for (int axis = 0; axis < 3; ++axis)
      if (start[axis] < 0 || start[axis] + ordn > shape[axis])
        return false;
    return true;
  }

  if (DIMh != 1)
    return false;

  const int fixed = pt->dumyd;
  if (fixed != 0 && fixed != 1)
    return false;
  const int active = 1 - fixed;

  // start[0]: stencil start on the interpolated axis; start[1], start[2]: single indices.
  const bool active_ok = start[0] >= 0 && start[0] + ordn <= shape[active];
  const bool fixed_ok = start[1] >= 0 && start[1] < shape[fixed];
  const bool third_ok = start[2] >= 0 && start[2] < shape[2];
  return active_ok && fixed_ok && third_ok;
}
|
||||||
|
|
||||||
|
// Stores `value` into the grid function of `vp` at the cell of pt's block that
// corresponds to the point coordinates pt->lpox.
// The cell index on each axis is int((x - lower_bound) / spacing + 0.4), where the spacing
// is derived from the block's bounding box and shape (shape - 1 intervals when Vertex is
// defined, shape intervals otherwise).
// Returns false, writing nothing, when any axis has at most one point or the computed
// index lies outside the block.
bool shell_pointcopy_fast(const ShellPatch::pointstru *pt, const var *vp, double value)
{
  Block *block = pt->Bg;
  const int *shape = block->shape;
  if (shape[0] <= 1 || shape[1] <= 1 || shape[2] <= 1)
    return false;

  double spacing[3];
  for (int axis = 0; axis < dim; ++axis)
  {
#ifdef Vertex
    spacing[axis] = (block->bbox[dim + axis] - block->bbox[axis]) / (shape[axis] - 1);
#else
    spacing[axis] = (block->bbox[dim + axis] - block->bbox[axis]) / shape[axis];
#endif
  }

  int cell[3];
  for (int axis = 0; axis < 3; ++axis)
    cell[axis] = int((pt->lpox[axis] - block->bbox[axis]) / spacing[axis] + 0.4);

  for (int axis = 0; axis < 3; ++axis)
    if (cell[axis] < 0 || cell[axis] >= shape[axis])
      return false;

  block->fgfs[vp->sgfn][shell_idx3(shape, cell[0], cell[1], cell[2])] = value;
  return true;
}
|
||||||
|
|
||||||
|
// Predicts whether shell_pointcopy_fast would accept this point: performs the same
// spacing and cell-index computation and the same range test, but writes nothing.
bool shell_pointcopy_possible(const ShellPatch::pointstru *pt)
{
  Block *block = pt->Bg;
  const int *shape = block->shape;
  if (shape[0] <= 1 || shape[1] <= 1 || shape[2] <= 1)
    return false;

  double spacing[3];
  for (int axis = 0; axis < dim; ++axis)
  {
#ifdef Vertex
    spacing[axis] = (block->bbox[dim + axis] - block->bbox[axis]) / (shape[axis] - 1);
#else
    spacing[axis] = (block->bbox[dim + axis] - block->bbox[axis]) / shape[axis];
#endif
  }

  int cell[3];
  for (int axis = 0; axis < 3; ++axis)
    cell[axis] = int((pt->lpox[axis] - block->bbox[axis]) / spacing[axis] + 0.4);

  for (int axis = 0; axis < 3; ++axis)
    if (cell[axis] < 0 || cell[axis] >= shape[axis])
      return false;
  return true;
}
|
||||||
|
|
||||||
|
bool shell_pack_fast_only(double *data, int out_base, ShellPatch::pointstru *src,
|
||||||
|
const std::vector<var *> &vars, int ordn)
|
||||||
|
{
|
||||||
|
const int DIMh = (src->dumyd == -1) ? dim : 1;
|
||||||
|
for (size_t vi = 0; vi < vars.size(); ++vi)
|
||||||
|
{
|
||||||
|
double &out = data[out_base + (int)vi];
|
||||||
|
bool ok = false;
|
||||||
|
if (DIMh == 3)
|
||||||
|
ok = shell_interp3d_fast(src, vars[vi], out, ordn);
|
||||||
|
else if (DIMh == 1)
|
||||||
|
ok = shell_interp1d_fast(src, vars[vi], out, ordn);
|
||||||
|
if (!ok)
|
||||||
|
return false;
|
||||||
|
shell_note_pack(DIMh, true);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ShellPointPair
|
||||||
|
{
|
||||||
|
ShellPatch::pointstru *src;
|
||||||
|
ShellPatch::pointstru *dst;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Fills weights[0 .. ordn) with the Lagrange basis values at position x for the ordn
// equally spaced nodes origin + (first + m) * spacing, m = 0 .. ordn - 1.
inline void shell_lagrange_weights(double *weights, int ordn, double origin, int first,
                                   double spacing, double x)
{
  for (int m = 0; m < ordn; m++)
  {
    const double node = origin + (first + m) * spacing;
    weights[m] = 1;
    for (int n = 0; n < ordn; n++)
    {
      if (n == m)
        continue; // a node contributes no factor to its own basis polynomial
      const double other = origin + (first + n) * spacing;
      weights[m] *= (x - other) / (node - other);
    }
  }
}

// Computes and stores the interpolation stencil indices (pt->sind) and Lagrange weights
// (pt->coef) for a point, unless pt->coef is already set.
//   dumyd == -1 : interpolate on all `dim` axes; sind[i] is the stencil start on axis i and
//                 coef holds ordn weights per axis, axis after axis.
//   otherwise   : interpolate on axis 1 - dumyd only; sind[0] is the stencil start on that
//                 axis, sind[1] the grid index on axis dumyd, sind[2] the grid index on
//                 axis 2. A message is printed when the point does not coincide with a
//                 grid node on axis 2 or on axis dumyd.
// Allocates coef and sind with new[].
void shell_prepare_interp_coeffs(ShellPatch::pointstru *pt, int ordn)
{
  if (pt->coef)
    return;

  const int DIMh = (pt->dumyd == -1) ? dim : 1;
  pt->coef = new double[ordn * DIMh];
  pt->sind = new int[dim];

  if (DIMh == 3)
  {
    for (int axis = 0; axis < DIMh; axis++)
    {
      double dd = pt->Bg->getdX(axis);
      pt->sind[axis] = int((pt->lpox[axis] - pt->Bg->X[axis][0]) / dd) - ordn / 2 + 1;
      shell_lagrange_weights(pt->coef + axis * ordn, ordn, pt->Bg->X[axis][0],
                             pt->sind[axis], dd, pt->lpox[axis]);
    }
    return;
  }

  // Single interpolated axis.
  int actd = 1 - pt->dumyd;
  double dd = pt->Bg->getdX(actd);
  pt->sind[0] = int((pt->lpox[actd] - pt->Bg->X[actd][0]) / dd) - ordn / 2 + 1;
  shell_lagrange_weights(pt->coef, ordn, pt->Bg->X[actd][0], pt->sind[0], dd, pt->lpox[actd]);

  // Axis 2: nearest-node index; the small offset guards against rounding just below a node.
  pt->sind[2] = int((pt->lpox[2] - pt->Bg->X[2][0]) / pt->Bg->getdX(2) + 0.001);
  if (!feq(pt->Bg->X[2][pt->sind[2]], pt->lpox[2], pt->Bg->getdX(2) / 2000))
    cout << "error in ShellPatch::interdata_packer point = " << pt->lpox[2] << " != grid " << pt->Bg->X[2][pt->sind[2]] << endl;

  // Fixed axis: same node lookup and consistency check.
  pt->sind[1] = int((pt->lpox[pt->dumyd] - pt->Bg->X[pt->dumyd][0]) /
                        pt->Bg->getdX(pt->dumyd) +
                    0.001);
  if (!feq(pt->Bg->X[pt->dumyd][pt->sind[1]], pt->lpox[pt->dumyd], pt->Bg->getdX(pt->dumyd) / 2000))
    cout << "error in ShellPatch::interdata_packer for dumy dimension point = "
         << pt->lpox[pt->dumyd] << " != grid " << pt->Bg->X[pt->dumyd][pt->sind[1]] << endl;
}
|
||||||
|
|
||||||
|
void shell_pack_one(double *data, int out_base, ShellPatch::pointstru *src,
|
||||||
|
const std::vector<var *> &vars, int ordn, int Symmetry)
|
||||||
|
{
|
||||||
|
shell_prepare_interp_coeffs(src, ordn);
|
||||||
|
const int DIMh = (src->dumyd == -1) ? dim : 1;
|
||||||
|
for (size_t vi = 0; vi < vars.size(); ++vi)
|
||||||
|
{
|
||||||
|
double &out = data[out_base + (int)vi];
|
||||||
|
bool fast_done = false;
|
||||||
|
if (shell_fast_interp_enabled())
|
||||||
|
{
|
||||||
|
if (DIMh == 3)
|
||||||
|
fast_done = shell_interp3d_fast(src, vars[vi], out, ordn);
|
||||||
|
else if (DIMh == 2)
|
||||||
|
fast_done = shell_interp2d_fast(src, vars[vi], out, ordn);
|
||||||
|
else if (DIMh == 1)
|
||||||
|
fast_done = shell_interp1d_fast(src, vars[vi], out, ordn);
|
||||||
|
}
|
||||||
|
if (fast_done)
|
||||||
|
{
|
||||||
|
shell_note_pack(DIMh, true);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
shell_note_pack(DIMh, false);
|
||||||
|
switch (DIMh)
|
||||||
|
{
|
||||||
|
case 3:
|
||||||
|
f_global_interpind(src->Bg->shape, src->Bg->X[0], src->Bg->X[1], src->Bg->X[2],
|
||||||
|
src->Bg->fgfs[vars[vi]->sgfn], out,
|
||||||
|
src->lpox[0], src->lpox[1], src->lpox[2], ordn, vars[vi]->SoA, Symmetry,
|
||||||
|
src->sind, src->coef, src->ssst);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
f_global_interpind2d(src->Bg->shape, src->Bg->X[0], src->Bg->X[1], src->Bg->X[2],
|
||||||
|
src->Bg->fgfs[vars[vi]->sgfn], out,
|
||||||
|
src->lpox[0], src->lpox[1], src->lpox[2], ordn, vars[vi]->SoA, Symmetry,
|
||||||
|
src->sind, src->coef, src->ssst);
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
f_global_interpind1d(src->Bg->shape, src->Bg->X[0], src->Bg->X[1], src->Bg->X[2],
|
||||||
|
src->Bg->fgfs[vars[vi]->sgfn], out,
|
||||||
|
src->lpox[0], src->lpox[1], src->lpox[2], ordn, vars[vi]->SoA, Symmetry,
|
||||||
|
src->sind, src->coef, src->ssst, src->dumyd);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
cout << "ShellPatch::interdata_packer: not recognized DIM = " << DIMh << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void shell_unpack_one(double *data, int out_base, ShellPatch::pointstru *dst,
|
||||||
|
const std::vector<var *> &vars)
|
||||||
|
{
|
||||||
|
int DIM = dim;
|
||||||
|
for (size_t vi = 0; vi < vars.size(); ++vi)
|
||||||
|
f_pointcopy(DIM, dst->Bg->bbox, dst->Bg->bbox + dim, dst->Bg->shape,
|
||||||
|
dst->Bg->fgfs[vars[vi]->sgfn],
|
||||||
|
dst->lpox[0], dst->lpox[1], dst->lpox[2],
|
||||||
|
data[out_base + (int)vi]);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
ss_patch::ss_patch(int ingfsi, int fngfsi, int *shapei, double *bboxi, int myranki) : ingfs(ingfsi), fngfs(fngfsi), myrank(myranki), blb(0), ble(0)
|
ss_patch::ss_patch(int ingfsi, int fngfsi, int *shapei, double *bboxi, int myranki) : ingfs(ingfsi), fngfs(fngfsi), myrank(myranki), blb(0), ble(0)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < dim; i++)
|
for (int i = 0; i < dim; i++)
|
||||||
@@ -2666,6 +3093,95 @@ int ShellPatch::interdata_packer(double *data, MyList<pointstru> *src, MyList<po
|
|||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!data || shell_parallel_interp_enabled())
|
||||||
|
{
|
||||||
|
std::vector<var *> src_vars;
|
||||||
|
std::vector<var *> dst_vars;
|
||||||
|
for (varls = VarLists, varld = VarListd; varls && varld; varls = varls->next, varld = varld->next)
|
||||||
|
{
|
||||||
|
src_vars.push_back(varls->data);
|
||||||
|
dst_vars.push_back(varld->data);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<ShellPointPair> active_points;
|
||||||
|
MyList<pointstru> *src_scan = src;
|
||||||
|
MyList<pointstru> *dst_scan = dst;
|
||||||
|
while (src_scan && dst_scan)
|
||||||
|
{
|
||||||
|
if ((dir == PACK && dst_scan->data->Bg->rank == rank_in && src_scan->data->Bg->rank == myrank) ||
|
||||||
|
(dir == UNPACK && src_scan->data->Bg->rank == rank_in && dst_scan->data->Bg->rank == myrank))
|
||||||
|
{
|
||||||
|
ShellPointPair pair;
|
||||||
|
pair.src = src_scan->data;
|
||||||
|
pair.dst = dst_scan->data;
|
||||||
|
active_points.push_back(pair);
|
||||||
|
}
|
||||||
|
src_scan = src_scan->next;
|
||||||
|
dst_scan = dst_scan->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!data)
|
||||||
|
return (int)(active_points.size() * src_vars.size());
|
||||||
|
|
||||||
|
if (active_points.size() * src_vars.size() >= 2048)
|
||||||
|
{
|
||||||
|
const int nthreads = std::min(shell_interp_threads(), (int)active_points.size());
|
||||||
|
const int nvar = (int)src_vars.size();
|
||||||
|
std::vector<char> fast_points(active_points.size(), 0);
|
||||||
|
|
||||||
|
if (dir == PACK && shell_fast_interp_enabled())
|
||||||
|
{
|
||||||
|
for (size_t p = 0; p < active_points.size(); ++p)
|
||||||
|
{
|
||||||
|
shell_prepare_interp_coeffs(active_points[p].src, ordn);
|
||||||
|
fast_points[p] = shell_interp_fast_possible(active_points[p].src, ordn) ? 1 : 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (dir == UNPACK)
|
||||||
|
{
|
||||||
|
for (size_t p = 0; p < active_points.size(); ++p)
|
||||||
|
fast_points[p] = shell_pointcopy_possible(active_points[p].dst) ? 1 : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::thread> workers;
|
||||||
|
workers.reserve(nthreads);
|
||||||
|
for (int tid = 0; tid < nthreads; ++tid)
|
||||||
|
{
|
||||||
|
const int begin = (int)((long long)active_points.size() * tid / nthreads);
|
||||||
|
const int end = (int)((long long)active_points.size() * (tid + 1) / nthreads);
|
||||||
|
workers.push_back(std::thread([&, begin, end]() {
|
||||||
|
for (int p = begin; p < end; ++p)
|
||||||
|
{
|
||||||
|
const int base = p * nvar;
|
||||||
|
if (!fast_points[p])
|
||||||
|
continue;
|
||||||
|
if (dir == PACK)
|
||||||
|
shell_pack_fast_only(data, base, active_points[p].src, src_vars, ordn);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (int vi = 0; vi < nvar; ++vi)
|
||||||
|
shell_pointcopy_fast(active_points[p].dst, dst_vars[vi], data[base + vi]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
for (size_t t = 0; t < workers.size(); ++t)
|
||||||
|
workers[t].join();
|
||||||
|
|
||||||
|
for (size_t p = 0; p < active_points.size(); ++p)
|
||||||
|
{
|
||||||
|
if (fast_points[p])
|
||||||
|
continue;
|
||||||
|
const int base = (int)p * nvar;
|
||||||
|
if (dir == PACK)
|
||||||
|
shell_pack_one(data, base, active_points[p].src, src_vars, ordn, Symmetry);
|
||||||
|
else
|
||||||
|
shell_unpack_one(data, base, active_points[p].dst, dst_vars);
|
||||||
|
}
|
||||||
|
return (int)(active_points.size() * src_vars.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
while (src && dst)
|
while (src && dst)
|
||||||
{
|
{
|
||||||
if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
|
if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
|
||||||
@@ -2747,6 +3263,18 @@ int ShellPatch::interdata_packer(double *data, MyList<pointstru> *src, MyList<po
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// interpolate
|
// interpolate
|
||||||
|
bool fast_done = false;
|
||||||
|
if (shell_fast_interp_enabled())
|
||||||
|
{
|
||||||
|
if (DIMh == 3)
|
||||||
|
fast_done = shell_interp3d_fast(src->data, varls->data, data[size_out], ordn);
|
||||||
|
else if (DIMh == 2)
|
||||||
|
fast_done = shell_interp2d_fast(src->data, varls->data, data[size_out], ordn);
|
||||||
|
else if (DIMh == 1)
|
||||||
|
fast_done = shell_interp1d_fast(src->data, varls->data, data[size_out], ordn);
|
||||||
|
}
|
||||||
|
shell_note_pack(DIMh, fast_done);
|
||||||
|
if (!fast_done)
|
||||||
switch (DIMh)
|
switch (DIMh)
|
||||||
{
|
{
|
||||||
case 3:
|
case 3:
|
||||||
@@ -2773,8 +3301,10 @@ int ShellPatch::interdata_packer(double *data, MyList<pointstru> *src, MyList<po
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (dir == UNPACK) // from target data to corresponding grid
|
if (dir == UNPACK) // from target data to corresponding grid
|
||||||
|
{
|
||||||
f_pointcopy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
|
f_pointcopy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
|
||||||
dst->data->lpox[0], dst->data->lpox[1], dst->data->lpox[2], data[size_out]);
|
dst->data->lpox[0], dst->data->lpox[1], dst->data->lpox[2], data[size_out]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
size_out += 1;
|
size_out += 1;
|
||||||
varls = varls->next;
|
varls = varls->next;
|
||||||
@@ -2797,6 +3327,7 @@ void ShellPatch::Synch(MyList<var> *VarList, int Symmetry)
|
|||||||
}
|
}
|
||||||
|
|
||||||
intertransfer(ss_src, ss_dst, VarList, VarList, Symmetry);
|
intertransfer(ss_src, ss_dst, VarList, VarList, Symmetry);
|
||||||
|
shell_print_interp_stats("after Synch");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShellPatch::CS_Inter(MyList<var> *VarList, int Symmetry)
|
void ShellPatch::CS_Inter(MyList<var> *VarList, int Symmetry)
|
||||||
@@ -2805,6 +3336,7 @@ void ShellPatch::CS_Inter(MyList<var> *VarList, int Symmetry)
|
|||||||
intertransfer(csats_src, csats_dst, VarList, VarList, Symmetry);
|
intertransfer(csats_src, csats_dst, VarList, VarList, Symmetry);
|
||||||
// fill box then
|
// fill box then
|
||||||
intertransfer(csatc_src, csatc_dst, VarList, VarList, Symmetry);
|
intertransfer(csatc_src, csatc_dst, VarList, VarList, Symmetry);
|
||||||
|
shell_print_interp_stats("after CS_Inter");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ShellPatch::check_pointstrul(MyList<pointstru> *pp, bool first_only)
|
void ShellPatch::check_pointstrul(MyList<pointstru> *pp, bool first_only)
|
||||||
|
|||||||
@@ -160,6 +160,8 @@ def _gpu_runtime_env():
|
|||||||
"AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
|
"AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
|
||||||
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
|
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
|
||||||
"AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1",
|
"AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1",
|
||||||
|
"AMSS_SHELL_FAST_INTERP": "0",
|
||||||
|
"AMSS_SHELL_PARALLEL_INTERP": "0",
|
||||||
}
|
}
|
||||||
if finite_difference in ("2nd-order", "8th-order"):
|
if finite_difference in ("2nd-order", "8th-order"):
|
||||||
defaults.update({
|
defaults.update({
|
||||||
@@ -173,6 +175,13 @@ def _gpu_runtime_env():
|
|||||||
"AMSS_CUDA_AMR_RESTRICT_BATCH": "1",
|
"AMSS_CUDA_AMR_RESTRICT_BATCH": "1",
|
||||||
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "1",
|
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "1",
|
||||||
})
|
})
|
||||||
|
if getattr(input_data, "basic_grid_set", "") == "Shell-Patch":
|
||||||
|
defaults.update({
|
||||||
|
"AMSS_CUDA_AWARE_MPI": "0",
|
||||||
|
"AMSS_SHELL_FAST_INTERP": "1",
|
||||||
|
"AMSS_SHELL_PARALLEL_INTERP": "1",
|
||||||
|
"AMSS_SHELL_INTERP_THREADS": "16",
|
||||||
|
})
|
||||||
if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"):
|
if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"):
|
||||||
defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1"
|
defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1"
|
||||||
if getattr(input_data, "Equation_Class", "") == "Z4C":
|
if getattr(input_data, "Equation_Class", "") == "Z4C":
|
||||||
@@ -299,6 +308,13 @@ def run_ABE():
|
|||||||
mpi_processes = int(z4c_env_np)
|
mpi_processes = int(z4c_env_np)
|
||||||
elif mpi_processes < 4:
|
elif mpi_processes < 4:
|
||||||
mpi_processes = 4
|
mpi_processes = 4
|
||||||
|
if (input_data.GPU_Calculation == "yes" and
|
||||||
|
getattr(input_data, "basic_grid_set", "") == "Shell-Patch"):
|
||||||
|
shell_env_np = os.environ.get("AMSS_SHELL_GPU_MPI_PROCESSES")
|
||||||
|
if shell_env_np and int(shell_env_np) > 0:
|
||||||
|
mpi_processes = int(shell_env_np)
|
||||||
|
elif mpi_processes < 4:
|
||||||
|
mpi_processes = 4
|
||||||
|
|
||||||
if (input_data.GPU_Calculation == "no"):
|
if (input_data.GPU_Calculation == "no"):
|
||||||
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(mpi_processes) + " ./ABE"
|
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(mpi_processes) + " ./ABE"
|
||||||
@@ -330,6 +346,9 @@ def run_ABE():
|
|||||||
print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
|
print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
|
||||||
print(f" AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}")
|
print(f" AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}")
|
||||||
print(f" AMSS_CUDA_UNCACHED_DEVICE_BUFFERS={mpi_env.get('AMSS_CUDA_UNCACHED_DEVICE_BUFFERS', '')}")
|
print(f" AMSS_CUDA_UNCACHED_DEVICE_BUFFERS={mpi_env.get('AMSS_CUDA_UNCACHED_DEVICE_BUFFERS', '')}")
|
||||||
|
print(f" AMSS_SHELL_FAST_INTERP={mpi_env.get('AMSS_SHELL_FAST_INTERP', '')}")
|
||||||
|
print(f" AMSS_SHELL_PARALLEL_INTERP={mpi_env.get('AMSS_SHELL_PARALLEL_INTERP', '')}")
|
||||||
|
print(f" AMSS_SHELL_INTERP_THREADS={mpi_env.get('AMSS_SHELL_INTERP_THREADS', '')}")
|
||||||
print(f" AMSS_Z4C_CUDA_RESIDENT={mpi_env.get('AMSS_Z4C_CUDA_RESIDENT', '')}")
|
print(f" AMSS_Z4C_CUDA_RESIDENT={mpi_env.get('AMSS_Z4C_CUDA_RESIDENT', '')}")
|
||||||
print(f" AMSS_CONSTRAINT_OUT_EVERY={mpi_env.get('AMSS_CONSTRAINT_OUT_EVERY', '')}")
|
print(f" AMSS_CONSTRAINT_OUT_EVERY={mpi_env.get('AMSS_CONSTRAINT_OUT_EVERY', '')}")
|
||||||
if "CUDA_MPS_PIPE_DIRECTORY" in mpi_env:
|
if "CUDA_MPS_PIPE_DIRECTORY" in mpi_env:
|
||||||
|
|||||||
Reference in New Issue
Block a user