Add batched CUDA patch interpolation path
This commit is contained in:
@@ -10,21 +10,41 @@
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
#include "misc.h"
|
||||
#include "MPatch.h"
|
||||
#include "Parallel.h"
|
||||
#include "fmisc.h"
|
||||
#ifdef INTERP_LB_PROFILE
|
||||
#include "interp_lb_profile.h"
|
||||
#endif
|
||||
|
||||
namespace
|
||||
{
|
||||
struct InterpBlockView
|
||||
{
|
||||
Block *bp;
|
||||
double llb[dim];
|
||||
double uub[dim];
|
||||
#include "misc.h"
|
||||
#include "MPatch.h"
|
||||
#include "Parallel.h"
|
||||
#include "fmisc.h"
|
||||
#include "bssn_cuda_ops.h"
|
||||
#ifdef INTERP_LB_PROFILE
|
||||
#include "interp_lb_profile.h"
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
extern int bssn_cuda_interp_points_batch(const int *ex,
|
||||
const double *X, const double *Y, const double *Z,
|
||||
const double *const *fields,
|
||||
const double *soa_flat,
|
||||
int num_var,
|
||||
const double *px, const double *py, const double *pz,
|
||||
int num_points,
|
||||
int ordn,
|
||||
int symmetry,
|
||||
double *out) __attribute__((weak));
|
||||
#endif
|
||||
|
||||
namespace
|
||||
{
|
||||
struct InterpVarDesc
|
||||
{
|
||||
int sgfn;
|
||||
double soa[dim];
|
||||
};
|
||||
|
||||
struct InterpBlockView
|
||||
{
|
||||
Block *bp;
|
||||
double llb[dim];
|
||||
double uub[dim];
|
||||
};
|
||||
|
||||
struct BlockBinIndex
|
||||
@@ -154,10 +174,10 @@ void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
|
||||
index.valid = true;
|
||||
}
|
||||
|
||||
int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
|
||||
{
|
||||
if (!index.valid)
|
||||
return -1;
|
||||
int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
|
||||
{
|
||||
if (!index.valid)
|
||||
return -1;
|
||||
|
||||
const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
|
||||
const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
|
||||
@@ -175,10 +195,151 @@ int find_block_index_for_point(const BlockBinIndex &index, const double *pox, co
|
||||
for (size_t bi = 0; bi < index.views.size(); bi++)
|
||||
if (point_in_block_view(index.views[bi], pox, DH))
|
||||
return int(bi);
|
||||
|
||||
return -1;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
void collect_interp_vars(MyList<var> *VarList, vector<InterpVarDesc> &vars)
|
||||
{
|
||||
vars.clear();
|
||||
MyList<var> *varl = VarList;
|
||||
while (varl)
|
||||
{
|
||||
InterpVarDesc desc;
|
||||
desc.sgfn = varl->data->sgfn;
|
||||
for (int d = 0; d < dim; ++d)
|
||||
desc.soa[d] = varl->data->SoA[d];
|
||||
vars.push_back(desc);
|
||||
varl = varl->next;
|
||||
}
|
||||
}
|
||||
|
||||
bool should_try_cuda_interp(int ordn, int num_points, int num_var)
|
||||
{
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
if (!bssn_cuda_interp_points_batch)
|
||||
return false;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
if (ordn != 6)
|
||||
return false;
|
||||
if (num_points < 32)
|
||||
return false;
|
||||
return num_points * num_var >= 256;
|
||||
}
|
||||
|
||||
bool run_cuda_interp_for_block(Block *BP,
|
||||
const vector<InterpVarDesc> &vars,
|
||||
const vector<int> &point_ids,
|
||||
double **XX,
|
||||
double *Shellf,
|
||||
int num_var,
|
||||
int ordn,
|
||||
int Symmetry)
|
||||
{
|
||||
if (!should_try_cuda_interp(ordn, static_cast<int>(point_ids.size()), num_var))
|
||||
return false;
|
||||
|
||||
vector<const double *> field_ptrs(num_var);
|
||||
vector<double> soa_flat(3 * num_var);
|
||||
for (int v = 0; v < num_var; ++v)
|
||||
{
|
||||
field_ptrs[v] = BP->fgfs[vars[v].sgfn];
|
||||
for (int d = 0; d < dim; ++d)
|
||||
soa_flat[3 * v + d] = vars[v].soa[d];
|
||||
}
|
||||
|
||||
const int npts = static_cast<int>(point_ids.size());
|
||||
vector<double> px(npts), py(npts), pz(npts);
|
||||
for (int p = 0; p < npts; ++p)
|
||||
{
|
||||
const int j = point_ids[p];
|
||||
px[p] = XX[0][j];
|
||||
py[p] = XX[1][j];
|
||||
pz[p] = XX[2][j];
|
||||
}
|
||||
|
||||
vector<double> out(static_cast<size_t>(npts) * static_cast<size_t>(num_var));
|
||||
if (bssn_cuda_interp_points_batch(BP->shape,
|
||||
BP->X[0], BP->X[1], BP->X[2],
|
||||
field_ptrs.data(),
|
||||
soa_flat.data(),
|
||||
num_var,
|
||||
px.data(), py.data(), pz.data(),
|
||||
npts,
|
||||
ordn,
|
||||
Symmetry,
|
||||
out.data()) != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int p = 0; p < npts; ++p)
|
||||
{
|
||||
const int j = point_ids[p];
|
||||
memcpy(Shellf + j * num_var, out.data() + p * num_var, sizeof(double) * num_var);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void run_cpu_interp_for_block(Block *BP,
|
||||
const vector<InterpVarDesc> &vars,
|
||||
const vector<int> &point_ids,
|
||||
double **XX,
|
||||
double *Shellf,
|
||||
int num_var,
|
||||
int ordn,
|
||||
int Symmetry)
|
||||
{
|
||||
for (size_t p = 0; p < point_ids.size(); ++p)
|
||||
{
|
||||
const int j = point_ids[p];
|
||||
double x = XX[0][j];
|
||||
double y = XX[1][j];
|
||||
double z = XX[2][j];
|
||||
int ordn_local = ordn;
|
||||
int symmetry_local = Symmetry;
|
||||
for (int v = 0; v < num_var; ++v)
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2],
|
||||
BP->fgfs[vars[v].sgfn], Shellf[j * num_var + v],
|
||||
x, y, z, ordn_local, const_cast<double *>(vars[v].soa), symmetry_local);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void interpolate_owned_points(MyList<var> *VarList,
|
||||
int NN, double **XX,
|
||||
double *Shellf, int Symmetry,
|
||||
int myrank, int ordn,
|
||||
const BlockBinIndex &block_index,
|
||||
const int *owner_rank,
|
||||
const int *owner_block)
|
||||
{
|
||||
vector<InterpVarDesc> vars;
|
||||
collect_interp_vars(VarList, vars);
|
||||
const int num_var = static_cast<int>(vars.size());
|
||||
vector<vector<int>> block_points(block_index.views.size());
|
||||
|
||||
for (int j = 0; j < NN; ++j)
|
||||
{
|
||||
if (owner_rank[j] == myrank && owner_block[j] >= 0)
|
||||
block_points[owner_block[j]].push_back(j);
|
||||
}
|
||||
|
||||
for (size_t bi = 0; bi < block_points.size(); ++bi)
|
||||
{
|
||||
if (block_points[bi].empty())
|
||||
continue;
|
||||
|
||||
Block *BP = block_index.views[bi].bp;
|
||||
bool done = run_cuda_interp_for_block(BP, vars, block_points[bi], XX, Shellf, num_var, ordn, Symmetry);
|
||||
if (!done)
|
||||
run_cpu_interp_for_block(BP, vars, block_points[bi], XX, Shellf, num_var, ordn, Symmetry);
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
|
||||
{
|
||||
@@ -523,12 +684,17 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
|
||||
memset(Shellf, 0, sizeof(double) * NN * num_var);
|
||||
|
||||
// owner_rank[j] records which MPI rank owns point j
|
||||
// All ranks traverse the same block list so they all agree on ownership
|
||||
int *owner_rank;
|
||||
owner_rank = new int[NN];
|
||||
for (int j = 0; j < NN; j++)
|
||||
owner_rank[j] = -1;
|
||||
// owner_rank[j] records which MPI rank owns point j
|
||||
// All ranks traverse the same block list so they all agree on ownership
|
||||
int *owner_rank;
|
||||
owner_rank = new int[NN];
|
||||
int *owner_block;
|
||||
owner_block = new int[NN];
|
||||
for (int j = 0; j < NN; j++)
|
||||
{
|
||||
owner_rank[j] = -1;
|
||||
owner_block[j] = -1;
|
||||
}
|
||||
|
||||
double DH[dim];
|
||||
for (int i = 0; i < dim; i++)
|
||||
@@ -558,25 +724,15 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
}
|
||||
|
||||
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
||||
if (block_i >= 0)
|
||||
{
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
//---> interpolation
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl) // run along variables
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (block_i >= 0)
|
||||
{
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
owner_rank[j] = BP->rank;
|
||||
owner_block[j] = block_i;
|
||||
}
|
||||
}
|
||||
|
||||
interpolate_owned_points(VarList, NN, XX, Shellf, Symmetry, myrank, ordn, block_index, owner_rank, owner_block);
|
||||
|
||||
// Replace MPI_Allreduce with per-owner MPI_Bcast:
|
||||
// Group consecutive points by owner rank and broadcast each group.
|
||||
@@ -631,9 +787,10 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
|
||||
}
|
||||
}
|
||||
|
||||
delete[] owner_rank;
|
||||
}
|
||||
|
||||
delete[] owner_rank;
|
||||
delete[] owner_block;
|
||||
}
|
||||
void Patch::Interp_Points(MyList<var> *VarList,
|
||||
int NN, double **XX,
|
||||
double *Shellf, int Symmetry,
|
||||
@@ -661,11 +818,16 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
|
||||
memset(Shellf, 0, sizeof(double) * NN * num_var);
|
||||
|
||||
// owner_rank[j] records which MPI rank owns point j
|
||||
int *owner_rank;
|
||||
owner_rank = new int[NN];
|
||||
for (int j = 0; j < NN; j++)
|
||||
owner_rank[j] = -1;
|
||||
// owner_rank[j] records which MPI rank owns point j
|
||||
int *owner_rank;
|
||||
owner_rank = new int[NN];
|
||||
int *owner_block;
|
||||
owner_block = new int[NN];
|
||||
for (int j = 0; j < NN; j++)
|
||||
{
|
||||
owner_rank[j] = -1;
|
||||
owner_block[j] = -1;
|
||||
}
|
||||
|
||||
double DH[dim];
|
||||
for (int i = 0; i < dim; i++)
|
||||
@@ -696,24 +858,15 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
}
|
||||
|
||||
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
||||
if (block_i >= 0)
|
||||
{
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl)
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (block_i >= 0)
|
||||
{
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
owner_rank[j] = BP->rank;
|
||||
owner_block[j] = block_i;
|
||||
}
|
||||
}
|
||||
|
||||
interpolate_owned_points(VarList, NN, XX, Shellf, Symmetry, myrank, ordn, block_index, owner_rank, owner_block);
|
||||
|
||||
#ifdef INTERP_LB_PROFILE
|
||||
double t_interp_end = MPI_Wtime();
|
||||
@@ -873,9 +1026,10 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
delete[] send_offset;
|
||||
delete[] recv_offset;
|
||||
delete[] send_count;
|
||||
delete[] recv_count;
|
||||
delete[] consumer_rank;
|
||||
delete[] owner_rank;
|
||||
delete[] recv_count;
|
||||
delete[] consumer_rank;
|
||||
delete[] owner_rank;
|
||||
delete[] owner_block;
|
||||
|
||||
#ifdef INTERP_LB_PROFILE
|
||||
{
|
||||
@@ -923,11 +1077,16 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
|
||||
memset(Shellf, 0, sizeof(double) * NN * num_var);
|
||||
|
||||
// owner_rank[j] stores the global rank that owns point j
|
||||
int *owner_rank;
|
||||
owner_rank = new int[NN];
|
||||
for (int j = 0; j < NN; j++)
|
||||
owner_rank[j] = -1;
|
||||
// owner_rank[j] stores the global rank that owns point j
|
||||
int *owner_rank;
|
||||
owner_rank = new int[NN];
|
||||
int *owner_block;
|
||||
owner_block = new int[NN];
|
||||
for (int j = 0; j < NN; j++)
|
||||
{
|
||||
owner_rank[j] = -1;
|
||||
owner_block[j] = -1;
|
||||
}
|
||||
|
||||
// Build global-to-local rank translation for Comm_here
|
||||
MPI_Group world_group, local_group;
|
||||
@@ -962,25 +1121,15 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
}
|
||||
|
||||
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
||||
if (block_i >= 0)
|
||||
{
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
//---> interpolation
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl) // run along variables
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (block_i >= 0)
|
||||
{
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
owner_rank[j] = BP->rank;
|
||||
owner_block[j] = block_i;
|
||||
}
|
||||
}
|
||||
|
||||
interpolate_owned_points(VarList, NN, XX, Shellf, Symmetry, myrank, ordn, block_index, owner_rank, owner_block);
|
||||
|
||||
// Collect unique global owner ranks and translate to local ranks in Comm_here
|
||||
// Then broadcast each owner's points via MPI_Bcast on Comm_here
|
||||
@@ -1008,10 +1157,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Group_free(&world_group);
|
||||
MPI_Group_free(&local_group);
|
||||
delete[] owner_rank;
|
||||
}
|
||||
MPI_Group_free(&world_group);
|
||||
MPI_Group_free(&local_group);
|
||||
delete[] owner_rank;
|
||||
delete[] owner_block;
|
||||
}
|
||||
void Patch::checkBlock()
|
||||
{
|
||||
int myrank;
|
||||
|
||||
@@ -42,6 +42,12 @@ struct CachedBuffer
|
||||
size_t capacity = 0;
|
||||
};
|
||||
|
||||
struct CachedIntBuffer
|
||||
{
|
||||
int *ptr = nullptr;
|
||||
size_t capacity = 0;
|
||||
};
|
||||
|
||||
inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
|
||||
{
|
||||
if (bytes <= buffer.capacity && buffer.ptr)
|
||||
@@ -67,6 +73,31 @@ inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool ensure_capacity(CachedIntBuffer &buffer, size_t bytes)
|
||||
{
|
||||
if (bytes <= buffer.capacity && buffer.ptr)
|
||||
return true;
|
||||
|
||||
if (buffer.ptr)
|
||||
{
|
||||
cudaError_t free_err = cudaFree(buffer.ptr);
|
||||
if (free_err != cudaSuccess)
|
||||
report_cuda_error("cudaFree", free_err);
|
||||
buffer.ptr = nullptr;
|
||||
buffer.capacity = 0;
|
||||
}
|
||||
|
||||
cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMalloc", err);
|
||||
return false;
|
||||
}
|
||||
|
||||
buffer.capacity = bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes)
|
||||
{
|
||||
if (!ensure_capacity(dst, bytes))
|
||||
@@ -264,6 +295,148 @@ __global__ void prolong3_cell_kernel(const double *funcc, double *funf,
|
||||
}
|
||||
}
|
||||
|
||||
__device__ inline int interp_idint_like(double x)
|
||||
{
|
||||
return static_cast<int>(x);
|
||||
}
|
||||
|
||||
__device__ inline void lagrange_weights_ord6(double x, double *w)
|
||||
{
|
||||
const double denom[6] = {-120.0, 24.0, -12.0, 12.0, -24.0, 120.0};
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 6; ++i)
|
||||
{
|
||||
double num = 1.0;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 6; ++j)
|
||||
{
|
||||
if (j != i)
|
||||
num *= (x - static_cast<double>(j));
|
||||
}
|
||||
w[i] = num / denom[i];
|
||||
}
|
||||
}
|
||||
|
||||
__device__ inline bool map_interp_index(int logical_idx, int n, double soa, int *mapped_idx, double *sign)
|
||||
{
|
||||
int idx = logical_idx;
|
||||
if (idx < 0)
|
||||
{
|
||||
idx = -idx;
|
||||
*sign *= soa;
|
||||
}
|
||||
if (idx < 0 || idx >= n)
|
||||
return false;
|
||||
*mapped_idx = idx;
|
||||
return true;
|
||||
}
|
||||
|
||||
__device__ inline bool compute_interp_window(double x, const double *coord, int n,
|
||||
int ordn, int allow_reflect,
|
||||
int *start_idx, double *cx)
|
||||
{
|
||||
const double dx = coord[1] - coord[0];
|
||||
const int center = interp_idint_like((x - coord[0]) / dx + 0.4);
|
||||
const int cmin = allow_reflect ? (-ordn / 2 + 1) : 0;
|
||||
const int cmax = n - 1;
|
||||
|
||||
int begin = center - ordn / 2 + 1;
|
||||
int end = begin + ordn - 1;
|
||||
if (begin < cmin)
|
||||
{
|
||||
begin = cmin;
|
||||
end = begin + ordn - 1;
|
||||
}
|
||||
if (end > cmax)
|
||||
{
|
||||
end = cmax;
|
||||
begin = end + 1 - ordn;
|
||||
}
|
||||
|
||||
if (begin >= 0)
|
||||
*cx = (x - coord[begin]) / dx;
|
||||
else
|
||||
*cx = (x + coord[-begin]) / dx;
|
||||
|
||||
*start_idx = begin;
|
||||
return (begin >= cmin && end <= cmax);
|
||||
}
|
||||
|
||||
__global__ void interp_points_ord6_kernel(int num_points, int num_var,
|
||||
int nx, int ny, int nz,
|
||||
const double *X, const double *Y, const double *Z,
|
||||
const double *const *fields,
|
||||
const double *soa_flat,
|
||||
const double *px, const double *py, const double *pz,
|
||||
int symmetry,
|
||||
double *out,
|
||||
int *error_flag)
|
||||
{
|
||||
const int total = num_points * num_var;
|
||||
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += blockDim.x * gridDim.x)
|
||||
{
|
||||
const int point_id = idx / num_var;
|
||||
const int var_id = idx - point_id * num_var;
|
||||
const double *field = fields[var_id];
|
||||
const double *soa = soa_flat + 3 * var_id;
|
||||
|
||||
const double dx = X[1] - X[0];
|
||||
const double dy = Y[1] - Y[0];
|
||||
const double dz = Z[1] - Z[0];
|
||||
const int allow_reflect_x = (symmetry == 2 && fabs(X[0]) < dx);
|
||||
const int allow_reflect_y = (symmetry == 2 && fabs(Y[0]) < dy);
|
||||
const int allow_reflect_z = (symmetry != 0 && fabs(Z[0]) < dz);
|
||||
|
||||
int start_x = 0, start_y = 0, start_z = 0;
|
||||
double cx = 0.0, cy = 0.0, cz = 0.0;
|
||||
const bool ok_x = compute_interp_window(px[point_id], X, nx, 6, allow_reflect_x, &start_x, &cx);
|
||||
const bool ok_y = compute_interp_window(py[point_id], Y, ny, 6, allow_reflect_y, &start_y, &cy);
|
||||
const bool ok_z = compute_interp_window(pz[point_id], Z, nz, 6, allow_reflect_z, &start_z, &cz);
|
||||
if (!ok_x || !ok_y || !ok_z)
|
||||
{
|
||||
atomicExch(error_flag, 1);
|
||||
out[idx] = 0.0;
|
||||
continue;
|
||||
}
|
||||
|
||||
double wx[6], wy[6], wz[6];
|
||||
lagrange_weights_ord6(cx, wx);
|
||||
lagrange_weights_ord6(cy, wy);
|
||||
lagrange_weights_ord6(cz, wz);
|
||||
|
||||
double value = 0.0;
|
||||
#pragma unroll
|
||||
for (int kz = 0; kz < 6; ++kz)
|
||||
{
|
||||
double yz_sum = 0.0;
|
||||
#pragma unroll
|
||||
for (int jy = 0; jy < 6; ++jy)
|
||||
{
|
||||
double x_sum = 0.0;
|
||||
#pragma unroll
|
||||
for (int ix = 0; ix < 6; ++ix)
|
||||
{
|
||||
double sign = 1.0;
|
||||
int sx = 0, sy = 0, sz = 0;
|
||||
const bool ok_map =
|
||||
map_interp_index(start_x + ix, nx, soa[0], &sx, &sign) &&
|
||||
map_interp_index(start_y + jy, ny, soa[1], &sy, &sign) &&
|
||||
map_interp_index(start_z + kz, nz, soa[2], &sz, &sign);
|
||||
if (!ok_map)
|
||||
{
|
||||
atomicExch(error_flag, 1);
|
||||
continue;
|
||||
}
|
||||
x_sum += wx[ix] * sign * field[index3(sx, sy, sz, nx, ny)];
|
||||
}
|
||||
yz_sum += wy[jy] * x_sum;
|
||||
}
|
||||
value += wz[kz] * yz_sum;
|
||||
}
|
||||
out[idx] = value;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void rk4_kernel(int n, double dT,
|
||||
const double *f0,
|
||||
double *f1,
|
||||
@@ -771,6 +944,168 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
int bssn_cuda_interp_points_batch(const int *ex,
|
||||
const double *X, const double *Y, const double *Z,
|
||||
const double *const *fields,
|
||||
const double *soa_flat,
|
||||
int num_var,
|
||||
const double *px, const double *py, const double *pz,
|
||||
int num_points,
|
||||
int ordn,
|
||||
int symmetry,
|
||||
double *out)
|
||||
{
|
||||
if (!ex || !X || !Y || !Z || !fields || !soa_flat || !px || !py || !pz || !out)
|
||||
return 1;
|
||||
if (num_var <= 0 || num_points <= 0 || ordn != 6)
|
||||
return 1;
|
||||
if (ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
|
||||
return 1;
|
||||
|
||||
struct InterpBatchCache
|
||||
{
|
||||
CachedBuffer X, Y, Z;
|
||||
CachedBuffer px, py, pz;
|
||||
CachedBuffer out;
|
||||
CachedBuffer soa;
|
||||
CachedBuffer field_ptrs;
|
||||
CachedIntBuffer error_flag;
|
||||
std::vector<CachedBuffer> host_field_copies;
|
||||
const double *host_X = nullptr;
|
||||
const double *host_Y = nullptr;
|
||||
const double *host_Z = nullptr;
|
||||
int nx = 0;
|
||||
int ny = 0;
|
||||
int nz = 0;
|
||||
};
|
||||
static thread_local InterpBatchCache cache;
|
||||
|
||||
const int nx = ex[0];
|
||||
const int ny = ex[1];
|
||||
const int nz = ex[2];
|
||||
const int field_points = count_points(ex);
|
||||
const size_t coord_bytes_x = static_cast<size_t>(nx) * sizeof(double);
|
||||
const size_t coord_bytes_y = static_cast<size_t>(ny) * sizeof(double);
|
||||
const size_t coord_bytes_z = static_cast<size_t>(nz) * sizeof(double);
|
||||
const size_t field_bytes = static_cast<size_t>(field_points) * sizeof(double);
|
||||
const size_t point_bytes = static_cast<size_t>(num_points) * sizeof(double);
|
||||
const size_t out_bytes = static_cast<size_t>(num_points) * static_cast<size_t>(num_var) * sizeof(double);
|
||||
const size_t soa_bytes = static_cast<size_t>(3 * num_var) * sizeof(double);
|
||||
const size_t ptr_bytes = static_cast<size_t>(num_var) * sizeof(double *);
|
||||
|
||||
bool ok = true;
|
||||
if (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
|
||||
cache.nx != nx || cache.ny != ny || cache.nz != nz)
|
||||
{
|
||||
ok = copy_to_device(cache.X, X, coord_bytes_x) &&
|
||||
copy_to_device(cache.Y, Y, coord_bytes_y) &&
|
||||
copy_to_device(cache.Z, Z, coord_bytes_z);
|
||||
if (ok)
|
||||
{
|
||||
cache.host_X = X;
|
||||
cache.host_Y = Y;
|
||||
cache.host_Z = Z;
|
||||
cache.nx = nx;
|
||||
cache.ny = ny;
|
||||
cache.nz = nz;
|
||||
}
|
||||
}
|
||||
|
||||
ok = ok &&
|
||||
copy_to_device(cache.px, px, point_bytes) &&
|
||||
copy_to_device(cache.py, py, point_bytes) &&
|
||||
copy_to_device(cache.pz, pz, point_bytes) &&
|
||||
copy_to_device(cache.soa, soa_flat, soa_bytes) &&
|
||||
ensure_capacity(cache.out, out_bytes) &&
|
||||
ensure_capacity(cache.field_ptrs, ptr_bytes) &&
|
||||
ensure_capacity(cache.error_flag, sizeof(int));
|
||||
if (!ok)
|
||||
return 1;
|
||||
|
||||
if (static_cast<int>(cache.host_field_copies.size()) < num_var)
|
||||
cache.host_field_copies.resize(num_var);
|
||||
|
||||
std::vector<const double *> device_fields(num_var);
|
||||
for (int v = 0; v < num_var; ++v)
|
||||
{
|
||||
const double *device_field = bssn_gpu_find_device_buffer(fields[v]);
|
||||
if (!device_field)
|
||||
{
|
||||
ok = copy_to_device(cache.host_field_copies[v], fields[v], field_bytes);
|
||||
device_field = cache.host_field_copies[v].ptr;
|
||||
}
|
||||
device_fields[v] = device_field;
|
||||
if (!ok || !device_fields[v])
|
||||
return 1;
|
||||
}
|
||||
|
||||
int zero = 0;
|
||||
cudaError_t err = cudaMemcpy(cache.field_ptrs.ptr, device_fields.data(), ptr_bytes, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(H2D) field_ptrs", err);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(H2D) interp_error_flag", err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
dim3 block(128);
|
||||
dim3 grid(div_up(num_points * num_var, static_cast<int>(block.x)));
|
||||
if (grid.x > 4096)
|
||||
grid.x = 4096;
|
||||
|
||||
int nx_local = nx;
|
||||
int ny_local = ny;
|
||||
int nz_local = nz;
|
||||
const double *dX = cache.X.ptr;
|
||||
const double *dY = cache.Y.ptr;
|
||||
const double *dZ = cache.Z.ptr;
|
||||
const double *dpx = cache.px.ptr;
|
||||
const double *dpy = cache.py.ptr;
|
||||
const double *dpz = cache.pz.ptr;
|
||||
const double *dsoa = cache.soa.ptr;
|
||||
const double *const *dfields = reinterpret_cast<const double *const *>(cache.field_ptrs.ptr);
|
||||
double *dout = cache.out.ptr;
|
||||
int *derror = cache.error_flag.ptr;
|
||||
|
||||
void *args[] = {
|
||||
&num_points, &num_var,
|
||||
&nx_local, &ny_local, &nz_local,
|
||||
&dX, &dY, &dZ,
|
||||
&dfields,
|
||||
&dsoa,
|
||||
&dpx, &dpy, &dpz,
|
||||
&symmetry,
|
||||
&dout,
|
||||
&derror};
|
||||
ok = launch_kernel(grid, block, (const void *)interp_points_ord6_kernel, args);
|
||||
if (!ok)
|
||||
return 1;
|
||||
|
||||
int error_flag = 0;
|
||||
err = cudaMemcpy(&error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(D2H) interp_error_flag", err);
|
||||
return 1;
|
||||
}
|
||||
if (error_flag != 0)
|
||||
return 1;
|
||||
|
||||
err = cudaMemcpy(out, cache.out.ptr, out_bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(D2H) interp_out", err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bssn_cuda_prolong3_pack(int wei,
|
||||
const double *llbc, const double *uubc, const int *extc, const double *func,
|
||||
const double *llbf, const double *uubf, const int *extf, double *funf,
|
||||
|
||||
@@ -29,4 +29,15 @@ int bssn_cuda_prolong3_pack(int wei,
|
||||
const double *llbp, const double *uubp,
|
||||
const double *SoA, int symmetry);
|
||||
|
||||
int bssn_cuda_interp_points_batch(const int *ex,
|
||||
const double *X, const double *Y, const double *Z,
|
||||
const double *const *fields,
|
||||
const double *soa_flat,
|
||||
int num_var,
|
||||
const double *px, const double *py, const double *pz,
|
||||
int num_points,
|
||||
int ordn,
|
||||
int symmetry,
|
||||
double *out);
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user