From c47349b7a913fd452b46f8edab2ff926c4f6da5e Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 9 Apr 2026 14:56:01 +0800 Subject: [PATCH] Add batched CUDA patch interpolation path --- AMSS_NCKU_source/MPatch.C | 360 +++++++++++++++++++++--------- AMSS_NCKU_source/bssn_cuda_ops.cu | 335 +++++++++++++++++++++++++++ AMSS_NCKU_source/bssn_cuda_ops.h | 11 + 3 files changed, 601 insertions(+), 105 deletions(-) diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C index 956e9c8..18fe7a4 100644 --- a/AMSS_NCKU_source/MPatch.C +++ b/AMSS_NCKU_source/MPatch.C @@ -10,21 +10,41 @@ #include using namespace std; -#include "misc.h" -#include "MPatch.h" -#include "Parallel.h" -#include "fmisc.h" -#ifdef INTERP_LB_PROFILE -#include "interp_lb_profile.h" -#endif - -namespace -{ -struct InterpBlockView -{ - Block *bp; - double llb[dim]; - double uub[dim]; +#include "misc.h" +#include "MPatch.h" +#include "Parallel.h" +#include "fmisc.h" +#include "bssn_cuda_ops.h" +#ifdef INTERP_LB_PROFILE +#include "interp_lb_profile.h" +#endif + +#if defined(__GNUC__) || defined(__clang__) +extern int bssn_cuda_interp_points_batch(const int *ex, + const double *X, const double *Y, const double *Z, + const double *const *fields, + const double *soa_flat, + int num_var, + const double *px, const double *py, const double *pz, + int num_points, + int ordn, + int symmetry, + double *out) __attribute__((weak)); +#endif + +namespace +{ +struct InterpVarDesc +{ + int sgfn; + double soa[dim]; +}; + +struct InterpBlockView +{ + Block *bp; + double llb[dim]; + double uub[dim]; }; struct BlockBinIndex @@ -154,10 +174,10 @@ void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index) index.valid = true; } -int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH) -{ - if (!index.valid) - return -1; +int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH) +{ + if (!index.valid) + return -1; const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]); const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]); @@ -175,10 +195,151 @@ int find_block_index_for_point(const BlockBinIndex &index, const double *pox, co for (size_t bi = 0; bi < index.views.size(); bi++) if (point_in_block_view(index.views[bi], pox, DH)) return int(bi); - - return -1; -} -} // namespace + + return -1; +} + +void collect_interp_vars(MyList *VarList, vector &vars) +{ + vars.clear(); + MyList *varl = VarList; + while (varl) + { + InterpVarDesc desc; + desc.sgfn = varl->data->sgfn; + for (int d = 0; d < dim; ++d) + desc.soa[d] = varl->data->SoA[d]; + vars.push_back(desc); + varl = varl->next; + } +} + +bool should_try_cuda_interp(int ordn, int num_points, int num_var) +{ +#if defined(__GNUC__) || defined(__clang__) + if (!bssn_cuda_interp_points_batch) + return false; +#else + return false; +#endif + if (ordn != 6) + return false; + if (num_points < 32) + return false; + return num_points * num_var >= 256; +} + +bool run_cuda_interp_for_block(Block *BP, + const vector &vars, + const vector &point_ids, + double **XX, + double *Shellf, + int num_var, + int ordn, + int Symmetry) +{ + if (!should_try_cuda_interp(ordn, static_cast(point_ids.size()), num_var)) + return false; + + vector field_ptrs(num_var); + vector soa_flat(3 * num_var); + for (int v = 0; v < num_var; ++v) + { + field_ptrs[v] = BP->fgfs[vars[v].sgfn]; + for (int d = 0; d < dim; ++d) + soa_flat[3 * v + d] = vars[v].soa[d]; + } + + const int npts = static_cast(point_ids.size()); + vector px(npts), py(npts), pz(npts); + for (int p = 0; p < npts; ++p) + { + const int j = point_ids[p]; + px[p] = XX[0][j]; + py[p] = XX[1][j]; + pz[p] = XX[2][j]; + } + + vector out(static_cast(npts) * static_cast(num_var)); + if (bssn_cuda_interp_points_batch(BP->shape, + BP->X[0], BP->X[1], BP->X[2], + field_ptrs.data(), + soa_flat.data(), + num_var, + px.data(), py.data(), pz.data(), + npts, + ordn, + Symmetry, + out.data()) != 0) + { + return false; + } + + for (int p = 0; p < npts; ++p) + { + const int j = point_ids[p]; + memcpy(Shellf + j * num_var, out.data() + p * num_var, sizeof(double) * num_var); + } + return true; +} + +void run_cpu_interp_for_block(Block *BP, + const vector &vars, + const vector &point_ids, + double **XX, + double *Shellf, + int num_var, + int ordn, + int Symmetry) +{ + for (size_t p = 0; p < point_ids.size(); ++p) + { + const int j = point_ids[p]; + double x = XX[0][j]; + double y = XX[1][j]; + double z = XX[2][j]; + int ordn_local = ordn; + int symmetry_local = Symmetry; + for (int v = 0; v < num_var; ++v) + { + f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], + BP->fgfs[vars[v].sgfn], Shellf[j * num_var + v], + x, y, z, ordn_local, const_cast(vars[v].soa), symmetry_local); + } + } +} + +void interpolate_owned_points(MyList *VarList, + int NN, double **XX, + double *Shellf, int Symmetry, + int myrank, int ordn, + const BlockBinIndex &block_index, + const int *owner_rank, + const int *owner_block) +{ + vector vars; + collect_interp_vars(VarList, vars); + const int num_var = static_cast(vars.size()); + vector> block_points(block_index.views.size()); + + for (int j = 0; j < NN; ++j) + { + if (owner_rank[j] == myrank && owner_block[j] >= 0) + block_points[owner_block[j]].push_back(j); + } + + for (size_t bi = 0; bi < block_points.size(); ++bi) + { + if (block_points[bi].empty()) + continue; + + Block *BP = block_index.views[bi].bp; + bool done = run_cuda_interp_for_block(BP, vars, block_points[bi], XX, Shellf, num_var, ordn, Symmetry); + if (!done) + run_cpu_interp_for_block(BP, vars, block_points[bi], XX, Shellf, num_var, ordn, Symmetry); + } +} +} // namespace Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi) { @@ -523,12 +684,17 @@ void Patch::Interp_Points(MyList *VarList, memset(Shellf, 0, sizeof(double) * NN * num_var); - // owner_rank[j] records which MPI rank owns point j - // All ranks traverse the same block list so they all agree on ownership - int *owner_rank; - owner_rank = new int[NN]; - for (int j = 0; j < NN; j++) - owner_rank[j] = -1; + // owner_rank[j] records which MPI rank owns point j + // All ranks traverse the same block list so they all agree on ownership + int *owner_rank; + owner_rank = new int[NN]; + int *owner_block; + owner_block = new int[NN]; + for (int j = 0; j < NN; j++) + { + owner_rank[j] = -1; + owner_block[j] = -1; + } double DH[dim]; for (int i = 0; i < dim; i++) @@ -558,25 +724,15 @@ void Patch::Interp_Points(MyList *VarList, } const int block_i = find_block_index_for_point(block_index, pox, DH); - if (block_i >= 0) - { - Block *BP = block_index.views[block_i].bp; - owner_rank[j] = BP->rank; - if (myrank == BP->rank) - { - //---> interpolation - varl = VarList; - int k = 0; - while (varl) // run along variables - { - f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], - pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); - varl = varl->next; - k++; - } - } - } - } + if (block_i >= 0) + { + Block *BP = block_index.views[block_i].bp; + owner_rank[j] = BP->rank; + owner_block[j] = block_i; + } + } + + interpolate_owned_points(VarList, NN, XX, Shellf, Symmetry, myrank, ordn, block_index, owner_rank, owner_block); // Replace MPI_Allreduce with per-owner MPI_Bcast: // Group consecutive points by owner rank and broadcast each group. @@ -631,9 +787,10 @@ void Patch::Interp_Points(MyList *VarList, MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD); } } - - delete[] owner_rank; -} + + delete[] owner_rank; + delete[] owner_block; +} void Patch::Interp_Points(MyList *VarList, int NN, double **XX, double *Shellf, int Symmetry, @@ -661,11 +818,16 @@ void Patch::Interp_Points(MyList *VarList, memset(Shellf, 0, sizeof(double) * NN * num_var); - // owner_rank[j] records which MPI rank owns point j - int *owner_rank; - owner_rank = new int[NN]; - for (int j = 0; j < NN; j++) - owner_rank[j] = -1; + // owner_rank[j] records which MPI rank owns point j + int *owner_rank; + owner_rank = new int[NN]; + int *owner_block; + owner_block = new int[NN]; + for (int j = 0; j < NN; j++) + { + owner_rank[j] = -1; + owner_block[j] = -1; + } double DH[dim]; for (int i = 0; i < dim; i++) @@ -696,24 +858,15 @@ void Patch::Interp_Points(MyList *VarList, } const int block_i = find_block_index_for_point(block_index, pox, DH); - if (block_i >= 0) - { - Block *BP = block_index.views[block_i].bp; - owner_rank[j] = BP->rank; - if (myrank == BP->rank) - { - varl = VarList; - int k = 0; - while (varl) - { - f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], - pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); - varl = varl->next; - k++; - } - } - } - } + if (block_i >= 0) + { + Block *BP = block_index.views[block_i].bp; + owner_rank[j] = BP->rank; + owner_block[j] = block_i; + } + } + + interpolate_owned_points(VarList, NN, XX, Shellf, Symmetry, myrank, ordn, block_index, owner_rank, owner_block); #ifdef INTERP_LB_PROFILE double t_interp_end = MPI_Wtime(); @@ -873,9 +1026,10 @@ void Patch::Interp_Points(MyList *VarList, delete[] send_offset; delete[] recv_offset; delete[] send_count; - delete[] recv_count; - delete[] consumer_rank; - delete[] owner_rank; + delete[] recv_count; + delete[] consumer_rank; + delete[] owner_rank; + delete[] owner_block; #ifdef INTERP_LB_PROFILE { @@ -923,11 +1077,16 @@ void Patch::Interp_Points(MyList *VarList, memset(Shellf, 0, sizeof(double) * NN * num_var); - // owner_rank[j] stores the global rank that owns point j - int *owner_rank; - owner_rank = new int[NN]; - for (int j = 0; j < NN; j++) - owner_rank[j] = -1; + // owner_rank[j] stores the global rank that owns point j + int *owner_rank; + owner_rank = new int[NN]; + int *owner_block; + owner_block = new int[NN]; + for (int j = 0; j < NN; j++) + { + owner_rank[j] = -1; + owner_block[j] = -1; + } // Build global-to-local rank translation for Comm_here MPI_Group world_group, local_group; @@ -962,25 +1121,15 @@ void Patch::Interp_Points(MyList *VarList, } const int block_i = find_block_index_for_point(block_index, pox, DH); - if (block_i >= 0) - { - Block *BP = block_index.views[block_i].bp; - owner_rank[j] = BP->rank; - if (myrank == BP->rank) - { - //---> interpolation - varl = VarList; - int k = 0; - while (varl) // run along variables - { - f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], - pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); - varl = varl->next; - k++; - } - } - } - } + if (block_i >= 0) + { + Block *BP = block_index.views[block_i].bp; + owner_rank[j] = BP->rank; + owner_block[j] = block_i; + } + } + + interpolate_owned_points(VarList, NN, XX, Shellf, Symmetry, myrank, ordn, block_index, owner_rank, owner_block); // Collect unique global owner ranks and translate to local ranks in Comm_here // Then broadcast each owner's points via MPI_Bcast on Comm_here @@ -1008,10 +1157,11 @@ void Patch::Interp_Points(MyList *VarList, } } - MPI_Group_free(&world_group); - MPI_Group_free(&local_group); - delete[] owner_rank; -} + MPI_Group_free(&world_group); + MPI_Group_free(&local_group); + delete[] owner_rank; + delete[] owner_block; +} void Patch::checkBlock() { int myrank; diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index 68eb9ba..e292efe 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -42,6 +42,12 @@ struct CachedBuffer size_t capacity = 0; }; +struct CachedIntBuffer +{ + int *ptr = nullptr; + size_t capacity = 0; +}; + inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes) { if (bytes <= buffer.capacity && buffer.ptr) @@ -67,6 +73,31 @@ inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes) return true; } +inline bool ensure_capacity(CachedIntBuffer &buffer, size_t bytes) +{ + if (bytes <= buffer.capacity && buffer.ptr) + return true; + + if (buffer.ptr) + { + cudaError_t free_err = cudaFree(buffer.ptr); + if (free_err != cudaSuccess) + report_cuda_error("cudaFree", free_err); + buffer.ptr = nullptr; + buffer.capacity = 0; + } + + cudaError_t err = cudaMalloc(&buffer.ptr, bytes); + if (err != cudaSuccess) + { + report_cuda_error("cudaMalloc", err); + return false; + } + + buffer.capacity = bytes; + return true; +} + inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes) { if (!ensure_capacity(dst, bytes)) @@ -264,6 +295,148 @@ __global__ void prolong3_cell_kernel(const double *funcc, double *funf, } } +__device__ inline int interp_idint_like(double x) +{ + return static_cast(x); +} + +__device__ inline void lagrange_weights_ord6(double x, double *w) +{ + const double denom[6] = {-120.0, 24.0, -12.0, 12.0, -24.0, 120.0}; + #pragma unroll + for (int i = 0; i < 6; ++i) + { + double num = 1.0; + #pragma unroll + for (int j = 0; j < 6; ++j) + { + if (j != i) + num *= (x - static_cast(j)); + } + w[i] = num / denom[i]; + } +} + +__device__ inline bool map_interp_index(int logical_idx, int n, double soa, int *mapped_idx, double *sign) +{ + int idx = logical_idx; + if (idx < 0) + { + idx = -idx; + *sign *= soa; + } + if (idx < 0 || idx >= n) + return false; + *mapped_idx = idx; + return true; +} + +__device__ inline bool compute_interp_window(double x, const double *coord, int n, + int ordn, int allow_reflect, + int *start_idx, double *cx) +{ + const double dx = coord[1] - coord[0]; + const int center = interp_idint_like((x - coord[0]) / dx + 0.4); + const int cmin = allow_reflect ? (-ordn / 2 + 1) : 0; + const int cmax = n - 1; + + int begin = center - ordn / 2 + 1; + int end = begin + ordn - 1; + if (begin < cmin) + { + begin = cmin; + end = begin + ordn - 1; + } + if (end > cmax) + { + end = cmax; + begin = end + 1 - ordn; + } + + if (begin >= 0) + *cx = (x - coord[begin]) / dx; + else + *cx = (x + coord[-begin]) / dx; + + *start_idx = begin; + return (begin >= cmin && end <= cmax); +} + +__global__ void interp_points_ord6_kernel(int num_points, int num_var, + int nx, int ny, int nz, + const double *X, const double *Y, const double *Z, + const double *const *fields, + const double *soa_flat, + const double *px, const double *py, const double *pz, + int symmetry, + double *out, + int *error_flag) +{ + const int total = num_points * num_var; + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += blockDim.x * gridDim.x) + { + const int point_id = idx / num_var; + const int var_id = idx - point_id * num_var; + const double *field = fields[var_id]; + const double *soa = soa_flat + 3 * var_id; + + const double dx = X[1] - X[0]; + const double dy = Y[1] - Y[0]; + const double dz = Z[1] - Z[0]; + const int allow_reflect_x = (symmetry == 2 && fabs(X[0]) < dx); + const int allow_reflect_y = (symmetry == 2 && fabs(Y[0]) < dy); + const int allow_reflect_z = (symmetry != 0 && fabs(Z[0]) < dz); + + int start_x = 0, start_y = 0, start_z = 0; + double cx = 0.0, cy = 0.0, cz = 0.0; + const bool ok_x = compute_interp_window(px[point_id], X, nx, 6, allow_reflect_x, &start_x, &cx); + const bool ok_y = compute_interp_window(py[point_id], Y, ny, 6, allow_reflect_y, &start_y, &cy); + const bool ok_z = compute_interp_window(pz[point_id], Z, nz, 6, allow_reflect_z, &start_z, &cz); + if (!ok_x || !ok_y || !ok_z) + { + atomicExch(error_flag, 1); + out[idx] = 0.0; + continue; + } + + double wx[6], wy[6], wz[6]; + lagrange_weights_ord6(cx, wx); + lagrange_weights_ord6(cy, wy); + lagrange_weights_ord6(cz, wz); + + double value = 0.0; + #pragma unroll + for (int kz = 0; kz < 6; ++kz) + { + double yz_sum = 0.0; + #pragma unroll + for (int jy = 0; jy < 6; ++jy) + { + double x_sum = 0.0; + #pragma unroll + for (int ix = 0; ix < 6; ++ix) + { + double sign = 1.0; + int sx = 0, sy = 0, sz = 0; + const bool ok_map = + map_interp_index(start_x + ix, nx, soa[0], &sx, &sign) && + map_interp_index(start_y + jy, ny, soa[1], &sy, &sign) && + map_interp_index(start_z + kz, nz, soa[2], &sz, &sign); + if (!ok_map) + { + atomicExch(error_flag, 1); + continue; + } + x_sum += wx[ix] * sign * field[index3(sx, sy, sz, nx, ny)]; + } + yz_sum += wy[jy] * x_sum; + } + value += wz[kz] * yz_sum; + } + out[idx] = value; + } +} + __global__ void rk4_kernel(int n, double dT, const double *f0, double *f1, @@ -771,6 +944,168 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny) return ok ? 0 : 1; } +int bssn_cuda_interp_points_batch(const int *ex, + const double *X, const double *Y, const double *Z, + const double *const *fields, + const double *soa_flat, + int num_var, + const double *px, const double *py, const double *pz, + int num_points, + int ordn, + int symmetry, + double *out) +{ + if (!ex || !X || !Y || !Z || !fields || !soa_flat || !px || !py || !pz || !out) + return 1; + if (num_var <= 0 || num_points <= 0 || ordn != 6) + return 1; + if (ex[0] < ordn || ex[1] < ordn || ex[2] < ordn) + return 1; + + struct InterpBatchCache + { + CachedBuffer X, Y, Z; + CachedBuffer px, py, pz; + CachedBuffer out; + CachedBuffer soa; + CachedBuffer field_ptrs; + CachedIntBuffer error_flag; + std::vector host_field_copies; + const double *host_X = nullptr; + const double *host_Y = nullptr; + const double *host_Z = nullptr; + int nx = 0; + int ny = 0; + int nz = 0; + }; + static thread_local InterpBatchCache cache; + + const int nx = ex[0]; + const int ny = ex[1]; + const int nz = ex[2]; + const int field_points = count_points(ex); + const size_t coord_bytes_x = static_cast(nx) * sizeof(double); + const size_t coord_bytes_y = static_cast(ny) * sizeof(double); + const size_t coord_bytes_z = static_cast(nz) * sizeof(double); + const size_t field_bytes = static_cast(field_points) * sizeof(double); + const size_t point_bytes = static_cast(num_points) * sizeof(double); + const size_t out_bytes = static_cast(num_points) * static_cast(num_var) * sizeof(double); + const size_t soa_bytes = static_cast(3 * num_var) * sizeof(double); + const size_t ptr_bytes = static_cast(num_var) * sizeof(double *); + + bool ok = true; + if (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z || + cache.nx != nx || cache.ny != ny || cache.nz != nz) + { + ok = copy_to_device(cache.X, X, coord_bytes_x) && + copy_to_device(cache.Y, Y, coord_bytes_y) && + copy_to_device(cache.Z, Z, coord_bytes_z); + if (ok) + { + cache.host_X = X; + cache.host_Y = Y; + cache.host_Z = Z; + cache.nx = nx; + cache.ny = ny; + cache.nz = nz; + } + } + + ok = ok && + copy_to_device(cache.px, px, point_bytes) && + copy_to_device(cache.py, py, point_bytes) && + copy_to_device(cache.pz, pz, point_bytes) && + copy_to_device(cache.soa, soa_flat, soa_bytes) && + ensure_capacity(cache.out, out_bytes) && + ensure_capacity(cache.field_ptrs, ptr_bytes) && + ensure_capacity(cache.error_flag, sizeof(int)); + if (!ok) + return 1; + + if (static_cast(cache.host_field_copies.size()) < num_var) + cache.host_field_copies.resize(num_var); + + std::vector device_fields(num_var); + for (int v = 0; v < num_var; ++v) + { + const double *device_field = bssn_gpu_find_device_buffer(fields[v]); + if (!device_field) + { + ok = copy_to_device(cache.host_field_copies[v], fields[v], field_bytes); + device_field = cache.host_field_copies[v].ptr; + } + device_fields[v] = device_field; + if (!ok || !device_fields[v]) + return 1; + } + + int zero = 0; + cudaError_t err = cudaMemcpy(cache.field_ptrs.ptr, device_fields.data(), ptr_bytes, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + report_cuda_error("cudaMemcpy(H2D) field_ptrs", err); + return 1; + } + err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + report_cuda_error("cudaMemcpy(H2D) interp_error_flag", err); + return 1; + } + + dim3 block(128); + dim3 grid(div_up(num_points * num_var, static_cast(block.x))); + if (grid.x > 4096) + grid.x = 4096; + + int nx_local = nx; + int ny_local = ny; + int nz_local = nz; + const double *dX = cache.X.ptr; + const double *dY = cache.Y.ptr; + const double *dZ = cache.Z.ptr; + const double *dpx = cache.px.ptr; + const double *dpy = cache.py.ptr; + const double *dpz = cache.pz.ptr; + const double *dsoa = cache.soa.ptr; + const double *const *dfields = reinterpret_cast(cache.field_ptrs.ptr); + double *dout = cache.out.ptr; + int *derror = cache.error_flag.ptr; + + void *args[] = { + &num_points, &num_var, + &nx_local, &ny_local, &nz_local, + &dX, &dY, &dZ, + &dfields, + &dsoa, + &dpx, &dpy, &dpz, + &symmetry, + &dout, + &derror}; + ok = launch_kernel(grid, block, (const void *)interp_points_ord6_kernel, args); + if (!ok) + return 1; + + int error_flag = 0; + err = cudaMemcpy(&error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + report_cuda_error("cudaMemcpy(D2H) interp_error_flag", err); + return 1; + } + if (error_flag != 0) + return 1; + + err = cudaMemcpy(out, cache.out.ptr, out_bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + report_cuda_error("cudaMemcpy(D2H) interp_out", err); + return 1; + } + + return 0; +} + int bssn_cuda_prolong3_pack(int wei, const double *llbc, const double *uubc, const int *extc, const double *func, const double *llbf, const double *uubf, const int *extf, double *funf, diff --git a/AMSS_NCKU_source/bssn_cuda_ops.h b/AMSS_NCKU_source/bssn_cuda_ops.h index 080ae95..4a11644 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.h +++ b/AMSS_NCKU_source/bssn_cuda_ops.h @@ -29,4 +29,15 @@ int bssn_cuda_prolong3_pack(int wei, const double *llbp, const double *uubp, const double *SoA, int symmetry); +int bssn_cuda_interp_points_batch(const int *ex, + const double *X, const double *Y, const double *Z, + const double *const *fields, + const double *soa_flat, + int num_var, + const double *px, const double *py, const double *pz, + int num_points, + int ordn, + int symmetry, + double *out); + #endif