Optimize BSSN CUDA state transfers
This commit is contained in:
@@ -6,6 +6,7 @@
|
|||||||
#include "parameters.h"
|
#include "parameters.h"
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#ifndef USE_CUDA_Z4C
|
#ifndef USE_CUDA_Z4C
|
||||||
#define USE_CUDA_Z4C 0
|
#define USE_CUDA_Z4C 0
|
||||||
@@ -391,6 +392,113 @@ bool cuda_device_state_count_supported(int state_count)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
bool cuda_flush_device_segment_batch(Block *block,
|
||||||
|
double *data,
|
||||||
|
int state_count,
|
||||||
|
const std::vector<int> &meta,
|
||||||
|
int dir)
|
||||||
|
{
|
||||||
|
if (!block || meta.empty())
|
||||||
|
return true;
|
||||||
|
const int segment_count = (int)(meta.size() / 8);
|
||||||
|
if (dir == PACK)
|
||||||
|
return bssn_cuda_pack_state_segments_to_device_buffer(
|
||||||
|
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
|
||||||
|
return bssn_cuda_unpack_state_segments_from_device_buffer(
|
||||||
|
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int cuda_data_packer_device_batched(double *data,
|
||||||
|
MyList<Parallel::gridseg> *src,
|
||||||
|
MyList<Parallel::gridseg> *dst,
|
||||||
|
int rank_in,
|
||||||
|
int dir,
|
||||||
|
MyList<var> *VarLists,
|
||||||
|
MyList<var> *VarListd,
|
||||||
|
int Symmetry)
|
||||||
|
{
|
||||||
|
(void)Symmetry;
|
||||||
|
if (!data || (dir != PACK && dir != UNPACK) || !src || !dst)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
int myrank;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
|
|
||||||
|
const int state_count = cuda_state_var_count(VarLists, VarListd);
|
||||||
|
if (!cuda_device_state_count_supported(state_count))
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
int size_out = 0;
|
||||||
|
Block *batch_block = 0;
|
||||||
|
std::vector<int> batch_meta;
|
||||||
|
batch_meta.reserve(64);
|
||||||
|
|
||||||
|
while (src && dst)
|
||||||
|
{
|
||||||
|
const bool active =
|
||||||
|
(dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
|
||||||
|
(dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank);
|
||||||
|
if (active)
|
||||||
|
{
|
||||||
|
int type;
|
||||||
|
if (src->data->Bg->lev == dst->data->Bg->lev)
|
||||||
|
type = 1;
|
||||||
|
else if (src->data->Bg->lev > dst->data->Bg->lev)
|
||||||
|
type = 2;
|
||||||
|
else
|
||||||
|
type = 3;
|
||||||
|
if (type != 1)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
Block *block = (dir == PACK) ? src->data->Bg : dst->data->Bg;
|
||||||
|
if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type)) ||
|
||||||
|
(dir == UNPACK && !cuda_can_direct_unpack(dst->data, type)))
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
if (batch_block && batch_block != block)
|
||||||
|
{
|
||||||
|
if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir))
|
||||||
|
return -1;
|
||||||
|
batch_meta.clear();
|
||||||
|
}
|
||||||
|
batch_block = block;
|
||||||
|
|
||||||
|
const int i0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 0)
|
||||||
|
: cuda_seg_begin(dst->data, block, 0);
|
||||||
|
const int j0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 1)
|
||||||
|
: cuda_seg_begin(dst->data, block, 1);
|
||||||
|
const int k0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 2)
|
||||||
|
: cuda_seg_begin(dst->data, block, 2);
|
||||||
|
const int sx = dst->data->shape[0];
|
||||||
|
const int sy = dst->data->shape[1];
|
||||||
|
const int sz = dst->data->shape[2];
|
||||||
|
const int region_all = sx * sy * sz;
|
||||||
|
|
||||||
|
batch_meta.push_back(i0);
|
||||||
|
batch_meta.push_back(j0);
|
||||||
|
batch_meta.push_back(k0);
|
||||||
|
batch_meta.push_back(sx);
|
||||||
|
batch_meta.push_back(sy);
|
||||||
|
batch_meta.push_back(sz);
|
||||||
|
batch_meta.push_back(region_all);
|
||||||
|
batch_meta.push_back(size_out);
|
||||||
|
|
||||||
|
size_out += state_count * region_all;
|
||||||
|
}
|
||||||
|
src = src->next;
|
||||||
|
dst = dst->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (batch_block)
|
||||||
|
{
|
||||||
|
if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir))
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
return size_out;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
bool cuda_segments_same_level(MyList<Parallel::gridseg> *src,
|
bool cuda_segments_same_level(MyList<Parallel::gridseg> *src,
|
||||||
MyList<Parallel::gridseg> *dst,
|
MyList<Parallel::gridseg> *dst,
|
||||||
int rank_in,
|
int rank_in,
|
||||||
@@ -465,6 +573,23 @@ int data_packer_with_device_buffer(double *data,
|
|||||||
MyList<var> *VarListd,
|
MyList<var> *VarListd,
|
||||||
int Symmetry)
|
int Symmetry)
|
||||||
{
|
{
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
const double batched_t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
|
const int batched = cuda_data_packer_device_batched(data, src, dst, rank_in, dir,
|
||||||
|
VarLists, VarListd, Symmetry);
|
||||||
|
if (batched >= 0)
|
||||||
|
{
|
||||||
|
if (sync_profile_enabled())
|
||||||
|
{
|
||||||
|
const double dt = MPI_Wtime() - batched_t0;
|
||||||
|
if (dir == PACK)
|
||||||
|
sync_profile_stats().direct_pack_sec += dt;
|
||||||
|
else if (dir == UNPACK)
|
||||||
|
sync_profile_stats().direct_unpack_sec += dt;
|
||||||
|
}
|
||||||
|
return batched;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
s_cuda_aware_pack_active = true;
|
s_cuda_aware_pack_active = true;
|
||||||
int n = Parallel::data_packer(data, src, dst, rank_in, dir, VarLists, VarListd, Symmetry);
|
int n = Parallel::data_packer(data, src, dst, rank_in, dir, VarLists, VarListd, Symmetry);
|
||||||
s_cuda_aware_pack_active = false;
|
s_cuda_aware_pack_active = false;
|
||||||
|
|||||||
@@ -74,6 +74,12 @@ struct CudaProfileStats {
|
|||||||
double bc_ms;
|
double bc_ms;
|
||||||
double finalize_ms;
|
double finalize_ms;
|
||||||
double output_ms;
|
double output_ms;
|
||||||
|
long long upload_calls;
|
||||||
|
long long resident_download_calls;
|
||||||
|
double upload_ms;
|
||||||
|
double resident_download_ms;
|
||||||
|
double upload_gb;
|
||||||
|
double resident_download_gb;
|
||||||
};
|
};
|
||||||
|
|
||||||
enum RhsStageId {
|
enum RhsStageId {
|
||||||
@@ -97,7 +103,10 @@ struct RhsStageProfileStats {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static CudaProfileStats &cuda_profile_stats() {
|
static CudaProfileStats &cuda_profile_stats() {
|
||||||
static CudaProfileStats stats = {0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
|
static CudaProfileStats stats = {
|
||||||
|
0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
|
||||||
|
0, 0, 0.0, 0.0, 0.0, 0.0
|
||||||
|
};
|
||||||
return stats;
|
return stats;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -162,7 +171,8 @@ static void cuda_profile_maybe_log() {
|
|||||||
CudaProfileStats &stats = cuda_profile_stats();
|
CudaProfileStats &stats = cuda_profile_stats();
|
||||||
if (stats.calls <= 0 || stats.calls % cuda_profile_every() != 0) return;
|
if (stats.calls <= 0 || stats.calls % cuda_profile_every() != 0) return;
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
"[AMSS-CUDA][rank %d][dev %d] calls=%lld avg_total=%.3f ms avg_state=%.3f ms avg_matter=%.3f ms avg_rhs=%.3f ms avg_bc=%.3f ms avg_finalize=%.3f ms avg_output=%.3f ms\n",
|
"[AMSS-CUDA][rank %d][dev %d] calls=%lld avg_total=%.3f ms avg_state=%.3f ms avg_matter=%.3f ms avg_rhs=%.3f ms avg_bc=%.3f ms avg_finalize=%.3f ms avg_output=%.3f ms"
|
||||||
|
" uploads=%lld avg_upload=%.3f ms upload_GB=%.3f resident_downloads=%lld avg_resident_download=%.3f ms resident_download_GB=%.3f\n",
|
||||||
g_dispatch.my_rank, g_dispatch.my_device, stats.calls,
|
g_dispatch.my_rank, g_dispatch.my_device, stats.calls,
|
||||||
stats.total_ms / (double)stats.calls,
|
stats.total_ms / (double)stats.calls,
|
||||||
stats.state_ms / (double)stats.calls,
|
stats.state_ms / (double)stats.calls,
|
||||||
@@ -170,7 +180,13 @@ static void cuda_profile_maybe_log() {
|
|||||||
stats.rhs_ms / (double)stats.calls,
|
stats.rhs_ms / (double)stats.calls,
|
||||||
stats.bc_ms / (double)stats.calls,
|
stats.bc_ms / (double)stats.calls,
|
||||||
stats.finalize_ms / (double)stats.calls,
|
stats.finalize_ms / (double)stats.calls,
|
||||||
stats.output_ms / (double)stats.calls);
|
stats.output_ms / (double)stats.calls,
|
||||||
|
stats.upload_calls,
|
||||||
|
stats.upload_calls ? stats.upload_ms / (double)stats.upload_calls : 0.0,
|
||||||
|
stats.upload_gb,
|
||||||
|
stats.resident_download_calls,
|
||||||
|
stats.resident_download_calls ? stats.resident_download_ms / (double)stats.resident_download_calls : 0.0,
|
||||||
|
stats.resident_download_gb);
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -542,6 +558,8 @@ struct StepAllocation {
|
|||||||
|
|
||||||
static std::unordered_map<void *, StepContext> g_step_ctx;
|
static std::unordered_map<void *, StepContext> g_step_ctx;
|
||||||
static std::vector<StepAllocation> g_step_pool;
|
static std::vector<StepAllocation> g_step_pool;
|
||||||
|
static int *g_comm_segment_meta = nullptr;
|
||||||
|
static size_t g_comm_segment_meta_cap = 0;
|
||||||
|
|
||||||
static StepAllocation empty_step_allocation()
|
static StepAllocation empty_step_allocation()
|
||||||
{
|
{
|
||||||
@@ -760,6 +778,20 @@ static double *ensure_step_host_comm_buffer(StepContext &ctx, size_t needed_doub
|
|||||||
return ctx.h_comm_mem;
|
return ctx.h_comm_mem;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int *ensure_comm_segment_meta_buffer(size_t needed_ints)
|
||||||
|
{
|
||||||
|
if (needed_ints == 0) return nullptr;
|
||||||
|
if (g_comm_segment_meta_cap < needed_ints) {
|
||||||
|
if (g_comm_segment_meta) {
|
||||||
|
CUDA_CHECK(cudaFree(g_comm_segment_meta));
|
||||||
|
g_comm_segment_meta = nullptr;
|
||||||
|
}
|
||||||
|
CUDA_CHECK(cudaMalloc(&g_comm_segment_meta, needed_ints * sizeof(int)));
|
||||||
|
g_comm_segment_meta_cap = needed_ints;
|
||||||
|
}
|
||||||
|
return g_comm_segment_meta;
|
||||||
|
}
|
||||||
|
|
||||||
static void upload_grid_params_if_needed(const GridParams &gp)
|
static void upload_grid_params_if_needed(const GridParams &gp)
|
||||||
{
|
{
|
||||||
if (!g_gp_host_cache_valid ||
|
if (!g_gp_host_cache_valid ||
|
||||||
@@ -4716,18 +4748,25 @@ static void compute_patch_boundary_flags(int *ex,
|
|||||||
static void upload_state_inputs(double **state_host, size_t all)
|
static void upload_state_inputs(double **state_host, size_t all)
|
||||||
{
|
{
|
||||||
const size_t bytes = all * sizeof(double);
|
const size_t bytes = all * sizeof(double);
|
||||||
|
const bool profile = cuda_profile_enabled();
|
||||||
|
const double t0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||||
static int direct_upload = -1;
|
static int direct_upload = -1;
|
||||||
if (direct_upload < 0) {
|
if (direct_upload < 0) {
|
||||||
const char *env = getenv("AMSS_CUDA_DIRECT_STATE_UPLOAD");
|
const char *env = getenv("AMSS_CUDA_DIRECT_STATE_UPLOAD");
|
||||||
const char *pin_env = getenv("AMSS_CUDA_PIN_GRIDFUNCS");
|
direct_upload = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
|
||||||
direct_upload = env ? ((atoi(env) != 0) ? 1 : 0)
|
|
||||||
: ((pin_env && atoi(pin_env) != 0) ? 1 : 0);
|
|
||||||
}
|
}
|
||||||
if (direct_upload) {
|
if (direct_upload) {
|
||||||
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_state_input_slots[i]], state_host[i],
|
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_state_input_slots[i]], state_host[i],
|
||||||
bytes, cudaMemcpyHostToDevice));
|
bytes, cudaMemcpyHostToDevice));
|
||||||
}
|
}
|
||||||
|
if (profile) {
|
||||||
|
cuda_profile_sync();
|
||||||
|
CudaProfileStats &stats = cuda_profile_stats();
|
||||||
|
stats.upload_calls++;
|
||||||
|
stats.upload_ms += cuda_profile_now_ms() - t0;
|
||||||
|
stats.upload_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9;
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
@@ -4736,6 +4775,12 @@ static void upload_state_inputs(double **state_host, size_t all)
|
|||||||
CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
|
CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
|
||||||
(size_t)BSSN_STATE_COUNT * bytes,
|
(size_t)BSSN_STATE_COUNT * bytes,
|
||||||
cudaMemcpyHostToDevice));
|
cudaMemcpyHostToDevice));
|
||||||
|
if (profile) {
|
||||||
|
CudaProfileStats &stats = cuda_profile_stats();
|
||||||
|
stats.upload_calls++;
|
||||||
|
stats.upload_ms += cuda_profile_now_ms() - t0;
|
||||||
|
stats.upload_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void upload_matter_cache(StepContext &ctx,
|
static void upload_matter_cache(StepContext &ctx,
|
||||||
@@ -5168,6 +5213,62 @@ __global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__ void kern_pack_state_segments_batch(const double * __restrict__ src_mem,
|
||||||
|
double * __restrict__ dst,
|
||||||
|
int nx, int ny,
|
||||||
|
const int * __restrict__ meta,
|
||||||
|
int state_count,
|
||||||
|
int all)
|
||||||
|
{
|
||||||
|
const int segment = blockIdx.z;
|
||||||
|
const int state_index = blockIdx.y;
|
||||||
|
const int *m = meta + segment * 8;
|
||||||
|
const int i0 = m[0], j0 = m[1], k0 = m[2];
|
||||||
|
const int sx = m[3], sy = m[4];
|
||||||
|
const int region_all = m[6];
|
||||||
|
const int offset = m[7];
|
||||||
|
if (state_index >= state_count) return;
|
||||||
|
for (int local = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
local < region_all;
|
||||||
|
local += blockDim.x * gridDim.x)
|
||||||
|
{
|
||||||
|
const int ii = local % sx;
|
||||||
|
const int jj = (local / sx) % sy;
|
||||||
|
const int kk = local / (sx * sy);
|
||||||
|
const int src = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
|
||||||
|
dst[(size_t)offset + (size_t)state_index * region_all + local] =
|
||||||
|
src_mem[(size_t)state_index * all + src];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void kern_unpack_state_segments_batch(double * __restrict__ dst_mem,
|
||||||
|
const double * __restrict__ src,
|
||||||
|
int nx, int ny,
|
||||||
|
const int * __restrict__ meta,
|
||||||
|
int state_count,
|
||||||
|
int all)
|
||||||
|
{
|
||||||
|
const int segment = blockIdx.z;
|
||||||
|
const int state_index = blockIdx.y;
|
||||||
|
const int *m = meta + segment * 8;
|
||||||
|
const int i0 = m[0], j0 = m[1], k0 = m[2];
|
||||||
|
const int sx = m[3], sy = m[4];
|
||||||
|
const int region_all = m[6];
|
||||||
|
const int offset = m[7];
|
||||||
|
if (state_index >= state_count) return;
|
||||||
|
for (int local = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
local < region_all;
|
||||||
|
local += blockDim.x * gridDim.x)
|
||||||
|
{
|
||||||
|
const int ii = local % sx;
|
||||||
|
const int jj = (local / sx) % sy;
|
||||||
|
const int kk = local / (sx * sy);
|
||||||
|
const int dst = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
|
||||||
|
dst_mem[(size_t)state_index * all + dst] =
|
||||||
|
src[(size_t)offset + (size_t)state_index * region_all + local];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
__global__ void kern_pack_state_subset(const double * __restrict__ src_mem,
|
__global__ void kern_pack_state_subset(const double * __restrict__ src_mem,
|
||||||
double * __restrict__ dst,
|
double * __restrict__ dst,
|
||||||
int subset_count,
|
int subset_count,
|
||||||
@@ -5308,12 +5409,12 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
|
|||||||
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||||
const size_t bytes = all * sizeof(double);
|
const size_t bytes = all * sizeof(double);
|
||||||
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
||||||
|
const bool profile = cuda_profile_enabled();
|
||||||
|
const double t0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||||
static int direct_download = -1;
|
static int direct_download = -1;
|
||||||
if (direct_download < 0) {
|
if (direct_download < 0) {
|
||||||
const char *env = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD");
|
const char *env = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD");
|
||||||
const char *pin_env = getenv("AMSS_CUDA_PIN_GRIDFUNCS");
|
direct_download = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
|
||||||
direct_download = env ? ((atoi(env) != 0) ? 1 : 0)
|
|
||||||
: ((pin_env && atoi(pin_env) != 0) ? 1 : 0);
|
|
||||||
}
|
}
|
||||||
if (direct_download) {
|
if (direct_download) {
|
||||||
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
@@ -5321,6 +5422,12 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
|
|||||||
bytes, cudaMemcpyDeviceToHost));
|
bytes, cudaMemcpyDeviceToHost));
|
||||||
}
|
}
|
||||||
CUDA_CHECK(cudaDeviceSynchronize());
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
|
if (profile) {
|
||||||
|
CudaProfileStats &stats = cuda_profile_stats();
|
||||||
|
stats.resident_download_calls++;
|
||||||
|
stats.resident_download_ms += cuda_profile_now_ms() - t0;
|
||||||
|
stats.resident_download_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9;
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem,
|
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem,
|
||||||
@@ -5329,6 +5436,12 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
|
|||||||
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
|
std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
|
||||||
}
|
}
|
||||||
|
if (profile) {
|
||||||
|
CudaProfileStats &stats = cuda_profile_stats();
|
||||||
|
stats.resident_download_calls++;
|
||||||
|
stats.resident_download_ms += cuda_profile_now_ms() - t0;
|
||||||
|
stats.resident_download_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void copy_state_subset(void *block_tag,
|
static void copy_state_subset(void *block_tag,
|
||||||
@@ -6060,6 +6173,47 @@ static void copy_state_device_batch(void *block_tag,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void copy_state_device_segments(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
const int *ex,
|
||||||
|
int segment_count,
|
||||||
|
const int *segment_meta,
|
||||||
|
int pack_not_unpack)
|
||||||
|
{
|
||||||
|
if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
|
||||||
|
if (segment_count <= 0 || !segment_meta) return;
|
||||||
|
|
||||||
|
int max_region_all = 0;
|
||||||
|
for (int s = 0; s < segment_count; ++s) {
|
||||||
|
const int *m = segment_meta + s * 8;
|
||||||
|
if (m[3] <= 0 || m[4] <= 0 || m[5] <= 0 || m[6] <= 0) return;
|
||||||
|
if (m[6] > max_region_all) max_region_all = m[6];
|
||||||
|
}
|
||||||
|
if (max_region_all <= 0) return;
|
||||||
|
|
||||||
|
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
|
||||||
|
int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8);
|
||||||
|
CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
|
||||||
|
(size_t)segment_count * 8 * sizeof(int),
|
||||||
|
cudaMemcpyHostToDevice));
|
||||||
|
|
||||||
|
dim3 launch_grid((unsigned int)grid((size_t)max_region_all),
|
||||||
|
(unsigned int)state_count,
|
||||||
|
(unsigned int)segment_count);
|
||||||
|
if (pack_not_unpack) {
|
||||||
|
kern_pack_state_segments_batch<<<launch_grid, BLK>>>(
|
||||||
|
ctx.d_state_curr_mem, device_buffer,
|
||||||
|
ex[0], ex[1], d_meta, state_count,
|
||||||
|
ex[0] * ex[1] * ex[2]);
|
||||||
|
} else {
|
||||||
|
kern_unpack_state_segments_batch<<<launch_grid, BLK>>>(
|
||||||
|
ctx.d_state_curr_mem, device_buffer,
|
||||||
|
ex[0], ex[1], d_meta, state_count,
|
||||||
|
ex[0] * ex[1] * ex[2]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag,
|
int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag,
|
||||||
int state_count,
|
int state_count,
|
||||||
@@ -6090,6 +6244,36 @@ int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int segment_count,
|
||||||
|
const int *segment_meta)
|
||||||
|
{
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
copy_state_device_segments(block_tag, state_count, device_buffer, ex,
|
||||||
|
segment_count, segment_meta, 1);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int segment_count,
|
||||||
|
const int *segment_meta)
|
||||||
|
{
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
copy_state_device_segments(block_tag, state_count, device_buffer, ex,
|
||||||
|
segment_count, segment_meta, 0);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
int bssn_cuda_download_state_subset(void *block_tag,
|
int bssn_cuda_download_state_subset(void *block_tag,
|
||||||
int *ex,
|
int *ex,
|
||||||
|
|||||||
@@ -118,6 +118,20 @@ int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
|
|||||||
int i0, int j0, int k0,
|
int i0, int j0, int k0,
|
||||||
int sx, int sy, int sz);
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int segment_count,
|
||||||
|
const int *segment_meta);
|
||||||
|
|
||||||
|
int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int segment_count,
|
||||||
|
const int *segment_meta);
|
||||||
|
|
||||||
int bssn_cuda_download_state_subset(void *block_tag,
|
int bssn_cuda_download_state_subset(void *block_tag,
|
||||||
int *ex,
|
int *ex,
|
||||||
int subset_count,
|
int subset_count,
|
||||||
|
|||||||
Reference in New Issue
Block a user