Add direct CUDA resident-state sync path and profiling hooks
This commit is contained in:
@@ -4,6 +4,197 @@
|
|||||||
#include "prolongrestrict.h"
|
#include "prolongrestrict.h"
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
#include "parameters.h"
|
#include "parameters.h"
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cstdio>
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
#include <cuda_runtime_api.h>
|
||||||
|
#include "bssn_rhs_cuda.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
struct SyncProfileStats
|
||||||
|
{
|
||||||
|
long long start_calls;
|
||||||
|
long long finish_calls;
|
||||||
|
double start_sec;
|
||||||
|
double finish_sec;
|
||||||
|
double direct_pack_sec;
|
||||||
|
double direct_unpack_sec;
|
||||||
|
double wait_sec;
|
||||||
|
};
|
||||||
|
|
||||||
|
SyncProfileStats &sync_profile_stats()
|
||||||
|
{
|
||||||
|
static SyncProfileStats stats = {0, 0, 0.0, 0.0, 0.0, 0.0, 0.0};
|
||||||
|
return stats;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool sync_profile_enabled()
|
||||||
|
{
|
||||||
|
static int enabled = -1;
|
||||||
|
if (enabled < 0)
|
||||||
|
{
|
||||||
|
const char *env = getenv("AMSS_PROFILE_SYNC");
|
||||||
|
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||||
|
}
|
||||||
|
return enabled != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int sync_profile_every()
|
||||||
|
{
|
||||||
|
static int every = -1;
|
||||||
|
if (every < 0)
|
||||||
|
{
|
||||||
|
const char *env = getenv("AMSS_PROFILE_SYNC_EVERY");
|
||||||
|
every = (env && atoi(env) > 0) ? atoi(env) : 100;
|
||||||
|
}
|
||||||
|
return every;
|
||||||
|
}
|
||||||
|
|
||||||
|
void sync_profile_maybe_log()
|
||||||
|
{
|
||||||
|
if (!sync_profile_enabled())
|
||||||
|
return;
|
||||||
|
SyncProfileStats &stats = sync_profile_stats();
|
||||||
|
if (stats.finish_calls <= 0 || stats.finish_calls % sync_profile_every() != 0)
|
||||||
|
return;
|
||||||
|
int rank = 0;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||||
|
fprintf(stderr,
|
||||||
|
"[AMSS-SYNC][rank %d] start=%lld finish=%lld avg_start=%.6f s avg_finish=%.6f s avg_wait=%.6f s avg_cuda_pack=%.6f s avg_cuda_unpack=%.6f s\n",
|
||||||
|
rank,
|
||||||
|
stats.start_calls,
|
||||||
|
stats.finish_calls,
|
||||||
|
stats.start_calls ? stats.start_sec / (double)stats.start_calls : 0.0,
|
||||||
|
stats.finish_calls ? stats.finish_sec / (double)stats.finish_calls : 0.0,
|
||||||
|
stats.finish_calls ? stats.wait_sec / (double)stats.finish_calls : 0.0,
|
||||||
|
stats.finish_calls ? stats.direct_pack_sec / (double)stats.finish_calls : 0.0,
|
||||||
|
stats.finish_calls ? stats.direct_unpack_sec / (double)stats.finish_calls : 0.0);
|
||||||
|
fflush(stderr);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool cuda_sync_pinned_enabled()
|
||||||
|
{
|
||||||
|
static int enabled = -1;
|
||||||
|
if (enabled < 0)
|
||||||
|
{
|
||||||
|
const char *env = getenv("AMSS_CUDA_PINNED_SYNC");
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
enabled = (!env || atoi(env) != 0) ? 1 : 0;
|
||||||
|
#else
|
||||||
|
enabled = 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
return enabled != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_comm_buffer(double *&ptr, unsigned char &is_pinned)
|
||||||
|
{
|
||||||
|
if (!ptr)
|
||||||
|
return;
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
if (is_pinned)
|
||||||
|
cudaFreeHost(ptr);
|
||||||
|
else
|
||||||
|
delete[] ptr;
|
||||||
|
#else
|
||||||
|
delete[] ptr;
|
||||||
|
#endif
|
||||||
|
ptr = 0;
|
||||||
|
is_pinned = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double *alloc_comm_buffer(int length, unsigned char &is_pinned)
|
||||||
|
{
|
||||||
|
is_pinned = 0;
|
||||||
|
if (length <= 0)
|
||||||
|
return 0;
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
if (cuda_sync_pinned_enabled())
|
||||||
|
{
|
||||||
|
double *ptr = 0;
|
||||||
|
cudaError_t err = cudaMallocHost((void **)&ptr, (size_t)length * sizeof(double));
|
||||||
|
if (err == cudaSuccess)
|
||||||
|
{
|
||||||
|
is_pinned = 1;
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return new double[length];
|
||||||
|
}
|
||||||
|
|
||||||
|
void ensure_comm_buffer(double **buffers, unsigned char *pinned_flags, int *caps, int idx, int length)
|
||||||
|
{
|
||||||
|
if (length <= caps[idx])
|
||||||
|
return;
|
||||||
|
free_comm_buffer(buffers[idx], pinned_flags[idx]);
|
||||||
|
buffers[idx] = alloc_comm_buffer(length, pinned_flags[idx]);
|
||||||
|
if (!buffers[idx])
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Parallel: failed to allocate communication buffer (%d doubles)\n", length);
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
caps[idx] = length;
|
||||||
|
}
|
||||||
|
|
||||||
|
int cuda_seg_begin(const Parallel::gridseg *seg, Block *bg, int dir)
|
||||||
|
{
|
||||||
|
const double dx = bg->getdX(dir);
|
||||||
|
return (int)floor((seg->llb[dir] - bg->bbox[dir]) / dx + 0.5);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type)
|
||||||
|
{
|
||||||
|
if (type != 1 || !src || !dst || !src->Bg)
|
||||||
|
return false;
|
||||||
|
return bssn_cuda_has_resident_state(src->Bg) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type)
|
||||||
|
{
|
||||||
|
if (type != 1 || !dst || !dst->Bg)
|
||||||
|
return false;
|
||||||
|
return bssn_cuda_has_resident_state(dst->Bg) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool cuda_direct_pack_segment(double *buffer,
|
||||||
|
const Parallel::gridseg *src,
|
||||||
|
const Parallel::gridseg *dst,
|
||||||
|
int state_index)
|
||||||
|
{
|
||||||
|
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
|
const int i0 = cuda_seg_begin(dst, src->Bg, 0);
|
||||||
|
const int j0 = cuda_seg_begin(dst, src->Bg, 1);
|
||||||
|
const int k0 = cuda_seg_begin(dst, src->Bg, 2);
|
||||||
|
const bool ok = bssn_cuda_pack_state_region_to_host_buffer(src->Bg, state_index, buffer, src->Bg->shape,
|
||||||
|
i0, j0, k0,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||||
|
if (sync_profile_enabled())
|
||||||
|
sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
|
||||||
|
return ok;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool cuda_direct_unpack_segment(double *buffer,
|
||||||
|
const Parallel::gridseg *dst,
|
||||||
|
int state_index)
|
||||||
|
{
|
||||||
|
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
|
const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
|
||||||
|
const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
|
||||||
|
const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
|
||||||
|
const bool ok = bssn_cuda_unpack_state_region_from_host_buffer(dst->Bg, state_index, buffer, dst->Bg->shape,
|
||||||
|
i0, j0, k0,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||||
|
if (sync_profile_enabled())
|
||||||
|
sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
|
||||||
|
return ok;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion
|
int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion
|
||||||
{
|
{
|
||||||
@@ -3761,10 +3952,34 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
|||||||
{
|
{
|
||||||
varls = VarLists;
|
varls = VarLists;
|
||||||
varld = VarListd;
|
varld = VarListd;
|
||||||
|
int state_idx = 0;
|
||||||
while (varls && varld)
|
while (varls && varld)
|
||||||
{
|
{
|
||||||
if (data)
|
if (data)
|
||||||
{
|
{
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
bool handled_by_cuda = false;
|
||||||
|
if (dir == PACK && cuda_can_direct_pack(src->data, dst->data, type))
|
||||||
|
{
|
||||||
|
handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_idx);
|
||||||
|
if (!handled_by_cuda)
|
||||||
|
{
|
||||||
|
cout << "Parallel::data_packer: CUDA direct pack failed." << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (dir == UNPACK && cuda_can_direct_unpack(dst->data, type))
|
||||||
|
{
|
||||||
|
handled_by_cuda = cuda_direct_unpack_segment(data + size_out, dst->data, state_idx);
|
||||||
|
if (!handled_by_cuda)
|
||||||
|
{
|
||||||
|
cout << "Parallel::data_packer: CUDA direct unpack failed." << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!handled_by_cuda)
|
||||||
|
{
|
||||||
|
#endif
|
||||||
if (dir == PACK)
|
if (dir == PACK)
|
||||||
switch (type)
|
switch (type)
|
||||||
{
|
{
|
||||||
@@ -3788,10 +4003,14 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
|||||||
f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
|
f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
|
||||||
dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
|
dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
|
||||||
dst->data->llb, dst->data->uub);
|
dst->data->llb, dst->data->uub);
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
|
size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
|
||||||
varls = varls->next;
|
varls = varls->next;
|
||||||
varld = varld->next;
|
varld = varld->next;
|
||||||
|
++state_idx;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dst = dst->next;
|
dst = dst->next;
|
||||||
@@ -4319,7 +4538,8 @@ void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
|
|||||||
Parallel::SyncCache::SyncCache()
|
Parallel::SyncCache::SyncCache()
|
||||||
: valid(false), cpusize(0), combined_src(0), combined_dst(0),
|
: valid(false), cpusize(0), combined_src(0), combined_dst(0),
|
||||||
send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
|
send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
|
||||||
send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
|
send_buf_caps(0), recv_buf_caps(0), send_buf_pinned(0), recv_buf_pinned(0),
|
||||||
|
reqs(0), stats(0), max_reqs(0),
|
||||||
lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0)
|
lengths_valid(false), tc_req_node(0), tc_req_is_recv(0), tc_completed(0)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
@@ -4352,11 +4572,27 @@ void Parallel::SyncCache::destroy()
|
|||||||
if (recv_buf_caps) delete[] recv_buf_caps;
|
if (recv_buf_caps) delete[] recv_buf_caps;
|
||||||
for (int i = 0; i < cpusize; i++)
|
for (int i = 0; i < cpusize; i++)
|
||||||
{
|
{
|
||||||
if (send_bufs && send_bufs[i]) delete[] send_bufs[i];
|
if (send_bufs && send_bufs[i])
|
||||||
if (recv_bufs && recv_bufs[i]) delete[] recv_bufs[i];
|
{
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
free_comm_buffer(send_bufs[i], send_buf_pinned[i]);
|
||||||
|
#else
|
||||||
|
delete[] send_bufs[i];
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
if (recv_bufs && recv_bufs[i])
|
||||||
|
{
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
free_comm_buffer(recv_bufs[i], recv_buf_pinned[i]);
|
||||||
|
#else
|
||||||
|
delete[] recv_bufs[i];
|
||||||
|
#endif
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (send_bufs) delete[] send_bufs;
|
if (send_bufs) delete[] send_bufs;
|
||||||
if (recv_bufs) delete[] recv_bufs;
|
if (recv_bufs) delete[] recv_bufs;
|
||||||
|
if (send_buf_pinned) delete[] send_buf_pinned;
|
||||||
|
if (recv_buf_pinned) delete[] recv_buf_pinned;
|
||||||
if (reqs) delete[] reqs;
|
if (reqs) delete[] reqs;
|
||||||
if (stats) delete[] stats;
|
if (stats) delete[] stats;
|
||||||
if (tc_req_node) delete[] tc_req_node;
|
if (tc_req_node) delete[] tc_req_node;
|
||||||
@@ -4366,6 +4602,7 @@ void Parallel::SyncCache::destroy()
|
|||||||
send_lengths = recv_lengths = 0;
|
send_lengths = recv_lengths = 0;
|
||||||
send_buf_caps = recv_buf_caps = 0;
|
send_buf_caps = recv_buf_caps = 0;
|
||||||
send_bufs = recv_bufs = 0;
|
send_bufs = recv_bufs = 0;
|
||||||
|
send_buf_pinned = recv_buf_pinned = 0;
|
||||||
reqs = 0; stats = 0;
|
reqs = 0; stats = 0;
|
||||||
tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0;
|
tc_req_node = 0; tc_req_is_recv = 0; tc_completed = 0;
|
||||||
cpusize = 0; max_reqs = 0;
|
cpusize = 0; max_reqs = 0;
|
||||||
@@ -4396,12 +4633,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
|||||||
cache.recv_lengths[node] = rlength;
|
cache.recv_lengths[node] = rlength;
|
||||||
if (rlength > 0)
|
if (rlength > 0)
|
||||||
{
|
{
|
||||||
if (rlength > cache.recv_buf_caps[node])
|
ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength);
|
||||||
{
|
|
||||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
|
||||||
cache.recv_bufs[node] = new double[rlength];
|
|
||||||
cache.recv_buf_caps[node] = rlength;
|
|
||||||
}
|
|
||||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||||
req_node[req_no] = node;
|
req_node[req_no] = node;
|
||||||
req_is_recv[req_no] = 1;
|
req_is_recv[req_no] = 1;
|
||||||
@@ -4415,12 +4647,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
|||||||
cache.recv_lengths[myrank] = self_len;
|
cache.recv_lengths[myrank] = self_len;
|
||||||
if (self_len > 0)
|
if (self_len > 0)
|
||||||
{
|
{
|
||||||
if (self_len > cache.recv_buf_caps[myrank])
|
ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, myrank, self_len);
|
||||||
{
|
|
||||||
if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank];
|
|
||||||
cache.recv_bufs[myrank] = new double[self_len];
|
|
||||||
cache.recv_buf_caps[myrank] = self_len;
|
|
||||||
}
|
|
||||||
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4433,12 +4660,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
|||||||
cache.send_lengths[node] = slength;
|
cache.send_lengths[node] = slength;
|
||||||
if (slength > 0)
|
if (slength > 0)
|
||||||
{
|
{
|
||||||
if (slength > cache.send_buf_caps[node])
|
ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength);
|
||||||
{
|
|
||||||
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
|
|
||||||
cache.send_bufs[node] = new double[slength];
|
|
||||||
cache.send_buf_caps[node] = slength;
|
|
||||||
}
|
|
||||||
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||||
req_node[req_no] = node;
|
req_node[req_no] = node;
|
||||||
@@ -4471,82 +4693,57 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
|||||||
if (self_len > 0)
|
if (self_len > 0)
|
||||||
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||||
}
|
}
|
||||||
void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
|
void Parallel::Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache)
|
||||||
{
|
{
|
||||||
if (!cache.valid)
|
if (cache.valid)
|
||||||
|
return;
|
||||||
|
|
||||||
|
int cpusize;
|
||||||
|
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
|
||||||
|
cache.cpusize = cpusize;
|
||||||
|
|
||||||
|
if (!cache.combined_src)
|
||||||
{
|
{
|
||||||
int cpusize;
|
cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
|
cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
|
||||||
cache.cpusize = cpusize;
|
cache.send_lengths = new int[cpusize];
|
||||||
|
cache.recv_lengths = new int[cpusize];
|
||||||
// Allocate cache arrays if needed
|
cache.send_bufs = new double *[cpusize];
|
||||||
if (!cache.combined_src)
|
cache.recv_bufs = new double *[cpusize];
|
||||||
|
cache.send_buf_caps = new int[cpusize];
|
||||||
|
cache.recv_buf_caps = new int[cpusize];
|
||||||
|
cache.send_buf_pinned = new unsigned char[cpusize];
|
||||||
|
cache.recv_buf_pinned = new unsigned char[cpusize];
|
||||||
|
for (int i = 0; i < cpusize; i++)
|
||||||
{
|
{
|
||||||
cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
|
cache.send_bufs[i] = cache.recv_bufs[i] = 0;
|
||||||
cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
|
cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
|
||||||
cache.send_lengths = new int[cpusize];
|
cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0;
|
||||||
cache.recv_lengths = new int[cpusize];
|
|
||||||
cache.send_bufs = new double *[cpusize];
|
|
||||||
cache.recv_bufs = new double *[cpusize];
|
|
||||||
cache.send_buf_caps = new int[cpusize];
|
|
||||||
cache.recv_buf_caps = new int[cpusize];
|
|
||||||
for (int i = 0; i < cpusize; i++)
|
|
||||||
{
|
|
||||||
cache.send_bufs[i] = cache.recv_bufs[i] = 0;
|
|
||||||
cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
|
|
||||||
}
|
|
||||||
cache.max_reqs = 2 * cpusize;
|
|
||||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
|
||||||
cache.stats = new MPI_Status[cache.max_reqs];
|
|
||||||
cache.tc_req_node = new int[cache.max_reqs];
|
|
||||||
cache.tc_req_is_recv = new int[cache.max_reqs];
|
|
||||||
cache.tc_completed = new int[cache.max_reqs];
|
|
||||||
}
|
}
|
||||||
|
cache.max_reqs = 2 * cpusize;
|
||||||
|
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||||
|
cache.stats = new MPI_Status[cache.max_reqs];
|
||||||
|
cache.tc_req_node = new int[cache.max_reqs];
|
||||||
|
cache.tc_req_is_recv = new int[cache.max_reqs];
|
||||||
|
cache.tc_completed = new int[cache.max_reqs];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int node = 0; node < cpusize; node++)
|
||||||
|
{
|
||||||
|
cache.combined_src[node] = cache.combined_dst[node] = 0;
|
||||||
|
cache.send_lengths[node] = cache.recv_lengths[node] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
MyList<Patch> *Pp = PatL;
|
||||||
|
while (Pp)
|
||||||
|
{
|
||||||
|
Patch *Pat = Pp->data;
|
||||||
|
MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
|
||||||
for (int node = 0; node < cpusize; node++)
|
for (int node = 0; node < cpusize; node++)
|
||||||
{
|
{
|
||||||
cache.combined_src[node] = cache.combined_dst[node] = 0;
|
MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
|
||||||
cache.send_lengths[node] = cache.recv_lengths[node] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Build intra-patch segments (same as Sync_merged Phase A)
|
|
||||||
MyList<Patch> *Pp = PatL;
|
|
||||||
while (Pp)
|
|
||||||
{
|
|
||||||
Patch *Pat = Pp->data;
|
|
||||||
MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
|
|
||||||
for (int node = 0; node < cpusize; node++)
|
|
||||||
{
|
|
||||||
MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
|
|
||||||
MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
|
|
||||||
build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
|
|
||||||
if (tsrc)
|
|
||||||
{
|
|
||||||
if (cache.combined_src[node])
|
|
||||||
cache.combined_src[node]->catList(tsrc);
|
|
||||||
else
|
|
||||||
cache.combined_src[node] = tsrc;
|
|
||||||
}
|
|
||||||
if (tdst)
|
|
||||||
{
|
|
||||||
if (cache.combined_dst[node])
|
|
||||||
cache.combined_dst[node]->catList(tdst);
|
|
||||||
else
|
|
||||||
cache.combined_dst[node] = tdst;
|
|
||||||
}
|
|
||||||
if (src_owned) src_owned->destroyList();
|
|
||||||
}
|
|
||||||
if (dst_ghost) dst_ghost->destroyList();
|
|
||||||
Pp = Pp->next;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Build inter-patch segments (same as Sync_merged Phase B)
|
|
||||||
MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
|
|
||||||
for (int node = 0; node < cpusize; node++)
|
|
||||||
{
|
|
||||||
MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
|
|
||||||
MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
|
MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
|
||||||
build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
|
build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
|
||||||
if (tsrc)
|
if (tsrc)
|
||||||
{
|
{
|
||||||
if (cache.combined_src[node])
|
if (cache.combined_src[node])
|
||||||
@@ -4563,11 +4760,40 @@ void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
|
|||||||
}
|
}
|
||||||
if (src_owned) src_owned->destroyList();
|
if (src_owned) src_owned->destroyList();
|
||||||
}
|
}
|
||||||
if (dst_buffer) dst_buffer->destroyList();
|
if (dst_ghost) dst_ghost->destroyList();
|
||||||
|
Pp = Pp->next;
|
||||||
cache.valid = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
|
||||||
|
for (int node = 0; node < cpusize; node++)
|
||||||
|
{
|
||||||
|
MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
|
||||||
|
MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
|
||||||
|
build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
|
||||||
|
if (tsrc)
|
||||||
|
{
|
||||||
|
if (cache.combined_src[node])
|
||||||
|
cache.combined_src[node]->catList(tsrc);
|
||||||
|
else
|
||||||
|
cache.combined_src[node] = tsrc;
|
||||||
|
}
|
||||||
|
if (tdst)
|
||||||
|
{
|
||||||
|
if (cache.combined_dst[node])
|
||||||
|
cache.combined_dst[node]->catList(tdst);
|
||||||
|
else
|
||||||
|
cache.combined_dst[node] = tdst;
|
||||||
|
}
|
||||||
|
if (src_owned) src_owned->destroyList();
|
||||||
|
}
|
||||||
|
if (dst_buffer) dst_buffer->destroyList();
|
||||||
|
|
||||||
|
cache.valid = true;
|
||||||
|
}
|
||||||
|
void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
|
||||||
|
{
|
||||||
|
Sync_ensure_cache(PatL, Symmetry, cache);
|
||||||
|
|
||||||
// Use cached lists with buffer-reusing transfer
|
// Use cached lists with buffer-reusing transfer
|
||||||
transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache);
|
transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache);
|
||||||
}
|
}
|
||||||
@@ -4575,98 +4801,8 @@ void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
|
|||||||
void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
|
void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
|
||||||
SyncCache &cache, AsyncSyncState &state)
|
SyncCache &cache, AsyncSyncState &state)
|
||||||
{
|
{
|
||||||
// Ensure cache is built
|
const double t_start = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
if (!cache.valid)
|
Sync_ensure_cache(PatL, Symmetry, cache);
|
||||||
{
|
|
||||||
// Build cache (same logic as Sync_cached)
|
|
||||||
int cpusize;
|
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
|
|
||||||
cache.cpusize = cpusize;
|
|
||||||
|
|
||||||
if (!cache.combined_src)
|
|
||||||
{
|
|
||||||
cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
|
|
||||||
cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
|
|
||||||
cache.send_lengths = new int[cpusize];
|
|
||||||
cache.recv_lengths = new int[cpusize];
|
|
||||||
cache.send_bufs = new double *[cpusize];
|
|
||||||
cache.recv_bufs = new double *[cpusize];
|
|
||||||
cache.send_buf_caps = new int[cpusize];
|
|
||||||
cache.recv_buf_caps = new int[cpusize];
|
|
||||||
for (int i = 0; i < cpusize; i++)
|
|
||||||
{
|
|
||||||
cache.send_bufs[i] = cache.recv_bufs[i] = 0;
|
|
||||||
cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
|
|
||||||
}
|
|
||||||
cache.max_reqs = 2 * cpusize;
|
|
||||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
|
||||||
cache.stats = new MPI_Status[cache.max_reqs];
|
|
||||||
cache.tc_req_node = new int[cache.max_reqs];
|
|
||||||
cache.tc_req_is_recv = new int[cache.max_reqs];
|
|
||||||
cache.tc_completed = new int[cache.max_reqs];
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int node = 0; node < cpusize; node++)
|
|
||||||
{
|
|
||||||
cache.combined_src[node] = cache.combined_dst[node] = 0;
|
|
||||||
cache.send_lengths[node] = cache.recv_lengths[node] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
MyList<Patch> *Pp = PatL;
|
|
||||||
while (Pp)
|
|
||||||
{
|
|
||||||
Patch *Pat = Pp->data;
|
|
||||||
MyList<Parallel::gridseg> *dst_ghost = build_ghost_gsl(Pat);
|
|
||||||
for (int node = 0; node < cpusize; node++)
|
|
||||||
{
|
|
||||||
MyList<Parallel::gridseg> *src_owned = build_owned_gsl0(Pat, node);
|
|
||||||
MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
|
|
||||||
build_gstl(src_owned, dst_ghost, &tsrc, &tdst);
|
|
||||||
if (tsrc)
|
|
||||||
{
|
|
||||||
if (cache.combined_src[node])
|
|
||||||
cache.combined_src[node]->catList(tsrc);
|
|
||||||
else
|
|
||||||
cache.combined_src[node] = tsrc;
|
|
||||||
}
|
|
||||||
if (tdst)
|
|
||||||
{
|
|
||||||
if (cache.combined_dst[node])
|
|
||||||
cache.combined_dst[node]->catList(tdst);
|
|
||||||
else
|
|
||||||
cache.combined_dst[node] = tdst;
|
|
||||||
}
|
|
||||||
if (src_owned) src_owned->destroyList();
|
|
||||||
}
|
|
||||||
if (dst_ghost) dst_ghost->destroyList();
|
|
||||||
Pp = Pp->next;
|
|
||||||
}
|
|
||||||
|
|
||||||
MyList<Parallel::gridseg> *dst_buffer = build_buffer_gsl(PatL);
|
|
||||||
for (int node = 0; node < cpusize; node++)
|
|
||||||
{
|
|
||||||
MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatL, node, 5, Symmetry);
|
|
||||||
MyList<Parallel::gridseg> *tsrc = 0, *tdst = 0;
|
|
||||||
build_gstl(src_owned, dst_buffer, &tsrc, &tdst);
|
|
||||||
if (tsrc)
|
|
||||||
{
|
|
||||||
if (cache.combined_src[node])
|
|
||||||
cache.combined_src[node]->catList(tsrc);
|
|
||||||
else
|
|
||||||
cache.combined_src[node] = tsrc;
|
|
||||||
}
|
|
||||||
if (tdst)
|
|
||||||
{
|
|
||||||
if (cache.combined_dst[node])
|
|
||||||
cache.combined_dst[node]->catList(tdst);
|
|
||||||
else
|
|
||||||
cache.combined_dst[node] = tdst;
|
|
||||||
}
|
|
||||||
if (src_owned) src_owned->destroyList();
|
|
||||||
}
|
|
||||||
if (dst_buffer) dst_buffer->destroyList();
|
|
||||||
cache.valid = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now pack and post async MPI operations
|
// Now pack and post async MPI operations
|
||||||
int myrank;
|
int myrank;
|
||||||
@@ -4683,6 +4819,27 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
|||||||
MyList<Parallel::gridseg> **src = cache.combined_src;
|
MyList<Parallel::gridseg> **src = cache.combined_src;
|
||||||
MyList<Parallel::gridseg> **dst = cache.combined_dst;
|
MyList<Parallel::gridseg> **dst = cache.combined_dst;
|
||||||
|
|
||||||
|
for (int node = 0; node < cpusize; node++)
|
||||||
|
{
|
||||||
|
if (node == myrank)
|
||||||
|
continue;
|
||||||
|
int rlength;
|
||||||
|
if (!cache.lengths_valid) {
|
||||||
|
rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
|
||||||
|
cache.recv_lengths[node] = rlength;
|
||||||
|
} else {
|
||||||
|
rlength = cache.recv_lengths[node];
|
||||||
|
}
|
||||||
|
if (rlength > 0)
|
||||||
|
{
|
||||||
|
ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength);
|
||||||
|
state.req_node[state.req_no] = node;
|
||||||
|
state.req_is_recv[state.req_no] = 1;
|
||||||
|
state.pending_recv++;
|
||||||
|
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (int node = 0; node < cpusize; node++)
|
for (int node = 0; node < cpusize; node++)
|
||||||
{
|
{
|
||||||
if (node == myrank)
|
if (node == myrank)
|
||||||
@@ -4696,12 +4853,7 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
|||||||
}
|
}
|
||||||
if (length > 0)
|
if (length > 0)
|
||||||
{
|
{
|
||||||
if (length > cache.recv_buf_caps[node])
|
ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, length);
|
||||||
{
|
|
||||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
|
||||||
cache.recv_bufs[node] = new double[length];
|
|
||||||
cache.recv_buf_caps[node] = length;
|
|
||||||
}
|
|
||||||
data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
|
data_packer(cache.recv_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -4716,40 +4868,21 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
|||||||
}
|
}
|
||||||
if (slength > 0)
|
if (slength > 0)
|
||||||
{
|
{
|
||||||
if (slength > cache.send_buf_caps[node])
|
ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength);
|
||||||
{
|
|
||||||
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
|
|
||||||
cache.send_bufs[node] = new double[slength];
|
|
||||||
cache.send_buf_caps[node] = slength;
|
|
||||||
}
|
|
||||||
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
|
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
|
||||||
state.req_node[state.req_no] = node;
|
state.req_node[state.req_no] = node;
|
||||||
state.req_is_recv[state.req_no] = 0;
|
state.req_is_recv[state.req_no] = 0;
|
||||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
||||||
}
|
}
|
||||||
int rlength;
|
|
||||||
if (!cache.lengths_valid) {
|
|
||||||
rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
|
|
||||||
cache.recv_lengths[node] = rlength;
|
|
||||||
} else {
|
|
||||||
rlength = cache.recv_lengths[node];
|
|
||||||
}
|
|
||||||
if (rlength > 0)
|
|
||||||
{
|
|
||||||
if (rlength > cache.recv_buf_caps[node])
|
|
||||||
{
|
|
||||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
|
||||||
cache.recv_bufs[node] = new double[rlength];
|
|
||||||
cache.recv_buf_caps[node] = rlength;
|
|
||||||
}
|
|
||||||
state.req_node[state.req_no] = node;
|
|
||||||
state.req_is_recv[state.req_no] = 1;
|
|
||||||
state.pending_recv++;
|
|
||||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cache.lengths_valid = true;
|
cache.lengths_valid = true;
|
||||||
|
if (sync_profile_enabled())
|
||||||
|
{
|
||||||
|
SyncProfileStats &stats = sync_profile_stats();
|
||||||
|
stats.start_calls++;
|
||||||
|
stats.start_sec += MPI_Wtime() - t_start;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Sync_finish: progressive unpack as receives complete, then wait for sends
|
// Sync_finish: progressive unpack as receives complete, then wait for sends
|
||||||
void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
|
void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
|
||||||
@@ -4762,6 +4895,8 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
|
|||||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
MyList<Parallel::gridseg> **src = cache.combined_src;
|
MyList<Parallel::gridseg> **src = cache.combined_src;
|
||||||
MyList<Parallel::gridseg> **dst = cache.combined_dst;
|
MyList<Parallel::gridseg> **dst = cache.combined_dst;
|
||||||
|
const double t_finish = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
|
double wait_sec = 0.0;
|
||||||
|
|
||||||
// Unpack local data first (no MPI needed)
|
// Unpack local data first (no MPI needed)
|
||||||
if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0)
|
if (cache.recv_bufs[myrank] && cache.recv_lengths[myrank] > 0)
|
||||||
@@ -4771,15 +4906,17 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
|
|||||||
if (state.pending_recv > 0 && state.req_no > 0)
|
if (state.pending_recv > 0 && state.req_no > 0)
|
||||||
{
|
{
|
||||||
int pending = state.pending_recv;
|
int pending = state.pending_recv;
|
||||||
int *completed = new int[cache.max_reqs];
|
|
||||||
while (pending > 0)
|
while (pending > 0)
|
||||||
{
|
{
|
||||||
int outcount = 0;
|
int outcount = 0;
|
||||||
MPI_Waitsome(state.req_no, cache.reqs, &outcount, completed, cache.stats);
|
const double t_wait = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
|
MPI_Waitsome(state.req_no, cache.reqs, &outcount, cache.tc_completed, cache.stats);
|
||||||
|
if (sync_profile_enabled())
|
||||||
|
wait_sec += MPI_Wtime() - t_wait;
|
||||||
if (outcount == MPI_UNDEFINED) break;
|
if (outcount == MPI_UNDEFINED) break;
|
||||||
for (int i = 0; i < outcount; i++)
|
for (int i = 0; i < outcount; i++)
|
||||||
{
|
{
|
||||||
int idx = completed[i];
|
int idx = cache.tc_completed[i];
|
||||||
if (idx >= 0 && state.req_is_recv[idx])
|
if (idx >= 0 && state.req_is_recv[idx])
|
||||||
{
|
{
|
||||||
int recv_node = state.req_node[idx];
|
int recv_node = state.req_node[idx];
|
||||||
@@ -4788,15 +4925,28 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
delete[] completed;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for remaining sends
|
// Wait for remaining sends
|
||||||
if (state.req_no > 0) MPI_Waitall(state.req_no, cache.reqs, cache.stats);
|
if (state.req_no > 0)
|
||||||
|
{
|
||||||
|
const double t_wait = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
|
MPI_Waitall(state.req_no, cache.reqs, cache.stats);
|
||||||
|
if (sync_profile_enabled())
|
||||||
|
wait_sec += MPI_Wtime() - t_wait;
|
||||||
|
}
|
||||||
|
|
||||||
delete[] state.req_node; state.req_node = 0;
|
delete[] state.req_node; state.req_node = 0;
|
||||||
delete[] state.req_is_recv; state.req_is_recv = 0;
|
delete[] state.req_is_recv; state.req_is_recv = 0;
|
||||||
state.active = false;
|
state.active = false;
|
||||||
|
if (sync_profile_enabled())
|
||||||
|
{
|
||||||
|
SyncProfileStats &stats = sync_profile_stats();
|
||||||
|
stats.finish_calls++;
|
||||||
|
stats.finish_sec += MPI_Wtime() - t_finish;
|
||||||
|
stats.wait_sec += wait_sec;
|
||||||
|
sync_profile_maybe_log();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// collect buffer grid segments or blocks for the periodic boundary condition of given patch
|
// collect buffer grid segments or blocks for the periodic boundary condition of given patch
|
||||||
// ---------------------------------------------------
|
// ---------------------------------------------------
|
||||||
@@ -5854,10 +6004,13 @@ void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
|||||||
cache.recv_bufs = new double *[cpusize];
|
cache.recv_bufs = new double *[cpusize];
|
||||||
cache.send_buf_caps = new int[cpusize];
|
cache.send_buf_caps = new int[cpusize];
|
||||||
cache.recv_buf_caps = new int[cpusize];
|
cache.recv_buf_caps = new int[cpusize];
|
||||||
|
cache.send_buf_pinned = new unsigned char[cpusize];
|
||||||
|
cache.recv_buf_pinned = new unsigned char[cpusize];
|
||||||
for (int i = 0; i < cpusize; i++)
|
for (int i = 0; i < cpusize; i++)
|
||||||
{
|
{
|
||||||
cache.send_bufs[i] = cache.recv_bufs[i] = 0;
|
cache.send_bufs[i] = cache.recv_bufs[i] = 0;
|
||||||
cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
|
cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
|
||||||
|
cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0;
|
||||||
}
|
}
|
||||||
cache.max_reqs = 2 * cpusize;
|
cache.max_reqs = 2 * cpusize;
|
||||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||||
@@ -5903,10 +6056,13 @@ void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
|||||||
cache.recv_bufs = new double *[cpusize];
|
cache.recv_bufs = new double *[cpusize];
|
||||||
cache.send_buf_caps = new int[cpusize];
|
cache.send_buf_caps = new int[cpusize];
|
||||||
cache.recv_buf_caps = new int[cpusize];
|
cache.recv_buf_caps = new int[cpusize];
|
||||||
|
cache.send_buf_pinned = new unsigned char[cpusize];
|
||||||
|
cache.recv_buf_pinned = new unsigned char[cpusize];
|
||||||
for (int i = 0; i < cpusize; i++)
|
for (int i = 0; i < cpusize; i++)
|
||||||
{
|
{
|
||||||
cache.send_bufs[i] = cache.recv_bufs[i] = 0;
|
cache.send_bufs[i] = cache.recv_bufs[i] = 0;
|
||||||
cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
|
cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
|
||||||
|
cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0;
|
||||||
}
|
}
|
||||||
cache.max_reqs = 2 * cpusize;
|
cache.max_reqs = 2 * cpusize;
|
||||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||||
@@ -5952,10 +6108,13 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
|||||||
cache.recv_bufs = new double *[cpusize];
|
cache.recv_bufs = new double *[cpusize];
|
||||||
cache.send_buf_caps = new int[cpusize];
|
cache.send_buf_caps = new int[cpusize];
|
||||||
cache.recv_buf_caps = new int[cpusize];
|
cache.recv_buf_caps = new int[cpusize];
|
||||||
|
cache.send_buf_pinned = new unsigned char[cpusize];
|
||||||
|
cache.recv_buf_pinned = new unsigned char[cpusize];
|
||||||
for (int i = 0; i < cpusize; i++)
|
for (int i = 0; i < cpusize; i++)
|
||||||
{
|
{
|
||||||
cache.send_bufs[i] = cache.recv_bufs[i] = 0;
|
cache.send_bufs[i] = cache.recv_bufs[i] = 0;
|
||||||
cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
|
cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
|
||||||
|
cache.send_buf_pinned[i] = cache.recv_buf_pinned[i] = 0;
|
||||||
}
|
}
|
||||||
cache.max_reqs = 2 * cpusize;
|
cache.max_reqs = 2 * cpusize;
|
||||||
cache.reqs = new MPI_Request[cache.max_reqs];
|
cache.reqs = new MPI_Request[cache.max_reqs];
|
||||||
@@ -5998,12 +6157,7 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
|||||||
cache.recv_lengths[node] = rlength;
|
cache.recv_lengths[node] = rlength;
|
||||||
if (rlength > 0)
|
if (rlength > 0)
|
||||||
{
|
{
|
||||||
if (rlength > cache.recv_buf_caps[node])
|
ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, node, rlength);
|
||||||
{
|
|
||||||
if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
|
|
||||||
cache.recv_bufs[node] = new double[rlength];
|
|
||||||
cache.recv_buf_caps[node] = rlength;
|
|
||||||
}
|
|
||||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||||
req_node[req_no] = node;
|
req_node[req_no] = node;
|
||||||
req_is_recv[req_no] = 1;
|
req_is_recv[req_no] = 1;
|
||||||
@@ -6017,12 +6171,7 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
|||||||
cache.recv_lengths[myrank] = self_len;
|
cache.recv_lengths[myrank] = self_len;
|
||||||
if (self_len > 0)
|
if (self_len > 0)
|
||||||
{
|
{
|
||||||
if (self_len > cache.recv_buf_caps[myrank])
|
ensure_comm_buffer(cache.recv_bufs, cache.recv_buf_pinned, cache.recv_buf_caps, myrank, self_len);
|
||||||
{
|
|
||||||
if (cache.recv_bufs[myrank]) delete[] cache.recv_bufs[myrank];
|
|
||||||
cache.recv_bufs[myrank] = new double[self_len];
|
|
||||||
cache.recv_buf_caps[myrank] = self_len;
|
|
||||||
}
|
|
||||||
data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -6035,12 +6184,7 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
|||||||
cache.send_lengths[node] = slength;
|
cache.send_lengths[node] = slength;
|
||||||
if (slength > 0)
|
if (slength > 0)
|
||||||
{
|
{
|
||||||
if (slength > cache.send_buf_caps[node])
|
ensure_comm_buffer(cache.send_bufs, cache.send_buf_pinned, cache.send_buf_caps, node, slength);
|
||||||
{
|
|
||||||
if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
|
|
||||||
cache.send_bufs[node] = new double[slength];
|
|
||||||
cache.send_buf_caps[node] = slength;
|
|
||||||
}
|
|
||||||
data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||||
req_node[req_no] = node;
|
req_node[req_no] = node;
|
||||||
|
|||||||
@@ -104,6 +104,8 @@ namespace Parallel
|
|||||||
double **recv_bufs;
|
double **recv_bufs;
|
||||||
int *send_buf_caps;
|
int *send_buf_caps;
|
||||||
int *recv_buf_caps;
|
int *recv_buf_caps;
|
||||||
|
unsigned char *send_buf_pinned;
|
||||||
|
unsigned char *recv_buf_pinned;
|
||||||
MPI_Request *reqs;
|
MPI_Request *reqs;
|
||||||
MPI_Status *stats;
|
MPI_Status *stats;
|
||||||
int max_reqs;
|
int max_reqs;
|
||||||
@@ -117,6 +119,7 @@ namespace Parallel
|
|||||||
};
|
};
|
||||||
|
|
||||||
void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
|
void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
|
||||||
|
void Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache);
|
||||||
void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
|
void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
|
||||||
MyList<var> *VarList1, MyList<var> *VarList2,
|
MyList<var> *VarList1, MyList<var> *VarList2,
|
||||||
int Symmetry, SyncCache &cache);
|
int Symmetry, SyncCache &cache);
|
||||||
|
|||||||
@@ -76,6 +76,48 @@ bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
|
|||||||
return idx == BSSN_CUDA_STATE_COUNT && vars == 0;
|
return idx == BSSN_CUDA_STATE_COUNT && vars == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool bssn_cuda_use_resident_sync(int lev)
|
||||||
|
{
|
||||||
|
#ifdef WithShell
|
||||||
|
(void)lev;
|
||||||
|
return false;
|
||||||
|
#else
|
||||||
|
return lev == 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank)
|
||||||
|
{
|
||||||
|
MyList<Patch> *Pp = PatL;
|
||||||
|
while (Pp)
|
||||||
|
{
|
||||||
|
MyList<Block> *BP = Pp->data->blb;
|
||||||
|
while (BP)
|
||||||
|
{
|
||||||
|
Block *cg = BP->data;
|
||||||
|
if (myrank == cg->rank)
|
||||||
|
{
|
||||||
|
double *state_out[BSSN_CUDA_STATE_COUNT];
|
||||||
|
if (!fill_bssn_cuda_views(cg, vars, state_out))
|
||||||
|
{
|
||||||
|
cout << "CUDA BSSN state list mismatch on resident state download" << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
if (bssn_cuda_download_resident_state(cg, cg->shape, state_out))
|
||||||
|
{
|
||||||
|
cout << "CUDA resident state download failed" << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
bssn_cuda_release_step_ctx(cg);
|
||||||
|
}
|
||||||
|
if (BP == Pp->data->ble)
|
||||||
|
break;
|
||||||
|
BP = BP->next;
|
||||||
|
}
|
||||||
|
Pp = Pp->next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -3063,6 +3105,11 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
setpbh(BH_num, Porg0, Mass, BH_num_input);
|
setpbh(BH_num, Porg0, Mass, BH_num_input);
|
||||||
|
|
||||||
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
const bool use_cuda_resident_sync = bssn_cuda_use_resident_sync(lev);
|
||||||
|
#else
|
||||||
|
const bool use_cuda_resident_sync = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
// new code 2013-2-15, zjcao
|
// new code 2013-2-15, zjcao
|
||||||
#if (MAPBH == 1)
|
#if (MAPBH == 1)
|
||||||
@@ -3129,14 +3176,16 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
if (myrank == cg->rank)
|
if (myrank == cg->rank)
|
||||||
{
|
{
|
||||||
#if (AGM == 0)
|
#if (AGM == 0)
|
||||||
f_enforce_ga(cg->shape,
|
if (!use_cuda_resident_sync)
|
||||||
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
f_enforce_ga(cg->shape,
|
||||||
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
|
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
||||||
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
|
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
|
||||||
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
|
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
|
||||||
|
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool used_gpu_substep = false;
|
bool used_gpu_substep = false;
|
||||||
|
bool used_gpu_resident_state = false;
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
{
|
{
|
||||||
double *state_in[BSSN_CUDA_STATE_COUNT];
|
double *state_in[BSSN_CUDA_STATE_COUNT];
|
||||||
@@ -3154,6 +3203,11 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
}
|
}
|
||||||
int apply_bam_bc = 0;
|
int apply_bam_bc = 0;
|
||||||
|
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
||||||
|
int apply_enforce_ga = 0;
|
||||||
|
#if (AGM == 0)
|
||||||
|
apply_enforce_ga = 1;
|
||||||
|
#endif
|
||||||
#if (SommerType == 0)
|
#if (SommerType == 0)
|
||||||
#ifndef WithShell
|
#ifndef WithShell
|
||||||
apply_bam_bc = (lev == 0) ? 1 : 0;
|
apply_bam_bc = (lev == 0) ? 1 : 0;
|
||||||
@@ -3164,7 +3218,8 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
state_in, state_out, matter,
|
state_in, state_out, matter,
|
||||||
propspeed, soa_flat, Pp->data->bbox,
|
propspeed, soa_flat, Pp->data->bbox,
|
||||||
dT_lev, TRK4, iter_count, apply_bam_bc,
|
dT_lev, TRK4, iter_count, apply_bam_bc,
|
||||||
Symmetry, lev, ndeps, pre))
|
Symmetry, lev, ndeps, pre,
|
||||||
|
keep_resident_state, apply_enforce_ga, chitiny))
|
||||||
{
|
{
|
||||||
cout << "CUDA predictor substep failed in domain: ("
|
cout << "CUDA predictor substep failed in domain: ("
|
||||||
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
|
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||||
@@ -3173,6 +3228,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
ERROR = 1;
|
ERROR = 1;
|
||||||
}
|
}
|
||||||
used_gpu_substep = true;
|
used_gpu_substep = true;
|
||||||
|
used_gpu_resident_state = (keep_resident_state != 0);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
if (!used_gpu_substep)
|
if (!used_gpu_substep)
|
||||||
@@ -3277,7 +3333,8 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
varlrhs = varlrhs->next;
|
varlrhs = varlrhs->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
|
if (!used_gpu_resident_state)
|
||||||
|
f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
|
||||||
}
|
}
|
||||||
if (BP == Pp->data->ble)
|
if (BP == Pp->data->ble)
|
||||||
break;
|
break;
|
||||||
@@ -3531,13 +3588,14 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
if (myrank == cg->rank)
|
if (myrank == cg->rank)
|
||||||
{
|
{
|
||||||
#if (AGM == 0)
|
#if (AGM == 0)
|
||||||
f_enforce_ga(cg->shape,
|
if (!use_cuda_resident_sync)
|
||||||
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
f_enforce_ga(cg->shape,
|
||||||
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
|
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
||||||
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
|
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
|
||||||
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
|
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
|
||||||
|
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
|
||||||
#elif (AGM == 1)
|
#elif (AGM == 1)
|
||||||
if (iter_count == 3)
|
if (iter_count == 3 && !use_cuda_resident_sync)
|
||||||
f_enforce_ga(cg->shape,
|
f_enforce_ga(cg->shape,
|
||||||
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
||||||
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
|
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
|
||||||
@@ -3546,6 +3604,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
bool used_gpu_substep = false;
|
bool used_gpu_substep = false;
|
||||||
|
bool used_gpu_resident_state = false;
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
{
|
{
|
||||||
double *state_in[BSSN_CUDA_STATE_COUNT];
|
double *state_in[BSSN_CUDA_STATE_COUNT];
|
||||||
@@ -3563,6 +3622,13 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
}
|
}
|
||||||
int apply_bam_bc = 0;
|
int apply_bam_bc = 0;
|
||||||
|
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
||||||
|
int apply_enforce_ga = 0;
|
||||||
|
#if (AGM == 0)
|
||||||
|
apply_enforce_ga = 1;
|
||||||
|
#elif (AGM == 1)
|
||||||
|
apply_enforce_ga = (iter_count == 3) ? 1 : 0;
|
||||||
|
#endif
|
||||||
#if (SommerType == 0)
|
#if (SommerType == 0)
|
||||||
#ifndef WithShell
|
#ifndef WithShell
|
||||||
apply_bam_bc = (lev == 0) ? 1 : 0;
|
apply_bam_bc = (lev == 0) ? 1 : 0;
|
||||||
@@ -3573,7 +3639,8 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
state_in, state_out, matter,
|
state_in, state_out, matter,
|
||||||
propspeed, soa_flat, Pp->data->bbox,
|
propspeed, soa_flat, Pp->data->bbox,
|
||||||
dT_lev, TRK4, iter_count, apply_bam_bc,
|
dT_lev, TRK4, iter_count, apply_bam_bc,
|
||||||
Symmetry, lev, ndeps, cor))
|
Symmetry, lev, ndeps, cor,
|
||||||
|
keep_resident_state, apply_enforce_ga, chitiny))
|
||||||
{
|
{
|
||||||
cout << "CUDA corrector substep failed in domain: ("
|
cout << "CUDA corrector substep failed in domain: ("
|
||||||
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
|
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||||
@@ -3582,6 +3649,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
ERROR = 1;
|
ERROR = 1;
|
||||||
}
|
}
|
||||||
used_gpu_substep = true;
|
used_gpu_substep = true;
|
||||||
|
used_gpu_resident_state = (keep_resident_state != 0);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
if (!used_gpu_substep)
|
if (!used_gpu_substep)
|
||||||
@@ -3686,7 +3754,8 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
varlrhs = varlrhs->next;
|
varlrhs = varlrhs->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
|
if (!used_gpu_resident_state)
|
||||||
|
f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
|
||||||
}
|
}
|
||||||
if (BP == Pp->data->ble)
|
if (BP == Pp->data->ble)
|
||||||
break;
|
break;
|
||||||
@@ -3969,6 +4038,10 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#if USE_CUDA_BSSN
|
||||||
|
if (use_cuda_resident_sync)
|
||||||
|
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank);
|
||||||
|
#endif
|
||||||
#if (RPS == 0)
|
#if (RPS == 0)
|
||||||
// mesh refinement boundary part
|
// mesh refinement boundary part
|
||||||
RestrictProlong(lev, YN, BB);
|
RestrictProlong(lev, YN, BB);
|
||||||
|
|||||||
@@ -6,6 +6,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <array>
|
#include <array>
|
||||||
|
#include <chrono>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@@ -63,6 +64,72 @@ static void init_gpu_dispatch() {
|
|||||||
g_dispatch.inited = true;
|
g_dispatch.inited = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct CudaProfileStats {
|
||||||
|
long long calls;
|
||||||
|
double total_ms;
|
||||||
|
double state_ms;
|
||||||
|
double matter_ms;
|
||||||
|
double rhs_ms;
|
||||||
|
double bc_ms;
|
||||||
|
double finalize_ms;
|
||||||
|
double output_ms;
|
||||||
|
};
|
||||||
|
|
||||||
|
static CudaProfileStats &cuda_profile_stats() {
|
||||||
|
static CudaProfileStats stats = {0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
|
||||||
|
return stats;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool cuda_profile_enabled() {
|
||||||
|
static int enabled = -1;
|
||||||
|
if (enabled < 0) {
|
||||||
|
const char *env = getenv("AMSS_PROFILE_CUDA");
|
||||||
|
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||||
|
}
|
||||||
|
return enabled != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int cuda_profile_every() {
|
||||||
|
static int every = -1;
|
||||||
|
if (every < 0) {
|
||||||
|
const char *env = getenv("AMSS_PROFILE_CUDA_EVERY");
|
||||||
|
every = (env && atoi(env) > 0) ? atoi(env) : 100;
|
||||||
|
}
|
||||||
|
return every;
|
||||||
|
}
|
||||||
|
|
||||||
|
static double cuda_profile_now_ms() {
|
||||||
|
using clock = std::chrono::steady_clock;
|
||||||
|
return std::chrono::duration<double, std::milli>(
|
||||||
|
clock::now().time_since_epoch()).count();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void cuda_profile_sync() {
|
||||||
|
cudaError_t err = cudaDeviceSynchronize();
|
||||||
|
if (err != cudaSuccess) {
|
||||||
|
fprintf(stderr, "CUDA error %s:%d: %s\n",
|
||||||
|
__FILE__, __LINE__, cudaGetErrorString(err));
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void cuda_profile_maybe_log() {
|
||||||
|
if (!cuda_profile_enabled()) return;
|
||||||
|
CudaProfileStats &stats = cuda_profile_stats();
|
||||||
|
if (stats.calls <= 0 || stats.calls % cuda_profile_every() != 0) return;
|
||||||
|
fprintf(stderr,
|
||||||
|
"[AMSS-CUDA][rank %d][dev %d] calls=%lld avg_total=%.3f ms avg_state=%.3f ms avg_matter=%.3f ms avg_rhs=%.3f ms avg_bc=%.3f ms avg_finalize=%.3f ms avg_output=%.3f ms\n",
|
||||||
|
g_dispatch.my_rank, g_dispatch.my_device, stats.calls,
|
||||||
|
stats.total_ms / (double)stats.calls,
|
||||||
|
stats.state_ms / (double)stats.calls,
|
||||||
|
stats.matter_ms / (double)stats.calls,
|
||||||
|
stats.rhs_ms / (double)stats.calls,
|
||||||
|
stats.bc_ms / (double)stats.calls,
|
||||||
|
stats.finalize_ms / (double)stats.calls,
|
||||||
|
stats.output_ms / (double)stats.calls);
|
||||||
|
fflush(stderr);
|
||||||
|
}
|
||||||
|
|
||||||
/* ------------------------------------------------------------------ */
|
/* ------------------------------------------------------------------ */
|
||||||
/* Error checking */
|
/* Error checking */
|
||||||
/* ------------------------------------------------------------------ */
|
/* ------------------------------------------------------------------ */
|
||||||
@@ -248,12 +315,17 @@ static const int k_matter_slots[BSSN_MATTER_COUNT] = {
|
|||||||
struct StepContext {
|
struct StepContext {
|
||||||
double *d_state0_mem;
|
double *d_state0_mem;
|
||||||
double *d_accum_mem;
|
double *d_accum_mem;
|
||||||
|
double *d_state_curr_mem;
|
||||||
|
double *d_state_next_mem;
|
||||||
double *d_matter_mem;
|
double *d_matter_mem;
|
||||||
std::array<double *, BSSN_STATE_COUNT> d_state0;
|
std::array<double *, BSSN_STATE_COUNT> d_state0;
|
||||||
std::array<double *, BSSN_STATE_COUNT> d_accum;
|
std::array<double *, BSSN_STATE_COUNT> d_accum;
|
||||||
|
std::array<double *, BSSN_STATE_COUNT> d_state_curr;
|
||||||
|
std::array<double *, BSSN_STATE_COUNT> d_state_next;
|
||||||
std::array<double *, BSSN_MATTER_COUNT> d_matter;
|
std::array<double *, BSSN_MATTER_COUNT> d_matter;
|
||||||
size_t cap_all;
|
size_t cap_all;
|
||||||
bool matter_ready;
|
bool matter_ready;
|
||||||
|
bool state_ready;
|
||||||
};
|
};
|
||||||
|
|
||||||
static std::unordered_map<void *, StepContext> g_step_ctx;
|
static std::unordered_map<void *, StepContext> g_step_ctx;
|
||||||
@@ -321,19 +393,32 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
|
|||||||
cudaFree(ctx.d_accum_mem);
|
cudaFree(ctx.d_accum_mem);
|
||||||
ctx.d_accum_mem = nullptr;
|
ctx.d_accum_mem = nullptr;
|
||||||
}
|
}
|
||||||
|
if (ctx.d_state_curr_mem) {
|
||||||
|
cudaFree(ctx.d_state_curr_mem);
|
||||||
|
ctx.d_state_curr_mem = nullptr;
|
||||||
|
}
|
||||||
|
if (ctx.d_state_next_mem) {
|
||||||
|
cudaFree(ctx.d_state_next_mem);
|
||||||
|
ctx.d_state_next_mem = nullptr;
|
||||||
|
}
|
||||||
if (ctx.d_matter_mem) {
|
if (ctx.d_matter_mem) {
|
||||||
cudaFree(ctx.d_matter_mem);
|
cudaFree(ctx.d_matter_mem);
|
||||||
ctx.d_matter_mem = nullptr;
|
ctx.d_matter_mem = nullptr;
|
||||||
}
|
}
|
||||||
CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
|
CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
|
||||||
CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
|
CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
|
||||||
|
CUDA_CHECK(cudaMalloc(&ctx.d_state_curr_mem, BSSN_STATE_COUNT * all * sizeof(double)));
|
||||||
|
CUDA_CHECK(cudaMalloc(&ctx.d_state_next_mem, BSSN_STATE_COUNT * all * sizeof(double)));
|
||||||
CUDA_CHECK(cudaMalloc(&ctx.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
|
CUDA_CHECK(cudaMalloc(&ctx.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
|
||||||
ctx.cap_all = all;
|
ctx.cap_all = all;
|
||||||
ctx.matter_ready = false;
|
ctx.matter_ready = false;
|
||||||
|
ctx.state_ready = false;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
|
ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
|
||||||
ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
|
ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
|
||||||
|
ctx.d_state_curr[i] = ctx.d_state_curr_mem + (size_t)i * all;
|
||||||
|
ctx.d_state_next[i] = ctx.d_state_next_mem + (size_t)i * all;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
|
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
|
||||||
ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all;
|
ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all;
|
||||||
@@ -347,6 +432,8 @@ static void release_step_ctx(void *block_tag)
|
|||||||
if (it == g_step_ctx.end()) return;
|
if (it == g_step_ctx.end()) return;
|
||||||
if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem);
|
if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem);
|
||||||
if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem);
|
if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem);
|
||||||
|
if (it->second.d_state_curr_mem) cudaFree(it->second.d_state_curr_mem);
|
||||||
|
if (it->second.d_state_next_mem) cudaFree(it->second.d_state_next_mem);
|
||||||
if (it->second.d_matter_mem) cudaFree(it->second.d_matter_mem);
|
if (it->second.d_matter_mem) cudaFree(it->second.d_matter_mem);
|
||||||
g_step_ctx.erase(it);
|
g_step_ctx.erase(it);
|
||||||
}
|
}
|
||||||
@@ -1050,6 +1137,76 @@ __global__ void kern_rk4_finalize(const double * __restrict__ f0,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
|
||||||
|
double * __restrict__ gxy,
|
||||||
|
double * __restrict__ gxz,
|
||||||
|
double * __restrict__ dyy,
|
||||||
|
double * __restrict__ gyz,
|
||||||
|
double * __restrict__ dzz,
|
||||||
|
double * __restrict__ Axx,
|
||||||
|
double * __restrict__ Axy,
|
||||||
|
double * __restrict__ Axz,
|
||||||
|
double * __restrict__ Ayy,
|
||||||
|
double * __restrict__ Ayz,
|
||||||
|
double * __restrict__ Azz)
|
||||||
|
{
|
||||||
|
constexpr double F1O3 = 1.0 / 3.0;
|
||||||
|
constexpr double ONE = 1.0;
|
||||||
|
constexpr double TWO = 2.0;
|
||||||
|
|
||||||
|
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
i < d_gp.all;
|
||||||
|
i += blockDim.x * gridDim.x)
|
||||||
|
{
|
||||||
|
double lgxx = dxx[i] + ONE;
|
||||||
|
double lgyy = dyy[i] + ONE;
|
||||||
|
double lgzz = dzz[i] + ONE;
|
||||||
|
double lgxy = gxy[i];
|
||||||
|
double lgxz = gxz[i];
|
||||||
|
double lgyz = gyz[i];
|
||||||
|
|
||||||
|
double lscale = lgxx * lgyy * lgzz
|
||||||
|
+ lgxy * lgyz * lgxz
|
||||||
|
+ lgxz * lgxy * lgyz
|
||||||
|
- lgxz * lgyy * lgxz
|
||||||
|
- lgxy * lgxy * lgzz
|
||||||
|
- lgxx * lgyz * lgyz;
|
||||||
|
|
||||||
|
lscale = ONE / cbrt(lscale);
|
||||||
|
|
||||||
|
lgxx *= lscale;
|
||||||
|
lgxy *= lscale;
|
||||||
|
lgxz *= lscale;
|
||||||
|
lgyy *= lscale;
|
||||||
|
lgyz *= lscale;
|
||||||
|
lgzz *= lscale;
|
||||||
|
|
||||||
|
dxx[i] = lgxx - ONE;
|
||||||
|
gxy[i] = lgxy;
|
||||||
|
gxz[i] = lgxz;
|
||||||
|
dyy[i] = lgyy - ONE;
|
||||||
|
gyz[i] = lgyz;
|
||||||
|
dzz[i] = lgzz - ONE;
|
||||||
|
|
||||||
|
const double lgupxx = (lgyy * lgzz - lgyz * lgyz);
|
||||||
|
const double lgupxy = - (lgxy * lgzz - lgyz * lgxz);
|
||||||
|
const double lgupxz = (lgxy * lgyz - lgyy * lgxz);
|
||||||
|
const double lgupyy = (lgxx * lgzz - lgxz * lgxz);
|
||||||
|
const double lgupyz = - (lgxx * lgyz - lgxy * lgxz);
|
||||||
|
const double lgupzz = (lgxx * lgyy - lgxy * lgxy);
|
||||||
|
|
||||||
|
const double ltrA = lgupxx * Axx[i] + lgupyy * Ayy[i] + lgupzz * Azz[i]
|
||||||
|
+ TWO * (lgupxy * Axy[i] + lgupxz * Axz[i] + lgupyz * Ayz[i]);
|
||||||
|
|
||||||
|
Axx[i] -= F1O3 * lgxx * ltrA;
|
||||||
|
Axy[i] -= F1O3 * lgxy * ltrA;
|
||||||
|
Axz[i] -= F1O3 * lgxz * ltrA;
|
||||||
|
Ayy[i] -= F1O3 * lgyy * ltrA;
|
||||||
|
Ayz[i] -= F1O3 * lgyz * ltrA;
|
||||||
|
Azz[i] -= F1O3 * lgzz * ltrA;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
__global__ void kern_lowerboundset_cuda(double * __restrict__ chi, double tinny)
|
__global__ void kern_lowerboundset_cuda(double * __restrict__ chi, double tinny)
|
||||||
{
|
{
|
||||||
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
|
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
@@ -2429,6 +2586,20 @@ static void bind_matter_slots(const StepContext &ctx)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void bind_state_input_slots(const std::array<double *, BSSN_STATE_COUNT> &state)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
|
g_buf.slot[k_state_input_slots[i]] = state[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bind_state_output_slots(const std::array<double *, BSSN_STATE_COUNT> &state)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
|
g_buf.slot[k_state_rhs_slots[i]] = state[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void launch_rhs_pipeline(int all, double eps, int co)
|
static void launch_rhs_pipeline(int all, double eps, int co)
|
||||||
{
|
{
|
||||||
const double SYM = 1.0;
|
const double SYM = 1.0;
|
||||||
@@ -2763,6 +2934,87 @@ static void download_state_outputs(double **state_host_out, size_t all)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void copy_state_region_cuda(void *block_tag,
|
||||||
|
int state_index,
|
||||||
|
double *host_state,
|
||||||
|
const int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz,
|
||||||
|
cudaMemcpyKind kind)
|
||||||
|
{
|
||||||
|
if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return;
|
||||||
|
if (sx <= 0 || sy <= 0 || sz <= 0) return;
|
||||||
|
|
||||||
|
const size_t pitch = (size_t)ex[0] * sizeof(double);
|
||||||
|
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
|
||||||
|
|
||||||
|
cudaMemcpy3DParms p = {};
|
||||||
|
p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz);
|
||||||
|
p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
|
||||||
|
p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
|
||||||
|
|
||||||
|
if (kind == cudaMemcpyDeviceToHost) {
|
||||||
|
p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]);
|
||||||
|
p.dstPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]);
|
||||||
|
} else {
|
||||||
|
p.srcPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]);
|
||||||
|
p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]);
|
||||||
|
}
|
||||||
|
CUDA_CHECK(cudaMemcpy3D(&p));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void copy_state_region_packed_cuda(void *block_tag,
|
||||||
|
int state_index,
|
||||||
|
double *host_buffer,
|
||||||
|
const int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz,
|
||||||
|
cudaMemcpyKind kind)
|
||||||
|
{
|
||||||
|
if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return;
|
||||||
|
if (sx <= 0 || sy <= 0 || sz <= 0) return;
|
||||||
|
|
||||||
|
const size_t src_pitch = (size_t)ex[0] * sizeof(double);
|
||||||
|
const size_t dst_pitch = (size_t)sx * sizeof(double);
|
||||||
|
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
|
||||||
|
|
||||||
|
cudaMemcpy3DParms p = {};
|
||||||
|
p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz);
|
||||||
|
|
||||||
|
if (kind == cudaMemcpyDeviceToHost) {
|
||||||
|
p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]);
|
||||||
|
p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
|
||||||
|
p.dstPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy);
|
||||||
|
p.dstPos = make_cudaPos(0, 0, 0);
|
||||||
|
} else {
|
||||||
|
p.srcPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy);
|
||||||
|
p.srcPos = make_cudaPos(0, 0, 0);
|
||||||
|
p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]);
|
||||||
|
p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
|
||||||
|
}
|
||||||
|
|
||||||
|
CUDA_CHECK(cudaMemcpy3D(&p));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
|
||||||
|
{
|
||||||
|
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||||
|
const size_t bytes = all * sizeof(double);
|
||||||
|
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
||||||
|
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem,
|
||||||
|
(size_t)BSSN_STATE_COUNT * bytes,
|
||||||
|
cudaMemcpyDeviceToHost));
|
||||||
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
|
std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool has_resident_state(void *block_tag)
|
||||||
|
{
|
||||||
|
auto it = g_step_ctx.find(block_tag);
|
||||||
|
return it != g_step_ctx.end() && it->second.state_ready;
|
||||||
|
}
|
||||||
|
|
||||||
/* ================================================================== */
|
/* ================================================================== */
|
||||||
/* Main host function — drop-in replacement for bssn_rhs_c.C */
|
/* Main host function — drop-in replacement for bssn_rhs_c.C */
|
||||||
/* ================================================================== */
|
/* ================================================================== */
|
||||||
@@ -3266,22 +3518,53 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
|||||||
int &Symmetry,
|
int &Symmetry,
|
||||||
int &Lev,
|
int &Lev,
|
||||||
double &eps,
|
double &eps,
|
||||||
int &co)
|
int &co,
|
||||||
|
int &keep_resident_state,
|
||||||
|
int &apply_enforce_ga,
|
||||||
|
double &chitiny)
|
||||||
{
|
{
|
||||||
(void)T;
|
(void)T;
|
||||||
(void)Lev;
|
(void)state_host_out;
|
||||||
|
|
||||||
if (RK4 < 0 || RK4 > 3) return 1;
|
if (RK4 < 0 || RK4 > 3) return 1;
|
||||||
|
|
||||||
init_gpu_dispatch();
|
init_gpu_dispatch();
|
||||||
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
const bool profile = cuda_profile_enabled();
|
||||||
|
const double t_total0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||||
|
double state_ms = 0.0;
|
||||||
|
double matter_ms = 0.0;
|
||||||
|
double rhs_ms = 0.0;
|
||||||
|
double bc_ms = 0.0;
|
||||||
|
double finalize_ms = 0.0;
|
||||||
|
double output_ms = 0.0;
|
||||||
|
|
||||||
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||||
const size_t bytes = all * sizeof(double);
|
const size_t bytes = all * sizeof(double);
|
||||||
|
|
||||||
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
|
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
|
||||||
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
||||||
upload_state_inputs(state_host_in, all);
|
const bool use_resident_state = (keep_resident_state != 0);
|
||||||
|
if (use_resident_state) {
|
||||||
|
bind_state_input_slots(ctx.d_state_curr);
|
||||||
|
bind_state_output_slots(ctx.d_state_next);
|
||||||
|
}
|
||||||
|
double t0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||||
|
if (!use_resident_state || RK4 == 0 || !ctx.state_ready) {
|
||||||
|
upload_state_inputs(state_host_in, all);
|
||||||
|
}
|
||||||
|
if (apply_enforce_ga) {
|
||||||
|
kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
|
||||||
|
g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
|
||||||
|
g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
|
||||||
|
g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
|
||||||
|
}
|
||||||
|
if (profile) {
|
||||||
|
cuda_profile_sync();
|
||||||
|
state_ms += cuda_profile_now_ms() - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
t0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||||
if (RK4 == 0) {
|
if (RK4 == 0) {
|
||||||
upload_matter_cache(ctx, matter_host, all);
|
upload_matter_cache(ctx, matter_host, all);
|
||||||
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
|
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
|
||||||
@@ -3291,9 +3574,19 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
|||||||
upload_matter_cache(ctx, matter_host, all);
|
upload_matter_cache(ctx, matter_host, all);
|
||||||
}
|
}
|
||||||
bind_matter_slots(ctx);
|
bind_matter_slots(ctx);
|
||||||
|
if (profile) {
|
||||||
|
cuda_profile_sync();
|
||||||
|
matter_ms += cuda_profile_now_ms() - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
t0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||||
launch_rhs_pipeline((int)all, eps, co);
|
launch_rhs_pipeline((int)all, eps, co);
|
||||||
|
if (profile) {
|
||||||
|
cuda_profile_sync();
|
||||||
|
rhs_ms += cuda_profile_now_ms() - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
t0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||||
if (apply_bam_bc) {
|
if (apply_bam_bc) {
|
||||||
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
gpu_sommerfeld_routbam(g_buf.slot[k_state_input_slots[i]],
|
gpu_sommerfeld_routbam(g_buf.slot[k_state_input_slots[i]],
|
||||||
@@ -3305,7 +3598,12 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
|||||||
X, Y, Z, bbox, Symmetry);
|
X, Y, Z, bbox, Symmetry);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (profile) {
|
||||||
|
cuda_profile_sync();
|
||||||
|
bc_ms += cuda_profile_now_ms() - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
t0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||||
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_state0[i],
|
kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_state0[i],
|
||||||
g_buf.slot[k_state_rhs_slots[i]],
|
g_buf.slot[k_state_rhs_slots[i]],
|
||||||
@@ -3314,13 +3612,119 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
|||||||
RK4);
|
RK4);
|
||||||
}
|
}
|
||||||
|
|
||||||
download_state_outputs(state_host_out, all);
|
kern_lowerboundset_cuda<<<grid(all), BLK>>>(g_buf.slot[S_chi_rhs], chitiny);
|
||||||
if (RK4 == 3) {
|
if (profile) {
|
||||||
|
cuda_profile_sync();
|
||||||
|
finalize_ms += cuda_profile_now_ms() - t0;
|
||||||
|
}
|
||||||
|
|
||||||
|
t0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||||
|
if (use_resident_state) {
|
||||||
|
std::swap(ctx.d_state_curr_mem, ctx.d_state_next_mem);
|
||||||
|
ctx.d_state_curr.swap(ctx.d_state_next);
|
||||||
|
ctx.state_ready = true;
|
||||||
|
} else {
|
||||||
|
download_state_outputs(state_host_out, all);
|
||||||
|
}
|
||||||
|
if (RK4 == 3 && !use_resident_state) {
|
||||||
release_step_ctx(block_tag);
|
release_step_ctx(block_tag);
|
||||||
}
|
}
|
||||||
|
if (profile) {
|
||||||
|
cuda_profile_sync();
|
||||||
|
output_ms += cuda_profile_now_ms() - t0;
|
||||||
|
CudaProfileStats &stats = cuda_profile_stats();
|
||||||
|
stats.calls++;
|
||||||
|
stats.total_ms += cuda_profile_now_ms() - t_total0;
|
||||||
|
stats.state_ms += state_ms;
|
||||||
|
stats.matter_ms += matter_ms;
|
||||||
|
stats.rhs_ms += rhs_ms;
|
||||||
|
stats.bc_ms += bc_ms;
|
||||||
|
stats.finalize_ms += finalize_ms;
|
||||||
|
stats.output_ms += output_ms;
|
||||||
|
cuda_profile_maybe_log();
|
||||||
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
int bssn_cuda_copy_state_region_to_host(void *block_tag,
|
||||||
|
int state_index,
|
||||||
|
double *host_state,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz)
|
||||||
|
{
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
copy_state_region_cuda(block_tag, state_index, host_state, ex,
|
||||||
|
i0, j0, k0, sx, sy, sz, cudaMemcpyDeviceToHost);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
int bssn_cuda_copy_state_region_from_host(void *block_tag,
|
||||||
|
int state_index,
|
||||||
|
double *host_state,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz)
|
||||||
|
{
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
copy_state_region_cuda(block_tag, state_index, host_state, ex,
|
||||||
|
i0, j0, k0, sx, sy, sz, cudaMemcpyHostToDevice);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
int bssn_cuda_download_resident_state(void *block_tag,
|
||||||
|
int *ex,
|
||||||
|
double **state_host_out)
|
||||||
|
{
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
download_resident_state(block_tag, ex, state_host_out);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
|
||||||
|
int state_index,
|
||||||
|
double *host_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz)
|
||||||
|
{
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
|
||||||
|
i0, j0, k0, sx, sy, sz, cudaMemcpyDeviceToHost);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
|
||||||
|
int state_index,
|
||||||
|
double *host_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz)
|
||||||
|
{
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
|
||||||
|
i0, j0, k0, sx, sy, sz, cudaMemcpyHostToDevice);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
int bssn_cuda_has_resident_state(void *block_tag)
|
||||||
|
{
|
||||||
|
init_gpu_dispatch();
|
||||||
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||||
|
return has_resident_state(block_tag) ? 1 : 0;
|
||||||
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
void bssn_cuda_release_step_ctx(void *block_tag)
|
void bssn_cuda_release_step_ctx(void *block_tag)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -49,7 +49,44 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
|||||||
int &Symmetry,
|
int &Symmetry,
|
||||||
int &Lev,
|
int &Lev,
|
||||||
double &eps,
|
double &eps,
|
||||||
int &co);
|
int &co,
|
||||||
|
int &keep_resident_state,
|
||||||
|
int &apply_enforce_ga,
|
||||||
|
double &chitiny);
|
||||||
|
|
||||||
|
int bssn_cuda_copy_state_region_to_host(void *block_tag,
|
||||||
|
int state_index,
|
||||||
|
double *host_state,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int bssn_cuda_copy_state_region_from_host(void *block_tag,
|
||||||
|
int state_index,
|
||||||
|
double *host_state,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int bssn_cuda_download_resident_state(void *block_tag,
|
||||||
|
int *ex,
|
||||||
|
double **state_host_out);
|
||||||
|
|
||||||
|
int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
|
||||||
|
int state_index,
|
||||||
|
double *host_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
|
||||||
|
int state_index,
|
||||||
|
double *host_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int bssn_cuda_has_resident_state(void *block_tag);
|
||||||
|
|
||||||
void bssn_cuda_release_step_ctx(void *block_tag);
|
void bssn_cuda_release_step_ctx(void *block_tag);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user