Stabilize GPU output path and MPI sync
This commit is contained in:
@@ -1,9 +1,115 @@
|
||||
|
||||
#include "Parallel.h"
|
||||
#include "fmisc.h"
|
||||
#include "prolongrestrict.h"
|
||||
#include "misc.h"
|
||||
#include "parameters.h"
|
||||
#include "fmisc.h"
|
||||
#include "prolongrestrict.h"
|
||||
#include "misc.h"
|
||||
#include "parameters.h"
|
||||
#include <cstring>
|
||||
|
||||
namespace {
|
||||
|
||||
const char *g_parallel_transfer_context = "Parallel::transfer";
|
||||
int parallel_next_transfer_tag()
|
||||
{
|
||||
const char *ctx = g_parallel_transfer_context ? g_parallel_transfer_context : "Parallel::transfer";
|
||||
unsigned int hash = 5381u;
|
||||
while (*ctx)
|
||||
{
|
||||
hash = ((hash << 5) + hash) + static_cast<unsigned char>(*ctx);
|
||||
ctx++;
|
||||
}
|
||||
return 32 + static_cast<int>(hash % 32000u);
|
||||
}
|
||||
|
||||
struct ParallelTransferContextGuard
|
||||
{
|
||||
const char *prev;
|
||||
explicit ParallelTransferContextGuard(const char *context) : prev(g_parallel_transfer_context)
|
||||
{
|
||||
g_parallel_transfer_context = context;
|
||||
}
|
||||
~ParallelTransferContextGuard()
|
||||
{
|
||||
g_parallel_transfer_context = prev;
|
||||
}
|
||||
};
|
||||
|
||||
void parallel_report_mpi_error(const char *context, int errcode, int req_no)
|
||||
{
|
||||
char errstr[MPI_MAX_ERROR_STRING];
|
||||
int len = 0;
|
||||
MPI_Error_string(errcode, errstr, &len);
|
||||
int myrank = -1;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||
fprintf(stderr, "MPI failure on rank %d in %s (transfer_ctx=%s, req_no=%d): %.*s\n",
|
||||
myrank, context, g_parallel_transfer_context, req_no, len, errstr);
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
void parallel_report_mpi_status_errors(const char *context, int req_no,
|
||||
int status_count, MPI_Status *stats)
|
||||
{
|
||||
if (!stats)
|
||||
return;
|
||||
|
||||
int myrank = -1;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||
|
||||
const int count = (status_count >= 0 && status_count <= req_no) ? status_count : req_no;
|
||||
for (int i = 0; i < count; i++)
|
||||
{
|
||||
if (stats[i].MPI_ERROR != MPI_SUCCESS)
|
||||
{
|
||||
char errstr[MPI_MAX_ERROR_STRING];
|
||||
int len = 0;
|
||||
MPI_Error_string(stats[i].MPI_ERROR, errstr, &len);
|
||||
fprintf(stderr,
|
||||
"MPI request failure on rank %d in %s (transfer_ctx=%s, status_idx=%d, source=%d, tag=%d): %.*s\n",
|
||||
myrank, context, g_parallel_transfer_context, i,
|
||||
stats[i].MPI_SOURCE, stats[i].MPI_TAG, len, errstr);
|
||||
}
|
||||
}
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
int parallel_waitsome_checked(int req_no, MPI_Request *reqs, int *outcount,
|
||||
int *completed, MPI_Status *stats,
|
||||
const char *context)
|
||||
{
|
||||
MPI_Errhandler old_handler;
|
||||
MPI_Comm_get_errhandler(MPI_COMM_WORLD, &old_handler);
|
||||
MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
|
||||
int rc = MPI_Waitsome(req_no, reqs, outcount, completed, stats);
|
||||
MPI_Comm_set_errhandler(MPI_COMM_WORLD, old_handler);
|
||||
MPI_Errhandler_free(&old_handler);
|
||||
if (rc != MPI_SUCCESS)
|
||||
{
|
||||
parallel_report_mpi_status_errors(context, req_no, outcount ? *outcount : req_no, stats);
|
||||
parallel_report_mpi_error(context, rc, req_no);
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
int parallel_waitall_checked(int req_no, MPI_Request *reqs, MPI_Status *stats,
|
||||
const char *context)
|
||||
{
|
||||
MPI_Errhandler old_handler;
|
||||
MPI_Comm_get_errhandler(MPI_COMM_WORLD, &old_handler);
|
||||
MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
|
||||
int rc = MPI_Waitall(req_no, reqs, stats);
|
||||
MPI_Comm_set_errhandler(MPI_COMM_WORLD, old_handler);
|
||||
MPI_Errhandler_free(&old_handler);
|
||||
if (rc != MPI_SUCCESS)
|
||||
{
|
||||
parallel_report_mpi_status_errors(context, req_no, req_no, stats);
|
||||
parallel_report_mpi_error(context, rc, req_no);
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion
|
||||
{
|
||||
@@ -3896,10 +4002,11 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
|
||||
MPI_Request *reqs = new MPI_Request[2 * cpusize];
|
||||
MPI_Status *stats = new MPI_Status[2 * cpusize];
|
||||
int *req_node = new int[2 * cpusize];
|
||||
int *req_is_recv = new int[2 * cpusize];
|
||||
int *completed = new int[2 * cpusize];
|
||||
int req_no = 0;
|
||||
int pending_recv = 0;
|
||||
int *req_is_recv = new int[2 * cpusize];
|
||||
int *completed = new int[2 * cpusize];
|
||||
int req_no = 0;
|
||||
int pending_recv = 0;
|
||||
const int mpi_tag = parallel_next_transfer_tag();
|
||||
|
||||
double **send_data = new double *[cpusize];
|
||||
double **rec_data = new double *[cpusize];
|
||||
@@ -3913,31 +4020,31 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
|
||||
}
|
||||
|
||||
// Post receives first so peers can progress rendezvous early.
|
||||
for (node = 0; node < cpusize; node++)
|
||||
{
|
||||
if (node == myrank) continue;
|
||||
|
||||
recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
||||
if (recv_lengths[node] > 0)
|
||||
{
|
||||
for (node = 0; node < cpusize; node++)
|
||||
{
|
||||
if (node == myrank) continue;
|
||||
|
||||
recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
|
||||
if (recv_lengths[node] > 0)
|
||||
{
|
||||
rec_data[node] = new double[recv_lengths[node]];
|
||||
if (!rec_data[node])
|
||||
{
|
||||
cout << "out of memory when new in short transfer, place 1" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
|
||||
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, reqs + req_no);
|
||||
req_node[req_no] = node;
|
||||
req_is_recv[req_no] = 1;
|
||||
req_no++;
|
||||
pending_recv++;
|
||||
}
|
||||
}
|
||||
|
||||
// Local transfer on this rank.
|
||||
recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||
if (recv_lengths[myrank] > 0)
|
||||
{
|
||||
}
|
||||
|
||||
// Local transfer on this rank.
|
||||
recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||
if (recv_lengths[myrank] > 0)
|
||||
{
|
||||
rec_data[myrank] = new double[recv_lengths[myrank]];
|
||||
if (!rec_data[myrank])
|
||||
{
|
||||
@@ -3947,33 +4054,33 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
|
||||
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
|
||||
// Pack and post sends.
|
||||
for (node = 0; node < cpusize; node++)
|
||||
{
|
||||
if (node == myrank) continue;
|
||||
|
||||
send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
if (send_lengths[node] > 0)
|
||||
{
|
||||
send_data[node] = new double[send_lengths[node]];
|
||||
// Pack and post sends.
|
||||
for (node = 0; node < cpusize; node++)
|
||||
{
|
||||
if (node == myrank) continue;
|
||||
|
||||
send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
if (send_lengths[node] > 0)
|
||||
{
|
||||
send_data[node] = new double[send_lengths[node]];
|
||||
if (!send_data[node])
|
||||
{
|
||||
cout << "out of memory when new in short transfer, place 3" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
|
||||
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, reqs + req_no);
|
||||
req_node[req_no] = node;
|
||||
req_is_recv[req_no] = 0;
|
||||
req_no++;
|
||||
}
|
||||
}
|
||||
|
||||
// Unpack as soon as receive completes to reduce pure wait time.
|
||||
while (pending_recv > 0)
|
||||
req_no++;
|
||||
}
|
||||
}
|
||||
// Unpack as soon as receive completes to reduce pure wait time.
|
||||
while (pending_recv > 0)
|
||||
{
|
||||
int outcount = 0;
|
||||
MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
|
||||
parallel_waitsome_checked(req_no, reqs, &outcount, completed, stats,
|
||||
"Parallel::transfer");
|
||||
if (outcount == MPI_UNDEFINED) break;
|
||||
|
||||
for (int i = 0; i < outcount; i++)
|
||||
@@ -3988,7 +4095,7 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
|
||||
}
|
||||
}
|
||||
|
||||
if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
|
||||
if (req_no > 0) parallel_waitall_checked(req_no, reqs, stats, "Parallel::transfer");
|
||||
|
||||
if (rec_data[myrank])
|
||||
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||
@@ -4005,12 +4112,12 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
|
||||
delete[] stats;
|
||||
delete[] req_node;
|
||||
delete[] req_is_recv;
|
||||
delete[] completed;
|
||||
delete[] send_data;
|
||||
delete[] rec_data;
|
||||
delete[] send_lengths;
|
||||
delete[] recv_lengths;
|
||||
}
|
||||
delete[] completed;
|
||||
delete[] send_data;
|
||||
delete[] rec_data;
|
||||
delete[] send_lengths;
|
||||
delete[] recv_lengths;
|
||||
}
|
||||
//
|
||||
void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
|
||||
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
|
||||
@@ -4027,8 +4134,9 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
|
||||
int *req_node = new int[2 * cpusize];
|
||||
int *req_is_recv = new int[2 * cpusize];
|
||||
int *completed = new int[2 * cpusize];
|
||||
int req_no = 0;
|
||||
int pending_recv = 0;
|
||||
int req_no = 0;
|
||||
int pending_recv = 0;
|
||||
const int mpi_tag = parallel_next_transfer_tag();
|
||||
|
||||
double **send_data = new double *[cpusize];
|
||||
double **rec_data = new double *[cpusize];
|
||||
@@ -4055,7 +4163,7 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
|
||||
cout << "out of memory when new in short transfer, place 1" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
|
||||
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, reqs + req_no);
|
||||
req_node[req_no] = node;
|
||||
req_is_recv[req_no] = 1;
|
||||
req_no++;
|
||||
@@ -4091,7 +4199,7 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
|
||||
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, reqs + req_no);
|
||||
req_node[req_no] = node;
|
||||
req_is_recv[req_no] = 0;
|
||||
req_no++;
|
||||
@@ -4102,7 +4210,8 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
|
||||
while (pending_recv > 0)
|
||||
{
|
||||
int outcount = 0;
|
||||
MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
|
||||
parallel_waitsome_checked(req_no, reqs, &outcount, completed, stats,
|
||||
"Parallel::transfermix");
|
||||
if (outcount == MPI_UNDEFINED) break;
|
||||
|
||||
for (int i = 0; i < outcount; i++)
|
||||
@@ -4117,7 +4226,7 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
|
||||
}
|
||||
}
|
||||
|
||||
if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
|
||||
if (req_no > 0) parallel_waitall_checked(req_no, reqs, stats, "Parallel::transfermix");
|
||||
|
||||
if (rec_data[myrank])
|
||||
data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||
@@ -4134,16 +4243,21 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
|
||||
delete[] stats;
|
||||
delete[] req_node;
|
||||
delete[] req_is_recv;
|
||||
delete[] completed;
|
||||
delete[] send_data;
|
||||
delete[] rec_data;
|
||||
delete[] send_lengths;
|
||||
delete[] recv_lengths;
|
||||
}
|
||||
void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
|
||||
{
|
||||
int cpusize;
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
|
||||
delete[] completed;
|
||||
delete[] send_data;
|
||||
delete[] rec_data;
|
||||
delete[] send_lengths;
|
||||
delete[] recv_lengths;
|
||||
}
|
||||
void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
|
||||
{
|
||||
Sync(Pat, VarList, Symmetry, "Parallel::Sync(Patch)");
|
||||
}
|
||||
|
||||
void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry, const char *context)
|
||||
{
|
||||
int cpusize;
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
|
||||
|
||||
MyList<Parallel::gridseg> *dst;
|
||||
MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
|
||||
@@ -4159,7 +4273,10 @@ void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
|
||||
// but for transfer_dst[node] the data may locate on any node
|
||||
}
|
||||
|
||||
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
|
||||
{
|
||||
ParallelTransferContextGuard transfer_ctx(context ? context : "Parallel::Sync(Patch)");
|
||||
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
|
||||
}
|
||||
|
||||
if (dst)
|
||||
dst->destroyList();
|
||||
@@ -4177,15 +4294,26 @@ void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
|
||||
delete[] transfer_src;
|
||||
delete[] transfer_dst;
|
||||
}
|
||||
void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
|
||||
{
|
||||
// Patch inner Synch
|
||||
MyList<Patch> *Pp = PatL;
|
||||
while (Pp)
|
||||
{
|
||||
Sync(Pp->data, VarList, Symmetry);
|
||||
Pp = Pp->next;
|
||||
}
|
||||
void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
|
||||
{
|
||||
Sync(PatL, VarList, Symmetry, "Parallel::Sync(PatchList)");
|
||||
}
|
||||
|
||||
void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context)
|
||||
{
|
||||
std::string patchlist_context = context ? std::string(context) + " [patchlist]" : "Parallel::Sync(PatchList)";
|
||||
|
||||
// Patch inner Synch
|
||||
MyList<Patch> *Pp = PatL;
|
||||
int patch_index = 0;
|
||||
while (Pp)
|
||||
{
|
||||
std::string patch_context = context ? std::string(context) + " [patch " + std::to_string(patch_index) + "]"
|
||||
: "Parallel::Sync(Patch)";
|
||||
Sync(Pp->data, VarList, Symmetry, patch_context.c_str());
|
||||
Pp = Pp->next;
|
||||
patch_index++;
|
||||
}
|
||||
|
||||
// Patch inter Synch
|
||||
int cpusize;
|
||||
@@ -4204,7 +4332,10 @@ void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
|
||||
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
|
||||
}
|
||||
|
||||
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
|
||||
{
|
||||
ParallelTransferContextGuard transfer_ctx(patchlist_context.c_str());
|
||||
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
|
||||
}
|
||||
|
||||
if (dst)
|
||||
dst->destroyList();
|
||||
@@ -4224,10 +4355,15 @@ void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
|
||||
}
|
||||
// Merged Sync: collect all intra-patch and inter-patch grid segment lists,
|
||||
// then issue a single transfer() call instead of N+1 separate ones.
|
||||
void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
|
||||
{
|
||||
int cpusize;
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
|
||||
void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
|
||||
{
|
||||
Sync_merged(PatL, VarList, Symmetry, "Parallel::Sync_merged");
|
||||
}
|
||||
|
||||
void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context)
|
||||
{
|
||||
int cpusize;
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
|
||||
|
||||
MyList<Parallel::gridseg> **combined_src = new MyList<Parallel::gridseg> *[cpusize];
|
||||
MyList<Parallel::gridseg> **combined_dst = new MyList<Parallel::gridseg> *[cpusize];
|
||||
@@ -4301,8 +4437,11 @@ void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
|
||||
if (dst_buffer)
|
||||
dst_buffer->destroyList();
|
||||
|
||||
// Phase C: Single transfer
|
||||
transfer(combined_src, combined_dst, VarList, VarList, Symmetry);
|
||||
// Phase C: Single transfer
|
||||
{
|
||||
ParallelTransferContextGuard transfer_ctx(context ? context : "Parallel::Sync_merged");
|
||||
transfer(combined_src, combined_dst, VarList, VarList, Symmetry);
|
||||
}
|
||||
|
||||
// Phase D: Cleanup
|
||||
for (int node = 0; node < cpusize; node++)
|
||||
@@ -4380,8 +4519,9 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||
int cpusize = cache.cpusize;
|
||||
|
||||
int req_no = 0;
|
||||
int pending_recv = 0;
|
||||
int req_no = 0;
|
||||
int pending_recv = 0;
|
||||
const int mpi_tag = parallel_next_transfer_tag();
|
||||
int node;
|
||||
int *req_node = cache.tc_req_node;
|
||||
int *req_is_recv = cache.tc_req_is_recv;
|
||||
@@ -4402,7 +4542,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
||||
cache.recv_bufs[node] = new double[rlength];
|
||||
cache.recv_buf_caps[node] = rlength;
|
||||
}
|
||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||
req_node[req_no] = node;
|
||||
req_is_recv[req_no] = 1;
|
||||
req_no++;
|
||||
@@ -4440,7 +4580,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
||||
cache.send_buf_caps[node] = slength;
|
||||
}
|
||||
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||
req_node[req_no] = node;
|
||||
req_is_recv[req_no] = 0;
|
||||
req_no++;
|
||||
@@ -4451,7 +4591,8 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
||||
while (pending_recv > 0)
|
||||
{
|
||||
int outcount = 0;
|
||||
MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
|
||||
parallel_waitsome_checked(req_no, cache.reqs, &outcount, completed, cache.stats,
|
||||
"Parallel::transfer_cached");
|
||||
if (outcount == MPI_UNDEFINED) break;
|
||||
|
||||
for (int i = 0; i < outcount; i++)
|
||||
@@ -4466,11 +4607,11 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
||||
}
|
||||
}
|
||||
|
||||
if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
|
||||
if (req_no > 0) parallel_waitall_checked(req_no, cache.reqs, cache.stats, "Parallel::transfer_cached");
|
||||
|
||||
if (self_len > 0)
|
||||
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
if (self_len > 0)
|
||||
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
|
||||
{
|
||||
if (!cache.valid)
|
||||
@@ -4669,12 +4810,13 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
||||
}
|
||||
|
||||
// Now pack and post async MPI operations
|
||||
int myrank;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||
int cpusize = cache.cpusize;
|
||||
state.req_no = 0;
|
||||
state.active = true;
|
||||
state.pending_recv = 0;
|
||||
int myrank;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||
int cpusize = cache.cpusize;
|
||||
state.req_no = 0;
|
||||
state.active = true;
|
||||
state.mpi_tag = parallel_next_transfer_tag();
|
||||
state.pending_recv = 0;
|
||||
// Allocate tracking arrays
|
||||
delete[] state.req_node; delete[] state.req_is_recv;
|
||||
state.req_node = new int[cache.max_reqs];
|
||||
@@ -4725,7 +4867,7 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
||||
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
|
||||
state.req_node[state.req_no] = node;
|
||||
state.req_is_recv[state.req_no] = 0;
|
||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, state.mpi_tag, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
||||
}
|
||||
int rlength;
|
||||
if (!cache.lengths_valid) {
|
||||
@@ -4745,7 +4887,7 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
|
||||
state.req_node[state.req_no] = node;
|
||||
state.req_is_recv[state.req_no] = 1;
|
||||
state.pending_recv++;
|
||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, state.mpi_tag, MPI_COMM_WORLD, cache.reqs + state.req_no++);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4775,7 +4917,8 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
|
||||
while (pending > 0)
|
||||
{
|
||||
int outcount = 0;
|
||||
MPI_Waitsome(state.req_no, cache.reqs, &outcount, completed, cache.stats);
|
||||
parallel_waitsome_checked(state.req_no, cache.reqs, &outcount, completed, cache.stats,
|
||||
"Parallel::Sync_finish");
|
||||
if (outcount == MPI_UNDEFINED) break;
|
||||
for (int i = 0; i < outcount; i++)
|
||||
{
|
||||
@@ -4791,8 +4934,8 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
|
||||
delete[] completed;
|
||||
}
|
||||
|
||||
// Wait for remaining sends
|
||||
if (state.req_no > 0) MPI_Waitall(state.req_no, cache.reqs, cache.stats);
|
||||
// Wait for remaining sends
|
||||
if (state.req_no > 0) parallel_waitall_checked(state.req_no, cache.reqs, cache.stats, "Parallel::Sync_finish");
|
||||
|
||||
delete[] state.req_node; state.req_node = 0;
|
||||
delete[] state.req_is_recv; state.req_is_recv = 0;
|
||||
@@ -5235,7 +5378,10 @@ void Parallel::PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry)
|
||||
build_PhysBD_gstl(Pat, src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
|
||||
}
|
||||
|
||||
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
|
||||
{
|
||||
ParallelTransferContextGuard transfer_ctx("Parallel::PeriodicBD");
|
||||
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
|
||||
}
|
||||
|
||||
if (dst)
|
||||
dst->destroyList();
|
||||
@@ -5506,7 +5652,10 @@ void Parallel::Prolong(Patch *Patc, Patch *Patf,
|
||||
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
|
||||
}
|
||||
|
||||
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
|
||||
{
|
||||
ParallelTransferContextGuard transfer_ctx("Parallel::Prolong");
|
||||
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
|
||||
if (dst)
|
||||
dst->destroyList();
|
||||
@@ -5565,7 +5714,10 @@ void Parallel::Restrict(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
|
||||
}
|
||||
|
||||
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
|
||||
{
|
||||
ParallelTransferContextGuard transfer_ctx("Parallel::Restrict");
|
||||
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
|
||||
if (dst)
|
||||
dst->destroyList();
|
||||
@@ -5609,7 +5761,10 @@ void Parallel::Restrict_after(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
|
||||
}
|
||||
|
||||
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
|
||||
{
|
||||
ParallelTransferContextGuard transfer_ctx("Parallel::Restrict_after");
|
||||
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
|
||||
if (dst)
|
||||
dst->destroyList();
|
||||
@@ -5654,7 +5809,10 @@ void Parallel::OutBdLow2Hi(Patch *Patc, Patch *Patf,
|
||||
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
|
||||
}
|
||||
|
||||
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
|
||||
{
|
||||
ParallelTransferContextGuard transfer_ctx("Parallel::OutBdLow2Hi(Patch)");
|
||||
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
|
||||
if (dst)
|
||||
dst->destroyList();
|
||||
@@ -5710,7 +5868,10 @@ void Parallel::OutBdLow2Hi(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
|
||||
}
|
||||
|
||||
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
|
||||
{
|
||||
ParallelTransferContextGuard transfer_ctx("Parallel::OutBdLow2Hi(PatchList)");
|
||||
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
|
||||
if (dst)
|
||||
dst->destroyList();
|
||||
@@ -5981,13 +6142,14 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
int myrank;
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||
int cpusize = cache.cpusize;
|
||||
|
||||
int req_no = 0;
|
||||
int pending_recv = 0;
|
||||
int *req_node = new int[cache.max_reqs];
|
||||
int *req_is_recv = new int[cache.max_reqs];
|
||||
int *completed = new int[cache.max_reqs];
|
||||
int cpusize = cache.cpusize;
|
||||
|
||||
int req_no = 0;
|
||||
int pending_recv = 0;
|
||||
const int mpi_tag = parallel_next_transfer_tag();
|
||||
int *req_node = new int[cache.max_reqs];
|
||||
int *req_is_recv = new int[cache.max_reqs];
|
||||
int *completed = new int[cache.max_reqs];
|
||||
|
||||
// Post receives first so peers can progress rendezvous early.
|
||||
for (int node = 0; node < cpusize; node++)
|
||||
@@ -6004,7 +6166,7 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
cache.recv_bufs[node] = new double[rlength];
|
||||
cache.recv_buf_caps[node] = rlength;
|
||||
}
|
||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||
req_node[req_no] = node;
|
||||
req_is_recv[req_no] = 1;
|
||||
req_no++;
|
||||
@@ -6042,7 +6204,7 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
cache.send_buf_caps[node] = slength;
|
||||
}
|
||||
data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
|
||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, cache.reqs + req_no);
|
||||
req_node[req_no] = node;
|
||||
req_is_recv[req_no] = 0;
|
||||
req_no++;
|
||||
@@ -6053,7 +6215,8 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
while (pending_recv > 0)
|
||||
{
|
||||
int outcount = 0;
|
||||
MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
|
||||
parallel_waitsome_checked(req_no, cache.reqs, &outcount, completed, cache.stats,
|
||||
"Parallel::transfermix_cached");
|
||||
if (outcount == MPI_UNDEFINED) break;
|
||||
|
||||
for (int i = 0; i < outcount; i++)
|
||||
@@ -6068,10 +6231,10 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
|
||||
}
|
||||
}
|
||||
|
||||
if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
|
||||
if (req_no > 0) parallel_waitall_checked(req_no, cache.reqs, cache.stats, "Parallel::transfermix_cached");
|
||||
|
||||
if (self_len > 0)
|
||||
data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||
if (self_len > 0)
|
||||
data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||
|
||||
delete[] req_node;
|
||||
delete[] req_is_recv;
|
||||
@@ -6787,7 +6950,10 @@ void Parallel::fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyLis
|
||||
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
|
||||
}
|
||||
|
||||
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
|
||||
{
|
||||
ParallelTransferContextGuard transfer_ctx("Parallel::Interp copy");
|
||||
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
|
||||
}
|
||||
|
||||
for (int node = 0; node < cpusize; node++)
|
||||
{
|
||||
@@ -6826,7 +6992,10 @@ void Parallel::fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyLis
|
||||
Sync(PatcL, FutureList, Symmetry);
|
||||
}
|
||||
//<~~~prolong then
|
||||
transfer(transfer_src, transfer_dst, FutureList, FutureList, Symmetry);
|
||||
{
|
||||
ParallelTransferContextGuard transfer_ctx("Parallel::Interp prolong FutureList");
|
||||
transfer(transfer_src, transfer_dst, FutureList, FutureList, Symmetry);
|
||||
}
|
||||
|
||||
// for StateList
|
||||
// time interpolation part
|
||||
@@ -6842,7 +7011,10 @@ void Parallel::fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyLis
|
||||
Sync(PatcL, tmList, Symmetry);
|
||||
}
|
||||
//<~~~prolong then
|
||||
transfer(transfer_src, transfer_dst, tmList, StateList, Symmetry);
|
||||
{
|
||||
ParallelTransferContextGuard transfer_ctx("Parallel::Interp prolong StateList");
|
||||
transfer(transfer_src, transfer_dst, tmList, StateList, Symmetry);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -6853,7 +7025,10 @@ void Parallel::fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyLis
|
||||
Sync(PatcL, VarList, Symmetry);
|
||||
}
|
||||
//<~~~prolong then
|
||||
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
|
||||
{
|
||||
ParallelTransferContextGuard transfer_ctx("Parallel::Interp prolong VarList");
|
||||
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
|
||||
}
|
||||
}
|
||||
|
||||
for (int node = 0; node < cpusize; node++)
|
||||
|
||||
@@ -89,9 +89,12 @@ namespace Parallel
|
||||
void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst,
|
||||
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
|
||||
int Symmetry);
|
||||
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
|
||||
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
|
||||
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
|
||||
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
|
||||
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry, const char *context);
|
||||
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
|
||||
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
|
||||
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
|
||||
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
|
||||
|
||||
struct SyncCache {
|
||||
bool valid;
|
||||
@@ -121,14 +124,15 @@ namespace Parallel
|
||||
MyList<var> *VarList1, MyList<var> *VarList2,
|
||||
int Symmetry, SyncCache &cache);
|
||||
|
||||
struct AsyncSyncState {
|
||||
int req_no;
|
||||
bool active;
|
||||
int *req_node;
|
||||
int *req_is_recv;
|
||||
int pending_recv;
|
||||
AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {}
|
||||
};
|
||||
struct AsyncSyncState {
|
||||
int req_no;
|
||||
bool active;
|
||||
int mpi_tag;
|
||||
int *req_node;
|
||||
int *req_is_recv;
|
||||
int pending_recv;
|
||||
AsyncSyncState() : req_no(0), active(false), mpi_tag(0), req_node(0), req_is_recv(0), pending_recv(0) {}
|
||||
};
|
||||
|
||||
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
|
||||
SyncCache &cache, AsyncSyncState &state);
|
||||
|
||||
@@ -14,7 +14,8 @@ using namespace std;
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include <time.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "macrodef.h"
|
||||
#include "misc.h"
|
||||
@@ -2036,9 +2037,10 @@ void bssn_class::Read_Ansorg()
|
||||
|
||||
void bssn_class::Evolve(int Steps)
|
||||
{
|
||||
clock_t prev_clock, curr_clock;
|
||||
double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
|
||||
LastAnas = 0;
|
||||
clock_t prev_clock, curr_clock;
|
||||
double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
|
||||
LastAnas = 0;
|
||||
LastConsOut = 0;
|
||||
#if 0
|
||||
//initial checkpoint for special uasge
|
||||
{
|
||||
@@ -2296,18 +2298,21 @@ void bssn_class::Evolve(int Steps)
|
||||
////////////////////////////////////////////////////////////
|
||||
|
||||
// When LastCheck >= CheckTime, perform runtime checks and output status data
|
||||
if (LastCheck >= CheckTime)
|
||||
{
|
||||
LastCheck = 0;
|
||||
|
||||
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
|
||||
if (LastCheck >= CheckTime)
|
||||
{
|
||||
LastCheck = 0;
|
||||
|
||||
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
|
||||
CheckPoint->writecheck_cgh(PhysTime, GH);
|
||||
#ifdef WithShell
|
||||
CheckPoint->writecheck_sh(PhysTime, SH);
|
||||
#endif
|
||||
CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
|
||||
}
|
||||
|
||||
// Keep output/analysis phases aligned across ranks before the next coarse step.
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
}
|
||||
/*
|
||||
#ifdef With_AHF
|
||||
// final apparent horizon finding
|
||||
@@ -6253,7 +6258,7 @@ for(int ilev = GH->levels-1;ilev>=lev;ilev--)
|
||||
for(int ilev=GH->levels-1;ilev>lev;ilev--)
|
||||
RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List);
|
||||
#else
|
||||
Parallel::Sync(GH->PatL[lev], DG_List, Symmetry);
|
||||
Parallel::Sync(GH->PatL[lev], DG_List, Symmetry, "bssn_class::Compute_Psi4");
|
||||
#endif
|
||||
|
||||
#ifdef WithShell
|
||||
@@ -6908,10 +6913,10 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
|
||||
{
|
||||
LastAnas += dT_lev;
|
||||
|
||||
if (LastAnas >= AnasTime)
|
||||
{
|
||||
#ifdef Point_Psi4
|
||||
#error "not support parallel levels yet"
|
||||
if (LastAnas >= AnasTime)
|
||||
{
|
||||
#ifdef Point_Psi4
|
||||
#error "not support parallel levels yet"
|
||||
// Gam_ijk and R_ij have been calculated in Interp_Constraint()
|
||||
double SYM = 1, ANT = -1;
|
||||
for (int levh = lev; levh < GH->levels; levh++)
|
||||
@@ -7255,9 +7260,9 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
|
||||
|
||||
//================================================================================================
|
||||
|
||||
void bssn_class::Constraint_Out()
|
||||
{
|
||||
LastConsOut += dT * pow(0.5, Mymax(0, trfls));
|
||||
void bssn_class::Constraint_Out()
|
||||
{
|
||||
LastConsOut += dT * pow(0.5, Mymax(0, trfls));
|
||||
|
||||
if (LastConsOut >= AnasTime)
|
||||
// Constraint violation
|
||||
@@ -7322,7 +7327,7 @@ void bssn_class::Constraint_Out()
|
||||
Pp = Pp->next;
|
||||
}
|
||||
}
|
||||
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
|
||||
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Constraint_Out[level]");
|
||||
}
|
||||
#ifdef WithShell
|
||||
if (0) // if the constrait quantities can be reused from the step rhs calculation
|
||||
@@ -7544,7 +7549,7 @@ void bssn_class::AH_Prepare_derivatives()
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
Parallel::Sync(GH->PatL[lev], AHDList, Symmetry);
|
||||
Parallel::Sync(GH->PatL[lev], AHDList, Symmetry, "bssn_class::AH_Prepare_derivatives");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7825,7 +7830,7 @@ void bssn_class::Interp_Constraint(bool infg)
|
||||
Pp = Pp->next;
|
||||
}
|
||||
}
|
||||
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
|
||||
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Interp_Constraint[level]");
|
||||
}
|
||||
#ifdef WithShell
|
||||
if (0) // if the constrait quantities can be reused from the step rhs calculation
|
||||
@@ -8083,7 +8088,7 @@ void bssn_class::Compute_Constraint()
|
||||
Pp = Pp->next;
|
||||
}
|
||||
}
|
||||
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
|
||||
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Compute_Constraint[level]");
|
||||
}
|
||||
// prolong restrict constraint quantities
|
||||
for (lev = GH->levels - 1; lev > 0; lev--)
|
||||
@@ -8396,12 +8401,18 @@ void bssn_class::Enforce_algcon(int lev, int fg)
|
||||
|
||||
//================================================================================================
|
||||
|
||||
bool bssn_class::check_Stdin_Abort()
|
||||
{
|
||||
|
||||
fd_set readfds;
|
||||
|
||||
struct timeval timeout;
|
||||
bool bssn_class::check_Stdin_Abort()
|
||||
{
|
||||
// Non-interactive launches (mpirun via Python/subprocess, batch jobs, redirected stdin)
|
||||
// should not probe stdin. Some MPI runtimes treat stdin as a managed channel and can
|
||||
// fail when rank 0 polls/consumes it.
|
||||
if (!isatty(STDIN_FILENO)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
fd_set readfds;
|
||||
|
||||
struct timeval timeout;
|
||||
|
||||
FD_ZERO(&readfds);
|
||||
FD_SET(STDIN_FILENO, &readfds);
|
||||
@@ -8410,14 +8421,17 @@ bool bssn_class::check_Stdin_Abort()
|
||||
timeout.tv_sec = 0;
|
||||
timeout.tv_usec = 0;
|
||||
|
||||
int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout);
|
||||
|
||||
if (activity > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
|
||||
string input_abort;
|
||||
if (cin >> input_abort) {
|
||||
if (input_abort == "stop") {
|
||||
return true;
|
||||
}
|
||||
int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout);
|
||||
if (activity <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (FD_ISSET(STDIN_FILENO, &readfds)) {
|
||||
string input_abort;
|
||||
if (cin >> input_abort) {
|
||||
if (input_abort == "stop") {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "bssn_cuda_ops.h"
|
||||
#include "bssn_gpu.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
@@ -79,6 +80,29 @@ inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes)
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *src, size_t bytes)
|
||||
{
|
||||
if (!ensure_capacity(dst, bytes))
|
||||
return false;
|
||||
|
||||
const double *device_src = bssn_gpu_find_device_buffer(src);
|
||||
cudaMemcpyKind kind = cudaMemcpyHostToDevice;
|
||||
const void *copy_src = src;
|
||||
if (device_src)
|
||||
{
|
||||
copy_src = device_src;
|
||||
kind = cudaMemcpyDeviceToDevice;
|
||||
}
|
||||
|
||||
cudaError_t err = cudaMemcpy(dst.ptr, copy_src, bytes, kind);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error(kind == cudaMemcpyDeviceToDevice ? "cudaMemcpy(D2D)" : "cudaMemcpy(H2D)", err);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
__global__ void enforce_ga_kernel(int n,
|
||||
double *dxx, double *gxy, double *gxz,
|
||||
double *dyy, double *gyz, double *dzz,
|
||||
@@ -374,7 +398,7 @@ __global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
|
||||
}
|
||||
}
|
||||
|
||||
inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **args)
|
||||
inline bool launch_kernel(dim3 grid, dim3 block, const void *kernel, void **args)
|
||||
{
|
||||
cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
|
||||
if (err != cudaSuccess)
|
||||
@@ -382,10 +406,10 @@ inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **ar
|
||||
report_cuda_error("cudaLaunchKernel", err);
|
||||
return false;
|
||||
}
|
||||
err = cudaDeviceSynchronize();
|
||||
err = cudaPeekAtLastError();
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaDeviceSynchronize", err);
|
||||
report_cuda_error("cudaPeekAtLastError", err);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@@ -411,18 +435,18 @@ int bssn_cuda_enforce_ga(int *ex,
|
||||
dim3 block(256);
|
||||
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
||||
|
||||
bool ok = copy_to_device(cache.dxx, dxx, bytes) &&
|
||||
copy_to_device(cache.gxy, gxy, bytes) &&
|
||||
copy_to_device(cache.gxz, gxz, bytes) &&
|
||||
copy_to_device(cache.dyy, dyy, bytes) &&
|
||||
copy_to_device(cache.gyz, gyz, bytes) &&
|
||||
copy_to_device(cache.dzz, dzz, bytes) &&
|
||||
copy_to_device(cache.Axx, Axx, bytes) &&
|
||||
copy_to_device(cache.Axy, Axy, bytes) &&
|
||||
copy_to_device(cache.Axz, Axz, bytes) &&
|
||||
copy_to_device(cache.Ayy, Ayy, bytes) &&
|
||||
copy_to_device(cache.Ayz, Ayz, bytes) &&
|
||||
copy_to_device(cache.Azz, Azz, bytes);
|
||||
bool ok = copy_to_device_preferring_device(cache.dxx, dxx, bytes) &&
|
||||
copy_to_device_preferring_device(cache.gxy, gxy, bytes) &&
|
||||
copy_to_device_preferring_device(cache.gxz, gxz, bytes) &&
|
||||
copy_to_device_preferring_device(cache.dyy, dyy, bytes) &&
|
||||
copy_to_device_preferring_device(cache.gyz, gyz, bytes) &&
|
||||
copy_to_device_preferring_device(cache.dzz, dzz, bytes) &&
|
||||
copy_to_device_preferring_device(cache.Axx, Axx, bytes) &&
|
||||
copy_to_device_preferring_device(cache.Axy, Axy, bytes) &&
|
||||
copy_to_device_preferring_device(cache.Axz, Axz, bytes) &&
|
||||
copy_to_device_preferring_device(cache.Ayy, Ayy, bytes) &&
|
||||
copy_to_device_preferring_device(cache.Ayz, Ayz, bytes) &&
|
||||
copy_to_device_preferring_device(cache.Azz, Azz, bytes);
|
||||
|
||||
if (ok)
|
||||
{
|
||||
@@ -432,7 +456,7 @@ int bssn_cuda_enforce_ga(int *ex,
|
||||
double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr;
|
||||
void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
|
||||
&d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
|
||||
ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args);
|
||||
ok = launch_kernel(grid, block, (const void *)enforce_ga_kernel, args);
|
||||
}
|
||||
|
||||
if (ok)
|
||||
@@ -527,10 +551,10 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
(rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
|
||||
|
||||
ok = ok &&
|
||||
(!refresh_state0 || copy_to_device(cache.state0, state0, bytes)) &&
|
||||
(!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
|
||||
(!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
|
||||
(!need_stage_input || copy_to_device(cache.stage, stage_data, bytes)) &&
|
||||
(!refresh_rhs || copy_to_device(cache.rhs, rhs_accum, bytes));
|
||||
(!need_stage_input || copy_to_device_preferring_device(cache.stage, stage_data, bytes)) &&
|
||||
(!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
|
||||
|
||||
if (ok && !need_stage_input)
|
||||
ok = ensure_capacity(cache.stage, bytes);
|
||||
@@ -539,11 +563,15 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
return 1;
|
||||
|
||||
if (refresh_state0)
|
||||
{
|
||||
cache.host_state0 = state0;
|
||||
bssn_gpu_register_device_buffer(state0, cache.state0.ptr);
|
||||
}
|
||||
if (refresh_rhs)
|
||||
{
|
||||
cache.host_rhs = rhs_accum;
|
||||
cache.rhs_resident = true;
|
||||
bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr);
|
||||
}
|
||||
|
||||
double dX = X[1] - X[0];
|
||||
@@ -582,14 +610,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
&bam_source, &bam_target,
|
||||
&soa0, &soa1, &soa2,
|
||||
&symmetry};
|
||||
ok = launch_and_sync(grid, block, (const void *)sommerfeld_bam_kernel, args);
|
||||
ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
|
||||
}
|
||||
|
||||
if (ok)
|
||||
{
|
||||
double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr;
|
||||
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
|
||||
ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args);
|
||||
ok = launch_kernel(grid, block, (const void *)rk4_kernel, args);
|
||||
}
|
||||
|
||||
if (ok && lev > 0)
|
||||
@@ -599,11 +627,13 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
&has_xmin, &has_ymin, &has_zmin,
|
||||
&has_xmax, &has_ymax, &has_zmax,
|
||||
&d_state0, &d_stage};
|
||||
ok = launch_and_sync(grid, block, (const void *)copy_physical_boundary_kernel, args);
|
||||
ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
|
||||
}
|
||||
|
||||
if (ok)
|
||||
{
|
||||
bssn_gpu_register_device_buffer(stage_data, cache.stage.ptr);
|
||||
|
||||
cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
||||
ok = err == cudaSuccess;
|
||||
@@ -621,13 +651,13 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
|
||||
dim3 block(256);
|
||||
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
||||
|
||||
bool ok = copy_to_device(d_chi, chi, bytes);
|
||||
bool ok = copy_to_device_preferring_device(d_chi, chi, bytes);
|
||||
|
||||
if (ok)
|
||||
{
|
||||
double *ptr = d_chi.ptr;
|
||||
void *args[] = {&n, &ptr, &tinny};
|
||||
ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args);
|
||||
ok = launch_kernel(grid, block, (const void *)lowerbound_kernel, args);
|
||||
}
|
||||
|
||||
if (ok)
|
||||
|
||||
@@ -17,6 +17,13 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
#error "Step_MainPath_GPU currently supports Patch grids only."
|
||||
#endif
|
||||
|
||||
if (bssn_gpu_bind_process_device(myrank))
|
||||
{
|
||||
cerr << "GPU device bind failure on MPI rank " << myrank << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
bssn_gpu_clear_cached_device_buffers();
|
||||
|
||||
setpbh(BH_num, Porg0, Mass, BH_num_input);
|
||||
|
||||
const double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
||||
@@ -150,6 +157,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
Parallel::AsyncSyncState async_pre;
|
||||
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
|
||||
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
|
||||
bssn_gpu_clear_cached_device_buffers();
|
||||
|
||||
MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
|
||||
if (ERROR)
|
||||
@@ -265,6 +273,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
Parallel::AsyncSyncState async_cor;
|
||||
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
|
||||
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
|
||||
bssn_gpu_clear_cached_device_buffers();
|
||||
|
||||
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
|
||||
if (ERROR)
|
||||
@@ -334,6 +343,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
RestrictProlong(lev, YN, BB);
|
||||
#endif
|
||||
|
||||
bssn_gpu_clear_cached_device_buffers();
|
||||
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
|
||||
@@ -135,6 +135,18 @@ struct GpuRhsCache
|
||||
const double *last_y = nullptr;
|
||||
const double *last_z = nullptr;
|
||||
bool meta_uploaded = false;
|
||||
static const int max_mapped_buffers = 128;
|
||||
const double *host_buffers[max_mapped_buffers] = {nullptr};
|
||||
const double *device_buffers[max_mapped_buffers] = {nullptr};
|
||||
int mapped_buffer_count = 0;
|
||||
};
|
||||
|
||||
struct ExternalBufferRegistry
|
||||
{
|
||||
static const int max_mapped_buffers = 256;
|
||||
const double *host_buffers[max_mapped_buffers] = {nullptr};
|
||||
const double *device_buffers[max_mapped_buffers] = {nullptr};
|
||||
int mapped_buffer_count = 0;
|
||||
};
|
||||
|
||||
GpuRhsCache &gpu_rhs_cache()
|
||||
@@ -143,11 +155,81 @@ GpuRhsCache &gpu_rhs_cache()
|
||||
return cache;
|
||||
}
|
||||
|
||||
ExternalBufferRegistry &external_buffer_registry()
|
||||
{
|
||||
static thread_local ExternalBufferRegistry registry;
|
||||
return registry;
|
||||
}
|
||||
|
||||
void reset_meta(Meta *meta)
|
||||
{
|
||||
memset(meta, 0, sizeof(Meta));
|
||||
}
|
||||
|
||||
void reset_buffer_map(GpuRhsCache &cache)
|
||||
{
|
||||
cache.mapped_buffer_count = 0;
|
||||
for (int i = 0; i < GpuRhsCache::max_mapped_buffers; ++i)
|
||||
{
|
||||
cache.host_buffers[i] = nullptr;
|
||||
cache.device_buffers[i] = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void map_buffer(GpuRhsCache &cache, const double *host_ptr, const double *device_ptr)
|
||||
{
|
||||
if (!host_ptr || !device_ptr)
|
||||
return;
|
||||
|
||||
for (int i = 0; i < cache.mapped_buffer_count; ++i)
|
||||
{
|
||||
if (cache.host_buffers[i] == host_ptr)
|
||||
{
|
||||
cache.device_buffers[i] = device_ptr;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (cache.mapped_buffer_count >= GpuRhsCache::max_mapped_buffers)
|
||||
return;
|
||||
|
||||
cache.host_buffers[cache.mapped_buffer_count] = host_ptr;
|
||||
cache.device_buffers[cache.mapped_buffer_count] = device_ptr;
|
||||
cache.mapped_buffer_count++;
|
||||
}
|
||||
|
||||
void reset_external_buffer_map(ExternalBufferRegistry ®istry)
|
||||
{
|
||||
registry.mapped_buffer_count = 0;
|
||||
for (int i = 0; i < ExternalBufferRegistry::max_mapped_buffers; ++i)
|
||||
{
|
||||
registry.host_buffers[i] = nullptr;
|
||||
registry.device_buffers[i] = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void map_external_buffer(ExternalBufferRegistry ®istry, const double *host_ptr, const double *device_ptr)
|
||||
{
|
||||
if (!host_ptr || !device_ptr)
|
||||
return;
|
||||
|
||||
for (int i = 0; i < registry.mapped_buffer_count; ++i)
|
||||
{
|
||||
if (registry.host_buffers[i] == host_ptr)
|
||||
{
|
||||
registry.device_buffers[i] = device_ptr;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (registry.mapped_buffer_count >= ExternalBufferRegistry::max_mapped_buffers)
|
||||
return;
|
||||
|
||||
registry.host_buffers[registry.mapped_buffer_count] = host_ptr;
|
||||
registry.device_buffers[registry.mapped_buffer_count] = device_ptr;
|
||||
registry.mapped_buffer_count++;
|
||||
}
|
||||
|
||||
bool ensure_device_buffer(double **ptr, size_t count)
|
||||
{
|
||||
if (*ptr)
|
||||
@@ -188,6 +270,45 @@ bool copy_buffers_to_device(const CopySpec *specs, size_t count)
|
||||
return true;
|
||||
}
|
||||
|
||||
const double *find_external_device_buffer(const ExternalBufferRegistry ®istry, const double *host_ptr)
|
||||
{
|
||||
if (!host_ptr)
|
||||
return nullptr;
|
||||
|
||||
for (int i = 0; i < registry.mapped_buffer_count; ++i)
|
||||
{
|
||||
if (registry.host_buffers[i] == host_ptr)
|
||||
return registry.device_buffers[i];
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
bool copy_buffers_to_device_preferring_device(const CopySpec *specs, size_t count)
|
||||
{
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
{
|
||||
const double *device_src = bssn_gpu_find_device_buffer(specs[i].src);
|
||||
cudaMemcpyKind kind = cudaMemcpyHostToDevice;
|
||||
const void *copy_src = specs[i].src;
|
||||
if (device_src)
|
||||
{
|
||||
copy_src = device_src;
|
||||
kind = cudaMemcpyDeviceToDevice;
|
||||
}
|
||||
|
||||
cudaError_t err = cudaMemcpy(specs[i].dst, copy_src,
|
||||
specs[i].count * sizeof(double),
|
||||
kind);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
cerr << "cudaMemcpy(" << (kind == cudaMemcpyDeviceToDevice ? "D2D" : "H2D")
|
||||
<< ") failed: " << cudaGetErrorString(err) << endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool zero_buffers(const ZeroSpec *specs, size_t count)
|
||||
{
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
@@ -219,6 +340,8 @@ void cleanup_gpu_rhs_cache()
|
||||
cache.last_x = nullptr;
|
||||
cache.last_y = nullptr;
|
||||
cache.last_z = nullptr;
|
||||
reset_buffer_map(cache);
|
||||
reset_external_buffer_map(external_buffer_registry());
|
||||
}
|
||||
|
||||
bool register_gpu_rhs_cleanup()
|
||||
@@ -285,6 +408,7 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
|
||||
cache.last_y = nullptr;
|
||||
cache.last_z = nullptr;
|
||||
cache.meta_uploaded = false;
|
||||
reset_buffer_map(cache);
|
||||
|
||||
Meta *meta = &cache.meta;
|
||||
const int matrix_size = cache.matrix_size;
|
||||
@@ -467,6 +591,7 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
|
||||
cache.last_x = nullptr;
|
||||
cache.last_y = nullptr;
|
||||
cache.last_z = nullptr;
|
||||
reset_buffer_map(cache);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -491,8 +616,56 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
|
||||
return true;
|
||||
}
|
||||
|
||||
const double *find_mapped_device_buffer(const GpuRhsCache &cache, const double *host_ptr)
|
||||
{
|
||||
if (!host_ptr)
|
||||
return nullptr;
|
||||
|
||||
for (int i = 0; i < cache.mapped_buffer_count; ++i)
|
||||
{
|
||||
if (cache.host_buffers[i] == host_ptr)
|
||||
return cache.device_buffers[i];
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
const double *bssn_gpu_find_device_buffer(const double *host_ptr)
|
||||
{
|
||||
const double *device_ptr = find_external_device_buffer(external_buffer_registry(), host_ptr);
|
||||
if (device_ptr)
|
||||
return device_ptr;
|
||||
return find_mapped_device_buffer(gpu_rhs_cache(), host_ptr);
|
||||
}
|
||||
|
||||
int bssn_gpu_bind_process_device(int mpi_rank)
|
||||
{
|
||||
const int device = select_cuda_device_for_process(mpi_rank);
|
||||
if (device < 0)
|
||||
return 1;
|
||||
|
||||
cudaError_t err = cudaSetDevice(device);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
cerr << "cudaSetDevice(" << device << ") failed: "
|
||||
<< cudaGetErrorString(err) << endl;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bssn_gpu_clear_cached_device_buffers()
|
||||
{
|
||||
reset_external_buffer_map(external_buffer_registry());
|
||||
reset_buffer_map(gpu_rhs_cache());
|
||||
}
|
||||
|
||||
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr)
|
||||
{
|
||||
map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
|
||||
}
|
||||
|
||||
__global__ void test_const_address(double * testd){
|
||||
int _t = blockIdx.x*blockDim.x+threadIdx.x;
|
||||
if(_t == 0)
|
||||
@@ -739,16 +912,12 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub
|
||||
}
|
||||
#endif //ifdef Cell
|
||||
#endif //ifdef Vertex
|
||||
inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
|
||||
sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
|
||||
cudaThreadSynchronize();
|
||||
sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
|
||||
cudaThreadSynchronize();
|
||||
sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
|
||||
cudaThreadSynchronize();
|
||||
sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
|
||||
sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
|
||||
sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
|
||||
sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
|
||||
sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
|
||||
}
|
||||
|
||||
|
||||
__global__ void sub_fdderivs_part1(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz)
|
||||
@@ -837,19 +1006,17 @@ __global__ void sub_fdderivs_part1(double * f,double *fh,double *fxx,double *fxy
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz,double* SoA)
|
||||
{
|
||||
sub_symmetry_bd(2,f,fh,SoA);
|
||||
cudaMemset(fxx,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fxy,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fxz,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaThreadSynchronize();
|
||||
sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz,double* SoA)
|
||||
{
|
||||
sub_symmetry_bd(2,f,fh,SoA);
|
||||
cudaMemset(fxx,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fxy,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fxz,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
|
||||
sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
|
||||
}
|
||||
|
||||
__global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz )
|
||||
{
|
||||
@@ -905,18 +1072,16 @@ __global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,d
|
||||
}
|
||||
}
|
||||
|
||||
inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,double * SoA)
|
||||
{
|
||||
sub_symmetry_bd(2,f,fh,SoA);
|
||||
|
||||
cudaMemset(fx,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
|
||||
|
||||
cudaThreadSynchronize();
|
||||
sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,double * SoA)
|
||||
{
|
||||
sub_symmetry_bd(2,f,fh,SoA);
|
||||
|
||||
cudaMemset(fx,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
|
||||
cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
|
||||
|
||||
sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
|
||||
}
|
||||
|
||||
__global__ void computeRicci_part1(double * dst)
|
||||
{
|
||||
@@ -930,14 +1095,12 @@ __global__ void computeRicci_part1(double * dst)
|
||||
}
|
||||
}
|
||||
|
||||
inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
|
||||
{
|
||||
sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
|
||||
cudaThreadSynchronize();
|
||||
computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
|
||||
cudaThreadSynchronize();
|
||||
|
||||
}/*Exception*/
|
||||
inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
|
||||
{
|
||||
sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
|
||||
computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
|
||||
|
||||
}/*Exception*/
|
||||
|
||||
__global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
|
||||
{
|
||||
@@ -989,13 +1152,11 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
|
||||
}
|
||||
}
|
||||
|
||||
inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
|
||||
{
|
||||
sub_symmetry_bd(3,f,fh,SoA);
|
||||
cudaThreadSynchronize();
|
||||
sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
|
||||
{
|
||||
sub_symmetry_bd(3,f,fh,SoA);
|
||||
sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
|
||||
}
|
||||
|
||||
__global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz)
|
||||
{
|
||||
@@ -1083,12 +1244,10 @@ __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S
|
||||
}
|
||||
|
||||
|
||||
inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
|
||||
sub_symmetry_bd(3,f,fh,SoA);
|
||||
cudaThreadSynchronize();
|
||||
sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
|
||||
sub_symmetry_bd(3,f,fh,SoA);
|
||||
sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
|
||||
}
|
||||
|
||||
__global__ void compute_rhs_bssn_part1()
|
||||
{
|
||||
@@ -2866,6 +3025,8 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
cache.last_z = Z;
|
||||
}
|
||||
|
||||
reset_buffer_map(cache);
|
||||
|
||||
const CopySpec state_copies[] = {
|
||||
{Mh_ chi, chi, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dxx, dxx, static_cast<size_t>(matrix_size)},
|
||||
@@ -2892,7 +3053,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
{Mh_ dtSfy, dtSfy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dtSfz, dtSfz, static_cast<size_t>(matrix_size)},
|
||||
};
|
||||
if (!copy_buffers_to_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0])))
|
||||
if (!copy_buffers_to_device_preferring_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0])))
|
||||
return 1;
|
||||
|
||||
const ZeroSpec zero_specs[] = {
|
||||
@@ -2977,9 +3138,57 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
};
|
||||
if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0])))
|
||||
return 1;
|
||||
|
||||
|
||||
double sss[3] = {1,1,1};
|
||||
map_buffer(cache, chi, Mh_ chi);
|
||||
map_buffer(cache, trK, Mh_ trK);
|
||||
map_buffer(cache, dxx, Mh_ dxx);
|
||||
map_buffer(cache, gxy, Mh_ gxy);
|
||||
map_buffer(cache, gxz, Mh_ gxz);
|
||||
map_buffer(cache, dyy, Mh_ dyy);
|
||||
map_buffer(cache, gyz, Mh_ gyz);
|
||||
map_buffer(cache, dzz, Mh_ dzz);
|
||||
map_buffer(cache, Axx, Mh_ Axx);
|
||||
map_buffer(cache, Axy, Mh_ Axy);
|
||||
map_buffer(cache, Axz, Mh_ Axz);
|
||||
map_buffer(cache, Ayy, Mh_ Ayy);
|
||||
map_buffer(cache, Ayz, Mh_ Ayz);
|
||||
map_buffer(cache, Azz, Mh_ Azz);
|
||||
map_buffer(cache, Gamx, Mh_ Gamx);
|
||||
map_buffer(cache, Gamy, Mh_ Gamy);
|
||||
map_buffer(cache, Gamz, Mh_ Gamz);
|
||||
map_buffer(cache, Lap, Mh_ Lap);
|
||||
map_buffer(cache, betax, Mh_ betax);
|
||||
map_buffer(cache, betay, Mh_ betay);
|
||||
map_buffer(cache, betaz, Mh_ betaz);
|
||||
map_buffer(cache, dtSfx, Mh_ dtSfx);
|
||||
map_buffer(cache, dtSfy, Mh_ dtSfy);
|
||||
map_buffer(cache, dtSfz, Mh_ dtSfz);
|
||||
map_buffer(cache, chi_rhs, Mh_ chi_rhs);
|
||||
map_buffer(cache, trK_rhs, Mh_ trK_rhs);
|
||||
map_buffer(cache, gxx_rhs, Mh_ gxx_rhs);
|
||||
map_buffer(cache, gxy_rhs, Mh_ gxy_rhs);
|
||||
map_buffer(cache, gxz_rhs, Mh_ gxz_rhs);
|
||||
map_buffer(cache, gyy_rhs, Mh_ gyy_rhs);
|
||||
map_buffer(cache, gyz_rhs, Mh_ gyz_rhs);
|
||||
map_buffer(cache, gzz_rhs, Mh_ gzz_rhs);
|
||||
map_buffer(cache, Axx_rhs, Mh_ Axx_rhs);
|
||||
map_buffer(cache, Axy_rhs, Mh_ Axy_rhs);
|
||||
map_buffer(cache, Axz_rhs, Mh_ Axz_rhs);
|
||||
map_buffer(cache, Ayy_rhs, Mh_ Ayy_rhs);
|
||||
map_buffer(cache, Ayz_rhs, Mh_ Ayz_rhs);
|
||||
map_buffer(cache, Azz_rhs, Mh_ Azz_rhs);
|
||||
map_buffer(cache, Gamx_rhs, Mh_ Gamx_rhs);
|
||||
map_buffer(cache, Gamy_rhs, Mh_ Gamy_rhs);
|
||||
map_buffer(cache, Gamz_rhs, Mh_ Gamz_rhs);
|
||||
map_buffer(cache, Lap_rhs, Mh_ Lap_rhs);
|
||||
map_buffer(cache, betax_rhs, Mh_ betax_rhs);
|
||||
map_buffer(cache, betay_rhs, Mh_ betay_rhs);
|
||||
map_buffer(cache, betaz_rhs, Mh_ betaz_rhs);
|
||||
map_buffer(cache, dtSfx_rhs, Mh_ dtSfx_rhs);
|
||||
map_buffer(cache, dtSfy_rhs, Mh_ dtSfy_rhs);
|
||||
map_buffer(cache, dtSfz_rhs, Mh_ dtSfz_rhs);
|
||||
|
||||
|
||||
double sss[3] = {1,1,1};
|
||||
double aas[3] = {-1,-1,1};
|
||||
double asa[3] = {-1,1,-1};
|
||||
double saa[3] = {1,-1,-1};
|
||||
@@ -3116,9 +3325,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
#endif
|
||||
//cout<<"GPU meta data ready.\n";
|
||||
|
||||
cudaThreadSynchronize();
|
||||
|
||||
//--------------test constant memory address & value--------------
|
||||
//--------------test constant memory address & value--------------
|
||||
/* double rank = mpi_rank;
|
||||
cudaMemcpyToSymbol(F1o3,&rank, sizeof(double));
|
||||
double ctest1 = -1;
|
||||
@@ -3137,11 +3344,10 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
//#4-----------------------calculate------------------------------
|
||||
//4.0------enforce_ga---------
|
||||
//sub_enforce_ga(matrix_size);
|
||||
//4.1-----compute rhs---------
|
||||
compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
cudaThreadSynchronize();
|
||||
|
||||
sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
|
||||
//4.1-----compute rhs---------
|
||||
compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
|
||||
sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
|
||||
sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
|
||||
sub_fderivs(Mh_ betaz,Mh_ fh,Mh_ betazx,Mh_ betazy,Mh_ betazz,ssa);
|
||||
sub_fderivs(Mh_ chi,Mh_ fh,Mh_ chix,Mh_ chiy,Mh_ chiz, sss);
|
||||
@@ -3153,44 +3359,37 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
sub_fderivs(Mh_ gxy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz, aas);
|
||||
sub_fderivs(Mh_ gxz,Mh_ fh,Mh_ gxzx,Mh_ gxzy,Mh_ gxzz, asa);
|
||||
sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
|
||||
|
||||
compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
cudaThreadSynchronize();
|
||||
|
||||
sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
|
||||
|
||||
compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
|
||||
sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
|
||||
sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
|
||||
sub_fdderivs(Mh_ betaz,Mh_ fh,Mh_ gxxz,Mh_ gxyz,Mh_ gxzz,Mh_ gyyz,Mh_ gyzz,Mh_ gzzz,ssa);
|
||||
sub_fderivs( Mh_ Gamx, Mh_ fh,Mh_ Gamxx, Mh_ Gamxy, Mh_ Gamxz,ass);
|
||||
sub_fderivs( Mh_ Gamy, Mh_ fh,Mh_ Gamyx, Mh_ Gamyy, Mh_ Gamyz,sas);
|
||||
sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
|
||||
|
||||
compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
cudaThreadSynchronize();
|
||||
|
||||
computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
|
||||
|
||||
compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
|
||||
computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
|
||||
computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta);
|
||||
computeRicci(Mh_ dzz,Mh_ Rzz,sss, meta);
|
||||
computeRicci(Mh_ gxy,Mh_ Rxy,aas, meta);
|
||||
computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta);
|
||||
computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta);
|
||||
|
||||
cudaThreadSynchronize();
|
||||
|
||||
compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
cudaThreadSynchronize();
|
||||
|
||||
sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
|
||||
|
||||
compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
cudaThreadSynchronize();
|
||||
|
||||
sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
|
||||
|
||||
compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
cudaThreadSynchronize();
|
||||
|
||||
#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
|
||||
sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
|
||||
compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
|
||||
sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
|
||||
|
||||
compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
|
||||
sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
|
||||
|
||||
compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
|
||||
#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
|
||||
sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
|
||||
compute_rhs_bssn_part6_gauge<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
#endif
|
||||
|
||||
@@ -3257,19 +3456,17 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
|
||||
}
|
||||
|
||||
if(co == 0){
|
||||
compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
cudaThreadSynchronize();
|
||||
|
||||
sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
|
||||
if(co == 0){
|
||||
compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
|
||||
sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
|
||||
sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
|
||||
sub_fderivs(Mh_ Axz,Mh_ fh,Mh_ gxzx,Mh_ gxzy,Mh_ gxzz,asa);
|
||||
sub_fderivs(Mh_ Ayy,Mh_ fh,Mh_ gyyx,Mh_ gyyy,Mh_ gyyz,sss);
|
||||
sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
|
||||
sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
|
||||
compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
|
||||
sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
|
||||
compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
|
||||
}
|
||||
|
||||
#if (ABV == 1)
|
||||
cout<<"TODO: bssn_gpu.cu::2373 (ABV == 1)"<<endl;
|
||||
@@ -3283,33 +3480,11 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
cudaMemcpy(betax, Mh_ betax, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(betay, Mh_ betay, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(betaz, Mh_ betaz, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);*/
|
||||
if(calledby == CALLED_BY_STEP)
|
||||
{
|
||||
cudaMemcpy(chi_rhs, Mh_ chi_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(trK_rhs, Mh_ trK_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(gxx_rhs, Mh_ gxx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(gxy_rhs, Mh_ gxy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(gxz_rhs, Mh_ gxz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(gyy_rhs, Mh_ gyy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(gyz_rhs, Mh_ gyz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(gzz_rhs, Mh_ gzz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(Axx_rhs, Mh_ Axx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(Axy_rhs, Mh_ Axy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(Axz_rhs, Mh_ Axz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(Ayy_rhs, Mh_ Ayy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(Ayz_rhs, Mh_ Ayz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(Azz_rhs, Mh_ Azz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(Gamx_rhs, Mh_ Gamx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(Gamy_rhs, Mh_ Gamy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(Gamz_rhs, Mh_ Gamz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(Lap_rhs, Mh_ Lap_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(betax_rhs, Mh_ betax_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(betay_rhs, Mh_ betay_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(betaz_rhs, Mh_ betaz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(dtSfx_rhs, Mh_ dtSfx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(dtSfy_rhs, Mh_ dtSfy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
cudaMemcpy(dtSfz_rhs, Mh_ dtSfz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
}
|
||||
if(calledby == CALLED_BY_STEP)
|
||||
{
|
||||
// Main GPU evolution path now consumes RHS directly from device caches.
|
||||
// Avoiding this bulk D2H copy removes one full round-trip per RK stage.
|
||||
}
|
||||
else if(calledby == CALLED_BY_CONSTRAINT)
|
||||
{
|
||||
cudaMemcpy(Gamxxx, Mh_ Gamxxx, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
|
||||
|
||||
@@ -63,9 +63,14 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,
|
||||
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
|
||||
int &Symmetry, int &Lev, double &eps, int &co);
|
||||
|
||||
int gpu_rhs_ss(RHS_SS_PARA);
|
||||
|
||||
/** Init GPU side data in GPUMeta. */
|
||||
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
|
||||
int gpu_rhs_ss(RHS_SS_PARA);
|
||||
|
||||
int bssn_gpu_bind_process_device(int mpi_rank);
|
||||
void bssn_gpu_clear_cached_device_buffers();
|
||||
const double *bssn_gpu_find_device_buffer(const double *host_ptr);
|
||||
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
|
||||
|
||||
/** Init GPU side data in GPUMeta. */
|
||||
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -30,8 +30,8 @@ CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
||||
endif
|
||||
|
||||
.SUFFIXES: .o .f90 .C .for .cu
|
||||
|
||||
.SUFFIXES: .o .f90 .C .for .cu
|
||||
|
||||
.f90.o:
|
||||
$(f90) $(f90appflags) -c $< -o $@
|
||||
|
||||
Reference in New Issue
Block a user