Stabilize GPU output path and MPI sync

This commit is contained in:
2026-04-09 10:57:49 +08:00
parent 4e3946a4f0
commit 49409645c0
8 changed files with 748 additions and 334 deletions

View File

@@ -1,9 +1,115 @@
#include "Parallel.h"
#include "fmisc.h"
#include "prolongrestrict.h"
#include "misc.h"
#include "parameters.h"
#include "fmisc.h"
#include "prolongrestrict.h"
#include "misc.h"
#include "parameters.h"
#include <cstring>
namespace {
const char *g_parallel_transfer_context = "Parallel::transfer";
int parallel_next_transfer_tag()
{
const char *ctx = g_parallel_transfer_context ? g_parallel_transfer_context : "Parallel::transfer";
unsigned int hash = 5381u;
while (*ctx)
{
hash = ((hash << 5) + hash) + static_cast<unsigned char>(*ctx);
ctx++;
}
return 32 + static_cast<int>(hash % 32000u);
}
struct ParallelTransferContextGuard
{
const char *prev;
explicit ParallelTransferContextGuard(const char *context) : prev(g_parallel_transfer_context)
{
g_parallel_transfer_context = context;
}
~ParallelTransferContextGuard()
{
g_parallel_transfer_context = prev;
}
};
void parallel_report_mpi_error(const char *context, int errcode, int req_no)
{
char errstr[MPI_MAX_ERROR_STRING];
int len = 0;
MPI_Error_string(errcode, errstr, &len);
int myrank = -1;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
fprintf(stderr, "MPI failure on rank %d in %s (transfer_ctx=%s, req_no=%d): %.*s\n",
myrank, context, g_parallel_transfer_context, req_no, len, errstr);
fflush(stderr);
}
void parallel_report_mpi_status_errors(const char *context, int req_no,
int status_count, MPI_Status *stats)
{
if (!stats)
return;
int myrank = -1;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
const int count = (status_count >= 0 && status_count <= req_no) ? status_count : req_no;
for (int i = 0; i < count; i++)
{
if (stats[i].MPI_ERROR != MPI_SUCCESS)
{
char errstr[MPI_MAX_ERROR_STRING];
int len = 0;
MPI_Error_string(stats[i].MPI_ERROR, errstr, &len);
fprintf(stderr,
"MPI request failure on rank %d in %s (transfer_ctx=%s, status_idx=%d, source=%d, tag=%d): %.*s\n",
myrank, context, g_parallel_transfer_context, i,
stats[i].MPI_SOURCE, stats[i].MPI_TAG, len, errstr);
}
}
fflush(stderr);
}
int parallel_waitsome_checked(int req_no, MPI_Request *reqs, int *outcount,
int *completed, MPI_Status *stats,
const char *context)
{
MPI_Errhandler old_handler;
MPI_Comm_get_errhandler(MPI_COMM_WORLD, &old_handler);
MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
int rc = MPI_Waitsome(req_no, reqs, outcount, completed, stats);
MPI_Comm_set_errhandler(MPI_COMM_WORLD, old_handler);
MPI_Errhandler_free(&old_handler);
if (rc != MPI_SUCCESS)
{
parallel_report_mpi_status_errors(context, req_no, outcount ? *outcount : req_no, stats);
parallel_report_mpi_error(context, rc, req_no);
MPI_Abort(MPI_COMM_WORLD, 1);
}
return rc;
}
int parallel_waitall_checked(int req_no, MPI_Request *reqs, MPI_Status *stats,
const char *context)
{
MPI_Errhandler old_handler;
MPI_Comm_get_errhandler(MPI_COMM_WORLD, &old_handler);
MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
int rc = MPI_Waitall(req_no, reqs, stats);
MPI_Comm_set_errhandler(MPI_COMM_WORLD, old_handler);
MPI_Errhandler_free(&old_handler);
if (rc != MPI_SUCCESS)
{
parallel_report_mpi_status_errors(context, req_no, req_no, stats);
parallel_report_mpi_error(context, rc, req_no);
MPI_Abort(MPI_COMM_WORLD, 1);
}
return rc;
}
} // namespace
int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion
{
@@ -3896,10 +4002,11 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
MPI_Request *reqs = new MPI_Request[2 * cpusize];
MPI_Status *stats = new MPI_Status[2 * cpusize];
int *req_node = new int[2 * cpusize];
int *req_is_recv = new int[2 * cpusize];
int *completed = new int[2 * cpusize];
int req_no = 0;
int pending_recv = 0;
int *req_is_recv = new int[2 * cpusize];
int *completed = new int[2 * cpusize];
int req_no = 0;
int pending_recv = 0;
const int mpi_tag = parallel_next_transfer_tag();
double **send_data = new double *[cpusize];
double **rec_data = new double *[cpusize];
@@ -3913,31 +4020,31 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
}
// Post receives first so peers can progress rendezvous early.
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
if (recv_lengths[node] > 0)
{
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
recv_lengths[node] = data_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
if (recv_lengths[node] > 0)
{
rec_data[node] = new double[recv_lengths[node]];
if (!rec_data[node])
{
cout << "out of memory when new in short transfer, place 1" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 1;
req_no++;
pending_recv++;
}
}
// Local transfer on this rank.
recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
if (recv_lengths[myrank] > 0)
{
}
// Local transfer on this rank.
recv_lengths[myrank] = data_packer(0, src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
if (recv_lengths[myrank] > 0)
{
rec_data[myrank] = new double[recv_lengths[myrank]];
if (!rec_data[myrank])
{
@@ -3947,33 +4054,33 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, PACK, VarList1, VarList2, Symmetry);
}
// Pack and post sends.
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (send_lengths[node] > 0)
{
send_data[node] = new double[send_lengths[node]];
// Pack and post sends.
for (node = 0; node < cpusize; node++)
{
if (node == myrank) continue;
send_lengths[node] = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
if (send_lengths[node] > 0)
{
send_data[node] = new double[send_lengths[node]];
if (!send_data[node])
{
cout << "out of memory when new in short transfer, place 3" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 0;
req_no++;
}
}
// Unpack as soon as receive completes to reduce pure wait time.
while (pending_recv > 0)
req_no++;
}
}
// Unpack as soon as receive completes to reduce pure wait time.
while (pending_recv > 0)
{
int outcount = 0;
MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
parallel_waitsome_checked(req_no, reqs, &outcount, completed, stats,
"Parallel::transfer");
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
@@ -3988,7 +4095,7 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
}
}
if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
if (req_no > 0) parallel_waitall_checked(req_no, reqs, stats, "Parallel::transfer");
if (rec_data[myrank])
data_packer(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
@@ -4005,12 +4112,12 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
delete[] stats;
delete[] req_node;
delete[] req_is_recv;
delete[] completed;
delete[] send_data;
delete[] rec_data;
delete[] send_lengths;
delete[] recv_lengths;
}
delete[] completed;
delete[] send_data;
delete[] rec_data;
delete[] send_lengths;
delete[] recv_lengths;
}
//
void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
@@ -4027,8 +4134,9 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
int *req_node = new int[2 * cpusize];
int *req_is_recv = new int[2 * cpusize];
int *completed = new int[2 * cpusize];
int req_no = 0;
int pending_recv = 0;
int req_no = 0;
int pending_recv = 0;
const int mpi_tag = parallel_next_transfer_tag();
double **send_data = new double *[cpusize];
double **rec_data = new double *[cpusize];
@@ -4055,7 +4163,7 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
cout << "out of memory when new in short transfer, place 1" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
MPI_Irecv((void *)rec_data[node], recv_lengths[node], MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 1;
req_no++;
@@ -4091,7 +4199,7 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
MPI_Abort(MPI_COMM_WORLD, 1);
}
data_packermix(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no);
MPI_Isend((void *)send_data[node], send_lengths[node], MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 0;
req_no++;
@@ -4102,7 +4210,8 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
while (pending_recv > 0)
{
int outcount = 0;
MPI_Waitsome(req_no, reqs, &outcount, completed, stats);
parallel_waitsome_checked(req_no, reqs, &outcount, completed, stats,
"Parallel::transfermix");
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
@@ -4117,7 +4226,7 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
}
}
if (req_no > 0) MPI_Waitall(req_no, reqs, stats);
if (req_no > 0) parallel_waitall_checked(req_no, reqs, stats, "Parallel::transfermix");
if (rec_data[myrank])
data_packermix(rec_data[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
@@ -4134,16 +4243,21 @@ void Parallel::transfermix(MyList<Parallel::gridseg> **src, MyList<Parallel::gri
delete[] stats;
delete[] req_node;
delete[] req_is_recv;
delete[] completed;
delete[] send_data;
delete[] rec_data;
delete[] send_lengths;
delete[] recv_lengths;
}
void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
{
int cpusize;
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
delete[] completed;
delete[] send_data;
delete[] rec_data;
delete[] send_lengths;
delete[] recv_lengths;
}
void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
{
Sync(Pat, VarList, Symmetry, "Parallel::Sync(Patch)");
}
void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry, const char *context)
{
int cpusize;
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
MyList<Parallel::gridseg> *dst;
MyList<Parallel::gridseg> **src, **transfer_src, **transfer_dst;
@@ -4159,7 +4273,10 @@ void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
// but for transfer_dst[node] the data may locate on any node
}
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
{
ParallelTransferContextGuard transfer_ctx(context ? context : "Parallel::Sync(Patch)");
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
}
if (dst)
dst->destroyList();
@@ -4177,15 +4294,26 @@ void Parallel::Sync(Patch *Pat, MyList<var> *VarList, int Symmetry)
delete[] transfer_src;
delete[] transfer_dst;
}
void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
{
// Patch inner Synch
MyList<Patch> *Pp = PatL;
while (Pp)
{
Sync(Pp->data, VarList, Symmetry);
Pp = Pp->next;
}
void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
{
Sync(PatL, VarList, Symmetry, "Parallel::Sync(PatchList)");
}
void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context)
{
std::string patchlist_context = context ? std::string(context) + " [patchlist]" : "Parallel::Sync(PatchList)";
// Patch inner Synch
MyList<Patch> *Pp = PatL;
int patch_index = 0;
while (Pp)
{
std::string patch_context = context ? std::string(context) + " [patch " + std::to_string(patch_index) + "]"
: "Parallel::Sync(Patch)";
Sync(Pp->data, VarList, Symmetry, patch_context.c_str());
Pp = Pp->next;
patch_index++;
}
// Patch inter Synch
int cpusize;
@@ -4204,7 +4332,10 @@ void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
}
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
{
ParallelTransferContextGuard transfer_ctx(patchlist_context.c_str());
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
}
if (dst)
dst->destroyList();
@@ -4224,10 +4355,15 @@ void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
}
// Merged Sync: collect all intra-patch and inter-patch grid segment lists,
// then issue a single transfer() call instead of N+1 separate ones.
void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
{
int cpusize;
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
{
Sync_merged(PatL, VarList, Symmetry, "Parallel::Sync_merged");
}
void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context)
{
int cpusize;
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
MyList<Parallel::gridseg> **combined_src = new MyList<Parallel::gridseg> *[cpusize];
MyList<Parallel::gridseg> **combined_dst = new MyList<Parallel::gridseg> *[cpusize];
@@ -4301,8 +4437,11 @@ void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
if (dst_buffer)
dst_buffer->destroyList();
// Phase C: Single transfer
transfer(combined_src, combined_dst, VarList, VarList, Symmetry);
// Phase C: Single transfer
{
ParallelTransferContextGuard transfer_ctx(context ? context : "Parallel::Sync_merged");
transfer(combined_src, combined_dst, VarList, VarList, Symmetry);
}
// Phase D: Cleanup
for (int node = 0; node < cpusize; node++)
@@ -4380,8 +4519,9 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
int cpusize = cache.cpusize;
int req_no = 0;
int pending_recv = 0;
int req_no = 0;
int pending_recv = 0;
const int mpi_tag = parallel_next_transfer_tag();
int node;
int *req_node = cache.tc_req_node;
int *req_is_recv = cache.tc_req_is_recv;
@@ -4402,7 +4542,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
cache.recv_bufs[node] = new double[rlength];
cache.recv_buf_caps[node] = rlength;
}
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, cache.reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 1;
req_no++;
@@ -4440,7 +4580,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
cache.send_buf_caps[node] = slength;
}
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, cache.reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 0;
req_no++;
@@ -4451,7 +4591,8 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
while (pending_recv > 0)
{
int outcount = 0;
MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
parallel_waitsome_checked(req_no, cache.reqs, &outcount, completed, cache.stats,
"Parallel::transfer_cached");
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
@@ -4466,11 +4607,11 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
}
}
if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
if (req_no > 0) parallel_waitall_checked(req_no, cache.reqs, cache.stats, "Parallel::transfer_cached");
if (self_len > 0)
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
}
if (self_len > 0)
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
}
void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
{
if (!cache.valid)
@@ -4669,12 +4810,13 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
}
// Now pack and post async MPI operations
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
int cpusize = cache.cpusize;
state.req_no = 0;
state.active = true;
state.pending_recv = 0;
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
int cpusize = cache.cpusize;
state.req_no = 0;
state.active = true;
state.mpi_tag = parallel_next_transfer_tag();
state.pending_recv = 0;
// Allocate tracking arrays
delete[] state.req_node; delete[] state.req_is_recv;
state.req_node = new int[cache.max_reqs];
@@ -4725,7 +4867,7 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
state.req_node[state.req_no] = node;
state.req_is_recv[state.req_no] = 0;
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, state.mpi_tag, MPI_COMM_WORLD, cache.reqs + state.req_no++);
}
int rlength;
if (!cache.lengths_valid) {
@@ -4745,7 +4887,7 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
state.req_node[state.req_no] = node;
state.req_is_recv[state.req_no] = 1;
state.pending_recv++;
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, state.mpi_tag, MPI_COMM_WORLD, cache.reqs + state.req_no++);
}
}
}
@@ -4775,7 +4917,8 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
while (pending > 0)
{
int outcount = 0;
MPI_Waitsome(state.req_no, cache.reqs, &outcount, completed, cache.stats);
parallel_waitsome_checked(state.req_no, cache.reqs, &outcount, completed, cache.stats,
"Parallel::Sync_finish");
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
{
@@ -4791,8 +4934,8 @@ void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
delete[] completed;
}
// Wait for remaining sends
if (state.req_no > 0) MPI_Waitall(state.req_no, cache.reqs, cache.stats);
// Wait for remaining sends
if (state.req_no > 0) parallel_waitall_checked(state.req_no, cache.reqs, cache.stats, "Parallel::Sync_finish");
delete[] state.req_node; state.req_node = 0;
delete[] state.req_is_recv; state.req_is_recv = 0;
@@ -5235,7 +5378,10 @@ void Parallel::PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry)
build_PhysBD_gstl(Pat, src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
}
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
{
ParallelTransferContextGuard transfer_ctx("Parallel::PeriodicBD");
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
}
if (dst)
dst->destroyList();
@@ -5506,7 +5652,10 @@ void Parallel::Prolong(Patch *Patc, Patch *Patf,
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
}
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
{
ParallelTransferContextGuard transfer_ctx("Parallel::Prolong");
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
}
if (dst)
dst->destroyList();
@@ -5565,7 +5714,10 @@ void Parallel::Restrict(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
}
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
{
ParallelTransferContextGuard transfer_ctx("Parallel::Restrict");
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
}
if (dst)
dst->destroyList();
@@ -5609,7 +5761,10 @@ void Parallel::Restrict_after(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
}
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
{
ParallelTransferContextGuard transfer_ctx("Parallel::Restrict_after");
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
}
if (dst)
dst->destroyList();
@@ -5654,7 +5809,10 @@ void Parallel::OutBdLow2Hi(Patch *Patc, Patch *Patf,
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
}
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
{
ParallelTransferContextGuard transfer_ctx("Parallel::OutBdLow2Hi(Patch)");
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
}
if (dst)
dst->destroyList();
@@ -5710,7 +5868,10 @@ void Parallel::OutBdLow2Hi(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
}
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
{
ParallelTransferContextGuard transfer_ctx("Parallel::OutBdLow2Hi(PatchList)");
transfer(transfer_src, transfer_dst, VarList1, VarList2, Symmetry);
}
if (dst)
dst->destroyList();
@@ -5981,13 +6142,14 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
int myrank;
MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
int cpusize = cache.cpusize;
int req_no = 0;
int pending_recv = 0;
int *req_node = new int[cache.max_reqs];
int *req_is_recv = new int[cache.max_reqs];
int *completed = new int[cache.max_reqs];
int cpusize = cache.cpusize;
int req_no = 0;
int pending_recv = 0;
const int mpi_tag = parallel_next_transfer_tag();
int *req_node = new int[cache.max_reqs];
int *req_is_recv = new int[cache.max_reqs];
int *completed = new int[cache.max_reqs];
// Post receives first so peers can progress rendezvous early.
for (int node = 0; node < cpusize; node++)
@@ -6004,7 +6166,7 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.recv_bufs[node] = new double[rlength];
cache.recv_buf_caps[node] = rlength;
}
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, cache.reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 1;
req_no++;
@@ -6042,7 +6204,7 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.send_buf_caps[node] = slength;
}
data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no);
MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, mpi_tag, MPI_COMM_WORLD, cache.reqs + req_no);
req_node[req_no] = node;
req_is_recv[req_no] = 0;
req_no++;
@@ -6053,7 +6215,8 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
while (pending_recv > 0)
{
int outcount = 0;
MPI_Waitsome(req_no, cache.reqs, &outcount, completed, cache.stats);
parallel_waitsome_checked(req_no, cache.reqs, &outcount, completed, cache.stats,
"Parallel::transfermix_cached");
if (outcount == MPI_UNDEFINED) break;
for (int i = 0; i < outcount; i++)
@@ -6068,10 +6231,10 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
}
}
if (req_no > 0) MPI_Waitall(req_no, cache.reqs, cache.stats);
if (req_no > 0) parallel_waitall_checked(req_no, cache.reqs, cache.stats, "Parallel::transfermix_cached");
if (self_len > 0)
data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
if (self_len > 0)
data_packermix(cache.recv_bufs[myrank], cache.combined_src[myrank], cache.combined_dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
delete[] req_node;
delete[] req_is_recv;
@@ -6787,7 +6950,10 @@ void Parallel::fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyLis
build_gstl(src[node], dst, &transfer_src[node], &transfer_dst[node]); // for transfer[node], data locate on cpu#node
}
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
{
ParallelTransferContextGuard transfer_ctx("Parallel::Interp copy");
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
}
for (int node = 0; node < cpusize; node++)
{
@@ -6826,7 +6992,10 @@ void Parallel::fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyLis
Sync(PatcL, FutureList, Symmetry);
}
//<~~~prolong then
transfer(transfer_src, transfer_dst, FutureList, FutureList, Symmetry);
{
ParallelTransferContextGuard transfer_ctx("Parallel::Interp prolong FutureList");
transfer(transfer_src, transfer_dst, FutureList, FutureList, Symmetry);
}
// for StateList
// time interpolation part
@@ -6842,7 +7011,10 @@ void Parallel::fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyLis
Sync(PatcL, tmList, Symmetry);
}
//<~~~prolong then
transfer(transfer_src, transfer_dst, tmList, StateList, Symmetry);
{
ParallelTransferContextGuard transfer_ctx("Parallel::Interp prolong StateList");
transfer(transfer_src, transfer_dst, tmList, StateList, Symmetry);
}
}
else
{
@@ -6853,7 +7025,10 @@ void Parallel::fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyLis
Sync(PatcL, VarList, Symmetry);
}
//<~~~prolong then
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
{
ParallelTransferContextGuard transfer_ctx("Parallel::Interp prolong VarList");
transfer(transfer_src, transfer_dst, VarList, VarList, Symmetry);
}
}
for (int node = 0; node < cpusize; node++)

View File

@@ -89,9 +89,12 @@ namespace Parallel
void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
int Symmetry);
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry, const char *context);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
struct SyncCache {
bool valid;
@@ -121,14 +124,15 @@ namespace Parallel
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
struct AsyncSyncState {
int req_no;
bool active;
int *req_node;
int *req_is_recv;
int pending_recv;
AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {}
};
struct AsyncSyncState {
int req_no;
bool active;
int mpi_tag;
int *req_node;
int *req_is_recv;
int pending_recv;
AsyncSyncState() : req_no(0), active(false), mpi_tag(0), req_node(0), req_is_recv(0), pending_recv(0) {}
};
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
SyncCache &cache, AsyncSyncState &state);

View File

@@ -14,7 +14,8 @@ using namespace std;
#include <string.h>
#endif
#include <time.h>
#include <time.h>
#include <unistd.h>
#include "macrodef.h"
#include "misc.h"
@@ -2036,9 +2037,10 @@ void bssn_class::Read_Ansorg()
void bssn_class::Evolve(int Steps)
{
clock_t prev_clock, curr_clock;
double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
LastAnas = 0;
clock_t prev_clock, curr_clock;
double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
LastAnas = 0;
LastConsOut = 0;
#if 0
//initial checkpoint for special uasge
{
@@ -2296,18 +2298,21 @@ void bssn_class::Evolve(int Steps)
////////////////////////////////////////////////////////////
// When LastCheck >= CheckTime, perform runtime checks and output status data
if (LastCheck >= CheckTime)
{
LastCheck = 0;
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
if (LastCheck >= CheckTime)
{
LastCheck = 0;
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
CheckPoint->writecheck_cgh(PhysTime, GH);
#ifdef WithShell
CheckPoint->writecheck_sh(PhysTime, SH);
#endif
CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
}
}
#endif
CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
}
// Keep output/analysis phases aligned across ranks before the next coarse step.
MPI_Barrier(MPI_COMM_WORLD);
}
/*
#ifdef With_AHF
// final apparent horizon finding
@@ -6253,7 +6258,7 @@ for(int ilev = GH->levels-1;ilev>=lev;ilev--)
for(int ilev=GH->levels-1;ilev>lev;ilev--)
RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List);
#else
Parallel::Sync(GH->PatL[lev], DG_List, Symmetry);
Parallel::Sync(GH->PatL[lev], DG_List, Symmetry, "bssn_class::Compute_Psi4");
#endif
#ifdef WithShell
@@ -6908,10 +6913,10 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
{
LastAnas += dT_lev;
if (LastAnas >= AnasTime)
{
#ifdef Point_Psi4
#error "not support parallel levels yet"
if (LastAnas >= AnasTime)
{
#ifdef Point_Psi4
#error "not support parallel levels yet"
// Gam_ijk and R_ij have been calculated in Interp_Constraint()
double SYM = 1, ANT = -1;
for (int levh = lev; levh < GH->levels; levh++)
@@ -7255,9 +7260,9 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
//================================================================================================
void bssn_class::Constraint_Out()
{
LastConsOut += dT * pow(0.5, Mymax(0, trfls));
void bssn_class::Constraint_Out()
{
LastConsOut += dT * pow(0.5, Mymax(0, trfls));
if (LastConsOut >= AnasTime)
// Constraint violation
@@ -7322,7 +7327,7 @@ void bssn_class::Constraint_Out()
Pp = Pp->next;
}
}
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Constraint_Out[level]");
}
#ifdef WithShell
if (0) // if the constrait quantities can be reused from the step rhs calculation
@@ -7544,7 +7549,7 @@ void bssn_class::AH_Prepare_derivatives()
}
Pp = Pp->next;
}
Parallel::Sync(GH->PatL[lev], AHDList, Symmetry);
Parallel::Sync(GH->PatL[lev], AHDList, Symmetry, "bssn_class::AH_Prepare_derivatives");
}
}
@@ -7825,7 +7830,7 @@ void bssn_class::Interp_Constraint(bool infg)
Pp = Pp->next;
}
}
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Interp_Constraint[level]");
}
#ifdef WithShell
if (0) // if the constrait quantities can be reused from the step rhs calculation
@@ -8083,7 +8088,7 @@ void bssn_class::Compute_Constraint()
Pp = Pp->next;
}
}
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Compute_Constraint[level]");
}
// prolong restrict constraint quantities
for (lev = GH->levels - 1; lev > 0; lev--)
@@ -8396,12 +8401,18 @@ void bssn_class::Enforce_algcon(int lev, int fg)
//================================================================================================
bool bssn_class::check_Stdin_Abort()
{
fd_set readfds;
struct timeval timeout;
bool bssn_class::check_Stdin_Abort()
{
// Non-interactive launches (mpirun via Python/subprocess, batch jobs, redirected stdin)
// should not probe stdin. Some MPI runtimes treat stdin as a managed channel and can
// fail when rank 0 polls/consumes it.
if (!isatty(STDIN_FILENO)) {
return false;
}
fd_set readfds;
struct timeval timeout;
FD_ZERO(&readfds);
FD_SET(STDIN_FILENO, &readfds);
@@ -8410,14 +8421,17 @@ bool bssn_class::check_Stdin_Abort()
timeout.tv_sec = 0;
timeout.tv_usec = 0;
int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout);
if (activity > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
string input_abort;
if (cin >> input_abort) {
if (input_abort == "stop") {
return true;
}
int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout);
if (activity <= 0) {
return false;
}
if (FD_ISSET(STDIN_FILENO, &readfds)) {
string input_abort;
if (cin >> input_abort) {
if (input_abort == "stop") {
return true;
}
}
}

View File

@@ -1,4 +1,5 @@
#include "bssn_cuda_ops.h"
#include "bssn_gpu.h"
#include <cmath>
#include <cstdio>
@@ -79,6 +80,29 @@ inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes)
return true;
}
inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *src, size_t bytes)
{
if (!ensure_capacity(dst, bytes))
return false;
const double *device_src = bssn_gpu_find_device_buffer(src);
cudaMemcpyKind kind = cudaMemcpyHostToDevice;
const void *copy_src = src;
if (device_src)
{
copy_src = device_src;
kind = cudaMemcpyDeviceToDevice;
}
cudaError_t err = cudaMemcpy(dst.ptr, copy_src, bytes, kind);
if (err != cudaSuccess)
{
report_cuda_error(kind == cudaMemcpyDeviceToDevice ? "cudaMemcpy(D2D)" : "cudaMemcpy(H2D)", err);
return false;
}
return true;
}
__global__ void enforce_ga_kernel(int n,
double *dxx, double *gxy, double *gxz,
double *dyy, double *gyz, double *dzz,
@@ -374,7 +398,7 @@ __global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
}
}
inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **args)
inline bool launch_kernel(dim3 grid, dim3 block, const void *kernel, void **args)
{
cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
if (err != cudaSuccess)
@@ -382,10 +406,10 @@ inline bool launch_and_sync(dim3 grid, dim3 block, const void *kernel, void **ar
report_cuda_error("cudaLaunchKernel", err);
return false;
}
err = cudaDeviceSynchronize();
err = cudaPeekAtLastError();
if (err != cudaSuccess)
{
report_cuda_error("cudaDeviceSynchronize", err);
report_cuda_error("cudaPeekAtLastError", err);
return false;
}
return true;
@@ -411,18 +435,18 @@ int bssn_cuda_enforce_ga(int *ex,
dim3 block(256);
dim3 grid(div_up(n, static_cast<int>(block.x)));
bool ok = copy_to_device(cache.dxx, dxx, bytes) &&
copy_to_device(cache.gxy, gxy, bytes) &&
copy_to_device(cache.gxz, gxz, bytes) &&
copy_to_device(cache.dyy, dyy, bytes) &&
copy_to_device(cache.gyz, gyz, bytes) &&
copy_to_device(cache.dzz, dzz, bytes) &&
copy_to_device(cache.Axx, Axx, bytes) &&
copy_to_device(cache.Axy, Axy, bytes) &&
copy_to_device(cache.Axz, Axz, bytes) &&
copy_to_device(cache.Ayy, Ayy, bytes) &&
copy_to_device(cache.Ayz, Ayz, bytes) &&
copy_to_device(cache.Azz, Azz, bytes);
bool ok = copy_to_device_preferring_device(cache.dxx, dxx, bytes) &&
copy_to_device_preferring_device(cache.gxy, gxy, bytes) &&
copy_to_device_preferring_device(cache.gxz, gxz, bytes) &&
copy_to_device_preferring_device(cache.dyy, dyy, bytes) &&
copy_to_device_preferring_device(cache.gyz, gyz, bytes) &&
copy_to_device_preferring_device(cache.dzz, dzz, bytes) &&
copy_to_device_preferring_device(cache.Axx, Axx, bytes) &&
copy_to_device_preferring_device(cache.Axy, Axy, bytes) &&
copy_to_device_preferring_device(cache.Axz, Axz, bytes) &&
copy_to_device_preferring_device(cache.Ayy, Ayy, bytes) &&
copy_to_device_preferring_device(cache.Ayz, Ayz, bytes) &&
copy_to_device_preferring_device(cache.Azz, Azz, bytes);
if (ok)
{
@@ -432,7 +456,7 @@ int bssn_cuda_enforce_ga(int *ex,
double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr;
void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
&d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args);
ok = launch_kernel(grid, block, (const void *)enforce_ga_kernel, args);
}
if (ok)
@@ -527,10 +551,10 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
(rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
ok = ok &&
(!refresh_state0 || copy_to_device(cache.state0, state0, bytes)) &&
(!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
(!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
(!need_stage_input || copy_to_device(cache.stage, stage_data, bytes)) &&
(!refresh_rhs || copy_to_device(cache.rhs, rhs_accum, bytes));
(!need_stage_input || copy_to_device_preferring_device(cache.stage, stage_data, bytes)) &&
(!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
if (ok && !need_stage_input)
ok = ensure_capacity(cache.stage, bytes);
@@ -539,11 +563,15 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
return 1;
if (refresh_state0)
{
cache.host_state0 = state0;
bssn_gpu_register_device_buffer(state0, cache.state0.ptr);
}
if (refresh_rhs)
{
cache.host_rhs = rhs_accum;
cache.rhs_resident = true;
bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr);
}
double dX = X[1] - X[0];
@@ -582,14 +610,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
&bam_source, &bam_target,
&soa0, &soa1, &soa2,
&symmetry};
ok = launch_and_sync(grid, block, (const void *)sommerfeld_bam_kernel, args);
ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
}
if (ok)
{
double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr;
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args);
ok = launch_kernel(grid, block, (const void *)rk4_kernel, args);
}
if (ok && lev > 0)
@@ -599,11 +627,13 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
&has_xmin, &has_ymin, &has_zmin,
&has_xmax, &has_ymax, &has_zmax,
&d_state0, &d_stage};
ok = launch_and_sync(grid, block, (const void *)copy_physical_boundary_kernel, args);
ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
}
if (ok)
{
bssn_gpu_register_device_buffer(stage_data, cache.stage.ptr);
cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
ok = err == cudaSuccess;
@@ -621,13 +651,13 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
dim3 block(256);
dim3 grid(div_up(n, static_cast<int>(block.x)));
bool ok = copy_to_device(d_chi, chi, bytes);
bool ok = copy_to_device_preferring_device(d_chi, chi, bytes);
if (ok)
{
double *ptr = d_chi.ptr;
void *args[] = {&n, &ptr, &tinny};
ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args);
ok = launch_kernel(grid, block, (const void *)lowerbound_kernel, args);
}
if (ok)

View File

@@ -17,6 +17,13 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
#error "Step_MainPath_GPU currently supports Patch grids only."
#endif
if (bssn_gpu_bind_process_device(myrank))
{
cerr << "GPU device bind failure on MPI rank " << myrank << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
bssn_gpu_clear_cached_device_buffers();
setpbh(BH_num, Porg0, Mass, BH_num_input);
const double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
@@ -150,6 +157,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
Parallel::AsyncSyncState async_pre;
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
bssn_gpu_clear_cached_device_buffers();
MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
if (ERROR)
@@ -265,6 +273,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
Parallel::AsyncSyncState async_cor;
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
bssn_gpu_clear_cached_device_buffers();
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
@@ -334,6 +343,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
RestrictProlong(lev, YN, BB);
#endif
bssn_gpu_clear_cached_device_buffers();
Pp = GH->PatL[lev];
while (Pp)
{

View File

@@ -135,6 +135,18 @@ struct GpuRhsCache
const double *last_y = nullptr;
const double *last_z = nullptr;
bool meta_uploaded = false;
static const int max_mapped_buffers = 128;
const double *host_buffers[max_mapped_buffers] = {nullptr};
const double *device_buffers[max_mapped_buffers] = {nullptr};
int mapped_buffer_count = 0;
};
struct ExternalBufferRegistry
{
static const int max_mapped_buffers = 256;
const double *host_buffers[max_mapped_buffers] = {nullptr};
const double *device_buffers[max_mapped_buffers] = {nullptr};
int mapped_buffer_count = 0;
};
GpuRhsCache &gpu_rhs_cache()
@@ -143,11 +155,81 @@ GpuRhsCache &gpu_rhs_cache()
return cache;
}
ExternalBufferRegistry &external_buffer_registry()
{
static thread_local ExternalBufferRegistry registry;
return registry;
}
void reset_meta(Meta *meta)
{
memset(meta, 0, sizeof(Meta));
}
void reset_buffer_map(GpuRhsCache &cache)
{
cache.mapped_buffer_count = 0;
for (int i = 0; i < GpuRhsCache::max_mapped_buffers; ++i)
{
cache.host_buffers[i] = nullptr;
cache.device_buffers[i] = nullptr;
}
}
void map_buffer(GpuRhsCache &cache, const double *host_ptr, const double *device_ptr)
{
if (!host_ptr || !device_ptr)
return;
for (int i = 0; i < cache.mapped_buffer_count; ++i)
{
if (cache.host_buffers[i] == host_ptr)
{
cache.device_buffers[i] = device_ptr;
return;
}
}
if (cache.mapped_buffer_count >= GpuRhsCache::max_mapped_buffers)
return;
cache.host_buffers[cache.mapped_buffer_count] = host_ptr;
cache.device_buffers[cache.mapped_buffer_count] = device_ptr;
cache.mapped_buffer_count++;
}
void reset_external_buffer_map(ExternalBufferRegistry &registry)
{
registry.mapped_buffer_count = 0;
for (int i = 0; i < ExternalBufferRegistry::max_mapped_buffers; ++i)
{
registry.host_buffers[i] = nullptr;
registry.device_buffers[i] = nullptr;
}
}
void map_external_buffer(ExternalBufferRegistry &registry, const double *host_ptr, const double *device_ptr)
{
if (!host_ptr || !device_ptr)
return;
for (int i = 0; i < registry.mapped_buffer_count; ++i)
{
if (registry.host_buffers[i] == host_ptr)
{
registry.device_buffers[i] = device_ptr;
return;
}
}
if (registry.mapped_buffer_count >= ExternalBufferRegistry::max_mapped_buffers)
return;
registry.host_buffers[registry.mapped_buffer_count] = host_ptr;
registry.device_buffers[registry.mapped_buffer_count] = device_ptr;
registry.mapped_buffer_count++;
}
bool ensure_device_buffer(double **ptr, size_t count)
{
if (*ptr)
@@ -188,6 +270,45 @@ bool copy_buffers_to_device(const CopySpec *specs, size_t count)
return true;
}
const double *find_external_device_buffer(const ExternalBufferRegistry &registry, const double *host_ptr)
{
if (!host_ptr)
return nullptr;
for (int i = 0; i < registry.mapped_buffer_count; ++i)
{
if (registry.host_buffers[i] == host_ptr)
return registry.device_buffers[i];
}
return nullptr;
}
bool copy_buffers_to_device_preferring_device(const CopySpec *specs, size_t count)
{
for (size_t i = 0; i < count; ++i)
{
const double *device_src = bssn_gpu_find_device_buffer(specs[i].src);
cudaMemcpyKind kind = cudaMemcpyHostToDevice;
const void *copy_src = specs[i].src;
if (device_src)
{
copy_src = device_src;
kind = cudaMemcpyDeviceToDevice;
}
cudaError_t err = cudaMemcpy(specs[i].dst, copy_src,
specs[i].count * sizeof(double),
kind);
if (err != cudaSuccess)
{
cerr << "cudaMemcpy(" << (kind == cudaMemcpyDeviceToDevice ? "D2D" : "H2D")
<< ") failed: " << cudaGetErrorString(err) << endl;
return false;
}
}
return true;
}
bool zero_buffers(const ZeroSpec *specs, size_t count)
{
for (size_t i = 0; i < count; ++i)
@@ -219,6 +340,8 @@ void cleanup_gpu_rhs_cache()
cache.last_x = nullptr;
cache.last_y = nullptr;
cache.last_z = nullptr;
reset_buffer_map(cache);
reset_external_buffer_map(external_buffer_registry());
}
bool register_gpu_rhs_cleanup()
@@ -285,6 +408,7 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
cache.last_y = nullptr;
cache.last_z = nullptr;
cache.meta_uploaded = false;
reset_buffer_map(cache);
Meta *meta = &cache.meta;
const int matrix_size = cache.matrix_size;
@@ -467,6 +591,7 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
cache.last_x = nullptr;
cache.last_y = nullptr;
cache.last_z = nullptr;
reset_buffer_map(cache);
return false;
}
@@ -491,8 +616,56 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
return true;
}
const double *find_mapped_device_buffer(const GpuRhsCache &cache, const double *host_ptr)
{
if (!host_ptr)
return nullptr;
for (int i = 0; i < cache.mapped_buffer_count; ++i)
{
if (cache.host_buffers[i] == host_ptr)
return cache.device_buffers[i];
}
return nullptr;
}
} // namespace
const double *bssn_gpu_find_device_buffer(const double *host_ptr)
{
const double *device_ptr = find_external_device_buffer(external_buffer_registry(), host_ptr);
if (device_ptr)
return device_ptr;
return find_mapped_device_buffer(gpu_rhs_cache(), host_ptr);
}
int bssn_gpu_bind_process_device(int mpi_rank)
{
const int device = select_cuda_device_for_process(mpi_rank);
if (device < 0)
return 1;
cudaError_t err = cudaSetDevice(device);
if (err != cudaSuccess)
{
cerr << "cudaSetDevice(" << device << ") failed: "
<< cudaGetErrorString(err) << endl;
return 1;
}
return 0;
}
void bssn_gpu_clear_cached_device_buffers()
{
reset_external_buffer_map(external_buffer_registry());
reset_buffer_map(gpu_rhs_cache());
}
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr)
{
map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
}
__global__ void test_const_address(double * testd){
int _t = blockIdx.x*blockDim.x+threadIdx.x;
if(_t == 0)
@@ -739,16 +912,12 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub
}
#endif //ifdef Cell
#endif //ifdef Vertex
inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
cudaThreadSynchronize();
sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
cudaThreadSynchronize();
sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
cudaThreadSynchronize();
sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
cudaThreadSynchronize();
}
inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
}
__global__ void sub_fdderivs_part1(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz)
@@ -837,19 +1006,17 @@ __global__ void sub_fdderivs_part1(double * f,double *fh,double *fxx,double *fxy
__syncthreads();
}
inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz,double* SoA)
{
sub_symmetry_bd(2,f,fh,SoA);
cudaMemset(fxx,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fxy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fxz,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
cudaThreadSynchronize();
sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
cudaThreadSynchronize();
}
inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz,double* SoA)
{
sub_symmetry_bd(2,f,fh,SoA);
cudaMemset(fxx,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fxy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fxz,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
}
__global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz )
{
@@ -905,18 +1072,16 @@ __global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,d
}
}
inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,double * SoA)
{
sub_symmetry_bd(2,f,fh,SoA);
cudaMemset(fx,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
cudaThreadSynchronize();
sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
cudaThreadSynchronize();
}
inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,double * SoA)
{
sub_symmetry_bd(2,f,fh,SoA);
cudaMemset(fx,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
}
__global__ void computeRicci_part1(double * dst)
{
@@ -930,14 +1095,12 @@ __global__ void computeRicci_part1(double * dst)
}
}
inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
{
sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
cudaThreadSynchronize();
computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
cudaThreadSynchronize();
}/*Exception*/
inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
{
sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
}/*Exception*/
__global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
{
@@ -989,13 +1152,11 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
}
}
inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
{
sub_symmetry_bd(3,f,fh,SoA);
cudaThreadSynchronize();
sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
cudaThreadSynchronize();
}
inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
{
sub_symmetry_bd(3,f,fh,SoA);
sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
}
__global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz)
{
@@ -1083,12 +1244,10 @@ __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S
}
inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
sub_symmetry_bd(3,f,fh,SoA);
cudaThreadSynchronize();
sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
cudaThreadSynchronize();
}
inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
sub_symmetry_bd(3,f,fh,SoA);
sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
}
__global__ void compute_rhs_bssn_part1()
{
@@ -2866,6 +3025,8 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
cache.last_z = Z;
}
reset_buffer_map(cache);
const CopySpec state_copies[] = {
{Mh_ chi, chi, static_cast<size_t>(matrix_size)},
{Mh_ dxx, dxx, static_cast<size_t>(matrix_size)},
@@ -2892,7 +3053,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
{Mh_ dtSfy, dtSfy, static_cast<size_t>(matrix_size)},
{Mh_ dtSfz, dtSfz, static_cast<size_t>(matrix_size)},
};
if (!copy_buffers_to_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0])))
if (!copy_buffers_to_device_preferring_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0])))
return 1;
const ZeroSpec zero_specs[] = {
@@ -2977,9 +3138,57 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
};
if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0])))
return 1;
double sss[3] = {1,1,1};
map_buffer(cache, chi, Mh_ chi);
map_buffer(cache, trK, Mh_ trK);
map_buffer(cache, dxx, Mh_ dxx);
map_buffer(cache, gxy, Mh_ gxy);
map_buffer(cache, gxz, Mh_ gxz);
map_buffer(cache, dyy, Mh_ dyy);
map_buffer(cache, gyz, Mh_ gyz);
map_buffer(cache, dzz, Mh_ dzz);
map_buffer(cache, Axx, Mh_ Axx);
map_buffer(cache, Axy, Mh_ Axy);
map_buffer(cache, Axz, Mh_ Axz);
map_buffer(cache, Ayy, Mh_ Ayy);
map_buffer(cache, Ayz, Mh_ Ayz);
map_buffer(cache, Azz, Mh_ Azz);
map_buffer(cache, Gamx, Mh_ Gamx);
map_buffer(cache, Gamy, Mh_ Gamy);
map_buffer(cache, Gamz, Mh_ Gamz);
map_buffer(cache, Lap, Mh_ Lap);
map_buffer(cache, betax, Mh_ betax);
map_buffer(cache, betay, Mh_ betay);
map_buffer(cache, betaz, Mh_ betaz);
map_buffer(cache, dtSfx, Mh_ dtSfx);
map_buffer(cache, dtSfy, Mh_ dtSfy);
map_buffer(cache, dtSfz, Mh_ dtSfz);
map_buffer(cache, chi_rhs, Mh_ chi_rhs);
map_buffer(cache, trK_rhs, Mh_ trK_rhs);
map_buffer(cache, gxx_rhs, Mh_ gxx_rhs);
map_buffer(cache, gxy_rhs, Mh_ gxy_rhs);
map_buffer(cache, gxz_rhs, Mh_ gxz_rhs);
map_buffer(cache, gyy_rhs, Mh_ gyy_rhs);
map_buffer(cache, gyz_rhs, Mh_ gyz_rhs);
map_buffer(cache, gzz_rhs, Mh_ gzz_rhs);
map_buffer(cache, Axx_rhs, Mh_ Axx_rhs);
map_buffer(cache, Axy_rhs, Mh_ Axy_rhs);
map_buffer(cache, Axz_rhs, Mh_ Axz_rhs);
map_buffer(cache, Ayy_rhs, Mh_ Ayy_rhs);
map_buffer(cache, Ayz_rhs, Mh_ Ayz_rhs);
map_buffer(cache, Azz_rhs, Mh_ Azz_rhs);
map_buffer(cache, Gamx_rhs, Mh_ Gamx_rhs);
map_buffer(cache, Gamy_rhs, Mh_ Gamy_rhs);
map_buffer(cache, Gamz_rhs, Mh_ Gamz_rhs);
map_buffer(cache, Lap_rhs, Mh_ Lap_rhs);
map_buffer(cache, betax_rhs, Mh_ betax_rhs);
map_buffer(cache, betay_rhs, Mh_ betay_rhs);
map_buffer(cache, betaz_rhs, Mh_ betaz_rhs);
map_buffer(cache, dtSfx_rhs, Mh_ dtSfx_rhs);
map_buffer(cache, dtSfy_rhs, Mh_ dtSfy_rhs);
map_buffer(cache, dtSfz_rhs, Mh_ dtSfz_rhs);
double sss[3] = {1,1,1};
double aas[3] = {-1,-1,1};
double asa[3] = {-1,1,-1};
double saa[3] = {1,-1,-1};
@@ -3116,9 +3325,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
#endif
//cout<<"GPU meta data ready.\n";
cudaThreadSynchronize();
//--------------test constant memory address & value--------------
//--------------test constant memory address & value--------------
/* double rank = mpi_rank;
cudaMemcpyToSymbol(F1o3,&rank, sizeof(double));
double ctest1 = -1;
@@ -3137,11 +3344,10 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
//#4-----------------------calculate------------------------------
//4.0------enforce_ga---------
//sub_enforce_ga(matrix_size);
//4.1-----compute rhs---------
compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
//4.1-----compute rhs---------
compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
sub_fderivs(Mh_ betaz,Mh_ fh,Mh_ betazx,Mh_ betazy,Mh_ betazz,ssa);
sub_fderivs(Mh_ chi,Mh_ fh,Mh_ chix,Mh_ chiy,Mh_ chiz, sss);
@@ -3153,44 +3359,37 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
sub_fderivs(Mh_ gxy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz, aas);
sub_fderivs(Mh_ gxz,Mh_ fh,Mh_ gxzx,Mh_ gxzy,Mh_ gxzz, asa);
sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
sub_fdderivs(Mh_ betaz,Mh_ fh,Mh_ gxxz,Mh_ gxyz,Mh_ gxzz,Mh_ gyyz,Mh_ gyzz,Mh_ gzzz,ssa);
sub_fderivs( Mh_ Gamx, Mh_ fh,Mh_ Gamxx, Mh_ Gamxy, Mh_ Gamxz,ass);
sub_fderivs( Mh_ Gamy, Mh_ fh,Mh_ Gamyx, Mh_ Gamyy, Mh_ Gamyz,sas);
sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta);
computeRicci(Mh_ dzz,Mh_ Rzz,sss, meta);
computeRicci(Mh_ gxy,Mh_ Rxy,aas, meta);
computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta);
computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta);
cudaThreadSynchronize();
compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
compute_rhs_bssn_part6_gauge<<<GRID_DIM,BLOCK_DIM>>>();
#endif
@@ -3257,19 +3456,17 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
}
if(co == 0){
compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
if(co == 0){
compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
sub_fderivs(Mh_ Axz,Mh_ fh,Mh_ gxzx,Mh_ gxzy,Mh_ gxzz,asa);
sub_fderivs(Mh_ Ayy,Mh_ fh,Mh_ gyyx,Mh_ gyyy,Mh_ gyyz,sss);
sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
}
sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
}
#if (ABV == 1)
cout<<"TODO: bssn_gpu.cu::2373 (ABV == 1)"<<endl;
@@ -3283,33 +3480,11 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
cudaMemcpy(betax, Mh_ betax, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(betay, Mh_ betay, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(betaz, Mh_ betaz, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);*/
if(calledby == CALLED_BY_STEP)
{
cudaMemcpy(chi_rhs, Mh_ chi_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(trK_rhs, Mh_ trK_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(gxx_rhs, Mh_ gxx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(gxy_rhs, Mh_ gxy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(gxz_rhs, Mh_ gxz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(gyy_rhs, Mh_ gyy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(gyz_rhs, Mh_ gyz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(gzz_rhs, Mh_ gzz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Axx_rhs, Mh_ Axx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Axy_rhs, Mh_ Axy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Axz_rhs, Mh_ Axz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Ayy_rhs, Mh_ Ayy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Ayz_rhs, Mh_ Ayz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Azz_rhs, Mh_ Azz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Gamx_rhs, Mh_ Gamx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Gamy_rhs, Mh_ Gamy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Gamz_rhs, Mh_ Gamz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Lap_rhs, Mh_ Lap_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(betax_rhs, Mh_ betax_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(betay_rhs, Mh_ betay_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(betaz_rhs, Mh_ betaz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(dtSfx_rhs, Mh_ dtSfx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(dtSfy_rhs, Mh_ dtSfy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(dtSfz_rhs, Mh_ dtSfz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
}
if(calledby == CALLED_BY_STEP)
{
// Main GPU evolution path now consumes RHS directly from device caches.
// Avoiding this bulk D2H copy removes one full round-trip per RK stage.
}
else if(calledby == CALLED_BY_CONSTRAINT)
{
cudaMemcpy(Gamxxx, Mh_ Gamxxx, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);

View File

@@ -63,9 +63,14 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co);
int gpu_rhs_ss(RHS_SS_PARA);
/** Init GPU side data in GPUMeta. */
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
int gpu_rhs_ss(RHS_SS_PARA);
int bssn_gpu_bind_process_device(int mpi_rank);
void bssn_gpu_clear_cached_device_buffers();
const double *bssn_gpu_find_device_buffer(const double *host_ptr);
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
/** Init GPU side data in GPUMeta. */
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
#endif

View File

@@ -30,8 +30,8 @@ CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
endif
.SUFFIXES: .o .f90 .C .for .cu
.SUFFIXES: .o .f90 .C .for .cu
.f90.o:
$(f90) $(f90appflags) -c $< -o $@