Optimize BSSN-EScalar CUDA path

This commit is contained in:
2026-05-05 10:47:46 +08:00
parent 06f62dee36
commit 85fe29cc2e
9 changed files with 1821 additions and 276 deletions

View File

@@ -484,7 +484,11 @@ int main(int argc, char *argv[])
cout << endl; cout << endl;
} }
delete ADM; // Let the process teardown reclaim the simulation object. Some derived
// equation classes keep MPI/CUDA-backed state whose destructor ordering
// is fragile at program shutdown.
if (getenv("AMSS_DELETE_ADM_ON_EXIT"))
delete ADM;
//=======================calculation done============================================================= //=======================calculation done=============================================================

View File

@@ -18,6 +18,7 @@
#endif #endif
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
#include "bssn_rhs_cuda.h" #include "bssn_rhs_cuda.h"
#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_ESCALAR_CUDA_STATE_COUNT
#endif #endif
#if USE_CUDA_Z4C #if USE_CUDA_Z4C
#include "z4c_rhs_cuda.h" #include "z4c_rhs_cuda.h"
@@ -179,10 +180,12 @@ bool cuda_build_bssn_host_views(Block *block,
int state_count, int state_count,
double **views) double **views)
{ {
if (!block || !vars || !views || state_count != BSSN_CUDA_STATE_COUNT) if (!block || !vars || !views ||
(state_count != BSSN_CUDA_STATE_COUNT &&
state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
return false; return false;
MyList<var> *v = vars; MyList<var> *v = vars;
for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i) for (int i = 0; i < state_count; ++i)
{ {
if (!v) if (!v)
return false; return false;
@@ -196,10 +199,12 @@ bool cuda_build_bssn_soa(MyList<var> *vars,
int state_count, int state_count,
double *soa_flat) double *soa_flat)
{ {
if (!vars || !soa_flat || state_count != BSSN_CUDA_STATE_COUNT) if (!vars || !soa_flat ||
(state_count != BSSN_CUDA_STATE_COUNT &&
state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
return false; return false;
MyList<var> *v = vars; MyList<var> *v = vars;
for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i) for (int i = 0; i < state_count; ++i)
{ {
if (!v) if (!v)
return false; return false;
@@ -317,7 +322,7 @@ bool cuda_state_count_direct_supported(int state_count)
#if USE_CUDA_Z4C && (ABEtype == 2) #if USE_CUDA_Z4C && (ABEtype == 2)
return state_count == Z4C_CUDA_STATE_COUNT; return state_count == Z4C_CUDA_STATE_COUNT;
#elif USE_CUDA_BSSN #elif USE_CUDA_BSSN
return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT; return state_count > 0 && state_count <= BSSN_ESCALAR_CUDA_STATE_COUNT;
#else #else
(void)state_count; (void)state_count;
return false; return false;
@@ -372,22 +377,68 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type)
#endif #endif
} }
// Forward declarations needed by cuda_direct_pack_segment() below, which
// calls all four of these helpers in its host-staged AMR path.

// True when the AMSS_CUDA_AMR_HOST_STAGED environment variable requests the
// host-staged AMR path (the definition appears further down in this file).
bool cuda_amr_host_staged_enabled();
// Device communication buffer allocator. The caller below passes an element
// count (number of doubles), not a byte count.
// NOTE(review): behaviour on allocation failure is not visible here -- confirm
// whether callers must check the returned pointer for null.
double *alloc_device_comm_buffer(int length);
// Releases a buffer obtained from alloc_device_comm_buffer(). The pointer is
// taken by reference; presumably it is reset to null -- TODO confirm.
void free_device_comm_buffer(double *&ptr);
// Packs the src/dst segment overlap for `state_count` variables into `buffer`.
// The host-staged caller below hands it a device staging buffer and afterwards
// copies that buffer back to the host with cudaMemcpy.
bool cuda_direct_pack_segment_to_device(double *buffer,
const Parallel::gridseg *src,
const Parallel::gridseg *dst,
int state_count,
int type,
MyList<var> *VarLists,
int Symmetry);
bool cuda_direct_pack_segment(double *buffer, bool cuda_direct_pack_segment(double *buffer,
const Parallel::gridseg *src, const Parallel::gridseg *src,
const Parallel::gridseg *dst, const Parallel::gridseg *dst,
int state_count, int state_count,
MyList<var> *VarLists) int type,
MyList<var> *VarLists,
int Symmetry)
{ {
#if USE_CUDA_Z4C && (ABEtype == 2) #if USE_CUDA_Z4C && (ABEtype == 2)
if (state_count != Z4C_CUDA_STATE_COUNT) if (state_count != Z4C_CUDA_STATE_COUNT)
return false; return false;
#elif USE_CUDA_BSSN #elif USE_CUDA_BSSN
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false; return false;
#else #else
return false; return false;
#endif #endif
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
if (type == 2 || type == 3)
{
#if USE_CUDA_BSSN
if (!cuda_amr_host_staged_enabled())
return false;
const int region_all = dst->shape[0] * dst->shape[1] * dst->shape[2];
const int total = state_count * region_all;
static double *stage_dev = 0;
static int stage_cap = 0;
if (total > stage_cap)
{
free_device_comm_buffer(stage_dev);
stage_dev = alloc_device_comm_buffer(total);
stage_cap = total;
}
if (!cuda_direct_pack_segment_to_device(stage_dev, src, dst, state_count, type, VarLists, Symmetry))
return false;
cudaError_t cerr = cudaMemcpy(buffer, stage_dev, (size_t)total * sizeof(double), cudaMemcpyDeviceToHost);
if (cerr != cudaSuccess)
{
fprintf(stderr, "Parallel: CUDA host-staged AMR pack cudaMemcpy failed, err=%d\n", (int)cerr);
return false;
}
if (sync_profile_enabled())
sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
return true;
#else
return false;
#endif
}
const int i0 = cuda_seg_begin(dst, src->Bg, 0); const int i0 = cuda_seg_begin(dst, src->Bg, 0);
const int j0 = cuda_seg_begin(dst, src->Bg, 1); const int j0 = cuda_seg_begin(dst, src->Bg, 1);
const int k0 = cuda_seg_begin(dst, src->Bg, 2); const int k0 = cuda_seg_begin(dst, src->Bg, 2);
@@ -396,7 +447,7 @@ bool cuda_direct_pack_segment(double *buffer,
i0, j0, k0, i0, j0, k0,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0; dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
#else #else
double *views[BSSN_CUDA_STATE_COUNT]; double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views); const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
const bool ok = have_views const bool ok = have_views
? bssn_cuda_pack_state_batch_to_host_buffer_for_host_views( ? bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(
@@ -422,7 +473,7 @@ bool cuda_direct_unpack_segment(double *buffer,
if (state_count != Z4C_CUDA_STATE_COUNT) if (state_count != Z4C_CUDA_STATE_COUNT)
return false; return false;
#elif USE_CUDA_BSSN #elif USE_CUDA_BSSN
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false; return false;
#else #else
return false; return false;
@@ -436,7 +487,7 @@ bool cuda_direct_unpack_segment(double *buffer,
i0, j0, k0, i0, j0, k0,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0; dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
#else #else
double *views[BSSN_CUDA_STATE_COUNT]; double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views); const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
const bool ok = have_views const bool ok = have_views
? bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views( ? bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(
@@ -464,6 +515,17 @@ bool cuda_aware_mpi_enabled()
return enabled != 0; return enabled != 0;
} }
// Decides whether the cached synchronisation paths (transfer_cached and
// Sync_start consult this) may use device-side communication buffers.
//
// state_count: number of state variables in the lists being exchanged.
//
// Returns false for the BSSN-EScalar state count when USE_CUDA_BSSN is
// enabled; in every other case the answer is whatever
// cuda_aware_mpi_enabled() reports.
bool cuda_cached_device_buffers_enabled(int state_count)
{
#if USE_CUDA_BSSN
  const bool is_escalar_layout = (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT);
  return !is_escalar_layout && cuda_aware_mpi_enabled();
#else
  (void)state_count; // unused without the CUDA BSSN backend
  return cuda_aware_mpi_enabled();
#endif
}
bool cuda_amr_restrict_device_enabled() bool cuda_amr_restrict_device_enabled()
{ {
static int enabled = -1; static int enabled = -1;
@@ -486,6 +548,17 @@ bool cuda_amr_prolong_device_enabled()
return enabled != 0; return enabled != 0;
} }
// Reports whether the host-staged AMR pack path has been switched on through
// the AMSS_CUDA_AMR_HOST_STAGED environment variable.
//
// The variable is read on the first call only and the result is cached in a
// function-local static, so later changes to the environment have no effect.
// The value is parsed with atoi(): any text that parses to a non-zero integer
// enables the path; an unset variable or non-numeric text disables it.
bool cuda_amr_host_staged_enabled()
{
  // Tri-state cache: -1 = not queried yet, 0 = off, 1 = on.
  static int cached_flag = -1;
  if (cached_flag >= 0)
    return cached_flag != 0;

  const char *raw = getenv("AMSS_CUDA_AMR_HOST_STAGED");
  cached_flag = 0;
  if (raw && atoi(raw) != 0)
    cached_flag = 1;
  return cached_flag != 0;
}
bool cuda_amr_restrict_compare_enabled() bool cuda_amr_restrict_compare_enabled()
{ {
static int enabled = -1; static int enabled = -1;
@@ -627,12 +700,12 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
} }
#endif #endif
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false; return false;
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
bool ok = false; bool ok = false;
double *views[BSSN_CUDA_STATE_COUNT]; double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double soa_flat[3 * BSSN_CUDA_STATE_COUNT]; double soa_flat[3 * AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views); const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
const bool have_soa = cuda_build_bssn_soa(VarLists, state_count, soa_flat); const bool have_soa = cuda_build_bssn_soa(VarLists, state_count, soa_flat);
if (type == 1) if (type == 1)
@@ -812,13 +885,13 @@ bool cuda_direct_unpack_segment_from_device(double *buffer,
} }
#endif #endif
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false; return false;
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
const int i0 = cuda_seg_begin(dst, dst->Bg, 0); const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
const int j0 = cuda_seg_begin(dst, dst->Bg, 1); const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
const int k0 = cuda_seg_begin(dst, dst->Bg, 2); const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
double *views[BSSN_CUDA_STATE_COUNT]; double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views); const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
const bool ok = have_views const bool ok = have_views
? bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views( ? bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(
@@ -843,12 +916,12 @@ bool cuda_download_resident_subset_to_host(Block *block,
int state_count) int state_count)
{ {
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
if (!block || state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) if (!block || state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false; return false;
if (bssn_cuda_has_resident_state(block) == 0) if (bssn_cuda_has_resident_state(block) == 0)
return true; return true;
int indices[BSSN_CUDA_STATE_COUNT]; int indices[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *views[BSSN_CUDA_STATE_COUNT]; double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
MyList<var> *v = vars; MyList<var> *v = vars;
for (int i = 0; i < state_count; ++i) for (int i = 0; i < state_count; ++i)
{ {
@@ -871,7 +944,7 @@ bool cuda_unpack_host_region_to_resident(Block *block,
const Parallel::gridseg *dst) const Parallel::gridseg *dst)
{ {
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
if (!block || !dst || state_index < 0 || state_index >= BSSN_CUDA_STATE_COUNT) if (!block || !dst || state_index < 0 || state_index >= AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false; return false;
if (bssn_cuda_has_resident_state(block) == 0) if (bssn_cuda_has_resident_state(block) == 0)
return true; return true;
@@ -895,7 +968,7 @@ bool cuda_device_state_count_supported(int state_count)
return true; return true;
#endif #endif
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT; return state_count > 0 && state_count <= AMSS_BSSN_CUDA_MAX_STATE_COUNT;
#else #else
(void)state_count; (void)state_count;
return false; return false;
@@ -915,8 +988,8 @@ bool cuda_flush_device_segment_batch(Block *block,
return true; return true;
const int stride = (dir == PACK && type == 3) ? 11 : 8; const int stride = (dir == PACK && type == 3) ? 11 : 8;
const int segment_count = (int)(meta.size() / stride); const int segment_count = (int)(meta.size() / stride);
double *views[BSSN_CUDA_STATE_COUNT]; double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double soa_flat[3 * BSSN_CUDA_STATE_COUNT]; double soa_flat[3 * AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views); const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views);
const bool have_soa = cuda_build_bssn_soa(vars, state_count, soa_flat); const bool have_soa = cuda_build_bssn_soa(vars, state_count, soa_flat);
if (dir == PACK) if (dir == PACK)
@@ -5022,14 +5095,17 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
{ {
#if USE_CUDA_BSSN || USE_CUDA_Z4C #if USE_CUDA_BSSN || USE_CUDA_Z4C
bool handled_by_cuda = false; bool handled_by_cuda = false;
if (dir == PACK && (type == 1 || s_cuda_aware_pack_active) && const bool host_staged_amr =
dir == PACK && !s_cuda_aware_pack_active && (type == 2 || type == 3) &&
cuda_amr_host_staged_enabled();
if (dir == PACK && (type == 1 || s_cuda_aware_pack_active || host_staged_amr) &&
cuda_state_count_direct_supported(state_count) && cuda_state_count_direct_supported(state_count) &&
cuda_can_direct_pack(src->data, dst->data, type)) cuda_can_direct_pack(src->data, dst->data, type))
{ {
if (s_cuda_aware_pack_active) { if (s_cuda_aware_pack_active) {
handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry); handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry);
} else { } else {
handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count, VarLists); handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry);
} }
if (!handled_by_cuda) if (!handled_by_cuda)
{ {
@@ -5037,7 +5113,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
} }
else if (dir == UNPACK && (type == 1 || s_cuda_aware_pack_active) && else if (dir == UNPACK && (type == 1 || s_cuda_aware_pack_active || host_staged_amr) &&
cuda_state_count_direct_supported(state_count) && cuda_state_count_direct_supported(state_count) &&
cuda_can_direct_unpack(dst->data, type)) cuda_can_direct_unpack(dst->data, type))
{ {
@@ -5102,7 +5178,8 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
if (cuda_state_count_direct_supported(state_count) && if (cuda_state_count_direct_supported(state_count) &&
dst->data && dst->data->Bg && bssn_cuda_has_resident_state(dst->data->Bg)) dst->data && dst->data->Bg && bssn_cuda_has_resident_state(dst->data->Bg))
{ {
if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data)) if (type != 2 && type != 3 &&
!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data))
{ {
cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl; cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl;
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
@@ -5775,7 +5852,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
cout << "Parallel::transfer_cached: variable lists do not match." << endl; cout << "Parallel::transfer_cached: variable lists do not match." << endl;
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
if (cuda_aware_mpi_enabled()) if (cuda_cached_device_buffers_enabled(state_count))
{ {
for (int n = 0; n < cpusize; n++) for (int n = 0; n < cpusize; n++)
{ {
@@ -6094,7 +6171,7 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
cout << "Parallel::Sync_start: variable lists do not match." << endl; cout << "Parallel::Sync_start: variable lists do not match." << endl;
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
if (cuda_aware_mpi_enabled()) if (cuda_cached_device_buffers_enabled(state_count))
{ {
for (int n = 0; n < cpusize; n++) for (int n = 0; n < cpusize; n++)
{ {
@@ -6976,16 +7053,16 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
double *src1_views[BSSN_CUDA_STATE_COUNT]; double *src1_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *src2_views[BSSN_CUDA_STATE_COUNT]; double *src2_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *dst_views[BSSN_CUDA_STATE_COUNT]; double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const int state_count = cuda_state_var_count(VarList1, VarList2); const int state_count = cuda_state_var_count(VarList1, VarList2);
if (state_count == BSSN_CUDA_STATE_COUNT && if (cuda_state_count_direct_supported(state_count) &&
cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views) && cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views) &&
bssn_cuda_has_resident_state(cg) && bssn_cuda_has_resident_state(cg) &&
bssn_cuda_prepare_inter_time_level(cg, cg->shape, bssn_cuda_prepare_inter_time_level(cg, cg->shape, state_count,
src1_views, src2_views, 0, dst_views, src1_views, src2_views, 0, dst_views,
2, tindex) == 0) 2, tindex) == 0)
{ {
@@ -7051,18 +7128,18 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
double *src1_views[BSSN_CUDA_STATE_COUNT]; double *src1_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *src2_views[BSSN_CUDA_STATE_COUNT]; double *src2_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *src3_views[BSSN_CUDA_STATE_COUNT]; double *src3_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *dst_views[BSSN_CUDA_STATE_COUNT]; double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const int state_count = cuda_state_var_count(VarList1, VarList2); const int state_count = cuda_state_var_count(VarList1, VarList2);
if (state_count == BSSN_CUDA_STATE_COUNT && if (cuda_state_count_direct_supported(state_count) &&
cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) && cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) &&
cuda_build_bssn_host_views(cg, VarList4, state_count, dst_views) && cuda_build_bssn_host_views(cg, VarList4, state_count, dst_views) &&
bssn_cuda_has_resident_state(cg) && bssn_cuda_has_resident_state(cg) &&
bssn_cuda_prepare_inter_time_level(cg, cg->shape, bssn_cuda_prepare_inter_time_level(cg, cg->shape, state_count,
src1_views, src2_views, src3_views, dst_views, src1_views, src2_views, src3_views, dst_views,
3, tindex) == 0) 3, tindex) == 0)
{ {
@@ -7500,6 +7577,8 @@ void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.tc_req_is_recv = new int[cache.max_reqs]; cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs]; cache.tc_completed = new int[cache.max_reqs];
} }
for (int i = 0; i < cpusize; i++)
cache.combined_src[i] = cache.combined_dst[i] = 0;
MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL); MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
for (int node = 0; node < cpusize; node++) for (int node = 0; node < cpusize; node++)
@@ -7561,6 +7640,8 @@ void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.tc_req_is_recv = new int[cache.max_reqs]; cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs]; cache.tc_completed = new int[cache.max_reqs];
} }
for (int i = 0; i < cpusize; i++)
cache.combined_src[i] = cache.combined_dst[i] = 0;
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL); MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
for (int node = 0; node < cpusize; node++) for (int node = 0; node < cpusize; node++)
@@ -7613,6 +7694,8 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.tc_req_is_recv = new int[cache.max_reqs]; cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs]; cache.tc_completed = new int[cache.max_reqs];
} }
for (int i = 0; i < cpusize; i++)
cache.combined_src[i] = cache.combined_dst[i] = 0;
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL); MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
for (int node = 0; node < cpusize; node++) for (int node = 0; node < cpusize; node++)

View File

@@ -24,16 +24,165 @@ using namespace std;
#include "sommerfeld_rout.h" #include "sommerfeld_rout.h"
#include "getnp4.h" #include "getnp4.h"
#include "shellfunctions.h" #include "shellfunctions.h"
#include "parameters.h" #include "parameters.h"
#if USE_CUDA_BSSN
#include "bssn_rhs_cuda.h"
#endif
#ifdef With_AHF #ifdef With_AHF
#include "derivatives.h" #include "derivatives.h"
#include "myglobal.h" #include "myglobal.h"
#endif #endif
//================================================================================================ //================================================================================================
// Define bssnEScalar_class namespace
{
#if USE_CUDA_BSSN
// Collects per-variable data for the BSSN-EScalar CUDA entry points.
//
// Walks `vars` and, for each entry (up to BSSN_ESCALAR_CUDA_STATE_COUNT),
// stores the block's grid-function pointer cg->fgfs[sgfn] in host_views.
// When the optional outputs are supplied it also copies the variable's
// propspeed into propspeeds[i] and its three SoA values into
// soa_flat[3*i .. 3*i+2].
//
// cg         : block whose fgfs table is indexed; not null-checked here.
// vars       : variable list to walk.
// host_views : output, at least BSSN_ESCALAR_CUDA_STATE_COUNT entries.
// propspeeds : optional output, one entry per variable (0 to skip).
// soa_flat   : optional output, three entries per variable (0 to skip).
//
// Returns true only when the list holds exactly
// BSSN_ESCALAR_CUDA_STATE_COUNT variables; a shorter or longer list yields
// false (the outputs may have been partially written by then).
bool fill_bssn_escalar_cuda_views(Block *cg, MyList<var> *vars,
                                  double **host_views,
                                  double *propspeeds = 0,
                                  double *soa_flat = 0)
{
  int filled = 0;
  MyList<var> *node = vars;
  for (; node != 0 && filled < BSSN_ESCALAR_CUDA_STATE_COUNT; node = node->next, ++filled)
  {
    host_views[filled] = cg->fgfs[node->data->sgfn];
    if (propspeeds != 0)
      propspeeds[filled] = node->data->propspeed;
    if (soa_flat != 0)
    {
      const int base = 3 * filled;
      for (int c = 0; c < 3; ++c)
        soa_flat[base + c] = node->data->SoA[c];
    }
  }
  // Exact match required: every slot filled and nothing left over in the list.
  if (filled != BSSN_ESCALAR_CUDA_STATE_COUNT)
    return false;
  return node == 0;
}
// Compile-time switch for keeping EScalar state resident on the GPU across
// synchronisation: false in builds that define WithShell, true otherwise.
//
// lev: refinement level; accepted for interface symmetry but currently not
//      consulted by either build variant.
bool bssn_escalar_cuda_use_resident_sync(int lev)
{
  (void)lev;
#ifdef WithShell
  const bool resident_sync = false;
#else
  const bool resident_sync = true;
#endif
  return resident_sync;
}
// Decides whether a level's state should stay resident after a step.
//
// Two environment switches are read on the first call and cached in
// function-local statics (parsed with atoi; non-zero means on):
//   AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP - master switch
//   AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS          - drop the lev < trfls_in limit
//
// lev          : level being stepped.
// trfls_in     : threshold level; without KEEP_ALL_LEVELS only lev < trfls_in
//                qualifies.
// analysis_lev : level used for analysis; it never qualifies.
//
// Returns false when the master switch is off or lev == analysis_lev;
// otherwise true if KEEP_ALL_LEVELS is on or lev < trfls_in.
bool bssn_escalar_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
{
  // -1 marks "environment not consulted yet" for both caches.
  static int all_levels_flag = -1;
  static int feature_flag = -1;

  if (all_levels_flag < 0)
  {
    const char *raw = getenv("AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS");
    all_levels_flag = (raw != 0 && atoi(raw) != 0) ? 1 : 0;
  }
  if (feature_flag < 0)
  {
    const char *raw = getenv("AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP");
    feature_flag = (raw != 0 && atoi(raw) != 0) ? 1 : 0;
  }

  if (feature_flag == 0 || lev == analysis_lev)
    return false;
  return all_levels_flag != 0 || lev < trfls_in;
}
// Tells whether EScalar level synchronisation should use the merged variant.
//
// Controlled by the AMSS_ESCALAR_SYNC_MERGED environment variable, which is
// read once (first call) and cached in a function-local static. The text is
// parsed with atoi(); a non-zero result turns the option on.
bool bssn_escalar_sync_merged_enabled()
{
  // -1 means the environment has not been consulted yet.
  static int merged_flag = -1;
  if (merged_flag < 0)
  {
    const char *raw = getenv("AMSS_ESCALAR_SYNC_MERGED");
    if (raw != 0 && atoi(raw) != 0)
      merged_flag = 1;
    else
      merged_flag = 0;
  }
  return merged_flag == 1;
}
// Synchronises VarList on the patches of one level, choosing the routine by
// the AMSS_ESCALAR_SYNC_MERGED switch: Parallel::Sync_merged when it is on,
// plain Parallel::Sync otherwise. All arguments are forwarded unchanged.
void bssn_escalar_sync_level(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
{
  if (!bssn_escalar_sync_merged_enabled())
  {
    Parallel::Sync(PatL, VarList, Symmetry);
    return;
  }
  Parallel::Sync_merged(PatL, VarList, Symmetry);
}
// Reports whether per-step timing output for the EScalar stepper is enabled.
//
// Driven by the AMSS_ESCALAR_STEP_TIMING environment variable. The lookup
// happens on the first call only; the outcome is cached in a function-local
// static. The value goes through atoi(), so only text that parses to a
// non-zero integer enables timing.
bool bssn_escalar_timing_enabled()
{
  static int timing_flag = -1; // -1 = not yet determined
  if (timing_flag >= 0)
    return timing_flag > 0;

  const char *setting = getenv("AMSS_ESCALAR_STEP_TIMING");
  const bool on = setting && atoi(setting) != 0;
  timing_flag = on ? 1 : 0;
  return on;
}
// Prints one "[AMSS-ESCALAR-STEP]" timing line for a completed step.
//
// Does nothing unless bssn_escalar_timing_enabled() is true. Otherwise the
// eight timings are reduced with MPI_MAX onto rank 0 of MPI_COMM_WORLD, and
// the caller whose myrank is 0 writes the maxima to stderr.
// MPI_Reduce is a collective, so when timing is on every rank of
// MPI_COMM_WORLD has to make this call.
//
// myrank   : caller's rank; only 0 prints.
// lev, YN  : echoed verbatim in the output line.
// total .. rp : local timings in seconds for the named phases.
//
// The printed "other" field is max(total) minus each of the seven phase
// maxima. Each maximum is taken independently, so they may come from
// different ranks.
void bssn_escalar_timing_report(int myrank, int lev, int YN, double total, double rhs,
                                double sync, double bh, double analysis, double swap,
                                double resident, double rp)
{
  if (!bssn_escalar_timing_enabled())
    return;

  const int kSlots = 8;
  double mine[kSlots] = {total, rhs, sync, bh, analysis, swap, resident, rp};
  double peak[kSlots] = {};
  MPI_Reduce(mine, peak, kSlots, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  if (myrank != 0)
    return;

  // Subtract the phases from the total one by one, left to right.
  double remainder = peak[0];
  for (int slot = 1; slot < kSlots; ++slot)
    remainder -= peak[slot];

  fprintf(stderr,
          "[AMSS-ESCALAR-STEP] lev=%d YN=%d total=%.6f rhs=%.6f sync=%.6f "
          "bh=%.6f analysis=%.6f swap=%.6f resident=%.6f rp=%.6f other=%.6f\n",
          lev, YN, peak[0], peak[1], peak[2], peak[3], peak[4], peak[5],
          peak[6], peak[7],
          remainder);
}
// Copies GPU-resident EScalar state back into host grid functions for every
// locally owned block of a level.
//
// PatL        : patch list of the level to visit.
// vars        : variable list naming the host grid functions to fill; it must
//               contain exactly BSSN_ESCALAR_CUDA_STATE_COUNT entries.
// myrank      : caller's MPI rank; blocks owned by other ranks are skipped.
// release_ctx : when true, bssn_cuda_release_step_ctx() is called on each
//               block after its download.
//
// Blocks for which bssn_cuda_has_resident_state() reports nothing are left
// untouched. A variable-list mismatch or a failed download prints a message
// and calls MPI_Abort on MPI_COMM_WORLD.
void bssn_escalar_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                            int myrank, bool release_ctx)
{
  for (MyList<Patch> *patch = PatL; patch; patch = patch->next)
  {
    for (MyList<Block> *node = patch->data->blb; node; node = node->next)
    {
      Block *cg = node->data;
      const bool owned_and_resident =
          (myrank == cg->rank) && bssn_cuda_has_resident_state(cg);
      if (owned_and_resident)
      {
        double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
        const bool views_ok = fill_bssn_escalar_cuda_views(cg, vars, state_out);
        if (!views_ok)
        {
          cout << "CUDA BSSN-EScalar resident state list mismatch during download" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        // A non-zero return from the download routine signals failure.
        if (bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out))
        {
          cout << "CUDA BSSN-EScalar resident state download failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (release_ctx)
          bssn_cuda_release_step_ctx(cg);
      }
      // The patch's block range ends at ble; stop before stepping past it.
      if (node == patch->data->ble)
        break;
    }
  }
}
#endif
}
//================================================================================================
// Define bssnEScalar_class
// It inherits some members and methods from the parent class bssn_class and modifies others. // It inherits some members and methods from the parent class bssn_class and modifies others.
// The modified members and methods are defined below (and in the header bssnEScalar_class.h). // The modified members and methods are defined below (and in the header bssnEScalar_class.h).
@@ -177,11 +326,16 @@ void bssnEScalar_class::Initialize()
//================================================================================================ //================================================================================================
bssnEScalar_class::~bssnEScalar_class() bssnEScalar_class::~bssnEScalar_class()
{ {
delete Sphio; #if USE_CUDA_BSSN
delete Spio; for (int lev = 0; GH && lev < GH->levels; ++lev)
delete Sphi0; bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, true);
#endif
delete Sphio;
delete Spio;
delete Sphi0;
delete Spi0; delete Spi0;
delete Sphi; delete Sphi;
delete Spi; delete Spi;
@@ -707,7 +861,12 @@ void bssnEScalar_class::Read_Pablo()
void bssnEScalar_class::Step(int lev, int YN) void bssnEScalar_class::Step(int lev, int YN)
{ {
double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
#if USE_CUDA_BSSN
const bool use_cuda_resident_sync = bssn_escalar_cuda_use_resident_sync(lev);
#else
const bool use_cuda_resident_sync = false;
#endif
#ifdef With_AHF #ifdef With_AHF
AH_Step_Find(lev, dT_lev); AH_Step_Find(lev, dT_lev);
#endif #endif
@@ -716,13 +875,23 @@ void bssnEScalar_class::Step(int lev, int YN)
if (lev < GH->movls) if (lev < GH->movls)
ndeps = numepsb; ndeps = numepsb;
double TRK4 = PhysTime; double TRK4 = PhysTime;
int iter_count = 0; // count RK4 substeps int iter_count = 0; // count RK4 substeps
int pre = 0, cor = 1; int pre = 0, cor = 1;
int ERROR = 0; int ERROR = 0;
const bool escalar_step_timing = bssn_escalar_timing_enabled();
MyList<ss_patch> *sPp; const double escalar_step_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
// Predictor double escalar_t_rhs = 0.0;
MyList<Patch> *Pp = GH->PatL[lev]; double escalar_t_sync = 0.0;
double escalar_t_bh = 0.0;
double escalar_t_analysis = 0.0;
double escalar_t_swap = 0.0;
double escalar_t_resident = 0.0;
double escalar_t_rp = 0.0;
MyList<ss_patch> *sPp;
// Predictor
double escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp) while (Pp)
{ {
MyList<Block> *BP = Pp->data->blb; MyList<Block> *BP = Pp->data->blb;
@@ -731,15 +900,60 @@ void bssnEScalar_class::Step(int lev, int YN)
Block *cg = BP->data; Block *cg = BP->data;
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
#if (AGM == 0) #if (AGM == 0)
f_enforce_ga(cg->shape, #if !USE_CUDA_BSSN
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], f_enforce_ga(cg->shape,
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
#endif cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif
if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], #endif
bool used_gpu_substep = false;
#if USE_CUDA_BSSN
{
double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
double propspeed[BSSN_ESCALAR_CUDA_STATE_COUNT];
double soa_flat[3 * BSSN_ESCALAR_CUDA_STATE_COUNT];
if (!fill_bssn_escalar_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
!fill_bssn_escalar_cuda_views(cg, SynchList_pre, state_out))
{
cout << "CUDA BSSN-EScalar state list mismatch on predictor step" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
int apply_bam_bc = 0;
int apply_enforce_ga = 0;
#if (AGM == 0)
apply_enforce_ga = 1;
#endif
#if (SommerType == 0)
#ifndef WithShell
apply_bam_bc = (lev == 0) ? 1 : 0;
#endif
#endif
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
if (bssn_escalar_cuda_rk4_substep(cg,
cg->shape, cg->X[0], cg->X[1], cg->X[2],
state_in, state_out,
propspeed, soa_flat, Pp->data->bbox,
dT_lev, TRK4, iter_count, apply_bam_bc,
Symmetry, lev, ndeps, pre,
keep_resident_state, apply_enforce_ga, chitiny))
{
cout << "CUDA BSSN-EScalar predictor substep failed in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
used_gpu_substep = true;
}
#endif
if (!used_gpu_substep &&
f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -783,9 +997,11 @@ void bssnEScalar_class::Step(int lev, int YN)
ERROR = 1; ERROR = 1;
} }
// rk4 substep and boundary if (!used_gpu_substep)
{ {
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here // rk4 substep and boundary
{
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here
while (varl0) while (varl0)
{ {
#ifndef WithShell #ifndef WithShell
@@ -820,9 +1036,10 @@ void bssnEScalar_class::Step(int lev, int YN)
varl = varl->next; varl = varl->next;
varlrhs = varlrhs->next; varlrhs = varlrhs->next;
} }
} }
f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny); f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
} }
}
if (BP == Pp->data->ble) if (BP == Pp->data->ble)
break; break;
BP = BP->next; BP = BP->next;
@@ -834,19 +1051,21 @@ void bssnEScalar_class::Step(int lev, int YN)
int erh = ERROR; int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
} }
if (ERROR) if (ERROR)
{ {
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
if (ErrorMonitor->outfile) if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl; << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
} }
if (escalar_step_timing)
#ifdef WithShell escalar_t_rhs += MPI_Wtime() - escalar_t0;
#ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
if (lev == 0) if (lev == 0)
{ {
@@ -993,7 +1212,14 @@ void bssnEScalar_class::Step(int lev, int YN)
} }
#endif #endif
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
#if USE_CUDA_BSSN
bssn_escalar_sync_level(GH->PatL[lev], SynchList_pre, Symmetry);
#else
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
#endif
if (escalar_step_timing)
escalar_t_sync += MPI_Wtime() - escalar_t0;
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -1013,10 +1239,14 @@ void bssnEScalar_class::Step(int lev, int YN)
} }
#endif #endif
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
{ {
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
#if USE_CUDA_BSSN
(void)use_cuda_resident_sync;
#endif
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++) for (int ithBH = 0; ithBH < BH_num; ithBH++)
{ {
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count); f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
@@ -1041,19 +1271,29 @@ void bssnEScalar_class::Step(int lev, int YN)
DG_List->insert(Sfy0); DG_List->insert(Sfy0);
DG_List->insert(Sfz0); DG_List->insert(Sfz0);
Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev);
DG_List->clearList(); DG_List->clearList();
} }
} }
} if (escalar_step_timing)
escalar_t_bh += MPI_Wtime() - escalar_t0;
}
// data analysis part // data analysis part
// Warning NOTE: the variables1 are used as temp storege room // Warning NOTE: the variables1 are used as temp storege room
if (lev == a_lev) if (lev == a_lev)
{ {
AnalysisStuff_EScalar(lev, dT_lev); escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
} #if USE_CUDA_BSSN
// corrector if (use_cuda_resident_sync)
for (iter_count = 1; iter_count < 4; iter_count++) bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
{ #endif
AnalysisStuff_EScalar(lev, dT_lev);
if (escalar_step_timing)
escalar_t_analysis += MPI_Wtime() - escalar_t0;
}
// corrector
for (iter_count = 1; iter_count < 4; iter_count++)
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
// for RK4: t0, t0+dt/2, t0+dt/2, t0+dt; // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
if (iter_count == 1 || iter_count == 3) if (iter_count == 1 || iter_count == 3)
TRK4 += dT_lev / 2; TRK4 += dT_lev / 2;
@@ -1066,22 +1306,67 @@ void bssnEScalar_class::Step(int lev, int YN)
Block *cg = BP->data; Block *cg = BP->data;
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
#if (AGM == 0) #if (AGM == 0)
f_enforce_ga(cg->shape, #if !USE_CUDA_BSSN
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], f_enforce_ga(cg->shape,
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
#elif (AGM == 1) cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
if (iter_count == 3) #endif
f_enforce_ga(cg->shape, #elif (AGM == 1)
if (iter_count == 3)
f_enforce_ga(cg->shape,
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#endif #endif
if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], bool used_gpu_substep = false;
#if USE_CUDA_BSSN
{
double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
double propspeed[BSSN_ESCALAR_CUDA_STATE_COUNT];
double soa_flat[3 * BSSN_ESCALAR_CUDA_STATE_COUNT];
if (!fill_bssn_escalar_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
!fill_bssn_escalar_cuda_views(cg, SynchList_cor, state_out))
{
cout << "CUDA BSSN-EScalar state list mismatch on corrector step" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
int apply_bam_bc = 0;
int apply_enforce_ga = 0;
#if (AGM == 0)
apply_enforce_ga = 1;
#endif
#if (SommerType == 0)
#ifndef WithShell
apply_bam_bc = (lev == 0) ? 1 : 0;
#endif
#endif
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
if (bssn_escalar_cuda_rk4_substep(cg,
cg->shape, cg->X[0], cg->X[1], cg->X[2],
state_in, state_out,
propspeed, soa_flat, Pp->data->bbox,
dT_lev, TRK4, iter_count, apply_bam_bc,
Symmetry, lev, ndeps, cor,
keep_resident_state, apply_enforce_ga, chitiny))
{
cout << "CUDA BSSN-EScalar corrector substep failed in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
used_gpu_substep = true;
}
#endif
if (!used_gpu_substep &&
f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn], cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -1125,9 +1410,11 @@ void bssnEScalar_class::Step(int lev, int YN)
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1; ERROR = 1;
} }
// rk4 substep and boundary if (!used_gpu_substep)
{ {
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; // rk4 substep and boundary
{
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList;
// we do not check the correspondence here // we do not check the correspondence here
while (varl0) while (varl0)
@@ -1165,9 +1452,10 @@ void bssnEScalar_class::Step(int lev, int YN)
varl1 = varl1->next; varl1 = varl1->next;
varlrhs = varlrhs->next; varlrhs = varlrhs->next;
} }
} }
f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny); f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
} }
}
if (BP == Pp->data->ble) if (BP == Pp->data->ble)
break; break;
BP = BP->next; BP = BP->next;
@@ -1180,8 +1468,8 @@ void bssnEScalar_class::Step(int lev, int YN)
int erh = ERROR; int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
} }
if (ERROR) if (ERROR)
{ {
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
@@ -1189,11 +1477,13 @@ void bssnEScalar_class::Step(int lev, int YN)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << " variables at t = " << PhysTime
<< ", lev = " << lev << endl; << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
} }
if (escalar_step_timing)
#ifdef WithShell escalar_t_rhs += MPI_Wtime() - escalar_t0;
#ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
if (lev == 0) if (lev == 0)
{ {
@@ -1349,7 +1639,14 @@ void bssnEScalar_class::Step(int lev, int YN)
} }
#endif #endif
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]); escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
#if USE_CUDA_BSSN
bssn_escalar_sync_level(GH->PatL[lev], SynchList_cor, Symmetry);
#else
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
#endif
if (escalar_step_timing)
escalar_t_sync += MPI_Wtime() - escalar_t0;
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -1368,10 +1665,14 @@ void bssnEScalar_class::Step(int lev, int YN)
} }
} }
#endif #endif
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
{ {
compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev); escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
#if USE_CUDA_BSSN
(void)use_cuda_resident_sync;
#endif
compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++) for (int ithBH = 0; ithBH < BH_num; ithBH++)
{ {
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count); f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
@@ -1396,14 +1697,17 @@ void bssnEScalar_class::Step(int lev, int YN)
DG_List->insert(Sfy0); DG_List->insert(Sfy0);
DG_List->insert(Sfz0); DG_List->insert(Sfz0);
Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev);
DG_List->clearList(); DG_List->clearList();
} }
} }
} if (escalar_step_timing)
// swap time level escalar_t_bh += MPI_Wtime() - escalar_t0;
if (iter_count < 3) }
{ // swap time level
Pp = GH->PatL[lev]; if (iter_count < 3)
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
Pp = GH->PatL[lev];
while (Pp) while (Pp)
{ {
MyList<Block> *BP = Pp->data->blb; MyList<Block> *BP = Pp->data->blb;
@@ -1444,16 +1748,32 @@ void bssnEScalar_class::Step(int lev, int YN)
Porg[ithBH][0] = Porg1[ithBH][0]; Porg[ithBH][0] = Porg1[ithBH][0];
Porg[ithBH][1] = Porg1[ithBH][1]; Porg[ithBH][1] = Porg1[ithBH][1];
Porg[ithBH][2] = Porg1[ithBH][2]; Porg[ithBH][2] = Porg1[ithBH][2];
} }
} }
} if (escalar_step_timing)
} escalar_t_swap += MPI_Wtime() - escalar_t0;
}
#if (RPS == 0) }
// mesh refinement boundary part
RestrictProlong(lev, YN, BB); #if USE_CUDA_BSSN
if (use_cuda_resident_sync)
#ifdef WithShell {
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
if (!bssn_escalar_cuda_keep_resident_after_step(lev, trfls, a_lev))
bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
if (escalar_step_timing)
escalar_t_resident += MPI_Wtime() - escalar_t0;
}
#endif
#if (RPS == 0)
// mesh refinement boundary part
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
RestrictProlong(lev, YN, BB);
if (escalar_step_timing)
escalar_t_rp += MPI_Wtime() - escalar_t0;
#ifdef WithShell
if (lev == 0) if (lev == 0)
{ {
clock_t prev_clock, curr_clock; clock_t prev_clock, curr_clock;
@@ -1477,8 +1797,9 @@ void bssnEScalar_class::Step(int lev, int YN)
// StateList 0 ----------- // StateList 0 -----------
// //
// OldStateList old ----------- // OldStateList old -----------
// update // update
Pp = GH->PatL[lev]; escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
Pp = GH->PatL[lev];
while (Pp) while (Pp)
{ {
MyList<Block> *BP = Pp->data->blb; MyList<Block> *BP = Pp->data->blb;
@@ -1520,10 +1841,18 @@ void bssnEScalar_class::Step(int lev, int YN)
{ {
Porg0[ithBH][0] = Porg1[ithBH][0]; Porg0[ithBH][0] = Porg1[ithBH][0];
Porg0[ithBH][1] = Porg1[ithBH][1]; Porg0[ithBH][1] = Porg1[ithBH][1];
Porg0[ithBH][2] = Porg1[ithBH][2]; Porg0[ithBH][2] = Porg1[ithBH][2];
} }
} }
} if (escalar_step_timing)
{
escalar_t_swap += MPI_Wtime() - escalar_t0;
bssn_escalar_timing_report(myrank, lev, YN, MPI_Wtime() - escalar_step_t0,
escalar_t_rhs, escalar_t_sync, escalar_t_bh,
escalar_t_analysis, escalar_t_swap,
escalar_t_resident, escalar_t_rp);
}
}
//================================================================================================ //================================================================================================
@@ -2074,14 +2403,44 @@ void bssnEScalar_class::Constraint_Out()
MyList<Block> *BP = Pp->data->blb; MyList<Block> *BP = Pp->data->blb;
while (BP) while (BP)
{ {
Block *cg = BP->data; Block *cg = BP->data;
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
if (lev > 0) bool used_cuda_constraints = false;
f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], #if USE_CUDA_BSSN
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], {
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], if (!fill_bssn_escalar_cuda_views(cg, StateList, state_in))
{
cout << "CUDA BSSN-EScalar constraint state list mismatch" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
double *constraint_out[8] = {
cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn],
cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn],
cg->fgfs[Cons_Gz->sgfn], cg->fgfs[Cons_fR->sgfn]};
int lev_arg = lev;
int sym_arg = Symmetry;
double eps_arg = ndeps;
if (bssn_escalar_cuda_compute_constraints(cg->shape, cg->X[0], cg->X[1], cg->X[2],
state_in, constraint_out,
sym_arg, lev_arg, eps_arg))
{
cout << "CUDA BSSN-EScalar constraint compute failed in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
used_cuda_constraints = true;
}
#endif
if (!used_cuda_constraints && lev > 0)
f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn], cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn], cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
@@ -2110,15 +2469,16 @@ void bssnEScalar_class::Constraint_Out()
cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn], cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn],
cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn], cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Ham->sgfn],
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
Symmetry, lev, ndeps, pre); Symmetry, lev, ndeps, pre);
f_compute_constraint_fr(cg->shape, cg->X[0], cg->X[1], cg->X[2], if (!used_cuda_constraints)
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], f_compute_constraint_fr(cg->shape, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[rho->sgfn], cg->fgfs[Sphi0->sgfn], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[rho->sgfn], cg->fgfs[Sphi0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn], cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn],

View File

@@ -70,6 +70,125 @@ int amss_analysis_map_every()
return every; return every;
} }
// Reports whether the AMSS_RP_TIMING environment variable is set to a
// non-zero integer. The environment is consulted on the first call only;
// every later call returns the cached answer.
bool amss_rp_timing_enabled()
{
    // -1: not queried yet, 0: disabled, 1: enabled.
    static int cached_flag = -1;
    if (cached_flag >= 0)
        return cached_flag != 0;

    const char *raw = getenv("AMSS_RP_TIMING");
    if (raw && atoi(raw) != 0)
        cached_flag = 1;
    else
        cached_flag = 0;
    return cached_flag != 0;
}
// Reports whether the AMSS_RP_DETAIL_TIMING environment variable is set to a
// non-zero integer. The lookup happens once; the result is cached for all
// subsequent calls.
bool amss_rp_detail_timing_enabled()
{
    // Negative means "environment not inspected yet".
    static int state = -1;
    if (state < 0)
    {
        const char *text = getenv("AMSS_RP_DETAIL_TIMING");
        state = 0;
        if (text && atoi(text) != 0)
            state = 1;
    }
    return state != 0;
}
// Returns true when the environment variable `name` exists and its value
// parses (base 10) to a non-zero integer.
//
// name: environment variable to query. A null or empty name is treated as
//       "flag not set" instead of being forwarded to getenv().
//
// The value is parsed with strtol() rather than atoi(): atoi() has undefined
// behaviour when the text does not fit in an int, whereas strtol() saturates,
// so an over-long non-zero number still counts as enabled. Text without a
// leading integer parses to 0 and therefore reads as disabled.
bool amss_env_flag_enabled(const char *name)
{
    if (!name || !*name)
        return false;
    const char *env = getenv(name);
    if (!env)
        return false;
    return strtol(env, 0, 10) != 0;
}
// Cached view of the AMSS_RP_CACHED_RESTRICT environment flag: the flag is
// evaluated on the first call and that answer is reused afterwards.
bool amss_cached_rp_restrict_enabled()
{
    static const bool flag_on = amss_env_flag_enabled("AMSS_RP_CACHED_RESTRICT");
    return flag_on;
}
// Cached view of the AMSS_RP_CACHED_OUTBD environment flag: the flag is
// evaluated on the first call and that answer is reused afterwards.
bool amss_cached_rp_outbd_enabled()
{
    static const bool flag_on = amss_env_flag_enabled("AMSS_RP_CACHED_OUTBD");
    return flag_on;
}
// Cached view of the AMSS_RP_CACHED_FINE_SYNC environment flag: the flag is
// evaluated on the first call and that answer is reused afterwards.
bool amss_cached_rp_fine_sync_enabled()
{
    static const bool flag_on = amss_env_flag_enabled("AMSS_RP_CACHED_FINE_SYNC");
    return flag_on;
}
// Cached view of the AMSS_RP_CACHED_COARSE_SYNC environment flag: the flag is
// evaluated on the first call and that answer is reused afterwards.
bool amss_cached_rp_coarse_sync_enabled()
{
    static const bool flag_on = amss_env_flag_enabled("AMSS_RP_CACHED_COARSE_SYNC");
    return flag_on;
}
// Cached view of the AMSS_RP_SKIP_COARSE_SYNC environment flag: the flag is
// evaluated on the first call and that answer is reused afterwards.
bool amss_rp_skip_coarse_sync_enabled()
{
    static const bool flag_on = amss_env_flag_enabled("AMSS_RP_SKIP_COARSE_SYNC");
    return flag_on;
}
// Cached view of the AMSS_EVOLVE_TIMING environment flag: the flag is
// evaluated on the first call and that answer is reused afterwards.
bool amss_evolve_timing_enabled()
{
    static const bool flag_on = amss_env_flag_enabled("AMSS_EVOLVE_TIMING");
    return flag_on;
}
// Wall-clock accumulators (seconds) for the coarse phases of one evolve step.
// Each field is only ever added to through its matching
// amss_evolve_timing_add_*() helper and zeroed by amss_evolve_timing_reset().
struct AmssEvolveTimingStats
{
double step;       // time added via amss_evolve_timing_add_step()
double rp;         // time added via amss_evolve_timing_add_rp()
double regrid;     // time added via amss_evolve_timing_add_regrid()
double constraint; // time added via amss_evolve_timing_add_constraint()
};
// Accessor for the single shared set of evolve-timing accumulators.
// The object is a function-local static, zero-initialised on first use.
AmssEvolveTimingStats &amss_evolve_timing_stats()
{
    static AmssEvolveTimingStats accumulators = AmssEvolveTimingStats();
    return accumulators;
}
void amss_evolve_timing_reset()
{
AmssEvolveTimingStats &stats = amss_evolve_timing_stats();
stats.step = 0.0;
stats.rp = 0.0;
stats.regrid = 0.0;
stats.constraint = 0.0;
}
void amss_evolve_timing_add_step(double sec)
{
amss_evolve_timing_stats().step += sec;
}
void amss_evolve_timing_add_rp(double sec)
{
amss_evolve_timing_stats().rp += sec;
}
void amss_evolve_timing_add_regrid(double sec)
{
amss_evolve_timing_stats().regrid += sec;
}
void amss_evolve_timing_add_constraint(double sec)
{
amss_evolve_timing_stats().constraint += sec;
}
} }
// Compile-time switch for per-timestep memory usage collection/printing. // Compile-time switch for per-timestep memory usage collection/printing.
@@ -288,6 +407,37 @@ bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
return idx == BSSN_CUDA_STATE_COUNT && vars == 0; return idx == BSSN_CUDA_STATE_COUNT && vars == 0;
} }
// Counts the entries of the `vars` list.
// Returns the number of nodes, or -1 as soon as the running count exceeds
// BSSN_ESCALAR_CUDA_STATE_COUNT (the walk stops early in that case).
// An empty (null) list yields 0.
int count_bssn_cuda_state_list(MyList<var> *vars)
{
    int seen = 0;
    for (MyList<var> *node = vars; node; node = node->next)
    {
        if (++seen > BSSN_ESCALAR_CUDA_STATE_COUNT)
            return -1;
    }
    return seen;
}
// Collects, in list order, the host field pointer cg->fgfs[sgfn] of every
// variable in `vars` into host_views[0 .. state_count).
//
// cg:          block whose fgfs table is indexed; must be non-null.
// vars:        list of variables to look up.
// state_count: must equal BSSN_CUDA_STATE_COUNT or
//              BSSN_ESCALAR_CUDA_STATE_COUNT.
// host_views:  output array with room for state_count pointers; non-null.
//
// Returns true only when the list holds exactly state_count entries.
// On a false return host_views may have been partially written.
bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,
                                int state_count,
                                double **host_views)
{
    const bool supported_count = state_count == BSSN_CUDA_STATE_COUNT ||
                                 state_count == BSSN_ESCALAR_CUDA_STATE_COUNT;
    if (cg == 0 || host_views == 0 || !supported_count)
        return false;

    MyList<var> *node = vars;
    for (int slot = 0; slot < state_count; ++slot)
    {
        // List ran out before every slot was filled.
        if (!node)
            return false;
        host_views[slot] = cg->fgfs[node->data->sgfn];
        node = node->next;
    }
    // Leftover entries mean the list is longer than state_count.
    return node == 0;
}
bool bssn_cuda_use_resident_sync(int lev) bool bssn_cuda_use_resident_sync(int lev)
{ {
#ifdef WithShell #ifdef WithShell
@@ -467,6 +617,11 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn) block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
{ {
var *vars[3] = {forx, fory, forz}; var *vars[3] = {forx, fory, forz};
double *bh_host_key[3] = {
block->fgfs[forx->sgfn],
block->fgfs[fory->sgfn],
block->fgfs[forz->sgfn]
};
double soa3[9]; double soa3[9];
for (int f = 0; f < 3; f++) for (int f = 0; f < 3; f++)
{ {
@@ -482,6 +637,7 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
DH[0], DH[1], DH[2], DH[0], DH[1], DH[2],
x, y, z, x, y, z,
interp_ordn, interp_sym, interp_ordn, interp_sym,
bh_host_key,
soa3, shellf) != 0) soa3, shellf) != 0)
{ {
const int sx = ordn; const int sx = ordn;
@@ -552,6 +708,7 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx) void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
{ {
const int state_count = count_bssn_cuda_state_list(vars);
MyList<Patch> *Pp = PatL; MyList<Patch> *Pp = PatL;
while (Pp) while (Pp)
{ {
@@ -561,13 +718,16 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
Block *cg = BP->data; Block *cg = BP->data;
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{ {
double *state_out[BSSN_CUDA_STATE_COUNT]; double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views(cg, vars, state_out)) if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
{ {
cout << "CUDA BSSN state list mismatch on resident state download" << endl; cout << "CUDA BSSN state list mismatch on resident state download" << endl;
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
if (bssn_cuda_download_resident_state(cg, cg->shape, state_out)) const int rc = (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT)
? bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out)
: bssn_cuda_download_resident_state(cg, cg->shape, state_out);
if (rc)
{ {
cout << "CUDA resident state download failed" << endl; cout << "CUDA resident state download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
@@ -585,6 +745,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var> *vars, int myrank) void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var> *vars, int myrank)
{ {
const int state_count = count_bssn_cuda_state_list(vars);
MyList<Patch> *Pp = PatL; MyList<Patch> *Pp = PatL;
while (Pp) while (Pp)
{ {
@@ -594,13 +755,13 @@ void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var>
Block *cg = BP->data; Block *cg = BP->data;
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{ {
double *state_out[BSSN_CUDA_STATE_COUNT]; double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views(cg, vars, state_out)) if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
{ {
cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl; cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl;
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
if (bssn_cuda_download_resident_state_if_present(cg, cg->shape, state_out)) if (bssn_cuda_download_resident_state_count_if_present(cg, cg->shape, state_out, state_count))
{ {
cout << "CUDA resident state conditional download failed" << endl; cout << "CUDA resident state conditional download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
@@ -2890,6 +3051,10 @@ void bssn_class::Evolve(int Steps)
for (int ncount = 1; ncount < Steps + 1; ncount++) for (int ncount = 1; ncount < Steps + 1; ncount++)
{ {
const bool evolve_timing = amss_evolve_timing_enabled();
const double evolve_t0 = evolve_timing ? MPI_Wtime() : 0.0;
if (evolve_timing)
amss_evolve_timing_reset();
cuda_level0_constraint_cache_valid = false; cuda_level0_constraint_cache_valid = false;
#if BSSN_FINE_TIMING #if BSSN_FINE_TIMING
step_timing::reset(); step_timing::reset();
@@ -2918,9 +3083,12 @@ void bssn_class::Evolve(int Steps)
// misc::tillherecheck("before Constraint_Out"); // misc::tillherecheck("before Constraint_Out");
const double constraint_t0 = evolve_timing ? MPI_Wtime() : 0.0;
STEP_TIMER_DECL(timer_constraint_out); STEP_TIMER_DECL(timer_constraint_out);
Constraint_Out(); // this will affect the Dump_List Constraint_Out(); // this will affect the Dump_List
STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out); STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out);
if (evolve_timing)
amss_evolve_timing_add_constraint(MPI_Wtime() - constraint_t0);
LastDump += dT_mon; LastDump += dT_mon;
Last2dDump += dT_mon; Last2dDump += dT_mon;
@@ -3093,6 +3261,22 @@ void bssn_class::Evolve(int Steps)
if (ncount % BSSN_FINE_TIMING_EVERY == 0) if (ncount % BSSN_FINE_TIMING_EVERY == 0)
rhs_kernel_timing_report::report(myrank, nprocs, ncount, MPI_Wtime() - step_wall_start); rhs_kernel_timing_report::report(myrank, nprocs, ncount, MPI_Wtime() - step_wall_start);
#endif #endif
if (evolve_timing)
{
const AmssEvolveTimingStats &stats = amss_evolve_timing_stats();
const double local[4] = {stats.step, stats.rp, stats.regrid, stats.constraint};
double maxv[4] = {};
MPI_Reduce((void *)local, maxv, 4, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if (myrank == 0)
{
const double wall = MPI_Wtime() - evolve_t0;
const double known = maxv[0] + maxv[1] + maxv[2] + maxv[3];
fprintf(stderr,
"[AMSS-EVOLVE-TIMING] step=%d wall=%.6f step_fn=%.6f rp=%.6f "
"regrid=%.6f constraint=%.6f other=%.6f\n",
ncount, wall, maxv[0], maxv[1], maxv[2], maxv[3], wall - known);
}
}
} }
/* /*
#ifdef With_AHF #ifdef With_AHF
@@ -3162,7 +3346,11 @@ void bssn_class::RecursiveStep(int lev)
{ {
// if(myrank==0) cout<<"level now = "<<lev<<" NoIteration = "<<i<<endl; // if(myrank==0) cout<<"level now = "<<lev<<" NoIteration = "<<i<<endl;
YN = (i == NoIterations - 1) ? 1 : 0; // 1: same time level for coarse level and fine level YN = (i == NoIterations - 1) ? 1 : 0; // 1: same time level for coarse level and fine level
const bool evolve_timing = amss_evolve_timing_enabled();
const double step_t0 = evolve_timing ? MPI_Wtime() : 0.0;
Step(lev, YN); Step(lev, YN);
if (evolve_timing)
amss_evolve_timing_add_step(MPI_Wtime() - step_t0);
#if (AGM == 2) #if (AGM == 2)
if (GH->levels == 1) if (GH->levels == 1)
@@ -3195,7 +3383,10 @@ void bssn_class::RecursiveStep(int lev)
// //
// till here the PhysTime has updated dT_lev // till here the PhysTime has updated dT_lev
// if(myrank==0) cout<<"level now = "<<lev<<", "<<fgt(PhysTime-dT_lev,StartTime,dT_lev/2)<<endl; // if(myrank==0) cout<<"level now = "<<lev<<", "<<fgt(PhysTime-dT_lev,StartTime,dT_lev/2)<<endl;
const double rp_t0 = evolve_timing ? MPI_Wtime() : 0.0;
RestrictProlong(lev, YN, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), StateList, OldStateList, SynchList_cor); RestrictProlong(lev, YN, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), StateList, OldStateList, SynchList_cor);
if (evolve_timing)
amss_evolve_timing_add_rp(MPI_Wtime() - rp_t0);
// RestrictProlong(lev,YN,false,StateList,OldStateList,SynchList_cor); // RestrictProlong(lev,YN,false,StateList,OldStateList,SynchList_cor);
#ifdef WithShell #ifdef WithShell
@@ -3224,6 +3415,8 @@ void bssn_class::RecursiveStep(int lev)
#endif #endif
#if (REGLEV == 0) #if (REGLEV == 0)
const bool evolve_timing = amss_evolve_timing_enabled();
const double regrid_t0 = evolve_timing ? MPI_Wtime() : 0.0;
STEP_TIMER_DECL(timer_regrid_onelevel); STEP_TIMER_DECL(timer_regrid_onelevel);
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
if (bssn_cuda_should_flush_before_regrid(GH, lev, Symmetry, BH_num, Porg0)) if (bssn_cuda_should_flush_before_regrid(GH, lev, Symmetry, BH_num, Porg0))
@@ -3242,6 +3435,8 @@ void bssn_class::RecursiveStep(int lev)
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
#endif #endif
} }
if (evolve_timing)
amss_evolve_timing_add_regrid(MPI_Wtime() - regrid_t0);
STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel); STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel);
#endif #endif
} }
@@ -6847,6 +7042,15 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// //
// SynchList_cor old ----------- // SynchList_cor old -----------
{ {
const bool rp_runtime_timing = amss_rp_timing_enabled();
const double rp_runtime_start = rp_runtime_timing ? MPI_Wtime() : 0.0;
const bool rp_detail_timing = amss_rp_detail_timing_enabled();
double rp_t_prepare = 0.0;
double rp_t_restrict = 0.0;
double rp_t_coarse_sync = 0.0;
double rp_t_outbd = 0.0;
double rp_t_fine_sync = 0.0;
double rp_t0 = 0.0;
STEP_TIMER_DECL(timer_restrict_prolong); STEP_TIMER_DECL(timer_restrict_prolong);
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// stringstream a_stream; // stringstream a_stream;
@@ -6858,6 +7062,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
MyList<Patch> *Pp, *Ppc; MyList<Patch> *Pp, *Ppc;
if (lev > trfls && YN == 0) // time refinement levels and for intermediat time level if (lev > trfls && YN == 0) // time refinement levels and for intermediat time level
{ {
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Pp = GH->PatL[lev - 1]; Pp = GH->PatL[lev - 1];
while (Pp) while (Pp)
{ {
@@ -6873,6 +7078,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
Pp = Pp->next; Pp = Pp->next;
} }
if (rp_detail_timing) rp_t_prepare += MPI_Wtime() - rp_t0;
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// Pp=GH->PatL[lev]; // Pp=GH->PatL[lev];
@@ -6889,14 +7095,18 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
#if (RPB == 0) #if (RPB == 0)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry); Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
#else #else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]); Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
#endif #endif
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
#elif (RPB == 1) #elif (RPB == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry); // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
#endif #endif
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
@@ -6907,10 +7117,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry); Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
#else #else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
#endif #endif
#endif #endif
@@ -6922,6 +7136,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
#if (RPB == 0) #if (RPB == 0)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
Ppc = GH->PatL[lev - 1]; Ppc = GH->PatL[lev - 1];
@@ -6941,9 +7156,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
#endif #endif
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
#elif (RPB == 1) #elif (RPB == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
#endif #endif
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
@@ -6964,14 +7182,18 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
#if (RPB == 0) #if (RPB == 0)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#else #else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]); Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
#endif #endif
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
#elif (RPB == 1) #elif (RPB == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
#endif #endif
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
@@ -6982,10 +7204,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry); Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
#else #else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
#endif #endif
#endif #endif
@@ -6997,6 +7223,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
#if (RPB == 0) #if (RPB == 0)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
Ppc = GH->PatL[lev - 1]; Ppc = GH->PatL[lev - 1];
@@ -7016,9 +7243,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#endif #endif
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
#elif (RPB == 1) #elif (RPB == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
#endif #endif
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
@@ -7030,9 +7260,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
} }
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync(GH->PatL[lev], SL, Symmetry); Parallel::Sync(GH->PatL[lev], SL, Symmetry);
if (rp_detail_timing) rp_t_fine_sync += MPI_Wtime() - rp_t0;
#else #else
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
if (rp_detail_timing) rp_t_fine_sync += MPI_Wtime() - rp_t0;
#endif #endif
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
@@ -7042,6 +7276,27 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif #endif
} }
// Optional whole-call timing report. Each rank measures the wall-clock time
// elapsed since rp_runtime_start, the maximum over MPI_COMM_WORLD is reduced
// onto rank 0, and rank 0 prints one [AMSS-RP-TIMING] line to stderr.
// NOTE(review): MPI_Reduce is a collective, so every rank of MPI_COMM_WORLD
// must reach this point with the same rp_runtime_timing value -- confirm this
// still holds when levels run on sub-communicators (PSTR == 1 || PSTR == 2).
if (rp_runtime_timing)
{
const double local_sec = MPI_Wtime() - rp_runtime_start;
double max_sec = 0.0;
// The cast strips const from the send buffer; presumably needed for MPI
// headers that declare the send buffer as a non-const void * -- TODO confirm.
MPI_Reduce((void *)&local_sec, &max_sec, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
// Only the reduction root (rank 0) holds the reduced value, so only it prints.
if (myrank == 0)
fprintf(stderr, "[AMSS-RP-TIMING] lev=%d YN=%d BB=%d sec=%.6f\n",
lev, YN, BB ? 1 : 0, max_sec);
}
// Optional per-phase timing report. The five accumulators are packed in the
// order prepare, restrict, coarse_sync, outbd, fine_sync and reduced
// element-wise with MPI_MAX onto rank 0, which prints one [AMSS-RP-DETAIL]
// line to stderr.
// Each maximum is taken independently, so the reported phases may come from
// different ranks and need not add up to any single rank's total.
// NOTE(review): MPI_Reduce is a collective on MPI_COMM_WORLD; all ranks must
// enter this branch together -- confirm for PSTR == 1 || PSTR == 2 builds.
if (rp_detail_timing)
{
double local_detail[5] = {rp_t_prepare, rp_t_restrict, rp_t_coarse_sync, rp_t_outbd, rp_t_fine_sync};
// Zero-initialised; only the root's copy is filled by the reduction.
double max_detail[5] = {};
MPI_Reduce(local_detail, max_detail, 5, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if (myrank == 0)
fprintf(stderr,
"[AMSS-RP-DETAIL] lev=%d YN=%d BB=%d prepare=%.6f restrict=%.6f "
"coarse_sync=%.6f outbd=%.6f fine_sync=%.6f\n",
lev, YN, BB ? 1 : 0, max_detail[0], max_detail[1],
max_detail[2], max_detail[3], max_detail[4]);
}
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong); STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
} }
@@ -7229,7 +7484,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#if (RPB == 0) #if (RPB == 0)
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry); if (amss_cached_rp_restrict_enabled())
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
else
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
#else #else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]); Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
#endif #endif
@@ -7239,7 +7497,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#endif #endif
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry); if (amss_rp_skip_coarse_sync_enabled())
{
}
else if (amss_cached_rp_coarse_sync_enabled())
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
else
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#else #else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
@@ -7249,16 +7513,23 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
Ppc = GH->PatL[lev - 1]; if (amss_cached_rp_outbd_enabled())
while (Ppc)
{ {
Pp = GH->PatL[lev]; Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
while (Pp) }
else
{
Ppc = GH->PatL[lev - 1];
while (Ppc)
{ {
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry); Pp = GH->PatL[lev];
Pp = Pp->next; while (Pp)
{
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
Pp = Pp->next;
}
Ppc = Ppc->next;
} }
Ppc = Ppc->next;
} }
#else #else
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]); Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
@@ -7277,7 +7548,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl; cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
#if (RPB == 0) #if (RPB == 0)
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); if (amss_cached_rp_restrict_enabled())
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
else
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
#else #else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]); Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
#endif #endif
@@ -7287,7 +7561,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#endif #endif
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry); if (amss_rp_skip_coarse_sync_enabled())
{
}
else if (amss_cached_rp_coarse_sync_enabled())
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
else
Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
#else #else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
@@ -7297,16 +7577,23 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#if (RPB == 0) #if (RPB == 0)
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
Ppc = GH->PatL[lev - 1]; if (amss_cached_rp_outbd_enabled())
while (Ppc)
{ {
Pp = GH->PatL[lev]; Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
while (Pp) }
else
{
Ppc = GH->PatL[lev - 1];
while (Ppc)
{ {
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry); Pp = GH->PatL[lev];
Pp = Pp->next; while (Pp)
{
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
Pp = Pp->next;
}
Ppc = Ppc->next;
} }
Ppc = Ppc->next;
} }
#else #else
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]); Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
@@ -7321,7 +7608,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
} }
#if (ABEtype == 1 || ABEtype == 2) #if (ABEtype == 1 || ABEtype == 2)
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); if (amss_cached_rp_fine_sync_enabled())
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
else
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
#else #else
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
#endif #endif

View File

@@ -144,7 +144,7 @@ public:
bssn_class(double Couranti, double StartTimei, double TotalTimei, double DumpTimei, double d2DumpTimei, double CheckTimei, double AnasTimei, bssn_class(double Couranti, double StartTimei, double TotalTimei, double DumpTimei, double d2DumpTimei, double CheckTimei, double AnasTimei,
int Symmetryi, int checkruni, char *checkfilenamei, double numepssi, double numepsbi, double numepshi, int Symmetryi, int checkruni, char *checkfilenamei, double numepssi, double numepsbi, double numepshi,
int a_levi, int maxli, int decni, double maxrexi, double drexi); int a_levi, int maxli, int decni, double maxrexi, double drexi);
~bssn_class(); virtual ~bssn_class();
void Evolve(int Steps); void Evolve(int Steps);
void RecursiveStep(int lev); void RecursiveStep(int lev);

File diff suppressed because it is too large Load Diff

View File

@@ -7,6 +7,7 @@ extern "C" {
enum { enum {
BSSN_CUDA_STATE_COUNT = 24, BSSN_CUDA_STATE_COUNT = 24,
BSSN_ESCALAR_CUDA_STATE_COUNT = 26,
BSSN_CUDA_MATTER_COUNT = 10 BSSN_CUDA_MATTER_COUNT = 10
}; };
@@ -55,6 +56,32 @@ int bssn_cuda_rk4_substep(void *block_tag,
int &apply_enforce_ga, int &apply_enforce_ga,
double &chitiny); double &chitiny);
// Declaration of the EScalar RK4 substep entry point.
// NOTE(review): presumably the counterpart of bssn_cuda_rk4_substep for the
// BSSN_ESCALAR_CUDA_STATE_COUNT state arrays -- confirm against the
// implementation; only the signature is visible here.
// Scalar settings (dT, T, RK4, Symmetry, Lev, eps, co, the apply_*/keep_*
// flags and chitiny) are taken by non-const reference, so callers must pass
// lvalues. Whether any of them is written back cannot be told from here.
// The meaning of the int return value is not visible in this header.
int bssn_escalar_cuda_rk4_substep(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double **state_host_out,
const double *propspeed,
const double *soa_flat,
const double *bbox,
double &dT,
double &T,
int &RK4,
int &apply_bam_bc,
int &Symmetry,
int &Lev,
double &eps,
int &co,
int &keep_resident_state,
int &apply_enforce_ga,
double &chitiny);
// Declaration of the EScalar constraint entry point.
// It takes the grid extent and coordinate arrays (ex, X, Y, Z), an array of
// input state pointers and an array of constraint output pointers. Unlike the
// rk4 substep declaration it has no block_tag parameter.
// NOTE(review): the required lengths of state_host_in and constraint_host_out
// are not visible here -- confirm against the implementation.
// Symmetry, Lev and eps are non-const references, so callers must pass
// lvalues. The meaning of the int return value is not visible in this header.
int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double **constraint_host_out,
int &Symmetry,
int &Lev,
double &eps);
int bssn_cuda_copy_state_region_to_host(void *block_tag, int bssn_cuda_copy_state_region_to_host(void *block_tag,
int state_index, int state_index,
double *host_state, double *host_state,
@@ -73,6 +100,15 @@ int bssn_cuda_download_resident_state(void *block_tag,
int *ex, int *ex,
double **state_host_out); double **state_host_out);
// EScalar variant of bssn_cuda_download_resident_state, declared with the
// same parameter list (block_tag, ex, state_host_out).
// NOTE(review): presumably state_host_out must hold
// BSSN_ESCALAR_CUDA_STATE_COUNT pointers -- confirm against the implementation.
int bssn_escalar_cuda_download_resident_state(void *block_tag,
int *ex,
double **state_host_out);
// Variant of bssn_cuda_download_resident_state_if_present that takes an
// explicit state_count in addition to (block_tag, ex, state_host_out).
// NOTE(review): presumably state_count selects how many entries of
// state_host_out are filled (BSSN_CUDA_STATE_COUNT or
// BSSN_ESCALAR_CUDA_STATE_COUNT) -- confirm against the implementation.
int bssn_cuda_download_resident_state_count_if_present(void *block_tag,
int *ex,
double **state_host_out,
int state_count);
int bssn_cuda_download_resident_state_if_present(void *block_tag, int bssn_cuda_download_resident_state_if_present(void *block_tag,
int *ex, int *ex,
double **state_host_out); double **state_host_out);
@@ -103,6 +139,7 @@ int bssn_cuda_interp_state_point3(void *block_tag,
double pz, double pz,
int ordn, int ordn,
int symmetry, int symmetry,
double **state_host_key,
const double *soa3, const double *soa3,
double *out3); double *out3);
@@ -302,6 +339,7 @@ int bssn_cuda_upload_state_subset(void *block_tag,
int bssn_cuda_prepare_inter_time_level(void *block_tag, int bssn_cuda_prepare_inter_time_level(void *block_tag,
int *ex, int *ex,
int state_count,
double **src1_host_key, double **src1_host_key,
double **src2_host_key, double **src2_host_key,
double **src3_host_key, double **src3_host_key,

View File

@@ -35,7 +35,6 @@ f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
endif endif
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \
-Dfortran3 -Dnewc $(MKL_INC) -Dfortran3 -Dnewc $(MKL_INC)
else else
## NVHPC defaults: mpicc/mpicxx/mpifort wrappers ## NVHPC defaults: mpicc/mpicxx/mpifort wrappers

View File

@@ -146,6 +146,7 @@ def _gpu_runtime_env():
"AMSS_CUDA_AWARE_MPI": "1", "AMSS_CUDA_AWARE_MPI": "1",
"AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP": "1", "AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP": "1",
"AMSS_CUDA_KEEP_ALL_LEVELS": "1", "AMSS_CUDA_KEEP_ALL_LEVELS": "1",
"AMSS_CUDA_AMR_HOST_STAGED": "1",
"AMSS_CUDA_AMR_RESTRICT_DEVICE": "1", "AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
"AMSS_CUDA_AMR_RESTRICT_BATCH": "0", "AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0", "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
@@ -277,6 +278,7 @@ def run_ABE():
print(f" AMSS_CUDA_AWARE_MPI={mpi_env.get('AMSS_CUDA_AWARE_MPI', '')}") print(f" AMSS_CUDA_AWARE_MPI={mpi_env.get('AMSS_CUDA_AWARE_MPI', '')}")
print(f" AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP', '')}") print(f" AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP', '')}")
print(f" AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}") print(f" AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}")
print(f" AMSS_CUDA_AMR_HOST_STAGED={mpi_env.get('AMSS_CUDA_AMR_HOST_STAGED', '')}")
print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}") print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}")
print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}") print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
print(f" AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}") print(f" AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}")