Add fine-grained step timing and trim BH RHS overhead
(cherry picked from commit 968522995b)
This commit is contained in:
@@ -51,6 +51,112 @@ using namespace std;
|
|||||||
#define BSSN_ENABLE_MEM_USAGE_LOG 0
|
#define BSSN_ENABLE_MEM_USAGE_LOG 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Compile-time switches for the per-step timing instrumentation.
// Each #ifndef guard only supplies a fallback value; a definition that is
// already in effect when this point is reached wins.
//
// BSSN_FINE_TIMING: when non-zero, the step_timing namespace and the active
// STEP_TIMER_* macros are compiled in (see the `#if BSSN_FINE_TIMING` that
// follows these defaults); the fallback here disables them.
#ifndef BSSN_FINE_TIMING
|
||||||
|
#define BSSN_FINE_TIMING 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// BSSN_FINE_TIMING_EVERY: used as the modulus in Evolve's
// `ncount % BSSN_FINE_TIMING_EVERY == 0` test that gates report(), so it
// must never be 0.
#ifndef BSSN_FINE_TIMING_EVERY
|
||||||
|
#define BSSN_FINE_TIMING_EVERY 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// BSSN_FINE_TIMING_TOPN: described in the macro header as the number of
// hottest buckets shown on stdout.
// NOTE(review): no use of this macro appears in the code shown here — confirm
// whether a consumer exists elsewhere.
#ifndef BSSN_FINE_TIMING_TOPN
|
||||||
|
#define BSSN_FINE_TIMING_TOPN 8
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if BSSN_FINE_TIMING
|
||||||
|
namespace step_timing
|
||||||
|
{
|
||||||
|
// Identifiers of the timing buckets. Enumerators are contiguous starting at 0
// so each one can be used directly as an index into local_bucket_seconds and
// bucket_labels; TB_COUNT is the number of buckets, not a bucket itself.
// The order here must stay in step with the strings in bucket_labels.
// NOTE(review): no STEP_TIMER_ADD for the TB_ANALYSIS_* buckets appears in the
// hunks shown here — confirm they are filled elsewhere.
enum Bucket
|
||||||
|
{
|
||||||
|
TB_ANALYSIS_PSI4 = 0,
|
||||||
|
TB_ANALYSIS_SURFACE,
|
||||||
|
TB_ANALYSIS_IO,
|
||||||
|
TB_BH_PREDICTOR,
|
||||||
|
TB_PREDICTOR_RHS,
|
||||||
|
TB_PREDICTOR_SYNC,
|
||||||
|
TB_BH_CORRECTOR,
|
||||||
|
TB_CORRECTOR_RHS,
|
||||||
|
TB_CORRECTOR_SYNC,
|
||||||
|
// Receives both the intermediate swap and the final state commit timers in Step().
TB_STATE_SWAP,
|
||||||
|
TB_RESTRICT_PROLONG,
|
||||||
|
TB_CONSTRAINT_OUT,
|
||||||
|
TB_DUMP_3D,
|
||||||
|
TB_DUMP_2D,
|
||||||
|
TB_CHECKPOINT,
|
||||||
|
// Receives both GH->Regrid (Evolve) and GH->Regrid_Onelevel (RecursiveStep) timers.
TB_REGRID,
|
||||||
|
// Sentinel: number of buckets; used to size the arrays below.
TB_COUNT
|
||||||
|
};
|
||||||
|
|
||||||
|
// Seconds accumulated per bucket on this rank; indexed by Bucket.
// Zeroed by reset(), incremented by add(), reduced across ranks by report().
static double local_bucket_seconds[TB_COUNT];
|
||||||
|
|
||||||
|
// Text label for each bucket, in the same order as enum Bucket.
// The bssn_class constructor writes these into the header of
// bssn_step_timing.dat, once as-is and once with an "avg_" prefix.
static const char *bucket_labels[TB_COUNT] =
|
||||||
|
{
|
||||||
|
"analysis_psi4",
|
||||||
|
"analysis_surface",
|
||||||
|
"analysis_io",
|
||||||
|
"bh_predictor",
|
||||||
|
"predictor_rhs",
|
||||||
|
"predictor_sync",
|
||||||
|
"bh_corrector",
|
||||||
|
"corrector_rhs",
|
||||||
|
"corrector_sync",
|
||||||
|
"state_swap",
|
||||||
|
"restrict_prolong",
|
||||||
|
"constraint_out",
|
||||||
|
"dump_3d",
|
||||||
|
"dump_2d",
|
||||||
|
"checkpoint",
|
||||||
|
"regrid"
|
||||||
|
};
|
||||||
|
|
||||||
|
// Zero every entry of local_bucket_seconds.
// Evolve() calls this at the top of each iteration of its step loop.
void reset()
|
||||||
|
{
|
||||||
|
for (int i = 0; i < TB_COUNT; i++)
|
||||||
|
local_bucket_seconds[i] = 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add `seconds` to this rank's running total for `bucket`.
// The bucket value is used as an array index without a range check, so it
// must be a real bucket (less than TB_COUNT). Normally reached through the
// STEP_TIMER_ADD macro.
void add(Bucket bucket, double seconds)
|
||||||
|
{
|
||||||
|
local_bucket_seconds[int(bucket)] += seconds;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reduce the per-rank bucket totals to rank 0 and write one row to the timing
// monitor.
//
// myrank            rank of the caller; only rank 0 continues past the reductions
// nprocs            divisor used to turn the cross-rank sum into an average
// TimingMonitor     output sink; nothing is written when it is null
// step_index        written as the first column of the row
// phys_time         passed to writefile() as the row's time argument
// step_wall_seconds written as the second column of the row
//
// Both MPI_Reduce calls are collectives on MPI_COMM_WORLD, so every rank in
// that communicator has to call this function, not just rank 0.
// NOTE(review): the visible Evolve() hunk calls reset() on every step but only
// calls report() when ncount % BSSN_FINE_TIMING_EVERY == 0, so with a value
// above 1 the row would reflect only the reporting step — confirm intended.
void report(int myrank, int nprocs, monitor *TimingMonitor,
|
||||||
|
int step_index, double phys_time, double step_wall_seconds)
|
||||||
|
{
|
||||||
|
// Receive buffers for the reductions; only read on rank 0 (root of both reductions).
double max_bucket_seconds[TB_COUNT];
|
||||||
|
double avg_bucket_seconds[TB_COUNT];
|
||||||
|
|
||||||
|
// Element-wise maximum over ranks.
MPI_Reduce(local_bucket_seconds, max_bucket_seconds, TB_COUNT, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
|
||||||
|
// Element-wise sum over ranks; divided by nprocs below to form the average.
MPI_Reduce(local_bucket_seconds, avg_bucket_seconds, TB_COUNT, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
|
||||||
|
|
||||||
|
// Non-root ranks have contributed their data and have nothing to write.
if (myrank != 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
for (int i = 0; i < TB_COUNT; i++)
|
||||||
|
// Mymax(1, nprocs) keeps the divisor at least 1.
avg_bucket_seconds[i] /= Mymax(1, nprocs);
|
||||||
|
|
||||||
|
if (TimingMonitor)
|
||||||
|
{
|
||||||
|
// Row layout: [step, wall, max for each bucket, avg for each bucket],
// the same column order as the header built in the bssn_class constructor.
double row[2 + 2 * TB_COUNT];
|
||||||
|
row[0] = double(step_index);
|
||||||
|
row[1] = step_wall_seconds;
|
||||||
|
for (int i = 0; i < TB_COUNT; i++)
|
||||||
|
{
|
||||||
|
row[2 + i] = max_bucket_seconds[i];
|
||||||
|
row[2 + TB_COUNT + i] = avg_bucket_seconds[i];
|
||||||
|
}
|
||||||
|
TimingMonitor->writefile(phys_time, 2 + 2 * TB_COUNT, row);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Timing helpers used around instrumented regions.
// STEP_TIMER_DECL(v) declares a const double `v` holding the current
// MPI_Wtime(); the macro has no trailing semicolon, the call site supplies it.
#define STEP_TIMER_DECL(var_name) const double var_name = MPI_Wtime()
|
||||||
|
// STEP_TIMER_ADD(B, v) adds the time elapsed since `v` was captured to bucket
// step_timing::B. `v` must still be in scope at the point of use.
#define STEP_TIMER_ADD(bucket_name, var_name) step_timing::add(step_timing::bucket_name, MPI_Wtime() - (var_name))
|
||||||
|
#else
|
||||||
|
// Timing disabled: both macros expand to nothing, so a call site written as
// `STEP_TIMER_DECL(x);` is left as an empty statement and no variable exists.
#define STEP_TIMER_DECL(var_name)
|
||||||
|
#define STEP_TIMER_ADD(bucket_name, var_name)
|
||||||
|
#endif
|
||||||
|
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
@@ -420,6 +526,24 @@ bssn_class::bssn_class(double Couranti, double StartTimei, double TotalTimei,
|
|||||||
a_stream.str("");
|
a_stream.str("");
|
||||||
a_stream << setw(15) << "# time Ham Px Py Pz Gx Gy Gz";
|
a_stream << setw(15) << "# time Ham Px Py Pz Gx Gy Gz";
|
||||||
ConVMonitor = new monitor("bssn_constraint.dat", myrank, a_stream.str());
|
ConVMonitor = new monitor("bssn_constraint.dat", myrank, a_stream.str());
|
||||||
|
|
||||||
|
#if BSSN_FINE_TIMING
|
||||||
|
a_stream.clear();
|
||||||
|
a_stream.str("");
|
||||||
|
a_stream << setw(8) << "# step";
|
||||||
|
a_stream << setw(14) << "wall";
|
||||||
|
for (int ib = 0; ib < step_timing::TB_COUNT; ib++)
|
||||||
|
a_stream << setw(18) << step_timing::bucket_labels[ib];
|
||||||
|
for (int ib = 0; ib < step_timing::TB_COUNT; ib++)
|
||||||
|
{
|
||||||
|
char str_avg[64];
|
||||||
|
sprintf(str_avg, "avg_%s", step_timing::bucket_labels[ib]);
|
||||||
|
a_stream << setw(18) << str_avg;
|
||||||
|
}
|
||||||
|
TimingMonitor = new monitor("bssn_step_timing.dat", myrank, a_stream.str());
|
||||||
|
#else
|
||||||
|
TimingMonitor = 0;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
// setup sphere integration engine
|
// setup sphere integration engine
|
||||||
Waveshell = new surface_integral(Symmetry);
|
Waveshell = new surface_integral(Symmetry);
|
||||||
@@ -1368,6 +1492,7 @@ bssn_class::~bssn_class()
|
|||||||
delete BHMonitor;
|
delete BHMonitor;
|
||||||
delete MAPMonitor;
|
delete MAPMonitor;
|
||||||
delete ConVMonitor;
|
delete ConVMonitor;
|
||||||
|
delete TimingMonitor;
|
||||||
delete Waveshell;
|
delete Waveshell;
|
||||||
|
|
||||||
delete CheckPoint;
|
delete CheckPoint;
|
||||||
@@ -2466,6 +2591,10 @@ void bssn_class::Evolve(int Steps)
|
|||||||
for (int ncount = 1; ncount < Steps + 1; ncount++)
|
for (int ncount = 1; ncount < Steps + 1; ncount++)
|
||||||
{
|
{
|
||||||
cuda_level0_constraint_cache_valid = false;
|
cuda_level0_constraint_cache_valid = false;
|
||||||
|
#if BSSN_FINE_TIMING
|
||||||
|
step_timing::reset();
|
||||||
|
STEP_TIMER_DECL(step_wall_start);
|
||||||
|
#endif
|
||||||
|
|
||||||
// special for large mass ratio consideration
|
// special for large mass ratio consideration
|
||||||
// if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6)
|
// if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6)
|
||||||
@@ -2484,7 +2613,9 @@ void bssn_class::Evolve(int Steps)
|
|||||||
|
|
||||||
// misc::tillherecheck("before Constraint_Out");
|
// misc::tillherecheck("before Constraint_Out");
|
||||||
|
|
||||||
|
STEP_TIMER_DECL(timer_constraint_out);
|
||||||
Constraint_Out(); // this will affect the Dump_List
|
Constraint_Out(); // this will affect the Dump_List
|
||||||
|
STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out);
|
||||||
|
|
||||||
LastDump += dT_mon;
|
LastDump += dT_mon;
|
||||||
Last2dDump += dT_mon;
|
Last2dDump += dT_mon;
|
||||||
@@ -2493,6 +2624,7 @@ void bssn_class::Evolve(int Steps)
|
|||||||
// When LastDump >= DumpTime, output corresponding binary data
|
// When LastDump >= DumpTime, output corresponding binary data
|
||||||
if (LastDump >= DumpTime)
|
if (LastDump >= DumpTime)
|
||||||
{
|
{
|
||||||
|
STEP_TIMER_DECL(timer_dump3d);
|
||||||
// misc::tillherecheck("before Dump_Data");
|
// misc::tillherecheck("before Dump_Data");
|
||||||
|
|
||||||
for (int lev = 0; lev < GH->levels; lev++)
|
for (int lev = 0; lev < GH->levels; lev++)
|
||||||
@@ -2500,6 +2632,7 @@ void bssn_class::Evolve(int Steps)
|
|||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
SH->Dump_Data(DumpList, 0, PhysTime, dT_mon);
|
SH->Dump_Data(DumpList, 0, PhysTime, dT_mon);
|
||||||
#endif
|
#endif
|
||||||
|
STEP_TIMER_ADD(TB_DUMP_3D, timer_dump3d);
|
||||||
|
|
||||||
LastDump = 0;
|
LastDump = 0;
|
||||||
|
|
||||||
@@ -2512,10 +2645,12 @@ void bssn_class::Evolve(int Steps)
|
|||||||
// When Last2dDump >= d2DumpTime, output corresponding 2D data
|
// When Last2dDump >= d2DumpTime, output corresponding 2D data
|
||||||
if (Last2dDump >= d2DumpTime)
|
if (Last2dDump >= d2DumpTime)
|
||||||
{
|
{
|
||||||
|
STEP_TIMER_DECL(timer_dump2d);
|
||||||
// misc::tillherecheck("before 2dDump_Data");
|
// misc::tillherecheck("before 2dDump_Data");
|
||||||
|
|
||||||
for (int lev = 0; lev < GH->levels; lev++)
|
for (int lev = 0; lev < GH->levels; lev++)
|
||||||
Parallel::d2Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon);
|
Parallel::d2Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon);
|
||||||
|
STEP_TIMER_ADD(TB_DUMP_2D, timer_dump2d);
|
||||||
|
|
||||||
Last2dDump = 0;
|
Last2dDump = 0;
|
||||||
|
|
||||||
@@ -2540,10 +2675,12 @@ void bssn_class::Evolve(int Steps)
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
#if (REGLEV == 1)
|
#if (REGLEV == 1)
|
||||||
|
STEP_TIMER_DECL(timer_regrid);
|
||||||
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
|
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
||||||
|
STEP_TIMER_ADD(TB_REGRID, timer_regrid);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
|
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
|
||||||
@@ -2618,6 +2755,7 @@ void bssn_class::Evolve(int Steps)
|
|||||||
// When LastCheck >= CheckTime, perform runtime checks and output status data
|
// When LastCheck >= CheckTime, perform runtime checks and output status data
|
||||||
if (LastCheck >= CheckTime)
|
if (LastCheck >= CheckTime)
|
||||||
{
|
{
|
||||||
|
STEP_TIMER_DECL(timer_checkpoint);
|
||||||
LastCheck = 0;
|
LastCheck = 0;
|
||||||
|
|
||||||
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
|
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
|
||||||
@@ -2626,7 +2764,13 @@ void bssn_class::Evolve(int Steps)
|
|||||||
CheckPoint->writecheck_sh(PhysTime, SH);
|
CheckPoint->writecheck_sh(PhysTime, SH);
|
||||||
#endif
|
#endif
|
||||||
CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
|
CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
|
||||||
|
STEP_TIMER_ADD(TB_CHECKPOINT, timer_checkpoint);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if BSSN_FINE_TIMING
|
||||||
|
if (ncount % BSSN_FINE_TIMING_EVERY == 0)
|
||||||
|
step_timing::report(myrank, nprocs, TimingMonitor, ncount, PhysTime, MPI_Wtime() - step_wall_start);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
#ifdef With_AHF
|
#ifdef With_AHF
|
||||||
@@ -2758,6 +2902,7 @@ void bssn_class::RecursiveStep(int lev)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (REGLEV == 0)
|
#if (REGLEV == 0)
|
||||||
|
STEP_TIMER_DECL(timer_regrid_onelevel);
|
||||||
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
||||||
@@ -2766,6 +2911,7 @@ void bssn_class::RecursiveStep(int lev)
|
|||||||
ConstraintRefreshLevels[lev] = 1;
|
ConstraintRefreshLevels[lev] = 1;
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
|
||||||
}
|
}
|
||||||
|
STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3365,6 +3511,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
|
|
||||||
// new code 2013-2-15, zjcao
|
// new code 2013-2-15, zjcao
|
||||||
#if (MAPBH == 1)
|
#if (MAPBH == 1)
|
||||||
|
STEP_TIMER_DECL(timer_bh_predictor);
|
||||||
// for black hole position
|
// for black hole position
|
||||||
if (BH_num > 0 && lev == GH->levels - 1)
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
{
|
{
|
||||||
@@ -3402,6 +3549,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
{
|
{
|
||||||
AnalysisStuff(lev, dT_lev);
|
AnalysisStuff(lev, dT_lev);
|
||||||
}
|
}
|
||||||
|
STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef With_AHF
|
#ifdef With_AHF
|
||||||
@@ -3418,6 +3566,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
|
|
||||||
MyList<ss_patch> *sPp;
|
MyList<ss_patch> *sPp;
|
||||||
// Predictor
|
// Predictor
|
||||||
|
STEP_TIMER_DECL(timer_predictor_rhs);
|
||||||
MyList<Patch> *Pp = GH->PatL[lev];
|
MyList<Patch> *Pp = GH->PatL[lev];
|
||||||
while (Pp)
|
while (Pp)
|
||||||
{
|
{
|
||||||
@@ -3759,6 +3908,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
STEP_TIMER_ADD(TB_PREDICTOR_RHS, timer_predictor_rhs);
|
||||||
|
|
||||||
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
|
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
|
||||||
MPI_Request err_req;
|
MPI_Request err_req;
|
||||||
@@ -3768,6 +3918,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
STEP_TIMER_DECL(timer_predictor_sync);
|
||||||
Parallel::AsyncSyncState async_pre;
|
Parallel::AsyncSyncState async_pre;
|
||||||
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
|
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
|
||||||
|
|
||||||
@@ -3789,6 +3940,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
|
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
|
||||||
|
STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync);
|
||||||
|
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
const bool need_analysis_state_after_predictor =
|
const bool need_analysis_state_after_predictor =
|
||||||
@@ -3856,6 +4008,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
// corrector
|
// corrector
|
||||||
for (iter_count = 1; iter_count < 4; iter_count++)
|
for (iter_count = 1; iter_count < 4; iter_count++)
|
||||||
{
|
{
|
||||||
|
STEP_TIMER_DECL(timer_corrector_rhs);
|
||||||
// for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
|
// for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
|
||||||
if (iter_count == 1 || iter_count == 3)
|
if (iter_count == 1 || iter_count == 3)
|
||||||
TRK4 += dT_lev / 2;
|
TRK4 += dT_lev / 2;
|
||||||
@@ -4198,7 +4351,9 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
|
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
STEP_TIMER_ADD(TB_CORRECTOR_RHS, timer_corrector_rhs);
|
||||||
|
|
||||||
|
STEP_TIMER_DECL(timer_corrector_sync);
|
||||||
Parallel::AsyncSyncState async_cor;
|
Parallel::AsyncSyncState async_cor;
|
||||||
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
|
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
|
||||||
|
|
||||||
@@ -4220,6 +4375,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
|
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
|
||||||
|
STEP_TIMER_ADD(TB_CORRECTOR_SYNC, timer_corrector_sync);
|
||||||
|
|
||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
// Complete non-blocking error reduction and check
|
// Complete non-blocking error reduction and check
|
||||||
@@ -4240,6 +4396,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (MAPBH == 0)
|
#if (MAPBH == 0)
|
||||||
|
STEP_TIMER_DECL(timer_bh_corrector);
|
||||||
// for black hole position
|
// for black hole position
|
||||||
if (BH_num > 0 && lev == GH->levels - 1)
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
{
|
{
|
||||||
@@ -4272,11 +4429,13 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
STEP_TIMER_ADD(TB_BH_CORRECTOR, timer_bh_corrector);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// swap time level
|
// swap time level
|
||||||
if (iter_count < 3)
|
if (iter_count < 3)
|
||||||
{
|
{
|
||||||
|
STEP_TIMER_DECL(timer_state_swap);
|
||||||
Pp = GH->PatL[lev];
|
Pp = GH->PatL[lev];
|
||||||
while (Pp)
|
while (Pp)
|
||||||
{
|
{
|
||||||
@@ -4323,6 +4482,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
STEP_TIMER_ADD(TB_STATE_SWAP, timer_state_swap);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
@@ -4358,6 +4518,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
//
|
//
|
||||||
// OldStateList old -----------
|
// OldStateList old -----------
|
||||||
// update
|
// update
|
||||||
|
STEP_TIMER_DECL(timer_state_commit);
|
||||||
Pp = GH->PatL[lev];
|
Pp = GH->PatL[lev];
|
||||||
while (Pp)
|
while (Pp)
|
||||||
{
|
{
|
||||||
@@ -4404,6 +4565,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
STEP_TIMER_ADD(TB_STATE_SWAP, timer_state_commit);
|
||||||
// for black hole position
|
// for black hole position
|
||||||
if (BH_num > 0 && lev == GH->levels - 1)
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
{
|
{
|
||||||
@@ -6255,6 +6417,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
|||||||
//
|
//
|
||||||
// SynchList_cor old -----------
|
// SynchList_cor old -----------
|
||||||
{
|
{
|
||||||
|
STEP_TIMER_DECL(timer_restrict_prolong);
|
||||||
#if (PSTR == 1 || PSTR == 2)
|
#if (PSTR == 1 || PSTR == 2)
|
||||||
// stringstream a_stream;
|
// stringstream a_stream;
|
||||||
// a_stream.setf(ios::left);
|
// a_stream.setf(ios::left);
|
||||||
@@ -6401,6 +6564,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
|
|||||||
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
|
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
|
||||||
}
|
}
|
||||||
|
|
||||||
//================================================================================================
|
//================================================================================================
|
||||||
@@ -6420,6 +6584,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
|
|||||||
//
|
//
|
||||||
// SynchList_cor old -----------
|
// SynchList_cor old -----------
|
||||||
{
|
{
|
||||||
|
STEP_TIMER_DECL(timer_restrict_prolong);
|
||||||
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"starting RestrictProlong_aux");
|
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"starting RestrictProlong_aux");
|
||||||
|
|
||||||
if (lev >= GH->levels - 1)
|
if (lev >= GH->levels - 1)
|
||||||
@@ -6492,6 +6657,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
|
|||||||
|
|
||||||
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
|
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
|
||||||
}
|
}
|
||||||
|
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
|
||||||
}
|
}
|
||||||
|
|
||||||
//================================================================================================
|
//================================================================================================
|
||||||
@@ -6502,6 +6668,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
|
|||||||
|
|
||||||
void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
||||||
{
|
{
|
||||||
|
STEP_TIMER_DECL(timer_restrict_prolong);
|
||||||
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
||||||
// we assume for fine
|
// we assume for fine
|
||||||
// SynchList_cor 1 -----------
|
// SynchList_cor 1 -----------
|
||||||
@@ -6585,6 +6752,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
|
|||||||
|
|
||||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
|
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
|
||||||
}
|
}
|
||||||
|
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
|
||||||
}
|
}
|
||||||
|
|
||||||
//================================================================================================
|
//================================================================================================
|
||||||
|
|||||||
@@ -136,7 +136,7 @@ public:
|
|||||||
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
|
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
|
||||||
|
|
||||||
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
|
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
|
||||||
monitor *ConVMonitor;
|
monitor *ConVMonitor, *TimingMonitor;
|
||||||
surface_integral *Waveshell;
|
surface_integral *Waveshell;
|
||||||
checkpoint *CheckPoint;
|
checkpoint *CheckPoint;
|
||||||
|
|
||||||
|
|||||||
@@ -29,6 +29,12 @@
|
|||||||
|
|
||||||
#define REGLEV 0
|
#define REGLEV 0
|
||||||
|
|
||||||
|
#define BSSN_FINE_TIMING 1
|
||||||
|
|
||||||
|
#define BSSN_FINE_TIMING_EVERY 1
|
||||||
|
|
||||||
|
#define BSSN_FINE_TIMING_TOPN 8
|
||||||
|
|
||||||
//#define USE_GPU
|
//#define USE_GPU
|
||||||
|
|
||||||
//#define CHECKDETAIL
|
//#define CHECKDETAIL
|
||||||
@@ -88,6 +94,15 @@
|
|||||||
// 0: for every level;
|
// 0: for every level;
|
||||||
// 1: for all
|
// 1: for all
|
||||||
//
|
//
|
||||||
|
// define BSSN_FINE_TIMING
|
||||||
|
// enable fine-grained per-timestep timing monitor
|
||||||
|
//
|
||||||
|
// define BSSN_FINE_TIMING_EVERY
|
||||||
|
// report timing every N coarse timesteps
|
||||||
|
//
|
||||||
|
// define BSSN_FINE_TIMING_TOPN
|
||||||
|
// number of hottest timing buckets shown in stdout
|
||||||
|
//
|
||||||
// define USE_GPU
|
// define USE_GPU
|
||||||
// use gpu or not
|
// use gpu or not
|
||||||
//
|
//
|
||||||
|
|||||||
@@ -144,6 +144,34 @@ def generate_macrodef_h():
|
|||||||
print( "#define REGLEV 0", file=file1 )
|
print( "#define REGLEV 0", file=file1 )
|
||||||
print( file=file1 )
|
print( file=file1 )
|
||||||
|
|
||||||
|
# Define fine-grained timestep timing macros
|
||||||
|
# These default to enabled profiling without requiring AMSS_NCKU_Input.py edits.
|
||||||
|
|
||||||
|
fine_timing = getattr(input_data, "Fine_Timing",
|
||||||
|
getattr(input_data, "Finegrained_Timing", "yes"))
|
||||||
|
timing_report_every = max(1, int(getattr(
|
||||||
|
input_data, "Timing_Every_Steps",
|
||||||
|
getattr(input_data, "Timing_Report_Every", 1))))
|
||||||
|
timing_top_hotspots = max(1, int(getattr(
|
||||||
|
input_data, "Timing_Top_Hotspots", 8)))
|
||||||
|
|
||||||
|
if ( fine_timing == "yes" ):
|
||||||
|
print( "#define BSSN_FINE_TIMING 1", file=file1 )
|
||||||
|
print( file=file1 )
|
||||||
|
elif ( fine_timing == "no" ):
|
||||||
|
print( "#define BSSN_FINE_TIMING 0", file=file1 )
|
||||||
|
print( file=file1 )
|
||||||
|
else:
|
||||||
|
print( "Fine_Timing setting error!!!" )
|
||||||
|
print()
|
||||||
|
print( "# Fine_Timing setting error!!!", file=file1 )
|
||||||
|
print( file=file1 )
|
||||||
|
|
||||||
|
print( f"#define BSSN_FINE_TIMING_EVERY {timing_report_every}", file=file1 )
|
||||||
|
print( file=file1 )
|
||||||
|
print( f"#define BSSN_FINE_TIMING_TOPN {timing_top_hotspots}", file=file1 )
|
||||||
|
print( file=file1 )
|
||||||
|
|
||||||
# Define macro USE_GPU
|
# Define macro USE_GPU
|
||||||
# use GPU or not
|
# use GPU or not
|
||||||
|
|
||||||
@@ -224,6 +252,15 @@ def generate_macrodef_h():
|
|||||||
print( "// 0: for every level;", file=file1 )
|
print( "// 0: for every level;", file=file1 )
|
||||||
print( "// 1: for all", file=file1 )
|
print( "// 1: for all", file=file1 )
|
||||||
print( "//", file=file1 )
|
print( "//", file=file1 )
|
||||||
|
print( "// define BSSN_FINE_TIMING", file=file1 )
|
||||||
|
print( "// enable fine-grained per-timestep timing monitor", file=file1 )
|
||||||
|
print( "//", file=file1 )
|
||||||
|
print( "// define BSSN_FINE_TIMING_EVERY", file=file1 )
|
||||||
|
print( "// report timing every N coarse timesteps", file=file1 )
|
||||||
|
print( "//", file=file1 )
|
||||||
|
print( "// define BSSN_FINE_TIMING_TOPN", file=file1 )
|
||||||
|
print( "// number of hottest timing buckets shown in stdout", file=file1 )
|
||||||
|
print( "//", file=file1 )
|
||||||
print( "// define USE_GPU", file=file1 )
|
print( "// define USE_GPU", file=file1 )
|
||||||
print( "// use gpu or not", file=file1 )
|
print( "// use gpu or not", file=file1 )
|
||||||
print( "//", file=file1 )
|
print( "//", file=file1 )
|
||||||
|
|||||||
Reference in New Issue
Block a user