Compare commits

...

11 Commits

21 changed files with 2807 additions and 1279 deletions

View File

@@ -37,51 +37,56 @@ close(77)
end program checkFFT
#endif
!-------------
! Optimized FFT using Intel oneMKL DFTI
! Mathematical equivalence: Standard DFT definition
! Forward (isign=1): X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
! Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
!-------------
SUBROUTINE four1(dataa,nn,isign)
use MKL_DFTI
implicit none
INTEGER, intent(in) :: isign, nn
DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa
type(DFTI_DESCRIPTOR), pointer :: desc
integer :: status
! Create DFTI descriptor for 1D complex-to-complex transform
status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn)
if (status /= 0) return
! Set input/output storage as interleaved complex (default)
status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
if (status /= 0) then
status = DftiFreeDescriptor(desc)
return
INTEGER::isign,nn
double precision,dimension(2*nn)::dataa
INTEGER::i,istep,j,m,mmax,n
double precision::tempi,tempr
DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
n=2*nn
j=1
do i=1,n,2
if(j.gt.i)then
tempr=dataa(j)
tempi=dataa(j+1)
dataa(j)=dataa(i)
dataa(j+1)=dataa(i+1)
dataa(i)=tempr
dataa(i+1)=tempi
endif
! Commit the descriptor
status = DftiCommitDescriptor(desc)
if (status /= 0) then
status = DftiFreeDescriptor(desc)
return
m=nn
1 if ((m.ge.2).and.(j.gt.m)) then
j=j-m
m=m/2
goto 1
endif
! Execute FFT based on direction
if (isign == 1) then
! Forward FFT: exp(-2*pi*i*k*n/N)
status = DftiComputeForward(desc, dataa)
else
! Backward FFT: exp(+2*pi*i*k*n/N)
status = DftiComputeBackward(desc, dataa)
j=j+m
enddo
mmax=2
2 if (n.gt.mmax) then
istep=2*mmax
theta=6.28318530717959d0/(isign*mmax)
wpr=-2.d0*sin(0.5d0*theta)**2
wpi=sin(theta)
wr=1.d0
wi=0.d0
do m=1,mmax,2
do i=m,n,istep
j=i+mmax
tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
dataa(j)=dataa(i)-tempr
dataa(j+1)=dataa(i+1)-tempi
dataa(i)=dataa(i)+tempr
dataa(i+1)=dataa(i+1)+tempi
enddo
wtemp=wr
wr=wr*wpr-wi*wpi+wr
wi=wi*wpr+wtemp*wpi+wi
enddo
mmax=istep
goto 2
endif
! Free descriptor
status = DftiFreeDescriptor(desc)
return
END SUBROUTINE four1

View File

@@ -5284,6 +5284,41 @@ double Parallel::L2Norm(Patch *Pat, var *vf)
return tvf;
}
void Parallel::L2Norm7(Patch *Pat, var **vf, double *norms)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double tvf[7], dtvf[7];
int BDW = ghost_width;
for (int i = 0; i < 7; i++)
dtvf[i] = 0;
MyList<Block> *BP = Pat->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
cg->fgfs[vf[6]->sgfn], tvf, BDW);
for (int i = 0; i < 7; i++)
dtvf[i] += tvf[i];
}
if (BP == Pat->ble)
break;
BP = BP->next;
}
MPI_Allreduce(dtvf, tvf, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
for (int i = 0; i < 7; i++)
norms[i] = sqrt(tvf[i]);
}
double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here)
{
int myrank;
@@ -5315,6 +5350,41 @@ double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here)
return tvf;
}
void Parallel::L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double tvf[7], dtvf[7];
int BDW = ghost_width;
for (int i = 0; i < 7; i++)
dtvf[i] = 0;
MyList<Block> *BP = Pat->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
cg->fgfs[vf[6]->sgfn], tvf, BDW);
for (int i = 0; i < 7; i++)
dtvf[i] += tvf[i];
}
if (BP == Pat->ble)
break;
BP = BP->next;
}
MPI_Allreduce(dtvf, tvf, 7, MPI_DOUBLE, MPI_SUM, Comm_here);
for (int i = 0; i < 7; i++)
norms[i] = sqrt(tvf[i]);
}
void Parallel::checkgsl(MyList<Parallel::gridseg> *pp, bool first_only)
{
int myrank = 0;

View File

@@ -183,6 +183,7 @@ namespace Parallel
MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
double L2Norm(Patch *Pat, var *vf);
void L2Norm7(Patch *Pat, var **vf, double *norms);
void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
void checkvarl(MyList<var> *pp, bool first_only);
MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
@@ -218,6 +219,7 @@ namespace Parallel
void checkpatchlist(MyList<Patch> *PatL, bool buflog);
double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
void L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here);
bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here);

View File

@@ -3472,6 +3472,43 @@ double ShellPatch::L2Norm(var *vf)
return tvf;
}
void ShellPatch::L2Norm7(var **vf, double *norms)
{
double tvf[7], dtvf[7];
int BDW = overghost;
for (int i = 0; i < 7; i++)
dtvf[i] = 0;
MyList<ss_patch> *sPp = PatL;
while (sPp)
{
MyList<Block> *Bp = sPp->data->blb;
while (Bp)
{
Block *cg = Bp->data;
if (myrank == cg->rank)
{
f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
sPp->data->bbox[0], sPp->data->bbox[1], sPp->data->bbox[2],
sPp->data->bbox[3], sPp->data->bbox[4], sPp->data->bbox[5],
cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
cg->fgfs[vf[6]->sgfn], tvf, BDW);
for (int i = 0; i < 7; i++)
dtvf[i] += tvf[i];
}
if (Bp == sPp->data->ble)
break;
Bp = Bp->next;
}
sPp = sPp->next;
}
MPI_Allreduce(dtvf, tvf, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
for (int i = 0; i < 7; i++)
norms[i] = sqrt(tvf[i]);
}
// find maximum of abstract value, XX store position for maximum, Shellf store maximum themselvs
void ShellPatch::Find_Maximum(MyList<var> *VarList, double *XX,

View File

@@ -198,6 +198,7 @@ public:
void write_Pablo_file_ss(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax,
char *filename, int sst);
double L2Norm(var *vf);
void L2Norm7(var **vf, double *norms);
void Find_Maximum(MyList<var> *VarList, double *XX, double *Shellf);
};

View File

@@ -27,7 +27,21 @@ using namespace std;
#endif
#include "TwoPunctures.h"
#include <mkl_cblas.h>
extern "C" {
double cblas_ddot(const int, const double *, const int, const double *, const int);
double cblas_dnrm2(const int, const double *, const int);
void cblas_dgemm(const int, const int, const int,
const int, const int, const int,
const double, const double *, const int,
const double *, const int, const double,
double *, const int);
}
enum {
CblasRowMajor = 101,
CblasNoTrans = 111
};
TwoPunctures::TwoPunctures(double mp, double mm, double b,
double P_plusx, double P_plusy, double P_plusz,

View File

@@ -47,6 +47,233 @@ using namespace std;
#define BSSN_ENABLE_MEM_USAGE_LOG 0
#endif
#ifndef BSSN_FINE_TIMING
#define BSSN_FINE_TIMING 0
#endif
#ifndef BSSN_FINE_TIMING_EVERY
#define BSSN_FINE_TIMING_EVERY 1
#endif
#ifndef BSSN_FINE_TIMING_TOPN
#define BSSN_FINE_TIMING_TOPN 8
#endif
#ifndef BSSN_KERNEL_FINE_TIMING
#define BSSN_KERNEL_FINE_TIMING 0
#endif
#ifndef BSSN_ENABLE_STDIN_ABORT_POLL
#define BSSN_ENABLE_STDIN_ABORT_POLL 0
#endif
#if BSSN_FINE_TIMING
namespace step_timing
{
enum Bucket
{
TB_ANALYSIS_PSI4 = 0,
TB_ANALYSIS_SURFACE,
TB_ANALYSIS_IO,
TB_BH_PREDICTOR,
TB_PREDICTOR_RHS,
TB_PREDICTOR_SYNC,
TB_BH_CORRECTOR,
TB_CORRECTOR_RHS,
TB_CORRECTOR_SYNC,
TB_STATE_SWAP,
TB_RESTRICT_PROLONG,
TB_CONSTRAINT_OUT,
TB_DUMP_3D,
TB_DUMP_2D,
TB_CHECKPOINT,
TB_REGRID,
TB_COUNT
};
static double local_bucket_seconds[TB_COUNT];
static const char *bucket_labels[TB_COUNT] =
{
"analysis_psi4",
"analysis_surface",
"analysis_io",
"bh_predictor",
"predictor_rhs",
"predictor_sync",
"bh_corrector",
"corrector_rhs",
"corrector_sync",
"state_swap",
"restrict_prolong",
"constraint_out",
"dump_3d",
"dump_2d",
"checkpoint",
"regrid"
};
void reset()
{
for (int i = 0; i < TB_COUNT; i++)
local_bucket_seconds[i] = 0.0;
}
void add(Bucket bucket, double seconds)
{
local_bucket_seconds[int(bucket)] += seconds;
}
void report(int myrank, int nprocs, monitor *TimingMonitor,
int step_index, double phys_time, double step_wall_seconds)
{
double max_bucket_seconds[TB_COUNT];
double avg_bucket_seconds[TB_COUNT];
MPI_Reduce(local_bucket_seconds, max_bucket_seconds, TB_COUNT, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
MPI_Reduce(local_bucket_seconds, avg_bucket_seconds, TB_COUNT, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if (myrank != 0)
return;
for (int i = 0; i < TB_COUNT; i++)
avg_bucket_seconds[i] /= Mymax(1, nprocs);
if (TimingMonitor)
{
double row[2 + 2 * TB_COUNT];
row[0] = double(step_index);
row[1] = step_wall_seconds;
for (int i = 0; i < TB_COUNT; i++)
{
row[2 + i] = max_bucket_seconds[i];
row[2 + TB_COUNT + i] = avg_bucket_seconds[i];
}
TimingMonitor->writefile(phys_time, 2 + 2 * TB_COUNT, row);
}
double residual = step_wall_seconds;
for (int i = 0; i < TB_COUNT; i++)
residual -= max_bucket_seconds[i];
if (residual < 0.0)
residual = 0.0;
int order[TB_COUNT];
for (int i = 0; i < TB_COUNT; i++)
order[i] = i;
for (int i = 0; i < TB_COUNT - 1; i++)
for (int j = i + 1; j < TB_COUNT; j++)
if (max_bucket_seconds[order[j]] > max_bucket_seconds[order[i]])
{
int tmp = order[i];
order[i] = order[j];
order[j] = tmp;
}
ios::fmtflags old_flags = cout.flags();
streamsize old_precision = cout.precision();
cout << " Fine timing hot spots (max rank wall estimate):" << endl;
const int topn = Mymin(BSSN_FINE_TIMING_TOPN, TB_COUNT);
for (int i = 0; i < topn; i++)
{
const int ib = order[i];
const double frac = (step_wall_seconds > 0.0) ? (100.0 * max_bucket_seconds[ib] / step_wall_seconds) : 0.0;
cout << " "
<< setw(20) << left << bucket_labels[ib]
<< " = " << setw(10) << right << setprecision(6) << max_bucket_seconds[ib]
<< " s (" << setw(6) << setprecision(4) << frac << "%)" << endl;
}
if (residual > 1.0e-6)
{
const double frac = (step_wall_seconds > 0.0) ? (100.0 * residual / step_wall_seconds) : 0.0;
cout << " "
<< setw(20) << left << "unprofiled_residual"
<< " = " << setw(10) << right << setprecision(6) << residual
<< " s (" << setw(6) << setprecision(4) << frac << "%)" << endl;
}
cout << endl;
cout.flags(old_flags);
cout.precision(old_precision);
}
}
#define STEP_TIMER_DECL(var_name) const double var_name = MPI_Wtime()
#define STEP_TIMER_ADD(bucket_name, var_name) step_timing::add(step_timing::bucket_name, MPI_Wtime() - (var_name))
#else
#define STEP_TIMER_DECL(var_name)
#define STEP_TIMER_ADD(bucket_name, var_name)
#endif
#if BSSN_KERNEL_FINE_TIMING
namespace rhs_kernel_timing_report
{
void report(int myrank, int nprocs, int step_index, double step_wall_seconds)
{
const int bucket_count = f_bssn_rhs_kernel_timing_bucket_count();
const double *local_bucket_seconds = f_bssn_rhs_kernel_timing_local_seconds();
if (bucket_count <= 0 || !local_bucket_seconds)
return;
double *max_bucket_seconds = new double[bucket_count];
double *avg_bucket_seconds = new double[bucket_count];
int *order = new int[bucket_count];
MPI_Reduce((void *)local_bucket_seconds, max_bucket_seconds, bucket_count, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
MPI_Reduce((void *)local_bucket_seconds, avg_bucket_seconds, bucket_count, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if (myrank == 0)
{
double kernel_total = 0.0;
for (int i = 0; i < bucket_count; ++i)
{
avg_bucket_seconds[i] /= Mymax(1, nprocs);
order[i] = i;
kernel_total += max_bucket_seconds[i];
}
for (int i = 0; i < bucket_count - 1; ++i)
for (int j = i + 1; j < bucket_count; ++j)
if (max_bucket_seconds[order[j]] > max_bucket_seconds[order[i]])
{
int tmp = order[i];
order[i] = order[j];
order[j] = tmp;
}
ios::fmtflags old_flags = cout.flags();
streamsize old_precision = cout.precision();
const double kernel_frac = (step_wall_seconds > 0.0) ? (100.0 * kernel_total / step_wall_seconds) : 0.0;
cout << " RHS kernel split (max-rank accumulated over step " << step_index << "): total "
<< setprecision(6) << kernel_total << " s (" << setprecision(4)
<< kernel_frac << "% of coarse step)" << endl;
const int topn = Mymin(BSSN_FINE_TIMING_TOPN, bucket_count);
for (int i = 0; i < topn; ++i)
{
const int ib = order[i];
const double frac = (kernel_total > 0.0) ? (100.0 * max_bucket_seconds[ib] / kernel_total) : 0.0;
cout << " "
<< setw(20) << left << f_bssn_rhs_kernel_timing_label(ib)
<< " = " << setw(10) << right << setprecision(6) << max_bucket_seconds[ib]
<< " s (" << setw(6) << setprecision(4) << frac << "% of kernel)" << endl;
}
cout << endl;
cout.flags(old_flags);
cout.precision(old_precision);
}
delete[] max_bucket_seconds;
delete[] avg_bucket_seconds;
delete[] order;
}
}
#endif
//================================================================================================
// define bssn_class
@@ -65,6 +292,7 @@ bssn_class::bssn_class(double Couranti, double StartTimei, double TotalTimei,
xc(0), yc(0), zc(0), xr(0), yr(0), zr(0), trigger(0), dTT(0), dumpid(0),
#endif
a_lev(a_levi), maxl(maxli), decn(decni), maxrex(maxrexi), drex(drexi),
ConstraintRefreshLevels(0),
CheckPoint(0)
// CheckPoint(0)
{
@@ -107,6 +335,24 @@ bssn_class::bssn_class(double Couranti, double StartTimei, double TotalTimei,
a_stream.str("");
a_stream << setw(15) << "# time Ham Px Py Pz Gx Gy Gz";
ConVMonitor = new monitor("bssn_constraint.dat", myrank, a_stream.str());
#if BSSN_FINE_TIMING
a_stream.clear();
a_stream.str("");
a_stream << setw(8) << "# step";
a_stream << setw(14) << "wall";
for (int ib = 0; ib < step_timing::TB_COUNT; ib++)
a_stream << setw(18) << step_timing::bucket_labels[ib];
for (int ib = 0; ib < step_timing::TB_COUNT; ib++)
{
char str_avg[64];
sprintf(str_avg, "avg_%s", step_timing::bucket_labels[ib]);
a_stream << setw(18) << str_avg;
}
TimingMonitor = new monitor("bssn_step_timing.dat", myrank, a_stream.str());
#else
TimingMonitor = 0;
#endif
}
// setup sphere integration engine
Waveshell = new surface_integral(Symmetry);
@@ -702,6 +948,9 @@ void bssn_class::Initialize()
}
}
GH = new cgh(0, ngfs, Symmetry, pname, checkrun, ErrorMonitor);
ConstraintRefreshLevels = new int[GH->levels];
for (int il = 0; il < GH->levels; il++)
ConstraintRefreshLevels[il] = 0;
if (checkrun)
CheckPoint->readcheck_cgh(PhysTime, GH, myrank, nprocs, Symmetry);
else
@@ -791,6 +1040,8 @@ bssn_class::~bssn_class()
DumpList->clearList();
ConstraintList->clearList();
delete[] ConstraintRefreshLevels;
delete phio;
delete trKo;
delete gxxo;
@@ -1050,6 +1301,7 @@ bssn_class::~bssn_class()
delete BHMonitor;
delete MAPMonitor;
delete ConVMonitor;
delete TimingMonitor;
delete Waveshell;
delete CheckPoint;
@@ -2147,6 +2399,15 @@ void bssn_class::Evolve(int Steps)
for (int ncount = 1; ncount < Steps + 1; ncount++)
{
#if BSSN_FINE_TIMING
step_timing::reset();
#endif
#if BSSN_KERNEL_FINE_TIMING
f_bssn_rhs_kernel_timing_reset();
#endif
#if (BSSN_FINE_TIMING || BSSN_KERNEL_FINE_TIMING)
const double step_wall_start = MPI_Wtime();
#endif
// special for large mass ratio consideration
// if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6)
// { GH->levels=GH->movls; }
@@ -2173,6 +2434,7 @@ void bssn_class::Evolve(int Steps)
// When LastDump >= DumpTime, output corresponding binary data
if (LastDump >= DumpTime)
{
STEP_TIMER_DECL(timer_dump3d);
// misc::tillherecheck("before Dump_Data");
for (int lev = 0; lev < GH->levels; lev++)
@@ -2180,6 +2442,7 @@ void bssn_class::Evolve(int Steps)
#ifdef WithShell
SH->Dump_Data(DumpList, 0, PhysTime, dT_mon);
#endif
STEP_TIMER_ADD(TB_DUMP_3D, timer_dump3d);
LastDump = 0;
@@ -2192,10 +2455,12 @@ void bssn_class::Evolve(int Steps)
// When Last2dDump >= d2DumpTime, output corresponding 2D data
if (Last2dDump >= d2DumpTime)
{
STEP_TIMER_DECL(timer_dump2d);
// misc::tillherecheck("before 2dDump_Data");
for (int lev = 0; lev < GH->levels; lev++)
Parallel::d2Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon);
STEP_TIMER_ADD(TB_DUMP_2D, timer_dump2d);
Last2dDump = 0;
@@ -2220,10 +2485,12 @@ void bssn_class::Evolve(int Steps)
break;
#if (REGLEV == 1)
STEP_TIMER_DECL(timer_regrid);
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
STEP_TIMER_ADD(TB_REGRID, timer_regrid);
#endif
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2263,10 +2530,13 @@ void bssn_class::Evolve(int Steps)
<< endl;
}
cout << endl;
#if BSSN_ENABLE_STDIN_ABORT_POLL
cout << " If you think the physical evolution time is enough for this simulation, please input 'stop' in the terminal to stop the MPI processes in the next evolution step ! " << endl;
#endif
// cout << endl;
}
#if BSSN_ENABLE_STDIN_ABORT_POLL
////////////////////////////////////////////////////////////
// If an "abort" command is detected on stdin, terminate MPI processes
////////////////////////////////////////////////////////////
@@ -2294,10 +2564,12 @@ void bssn_class::Evolve(int Steps)
}
////////////////////////////////////////////////////////////
#endif
// When LastCheck >= CheckTime, perform runtime checks and output status data
if (LastCheck >= CheckTime)
{
STEP_TIMER_DECL(timer_checkpoint);
LastCheck = 0;
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
@@ -2306,7 +2578,20 @@ void bssn_class::Evolve(int Steps)
CheckPoint->writecheck_sh(PhysTime, SH);
#endif
CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
STEP_TIMER_ADD(TB_CHECKPOINT, timer_checkpoint);
}
#if (BSSN_FINE_TIMING || BSSN_KERNEL_FINE_TIMING)
const double step_wall_seconds = MPI_Wtime() - step_wall_start;
#endif
#if BSSN_FINE_TIMING
if (ncount % BSSN_FINE_TIMING_EVERY == 0)
step_timing::report(myrank, nprocs, TimingMonitor, ncount, PhysTime, step_wall_seconds);
#endif
#if BSSN_KERNEL_FINE_TIMING
if (ncount % BSSN_FINE_TIMING_EVERY == 0)
rhs_kernel_timing_report::report(myrank, nprocs, ncount, step_wall_seconds);
#endif
}
/*
#ifdef With_AHF
@@ -2438,10 +2723,16 @@ void bssn_class::RecursiveStep(int lev)
#endif
#if (REGLEV == 0)
STEP_TIMER_DECL(timer_regrid_onelevel);
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
{
if (ConstraintRefreshLevels)
ConstraintRefreshLevels[lev] = 1;
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
}
STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel);
#endif
}
@@ -3034,6 +3325,7 @@ void bssn_class::Step(int lev, int YN)
// new code 2013-2-15, zjcao
#if (MAPBH == 1)
STEP_TIMER_DECL(timer_bh_predictor);
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
@@ -3064,6 +3356,7 @@ void bssn_class::Step(int lev, int YN)
}
}
}
STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor);
// data analysis part
// Warning NOTE: the variables1 are used as temp storege room
@@ -3086,6 +3379,7 @@ void bssn_class::Step(int lev, int YN)
int ERROR = 0;
MyList<ss_patch> *sPp;
STEP_TIMER_DECL(timer_predictor_rhs);
// Predictor
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
@@ -3361,6 +3655,9 @@ void bssn_class::Step(int lev, int YN)
}
#endif
STEP_TIMER_ADD(TB_PREDICTOR_RHS, timer_predictor_rhs);
STEP_TIMER_DECL(timer_predictor_sync);
Parallel::AsyncSyncState async_pre;
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
@@ -3398,6 +3695,7 @@ void bssn_class::Step(int lev, int YN)
}
}
#endif
STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync);
#if (MAPBH == 0)
// for black hole position
@@ -3442,6 +3740,7 @@ void bssn_class::Step(int lev, int YN)
// corrector
for (iter_count = 1; iter_count < 4; iter_count++)
{
STEP_TIMER_DECL(timer_corrector_rhs);
// for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
if (iter_count == 1 || iter_count == 3)
TRK4 += dT_lev / 2;
@@ -3721,6 +4020,9 @@ void bssn_class::Step(int lev, int YN)
}
#endif
STEP_TIMER_ADD(TB_CORRECTOR_RHS, timer_corrector_rhs);
STEP_TIMER_DECL(timer_corrector_sync);
Parallel::AsyncSyncState async_cor;
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
@@ -3760,8 +4062,10 @@ void bssn_class::Step(int lev, int YN)
}
}
#endif
STEP_TIMER_ADD(TB_CORRECTOR_SYNC, timer_corrector_sync);
#if (MAPBH == 0)
STEP_TIMER_DECL(timer_bh_corrector);
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
@@ -3794,11 +4098,13 @@ void bssn_class::Step(int lev, int YN)
}
}
}
STEP_TIMER_ADD(TB_BH_CORRECTOR, timer_bh_corrector);
#endif
// swap time level
if (iter_count < 3)
{
STEP_TIMER_DECL(timer_state_swap);
Pp = GH->PatL[lev];
while (Pp)
{
@@ -3845,9 +4151,11 @@ void bssn_class::Step(int lev, int YN)
}
}
#endif
STEP_TIMER_ADD(TB_STATE_SWAP, timer_state_swap);
}
}
#if (RPS == 0)
STEP_TIMER_DECL(timer_restrict_prolong);
// mesh refinement boundary part
RestrictProlong(lev, YN, BB);
@@ -3868,6 +4176,7 @@ void bssn_class::Step(int lev, int YN)
}
#endif
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
#endif
// note the data structure before update
// SynchList_cor 1 -----------
@@ -3876,6 +4185,7 @@ void bssn_class::Step(int lev, int YN)
//
// OldStateList old -----------
// update
STEP_TIMER_DECL(timer_state_commit);
Pp = GH->PatL[lev];
while (Pp)
{
@@ -3932,6 +4242,7 @@ void bssn_class::Step(int lev, int YN)
Porg0[ithBH][2] = Porg1[ithBH][2];
}
}
STEP_TIMER_ADD(TB_STATE_SWAP, timer_state_commit);
}
//================================================================================================
@@ -4258,7 +4569,9 @@ void bssn_class::Step(int lev, int YN)
}
}
#endif
STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync);
STEP_TIMER_DECL(timer_bh_predictor);
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
@@ -4297,6 +4610,7 @@ void bssn_class::Step(int lev, int YN)
{
AnalysisStuff(lev, dT_lev);
}
STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor);
// corrector
for (iter_count = 1; iter_count < 3; iter_count++)
{
@@ -5767,6 +6081,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
//
// SynchList_cor old -----------
{
STEP_TIMER_DECL(timer_restrict_prolong);
#if (PSTR == 1 || PSTR == 2)
// stringstream a_stream;
// a_stream.setf(ios::left);
@@ -5909,6 +6224,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif
}
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
}
//================================================================================================
@@ -5928,6 +6244,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
//
// SynchList_cor old -----------
{
STEP_TIMER_DECL(timer_restrict_prolong);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"starting RestrictProlong_aux");
if (lev >= GH->levels - 1)
@@ -5996,6 +6313,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
}
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
}
//================================================================================================
@@ -6006,6 +6324,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
void bssn_class::RestrictProlong(int lev, int YN, bool BB)
{
STEP_TIMER_DECL(timer_restrict_prolong);
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
// we assume for fine
// SynchList_cor 1 -----------
@@ -6085,6 +6404,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
}
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
}
//================================================================================================
@@ -6835,18 +7155,15 @@ void bssn_class::compute_Porg_rhs(double **BH_PS,double **BH_RHS,var *forx,var *
void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, var *fory, var *forz, int ilev)
{
const int InList = 3;
MyList<var> DG_List_x(forx);
MyList<var> DG_List_y(fory);
MyList<var> DG_List_z(forz);
DG_List_x.next = &DG_List_y;
DG_List_y.next = &DG_List_z;
MyList<var> *DG_List = new MyList<var>(forx);
DG_List->insert(fory);
DG_List->insert(forz);
double *x1, *y1, *z1;
double *shellf;
shellf = new double[3];
double *pox[3];
for (int i = 0; i < 3; i++)
pox[i] = new double[1];
double shellf[3];
double pox_buf[3][1];
double *pox[3] = {pox_buf[0], pox_buf[1], pox_buf[2]};
for (int n = 0; n < BH_num; n++)
{
@@ -6857,9 +7174,9 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
int lev = ilev;
#if (PSTR == 0)
while (!Parallel::PatList_Interp_Points(GH->PatL[lev], DG_List, 1, pox, shellf, Symmetry))
while (!Parallel::PatList_Interp_Points(GH->PatL[lev], &DG_List_x, 1, pox, shellf, Symmetry))
#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
while (!Parallel::PatList_Interp_Points(GH->PatL[lev], DG_List, 1, pox, shellf, Symmetry, GH->Commlev[lev]))
while (!Parallel::PatList_Interp_Points(GH->PatL[lev], &DG_List_x, 1, pox, shellf, Symmetry, GH->Commlev[lev]))
#endif
{
lev--;
@@ -6868,7 +7185,7 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
ErrorMonitor->outfile << "fail to find black holes at t = " << PhysTime << endl;
for (n = 0; n < BH_num; n++)
ErrorMonitor->outfile << "(x,y,z) = ("
<< pox[0][n] << "," << pox[1][n] << "," << pox[2][n]
<< BH_PS[n][0] << "," << BH_PS[n][1] << "," << BH_PS[n][2]
<< ")" << endl;
break;
}
@@ -6881,11 +7198,6 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
BH_RHS[n][2] = -shellf[2];
}
}
DG_List->clearList();
delete[] shellf;
for (int i = 0; i < 3; i++)
delete[] pox[i];
}
#endif
@@ -7108,6 +7420,10 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
IP = new double[NN];
RoutMAP = new double[7];
double Rex = maxrex;
bool patch_mass_prepared = false;
#ifdef WithShell
bool shell_mass_prepared = false;
#endif
for (int i = 0; i < decn; i++)
{
#ifdef Point_Psi4
@@ -7135,7 +7451,8 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables
RoutMAP, ErrorMonitor);
RoutMAP, ErrorMonitor, !patch_mass_prepared);
patch_mass_prepared = true;
}
else
{
@@ -7143,44 +7460,52 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables
RoutMAP, ErrorMonitor);
RoutMAP, ErrorMonitor, !shell_mass_prepared);
shell_mass_prepared = true;
}
#else
Waveshell->surf_MassPAng(Rex, lev, GH, phi0, trK0,
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables
RoutMAP, ErrorMonitor);
RoutMAP, ErrorMonitor, !patch_mass_prepared);
patch_mass_prepared = true;
#endif
#else
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before surface integral");
#ifdef WithShell
if (lev > 0 || Rex < GH->bbox[0][0][3])
{
Waveshell->surf_Wave(Rex, lev, GH, Rpsi4, Ipsi4, 2, maxl, NN, RP, IP, ErrorMonitor);
Waveshell->surf_MassPAng(Rex, lev, GH, phi0, trK0,
Waveshell->surf_WaveMassPAng(Rex, lev, GH,
Rpsi4, Ipsi4, 2, maxl, NN, RP, IP,
phi0, trK0,
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables
RoutMAP, ErrorMonitor);
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1,
RoutMAP, ErrorMonitor, !patch_mass_prepared);
patch_mass_prepared = true;
}
else
{
Waveshell->surf_Wave(Rex, lev, SH, Rpsi4, Ipsi4, 2, maxl, NN, RP, IP, ErrorMonitor);
Waveshell->surf_MassPAng(Rex, lev, SH, phi0, trK0,
Waveshell->surf_WaveMassPAng(Rex, lev, SH,
Rpsi4, Ipsi4, 2, maxl, NN, RP, IP,
phi0, trK0,
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables
RoutMAP, ErrorMonitor);
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1,
RoutMAP, ErrorMonitor, !shell_mass_prepared);
shell_mass_prepared = true;
}
#else
#if (PSTR == 0)
Waveshell->surf_Wave(Rex, lev, GH, Rpsi4, Ipsi4, 2, maxl, NN, RP, IP, ErrorMonitor);
Waveshell->surf_MassPAng(Rex, lev, GH, phi0, trK0,
Waveshell->surf_WaveMassPAng(Rex, lev, GH,
Rpsi4, Ipsi4, 2, maxl, NN, RP, IP,
phi0, trK0,
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables
RoutMAP, ErrorMonitor);
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1,
RoutMAP, ErrorMonitor, !patch_mass_prepared);
patch_mass_prepared = true;
#elif (PSTR == 1 || PSTR == 2)
Waveshell->surf_Wave(Rex, lev, GH, Rpsi4, Ipsi4, 2, maxl, NN, RP, IP, ErrorMonitor, GH->Commlev[lev]);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after surf_Wave");
@@ -7188,7 +7513,8 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
gxx0, gxy0, gxz0, gyy0, gyz0, gzz0,
Axx0, Axy0, Axz0, Ayy0, Ayz0, Azz0,
Gmx0, Gmy0, Gmz0, Sfx1, Sfy1, Sfz1, // here we can not touch rhs variables, but 1 variables
RoutMAP, ErrorMonitor, GH->Commlev[lev]);
RoutMAP, ErrorMonitor, GH->Commlev[lev], !patch_mass_prepared);
patch_mass_prepared = true;
#endif
#endif
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"end surface integral");
@@ -7261,7 +7587,7 @@ void bssn_class::Constraint_Out()
for (int lev = 0; lev < GH->levels; lev++)
{
// make sure the data consistent for higher levels
if (lev > 0) // if the constrait quantities can be reused from the step rhs calculation
if (lev > 0 && ConstraintRefreshLevels && ConstraintRefreshLevels[lev]) // only refresh levels whose grid layout changed after evolution
{
double TRK4 = PhysTime;
double ndeps = numepsb;
@@ -7415,35 +7741,18 @@ void bssn_class::Constraint_Out()
#if (PSTR == 1 || PSTR == 2)
double ConV_h[7];
#endif
var *ConstraintVars[7] = {Cons_Ham, Cons_Px, Cons_Py, Cons_Pz, Cons_Gx, Cons_Gy, Cons_Gz};
#ifdef WithShell
ConV[0] = SH->L2Norm(Cons_Ham);
ConV[1] = SH->L2Norm(Cons_Px);
ConV[2] = SH->L2Norm(Cons_Py);
ConV[3] = SH->L2Norm(Cons_Pz);
ConV[4] = SH->L2Norm(Cons_Gx);
ConV[5] = SH->L2Norm(Cons_Gy);
ConV[6] = SH->L2Norm(Cons_Gz);
SH->L2Norm7(ConstraintVars, ConV);
ConVMonitor->writefile(PhysTime, 7, ConV);
#endif
for (int levi = 0; levi < GH->levels; levi++)
{
#if (PSTR == 0)
ConV[0] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Ham);
ConV[1] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Px);
ConV[2] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Py);
ConV[3] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Pz);
ConV[4] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Gx);
ConV[5] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Gy);
ConV[6] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Gz);
Parallel::L2Norm7(GH->PatL[levi]->data, ConstraintVars, ConV);
#elif (PSTR == 1 || PSTR == 2)
ConV[0] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Ham, GH->Commlev[levi]);
ConV[1] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Px, GH->Commlev[levi]);
ConV[2] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Py, GH->Commlev[levi]);
ConV[3] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Pz, GH->Commlev[levi]);
ConV[4] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Gx, GH->Commlev[levi]);
ConV[5] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Gy, GH->Commlev[levi]);
ConV[6] = Parallel::L2Norm(GH->PatL[levi]->data, Cons_Gz, GH->Commlev[levi]);
Parallel::L2Norm7(GH->PatL[levi]->data, ConstraintVars, ConV, GH->Commlev[levi]);
// misc::tillherecheck("before collect data to cpu0");
// MPI_ALLREDUCE( sendbuf, recvbuf, count, datatype, op, comm), sendbu and recvbuf must be different
if (levi > 0)
@@ -7474,6 +7783,9 @@ void bssn_class::Constraint_Out()
Interp_Constraint(false);
LastConsOut = 0;
if (ConstraintRefreshLevels)
for (int lev = 0; lev < GH->levels; lev++)
ConstraintRefreshLevels[lev] = 0;
}
}

View File

@@ -48,6 +48,7 @@ public:
double StartTime, TotalTime;
double AnasTime, DumpTime, d2DumpTime, CheckTime;
double LastAnas, LastConsOut;
int *ConstraintRefreshLevels;
double Courant;
double numepss, numepsb, numepsh;
int Symmetry;
@@ -134,7 +135,7 @@ public:
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor;
monitor *ConVMonitor, *TimingMonitor;
surface_integral *Waveshell;
checkpoint *CheckPoint;

View File

@@ -32,6 +32,19 @@
#define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_
#define f_compute_constraint_fr compute_constraint_fr_
#endif
#ifdef __cplusplus
extern "C"
{
#endif
void f_bssn_rhs_kernel_timing_reset();
int f_bssn_rhs_kernel_timing_bucket_count();
const double *f_bssn_rhs_kernel_timing_local_seconds();
const char *f_bssn_rhs_kernel_timing_label(int);
#ifdef __cplusplus
}
#endif
extern "C"
{
int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z

View File

@@ -2,12 +2,88 @@
#include "bssn_rhs.h"
#include "share_func.h"
#include "tool.h"
#include <time.h>
// 0-based i,j,k
// #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny))
// ex(1)=nx, ex(2)=ny, ex(3)=nz
// 用法a[ IDX_F(i,j,k,nx,ny) ]
#ifndef BSSN_KERNEL_FINE_TIMING
#define BSSN_KERNEL_FINE_TIMING 0
#endif
#if BSSN_KERNEL_FINE_TIMING
namespace rhs_kernel_timing
{
enum Bucket
{
KB_SETUP_DERIVS = 0,
KB_GEOM_GAMMA,
KB_RICCI_METRIC,
KB_CHI_LAPSE,
KB_AIJ_TRK_GAUGE,
KB_KO_CONSTRAINT,
KB_COUNT
};
static double local_bucket_seconds[KB_COUNT];
static const char *bucket_labels[KB_COUNT] =
{
"setup_derivs",
"geom_gamma",
"ricci_metric",
"chi_lapse",
"aij_trk_gauge",
"ko_constraint"
};
static inline double now_seconds()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return double(ts.tv_sec) + 1.0e-9 * double(ts.tv_nsec);
}
}
extern "C" void f_bssn_rhs_kernel_timing_reset()
{
for (int i = 0; i < rhs_kernel_timing::KB_COUNT; ++i)
rhs_kernel_timing::local_bucket_seconds[i] = 0.0;
}
extern "C" int f_bssn_rhs_kernel_timing_bucket_count()
{
return rhs_kernel_timing::KB_COUNT;
}
extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds()
{
return rhs_kernel_timing::local_bucket_seconds;
}
extern "C" const char *f_bssn_rhs_kernel_timing_label(int bucket_index)
{
if (bucket_index < 0 || bucket_index >= rhs_kernel_timing::KB_COUNT)
return "unknown";
return rhs_kernel_timing::bucket_labels[bucket_index];
}
#define RHS_KERNEL_TIMER_DECL(var_name) const double var_name = rhs_kernel_timing::now_seconds()
#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name) \
rhs_kernel_timing::local_bucket_seconds[int(rhs_kernel_timing::bucket_name)] += \
rhs_kernel_timing::now_seconds() - (var_name)
#else
extern "C" void f_bssn_rhs_kernel_timing_reset() {}
extern "C" int f_bssn_rhs_kernel_timing_bucket_count() { return 0; }
extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds() { return 0; }
extern "C" const char *f_bssn_rhs_kernel_timing_label(int) { return "disabled"; }
#define RHS_KERNEL_TIMER_DECL(var_name)
#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name)
#endif
// C function that calculates the right-hand side for BSSN equations
int f_compute_rhs_bssn(int *ex, double &T,
double *X, double *Y, double *Z,
@@ -102,6 +178,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
dY = Y[1] - Y[0];
dZ = Z[1] - Z[0];
RHS_KERNEL_TIMER_DECL(timer_setup_derivs);
// 1ms //
for(int i=0;i<all;i+=1){
alpn1[i] = Lap[i] + 1.0;
@@ -141,6 +218,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
(dxx[i] + ONE) * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i]
+ (dzz[i] + ONE) * betazx[i] - gxz[i] * betayy[i];
}
RHS_KERNEL_TIMER_ADD(KB_SETUP_DERIVS, timer_setup_derivs);
RHS_KERNEL_TIMER_DECL(timer_geom_gamma);
// Fused: inverse metric + Gamma constraint + Christoffel (3 loops -> 1)
for(int i=0;i<all;i+=1){
double det = (dxx[i] + ONE) * (dyy[i] + ONE) * (dzz[i] + ONE) + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] -
@@ -283,9 +362,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ ( gupxy[i]*gupyz[i] + gupyy[i]*gupxz[i] ) * Axy[i]
+ ( gupxy[i]*gupzz[i] + gupyz[i]*gupxz[i] ) * Axz[i]
+ ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i];
Rxx[i] = axx; Ryy[i] = ayy; Rzz[i] = azz;
Rxy[i] = axy; Rxz[i] = axz; Ryz[i] = ayz;
Gamx_rhs[i] = - TWO * ( Lapx[i]*axx + Lapy[i]*axy + Lapz[i]*axz ) +
TWO * alpn1[i] * (
-F3o2/chin1[i] * ( chix[i]*axx + chiy[i]*axy + chiz[i]*axz ) -
@@ -315,6 +391,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ TWO * ( Gamzxy[i]*axy + Gamzxz[i]*axz + Gamzyz[i]*ayz )
);
}
RHS_KERNEL_TIMER_ADD(KB_GEOM_GAMMA, timer_geom_gamma);
RHS_KERNEL_TIMER_DECL(timer_ricci_metric);
// 22.3ms //
fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev);
@@ -332,7 +410,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
double lfxx = gxxx[i] + gxyy[i] + gxzz[i];
double lfxy = gxyx[i] + gyyy[i] + gyzz[i];
double lfxz = gxzx[i] + gyzy[i] + gzzz[i];
fxx[i] = lfxx; fxy[i] = lfxy; fxz[i] = lfxz;
double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
+ TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] );
@@ -686,69 +763,74 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ Gamxyz[i] * gzzx[i] + Gamyyz[i] * gzzy[i] + Gamzyz[i] * gzzz[i]
);
}
RHS_KERNEL_TIMER_ADD(KB_RICCI_METRIC, timer_ricci_metric);
RHS_KERNEL_TIMER_DECL(timer_chi_lapse);
// 22.3ms //
fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
// 7ms //
for (int i=0;i<all;i+=1) {
fxx[i] = fxx[i] - Gamxxx[i] * chix[i] - Gamyxx[i] * chiy[i] - Gamzxx[i] * chiz[i];
fxy[i] = fxy[i] - Gamxxy[i] * chix[i] - Gamyxy[i] * chiy[i] - Gamzxy[i] * chiz[i];
fxz[i] = fxz[i] - Gamxxz[i] * chix[i] - Gamyxz[i] * chiy[i] - Gamzxz[i] * chiz[i];
fyy[i] = fyy[i] - Gamxyy[i] * chix[i] - Gamyyy[i] * chiy[i] - Gamzyy[i] * chiz[i];
fyz[i] = fyz[i] - Gamxyz[i] * chix[i] - Gamyyz[i] * chiy[i] - Gamzyz[i] * chiz[i];
fzz[i] = fzz[i] - Gamxzz[i] * chix[i] - Gamyzz[i] * chiy[i] - Gamzzz[i] * chiz[i];
f[i] =
gupxx[i] * (fxx[i] - (F3o2 / chin1[i]) * chix[i] * chix[i])
+ gupyy[i] * (fyy[i] - (F3o2 / chin1[i]) * chiy[i] * chiy[i])
+ gupzz[i] * (fzz[i] - (F3o2 / chin1[i]) * chiz[i] * chiz[i])
+ TWO * gupxy[i] * (fxy[i] - (F3o2 / chin1[i]) * chix[i] * chiy[i])
+ TWO * gupxz[i] * (fxz[i] - (F3o2 / chin1[i]) * chix[i] * chiz[i])
+ TWO * gupyz[i] * (fyz[i] - (F3o2 / chin1[i]) * chiy[i] * chiz[i]);
Rxx[i] = Rxx[i] + ( fxx[i] - (chix[i] * chix[i]) / (chin1[i] * TWO) + (dxx[i] + ONE) * f[i] ) / (chin1[i] * TWO);
Ryy[i] = Ryy[i] + ( fyy[i] - (chiy[i] * chiy[i]) / (chin1[i] * TWO) + (dyy[i] + ONE) * f[i] ) / (chin1[i] * TWO);
Rzz[i] = Rzz[i] + ( fzz[i] - (chiz[i] * chiz[i]) / (chin1[i] * TWO) + (dzz[i] + ONE) * f[i] ) / (chin1[i] * TWO);
const double inv_chin1 = ONE / chin1[i];
const double half_inv_chin1 = HALF * inv_chin1;
const double scaled_inv = F3o2 * inv_chin1;
const double cxx = fxx[i] - Gamxxx[i] * chix[i] - Gamyxx[i] * chiy[i] - Gamzxx[i] * chiz[i];
const double cxy = fxy[i] - Gamxxy[i] * chix[i] - Gamyxy[i] * chiy[i] - Gamzxy[i] * chiz[i];
const double cxz = fxz[i] - Gamxxz[i] * chix[i] - Gamyxz[i] * chiy[i] - Gamzxz[i] * chiz[i];
const double cyy = fyy[i] - Gamxyy[i] * chix[i] - Gamyyy[i] * chiy[i] - Gamzyy[i] * chiz[i];
const double cyz = fyz[i] - Gamxyz[i] * chix[i] - Gamyyz[i] * chiy[i] - Gamzyz[i] * chiz[i];
const double czz = fzz[i] - Gamxzz[i] * chix[i] - Gamyzz[i] * chiy[i] - Gamzzz[i] * chiz[i];
const double ricci_chi =
gupxx[i] * (cxx - scaled_inv * chix[i] * chix[i])
+ gupyy[i] * (cyy - scaled_inv * chiy[i] * chiy[i])
+ gupzz[i] * (czz - scaled_inv * chiz[i] * chiz[i])
+ TWO * gupxy[i] * (cxy - scaled_inv * chix[i] * chiy[i])
+ TWO * gupxz[i] * (cxz - scaled_inv * chix[i] * chiz[i])
+ TWO * gupyz[i] * (cyz - scaled_inv * chiy[i] * chiz[i]);
f[i] = ricci_chi;
Rxx[i] = Rxx[i] + ( cxx - half_inv_chin1 * chix[i] * chix[i] + (dxx[i] + ONE) * ricci_chi ) * half_inv_chin1;
Ryy[i] = Ryy[i] + ( cyy - half_inv_chin1 * chiy[i] * chiy[i] + (dyy[i] + ONE) * ricci_chi ) * half_inv_chin1;
Rzz[i] = Rzz[i] + ( czz - half_inv_chin1 * chiz[i] * chiz[i] + (dzz[i] + ONE) * ricci_chi ) * half_inv_chin1;
Rxy[i] = Rxy[i] + ( fxy[i] - (chix[i] * chiy[i]) / (chin1[i] * TWO) + gxy[i] * f[i] ) / (chin1[i] * TWO);
Rxz[i] = Rxz[i] + ( fxz[i] - (chix[i] * chiz[i]) / (chin1[i] * TWO) + gxz[i] * f[i] ) / (chin1[i] * TWO);
Ryz[i] = Ryz[i] + ( fyz[i] - (chiy[i] * chiz[i]) / (chin1[i] * TWO) + gyz[i] * f[i] ) / (chin1[i] * TWO);
Rxy[i] = Rxy[i] + ( cxy - half_inv_chin1 * chix[i] * chiy[i] + gxy[i] * ricci_chi ) * half_inv_chin1;
Rxz[i] = Rxz[i] + ( cxz - half_inv_chin1 * chix[i] * chiz[i] + gxz[i] * ricci_chi ) * half_inv_chin1;
Ryz[i] = Ryz[i] + ( cyz - half_inv_chin1 * chiy[i] * chiz[i] + gyz[i] * ricci_chi ) * half_inv_chin1;
}
// 24ms //
fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
// 6ms //
for (int i=0;i<all;i+=1) {
/* gxxx,gxxy,gxxz (这里是“升指标后的chi导数/chi”那类量你沿用原变量名即可) */
gxxx[i] = (gupxx[i] * chix[i] + gupxy[i] * chiy[i] + gupxz[i] * chiz[i]) / chin1[i];
gxxy[i] = (gupxy[i] * chix[i] + gupyy[i] * chiy[i] + gupyz[i] * chiz[i]) / chin1[i];
gxxz[i] = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) / chin1[i];
const double inv_chin1 = ONE / chin1[i];
const double gchi_x = (gupxx[i] * chix[i] + gupxy[i] * chiy[i] + gupxz[i] * chiz[i]) * inv_chin1;
const double gchi_y = (gupxy[i] * chix[i] + gupyy[i] * chiy[i] + gupyz[i] * chiz[i]) * inv_chin1;
const double gchi_z = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) * inv_chin1;
/* Christoffel 修正项 */
Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) / chin1[i]) - (dxx[i] + ONE) * gxxx[i] ) * HALF;
Gamyxx[i] = Gamyxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxy[i] ) * HALF; /* 原式只有 -gxx*gxxy */
Gamzxx[i] = Gamzxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxz[i] ) * HALF;
Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) * inv_chin1) - (dxx[i] + ONE) * gchi_x ) * HALF;
Gamyxx[i] = Gamyxx[i] - ( 0.0 - (dxx[i] + ONE) * gchi_y ) * HALF; /* 原式只有 -gxx*gxxy */
Gamzxx[i] = Gamzxx[i] - ( 0.0 - (dxx[i] + ONE) * gchi_z ) * HALF;
Gamxyy[i] = Gamxyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxx[i] ) * HALF;
Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) / chin1[i]) - (dyy[i] + ONE) * gxxy[i] ) * HALF;
Gamzyy[i] = Gamzyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxz[i] ) * HALF;
Gamxyy[i] = Gamxyy[i] - ( 0.0 - (dyy[i] + ONE) * gchi_x ) * HALF;
Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) * inv_chin1) - (dyy[i] + ONE) * gchi_y ) * HALF;
Gamzyy[i] = Gamzyy[i] - ( 0.0 - (dyy[i] + ONE) * gchi_z ) * HALF;
Gamxzz[i] = Gamxzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxx[i] ) * HALF;
Gamyzz[i] = Gamyzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxy[i] ) * HALF;
Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) / chin1[i]) - (dzz[i] + ONE) * gxxz[i] ) * HALF;
Gamxzz[i] = Gamxzz[i] - ( 0.0 - (dzz[i] + ONE) * gchi_x ) * HALF;
Gamyzz[i] = Gamyzz[i] - ( 0.0 - (dzz[i] + ONE) * gchi_y ) * HALF;
Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) * inv_chin1) - (dzz[i] + ONE) * gchi_z ) * HALF;
Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] / chin1[i]) - gxy[i] * gxxx[i] ) * HALF;
Gamyxy[i] = Gamyxy[i] - ( ( chix[i] / chin1[i]) - gxy[i] * gxxy[i] ) * HALF;
Gamzxy[i] = Gamzxy[i] - ( 0.0 - gxy[i] * gxxz[i] ) * HALF;
Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] * inv_chin1) - gxy[i] * gchi_x ) * HALF;
Gamyxy[i] = Gamyxy[i] - ( ( chix[i] * inv_chin1) - gxy[i] * gchi_y ) * HALF;
Gamzxy[i] = Gamzxy[i] - ( 0.0 - gxy[i] * gchi_z ) * HALF;
Gamxxz[i] = Gamxxz[i] - ( ( chiz[i] / chin1[i]) - gxz[i] * gxxx[i] ) * HALF;
Gamyxz[i] = Gamyxz[i] - ( 0.0 - gxz[i] * gxxy[i] ) * HALF;
Gamzxz[i] = Gamzxz[i] - ( ( chix[i] / chin1[i]) - gxz[i] * gxxz[i] ) * HALF;
Gamxxz[i] = Gamxxz[i] - ( ( chiz[i] * inv_chin1) - gxz[i] * gchi_x ) * HALF;
Gamyxz[i] = Gamyxz[i] - ( 0.0 - gxz[i] * gchi_y ) * HALF;
Gamzxz[i] = Gamzxz[i] - ( ( chix[i] * inv_chin1) - gxz[i] * gchi_z ) * HALF;
Gamxyz[i] = Gamxyz[i] - ( 0.0 - gyz[i] * gxxx[i] ) * HALF;
Gamyyz[i] = Gamyyz[i] - ( ( chiz[i] / chin1[i]) - gyz[i] * gxxy[i] ) * HALF;
Gamzyz[i] = Gamzyz[i] - ( ( chiy[i] / chin1[i]) - gyz[i] * gxxz[i] ) * HALF;
Gamxyz[i] = Gamxyz[i] - ( 0.0 - gyz[i] * gchi_x ) * HALF;
Gamyyz[i] = Gamyyz[i] - ( ( chiz[i] * inv_chin1) - gyz[i] * gchi_y ) * HALF;
Gamzyz[i] = Gamzyz[i] - ( ( chiy[i] * inv_chin1) - gyz[i] * gchi_z ) * HALF;
/* fxx..fyz 修正:减去 Γ * ∂Lap */
fxx[i] = fxx[i] - Gamxxx[i] * Lapx[i] - Gamyxx[i] * Lapy[i] - Gamzxx[i] * Lapz[i];
@@ -762,6 +844,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] );
}
RHS_KERNEL_TIMER_ADD(KB_CHI_LAPSE, timer_chi_lapse);
RHS_KERNEL_TIMER_DECL(timer_aij_trk_gauge);
// 2.5ms //
for (int i=0;i<all;i+=1) {
const double divb = betaxx[i] + betayy[i] + betazz[i];
@@ -1062,6 +1146,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
#endif
}
RHS_KERNEL_TIMER_ADD(KB_AIJ_TRK_GAUGE, timer_aij_trk_gauge);
RHS_KERNEL_TIMER_DECL(timer_ko_constraint);
// advection + KO dissipation with shared symmetry buffer
lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
@@ -1193,6 +1279,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
}
}
RHS_KERNEL_TIMER_ADD(KB_KO_CONSTRAINT, timer_ko_constraint);

View File

@@ -1514,6 +1514,81 @@ f_out = f_out*dX*dY*dZ
return
end subroutine l2normhelper
!--------------------------------------------------------------------------------------
subroutine l2normhelper7(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
f1,f2,f3,f4,f5,f6,f7,f_out,gw)
implicit none
!~~~~~~> Input parameters:
integer,intent(in ):: ex(1:3)
real*8, intent(in ):: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3)),xmin,ymin,zmin,xmax,ymax,zmax
integer,intent(in)::gw
real*8, dimension(ex(1),ex(2),ex(3)),intent(in) :: f1,f2,f3,f4,f5,f6,f7
real*8, intent(out) :: f_out(7)
!~~~~~~> Other variables:
real*8 :: dX, dY, dZ
integer::imin,jmin,kmin
integer::imax,jmax,kmax
integer::i,j,k
real*8 :: s1,s2,s3,s4,s5,s6,s7
dX = X(2) - X(1)
dY = Y(2) - Y(1)
dZ = Z(2) - Z(1)
! for ghost zone
imin = gw+1
jmin = gw+1
kmin = gw+1
imax = ex(1) - gw
jmax = ex(2) - gw
kmax = ex(3) - gw
!for patch boundary (i.e., not ghost boundary)
if(dabs(X(ex(1))-xmax) < dX) imax = ex(1)
if(dabs(Y(ex(2))-ymax) < dY) jmax = ex(2)
if(dabs(Z(ex(3))-zmax) < dZ) kmax = ex(3)
if(dabs(X(1)-xmin) < dX) imin = 1
if(dabs(Y(1)-ymin) < dY) jmin = 1
if(dabs(Z(1)-zmin) < dZ) kmin = 1
s1 = 0.d0
s2 = 0.d0
s3 = 0.d0
s4 = 0.d0
s5 = 0.d0
s6 = 0.d0
s7 = 0.d0
do k=kmin,kmax
do j=jmin,jmax
!DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7)
do i=imin,imax
s1 = s1 + f1(i,j,k)*f1(i,j,k)
s2 = s2 + f2(i,j,k)*f2(i,j,k)
s3 = s3 + f3(i,j,k)*f3(i,j,k)
s4 = s4 + f4(i,j,k)*f4(i,j,k)
s5 = s5 + f5(i,j,k)*f5(i,j,k)
s6 = s6 + f6(i,j,k)*f6(i,j,k)
s7 = s7 + f7(i,j,k)*f7(i,j,k)
enddo
enddo
enddo
f_out(1) = s1*dX*dY*dZ
f_out(2) = s2*dX*dY*dZ
f_out(3) = s3*dX*dY*dZ
f_out(4) = s4*dX*dY*dZ
f_out(5) = s5*dX*dY*dZ
f_out(6) = s6*dX*dY*dZ
f_out(7) = s7*dX*dY*dZ
return
end subroutine l2normhelper7
!--------------------------------------------------------------------------------------
! calculate L2norm especially for shell Blocks
subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&

View File

@@ -13,6 +13,7 @@
#define f_global_interpind2d global_interpind2d
#define f_global_interpind1d global_interpind1d
#define f_l2normhelper l2normhelper
#define f_l2normhelper7 l2normhelper7
#define f_l2normhelper_sh l2normhelper_sh
#define f_l2normhelper_sh_rms l2normhelper_sh_rms
#define f_average average
@@ -42,6 +43,7 @@
#define f_global_interpind2d GLOBAL_INTERPIND2D
#define f_global_interpind1d GLOBAL_INTERPIND1D
#define f_l2normhelper L2NORMHELPER
#define f_l2normhelper7 L2NORMHELPER7
#define f_l2normhelper_sh L2NORMHELPER_SH
#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
#define f_average AVERAGE
@@ -71,6 +73,7 @@
#define f_global_interpind2d global_interpind2d_
#define f_global_interpind1d global_interpind1d_
#define f_l2normhelper l2normhelper_
#define f_l2normhelper7 l2normhelper7_
#define f_l2normhelper_sh l2normhelper_sh_
#define f_l2normhelper_sh_rms l2normhelper_sh_rms_
#define f_average average_
@@ -164,6 +167,15 @@ extern "C"
double *, double &, int &);
}
extern "C"
{
void f_l2normhelper7(int *, double *, double *, double *,
double &, double &, double &,
double &, double &, double &,
double *, double *, double *, double *,
double *, double *, double *, double *, int &);
}
extern "C"
{
void f_l2normhelper_sh(int *, double *, double *, double *,

View File

@@ -17,65 +17,103 @@ using namespace std;
#include <math.h>
#endif
// Intel oneMKL LAPACK interface
#include <mkl_lapacke.h>
/* Linear equation solution using Intel oneMKL LAPACK.
/* Linear equation solution by Gauss-Jordan elimination.
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
containing the right-hand side vectors. On output a is
replaced by its matrix inverse, and b is replaced by the
corresponding set of solution vectors.
Mathematical equivalence:
Solves: A * x = b => x = A^(-1) * b
Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
within numerical precision. */
corresponding set of solution vectors. */
int gaussj(double *a, double *b, int n)
{
// Allocate pivot array and workspace
lapack_int *ipiv = new lapack_int[n];
lapack_int info;
double swap;
// Make a copy of matrix a for solving (dgesv modifies it to LU form)
double *a_copy = new double[n * n];
for (int i = 0; i < n * n; i++) {
a_copy[i] = a[i];
int *indxc, *indxr, *ipiv;
indxc = new int[n];
indxr = new int[n];
ipiv = new int[n];
int i, icol, irow, j, k, l, ll;
double big, dum, pivinv;
for (j = 0; j < n; j++)
ipiv[j] = 0;
for (i = 0; i < n; i++)
{
big = 0.0;
for (j = 0; j < n; j++)
if (ipiv[j] != 1)
for (k = 0; k < n; k++)
{
if (ipiv[k] == 0)
{
if (fabs(a[j * n + k]) >= big)
{
big = fabs(a[j * n + k]);
irow = j;
icol = k;
}
}
else if (ipiv[k] > 1)
{
cout << "gaussj: Singular Matrix-1" << endl;
return 1;
}
}
// Step 1: Solve linear system A*x = b using LU decomposition
// LAPACKE_dgesv uses column-major by default, but we use row-major
info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1);
ipiv[icol] = ipiv[icol] + 1;
if (irow != icol)
{
for (l = 0; l < n; l++)
{
swap = a[irow * n + l];
a[irow * n + l] = a[icol * n + l];
a[icol * n + l] = swap;
}
if (info != 0) {
cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
swap = b[irow];
b[irow] = b[icol];
b[icol] = swap;
}
indxr[i] = irow;
indxc[i] = icol;
if (a[icol * n + icol] == 0.0)
{
cout << "gaussj: Singular Matrix-2" << endl;
return 1;
}
// Step 2: Compute matrix inverse A^(-1) using LU factorization
// First do LU factorization of original matrix a
info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
if (info != 0) {
cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
return 1;
pivinv = 1.0 / a[icol * n + icol];
a[icol * n + icol] = 1.0;
for (l = 0; l < n; l++)
a[icol * n + l] *= pivinv;
b[icol] *= pivinv;
for (ll = 0; ll < n; ll++)
if (ll != icol)
{
dum = a[ll * n + icol];
a[ll * n + icol] = 0.0;
for (l = 0; l < n; l++)
a[ll * n + l] -= a[icol * n + l] * dum;
b[ll] -= b[icol] * dum;
}
}
// Then compute inverse from LU factorization
info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
if (info != 0) {
cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
return 1;
for (l = n - 1; l >= 0; l--)
{
if (indxr[l] != indxc[l])
for (k = 0; k < n; k++)
{
swap = a[k * n + indxr[l]];
a[k * n + indxr[l]] = a[k * n + indxc[l]];
a[k * n + indxc[l]] = swap;
}
}
delete[] indxc;
delete[] indxr;
delete[] ipiv;
delete[] a_copy;
return 0;
}

View File

@@ -29,6 +29,16 @@
#define REGLEV 0
#define BSSN_FINE_TIMING 0
#define BSSN_FINE_TIMING_EVERY 1
#define BSSN_FINE_TIMING_TOPN 8
#define BSSN_KERNEL_FINE_TIMING 0
#define BSSN_ENABLE_STDIN_ABORT_POLL 0
//#define USE_GPU
//#define CHECKDETAIL
@@ -88,6 +98,21 @@
// 0: for every level;
// 1: for all
//
// define BSSN_FINE_TIMING
// enable fine-grained per-timestep timing monitor
//
// define BSSN_FINE_TIMING_EVERY
// report timing every N coarse timesteps
//
// define BSSN_FINE_TIMING_TOPN
// number of hottest timing buckets shown in stdout
//
// define BSSN_KERNEL_FINE_TIMING
// enable split timing inside compute_rhs_bssn
//
// define BSSN_ENABLE_STDIN_ABORT_POLL
// poll stdin and broadcast abort flag every coarse step
//
// define USE_GPU
// use gpu or not
//
@@ -142,4 +167,3 @@
#define TINY 1e-10
#endif /* MICRODEF_H */

View File

@@ -8,27 +8,16 @@ include makefile.inc
POLINT6_USE_BARY ?= 1
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
## make -> opt (PGO-guided, maximum performance)
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
## Legacy GNU/OpenMPI flags
CXXBASEFLAGS = -O3 -march=native -Wno-deprecated -Dfortran3 -Dnewc $(INTERP_LB_FLAGS)
F90BASEFLAGS = -O3 -march=native -cpp -fallow-argument-mismatch $(POLINT6_FLAG)
ifeq ($(PGO_MODE),instrument)
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
CXXAPPFLAGS = $(CXXBASEFLAGS)
f90appflags = $(F90BASEFLAGS)
else
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
## PGO has been turned off, now tested and found to be negative optimization
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
CXXAPPFLAGS = $(CXXBASEFLAGS)
f90appflags = $(F90BASEFLAGS)
endif
.SUFFIXES: .o .f90 .C .for .cu
@@ -68,16 +57,13 @@ lopsided_kodis_c.o: lopsided_kodis_c.C
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \
-Dfortran3 -Dnewc -I${MKLROOT}/include
TP_OPTFLAGS = $(CXXBASEFLAGS) $(TP_OPENMP_FLAGS)
TwoPunctures.o: TwoPunctures.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
${CXX} $(TP_OPTFLAGS) -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
${CXX} $(TP_OPTFLAGS) -c $< -o $@
# Input files
@@ -185,7 +171,7 @@ ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILE
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
$(CLINKER) $(TP_OPTFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f

56
AMSS_NCKU_source/makefile.inc Executable file → Normal file
View File

@@ -1,33 +1,27 @@
## GCC version (commented out)
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Legacy GNU/OpenMPI toolchain configuration
## Intel oneAPI version with oneMKL (Optimized for performance)
filein = -I/usr/include/ -I${MKLROOT}/include
## OpenMPI wrappers are installed but may not be on PATH.
OMPI_BIN ?= /usr/lib64/openmpi/bin
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
## Wrapper compilers
f90 = $(OMPI_BIN)/mpifort
f77 = $(OMPI_BIN)/mpifort
CXX = $(OMPI_BIN)/mpicxx
CC = $(OMPI_BIN)/mpicc
CLINKER = $(OMPI_BIN)/mpicxx
## Memory allocator switch
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
## 0 : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
## Extra include flags are not needed when using the OpenMPI wrappers.
filein =
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
## opt : (default) maximum performance with PGO profile-guided optimization
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
PGO_MODE ?= opt
## BLAS/LAPACK backend:
## OpenBLAS on this system provides BLAS, CBLAS and LAPACK symbols.
BLAS_LAPACK_LIB ?= /lib64/libopenblaso.so.0
LDLIBS = $(BLAS_LAPACK_LIB) -lgfortran -lpthread -lm -ldl
## PGO build mode switch
## off : default legacy GNU build without PGO
## instrument : accepted for compatibility, currently same as off
PGO_MODE ?= off
## Interp_Points load balance profiling mode
## off : (default) no load balance instrumentation
@@ -49,17 +43,13 @@ endif
USE_CXX_KERNELS ?= 1
## RK4 kernel implementation switch
## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
## 1 (default) : use C/C++ rewrite of rungekutta4_rout
## 0 : use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1
f90 = ifx
f77 = ifx
CXX = icpx
CC = icx
CLINKER = mpiicpx
## OpenMP is only used for TwoPunctures on the legacy toolchain.
TP_OPENMP_FLAGS ?= -fopenmp
Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc

View File

@@ -36,7 +36,14 @@ using namespace std;
//| Constructor
//|============================================================================
surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry)
surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry),
wave_cache_spinw(-1),
wave_cache_maxl(-1),
wave_cache_modes(0),
wave_theta_pos(0),
wave_theta_neg(0),
wave_phi_cos(0),
wave_phi_sin(0)
{
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
@@ -182,6 +189,7 @@ surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry)
//|============================================================================
surface_integral::~surface_integral()
{
clear_wave_cache();
delete[] nx_g;
delete[] ny_g;
delete[] nz_g;
@@ -190,6 +198,65 @@ surface_integral::~surface_integral()
delete[] wtcostheta;
#endif
}
void surface_integral::clear_wave_cache()
{
delete[] wave_theta_pos;
delete[] wave_theta_neg;
delete[] wave_phi_cos;
delete[] wave_phi_sin;
wave_theta_pos = 0;
wave_theta_neg = 0;
wave_phi_cos = 0;
wave_phi_sin = 0;
wave_cache_spinw = -1;
wave_cache_maxl = -1;
wave_cache_modes = 0;
}
void surface_integral::build_wave_cache(int spinw, int maxl)
{
if (wave_cache_spinw == spinw && wave_cache_maxl == maxl && wave_theta_pos && wave_theta_neg && wave_phi_cos && wave_phi_sin)
return;
clear_wave_cache();
int modes = 0;
for (int pl = spinw; pl < maxl + 1; pl++)
for (int pm = -pl; pm < pl + 1; pm++)
modes++;
wave_theta_pos = new double[modes * N_theta];
wave_theta_neg = new double[modes * N_theta];
wave_phi_cos = new double[modes * N_phi];
wave_phi_sin = new double[modes * N_phi];
int countlm = 0;
for (int pl = spinw; pl < maxl + 1; pl++)
for (int pm = -pl; pm < pl + 1; pm++)
{
const double prefactor = sqrt((2 * pl + 1.0) / 4.0 / PI);
for (int i = 0; i < N_theta; i++)
{
#ifdef GaussInt
const double weight = wtcostheta[i];
#else
const double weight = 1.0;
#endif
wave_theta_pos[countlm * N_theta + i] = prefactor * misc::Wigner_d_function(pl, pm, spinw, arcostheta[i]) * weight;
wave_theta_neg[countlm * N_theta + i] = prefactor * misc::Wigner_d_function(pl, pm, spinw, -arcostheta[i]) * weight;
}
for (int j = 0; j < N_phi; j++)
{
const double phase = pm * (j + 0.5) * dphi;
wave_phi_cos[countlm * N_phi + j] = cos(phase);
wave_phi_sin[countlm * N_phi + j] = sin(phase);
}
countlm++;
}
wave_cache_spinw = spinw;
wave_cache_maxl = maxl;
wave_cache_modes = modes;
}
//|----------------------------------------------------------------
// spin weighted spinw component of psi4, general routine
// l takes from spinw to maxl; m takes from -l to l
@@ -264,6 +331,39 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
lpsy = 8;
double psi4RR, psi4II;
if (Symmetry == 0 || Symmetry == 1)
{
build_wave_cache(spinw, maxl);
for (n = Nmin; n <= Nmax; n++)
{
i = int(n / N_phi);
j = n - i * N_phi;
const double psi4RR0 = shellf[InList * n];
const double psi4II0 = shellf[InList * n + 1];
const double psi4RR1 = Rpsi4->SoA[2] * psi4RR0;
const double psi4II1 = Ipsi4->SoA[2] * psi4II0;
for (int countlm = 0; countlm < wave_cache_modes; countlm++)
{
const int theta_idx = countlm * N_theta + i;
const int phi_idx = countlm * N_phi + j;
const double theta_pos = wave_theta_pos[theta_idx];
const double cosmphi_here = wave_phi_cos[phi_idx];
const double sinmphi_here = wave_phi_sin[phi_idx];
RP_out[countlm] += theta_pos * (psi4RR0 * cosmphi_here + psi4II0 * sinmphi_here);
IP_out[countlm] += theta_pos * (psi4II0 * cosmphi_here - psi4RR0 * sinmphi_here);
if (Symmetry == 1)
{
const double theta_neg = wave_theta_neg[theta_idx];
RP_out[countlm] += theta_neg * (psi4RR1 * cosmphi_here + psi4II1 * sinmphi_here);
IP_out[countlm] += theta_neg * (psi4II1 * cosmphi_here - psi4RR1 * sinmphi_here);
}
}
}
}
else
{
for (n = Nmin; n <= Nmax; n++)
{
// need round off always
@@ -348,6 +448,7 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
countlm++; // no sanity check for countlm and NN which should be noted in the input parameters
}
}
}
for (int ii = 0; ii < NN; ii++)
{
@@ -466,6 +567,39 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
lpsy = 8;
double psi4RR, psi4II;
if (Symmetry == 0 || Symmetry == 1)
{
build_wave_cache(spinw, maxl);
for (n = Nmin; n <= Nmax; n++)
{
i = int(n / N_phi);
j = n - i * N_phi;
const double psi4RR0 = shellf[InList * n];
const double psi4II0 = shellf[InList * n + 1];
const double psi4RR1 = Rpsi4->SoA[2] * psi4RR0;
const double psi4II1 = Ipsi4->SoA[2] * psi4II0;
for (int countlm = 0; countlm < wave_cache_modes; countlm++)
{
const int theta_idx = countlm * N_theta + i;
const int phi_idx = countlm * N_phi + j;
const double theta_pos = wave_theta_pos[theta_idx];
const double cosmphi_here = wave_phi_cos[phi_idx];
const double sinmphi_here = wave_phi_sin[phi_idx];
RP_out[countlm] += theta_pos * (psi4RR0 * cosmphi_here + psi4II0 * sinmphi_here);
IP_out[countlm] += theta_pos * (psi4II0 * cosmphi_here - psi4RR0 * sinmphi_here);
if (Symmetry == 1)
{
const double theta_neg = wave_theta_neg[theta_idx];
RP_out[countlm] += theta_neg * (psi4RR1 * cosmphi_here + psi4II1 * sinmphi_here);
IP_out[countlm] += theta_neg * (psi4II1 * cosmphi_here - psi4RR1 * sinmphi_here);
}
}
}
}
else
{
for (n = Nmin; n <= Nmax; n++)
{
// need round off always
@@ -550,6 +684,7 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
countlm++; // no sanity check for countlm and NN which should be noted in the input parameters
}
}
}
for (int ii = 0; ii < NN; ii++)
{
@@ -2319,7 +2454,7 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
double *Rout, monitor *Monitor)
double *Rout, monitor *Monitor, bool refresh_mass_fields)
{
if (myrank == 0 && GH->grids[lev] != 1)
if (Monitor && Monitor->outfile)
@@ -2329,6 +2464,8 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
double mass, px, py, pz, sx, sy, sz;
if (refresh_mass_fields)
{
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
@@ -2352,6 +2489,7 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
}
Pp = Pp->next;
}
}
const int InList = 17;
@@ -2581,7 +2719,7 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
double *Rout, monitor *Monitor, MPI_Comm Comm_here)
double *Rout, monitor *Monitor, MPI_Comm Comm_here, bool refresh_mass_fields)
{
int lmyrank;
MPI_Comm_rank(Comm_here, &lmyrank);
@@ -2593,6 +2731,8 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
double mass, px, py, pz, sx, sy, sz;
if (refresh_mass_fields)
{
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
@@ -2616,6 +2756,7 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
}
Pp = Pp->next;
}
}
const int InList = 17;
@@ -2853,7 +2994,7 @@ void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *c
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
double *Rout, monitor *Monitor)
double *Rout, monitor *Monitor, bool refresh_mass_fields)
{
if (lev != 0)
{
@@ -2869,6 +3010,8 @@ void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *c
double mass, px, py, pz, sx, sy, sz;
if (refresh_mass_fields)
{
MyList<ss_patch> *Pp = GH->PatL;
while (Pp)
{
@@ -2903,6 +3046,7 @@ void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *c
}
Pp = Pp->next;
}
}
const int InList = 17;
@@ -3128,6 +3272,626 @@ void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *c
delete[] shellf;
DG_List->clearList();
}
void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields)
{
if (Symmetry != 0 && Symmetry != 1)
{
surf_Wave(rex, lev, GH, Rpsi4, Ipsi4, spinw, maxl, NN, RP, IP, Monitor);
surf_MassPAng(rex, lev, GH, chi, trK,
gxx, gxy, gxz, gyy, gyz, gzz,
Axx, Axy, Axz, Ayy, Ayz, Azz,
Gmx, Gmy, Gmz,
Sfx_rhs, Sfy_rhs, Sfz_rhs,
Rout, Monitor, refresh_mass_fields);
return;
}
if (myrank == 0 && GH->grids[lev] != 1)
if (Monitor && Monitor->outfile)
Monitor->outfile << "WARNING: surface integral on multipatches" << endl;
else
cout << "WARNING: surface integral on multipatches" << endl;
if (refresh_mass_fields)
{
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
f_admmass_bssn(cg->shape, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[chi->sgfn], cg->fgfs[trK->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn],
Symmetry);
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
}
const int InList = 19;
const int idx_rpsi4 = 0, idx_ipsi4 = 1;
const int idx_sfx = 2, idx_sfy = 3, idx_sfz = 4;
const int idx_chi = 5, idx_trk = 6;
const int idx_gxx = 7, idx_gxy = 8, idx_gxz = 9, idx_gyy = 10, idx_gyz = 11, idx_gzz = 12;
const int idx_axx = 13, idx_axy = 14, idx_axz = 15, idx_ayy = 16, idx_ayz = 17, idx_azz = 18;
MyList<var> *DG_List = new MyList<var>(Rpsi4);
DG_List->insert(Ipsi4);
DG_List->insert(Sfx_rhs);
DG_List->insert(Sfy_rhs);
DG_List->insert(Sfz_rhs);
DG_List->insert(chi);
DG_List->insert(trK);
DG_List->insert(gxx);
DG_List->insert(gxy);
DG_List->insert(gxz);
DG_List->insert(gyy);
DG_List->insert(gyz);
DG_List->insert(gzz);
DG_List->insert(Axx);
DG_List->insert(Axy);
DG_List->insert(Axz);
DG_List->insert(Ayy);
DG_List->insert(Ayz);
DG_List->insert(Azz);
int n;
double *pox[3];
for (int ia = 0; ia < 3; ia++)
pox[ia] = new double[n_tot];
for (n = 0; n < n_tot; n++)
{
pox[0][n] = rex * nx_g[n];
pox[1][n] = rex * ny_g[n];
pox[2][n] = rex * nz_g[n];
}
int mp, Lp, Nmin, Nmax;
mp = n_tot / cpusize;
Lp = n_tot - cpusize * mp;
if (Lp > myrank)
{
Nmin = myrank * mp + myrank;
Nmax = Nmin + mp;
}
else
{
Nmin = myrank * mp + Lp;
Nmax = Nmin + mp - 1;
}
double *shellf = new double[n_tot * InList];
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
double *RP_out = new double[NN];
double *IP_out = new double[NN];
for (int ii = 0; ii < NN; ii++)
{
RP_out[ii] = 0.0;
IP_out[ii] = 0.0;
}
double Mass_out = 0.0;
double ang_outx = 0.0, ang_outy = 0.0, ang_outz = 0.0;
double p_outx = 0.0, p_outy = 0.0, p_outz = 0.0;
const double f1o8 = 0.125;
build_wave_cache(spinw, maxl);
for (n = Nmin; n <= Nmax; n++)
{
const int base = InList * n;
const int i = int(n / N_phi);
const int j = n - i * N_phi;
const double psi4RR0 = shellf[base + idx_rpsi4];
const double psi4II0 = shellf[base + idx_ipsi4];
const double psi4RR1 = Rpsi4->SoA[2] * psi4RR0;
const double psi4II1 = Ipsi4->SoA[2] * psi4II0;
for (int countlm = 0; countlm < wave_cache_modes; countlm++)
{
const int theta_idx = countlm * N_theta + i;
const int phi_idx = countlm * N_phi + j;
const double theta_pos = wave_theta_pos[theta_idx];
const double cosmphi_here = wave_phi_cos[phi_idx];
const double sinmphi_here = wave_phi_sin[phi_idx];
RP_out[countlm] += theta_pos * (psi4RR0 * cosmphi_here + psi4II0 * sinmphi_here);
IP_out[countlm] += theta_pos * (psi4II0 * cosmphi_here - psi4RR0 * sinmphi_here);
if (Symmetry == 1)
{
const double theta_neg = wave_theta_neg[theta_idx];
RP_out[countlm] += theta_neg * (psi4RR1 * cosmphi_here + psi4II1 * sinmphi_here);
IP_out[countlm] += theta_neg * (psi4II1 * cosmphi_here - psi4RR1 * sinmphi_here);
}
}
double Chi = shellf[base + idx_chi];
double TRK = shellf[base + idx_trk];
double Gxx = shellf[base + idx_gxx] + 1.0;
double Gxy = shellf[base + idx_gxy];
double Gxz = shellf[base + idx_gxz];
double Gyy = shellf[base + idx_gyy] + 1.0;
double Gyz = shellf[base + idx_gyz];
double Gzz = shellf[base + idx_gzz] + 1.0;
double axx = shellf[base + idx_axx];
double axy = shellf[base + idx_axy];
double axz = shellf[base + idx_axz];
double ayy = shellf[base + idx_ayy];
double ayz = shellf[base + idx_ayz];
double azz = shellf[base + idx_azz];
Chi = 1.0 / (1.0 + Chi);
const double Psi = Chi * sqrt(Chi);
#ifdef GaussInt
const double theta_weight = wtcostheta[i];
Mass_out += (shellf[base + idx_sfx] * nx_g[n] + shellf[base + idx_sfy] * ny_g[n] + shellf[base + idx_sfz] * nz_g[n]) * theta_weight;
#else
const double theta_weight = 1.0;
Mass_out += shellf[base + idx_sfx] * nx_g[n] + shellf[base + idx_sfy] * ny_g[n] + shellf[base + idx_sfz] * nz_g[n];
#endif
double detg = Gxx * Gyy * Gzz + Gxy * Gyz * Gxz + Gxz * Gxy * Gyz -
Gxz * Gyy * Gxz - Gxy * Gxy * Gzz - Gxx * Gyz * Gyz;
const double gupxx = (Gyy * Gzz - Gyz * Gyz) / detg;
const double gupxy = -(Gxy * Gzz - Gyz * Gxz) / detg;
const double gupxz = (Gxy * Gyz - Gyy * Gxz) / detg;
const double gupyy = (Gxx * Gzz - Gxz * Gxz) / detg;
const double gupyz = -(Gxx * Gyz - Gxy * Gxz) / detg;
const double gupzz = (Gxx * Gyy - Gxy * Gxy) / detg;
const double aupxx = gupxx * axx + gupxy * axy + gupxz * axz;
const double aupxy = gupxx * axy + gupxy * ayy + gupxz * ayz;
const double aupxz = gupxx * axz + gupxy * ayz + gupxz * azz;
const double aupyx = gupxy * axx + gupyy * axy + gupyz * axz;
const double aupyy = gupxy * axy + gupyy * ayy + gupyz * ayz;
const double aupyz = gupxy * axz + gupyy * ayz + gupyz * azz;
const double aupzx = gupxz * axx + gupyz * axy + gupzz * axz;
const double aupzy = gupxz * axy + gupyz * ayy + gupzz * ayz;
const double aupzz = gupxz * axz + gupyz * ayz + gupzz * azz;
if (Symmetry == 0)
{
ang_outx += f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy)) * theta_weight;
ang_outy += f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz)) * theta_weight;
ang_outz += f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * theta_weight;
}
else
{
ang_outz += f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * theta_weight;
}
axx = Chi * (axx + Gxx * TRK / 3.0);
axy = Chi * (axy + Gxy * TRK / 3.0);
axz = Chi * (axz + Gxz * TRK / 3.0);
ayy = Chi * (ayy + Gyy * TRK / 3.0);
ayz = Chi * (ayz + Gyz * TRK / 3.0);
azz = Chi * (azz + Gzz * TRK / 3.0);
axx -= TRK;
ayy -= TRK;
azz -= TRK;
p_outx += f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * theta_weight;
p_outy += f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * theta_weight;
if (Symmetry == 0)
p_outz += f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz) * theta_weight;
}
for (int ii = 0; ii < NN; ii++)
{
#ifdef GaussInt
RP_out[ii] = RP_out[ii] * rex * dphi;
IP_out[ii] = IP_out[ii] * rex * dphi;
#else
RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
#endif
}
double mass, px, py, pz, sx, sy, sz;
{
double *reduce_out = new double[2 * NN + 7];
double *reduce_in = new double[2 * NN + 7];
memcpy(reduce_out, RP_out, NN * sizeof(double));
memcpy(reduce_out + NN, IP_out, NN * sizeof(double));
reduce_out[2 * NN + 0] = Mass_out;
reduce_out[2 * NN + 1] = ang_outx;
reduce_out[2 * NN + 2] = ang_outy;
reduce_out[2 * NN + 3] = ang_outz;
reduce_out[2 * NN + 4] = p_outx;
reduce_out[2 * NN + 5] = p_outy;
reduce_out[2 * NN + 6] = p_outz;
MPI_Allreduce(reduce_out, reduce_in, 2 * NN + 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, reduce_in, NN * sizeof(double));
memcpy(IP, reduce_in + NN, NN * sizeof(double));
mass = reduce_in[2 * NN + 0];
sx = reduce_in[2 * NN + 1];
sy = reduce_in[2 * NN + 2];
sz = reduce_in[2 * NN + 3];
px = reduce_in[2 * NN + 4];
py = reduce_in[2 * NN + 5];
pz = reduce_in[2 * NN + 6];
delete[] reduce_out;
delete[] reduce_in;
}
#ifdef GaussInt
mass = mass * rex * rex * dphi * factor;
sx = sx * rex * rex * dphi * (1.0 / PI) * factor;
sy = sy * rex * rex * dphi * (1.0 / PI) * factor;
sz = sz * rex * rex * dphi * (1.0 / PI) * factor;
px = px * rex * rex * dphi * (1.0 / PI) * factor;
py = py * rex * rex * dphi * (1.0 / PI) * factor;
pz = pz * rex * rex * dphi * (1.0 / PI) * factor;
#else
mass = mass * rex * rex * dphi * dcostheta * factor;
sx = sx * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
sy = sy * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
sz = sz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
px = px * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
py = py * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
pz = pz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
#endif
Rout[0] = mass;
Rout[1] = px;
Rout[2] = py;
Rout[3] = pz;
Rout[4] = sx;
Rout[5] = sy;
Rout[6] = sz;
delete[] pox[0];
delete[] pox[1];
delete[] pox[2];
delete[] shellf;
delete[] RP_out;
delete[] IP_out;
DG_List->clearList();
}
void surface_integral::surf_WaveMassPAng(double rex, int lev, ShellPatch *GH,
var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields)
{
if (Symmetry != 0 && Symmetry != 1)
{
surf_Wave(rex, lev, GH, Rpsi4, Ipsi4, spinw, maxl, NN, RP, IP, Monitor);
surf_MassPAng(rex, lev, GH, chi, trK,
gxx, gxy, gxz, gyy, gyz, gzz,
Axx, Axy, Axz, Ayy, Ayz, Azz,
Gmx, Gmy, Gmz,
Sfx_rhs, Sfy_rhs, Sfz_rhs,
Rout, Monitor, refresh_mass_fields);
return;
}
if (lev != 0)
{
if (myrank == 0)
{
if (Monitor && Monitor->outfile)
Monitor->outfile << "WARNING: shell surface integral not on level 0" << endl;
else
cout << "WARNING: shell surface integral not on level 0" << endl;
}
return;
}
if (refresh_mass_fields)
{
MyList<ss_patch> *Pp = GH->PatL;
while (Pp)
{
MyList<Block> *BL = Pp->data->blb;
int fngfs = Pp->data->fngfs;
while (BL)
{
Block *cg = BL->data;
if (myrank == cg->rank)
{
f_admmass_bssn_ss(cg->shape, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[fngfs + ShellPatch::gx], cg->fgfs[fngfs + ShellPatch::gy], cg->fgfs[fngfs + ShellPatch::gz],
cg->fgfs[fngfs + ShellPatch::drhodx], cg->fgfs[fngfs + ShellPatch::drhody], cg->fgfs[fngfs + ShellPatch::drhodz],
cg->fgfs[fngfs + ShellPatch::dsigmadx], cg->fgfs[fngfs + ShellPatch::dsigmady], cg->fgfs[fngfs + ShellPatch::dsigmadz],
cg->fgfs[fngfs + ShellPatch::dRdx], cg->fgfs[fngfs + ShellPatch::dRdy], cg->fgfs[fngfs + ShellPatch::dRdz],
cg->fgfs[fngfs + ShellPatch::drhodxx], cg->fgfs[fngfs + ShellPatch::drhodxy], cg->fgfs[fngfs + ShellPatch::drhodxz],
cg->fgfs[fngfs + ShellPatch::drhodyy], cg->fgfs[fngfs + ShellPatch::drhodyz], cg->fgfs[fngfs + ShellPatch::drhodzz],
cg->fgfs[fngfs + ShellPatch::dsigmadxx], cg->fgfs[fngfs + ShellPatch::dsigmadxy], cg->fgfs[fngfs + ShellPatch::dsigmadxz],
cg->fgfs[fngfs + ShellPatch::dsigmadyy], cg->fgfs[fngfs + ShellPatch::dsigmadyz], cg->fgfs[fngfs + ShellPatch::dsigmadzz],
cg->fgfs[fngfs + ShellPatch::dRdxx], cg->fgfs[fngfs + ShellPatch::dRdxy], cg->fgfs[fngfs + ShellPatch::dRdxz],
cg->fgfs[fngfs + ShellPatch::dRdyy], cg->fgfs[fngfs + ShellPatch::dRdyz], cg->fgfs[fngfs + ShellPatch::dRdzz],
cg->fgfs[chi->sgfn], cg->fgfs[trK->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
cg->fgfs[Sfx_rhs->sgfn], cg->fgfs[Sfy_rhs->sgfn], cg->fgfs[Sfz_rhs->sgfn],
Symmetry, Pp->data->sst);
}
if (BL == Pp->data->ble)
break;
BL = BL->next;
}
Pp = Pp->next;
}
}
const int InList = 19;
const int idx_rpsi4 = 0, idx_ipsi4 = 1;
const int idx_sfx = 2, idx_sfy = 3, idx_sfz = 4;
const int idx_chi = 5, idx_trk = 6;
const int idx_gxx = 7, idx_gxy = 8, idx_gxz = 9, idx_gyy = 10, idx_gyz = 11, idx_gzz = 12;
const int idx_axx = 13, idx_axy = 14, idx_axz = 15, idx_ayy = 16, idx_ayz = 17, idx_azz = 18;
MyList<var> *DG_List = new MyList<var>(Rpsi4);
DG_List->insert(Ipsi4);
DG_List->insert(Sfx_rhs);
DG_List->insert(Sfy_rhs);
DG_List->insert(Sfz_rhs);
DG_List->insert(chi);
DG_List->insert(trK);
DG_List->insert(gxx);
DG_List->insert(gxy);
DG_List->insert(gxz);
DG_List->insert(gyy);
DG_List->insert(gyz);
DG_List->insert(gzz);
DG_List->insert(Axx);
DG_List->insert(Axy);
DG_List->insert(Axz);
DG_List->insert(Ayy);
DG_List->insert(Ayz);
DG_List->insert(Azz);
int n;
double *pox[3];
for (int ia = 0; ia < 3; ia++)
pox[ia] = new double[n_tot];
for (n = 0; n < n_tot; n++)
{
pox[0][n] = rex * nx_g[n];
pox[1][n] = rex * ny_g[n];
pox[2][n] = rex * nz_g[n];
}
double *shellf = new double[n_tot * InList];
GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
int mp, Lp, Nmin, Nmax;
mp = n_tot / cpusize;
Lp = n_tot - cpusize * mp;
if (Lp > myrank)
{
Nmin = myrank * mp + myrank;
Nmax = Nmin + mp;
}
else
{
Nmin = myrank * mp + Lp;
Nmax = Nmin + mp - 1;
}
double *RP_out = new double[NN];
double *IP_out = new double[NN];
for (int ii = 0; ii < NN; ii++)
{
RP_out[ii] = 0.0;
IP_out[ii] = 0.0;
}
double Mass_out = 0.0;
double ang_outx = 0.0, ang_outy = 0.0, ang_outz = 0.0;
double p_outx = 0.0, p_outy = 0.0, p_outz = 0.0;
const double f1o8 = 0.125;
build_wave_cache(spinw, maxl);
for (n = Nmin; n <= Nmax; n++)
{
const int base = InList * n;
const int i = int(n / N_phi);
const int j = n - i * N_phi;
const double psi4RR0 = shellf[base + idx_rpsi4];
const double psi4II0 = shellf[base + idx_ipsi4];
const double psi4RR1 = Rpsi4->SoA[2] * psi4RR0;
const double psi4II1 = Ipsi4->SoA[2] * psi4II0;
for (int countlm = 0; countlm < wave_cache_modes; countlm++)
{
const int theta_idx = countlm * N_theta + i;
const int phi_idx = countlm * N_phi + j;
const double theta_pos = wave_theta_pos[theta_idx];
const double cosmphi_here = wave_phi_cos[phi_idx];
const double sinmphi_here = wave_phi_sin[phi_idx];
RP_out[countlm] += theta_pos * (psi4RR0 * cosmphi_here + psi4II0 * sinmphi_here);
IP_out[countlm] += theta_pos * (psi4II0 * cosmphi_here - psi4RR0 * sinmphi_here);
if (Symmetry == 1)
{
const double theta_neg = wave_theta_neg[theta_idx];
RP_out[countlm] += theta_neg * (psi4RR1 * cosmphi_here + psi4II1 * sinmphi_here);
IP_out[countlm] += theta_neg * (psi4II1 * cosmphi_here - psi4RR1 * sinmphi_here);
}
}
double Chi = shellf[base + idx_chi];
double TRK = shellf[base + idx_trk];
double Gxx = shellf[base + idx_gxx] + 1.0;
double Gxy = shellf[base + idx_gxy];
double Gxz = shellf[base + idx_gxz];
double Gyy = shellf[base + idx_gyy] + 1.0;
double Gyz = shellf[base + idx_gyz];
double Gzz = shellf[base + idx_gzz] + 1.0;
double axx = shellf[base + idx_axx];
double axy = shellf[base + idx_axy];
double axz = shellf[base + idx_axz];
double ayy = shellf[base + idx_ayy];
double ayz = shellf[base + idx_ayz];
double azz = shellf[base + idx_azz];
Chi = 1.0 / (1.0 + Chi);
const double Psi = Chi * sqrt(Chi);
#ifdef GaussInt
const double theta_weight = wtcostheta[i];
Mass_out += (shellf[base + idx_sfx] * nx_g[n] + shellf[base + idx_sfy] * ny_g[n] + shellf[base + idx_sfz] * nz_g[n]) * theta_weight;
#else
const double theta_weight = 1.0;
Mass_out += shellf[base + idx_sfx] * nx_g[n] + shellf[base + idx_sfy] * ny_g[n] + shellf[base + idx_sfz] * nz_g[n];
#endif
double detg = Gxx * Gyy * Gzz + Gxy * Gyz * Gxz + Gxz * Gxy * Gyz -
Gxz * Gyy * Gxz - Gxy * Gxy * Gzz - Gxx * Gyz * Gyz;
const double gupxx = (Gyy * Gzz - Gyz * Gyz) / detg;
const double gupxy = -(Gxy * Gzz - Gyz * Gxz) / detg;
const double gupxz = (Gxy * Gyz - Gyy * Gxz) / detg;
const double gupyy = (Gxx * Gzz - Gxz * Gxz) / detg;
const double gupyz = -(Gxx * Gyz - Gxy * Gxz) / detg;
const double gupzz = (Gxx * Gyy - Gxy * Gxy) / detg;
const double aupxx = gupxx * axx + gupxy * axy + gupxz * axz;
const double aupxy = gupxx * axy + gupxy * ayy + gupxz * ayz;
const double aupxz = gupxx * axz + gupxy * ayz + gupxz * azz;
const double aupyx = gupxy * axx + gupyy * axy + gupyz * axz;
const double aupyy = gupxy * axy + gupyy * ayy + gupyz * ayz;
const double aupyz = gupxy * axz + gupyy * ayz + gupyz * azz;
const double aupzx = gupxz * axx + gupyz * axy + gupzz * axz;
const double aupzy = gupxz * axy + gupyz * ayy + gupzz * ayz;
const double aupzz = gupxz * axz + gupyz * ayz + gupzz * azz;
if (Symmetry == 0)
{
ang_outx += f1o8 * Psi * (nx_g[n] * (pox[1][n] * aupxz - pox[2][n] * aupxy) + ny_g[n] * (pox[1][n] * aupyz - pox[2][n] * aupyy) + nz_g[n] * (pox[1][n] * aupzz - pox[2][n] * aupzy)) * theta_weight;
ang_outy += f1o8 * Psi * (nx_g[n] * (pox[2][n] * aupxx - pox[0][n] * aupxz) + ny_g[n] * (pox[2][n] * aupyx - pox[0][n] * aupyz) + nz_g[n] * (pox[2][n] * aupzx - pox[0][n] * aupzz)) * theta_weight;
ang_outz += f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * theta_weight;
}
else
{
ang_outz += f1o8 * Psi * (nx_g[n] * (pox[0][n] * aupxy - pox[1][n] * aupxx) + ny_g[n] * (pox[0][n] * aupyy - pox[1][n] * aupyx) + nz_g[n] * (pox[0][n] * aupzy - pox[1][n] * aupzx)) * theta_weight;
}
axx = Chi * (axx + Gxx * TRK / 3.0);
axy = Chi * (axy + Gxy * TRK / 3.0);
axz = Chi * (axz + Gxz * TRK / 3.0);
ayy = Chi * (ayy + Gyy * TRK / 3.0);
ayz = Chi * (ayz + Gyz * TRK / 3.0);
azz = Chi * (azz + Gzz * TRK / 3.0);
axx -= TRK;
ayy -= TRK;
azz -= TRK;
p_outx += f1o8 * Psi * (nx_g[n] * axx + ny_g[n] * axy + nz_g[n] * axz) * theta_weight;
p_outy += f1o8 * Psi * (nx_g[n] * axy + ny_g[n] * ayy + nz_g[n] * ayz) * theta_weight;
if (Symmetry == 0)
p_outz += f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz) * theta_weight;
}
for (int ii = 0; ii < NN; ii++)
{
#ifdef GaussInt
RP_out[ii] = RP_out[ii] * rex * dphi;
IP_out[ii] = IP_out[ii] * rex * dphi;
#else
RP_out[ii] = RP_out[ii] * rex * dphi * dcostheta;
IP_out[ii] = IP_out[ii] * rex * dphi * dcostheta;
#endif
}
double mass, px, py, pz, sx, sy, sz;
{
double *reduce_out = new double[2 * NN + 7];
double *reduce_in = new double[2 * NN + 7];
memcpy(reduce_out, RP_out, NN * sizeof(double));
memcpy(reduce_out + NN, IP_out, NN * sizeof(double));
reduce_out[2 * NN + 0] = Mass_out;
reduce_out[2 * NN + 1] = ang_outx;
reduce_out[2 * NN + 2] = ang_outy;
reduce_out[2 * NN + 3] = ang_outz;
reduce_out[2 * NN + 4] = p_outx;
reduce_out[2 * NN + 5] = p_outy;
reduce_out[2 * NN + 6] = p_outz;
MPI_Allreduce(reduce_out, reduce_in, 2 * NN + 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
memcpy(RP, reduce_in, NN * sizeof(double));
memcpy(IP, reduce_in + NN, NN * sizeof(double));
mass = reduce_in[2 * NN + 0];
sx = reduce_in[2 * NN + 1];
sy = reduce_in[2 * NN + 2];
sz = reduce_in[2 * NN + 3];
px = reduce_in[2 * NN + 4];
py = reduce_in[2 * NN + 5];
pz = reduce_in[2 * NN + 6];
delete[] reduce_out;
delete[] reduce_in;
}
#ifdef GaussInt
mass = mass * rex * rex * dphi * factor;
sx = sx * rex * rex * dphi * (1.0 / PI) * factor;
sy = sy * rex * rex * dphi * (1.0 / PI) * factor;
sz = sz * rex * rex * dphi * (1.0 / PI) * factor;
px = px * rex * rex * dphi * (1.0 / PI) * factor;
py = py * rex * rex * dphi * (1.0 / PI) * factor;
pz = pz * rex * rex * dphi * (1.0 / PI) * factor;
#else
mass = mass * rex * rex * dphi * dcostheta * factor;
sx = sx * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
sy = sy * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
sz = sz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
px = px * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
py = py * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
pz = pz * rex * rex * dphi * dcostheta * (1.0 / PI) * factor;
#endif
Rout[0] = mass;
Rout[1] = px;
Rout[2] = py;
Rout[3] = pz;
Rout[4] = sx;
Rout[5] = sy;
Rout[6] = sz;
delete[] pox[0];
delete[] pox[1];
delete[] pox[2];
delete[] shellf;
delete[] RP_out;
delete[] IP_out;
DG_List->clearList();
}
//|----------------------------------------------------------------
// do not discriminate box and shell
// for Gravitational wave specially symmetric case

View File

@@ -36,6 +36,11 @@ private:
double *nx_g, *ny_g, *nz_g; // global list of unit normals
int myrank, cpusize;
int wave_cache_spinw, wave_cache_maxl, wave_cache_modes;
double *wave_theta_pos, *wave_theta_neg;
double *wave_phi_cos, *wave_phi_sin;
void clear_wave_cache();
void build_wave_cache(int spinw, int maxl);
public:
surface_integral(int iSymmetry);
@@ -82,13 +87,29 @@ public:
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor);
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor);
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_WaveMassPAng(double rex, int lev, cgh *GH,
var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_WaveMassPAng(double rex, int lev, ShellPatch *GH,
var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_Wave(double rex, cgh *GH, ShellPatch *SH,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
@@ -115,7 +136,7 @@ public:
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
double *Rout, monitor *Monitor, MPI_Comm Comm_here);
double *Rout, monitor *Monitor, MPI_Comm Comm_here, bool refresh_mass_fields = true);
void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
int spinw, int maxl, int NN, double *RP, double *IP,
monitor *Monitor, MPI_Comm Comm_here);

View File

@@ -97,7 +97,9 @@ Here, we take the Ubuntu 22.04 system as an example
Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer.
The settings for the Ubuntu 22.04 system do not need to be modified.
The default configuration in this branch uses GNU compilers through the OpenMPI wrappers under `/usr/lib64/openmpi/bin`.
If your OpenMPI installation is in another location, update `OMPI_BIN` in `AMSS_NCKU_source/makefile.inc` or export `AMSS_OPENMPI_BIN` before running the Python launcher.
1. Enter the AMSS-NCKU Python code folder and modify the input.

View File

@@ -144,6 +144,62 @@ def generate_macrodef_h():
print( "#define REGLEV 0", file=file1 )
print( file=file1 )
# Define fine-grained timing/debug macros.
# All of them default to OFF so production builds do not pay profiling overhead.
fine_timing = getattr(input_data, "Fine_Timing",
getattr(input_data, "Finegrained_Timing", "no"))
kernel_fine_timing = getattr(input_data, "Kernel_Fine_Timing",
getattr(input_data, "BSSN_Kernel_Fine_Timing", "no"))
stdin_abort_poll = getattr(input_data, "Enable_Stdin_Abort_Poll",
getattr(input_data, "Stdin_Abort_Poll", "no"))
timing_report_every = max(1, int(getattr(
input_data, "Timing_Every_Steps",
getattr(input_data, "Timing_Report_Every", 1))))
timing_top_hotspots = max(1, int(getattr(
input_data, "Timing_Top_Hotspots", 8)))
if ( fine_timing == "yes" ):
print( "#define BSSN_FINE_TIMING 1", file=file1 )
print( file=file1 )
elif ( fine_timing == "no" ):
print( "#define BSSN_FINE_TIMING 0", file=file1 )
print( file=file1 )
else:
print( "Fine_Timing setting error!!!" )
print()
print( "# Fine_Timing setting error!!!", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_EVERY {timing_report_every}", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_TOPN {timing_top_hotspots}", file=file1 )
print( file=file1 )
if ( kernel_fine_timing == "yes" ):
print( "#define BSSN_KERNEL_FINE_TIMING 1", file=file1 )
print( file=file1 )
elif ( kernel_fine_timing == "no" ):
print( "#define BSSN_KERNEL_FINE_TIMING 0", file=file1 )
print( file=file1 )
else:
print( "Kernel_Fine_Timing setting error!!!" )
print()
print( "# Kernel_Fine_Timing setting error!!!", file=file1 )
print( file=file1 )
if ( stdin_abort_poll == "yes" ):
print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 1", file=file1 )
print( file=file1 )
elif ( stdin_abort_poll == "no" ):
print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 0", file=file1 )
print( file=file1 )
else:
print( "Enable_Stdin_Abort_Poll setting error!!!" )
print()
print( "# Enable_Stdin_Abort_Poll setting error!!!", file=file1 )
print( file=file1 )
# Define macro USE_GPU
# use GPU or not
@@ -224,6 +280,21 @@ def generate_macrodef_h():
print( "// 0: for every level;", file=file1 )
print( "// 1: for all", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING", file=file1 )
print( "// enable fine-grained per-timestep timing monitor", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_EVERY", file=file1 )
print( "// report timing every N coarse timesteps", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_TOPN", file=file1 )
print( "// number of hottest timing buckets shown in stdout", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_KERNEL_FINE_TIMING", file=file1 )
print( "// enable split timing inside compute_rhs_bssn", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_ENABLE_STDIN_ABORT_POLL", file=file1 )
print( "// poll stdin and broadcast abort flag every coarse step", file=file1 )
print( "//", file=file1 )
print( "// define USE_GPU", file=file1 )
print( "// use gpu or not", file=file1 )
print( "//", file=file1 )

View File

@@ -9,6 +9,7 @@
import AMSS_NCKU_Input as input_data
import os
import subprocess
import time
@@ -52,6 +53,8 @@ NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
## Build parallelism: match the number of bound cores
BUILD_JOBS = 64
OPENMPI_BIN = os.environ.get("AMSS_OPENMPI_BIN", "/usr/lib64/openmpi/bin")
MPI_RUNNER = os.path.join(OPENMPI_BIN, "mpirun")
##################################################################
@@ -147,11 +150,11 @@ def run_ABE():
## Define the command to run; cast other values to strings as needed
if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABE"
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output