Compare commits

...

11 Commits

21 changed files with 2807 additions and 1279 deletions

View File

@@ -37,51 +37,56 @@ close(77)
end program checkFFT end program checkFFT
#endif #endif
!------------- SUBROUTINE four1(dataa,nn,isign)
! Optimized FFT using Intel oneMKL DFTI implicit none
! Mathematical equivalence: Standard DFT definition INTEGER::isign,nn
! Forward (isign=1): X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N) double precision,dimension(2*nn)::dataa
! Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N) INTEGER::i,istep,j,m,mmax,n
! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...] double precision::tempi,tempr
!------------- DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
SUBROUTINE four1(dataa,nn,isign) n=2*nn
use MKL_DFTI j=1
implicit none do i=1,n,2
INTEGER, intent(in) :: isign, nn if(j.gt.i)then
DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa tempr=dataa(j)
tempi=dataa(j+1)
type(DFTI_DESCRIPTOR), pointer :: desc dataa(j)=dataa(i)
integer :: status dataa(j+1)=dataa(i+1)
dataa(i)=tempr
! Create DFTI descriptor for 1D complex-to-complex transform dataa(i+1)=tempi
status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn) endif
if (status /= 0) return m=nn
1 if ((m.ge.2).and.(j.gt.m)) then
! Set input/output storage as interleaved complex (default) j=j-m
status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE) m=m/2
if (status /= 0) then goto 1
status = DftiFreeDescriptor(desc) endif
return j=j+m
endif enddo
mmax=2
! Commit the descriptor 2 if (n.gt.mmax) then
status = DftiCommitDescriptor(desc) istep=2*mmax
if (status /= 0) then theta=6.28318530717959d0/(isign*mmax)
status = DftiFreeDescriptor(desc) wpr=-2.d0*sin(0.5d0*theta)**2
return wpi=sin(theta)
endif wr=1.d0
wi=0.d0
! Execute FFT based on direction do m=1,mmax,2
if (isign == 1) then do i=m,n,istep
! Forward FFT: exp(-2*pi*i*k*n/N) j=i+mmax
status = DftiComputeForward(desc, dataa) tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
else tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
! Backward FFT: exp(+2*pi*i*k*n/N) dataa(j)=dataa(i)-tempr
status = DftiComputeBackward(desc, dataa) dataa(j+1)=dataa(i+1)-tempi
endif dataa(i)=dataa(i)+tempr
dataa(i+1)=dataa(i+1)+tempi
! Free descriptor enddo
status = DftiFreeDescriptor(desc) wtemp=wr
wr=wr*wpr-wi*wpi+wr
return wi=wi*wpr+wtemp*wpi+wi
END SUBROUTINE four1 enddo
mmax=istep
goto 2
endif
return
END SUBROUTINE four1

View File

@@ -5253,10 +5253,10 @@ void Parallel::PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry)
delete[] transfer_src; delete[] transfer_src;
delete[] transfer_dst; delete[] transfer_dst;
} }
double Parallel::L2Norm(Patch *Pat, var *vf) double Parallel::L2Norm(Patch *Pat, var *vf)
{ {
int myrank; int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double tvf, dtvf = 0; double tvf, dtvf = 0;
int BDW = ghost_width; int BDW = ghost_width;
@@ -5281,13 +5281,48 @@ double Parallel::L2Norm(Patch *Pat, var *vf)
MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
tvf = sqrt(tvf); tvf = sqrt(tvf);
return tvf; return tvf;
} }
double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here) void Parallel::L2Norm7(Patch *Pat, var **vf, double *norms)
{ {
int myrank; int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double tvf[7], dtvf[7];
int BDW = ghost_width;
for (int i = 0; i < 7; i++)
dtvf[i] = 0;
MyList<Block> *BP = Pat->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
cg->fgfs[vf[6]->sgfn], tvf, BDW);
for (int i = 0; i < 7; i++)
dtvf[i] += tvf[i];
}
if (BP == Pat->ble)
break;
BP = BP->next;
}
MPI_Allreduce(dtvf, tvf, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
for (int i = 0; i < 7; i++)
norms[i] = sqrt(tvf[i]);
}
double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double tvf, dtvf = 0; double tvf, dtvf = 0;
int BDW = ghost_width; int BDW = ghost_width;
@@ -5312,12 +5347,47 @@ double Parallel::L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here)
MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, Comm_here); MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
tvf = sqrt(tvf); tvf = sqrt(tvf);
return tvf; return tvf;
} }
void Parallel::checkgsl(MyList<Parallel::gridseg> *pp, bool first_only) void Parallel::L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here)
{ {
int myrank = 0; int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double tvf[7], dtvf[7];
int BDW = ghost_width;
for (int i = 0; i < 7; i++)
dtvf[i] = 0;
MyList<Block> *BP = Pat->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pat->bbox[0], Pat->bbox[1], Pat->bbox[2],
Pat->bbox[3], Pat->bbox[4], Pat->bbox[5],
cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
cg->fgfs[vf[6]->sgfn], tvf, BDW);
for (int i = 0; i < 7; i++)
dtvf[i] += tvf[i];
}
if (BP == Pat->ble)
break;
BP = BP->next;
}
MPI_Allreduce(dtvf, tvf, 7, MPI_DOUBLE, MPI_SUM, Comm_here);
for (int i = 0; i < 7; i++)
norms[i] = sqrt(tvf[i]);
}
void Parallel::checkgsl(MyList<Parallel::gridseg> *pp, bool first_only)
{
int myrank = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
if (myrank == 0) if (myrank == 0)
{ {

View File

@@ -179,12 +179,13 @@ namespace Parallel
MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only); MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only);
MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue
MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat); MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat);
void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti, void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst); MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry); void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
double L2Norm(Patch *Pat, var *vf); double L2Norm(Patch *Pat, var *vf);
void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only); void L2Norm7(Patch *Pat, var **vf, double *norms);
void checkvarl(MyList<var> *pp, bool first_only); void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
void checkvarl(MyList<var> *pp, bool first_only);
MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat); MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat); MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat);
void prepare_inter_time_level(Patch *Pat, void prepare_inter_time_level(Patch *Pat,
@@ -216,11 +217,12 @@ namespace Parallel
void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape); void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape);
bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl); bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl);
void checkpatchlist(MyList<Patch> *PatL, bool buflog); void checkpatchlist(MyList<Patch> *PatL, bool buflog);
double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here); double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList, void L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here);
int NN, double **XX, bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
double *Shellf, int Symmetry, MPI_Comm Comm_here); int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here);
#if (PSTR == 1 || PSTR == 2 || PSTR == 3) #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi, MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0); bool periodic, int start_rank, int end_rank, int nodes = 0);

View File

@@ -3439,10 +3439,10 @@ void ShellPatch::write_Pablo_file_ss(int *ext, double xmin, double xmax, double
delete[] Z; delete[] Z;
} }
double ShellPatch::L2Norm(var *vf) double ShellPatch::L2Norm(var *vf)
{ {
double tvf, dtvf = 0; double tvf, dtvf = 0;
int BDW = overghost; int BDW = overghost;
MyList<ss_patch> *sPp = PatL; MyList<ss_patch> *sPp = PatL;
while (sPp) while (sPp)
@@ -3469,13 +3469,50 @@ double ShellPatch::L2Norm(var *vf)
MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
tvf = sqrt(tvf); tvf = sqrt(tvf);
return tvf; return tvf;
} }
void ShellPatch::L2Norm7(var **vf, double *norms)
// find maximum of abstract value, XX store position for maximum, Shellf store maximum themselvs {
void ShellPatch::Find_Maximum(MyList<var> *VarList, double *XX, double tvf[7], dtvf[7];
double *Shellf) int BDW = overghost;
for (int i = 0; i < 7; i++)
dtvf[i] = 0;
MyList<ss_patch> *sPp = PatL;
while (sPp)
{
MyList<Block> *Bp = sPp->data->blb;
while (Bp)
{
Block *cg = Bp->data;
if (myrank == cg->rank)
{
f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
sPp->data->bbox[0], sPp->data->bbox[1], sPp->data->bbox[2],
sPp->data->bbox[3], sPp->data->bbox[4], sPp->data->bbox[5],
cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
cg->fgfs[vf[6]->sgfn], tvf, BDW);
for (int i = 0; i < 7; i++)
dtvf[i] += tvf[i];
}
if (Bp == sPp->data->ble)
break;
Bp = Bp->next;
}
sPp = sPp->next;
}
MPI_Allreduce(dtvf, tvf, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
for (int i = 0; i < 7; i++)
norms[i] = sqrt(tvf[i]);
}
// find maximum of abstract value, XX store position for maximum, Shellf store maximum themselvs
void ShellPatch::Find_Maximum(MyList<var> *VarList, double *XX,
double *Shellf)
{ {
MyList<var> *varl; MyList<var> *varl;
int num_var = 0; int num_var = 0;

View File

@@ -195,10 +195,11 @@ public:
bool Interp_One_Point(MyList<var> *VarList, bool Interp_One_Point(MyList<var> *VarList,
double *XX, /*input global Cartesian coordinate*/ double *XX, /*input global Cartesian coordinate*/
double *Shellf, int Symmetry); double *Shellf, int Symmetry);
void write_Pablo_file_ss(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax, void write_Pablo_file_ss(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax,
char *filename, int sst); char *filename, int sst);
double L2Norm(var *vf); double L2Norm(var *vf);
void Find_Maximum(MyList<var> *VarList, double *XX, double *Shellf); void L2Norm7(var **vf, double *norms);
}; void Find_Maximum(MyList<var> *VarList, double *XX, double *Shellf);
};
#endif /* SHELLPATCH_H */ #endif /* SHELLPATCH_H */

View File

@@ -25,9 +25,23 @@ using namespace std;
#include <math.h> #include <math.h>
#include <complex.h> #include <complex.h>
#endif #endif
#include "TwoPunctures.h" #include "TwoPunctures.h"
#include <mkl_cblas.h>
extern "C" {
double cblas_ddot(const int, const double *, const int, const double *, const int);
double cblas_dnrm2(const int, const double *, const int);
void cblas_dgemm(const int, const int, const int,
const int, const int, const int,
const double, const double *, const int,
const double *, const int, const double,
double *, const int);
}
enum {
CblasRowMajor = 101,
CblasNoTrans = 111
};
TwoPunctures::TwoPunctures(double mp, double mm, double b, TwoPunctures::TwoPunctures(double mp, double mm, double b,
double P_plusx, double P_plusy, double P_plusz, double P_plusx, double P_plusy, double P_plusz,

File diff suppressed because it is too large Load Diff

View File

@@ -45,10 +45,11 @@ public:
int checkrun; int checkrun;
char checkfilename[50]; char checkfilename[50];
int Steps; int Steps;
double StartTime, TotalTime; double StartTime, TotalTime;
double AnasTime, DumpTime, d2DumpTime, CheckTime; double AnasTime, DumpTime, d2DumpTime, CheckTime;
double LastAnas, LastConsOut; double LastAnas, LastConsOut;
double Courant; int *ConstraintRefreshLevels;
double Courant;
double numepss, numepsb, numepsh; double numepss, numepsb, numepsh;
int Symmetry; int Symmetry;
int maxl, decn; int maxl, decn;
@@ -133,9 +134,9 @@ public:
Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor; monitor *ConVMonitor, *TimingMonitor;
surface_integral *Waveshell; surface_integral *Waveshell;
checkpoint *CheckPoint; checkpoint *CheckPoint;
public: public:

View File

@@ -22,19 +22,32 @@
#define f_compute_rhs_Z4c_ss COMPUTE_RHS_Z4C_SS #define f_compute_rhs_Z4c_ss COMPUTE_RHS_Z4C_SS
#define f_compute_constraint_fr COMPUTE_CONSTRAINT_FR #define f_compute_constraint_fr COMPUTE_CONSTRAINT_FR
#endif #endif
#ifdef fortran3 #ifdef fortran3
#define f_compute_rhs_bssn compute_rhs_bssn_ #define f_compute_rhs_bssn compute_rhs_bssn_
#define f_compute_rhs_bssn_ss compute_rhs_bssn_ss_ #define f_compute_rhs_bssn_ss compute_rhs_bssn_ss_
#define f_compute_rhs_bssn_escalar compute_rhs_bssn_escalar_ #define f_compute_rhs_bssn_escalar compute_rhs_bssn_escalar_
#define f_compute_rhs_bssn_escalar_ss compute_rhs_bssn_escalar_ss_ #define f_compute_rhs_bssn_escalar_ss compute_rhs_bssn_escalar_ss_
#define f_compute_rhs_Z4c compute_rhs_z4c_ #define f_compute_rhs_Z4c compute_rhs_z4c_
#define f_compute_rhs_Z4cnot compute_rhs_z4cnot_ #define f_compute_rhs_Z4cnot compute_rhs_z4cnot_
#define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_ #define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_
#define f_compute_constraint_fr compute_constraint_fr_ #define f_compute_constraint_fr compute_constraint_fr_
#endif #endif
extern "C"
{ #ifdef __cplusplus
int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z extern "C"
{
#endif
void f_bssn_rhs_kernel_timing_reset();
int f_bssn_rhs_kernel_timing_bucket_count();
const double *f_bssn_rhs_kernel_timing_local_seconds();
const char *f_bssn_rhs_kernel_timing_label(int);
#ifdef __cplusplus
}
#endif
extern "C"
{
int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z
double *, double *, // chi, trK double *, double *, // chi, trK
double *, double *, double *, double *, double *, double *, // gij double *, double *, double *, double *, double *, double *, // gij
double *, double *, double *, double *, double *, double *, // Aij double *, double *, double *, double *, double *, double *, // Aij

View File

@@ -2,12 +2,88 @@
#include "bssn_rhs.h" #include "bssn_rhs.h"
#include "share_func.h" #include "share_func.h"
#include "tool.h" #include "tool.h"
#include <time.h>
// 0-based i,j,k // 0-based i,j,k
// #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny)) // #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny))
// ex(1)=nx, ex(2)=ny, ex(3)=nz // ex(1)=nx, ex(2)=ny, ex(3)=nz
// 用法a[ IDX_F(i,j,k,nx,ny) ] // 用法a[ IDX_F(i,j,k,nx,ny) ]
#ifndef BSSN_KERNEL_FINE_TIMING
#define BSSN_KERNEL_FINE_TIMING 0
#endif
#if BSSN_KERNEL_FINE_TIMING
namespace rhs_kernel_timing
{
enum Bucket
{
KB_SETUP_DERIVS = 0,
KB_GEOM_GAMMA,
KB_RICCI_METRIC,
KB_CHI_LAPSE,
KB_AIJ_TRK_GAUGE,
KB_KO_CONSTRAINT,
KB_COUNT
};
static double local_bucket_seconds[KB_COUNT];
static const char *bucket_labels[KB_COUNT] =
{
"setup_derivs",
"geom_gamma",
"ricci_metric",
"chi_lapse",
"aij_trk_gauge",
"ko_constraint"
};
static inline double now_seconds()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return double(ts.tv_sec) + 1.0e-9 * double(ts.tv_nsec);
}
}
extern "C" void f_bssn_rhs_kernel_timing_reset()
{
for (int i = 0; i < rhs_kernel_timing::KB_COUNT; ++i)
rhs_kernel_timing::local_bucket_seconds[i] = 0.0;
}
extern "C" int f_bssn_rhs_kernel_timing_bucket_count()
{
return rhs_kernel_timing::KB_COUNT;
}
extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds()
{
return rhs_kernel_timing::local_bucket_seconds;
}
extern "C" const char *f_bssn_rhs_kernel_timing_label(int bucket_index)
{
if (bucket_index < 0 || bucket_index >= rhs_kernel_timing::KB_COUNT)
return "unknown";
return rhs_kernel_timing::bucket_labels[bucket_index];
}
#define RHS_KERNEL_TIMER_DECL(var_name) const double var_name = rhs_kernel_timing::now_seconds()
#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name) \
rhs_kernel_timing::local_bucket_seconds[int(rhs_kernel_timing::bucket_name)] += \
rhs_kernel_timing::now_seconds() - (var_name)
#else
extern "C" void f_bssn_rhs_kernel_timing_reset() {}
extern "C" int f_bssn_rhs_kernel_timing_bucket_count() { return 0; }
extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds() { return 0; }
extern "C" const char *f_bssn_rhs_kernel_timing_label(int) { return "disabled"; }
#define RHS_KERNEL_TIMER_DECL(var_name)
#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name)
#endif
// C function that calculates the right-hand side for BSSN equations // C function that calculates the right-hand side for BSSN equations
int f_compute_rhs_bssn(int *ex, double &T, int f_compute_rhs_bssn(int *ex, double &T,
double *X, double *Y, double *Z, double *X, double *Y, double *Z,
@@ -102,6 +178,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
dY = Y[1] - Y[0]; dY = Y[1] - Y[0];
dZ = Z[1] - Z[0]; dZ = Z[1] - Z[0];
RHS_KERNEL_TIMER_DECL(timer_setup_derivs);
// 1ms // // 1ms //
for(int i=0;i<all;i+=1){ for(int i=0;i<all;i+=1){
alpn1[i] = Lap[i] + 1.0; alpn1[i] = Lap[i] + 1.0;
@@ -141,6 +218,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
(dxx[i] + ONE) * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i] (dxx[i] + ONE) * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i]
+ (dzz[i] + ONE) * betazx[i] - gxz[i] * betayy[i]; + (dzz[i] + ONE) * betazx[i] - gxz[i] * betayy[i];
} }
RHS_KERNEL_TIMER_ADD(KB_SETUP_DERIVS, timer_setup_derivs);
RHS_KERNEL_TIMER_DECL(timer_geom_gamma);
// Fused: inverse metric + Gamma constraint + Christoffel (3 loops -> 1) // Fused: inverse metric + Gamma constraint + Christoffel (3 loops -> 1)
for(int i=0;i<all;i+=1){ for(int i=0;i<all;i+=1){
double det = (dxx[i] + ONE) * (dyy[i] + ONE) * (dzz[i] + ONE) + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] - double det = (dxx[i] + ONE) * (dyy[i] + ONE) * (dzz[i] + ONE) + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] -
@@ -283,9 +362,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ ( gupxy[i]*gupyz[i] + gupyy[i]*gupxz[i] ) * Axy[i] + ( gupxy[i]*gupyz[i] + gupyy[i]*gupxz[i] ) * Axy[i]
+ ( gupxy[i]*gupzz[i] + gupyz[i]*gupxz[i] ) * Axz[i] + ( gupxy[i]*gupzz[i] + gupyz[i]*gupxz[i] ) * Axz[i]
+ ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i]; + ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i];
Rxx[i] = axx; Ryy[i] = ayy; Rzz[i] = azz;
Rxy[i] = axy; Rxz[i] = axz; Ryz[i] = ayz;
Gamx_rhs[i] = - TWO * ( Lapx[i]*axx + Lapy[i]*axy + Lapz[i]*axz ) + Gamx_rhs[i] = - TWO * ( Lapx[i]*axx + Lapy[i]*axy + Lapz[i]*axz ) +
TWO * alpn1[i] * ( TWO * alpn1[i] * (
-F3o2/chin1[i] * ( chix[i]*axx + chiy[i]*axy + chiz[i]*axz ) - -F3o2/chin1[i] * ( chix[i]*axx + chiy[i]*axy + chiz[i]*axz ) -
@@ -315,6 +391,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ TWO * ( Gamzxy[i]*axy + Gamzxz[i]*axz + Gamzyz[i]*ayz ) + TWO * ( Gamzxy[i]*axy + Gamzxz[i]*axz + Gamzyz[i]*ayz )
); );
} }
RHS_KERNEL_TIMER_ADD(KB_GEOM_GAMMA, timer_geom_gamma);
RHS_KERNEL_TIMER_DECL(timer_ricci_metric);
// 22.3ms // // 22.3ms //
fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx, fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev); X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev);
@@ -332,7 +410,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
double lfxx = gxxx[i] + gxyy[i] + gxzz[i]; double lfxx = gxxx[i] + gxyy[i] + gxzz[i];
double lfxy = gxyx[i] + gyyy[i] + gyzz[i]; double lfxy = gxyx[i] + gyyy[i] + gyzz[i];
double lfxz = gxzx[i] + gyzy[i] + gzzz[i]; double lfxz = gxzx[i] + gyzy[i] + gzzz[i];
fxx[i] = lfxx; fxy[i] = lfxy; fxz[i] = lfxz;
double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i] double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
+ TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] ); + TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] );
@@ -686,69 +763,74 @@ int f_compute_rhs_bssn(int *ex, double &T,
+ Gamxyz[i] * gzzx[i] + Gamyyz[i] * gzzy[i] + Gamzyz[i] * gzzz[i] + Gamxyz[i] * gzzx[i] + Gamyyz[i] * gzzy[i] + Gamzyz[i] * gzzz[i]
); );
} }
RHS_KERNEL_TIMER_ADD(KB_RICCI_METRIC, timer_ricci_metric);
RHS_KERNEL_TIMER_DECL(timer_chi_lapse);
// 22.3ms // // 22.3ms //
fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
// 7ms // // 7ms //
for (int i=0;i<all;i+=1) { for (int i=0;i<all;i+=1) {
fxx[i] = fxx[i] - Gamxxx[i] * chix[i] - Gamyxx[i] * chiy[i] - Gamzxx[i] * chiz[i]; const double inv_chin1 = ONE / chin1[i];
fxy[i] = fxy[i] - Gamxxy[i] * chix[i] - Gamyxy[i] * chiy[i] - Gamzxy[i] * chiz[i]; const double half_inv_chin1 = HALF * inv_chin1;
fxz[i] = fxz[i] - Gamxxz[i] * chix[i] - Gamyxz[i] * chiy[i] - Gamzxz[i] * chiz[i]; const double scaled_inv = F3o2 * inv_chin1;
fyy[i] = fyy[i] - Gamxyy[i] * chix[i] - Gamyyy[i] * chiy[i] - Gamzyy[i] * chiz[i]; const double cxx = fxx[i] - Gamxxx[i] * chix[i] - Gamyxx[i] * chiy[i] - Gamzxx[i] * chiz[i];
fyz[i] = fyz[i] - Gamxyz[i] * chix[i] - Gamyyz[i] * chiy[i] - Gamzyz[i] * chiz[i]; const double cxy = fxy[i] - Gamxxy[i] * chix[i] - Gamyxy[i] * chiy[i] - Gamzxy[i] * chiz[i];
fzz[i] = fzz[i] - Gamxzz[i] * chix[i] - Gamyzz[i] * chiy[i] - Gamzzz[i] * chiz[i]; const double cxz = fxz[i] - Gamxxz[i] * chix[i] - Gamyxz[i] * chiy[i] - Gamzxz[i] * chiz[i];
f[i] = const double cyy = fyy[i] - Gamxyy[i] * chix[i] - Gamyyy[i] * chiy[i] - Gamzyy[i] * chiz[i];
gupxx[i] * (fxx[i] - (F3o2 / chin1[i]) * chix[i] * chix[i]) const double cyz = fyz[i] - Gamxyz[i] * chix[i] - Gamyyz[i] * chiy[i] - Gamzyz[i] * chiz[i];
+ gupyy[i] * (fyy[i] - (F3o2 / chin1[i]) * chiy[i] * chiy[i]) const double czz = fzz[i] - Gamxzz[i] * chix[i] - Gamyzz[i] * chiy[i] - Gamzzz[i] * chiz[i];
+ gupzz[i] * (fzz[i] - (F3o2 / chin1[i]) * chiz[i] * chiz[i]) const double ricci_chi =
+ TWO * gupxy[i] * (fxy[i] - (F3o2 / chin1[i]) * chix[i] * chiy[i]) gupxx[i] * (cxx - scaled_inv * chix[i] * chix[i])
+ TWO * gupxz[i] * (fxz[i] - (F3o2 / chin1[i]) * chix[i] * chiz[i]) + gupyy[i] * (cyy - scaled_inv * chiy[i] * chiy[i])
+ TWO * gupyz[i] * (fyz[i] - (F3o2 / chin1[i]) * chiy[i] * chiz[i]); + gupzz[i] * (czz - scaled_inv * chiz[i] * chiz[i])
Rxx[i] = Rxx[i] + ( fxx[i] - (chix[i] * chix[i]) / (chin1[i] * TWO) + (dxx[i] + ONE) * f[i] ) / (chin1[i] * TWO); + TWO * gupxy[i] * (cxy - scaled_inv * chix[i] * chiy[i])
Ryy[i] = Ryy[i] + ( fyy[i] - (chiy[i] * chiy[i]) / (chin1[i] * TWO) + (dyy[i] + ONE) * f[i] ) / (chin1[i] * TWO); + TWO * gupxz[i] * (cxz - scaled_inv * chix[i] * chiz[i])
Rzz[i] = Rzz[i] + ( fzz[i] - (chiz[i] * chiz[i]) / (chin1[i] * TWO) + (dzz[i] + ONE) * f[i] ) / (chin1[i] * TWO); + TWO * gupyz[i] * (cyz - scaled_inv * chiy[i] * chiz[i]);
f[i] = ricci_chi;
Rxx[i] = Rxx[i] + ( cxx - half_inv_chin1 * chix[i] * chix[i] + (dxx[i] + ONE) * ricci_chi ) * half_inv_chin1;
Ryy[i] = Ryy[i] + ( cyy - half_inv_chin1 * chiy[i] * chiy[i] + (dyy[i] + ONE) * ricci_chi ) * half_inv_chin1;
Rzz[i] = Rzz[i] + ( czz - half_inv_chin1 * chiz[i] * chiz[i] + (dzz[i] + ONE) * ricci_chi ) * half_inv_chin1;
Rxy[i] = Rxy[i] + ( fxy[i] - (chix[i] * chiy[i]) / (chin1[i] * TWO) + gxy[i] * f[i] ) / (chin1[i] * TWO); Rxy[i] = Rxy[i] + ( cxy - half_inv_chin1 * chix[i] * chiy[i] + gxy[i] * ricci_chi ) * half_inv_chin1;
Rxz[i] = Rxz[i] + ( fxz[i] - (chix[i] * chiz[i]) / (chin1[i] * TWO) + gxz[i] * f[i] ) / (chin1[i] * TWO); Rxz[i] = Rxz[i] + ( cxz - half_inv_chin1 * chix[i] * chiz[i] + gxz[i] * ricci_chi ) * half_inv_chin1;
Ryz[i] = Ryz[i] + ( fyz[i] - (chiy[i] * chiz[i]) / (chin1[i] * TWO) + gyz[i] * f[i] ) / (chin1[i] * TWO); Ryz[i] = Ryz[i] + ( cyz - half_inv_chin1 * chiy[i] * chiz[i] + gyz[i] * ricci_chi ) * half_inv_chin1;
} }
// 24ms // // 24ms //
fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev); fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
// 6ms // // 6ms //
for (int i=0;i<all;i+=1) { for (int i=0;i<all;i+=1) {
/* gxxx,gxxy,gxxz (这里是“升指标后的chi导数/chi”那类量你沿用原变量名即可) */ const double inv_chin1 = ONE / chin1[i];
gxxx[i] = (gupxx[i] * chix[i] + gupxy[i] * chiy[i] + gupxz[i] * chiz[i]) / chin1[i]; const double gchi_x = (gupxx[i] * chix[i] + gupxy[i] * chiy[i] + gupxz[i] * chiz[i]) * inv_chin1;
gxxy[i] = (gupxy[i] * chix[i] + gupyy[i] * chiy[i] + gupyz[i] * chiz[i]) / chin1[i]; const double gchi_y = (gupxy[i] * chix[i] + gupyy[i] * chiy[i] + gupyz[i] * chiz[i]) * inv_chin1;
gxxz[i] = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) / chin1[i]; const double gchi_z = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) * inv_chin1;
/* Christoffel 修正项 */ /* Christoffel 修正项 */
Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) / chin1[i]) - (dxx[i] + ONE) * gxxx[i] ) * HALF; Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) * inv_chin1) - (dxx[i] + ONE) * gchi_x ) * HALF;
Gamyxx[i] = Gamyxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxy[i] ) * HALF; /* 原式只有 -gxx*gxxy */ Gamyxx[i] = Gamyxx[i] - ( 0.0 - (dxx[i] + ONE) * gchi_y ) * HALF; /* 原式只有 -gxx*gxxy */
Gamzxx[i] = Gamzxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxz[i] ) * HALF; Gamzxx[i] = Gamzxx[i] - ( 0.0 - (dxx[i] + ONE) * gchi_z ) * HALF;
Gamxyy[i] = Gamxyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxx[i] ) * HALF; Gamxyy[i] = Gamxyy[i] - ( 0.0 - (dyy[i] + ONE) * gchi_x ) * HALF;
Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) / chin1[i]) - (dyy[i] + ONE) * gxxy[i] ) * HALF; Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) * inv_chin1) - (dyy[i] + ONE) * gchi_y ) * HALF;
Gamzyy[i] = Gamzyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxz[i] ) * HALF; Gamzyy[i] = Gamzyy[i] - ( 0.0 - (dyy[i] + ONE) * gchi_z ) * HALF;
Gamxzz[i] = Gamxzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxx[i] ) * HALF; Gamxzz[i] = Gamxzz[i] - ( 0.0 - (dzz[i] + ONE) * gchi_x ) * HALF;
Gamyzz[i] = Gamyzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxy[i] ) * HALF; Gamyzz[i] = Gamyzz[i] - ( 0.0 - (dzz[i] + ONE) * gchi_y ) * HALF;
Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) / chin1[i]) - (dzz[i] + ONE) * gxxz[i] ) * HALF; Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) * inv_chin1) - (dzz[i] + ONE) * gchi_z ) * HALF;
Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] / chin1[i]) - gxy[i] * gxxx[i] ) * HALF; Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] * inv_chin1) - gxy[i] * gchi_x ) * HALF;
Gamyxy[i] = Gamyxy[i] - ( ( chix[i] / chin1[i]) - gxy[i] * gxxy[i] ) * HALF; Gamyxy[i] = Gamyxy[i] - ( ( chix[i] * inv_chin1) - gxy[i] * gchi_y ) * HALF;
Gamzxy[i] = Gamzxy[i] - ( 0.0 - gxy[i] * gxxz[i] ) * HALF; Gamzxy[i] = Gamzxy[i] - ( 0.0 - gxy[i] * gchi_z ) * HALF;
Gamxxz[i] = Gamxxz[i] - ( ( chiz[i] / chin1[i]) - gxz[i] * gxxx[i] ) * HALF; Gamxxz[i] = Gamxxz[i] - ( ( chiz[i] * inv_chin1) - gxz[i] * gchi_x ) * HALF;
Gamyxz[i] = Gamyxz[i] - ( 0.0 - gxz[i] * gxxy[i] ) * HALF; Gamyxz[i] = Gamyxz[i] - ( 0.0 - gxz[i] * gchi_y ) * HALF;
Gamzxz[i] = Gamzxz[i] - ( ( chix[i] / chin1[i]) - gxz[i] * gxxz[i] ) * HALF; Gamzxz[i] = Gamzxz[i] - ( ( chix[i] * inv_chin1) - gxz[i] * gchi_z ) * HALF;
Gamxyz[i] = Gamxyz[i] - ( 0.0 - gyz[i] * gxxx[i] ) * HALF; Gamxyz[i] = Gamxyz[i] - ( 0.0 - gyz[i] * gchi_x ) * HALF;
Gamyyz[i] = Gamyyz[i] - ( ( chiz[i] / chin1[i]) - gyz[i] * gxxy[i] ) * HALF; Gamyyz[i] = Gamyyz[i] - ( ( chiz[i] * inv_chin1) - gyz[i] * gchi_y ) * HALF;
Gamzyz[i] = Gamzyz[i] - ( ( chiy[i] / chin1[i]) - gyz[i] * gxxz[i] ) * HALF; Gamzyz[i] = Gamzyz[i] - ( ( chiy[i] * inv_chin1) - gyz[i] * gchi_z ) * HALF;
/* fxx..fyz 修正:减去 Γ * ∂Lap */ /* fxx..fyz 修正:减去 Γ * ∂Lap */
fxx[i] = fxx[i] - Gamxxx[i] * Lapx[i] - Gamyxx[i] * Lapy[i] - Gamzxx[i] * Lapz[i]; fxx[i] = fxx[i] - Gamxxx[i] * Lapx[i] - Gamyxx[i] * Lapy[i] - Gamzxx[i] * Lapz[i];
@@ -762,6 +844,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i] trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
+ TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] ); + TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] );
} }
RHS_KERNEL_TIMER_ADD(KB_CHI_LAPSE, timer_chi_lapse);
RHS_KERNEL_TIMER_DECL(timer_aij_trk_gauge);
// 2.5ms // // 2.5ms //
for (int i=0;i<all;i+=1) { for (int i=0;i<all;i+=1) {
const double divb = betaxx[i] + betayy[i] + betazz[i]; const double divb = betaxx[i] + betayy[i] + betazz[i];
@@ -1062,6 +1146,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i]; dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
#endif #endif
} }
RHS_KERNEL_TIMER_ADD(KB_AIJ_TRK_GAUGE, timer_aij_trk_gauge);
RHS_KERNEL_TIMER_DECL(timer_ko_constraint);
// advection + KO dissipation with shared symmetry buffer // advection + KO dissipation with shared symmetry buffer
lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps); lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps); lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
@@ -1193,6 +1279,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i]; movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
} }
} }
RHS_KERNEL_TIMER_ADD(KB_KO_CONSTRAINT, timer_ko_constraint);

View File

@@ -1511,13 +1511,88 @@ deallocate(f_flat)
f_out = f_out*dX*dY*dZ f_out = f_out*dX*dY*dZ
return return
end subroutine l2normhelper end subroutine l2normhelper
!-------------------------------------------------------------------------------------- !--------------------------------------------------------------------------------------
! calculate L2norm especially for shell Blocks subroutine l2normhelper7(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,& f1,f2,f3,f4,f5,f6,f7,f_out,gw)
f,f_out,gw,ogw,Symmetry)
implicit none
!~~~~~~> Input parameters:
integer,intent(in ):: ex(1:3)
real*8, intent(in ):: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3)),xmin,ymin,zmin,xmax,ymax,zmax
integer,intent(in)::gw
real*8, dimension(ex(1),ex(2),ex(3)),intent(in) :: f1,f2,f3,f4,f5,f6,f7
real*8, intent(out) :: f_out(7)
!~~~~~~> Other variables:
real*8 :: dX, dY, dZ
integer::imin,jmin,kmin
integer::imax,jmax,kmax
integer::i,j,k
real*8 :: s1,s2,s3,s4,s5,s6,s7
dX = X(2) - X(1)
dY = Y(2) - Y(1)
dZ = Z(2) - Z(1)
! for ghost zone
imin = gw+1
jmin = gw+1
kmin = gw+1
imax = ex(1) - gw
jmax = ex(2) - gw
kmax = ex(3) - gw
!for patch boundary (i.e., not ghost boundary)
if(dabs(X(ex(1))-xmax) < dX) imax = ex(1)
if(dabs(Y(ex(2))-ymax) < dY) jmax = ex(2)
if(dabs(Z(ex(3))-zmax) < dZ) kmax = ex(3)
if(dabs(X(1)-xmin) < dX) imin = 1
if(dabs(Y(1)-ymin) < dY) jmin = 1
if(dabs(Z(1)-zmin) < dZ) kmin = 1
s1 = 0.d0
s2 = 0.d0
s3 = 0.d0
s4 = 0.d0
s5 = 0.d0
s6 = 0.d0
s7 = 0.d0
do k=kmin,kmax
do j=jmin,jmax
!DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7)
do i=imin,imax
s1 = s1 + f1(i,j,k)*f1(i,j,k)
s2 = s2 + f2(i,j,k)*f2(i,j,k)
s3 = s3 + f3(i,j,k)*f3(i,j,k)
s4 = s4 + f4(i,j,k)*f4(i,j,k)
s5 = s5 + f5(i,j,k)*f5(i,j,k)
s6 = s6 + f6(i,j,k)*f6(i,j,k)
s7 = s7 + f7(i,j,k)*f7(i,j,k)
enddo
enddo
enddo
f_out(1) = s1*dX*dY*dZ
f_out(2) = s2*dX*dY*dZ
f_out(3) = s3*dX*dY*dZ
f_out(4) = s4*dX*dY*dZ
f_out(5) = s5*dX*dY*dZ
f_out(6) = s6*dX*dY*dZ
f_out(7) = s7*dX*dY*dZ
return
end subroutine l2normhelper7
!--------------------------------------------------------------------------------------
! calculate L2norm especially for shell Blocks
subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
f,f_out,gw,ogw,Symmetry)
implicit none implicit none
!~~~~~~> Input parameters: !~~~~~~> Input parameters:

View File

@@ -12,9 +12,10 @@
#define f_global_interpind global_interpind #define f_global_interpind global_interpind
#define f_global_interpind2d global_interpind2d #define f_global_interpind2d global_interpind2d
#define f_global_interpind1d global_interpind1d #define f_global_interpind1d global_interpind1d
#define f_l2normhelper l2normhelper #define f_l2normhelper l2normhelper
#define f_l2normhelper_sh l2normhelper_sh #define f_l2normhelper7 l2normhelper7
#define f_l2normhelper_sh_rms l2normhelper_sh_rms #define f_l2normhelper_sh l2normhelper_sh
#define f_l2normhelper_sh_rms l2normhelper_sh_rms
#define f_average average #define f_average average
#define f_average3 average3 #define f_average3 average3
#define f_average2 average2 #define f_average2 average2
@@ -41,9 +42,10 @@
#define f_global_interpind GLOBAL_INTERPIND #define f_global_interpind GLOBAL_INTERPIND
#define f_global_interpind2d GLOBAL_INTERPIND2D #define f_global_interpind2d GLOBAL_INTERPIND2D
#define f_global_interpind1d GLOBAL_INTERPIND1D #define f_global_interpind1d GLOBAL_INTERPIND1D
#define f_l2normhelper L2NORMHELPER #define f_l2normhelper L2NORMHELPER
#define f_l2normhelper_sh L2NORMHELPER_SH #define f_l2normhelper7 L2NORMHELPER7
#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS #define f_l2normhelper_sh L2NORMHELPER_SH
#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
#define f_average AVERAGE #define f_average AVERAGE
#define f_average3 AVERAGE3 #define f_average3 AVERAGE3
#define f_average2 AVERAGE2 #define f_average2 AVERAGE2
@@ -70,9 +72,10 @@
#define f_global_interpind global_interpind_ #define f_global_interpind global_interpind_
#define f_global_interpind2d global_interpind2d_ #define f_global_interpind2d global_interpind2d_
#define f_global_interpind1d global_interpind1d_ #define f_global_interpind1d global_interpind1d_
#define f_l2normhelper l2normhelper_ #define f_l2normhelper l2normhelper_
#define f_l2normhelper_sh l2normhelper_sh_ #define f_l2normhelper7 l2normhelper7_
#define f_l2normhelper_sh_rms l2normhelper_sh_rms_ #define f_l2normhelper_sh l2normhelper_sh_
#define f_l2normhelper_sh_rms l2normhelper_sh_rms_
#define f_average average_ #define f_average average_
#define f_average3 average3_ #define f_average3 average3_
#define f_average2 average2_ #define f_average2 average2_
@@ -156,20 +159,29 @@ extern "C"
int *, double *, int &, int &); int *, double *, int &, int &);
} }
extern "C" extern "C"
{ {
void f_l2normhelper(int *, double *, double *, double *, void f_l2normhelper(int *, double *, double *, double *,
double &, double &, double &, double &, double &, double &,
double &, double &, double &, double &, double &, double &,
double *, double &, int &); double *, double &, int &);
} }
extern "C" extern "C"
{ {
void f_l2normhelper_sh(int *, double *, double *, double *, void f_l2normhelper7(int *, double *, double *, double *,
double &, double &, double &, double &, double &, double &,
double &, double &, double &, double &, double &, double &,
double *, double &, int &, int &, int &); double *, double *, double *, double *,
double *, double *, double *, double *, int &);
}
extern "C"
{
void f_l2normhelper_sh(int *, double *, double *, double *,
double &, double &, double &,
double &, double &, double &,
double *, double &, int &, int &, int &);
} }
extern "C" extern "C"

View File

@@ -17,68 +17,106 @@ using namespace std;
#include <math.h> #include <math.h>
#endif #endif
// Intel oneMKL LAPACK interface /* Linear equation solution by Gauss-Jordan elimination.
#include <mkl_lapacke.h> a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
/* Linear equation solution using Intel oneMKL LAPACK. containing the right-hand side vectors. On output a is
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input replaced by its matrix inverse, and b is replaced by the
containing the right-hand side vectors. On output a is corresponding set of solution vectors. */
replaced by its matrix inverse, and b is replaced by the
corresponding set of solution vectors. int gaussj(double *a, double *b, int n)
{
Mathematical equivalence: double swap;
Solves: A * x = b => x = A^(-1) * b
Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results int *indxc, *indxr, *ipiv;
within numerical precision. */ indxc = new int[n];
indxr = new int[n];
int gaussj(double *a, double *b, int n) ipiv = new int[n];
{
// Allocate pivot array and workspace int i, icol, irow, j, k, l, ll;
lapack_int *ipiv = new lapack_int[n]; double big, dum, pivinv;
lapack_int info;
for (j = 0; j < n; j++)
// Make a copy of matrix a for solving (dgesv modifies it to LU form) ipiv[j] = 0;
double *a_copy = new double[n * n]; for (i = 0; i < n; i++)
for (int i = 0; i < n * n; i++) { {
a_copy[i] = a[i]; big = 0.0;
} for (j = 0; j < n; j++)
if (ipiv[j] != 1)
// Step 1: Solve linear system A*x = b using LU decomposition for (k = 0; k < n; k++)
// LAPACKE_dgesv uses column-major by default, but we use row-major {
info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1); if (ipiv[k] == 0)
{
if (info != 0) { if (fabs(a[j * n + k]) >= big)
cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl; {
delete[] ipiv; big = fabs(a[j * n + k]);
delete[] a_copy; irow = j;
return 1; icol = k;
} }
}
// Step 2: Compute matrix inverse A^(-1) using LU factorization else if (ipiv[k] > 1)
// First do LU factorization of original matrix a {
info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv); cout << "gaussj: Singular Matrix-1" << endl;
return 1;
if (info != 0) { }
cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl; }
delete[] ipiv;
delete[] a_copy; ipiv[icol] = ipiv[icol] + 1;
return 1; if (irow != icol)
} {
for (l = 0; l < n; l++)
// Then compute inverse from LU factorization {
info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv); swap = a[irow * n + l];
a[irow * n + l] = a[icol * n + l];
if (info != 0) { a[icol * n + l] = swap;
cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl; }
delete[] ipiv;
delete[] a_copy; swap = b[irow];
return 1; b[irow] = b[icol];
} b[icol] = swap;
}
delete[] ipiv;
delete[] a_copy; indxr[i] = irow;
indxc[i] = icol;
return 0;
} if (a[icol * n + icol] == 0.0)
{
cout << "gaussj: Singular Matrix-2" << endl;
return 1;
}
pivinv = 1.0 / a[icol * n + icol];
a[icol * n + icol] = 1.0;
for (l = 0; l < n; l++)
a[icol * n + l] *= pivinv;
b[icol] *= pivinv;
for (ll = 0; ll < n; ll++)
if (ll != icol)
{
dum = a[ll * n + icol];
a[ll * n + icol] = 0.0;
for (l = 0; l < n; l++)
a[ll * n + l] -= a[icol * n + l] * dum;
b[ll] -= b[icol] * dum;
}
}
for (l = n - 1; l >= 0; l--)
{
if (indxr[l] != indxc[l])
for (k = 0; k < n; k++)
{
swap = a[k * n + indxr[l]];
a[k * n + indxr[l]] = a[k * n + indxc[l]];
a[k * n + indxc[l]] = swap;
}
}
delete[] indxc;
delete[] indxr;
delete[] ipiv;
return 0;
}
// for check usage // for check usage
/* /*
int main() int main()

View File

@@ -29,6 +29,16 @@
#define REGLEV 0 #define REGLEV 0
#define BSSN_FINE_TIMING 0
#define BSSN_FINE_TIMING_EVERY 1
#define BSSN_FINE_TIMING_TOPN 8
#define BSSN_KERNEL_FINE_TIMING 0
#define BSSN_ENABLE_STDIN_ABORT_POLL 0
//#define USE_GPU //#define USE_GPU
//#define CHECKDETAIL //#define CHECKDETAIL
@@ -88,6 +98,21 @@
// 0: for every level; // 0: for every level;
// 1: for all // 1: for all
// //
// define BSSN_FINE_TIMING
// enable fine-grained per-timestep timing monitor
//
// define BSSN_FINE_TIMING_EVERY
// report timing every N coarse timesteps
//
// define BSSN_FINE_TIMING_TOPN
// number of hottest timing buckets shown in stdout
//
// define BSSN_KERNEL_FINE_TIMING
// enable split timing inside compute_rhs_bssn
//
// define BSSN_ENABLE_STDIN_ABORT_POLL
// poll stdin and broadcast abort flag every coarse step
//
// define USE_GPU // define USE_GPU
// use gpu or not // use gpu or not
// //
@@ -142,4 +167,3 @@
#define TINY 1e-10 #define TINY 1e-10
#endif /* MICRODEF_H */ #endif /* MICRODEF_H */

View File

@@ -8,27 +8,16 @@ include makefile.inc
POLINT6_USE_BARY ?= 1 POLINT6_USE_BARY ?= 1
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY) POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt) ## Legacy GNU/OpenMPI flags
## make -> opt (PGO-guided, maximum performance) CXXBASEFLAGS = -O3 -march=native -Wno-deprecated -Dfortran3 -Dnewc $(INTERP_LB_FLAGS)
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data) F90BASEFLAGS = -O3 -march=native -cpp -fallow-argument-mismatch $(POLINT6_FLAG)
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
ifeq ($(PGO_MODE),instrument)
ifeq ($(PGO_MODE),instrument) CXXAPPFLAGS = $(CXXBASEFLAGS)
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability f90appflags = $(F90BASEFLAGS)
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
else else
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \ CXXAPPFLAGS = $(CXXBASEFLAGS)
## PGO has been turned off, now tested and found to be negative optimization f90appflags = $(F90BASEFLAGS)
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
endif endif
.SUFFIXES: .o .f90 .C .for .cu .SUFFIXES: .o .f90 .C .for .cu
@@ -67,17 +56,14 @@ lopsided_kodis_c.o: lopsided_kodis_c.C
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ # ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata TP_OPTFLAGS = $(CXXBASEFLAGS) $(TP_OPENMP_FLAGS)
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \ TwoPunctures.o: TwoPunctures.C
-Dfortran3 -Dnewc -I${MKLROOT}/include ${CXX} $(TP_OPTFLAGS) -c $< -o $@
TwoPunctures.o: TwoPunctures.C TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ ${CXX} $(TP_OPTFLAGS) -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
# Input files # Input files
@@ -184,8 +170,8 @@ ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES) TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) $(CLINKER) $(TP_OPTFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean: clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f rm *.o ABE ABEGPU TwoPunctureABE make.log -f

56
AMSS_NCKU_source/makefile.inc Executable file → Normal file
View File

@@ -1,33 +1,27 @@
## GCC version (commented out) ## Legacy GNU/OpenMPI toolchain configuration
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Intel oneAPI version with oneMKL (Optimized for performance) ## OpenMPI wrappers are installed but may not be on PATH.
filein = -I/usr/include/ -I${MKLROOT}/include OMPI_BIN ?= /usr/lib64/openmpi/bin
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Wrapper compilers
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library f90 = $(OMPI_BIN)/mpifort
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 f77 = $(OMPI_BIN)/mpifort
CXX = $(OMPI_BIN)/mpicxx
CC = $(OMPI_BIN)/mpicc
CLINKER = $(OMPI_BIN)/mpicxx
## Memory allocator switch ## Extra include flags are not needed when using the OpenMPI wrappers.
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc) filein =
## 0 : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags) ## BLAS/LAPACK backend:
## opt : (default) maximum performance with PGO profile-guided optimization ## OpenBLAS on this system provides BLAS, CBLAS and LAPACK symbols.
## instrument : PGO Phase 1 instrumentation to collect fresh profile data BLAS_LAPACK_LIB ?= /lib64/libopenblaso.so.0
PGO_MODE ?= opt LDLIBS = $(BLAS_LAPACK_LIB) -lgfortran -lpthread -lm -ldl
## PGO build mode switch
## off : default legacy GNU build without PGO
## instrument : accepted for compatibility, currently same as off
PGO_MODE ?= off
## Interp_Points load balance profiling mode ## Interp_Points load balance profiling mode
## off : (default) no load balance instrumentation ## off : (default) no load balance instrumentation
@@ -49,17 +43,13 @@ endif
USE_CXX_KERNELS ?= 1 USE_CXX_KERNELS ?= 1
## RK4 kernel implementation switch ## RK4 kernel implementation switch
## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments) ## 1 (default) : use C/C++ rewrite of rungekutta4_rout
## 0 : use original Fortran rungekutta4_rout.o ## 0 : use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1 USE_CXX_RK4 ?= 1
f90 = ifx ## OpenMP is only used for TwoPunctures on the legacy toolchain.
f77 = ifx TP_OPENMP_FLAGS ?= -fopenmp
CXX = icpx
CC = icx
CLINKER = mpiicpx
Cu = nvcc Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc

File diff suppressed because it is too large Load Diff

View File

@@ -27,19 +27,24 @@ using namespace std;
class surface_integral class surface_integral
{ {
private: private:
int Symmetry, factor; int Symmetry, factor;
int N_theta, N_phi; // Number of points in Theta & Phi directions int N_theta, N_phi; // Number of points in Theta & Phi directions
double dphi, dcostheta; double dphi, dcostheta;
double *arcostheta, *wtcostheta; double *arcostheta, *wtcostheta;
int n_tot; // size of arrays int n_tot; // size of arrays
double *nx_g, *ny_g, *nz_g; // global list of unit normals double *nx_g, *ny_g, *nz_g; // global list of unit normals
int myrank, cpusize; int myrank, cpusize;
int wave_cache_spinw, wave_cache_maxl, wave_cache_modes;
public: double *wave_theta_pos, *wave_theta_neg;
surface_integral(int iSymmetry); double *wave_phi_cos, *wave_phi_sin;
~surface_integral(); void clear_wave_cache();
void build_wave_cache(int spinw, int maxl);
public:
surface_integral(int iSymmetry);
~surface_integral();
void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
int spinw, int maxl, int NN, double *RP, double *IP, int spinw, int maxl, int NN, double *RP, double *IP,
@@ -77,21 +82,37 @@ public:
double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &,
double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &,
double &, double &)); // NN is the length of RP and IP double &, double &)); // NN is the length of RP and IP
void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK, void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz, var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor); double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK, void surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz, var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor); double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_Wave(double rex, cgh *GH, ShellPatch *SH, void surf_WaveMassPAng(double rex, int lev, cgh *GH,
var *chi, var *trK, var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_WaveMassPAng(double rex, int lev, ShellPatch *GH,
var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_Wave(double rex, cgh *GH, ShellPatch *SH,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *chix, var *chiy, var *chiz, var *chix, var *chiy, var *chiz,
var *trKx, var *trKy, var *trKz, var *trKx, var *trKy, var *trKz,
@@ -110,12 +131,12 @@ public:
bool SR_Interp_Points(MyList<var> *VarList, cgh *GH, ShellPatch *SH, bool SR_Interp_Points(MyList<var> *VarList, cgh *GH, ShellPatch *SH,
int NN, double **XX, double *Shellf); int NN, double **XX, double *Shellf);
void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK, void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz, var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
double *Rout, monitor *Monitor, MPI_Comm Comm_here); double *Rout, monitor *Monitor, MPI_Comm Comm_here, bool refresh_mass_fields = true);
void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
int spinw, int maxl, int NN, double *RP, double *IP, int spinw, int maxl, int NN, double *RP, double *IP,
monitor *Monitor, MPI_Comm Comm_here); monitor *Monitor, MPI_Comm Comm_here);

View File

@@ -93,11 +93,13 @@ Here, we take the Ubuntu 22.04 system as an example
#### How to use AMSS-NCKU #### How to use AMSS-NCKU
0. Setting the parameters for compilation 0. Setting the parameters for compilation
Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer. Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer.
The settings for the Ubuntu 22.04 system do not need to be modified. The default configuration in this branch uses GNU compilers through the OpenMPI wrappers under `/usr/lib64/openmpi/bin`.
If your OpenMPI installation is in another location, update `OMPI_BIN` in `AMSS_NCKU_source/makefile.inc` or export `AMSS_OPENMPI_BIN` before running the Python launcher.
1. Enter the AMSS-NCKU Python code folder and modify the input. 1. Enter the AMSS-NCKU Python code folder and modify the input.

View File

@@ -144,6 +144,62 @@ def generate_macrodef_h():
print( "#define REGLEV 0", file=file1 ) print( "#define REGLEV 0", file=file1 )
print( file=file1 ) print( file=file1 )
# Define fine-grained timing/debug macros.
# All of them default to OFF so production builds do not pay profiling overhead.
fine_timing = getattr(input_data, "Fine_Timing",
getattr(input_data, "Finegrained_Timing", "no"))
kernel_fine_timing = getattr(input_data, "Kernel_Fine_Timing",
getattr(input_data, "BSSN_Kernel_Fine_Timing", "no"))
stdin_abort_poll = getattr(input_data, "Enable_Stdin_Abort_Poll",
getattr(input_data, "Stdin_Abort_Poll", "no"))
timing_report_every = max(1, int(getattr(
input_data, "Timing_Every_Steps",
getattr(input_data, "Timing_Report_Every", 1))))
timing_top_hotspots = max(1, int(getattr(
input_data, "Timing_Top_Hotspots", 8)))
if ( fine_timing == "yes" ):
print( "#define BSSN_FINE_TIMING 1", file=file1 )
print( file=file1 )
elif ( fine_timing == "no" ):
print( "#define BSSN_FINE_TIMING 0", file=file1 )
print( file=file1 )
else:
print( "Fine_Timing setting error!!!" )
print()
print( "# Fine_Timing setting error!!!", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_EVERY {timing_report_every}", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_TOPN {timing_top_hotspots}", file=file1 )
print( file=file1 )
if ( kernel_fine_timing == "yes" ):
print( "#define BSSN_KERNEL_FINE_TIMING 1", file=file1 )
print( file=file1 )
elif ( kernel_fine_timing == "no" ):
print( "#define BSSN_KERNEL_FINE_TIMING 0", file=file1 )
print( file=file1 )
else:
print( "Kernel_Fine_Timing setting error!!!" )
print()
print( "# Kernel_Fine_Timing setting error!!!", file=file1 )
print( file=file1 )
if ( stdin_abort_poll == "yes" ):
print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 1", file=file1 )
print( file=file1 )
elif ( stdin_abort_poll == "no" ):
print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 0", file=file1 )
print( file=file1 )
else:
print( "Enable_Stdin_Abort_Poll setting error!!!" )
print()
print( "# Enable_Stdin_Abort_Poll setting error!!!", file=file1 )
print( file=file1 )
# Define macro USE_GPU # Define macro USE_GPU
# use GPU or not # use GPU or not
@@ -224,6 +280,21 @@ def generate_macrodef_h():
print( "// 0: for every level;", file=file1 ) print( "// 0: for every level;", file=file1 )
print( "// 1: for all", file=file1 ) print( "// 1: for all", file=file1 )
print( "//", file=file1 ) print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING", file=file1 )
print( "// enable fine-grained per-timestep timing monitor", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_EVERY", file=file1 )
print( "// report timing every N coarse timesteps", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_TOPN", file=file1 )
print( "// number of hottest timing buckets shown in stdout", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_KERNEL_FINE_TIMING", file=file1 )
print( "// enable split timing inside compute_rhs_bssn", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_ENABLE_STDIN_ABORT_POLL", file=file1 )
print( "// poll stdin and broadcast abort flag every coarse step", file=file1 )
print( "//", file=file1 )
print( "// define USE_GPU", file=file1 ) print( "// define USE_GPU", file=file1 )
print( "// use gpu or not", file=file1 ) print( "// use gpu or not", file=file1 )
print( "//", file=file1 ) print( "//", file=file1 )

View File

@@ -9,6 +9,7 @@
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import os
import subprocess import subprocess
import time import time
@@ -52,6 +53,8 @@ NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
## Build parallelism: match the number of bound cores ## Build parallelism: match the number of bound cores
BUILD_JOBS = 64 BUILD_JOBS = 64
OPENMPI_BIN = os.environ.get("AMSS_OPENMPI_BIN", "/usr/lib64/openmpi/bin")
MPI_RUNNER = os.path.join(OPENMPI_BIN, "mpirun")
################################################################## ##################################################################
@@ -147,11 +150,11 @@ def run_ABE():
## Define the command to run; cast other values to strings as needed ## Define the command to run; cast other values to strings as needed
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABE"
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" #mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log" mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output ## Execute the MPI command and stream output