Compare commits

..

1 Commits

Author SHA1 Message Date
c6e4d4ab71 Add OpenMP parallelization to BSSN RHS hot-path stencil routines
Enable OpenMP threading for the dominant computational kernels:
- makefile.inc: add -qopenmp to f90appflags
- diff_new.f90: split fderivs/fdderivs into OpenMP interior + serial boundary
- kodiss.f90: split kodis into OpenMP interior + serial boundary
- lopsidediff.f90: add OMP PARALLEL DO COLLAPSE(2) to lopsided
- fmisc.f90: parallelize symmetry_bd bulk array copy
- bssn_rhs.f90: add OMP WORKSHARE to array-syntax operations

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 13:58:55 +08:00
30 changed files with 13856 additions and 16583 deletions

View File

@@ -8,14 +8,6 @@
## ##
################################################################## ##################################################################
## Guard against re-execution by multiprocessing child processes.
## Without this, using 'spawn' or 'forkserver' context would cause every
## worker to re-run the entire script, spawning exponentially more
## workers (fork bomb).
if __name__ != '__main__':
import sys as _sys
_sys.exit(0)
################################################################## ##################################################################
@@ -66,8 +58,7 @@ if os.path.exists(File_directory):
## Prompt whether to overwrite the existing directory ## Prompt whether to overwrite the existing directory
while True: while True:
try: try:
## inputvalue = input() inputvalue = input()
inputvalue = "continue"
## If the user agrees to overwrite, proceed and remove the existing directory ## If the user agrees to overwrite, proceed and remove the existing directory
if ( inputvalue == "continue" ): if ( inputvalue == "continue" ):
print( " Continue the calculation !!! " ) print( " Continue the calculation !!! " )
@@ -433,31 +424,26 @@ print(
import plot_xiaoqu import plot_xiaoqu
import plot_GW_strain_amplitude_xiaoqu import plot_GW_strain_amplitude_xiaoqu
from parallel_plot_helper import run_plot_tasks_parallel
plot_tasks = []
## Plot black hole trajectory ## Plot black hole trajectory
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot, (binary_results_directory, figure_directory) ) ) plot_xiaoqu.generate_puncture_orbit_plot( binary_results_directory, figure_directory )
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) ) plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
## Plot black hole separation vs. time ## Plot black hole separation vs. time
plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) ) plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
## Plot gravitational waveforms (psi4 and strain amplitude) ## Plot gravitational waveforms (psi4 and strain amplitude)
for i in range(input_data.Detector_Number): for i in range(input_data.Detector_Number):
plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) ) plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) ) plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
## Plot ADM mass evolution ## Plot ADM mass evolution
for i in range(input_data.Detector_Number): for i in range(input_data.Detector_Number):
plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) ) plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
## Plot Hamiltonian constraint violation over time ## Plot Hamiltonian constraint violation over time
for i in range(input_data.grid_level): for i in range(input_data.grid_level):
plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) ) plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
run_plot_tasks_parallel(plot_tasks)
## Plot stored binary data ## Plot stored binary data
plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory ) plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )

File diff suppressed because it is too large Load Diff

View File

@@ -39,10 +39,6 @@ public:
bool Find_Point(double *XX); bool Find_Point(double *XX);
void Interp_Points(MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry,
int Nmin_consumer, int Nmax_consumer);
void Interp_Points(MyList<var> *VarList, void Interp_Points(MyList<var> *VarList,
int NN, double **XX, int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here); double *Shellf, int Symmetry, MPI_Comm Comm_here);

View File

@@ -24,7 +24,6 @@ using namespace std;
#endif #endif
#include <mpi.h> #include <mpi.h>
#include <memory.h>
#include "MyList.h" #include "MyList.h"
#include "Block.h" #include "Block.h"
#include "Parallel.h" #include "Parallel.h"

File diff suppressed because it is too large Load Diff

View File

@@ -1,235 +1,167 @@
#ifndef PARALLEL_H #ifndef PARALLEL_H
#define PARALLEL_H #define PARALLEL_H
#include <iostream> #include <iostream>
#include <iomanip> #include <iomanip>
#include <fstream> #include <fstream>
#include <cstdlib> #include <cstdlib>
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <cmath> #include <cmath>
#include <new> #include <new>
using namespace std; using namespace std;
#include <memory.h>
#include "Parallel_bam.h" #include "Parallel_bam.h"
#include "var.h" #include "var.h"
#include "MPatch.h" #include "MPatch.h"
#include "Block.h" #include "Block.h"
#include "MyList.h" #include "MyList.h"
#include "macrodef.h" //need dim; ghost_width; CONTRACT #include "macrodef.h" //need dim; ghost_width; CONTRACT
namespace Parallel namespace Parallel
{ {
struct gridseg struct gridseg
{ {
double llb[dim]; double llb[dim];
double uub[dim]; double uub[dim];
int shape[dim]; int shape[dim];
double illb[dim], iuub[dim]; // only use for OutBdLow2Hi double illb[dim], iuub[dim]; // only use for OutBdLow2Hi
Block *Bg; Block *Bg;
}; };
int partition1(int &nx, int split_size, int min_width, int cpusize, int shape); // special for 1 diemnsion int partition1(int &nx, int split_size, int min_width, int cpusize, int shape); // special for 1 diemnsion
int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape); int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
MyList<Block> *distribute_hard(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks void KillBlocks(MyList<Patch> *PatchLIST);
Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
int ib0_orig, int ib3_orig, void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
int jb1_orig, int jb4_orig, void setfunction(int rank, MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
int kb2_orig, int kb5_orig, void writefile(double time, int nx, int ny, int nz, double xmin, double xmax, double ymin, double ymax,
Patch* PP, int r_left, int r_right, double zmin, double zmax, char *filename, double *data_out);
int ingfsi, int fngfsi, bool periodic, void writefile(double time, int nx, int ny, double xmin, double xmax, double ymin, double ymax,
Block* &split_first_block, Block* &split_last_block); char *filename, double *datain);
Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox, void getarrayindex(int DIM, int *shape, int *index, int n);
int block_id, int ingfsi, int fngfsi, int lev); int getarraylocation(int DIM, int *shape, int *index);
void KillBlocks(MyList<Patch> *PatchLIST); void copy(int DIM, double *llbout, double *uubout, int *Dshape, double *DD, double *llbin, double *uubin,
int *shape, double *datain, double *llb, double *uub);
void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z)); void Dump_CPU_Data(MyList<Block> *BlL, MyList<var> *DumpList, char *tag, double time, double dT);
void setfunction(int rank, MyList<Block> *BlL, var *vn, double func(double x, double y, double z)); void Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT);
void writefile(double time, int nx, int ny, int nz, double xmin, double xmax, double ymin, double ymax, void Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd);
double zmin, double zmax, char *filename, double *data_out); double *Collect_Data(Patch *PP, var *VP);
void writefile(double time, int nx, int ny, double xmin, double xmax, double ymin, double ymax, void d2Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT);
char *filename, double *datain); void d2Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd);
void getarrayindex(int DIM, int *shape, int *index, int n); void Dump_Data0(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT);
int getarraylocation(int DIM, int *shape, int *index); double global_interp(int DIM, int *ext, double **CoX, double *datain,
void copy(int DIM, double *llbout, double *uubout, int *Dshape, double *DD, double *llbin, double *uubin, double *poX, int ordn, double *SoA, int Symmetry);
int *shape, double *datain, double *llb, double *uub); double global_interp(int DIM, int *ext, double **CoX, double *datain,
void Dump_CPU_Data(MyList<Block> *BlL, MyList<var> *DumpList, char *tag, double time, double dT); double *poX, int ordn);
void Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT); double Lagrangian_Int(double x, int npts, double *xpts, double *funcvals);
void Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd); double LagrangePoly(double x, int pt, int npts, double *xpts);
double *Collect_Data(Patch *PP, var *VP); MyList<gridseg> *build_complete_gsl(Patch *Pat);
void d2Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT); MyList<gridseg> *build_complete_gsl(MyList<Patch> *PatL);
void d2Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd); MyList<gridseg> *build_complete_gsl_virtual(MyList<Patch> *PatL);
void Dump_Data0(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT); MyList<gridseg> *build_complete_gsl_virtual2(MyList<Patch> *PatL); // - buffer
double global_interp(int DIM, int *ext, double **CoX, double *datain, MyList<gridseg> *build_owned_gsl0(Patch *Pat, int rank_in); // - ghost without extension, special for Sync usage
double *poX, int ordn, double *SoA, int Symmetry); MyList<gridseg> *build_owned_gsl1(Patch *Pat, int rank_in); // - ghost, similar to build_owned_gsl0 but extend one point on left side for vertex grid
double global_interp(int DIM, int *ext, double **CoX, double *datain, MyList<gridseg> *build_owned_gsl2(Patch *Pat, int rank_in); // - buffer - ghost
double *poX, int ordn); MyList<gridseg> *build_owned_gsl3(Patch *Pat, int rank_in, int Symmetry); // - ghost - BD ghost
double Lagrangian_Int(double x, int npts, double *xpts, double *funcvals); MyList<gridseg> *build_owned_gsl4(Patch *Pat, int rank_in, int Symmetry); // - buffer - ghost - BD ghost
double LagrangePoly(double x, int pt, int npts, double *xpts); MyList<gridseg> *build_owned_gsl5(Patch *Pat, int rank_in); // similar to build_owned_gsl2 but no extension
MyList<gridseg> *build_complete_gsl(Patch *Pat); MyList<gridseg> *build_owned_gsl(MyList<Patch> *PatL, int rank_in, int type, int Symmetry);
MyList<gridseg> *build_complete_gsl(MyList<Patch> *PatL); void build_gstl(MyList<gridseg> *srci, MyList<gridseg> *dsti, MyList<gridseg> **out_src, MyList<gridseg> **out_dst);
MyList<gridseg> *build_complete_gsl_virtual(MyList<Patch> *PatL); int data_packer(double *data, MyList<gridseg> *src, MyList<gridseg> *dst, int rank_in, int dir,
MyList<gridseg> *build_complete_gsl_virtual2(MyList<Patch> *PatL); // - buffer MyList<var> *VarLists, MyList<var> *VarListd, int Symmetry);
MyList<gridseg> *build_owned_gsl0(Patch *Pat, int rank_in); // - ghost without extension, special for Sync usage void transfer(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<gridseg> *build_owned_gsl1(Patch *Pat, int rank_in); // - ghost, similar to build_owned_gsl0 but extend one point on left side for vertex grid MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
MyList<gridseg> *build_owned_gsl2(Patch *Pat, int rank_in); // - buffer - ghost int Symmetry);
MyList<gridseg> *build_owned_gsl3(Patch *Pat, int rank_in, int Symmetry); // - ghost - BD ghost int data_packermix(double *data, MyList<gridseg> *src, MyList<gridseg> *dst, int rank_in, int dir,
MyList<gridseg> *build_owned_gsl4(Patch *Pat, int rank_in, int Symmetry); // - buffer - ghost - BD ghost MyList<var> *VarLists, MyList<var> *VarListd, int Symmetry);
MyList<gridseg> *build_owned_gsl5(Patch *Pat, int rank_in); // similar to build_owned_gsl2 but no extension void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<gridseg> *build_owned_gsl(MyList<Patch> *PatL, int rank_in, int type, int Symmetry); MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
void build_gstl(MyList<gridseg> *srci, MyList<gridseg> *dsti, MyList<gridseg> **out_src, MyList<gridseg> **out_dst); int Symmetry);
int data_packer(double *data, MyList<gridseg> *src, MyList<gridseg> *dst, int rank_in, int dir, void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
MyList<var> *VarLists, MyList<var> *VarListd, int Symmetry); void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void transfer(MyList<gridseg> **src, MyList<gridseg> **dst, void OutBdLow2Hi(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */, MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry); int Symmetry);
int data_packermix(double *data, MyList<gridseg> *src, MyList<gridseg> *dst, int rank_in, int dir, void OutBdLow2Hi(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarLists, MyList<var> *VarListd, int Symmetry); MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst, int Symmetry);
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */, void OutBdLow2Himix(Patch *Patc, Patch *Patf,
int Symmetry); MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry); int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry); void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry); MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
struct SyncCache { void Prolong(Patch *Patc, Patch *Patf,
bool valid; MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int cpusize; int Symmetry);
MyList<gridseg> **combined_src; void Prolongint(Patch *Patc, Patch *Patf,
MyList<gridseg> **combined_dst; MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int *send_lengths; int Symmetry);
int *recv_lengths; void Restrict(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
double **send_bufs; MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
double **recv_bufs; int Symmetry);
int *send_buf_caps; void Restrict_after(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
int *recv_buf_caps; MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
MPI_Request *reqs; int Symmetry); // for -ghost - BDghost
MPI_Status *stats; MyList<Parallel::gridseg> *build_PhysBD_gsl(Patch *Pat);
int max_reqs; MyList<Parallel::gridseg> *build_ghost_gsl(MyList<Patch> *PatL);
bool lengths_valid; MyList<Parallel::gridseg> *build_ghost_gsl(Patch *Pat);
SyncCache(); MyList<Parallel::gridseg> *build_buffer_gsl(Patch *Pat);
void invalidate(); MyList<Parallel::gridseg> *build_buffer_gsl(MyList<Patch> *PatL);
void destroy(); MyList<Parallel::gridseg> *gsl_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
}; MyList<Parallel::gridseg> *gs_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
MyList<Parallel::gridseg> *gsl_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache); MyList<Parallel::gridseg> *gs_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst, MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only);
MyList<var> *VarList1, MyList<var> *VarList2, MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue
int Symmetry, SyncCache &cache); MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat);
void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
struct AsyncSyncState { MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
int req_no; void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
bool active; double L2Norm(Patch *Pat, var *vf);
AsyncSyncState() : req_no(0), active(false) {} void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
}; void checkvarl(MyList<var> *pp, bool first_only);
MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat);
SyncCache &cache, AsyncSyncState &state); void prepare_inter_time_level(Patch *Pat,
void Sync_finish(SyncCache &cache, AsyncSyncState &state, MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
MyList<var> *VarList, int Symmetry); MyList<var> *VarList3 /* target (t+a*dt) */, int tindex);
void OutBdLow2Hi(Patch *Patc, Patch *Patf, void prepare_inter_time_level(Patch *Pat,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
int Symmetry); MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex);
void OutBdLow2Hi(MyList<Patch> *PatcL, MyList<Patch> *PatfL, void prepare_inter_time_level(MyList<Patch> *PatL,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
int Symmetry); MyList<var> *VarList3 /* target (t+a*dt) */, int tindex);
void OutBdLow2Himix(Patch *Patc, Patch *Patf, void prepare_inter_time_level(MyList<Patch> *Pat,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
int Symmetry); MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex);
void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL, void merge_gsl(MyList<gridseg> *&A, const double ratio);
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, bool merge_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C, const double ratio);
int Symmetry); // Add ghost region to tangent plane
void Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL, // we assume the grids have the same resolution
MyList<var> *VarList1, MyList<var> *VarList2, void add_ghost_touch(MyList<gridseg> *&A);
int Symmetry, SyncCache &cache); void cut_gsl(MyList<gridseg> *&A);
void OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL, bool cut_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C);
MyList<var> *VarList1, MyList<var> *VarList2, MyList<Parallel::gridseg> *gs_subtract_virtual(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
int Symmetry, SyncCache &cache); void fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyList<Patch> *PatcL,
void OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL, MyList<var> *OldList, MyList<var> *StateList, MyList<var> *FutureList,
MyList<var> *VarList1, MyList<var> *VarList2, MyList<var> *tmList, int Symmetry, bool BB, bool CC);
int Symmetry, SyncCache &cache); bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
void Prolong(Patch *Patc, Patch *Patf, int NN, double **XX,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, double *Shellf, int Symmetry);
int Symmetry); void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape);
void Prolongint(Patch *Patc, Patch *Patf, bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl);
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, void checkpatchlist(MyList<Patch> *PatL, bool buflog);
int Symmetry);
void Restrict(MyList<Patch> *PatcL, MyList<Patch> *PatfL, double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
int Symmetry); int NN, double **XX,
void Restrict_after(MyList<Patch> *PatcL, MyList<Patch> *PatfL, double *Shellf, int Symmetry, MPI_Comm Comm_here);
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */, #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
int Symmetry); // for -ghost - BDghost MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
MyList<Parallel::gridseg> *build_PhysBD_gsl(Patch *Pat); bool periodic, int start_rank, int end_rank, int nodes = 0);
MyList<Parallel::gridseg> *build_ghost_gsl(MyList<Patch> *PatL); #endif
MyList<Parallel::gridseg> *build_ghost_gsl(Patch *Pat); }
MyList<Parallel::gridseg> *build_buffer_gsl(Patch *Pat); #endif /*PARALLEL_H */
MyList<Parallel::gridseg> *build_buffer_gsl(MyList<Patch> *PatL);
MyList<Parallel::gridseg> *gsl_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
MyList<Parallel::gridseg> *gs_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
MyList<Parallel::gridseg> *gsl_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
MyList<Parallel::gridseg> *gs_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only);
MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue
MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat);
void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
double L2Norm(Patch *Pat, var *vf);
void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
void checkvarl(MyList<var> *pp, bool first_only);
MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat);
void prepare_inter_time_level(Patch *Pat,
MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
MyList<var> *VarList3 /* target (t+a*dt) */, int tindex);
void prepare_inter_time_level(Patch *Pat,
MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex);
void prepare_inter_time_level(MyList<Patch> *PatL,
MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
MyList<var> *VarList3 /* target (t+a*dt) */, int tindex);
void prepare_inter_time_level(MyList<Patch> *Pat,
MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex);
void merge_gsl(MyList<gridseg> *&A, const double ratio);
bool merge_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C, const double ratio);
// Add ghost region to tangent plane
// we assume the grids have the same resolution
void add_ghost_touch(MyList<gridseg> *&A);
void cut_gsl(MyList<gridseg> *&A);
bool cut_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C);
MyList<Parallel::gridseg> *gs_subtract_virtual(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
void fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyList<Patch> *PatcL,
MyList<var> *OldList, MyList<var> *StateList, MyList<var> *FutureList,
MyList<var> *tmList, int Symmetry, bool BB, bool CC);
bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry);
void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape);
bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl);
void checkpatchlist(MyList<Patch> *PatL, bool buflog);
double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here);
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0);
// Redistribute blocks with time statistics for load balancing
MyList<Block> *distribute(MyList<Patch> *PatchLIST, MyList<Block> *OldBlockL,
int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0);
#endif
// Dynamic load balancing: split blocks for heavy ranks
void split_heavy_blocks(MyList<Patch> *PatL, int *heavy_ranks, int num_heavy,
int split_factor, int cpusize, int ingfsi, int fngfsi);
// Check if load balancing is needed based on interpolation times
bool check_load_balance_need(double *rank_times, int nprocs, int &num_heavy, int *heavy_ranks);
}
#endif /*PARALLEL_H */

File diff suppressed because it is too large Load Diff

View File

@@ -1,8 +1,7 @@
#ifndef TWO_PUNCTURES_H #ifndef TWO_PUNCTURES_H
#define TWO_PUNCTURES_H #define TWO_PUNCTURES_H
#include <omp.h>
#define StencilSize 19 #define StencilSize 19
#define N_PlaneRelax 1 #define N_PlaneRelax 1
#define NRELAX 200 #define NRELAX 200
@@ -33,7 +32,7 @@ private:
int npoints_A, npoints_B, npoints_phi; int npoints_A, npoints_B, npoints_phi;
double target_M_plus, target_M_minus; double target_M_plus, target_M_minus;
double admMass; double admMass;
double adm_tol; double adm_tol;
@@ -43,17 +42,32 @@ private:
int ntotal; int ntotal;
// ===== Precomputed spectral derivative matrices ===== // Pre-allocated workspace buffers for hot-path allocation elimination
double *D1_A, *D2_A; // LineRelax_be workspace (sized for n2)
double *D1_B, *D2_B; double *ws_diag_be, *ws_e_be, *ws_f_be, *ws_b_be, *ws_x_be;
double *DF1_phi, *DF2_phi; // LineRelax_al workspace (sized for n1)
double *ws_diag_al, *ws_e_al, *ws_f_al, *ws_b_al, *ws_x_al;
// ===== Pre-allocated workspace for LineRelax (per-thread) ===== // ThomasAlgorithm workspace (sized for max(n1,n2))
int max_threads; double *ws_thomas_y;
double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be; // JFD_times_dv workspace (sized for nvar)
double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be; double *ws_jfd_values;
double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al; derivs ws_jfd_dU, ws_jfd_U;
double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al; // chebft_Zeros workspace (sized for max(n1,n2,n3)+1)
double *ws_cheb_c;
// fourft workspace (sized for max(n1,n2,n3)/2+1 each)
double *ws_four_a, *ws_four_b;
// Derivatives_AB3 workspace
double *ws_deriv_p, *ws_deriv_dp, *ws_deriv_d2p;
double *ws_deriv_q, *ws_deriv_dq;
double *ws_deriv_r, *ws_deriv_dr;
int *ws_deriv_indx;
// F_of_v workspace
double *ws_fov_sources;
double *ws_fov_values;
derivs ws_fov_U;
// J_times_dv workspace
double *ws_jtdv_values;
derivs ws_jtdv_dU, ws_jtdv_U;
struct parameters struct parameters
{ {
@@ -71,28 +85,6 @@ public:
int Newtonmaxit); int Newtonmaxit);
~TwoPunctures(); ~TwoPunctures();
// 02/07: New/modified methods
void allocate_workspace();
void free_workspace();
void precompute_derivative_matrices();
void build_cheb_deriv_matrices(int n, double *D1, double *D2);
void build_fourier_deriv_matrices(int N, double *DF1, double *DF2);
void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v);
void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q,
double *l, double *u_ws, double *d, double *y);
void LineRelax_be_omp(double *dv,
int const i, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols,
double **JFD, int tid);
void LineRelax_al_omp(double *dv,
int const j, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols,
int **cols, double **JFD, int tid);
void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols, double **JFD);
void Solve(); void Solve();
void set_initial_guess(derivs v); void set_initial_guess(derivs v);
int index(int i, int j, int k, int l, int a, int b, int c, int d); int index(int i, int j, int k, int l, int a, int b, int c, int d);
@@ -151,11 +143,23 @@ public:
double BY_KKofxyz(double x, double y, double z); double BY_KKofxyz(double x, double y, double z);
void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix); void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix);
void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u); void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u);
void relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols, double **JFD);
void LineRelax_be(double *dv,
int const i, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols,
double **JFD);
void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2, void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
int n3, derivs dv, derivs u, double *values); int n3, derivs dv, derivs u, double *values);
void LinEquations(double A, double B, double X, double R, void LinEquations(double A, double B, double X, double R,
double x, double r, double phi, double x, double r, double phi,
double y, double z, derivs dU, derivs U, double *values); double y, double z, derivs dU, derivs U, double *values);
void LineRelax_al(double *dv,
int const j, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols,
int **cols, double **JFD);
void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q); void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q);
void Save(char *fname); void Save(char *fname);
// provided by Vasileios Paschalidis (vpaschal@illinois.edu) // provided by Vasileios Paschalidis (vpaschal@illinois.edu)
@@ -164,4 +168,4 @@ public:
void SpecCoef(parameters par, int ivar, double *v, double *cf); void SpecCoef(parameters par, int ivar, double *v, double *cf);
}; };
#endif /* TWO_PUNCTURES_H */ #endif /* TWO_PUNCTURES_H */

View File

@@ -730,12 +730,6 @@ void bssn_class::Initialize()
PhysTime = StartTime; PhysTime = StartTime;
Setup_Black_Hole_position(); Setup_Black_Hole_position();
} }
// Initialize sync caches (per-level, for predictor and corrector)
sync_cache_pre = new Parallel::SyncCache[GH->levels];
sync_cache_cor = new Parallel::SyncCache[GH->levels];
sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
} }
//================================================================================================ //================================================================================================
@@ -987,32 +981,6 @@ bssn_class::~bssn_class()
delete Azzz; delete Azzz;
#endif #endif
// Destroy sync caches before GH
if (sync_cache_pre)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_pre[i].destroy();
delete[] sync_cache_pre;
}
if (sync_cache_cor)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_cor[i].destroy();
delete[] sync_cache_cor;
}
if (sync_cache_rp_coarse)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_rp_coarse[i].destroy();
delete[] sync_cache_rp_coarse;
}
if (sync_cache_rp_fine)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_rp_fine[i].destroy();
delete[] sync_cache_rp_fine;
}
delete GH; delete GH;
#ifdef WithShell #ifdef WithShell
delete SH; delete SH;
@@ -2213,7 +2181,6 @@ void bssn_class::Evolve(int Steps)
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif #endif
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2)) #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2429,7 +2396,6 @@ void bssn_class::RecursiveStep(int lev)
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif #endif
} }
@@ -2608,7 +2574,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif #endif
} }
@@ -2775,7 +2740,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor); fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2790,7 +2754,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2809,7 +2772,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -2825,7 +2787,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre, SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor); fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear(); // a_stream.clear();
// a_stream.str(""); // a_stream.str("");
@@ -3197,7 +3158,21 @@ void bssn_class::Step(int lev, int YN)
} }
Pp = Pp->next; Pp = Pp->next;
} }
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell #ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
@@ -3215,9 +3190,9 @@ void bssn_class::Step(int lev, int YN)
{ {
#if (AGM == 0) #if (AGM == 0)
f_enforce_ga(cg->shape, f_enforce_ga(cg->shape,
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif #endif
@@ -3341,16 +3316,25 @@ void bssn_class::Step(int lev, int YN)
#endif #endif
} }
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
#endif #endif
Parallel::AsyncSyncState async_pre; Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -3363,29 +3347,12 @@ void bssn_class::Step(int lev, int YN)
{ {
prev_clock = curr_clock; prev_clock = curr_clock;
curr_clock = clock(); curr_clock = clock();
cout << " Shell stuff synchronization used " cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
} }
} }
#endif #endif
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
@@ -3561,7 +3528,24 @@ void bssn_class::Step(int lev, int YN)
Pp = Pp->next; Pp = Pp->next;
} }
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell #ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
@@ -3579,9 +3563,9 @@ void bssn_class::Step(int lev, int YN)
{ {
#if (AGM == 0) #if (AGM == 0)
f_enforce_ga(cg->shape, f_enforce_ga(cg->shape,
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#elif (AGM == 1) #elif (AGM == 1)
if (iter_count == 3) if (iter_count == 3)
@@ -3701,16 +3685,26 @@ void bssn_class::Step(int lev, int YN)
sPp = sPp->next; sPp = sPp->next;
} }
} }
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req_cor;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#"
<< iter_count << " variables at t = "
<< PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
#endif #endif
Parallel::AsyncSyncState async_cor; Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -3723,31 +3717,12 @@ void bssn_class::Step(int lev, int YN)
{ {
prev_clock = curr_clock; prev_clock = curr_clock;
curr_clock = clock(); curr_clock = clock();
cout << " Shell stuff synchronization used " cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
} }
} }
#endif #endif
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
@@ -4059,7 +4034,22 @@ void bssn_class::Step(int lev, int YN)
} }
Pp = Pp->next; Pp = Pp->next;
} }
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell #ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
@@ -4077,15 +4067,15 @@ void bssn_class::Step(int lev, int YN)
{ {
#if (AGM == 0) #if (AGM == 0)
f_enforce_ga(cg->shape, f_enforce_ga(cg->shape,
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif #endif
if (f_compute_rhs_bssn_ss(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], if (f_compute_rhs_bssn_ss(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[fngfs + ShellPatch::gx], cg->fgfs[fngfs + ShellPatch::gx],
cg->fgfs[fngfs + ShellPatch::gy], cg->fgfs[fngfs + ShellPatch::gy],
cg->fgfs[fngfs + ShellPatch::gz], cg->fgfs[fngfs + ShellPatch::gz],
cg->fgfs[fngfs + ShellPatch::drhodx], cg->fgfs[fngfs + ShellPatch::drhodx],
cg->fgfs[fngfs + ShellPatch::drhody], cg->fgfs[fngfs + ShellPatch::drhody],
@@ -4200,16 +4190,25 @@ void bssn_class::Step(int lev, int YN)
} }
#endif #endif
} }
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = "
<< PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
#endif #endif
Parallel::AsyncSyncState async_pre; Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -4222,27 +4221,9 @@ void bssn_class::Step(int lev, int YN)
{ {
prev_clock = curr_clock; prev_clock = curr_clock;
curr_clock = clock(); curr_clock = clock();
cout << " Shell stuff synchronization used " cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
}
}
#endif
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
} }
} }
#endif #endif
@@ -4405,7 +4386,23 @@ void bssn_class::Step(int lev, int YN)
Pp = Pp->next; Pp = Pp->next;
} }
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls // check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell #ifdef WithShell
// evolve Shell Patches // evolve Shell Patches
@@ -4423,9 +4420,9 @@ void bssn_class::Step(int lev, int YN)
{ {
#if (AGM == 0) #if (AGM == 0)
f_enforce_ga(cg->shape, f_enforce_ga(cg->shape,
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#elif (AGM == 1) #elif (AGM == 1)
if (iter_count == 3) if (iter_count == 3)
@@ -4545,16 +4542,25 @@ void bssn_class::Step(int lev, int YN)
sPp = sPp->next; sPp = sPp->next;
} }
} }
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req_cor;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
#endif #endif
Parallel::AsyncSyncState async_cor; Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -4567,30 +4573,11 @@ void bssn_class::Step(int lev, int YN)
{ {
prev_clock = curr_clock; prev_clock = curr_clock;
curr_clock = clock(); curr_clock = clock();
cout << " Shell stuff synchronization used " cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
} }
} }
#endif
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif #endif
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
@@ -4956,19 +4943,11 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation");
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
} }
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR) if (ERROR)
{ {
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
@@ -4980,6 +4959,10 @@ void bssn_class::Step(int lev, int YN)
} }
} }
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
@@ -5157,34 +5140,30 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check");
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency // check error information
MPI_Request err_req_cor;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req_cor); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
} }
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR) if (ERROR)
{ {
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0) if (myrank == 0)
{ {
if (ErrorMonitor->outfile) if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << " variables at t = " << PhysTime
<< ", lev = " << lev << endl; << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
} }
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
#if (MAPBH == 0) #if (MAPBH == 0)
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
@@ -5468,11 +5447,21 @@ void bssn_class::SHStep()
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor's error check"); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor's error check");
#endif #endif
// Non-blocking error reduction overlapped with Synch to hide Allreduce latency // check error information
MPI_Request err_req;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
{ {
@@ -5484,25 +5473,12 @@ void bssn_class::SHStep()
{ {
prev_clock = curr_clock; prev_clock = curr_clock;
curr_clock = clock(); curr_clock = clock();
cout << " Shell stuff synchronization used " cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
} }
} }
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
// corrector // corrector
for (iter_count = 1; iter_count < 4; iter_count++) for (iter_count = 1; iter_count < 4; iter_count++)
{ {
@@ -5645,11 +5621,21 @@ void bssn_class::SHStep()
sPp = sPp->next; sPp = sPp->next;
} }
} }
// Non-blocking error reduction overlapped with Synch to hide Allreduce latency // check error information
MPI_Request err_req_cor;
{ {
int erh = ERROR; int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
} }
{ {
@@ -5661,26 +5647,12 @@ void bssn_class::SHStep()
{ {
prev_clock = curr_clock; prev_clock = curr_clock;
curr_clock = clock(); curr_clock = clock();
cout << " Shell stuff synchronization used " cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl; << " seconds! " << endl;
} }
} }
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
sPp = SH->PatL; sPp = SH->PatL;
while (sPp) while (sPp)
{ {
@@ -5809,7 +5781,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// a_stream.clear(); // a_stream.clear();
@@ -5819,11 +5791,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5860,7 +5842,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// a_stream.clear(); // a_stream.clear();
@@ -5870,11 +5852,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -5888,7 +5880,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif #endif
} }
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync(GH->PatL[lev], SL, Symmetry);
#if (PSTR == 1 || PSTR == 2) #if (PSTR == 1 || PSTR == 2)
// a_stream.clear(); // a_stream.clear();
@@ -5946,14 +5938,24 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5968,21 +5970,31 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
#endif #endif
} }
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync(GH->PatL[lev], SL, Symmetry);
} }
} }
@@ -6033,14 +6045,24 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6057,21 +6079,31 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
#endif #endif
} }
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
} }
} }
@@ -6101,11 +6133,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
} }
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6114,11 +6156,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
else // no time refinement levels and for all same time levels else // no time refinement levels and for all same time levels
{ {
#if (RPB == 0) #if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0) #if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1) #elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#endif #endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1) #elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6134,10 +6186,10 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
#else #else
Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
#endif #endif
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
} }
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
} }
} }
#undef MIXOUTB #undef MIXOUTB

View File

@@ -126,11 +126,6 @@ public:
MyList<var> *OldStateList, *DumpList; MyList<var> *OldStateList, *DumpList;
MyList<var> *ConstraintList; MyList<var> *ConstraintList;
Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor; monitor *ConVMonitor;
surface_integral *Waveshell; surface_integral *Waveshell;

View File

@@ -168,6 +168,8 @@
call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev) call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
!$OMP PARALLEL
!$OMP WORKSHARE
gxx_rhs = - TWO * alpn1 * Axx - F2o3 * gxx * div_beta + & gxx_rhs = - TWO * alpn1 * Axx - F2o3 * gxx * div_beta + &
TWO *( gxx * betaxx + gxy * betayx + gxz * betazx) TWO *( gxx * betaxx + gxy * betayx + gxz * betazx)
@@ -186,7 +188,7 @@
gxy * betaxz + gyy * betayz + & gxy * betaxz + gyy * betayz + &
gxz * betaxy + gzz * betazy & gxz * betaxy + gzz * betazy &
- gyz * betaxx - gyz * betaxx
gxz_rhs = - TWO * alpn1 * Axz + F1o3 * gxz * div_beta + & gxz_rhs = - TWO * alpn1 * Axz + F1o3 * gxz * div_beta + &
gxx * betaxz + gxy * betayz + & gxx * betaxz + gxy * betayz + &
gyz * betayx + gzz * betazx & gyz * betayx + gzz * betazx &
@@ -201,6 +203,8 @@
gupyy = ( gxx * gzz - gxz * gxz ) / gupzz gupyy = ( gxx * gzz - gxz * gxz ) / gupzz
gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz
gupzz = ( gxx * gyy - gxy * gxy ) / gupzz gupzz = ( gxx * gyy - gxy * gxy ) / gupzz
!$OMP END WORKSHARE
!$OMP END PARALLEL
if(co == 0)then if(co == 0)then
! Gam^i_Res = Gam^i + gup^ij_,j ! Gam^i_Res = Gam^i + gup^ij_,j
@@ -234,6 +238,8 @@
endif endif
! second kind of connection ! second kind of connection
!$OMP PARALLEL
!$OMP WORKSHARE
Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz )) Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz ))
Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz )) Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz ))
Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz )) Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz ))
@@ -282,6 +288,8 @@
(gupxy * gupyz + gupyy * gupxz)* Axy + & (gupxy * gupyz + gupyy * gupxz)* Axy + &
(gupxy * gupzz + gupyz * gupxz)* Axz + & (gupxy * gupzz + gupyz * gupxz)* Axz + &
(gupyy * gupzz + gupyz * gupyz)* Ayz (gupyy * gupzz + gupyz * gupyz)* Ayz
!$OMP END WORKSHARE
!$OMP END PARALLEL
! Right hand side for Gam^i without shift terms... ! Right hand side for Gam^i without shift terms...
call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
@@ -336,6 +344,8 @@
call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev) call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev) call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
!$OMP PARALLEL
!$OMP WORKSHARE
Gamx_rhs = Gamx_rhs + F2o3 * Gamxa * div_beta - & Gamx_rhs = Gamx_rhs + F2o3 * Gamxa * div_beta - &
Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz + & Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz + &
F1o3 * (gupxx * fxx + gupxy * fxy + gupxz * fxz ) + & F1o3 * (gupxx * fxx + gupxy * fxy + gupxz * fxz ) + &
@@ -375,6 +385,8 @@
gyyz = gxz * Gamxyy + gyz * Gamyyy + gzz * Gamzyy gyyz = gxz * Gamxyy + gyz * Gamyyy + gzz * Gamzyy
gyzz = gxz * Gamxyz + gyz * Gamyyz + gzz * Gamzyz gyzz = gxz * Gamxyz + gyz * Gamyyz + gzz * Gamzyz
gzzz = gxz * Gamxzz + gyz * Gamyzz + gzz * Gamzzz gzzz = gxz * Gamxzz + gyz * Gamyzz + gzz * Gamzzz
!$OMP END WORKSHARE
!$OMP END PARALLEL
!compute Ricci tensor for tilted metric !compute Ricci tensor for tilted metric
call fdderivs(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,symmetry,Lev) call fdderivs(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,symmetry,Lev)
@@ -401,6 +413,8 @@
Ryz = gupxx * fxx + gupyy * fyy + gupzz * fzz + & Ryz = gupxx * fxx + gupyy * fyy + gupzz * fzz + &
( gupxy * fxy + gupxz * fxz + gupyz * fyz ) * TWO ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) * TWO
!$OMP PARALLEL
!$OMP WORKSHARE
Rxx = - HALF * Rxx + & Rxx = - HALF * Rxx + &
gxx * Gamxx+ gxy * Gamyx + gxz * Gamzx + & gxx * Gamxx+ gxy * Gamyx + gxz * Gamzx + &
Gamxa * gxxx + Gamya * gxyx + Gamza * gxzx + & Gamxa * gxxx + Gamya * gxyx + Gamza * gxzx + &
@@ -601,9 +615,13 @@
Gamxyz * gxzz + Gamyyz * gyzz + Gamzyz * gzzz + & Gamxyz * gxzz + Gamyyz * gyzz + Gamzyz * gzzz + &
Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy + & Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy + &
Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz ) Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz )
!$OMP END WORKSHARE
!$OMP END PARALLEL
!covariant second derivative of chi respect to tilted metric !covariant second derivative of chi respect to tilted metric
call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
!$OMP PARALLEL
!$OMP WORKSHARE
fxx = fxx - Gamxxx * chix - Gamyxx * chiy - Gamzxx * chiz fxx = fxx - Gamxxx * chix - Gamyxx * chiy - Gamzxx * chiz
fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz
fxz = fxz - Gamxxz * chix - Gamyxz * chiy - Gamzxz * chiz fxz = fxz - Gamxxz * chix - Gamyxz * chiy - Gamzxz * chiz
@@ -626,11 +644,15 @@
Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO
Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO
Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO
!$OMP END WORKSHARE
!$OMP END PARALLEL
! covariant second derivatives of the lapse respect to physical metric ! covariant second derivatives of the lapse respect to physical metric
call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, & call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
SYM,SYM,SYM,symmetry,Lev) SYM,SYM,SYM,symmetry,Lev)
!$OMP PARALLEL
!$OMP WORKSHARE
gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1 gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1 gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1 gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1
@@ -791,6 +813,8 @@
!!!! gauge variable part !!!! gauge variable part
Lap_rhs = -TWO*alpn1*trK Lap_rhs = -TWO*alpn1*trK
!$OMP END WORKSHARE
!$OMP END PARALLEL
#if (GAUGE == 0) #if (GAUGE == 0)
betax_rhs = FF*dtSfx betax_rhs = FF*dtSfx
betay_rhs = FF*dtSfy betay_rhs = FF*dtSfy
@@ -945,60 +969,103 @@
SSA(2)=SYM SSA(2)=SYM
SSA(3)=ANTI SSA(3)=ANTI
!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency) !!!!!!!!!advection term part
! lopsided_kodis shares the symmetry_bd buffer between advection and
! dissipation, eliminating redundant full-grid copies. For metric variables
! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
! so the constant offset has no effect on dissipation.
call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS)
call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps) call lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA)
call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps) call lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA)
call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS)
call lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps) call lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA)
call lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps) call lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA)
call lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps) call lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps) call lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps) call lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA)
!!
#if 1
!! bam does not apply dissipation on gauge variables
call lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps)
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
call lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
#endif
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
#endif
#else
! No dissipation on gauge variables (advection only)
call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS) call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS)
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7) #if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS) call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS) call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA) call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA)
#endif #endif
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7) #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS) call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS) call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA) call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA)
#endif #endif
if(eps>0)then
! usual Kreiss-Oliger dissipation
call kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps)
call kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps)
call kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps)
call kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps)
#if 0
#define i 42
#define j 40
#define k 40
if(Lev == 1)then
write(*,*) X(i),Y(j),Z(k)
write(*,*) "before",Axx_rhs(i,j,k)
endif
#undef i
#undef j
#undef k
!!stop
#endif #endif
call kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps)
#if 0
#define i 42
#define j 40
#define k 40
if(Lev == 1)then
write(*,*) X(i),Y(j),Z(k)
write(*,*) "after",Axx_rhs(i,j,k)
endif
#undef i
#undef j
#undef k
!!stop
#endif
call kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps)
call kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps)
call kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps)
call kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps)
#if 1
!! bam does not apply dissipation on gauge variables
call kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps)
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps)
#endif
#endif
endif
if(co == 0)then if(co == 0)then
! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho ! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho

File diff suppressed because it is too large Load Diff

View File

@@ -1,107 +1,92 @@
#ifndef CGH_H #ifndef CGH_H
#define CGH_H #define CGH_H
#include <mpi.h> #include <mpi.h>
#include "MyList.h" #include "MyList.h"
#include "MPatch.h" #include "MPatch.h"
#include "macrodef.h" #include "macrodef.h"
#include "monitor.h" #include "monitor.h"
#include "Parallel.h" #include "Parallel.h"
class cgh class cgh
{ {
public: public:
int levels, movls, BH_num_in; int levels, movls, BH_num_in;
// information of boxes // information of boxes
int *grids; int *grids;
double ***bbox; double ***bbox;
int ***shape; int ***shape;
double ***handle; double ***handle;
double ***Porgls; double ***Porgls;
double *Lt; double *Lt;
// information of Patch list // information of Patch list
MyList<Patch> **PatL; MyList<Patch> **PatL;
// information of OutBdLow2Hi point list and Restrict point list // information of OutBdLow2Hi point list and Restrict point list
#if (RPB == 1) #if (RPB == 1)
MyList<Parallel::pointstru_bam> **bdsul, **rsul; MyList<Parallel::pointstru_bam> **bdsul, **rsul;
#endif #endif
#if (PSTR == 1 || PSTR == 2 || PSTR == 3) #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
int mylev; int mylev;
int *start_rank, *end_rank; int *start_rank, *end_rank;
MPI_Comm *Commlev; MPI_Comm *Commlev;
#endif #endif
protected: protected:
int ingfs, fngfs; int ingfs, fngfs;
static constexpr double ratio = 0.75; static constexpr double ratio = 0.75;
int trfls; int trfls;
public: public:
cgh(int ingfsi, int fngfsi, int Symmetry, char *filename, int checkrun, monitor *ErrorMonitor); cgh(int ingfsi, int fngfsi, int Symmetry, char *filename, int checkrun, monitor *ErrorMonitor);
~cgh(); ~cgh();
void compose_cgh(int nprocs); void compose_cgh(int nprocs);
void sethandle(monitor *ErrorMonitor); void sethandle(monitor *ErrorMonitor);
void checkPatchList(MyList<Patch> *PatL, bool buflog); void checkPatchList(MyList<Patch> *PatL, bool buflog);
void Regrid(int Symmetry, int BH_num, double **Porgbr, double **Porg0, void Regrid(int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList, MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB, MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor); monitor *ErrorMonitor);
void Regrid_fake(int Symmetry, int BH_num, double **Porgbr, double **Porg0, void Regrid_fake(int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList, MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB, MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor); monitor *ErrorMonitor);
void recompose_cgh(int nprocs, bool *lev_flag, void recompose_cgh(int nprocs, bool *lev_flag,
MyList<var> *OldList, MyList<var> *StateList, MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB); int Symmetry, bool BB);
void recompose_cgh_fake(int nprocs, bool *lev_flag, void recompose_cgh_fake(int nprocs, bool *lev_flag,
MyList<var> *OldList, MyList<var> *StateList, MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB); int Symmetry, bool BB);
void read_bbox(int Symmetry, char *filename); void read_bbox(int Symmetry, char *filename);
MyList<Patch> *construct_patchlist(int lev, int Symmetry); MyList<Patch> *construct_patchlist(int lev, int Symmetry);
bool Interp_One_Point(MyList<var> *VarList, bool Interp_One_Point(MyList<var> *VarList,
double *XX, /*input global Cartesian coordinate*/ double *XX, /*input global Cartesian coordinate*/
double *Shellf, int Symmetry); double *Shellf, int Symmetry);
void recompose_cgh_Onelevel(int nprocs, int lev, void recompose_cgh_Onelevel(int nprocs, int lev,
MyList<var> *OldList, MyList<var> *StateList, MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB); int Symmetry, bool BB);
void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0, void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList, MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB, MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor); monitor *ErrorMonitor);
void Regrid_Onelevel_aux(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0, void Regrid_Onelevel_aux(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList, MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB, MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor); monitor *ErrorMonitor);
void settrfls(const int lev); void settrfls(const int lev);
#if (PSTR == 1 || PSTR == 2 || PSTR == 3) #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
void construct_mylev(int nprocs); void construct_mylev(int nprocs);
#endif #endif
};
// Load balancing support
bool enable_load_balance; // Enable load balancing #endif /* CGH_H */
int load_balance_check_interval; // Check interval (in time steps)
int current_time_step; // Current time step counter
double *rank_interp_times; // Store interpolation times for each rank
int *heavy_ranks; // Store heavy rank numbers
int num_heavy_ranks; // Number of heavy ranks
void init_load_balance(int nprocs);
void update_interp_time(int rank, double time);
bool check_and_rebalance(int nprocs, int lev,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB);
};
#endif /* CGH_H */

View File

@@ -69,12 +69,10 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
! x direction ! x direction
if(i+1 <= imax .and. i-1 >= imin)then if(i+1 <= imax .and. i-1 >= imin)then
! !
! - f(i-1) + f(i+1) ! - f(i-1) + f(i+1)
@@ -373,8 +371,6 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
@@ -1001,11 +997,11 @@
fy = ZEO fy = ZEO
fz = ZEO fz = ZEO
#if 0
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
#if 0 ! x direction
! x direction
if(i+2 <= imax .and. i-2 >= imin)then if(i+2 <= imax .and. i-2 >= imin)then
! !
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
@@ -1022,7 +1018,7 @@
! set imax and imin 0 ! set imax and imin 0
endif endif
! y direction ! y direction
if(j+2 <= jmax .and. j-2 >= jmin)then if(j+2 <= jmax .and. j-2 >= jmin)then
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k)) fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
@@ -1033,7 +1029,7 @@
! set jmax and jmin 0 ! set jmax and jmin 0
endif endif
! z direction ! z direction
if(k+2 <= kmax .and. k-2 >= kmin)then if(k+2 <= kmax .and. k-2 >= kmin)then
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2)) fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
@@ -1044,9 +1040,13 @@
! set kmax and kmin 0 ! set kmax and kmin 0
endif endif
enddo
enddo
enddo
#elif 0 #elif 0
! x direction do k=1,ex(3)-1
if(i+2 <= imax .and. i-2 >= imin)then do j=1,ex(2)-1
do i=1,ex(1)-1
! !
! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2) ! f(i-2) - 8 f(i-1) + 8 f(i+1) - f(i+2)
! fx(i) = --------------------------------------------- ! fx(i) = ---------------------------------------------
@@ -1083,8 +1083,32 @@
! set kmax and kmin 0 ! set kmax and kmin 0
endif endif
enddo
enddo
enddo
#else #else
! for bam comparison ! for bam comparison — split into branch-free interior + serial boundary
! Interior: all stencil points guaranteed in-bounds, no branches needed
!$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
do k=max(3,1),min(ex(3)-1,kmax-2)
do j=max(3,1),min(ex(2)-1,jmax-2)
!DIR$ IVDEP
do i=max(3,1),min(ex(1)-1,imax-2)
fx(i,j,k)=d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
enddo
enddo
enddo
!$OMP END PARALLEL DO
! Boundary shell: original branching logic for points near edges
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
if(i >= 3 .and. i <= imax-2 .and. &
j >= 3 .and. j <= jmax-2 .and. &
k >= 3 .and. k <= kmax-2) cycle
if(i+2 <= imax .and. i-2 >= imin .and. & if(i+2 <= imax .and. i-2 >= imin .and. &
j+2 <= jmax .and. j-2 >= jmin .and. & j+2 <= jmax .and. j-2 >= jmin .and. &
k+2 <= kmax .and. k-2 >= kmin) then k+2 <= kmax .and. k-2 >= kmin) then
@@ -1098,10 +1122,10 @@
fy(i,j,k)=d2dy*(-fh(i,j-1,k)+fh(i,j+1,k)) fy(i,j,k)=d2dy*(-fh(i,j-1,k)+fh(i,j+1,k))
fz(i,j,k)=d2dz*(-fh(i,j,k-1)+fh(i,j,k+1)) fz(i,j,k)=d2dz*(-fh(i,j,k-1)+fh(i,j,k+1))
endif endif
enddo
enddo
enddo
#endif #endif
enddo
enddo
enddo
return return
@@ -1405,10 +1429,10 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
#if 0
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
#if 0
!~~~~~~ fxx !~~~~~~ fxx
if(i+2 <= imax .and. i-2 >= imin)then if(i+2 <= imax .and. i-2 >= imin)then
! !
@@ -1485,9 +1509,48 @@
- (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2))) - (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
elseif(j+1 <= jmax .and. j-1 >= jmin .and. k+1 <= kmax .and. k-1 >= kmin)then elseif(j+1 <= jmax .and. j-1 >= jmin .and. k+1 <= kmax .and. k-1 >= kmin)then
fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1)) fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1))
endif endif
enddo
enddo
enddo
#else #else
! for bam comparison ! for bam comparison — split into branch-free interior + serial boundary
! Interior: all stencil points guaranteed in-bounds, no branches needed
!$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
do k=max(3,1),min(ex(3)-1,kmax-2)
do j=max(3,1),min(ex(2)-1,jmax-2)
!DIR$ IVDEP
do i=max(3,1),min(ex(1)-1,imax-2)
fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
-fh(i+2,j,k)+F16*fh(i+1,j,k) )
fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
-fh(i,j+2,k)+F16*fh(i,j+1,k) )
fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
-fh(i,j,k+2)+F16*fh(i,j,k+1) )
fxy(i,j,k) = Fdxdy*( (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k)) &
-F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k)) &
+F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k)) &
- (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
fxz(i,j,k) = Fdxdz*( (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2)) &
-F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1)) &
+F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1)) &
- (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
fyz(i,j,k) = Fdydz*( (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2)) &
-F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1)) &
+F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1)) &
- (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
enddo
enddo
enddo
!$OMP END PARALLEL DO
! Boundary shell: original branching logic for points near edges
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
if(i >= 3 .and. i <= imax-2 .and. &
j >= 3 .and. j <= jmax-2 .and. &
k >= 3 .and. k <= kmax-2) cycle
if(i+2 <= imax .and. i-2 >= imin .and. & if(i+2 <= imax .and. i-2 >= imin .and. &
j+2 <= jmax .and. j-2 >= jmin .and. & j+2 <= jmax .and. j-2 >= jmin .and. &
k+2 <= kmax .and. k-2 >= kmin) then k+2 <= kmax .and. k-2 >= kmin) then
@@ -1522,10 +1585,10 @@
fxz(i,j,k) = Sdxdz*(fh(i-1,j,k-1)-fh(i+1,j,k-1)-fh(i-1,j,k+1)+fh(i+1,j,k+1)) fxz(i,j,k) = Sdxdz*(fh(i-1,j,k-1)-fh(i+1,j,k-1)-fh(i-1,j,k+1)+fh(i+1,j,k+1))
fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1)) fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1))
endif endif
enddo
enddo
enddo
#endif #endif
enddo
enddo
enddo
return return

View File

@@ -881,19 +881,24 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
real*8, dimension(-ord+1:extc(1),-ord+1:extc(2),-ord+1:extc(3)),intent(out):: funcc real*8, dimension(-ord+1:extc(1),-ord+1:extc(2),-ord+1:extc(3)),intent(out):: funcc
real*8, dimension(1:3), intent(in) :: SoA real*8, dimension(1:3), intent(in) :: SoA
integer::i integer::i,j,k
!$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
do k=1,extc(3)
do j=1,extc(2)
do i=1,extc(1)
funcc(i,j,k) = func(i,j,k)
enddo
enddo
enddo
!$OMP END PARALLEL DO
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
enddo enddo
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1 do i=0,ord-1
funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2) funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
enddo enddo
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1 do i=0,ord-1
funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3) funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
enddo enddo
@@ -1116,7 +1121,6 @@ end subroutine d2dump
! Lagrangian polynomial interpolation ! Lagrangian polynomial interpolation
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
!DIR$ ATTRIBUTES FORCEINLINE :: polint
subroutine polint(xa, ya, x, y, dy, ordn) subroutine polint(xa, ya, x, y, dy, ordn)
implicit none implicit none

View File

@@ -65,8 +65,6 @@ real*8,intent(in) :: eps
! dx^4 ! dx^4
! note the sign (-1)^r-1, now r=2 ! note the sign (-1)^r-1, now r=2
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
@@ -161,36 +159,12 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
call symmetry_bd(3,ex,f,fh,SoA) call symmetry_bd(3,ex,f,fh,SoA)
do k=1,ex(3) ! Interior: all stencil points guaranteed in-bounds
do j=1,ex(2) !$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
do i=1,ex(1) do k=4,ex(3)-3
do j=4,ex(2)-3
if(i-3 >= imin .and. i+3 <= imax .and. & !DIR$ IVDEP
j-3 >= jmin .and. j+3 <= jmax .and. & do i=4,ex(1)-3
k-3 >= kmin .and. k+3 <= kmax) then
#if 0
! x direction
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dX/cof * ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
TWT* fh(i,j,k) )
! y direction
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dY/cof * ( &
(fh(i,j-3,k)+fh(i,j+3,k)) - &
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
TWT* fh(i,j,k) )
! z direction
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dZ/cof * ( &
(fh(i,j,k-3)+fh(i,j,k+3)) - &
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )
#else
! calculation order if important ?
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( & f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - & (fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + & SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
@@ -206,9 +180,37 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + & SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - & FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )/dZ ) TWT* fh(i,j,k) )/dZ )
#endif enddo
endif enddo
enddo
!$OMP END PARALLEL DO
! Boundary shell: original branching logic for points near edges
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
if(i >= 4 .and. i <= ex(1)-3 .and. &
j >= 4 .and. j <= ex(2)-3 .and. &
k >= 4 .and. k <= ex(3)-3) cycle
if(i-3 >= imin .and. i+3 <= imax .and. &
j-3 >= jmin .and. j+3 <= jmax .and. &
k-3 >= kmin .and. k+3 <= kmax) then
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
TWT* fh(i,j,k) )/dX + &
( &
(fh(i,j-3,k)+fh(i,j+3,k)) - &
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
TWT* fh(i,j,k) )/dY + &
( &
(fh(i,j,k-3)+fh(i,j,k+3)) - &
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )/dZ )
endif
enddo enddo
enddo enddo
enddo enddo

View File

@@ -231,12 +231,13 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
call symmetry_bd(3,ex,f,fh,SoA) call symmetry_bd(3,ex,f,fh,SoA)
! upper bound set ex-1 only for efficiency, ! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also ! the loop body will set ex 0 also
!$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
do k=1,ex(3)-1 do k=1,ex(3)-1
do j=1,ex(2)-1 do j=1,ex(2)-1
do i=1,ex(1)-1 do i=1,ex(1)-1
#if 0 #if 0
!! old code !! old code
! x direction ! x direction
if(Sfx(i,j,k) >= ZEO .and. i+3 <= imax .and. i-1 >= imin)then if(Sfx(i,j,k) >= ZEO .and. i+3 <= imax .and. i-1 >= imin)then
@@ -482,206 +483,12 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
enddo enddo
enddo enddo
enddo enddo
!$OMP END PARALLEL DO
return return
end subroutine lopsided end subroutine lopsided
!-----------------------------------------------------------------------------
! Combined advection (lopsided) + Kreiss-Oliger dissipation (kodis)
! Shares the symmetry_bd buffer fh, eliminating one full-grid copy per call.
! Mathematically identical to calling lopsided then kodis separately.
!-----------------------------------------------------------------------------
subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
implicit none
!~~~~~~> Input parameters:
integer, intent(in) :: ex(1:3),Symmetry
real*8, intent(in) :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
real*8,dimension(ex(1),ex(2),ex(3)),intent(in) :: f,Sfx,Sfy,Sfz
real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
real*8,dimension(3),intent(in) ::SoA
real*8,intent(in) :: eps
!~~~~~~> local variables:
! note index -2,-1,0, so we have 3 extra points
real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3)) :: fh
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: dX,dY,dZ
real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
real*8, parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
real*8, parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
real*8, parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
! kodis parameters
real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
real*8, parameter :: cof=6.4d1 ! 2^6
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
d12dx = ONE/F12/dX
d12dy = ONE/F12/dY
d12dz = ONE/F12/dZ
d2dx = ONE/TWO/dX
d2dy = ONE/TWO/dY
d2dz = ONE/TWO/dZ
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
! Single symmetry_bd call shared by both advection and dissipation
call symmetry_bd(3,ex,f,fh,SoA)
! ---- Advection (lopsided) loop ----
! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
! x direction
if(Sfx(i,j,k) > ZEO)then
if(i+3 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
-F6*fh(i+2,j,k)+ fh(i+3,j,k))
elseif(i+2 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i+1 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
-F6*fh(i-2,j,k)+ fh(i-3,j,k))
endif
elseif(Sfx(i,j,k) < ZEO)then
if(i-3 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
-F6*fh(i-2,j,k)+ fh(i-3,j,k))
elseif(i-2 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i-1 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
-F6*fh(i+2,j,k)+ fh(i+3,j,k))
endif
endif
! y direction
if(Sfy(i,j,k) > ZEO)then
if(j+3 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
-F6*fh(i,j+2,k)+ fh(i,j+3,k))
elseif(j+2 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j+1 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
-F6*fh(i,j-2,k)+ fh(i,j-3,k))
endif
elseif(Sfy(i,j,k) < ZEO)then
if(j-3 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
-F6*fh(i,j-2,k)+ fh(i,j-3,k))
elseif(j-2 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j-1 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
-F6*fh(i,j+2,k)+ fh(i,j+3,k))
endif
endif
! z direction
if(Sfz(i,j,k) > ZEO)then
if(k+3 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
-F6*fh(i,j,k+2)+ fh(i,j,k+3))
elseif(k+2 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k+1 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
-F6*fh(i,j,k-2)+ fh(i,j,k-3))
endif
elseif(Sfz(i,j,k) < ZEO)then
if(k-3 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
-F6*fh(i,j,k-2)+ fh(i,j,k-3))
elseif(k-2 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k-1 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
-F6*fh(i,j,k+2)+ fh(i,j,k+3))
endif
endif
enddo
enddo
enddo
! ---- Dissipation (kodis) loop ----
if(eps > ZEO) then
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
if(i-3 >= imin .and. i+3 <= imax .and. &
j-3 >= jmin .and. j+3 <= jmax .and. &
k-3 >= kmin .and. k+3 <= kmax) then
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
TWT* fh(i,j,k) )/dX + &
( &
(fh(i,j-3,k)+fh(i,j+3,k)) - &
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
TWT* fh(i,j,k) )/dY + &
( &
(fh(i,j,k-3)+fh(i,j,k+3)) - &
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )/dZ )
endif
enddo
enddo
enddo
endif
return
end subroutine lopsided_kodis
#elif (ghost_width == 4) #elif (ghost_width == 4)
! sixth order code ! sixth order code
! Compute advection terms in right hand sides of field equations ! Compute advection terms in right hand sides of field equations

View File

@@ -16,12 +16,6 @@ include makefile.inc
.cu.o: .cu.o:
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
TwoPunctures.o: TwoPunctures.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
# Input files # Input files
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o bssn_class.o surface_integral.o ShellPatch.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\
@@ -102,7 +96,7 @@ ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES) TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean: clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f rm *.o ABE ABEGPU TwoPunctureABE make.log -f

View File

@@ -8,17 +8,17 @@ filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lifcore -limf -lpthread -lm -ldl -qopenmp
## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization) ## Aggressive optimization flags:
## -fprofile-instr-use: use collected profile data to guide optimization decisions ## -O3: Maximum optimization
## (branch prediction, basic block layout, inlining, loop unrolling) ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
PROFDATA = ../../pgo_profile/default.profdata ## -fp-model fast=2: Aggressive floating-point optimizations
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ ## -fma: Enable fused multiply-add instructions
-fprofile-instr-use=$(PROFDATA) \ ## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \
-Dfortran3 -Dnewc -I${MKLROOT}/include -Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \
-fprofile-instr-use=$(PROFDATA) \
-align array64byte -fpp -I${MKLROOT}/include -align array64byte -fpp -I${MKLROOT}/include
f90 = ifx f90 = ifx
f77 = ifx f77 = ifx

File diff suppressed because it is too large Load Diff

View File

@@ -10,18 +10,17 @@
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import subprocess import subprocess
import time
## CPU core binding configuration using taskset ## CPU core binding configuration using taskset
## taskset ensures all child processes inherit the CPU affinity mask ## taskset ensures all child processes inherit the CPU affinity mask
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111) ## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores ## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
#NUMACTL_CPU_BIND = "taskset -c 0-111" NUMACTL_CPU_BIND = "taskset -c 0-111"
NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
## Build parallelism configuration ## Build parallelism configuration
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
## Set make -j to utilize available cores for faster builds ## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 96 BUILD_JOBS = 104
################################################################## ##################################################################
@@ -118,7 +117,6 @@ def run_ABE():
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
@@ -154,14 +152,13 @@ def run_ABE():
## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE ## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE
def run_TwoPunctureABE(): def run_TwoPunctureABE():
tp_time1=time.time()
print( ) print( )
print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) print( " Running the AMSS-NCKU executable file TwoPunctureABE " )
print( ) print( )
## Define the command to run ## Define the command to run
#TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE" TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
TwoPuncture_command = " ./TwoPunctureABE"
TwoPuncture_command_outfile = "TwoPunctureABE_out.log" TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
## Execute the command with subprocess.Popen and stream output ## Execute the command with subprocess.Popen and stream output
@@ -182,9 +179,7 @@ def run_TwoPunctureABE():
print( ) print( )
print( " The TwoPunctureABE simulation is finished " ) print( " The TwoPunctureABE simulation is finished " )
print( ) print( )
tp_time2=time.time()
et=tp_time2-tp_time1
print(f"Used time: {et}")
return return
################################################################## ##################################################################

View File

@@ -1,29 +0,0 @@
import multiprocessing
def run_plot_task(task):
"""Execute a single plotting task.
Parameters
----------
task : tuple
A tuple of (function, args_tuple) where function is a callable
plotting function and args_tuple contains its arguments.
"""
func, args = task
return func(*args)
def run_plot_tasks_parallel(plot_tasks):
"""Execute a list of independent plotting tasks in parallel.
Uses the 'fork' context to create worker processes so that the main
script is NOT re-imported/re-executed in child processes.
Parameters
----------
plot_tasks : list of tuples
Each element is (function, args_tuple).
"""
ctx = multiprocessing.get_context('fork')
with ctx.Pool() as pool:
pool.map(run_plot_task, plot_tasks)

View File

@@ -1,97 +0,0 @@
# AMSS-NCKU PGO Profile Analysis Report
## 1. Profiling Environment
| Item | Value |
|------|-------|
| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
| Instrumentation Flag | `-fprofile-instr-generate` |
| Optimization Level (instrumented) | `-O2 -xHost -fma` |
| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
| Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
| Merged Profile | `default.profdata` (394 KB) |
| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
## 2. Reduced Simulation Parameters (for profiling run)
| Parameter | Production Value | Profiling Value |
|-----------|-----------------|-----------------|
| MPI_processes | 64 | 1 |
| grid_level | 9 | 4 |
| static_grid_level | 5 | 3 |
| static_grid_number | 96 | 24 |
| moving_grid_number | 48 | 16 |
| largest_box_xyz_max | 320^3 | 160^3 |
| Final_Evolution_Time | 1000.0 | 10.0 |
| Evolution_Step_Number | 10,000,000 | 1,000 |
| Detector_Number | 12 | 2 |
## 3. Profile Summary
| Metric | Value |
|--------|-------|
| Total instrumented functions | 1,392 |
| Functions with non-zero counts | 117 (8.4%) |
| Functions with zero counts | 1,275 (91.6%) |
| Maximum function entry count | 386,459,248 |
| Maximum internal block count | 370,477,680 |
| Total block count | 4,198,023,118 |
## 4. Top 20 Hotspot Functions
| Rank | Total Count | Max Block Count | Function | Category |
|------|------------|-----------------|----------|----------|
| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
## 5. Hotspot Category Breakdown
Top 20 functions account for ~98% of total execution counts:
| Category | Functions | Combined Count | Share |
|----------|-----------|---------------|-------|
| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
| Time integration | rungekutta4_rout_ | ~119M | ~3% |
| Dissipation | kodis_ | ~92M | ~2% |
| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% |
## 6. Conclusions
1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
2. **Hotspot concentration is high**: Top 5 functions alone account for ~76% of all counts, which is ideal for PGO — the compiler has strong branch/layout optimization targets.
3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
## 7. PGO Phase 2 Usage
To apply the profile, use the following flags in `makefile.inc`:
```makefile
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
-align array64byte -fpp -I${MKLROOT}/include
```

Binary file not shown.

Binary file not shown.

View File

@@ -11,8 +11,6 @@
import numpy ## numpy for array operations import numpy ## numpy for array operations
import scipy ## scipy for interpolation and signal processing import scipy ## scipy for interpolation and signal processing
import math import math
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt ## matplotlib for plotting import matplotlib.pyplot as plt ## matplotlib for plotting
import os ## os for system/file operations import os ## os for system/file operations

View File

@@ -8,23 +8,16 @@
## ##
################################################# #################################################
## Restrict OpenMP to one thread per process so that running
## many workers in parallel does not create an O(workers * BLAS_threads)
## thread explosion. The variable MUST be set before numpy/scipy
## are imported, because the BLAS library reads them only at load time.
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
import numpy import numpy
import scipy import scipy
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm from matplotlib.colors import LogNorm
from mpl_toolkits.mplot3d import Axes3D from mpl_toolkits.mplot3d import Axes3D
## import torch ## import torch
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import os
######################################################################################### #########################################################################################
@@ -199,19 +192,3 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
#################################################################################### ####################################################################################
####################################################################################
## Allow this module to be run as a standalone script so that each
## binary-data plot can be executed in a fresh subprocess whose BLAS
## environment variables (set above) take effect before numpy loads.
##
## Usage: python3 plot_binary_data.py <filename> <binary_outdir> <figure_outdir>
####################################################################################
if __name__ == '__main__':
import sys
if len(sys.argv) != 4:
print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
sys.exit(1)
plot_binary_data(sys.argv[1], sys.argv[2], sys.argv[3])

View File

@@ -8,8 +8,6 @@
################################################# #################################################
import numpy ## numpy for array operations import numpy ## numpy for array operations
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt ## matplotlib for plotting import matplotlib.pyplot as plt ## matplotlib for plotting
from mpl_toolkits.mplot3d import Axes3D ## needed for 3D plots from mpl_toolkits.mplot3d import Axes3D ## needed for 3D plots
import glob import glob
@@ -17,9 +15,6 @@ import os ## operating system utilities
import plot_binary_data import plot_binary_data
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import subprocess
import sys
import multiprocessing
# plt.rcParams['text.usetex'] = True ## enable LaTeX fonts in plots # plt.rcParams['text.usetex'] = True ## enable LaTeX fonts in plots
@@ -55,40 +50,10 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
file_list.append(x) file_list.append(x)
print(x) print(x)
## Plot each file in parallel using subprocesses. ## Plot each file in the list
## Each subprocess is a fresh Python process where the BLAS thread-count
## environment variables (set at the top of plot_binary_data.py) take
## effect before numpy is imported. This avoids the thread explosion
## that occurs when multiprocessing.Pool with 'fork' context inherits
## already-initialized multi-threaded BLAS from the parent.
script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
running = []
failed = []
for filename in file_list: for filename in file_list:
print(filename) print(filename)
proc = subprocess.Popen( plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
[sys.executable, script, filename, binary_outdir, figure_outdir],
)
running.append( (proc, filename) )
## Keep at most max_workers subprocesses active at a time
if len(running) >= max_workers:
p, fn = running.pop(0)
p.wait()
if p.returncode != 0:
failed.append(fn)
## Wait for all remaining subprocesses to finish
for p, fn in running:
p.wait()
if p.returncode != 0:
failed.append(fn)
if failed:
print( " WARNING: the following binary data plots failed:" )
for fn in failed:
print( " ", fn )
print( ) print( )
print( " Binary Data Plot Has been Finished " ) print( " Binary Data Plot Has been Finished " )