Compare commits

..

2 Commits

Author SHA1 Message Date
6796384bf4 taskset setting updated 2026-02-07 22:24:02 +08:00
c974a88d6d Pool fh work arrays in compute_rhs_bssn to eliminate allocation churn
Add _fh variants of fderivs, fdderivs, kodis, and lopsided that accept
a caller-provided fh work array instead of allocating one internally.
Declare two shared work arrays in compute_rhs_bssn (fh_work2 for
symmetry_bd(2) callers, fh_work3 for symmetry_bd(3) callers) and pass
them to all ~84 subroutine calls, eliminating ~77 redundant automatic
array allocations (~591 MB churn per RHS call, ~2.3 GB per timestep).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 21:49:12 +08:00
30 changed files with 14103 additions and 16431 deletions

View File

@@ -8,14 +8,6 @@
##
##################################################################
## Guard against re-execution by multiprocessing child processes.
## Without this, using 'spawn' or 'forkserver' context would cause every
## worker to re-run the entire script, spawning exponentially more
## workers (fork bomb).
if __name__ != '__main__':
import sys as _sys
_sys.exit(0)
##################################################################
@@ -66,8 +58,7 @@ if os.path.exists(File_directory):
## Prompt whether to overwrite the existing directory
while True:
try:
## inputvalue = input()
inputvalue = "continue"
inputvalue = input()
## If the user agrees to overwrite, proceed and remove the existing directory
if ( inputvalue == "continue" ):
print( " Continue the calculation !!! " )
@@ -433,31 +424,26 @@ print(
import plot_xiaoqu
import plot_GW_strain_amplitude_xiaoqu
from parallel_plot_helper import run_plot_tasks_parallel
plot_tasks = []
## Plot black hole trajectory
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot, (binary_results_directory, figure_directory) ) )
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) )
plot_xiaoqu.generate_puncture_orbit_plot( binary_results_directory, figure_directory )
plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
## Plot black hole separation vs. time
plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) )
plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
## Plot gravitational waveforms (psi4 and strain amplitude)
for i in range(input_data.Detector_Number):
plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) )
plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) )
plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
## Plot ADM mass evolution
for i in range(input_data.Detector_Number):
plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) )
plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
## Plot Hamiltonian constraint violation over time
for i in range(input_data.grid_level):
plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) )
run_plot_tasks_parallel(plot_tasks)
plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
## Plot stored binary data
plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )

File diff suppressed because it is too large Load Diff

View File

@@ -39,10 +39,6 @@ public:
bool Find_Point(double *XX);
void Interp_Points(MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry,
int Nmin_consumer, int Nmax_consumer);
void Interp_Points(MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here);

View File

@@ -24,7 +24,6 @@ using namespace std;
#endif
#include <mpi.h>
#include <memory.h>
#include "MyList.h"
#include "Block.h"
#include "Parallel.h"

File diff suppressed because it is too large Load Diff

View File

@@ -1,235 +1,167 @@
#ifndef PARALLEL_H
#define PARALLEL_H
#include <iostream>
#include <iomanip>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <string>
#include <cmath>
#include <new>
using namespace std;
#include <memory.h>
#include "Parallel_bam.h"
#include "var.h"
#include "MPatch.h"
#include "Block.h"
#include "MyList.h"
#include "macrodef.h" //need dim; ghost_width; CONTRACT
namespace Parallel
{
struct gridseg
{
double llb[dim];
double uub[dim];
int shape[dim];
double illb[dim], iuub[dim]; // only use for OutBdLow2Hi
Block *Bg;
};
int partition1(int &nx, int split_size, int min_width, int cpusize, int shape); // special for 1 diemnsion
int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
MyList<Block> *distribute_hard(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
int ib0_orig, int ib3_orig,
int jb1_orig, int jb4_orig,
int kb2_orig, int kb5_orig,
Patch* PP, int r_left, int r_right,
int ingfsi, int fngfsi, bool periodic,
Block* &split_first_block, Block* &split_last_block);
Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
int block_id, int ingfsi, int fngfsi, int lev);
void KillBlocks(MyList<Patch> *PatchLIST);
void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
void setfunction(int rank, MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
void writefile(double time, int nx, int ny, int nz, double xmin, double xmax, double ymin, double ymax,
double zmin, double zmax, char *filename, double *data_out);
void writefile(double time, int nx, int ny, double xmin, double xmax, double ymin, double ymax,
char *filename, double *datain);
void getarrayindex(int DIM, int *shape, int *index, int n);
int getarraylocation(int DIM, int *shape, int *index);
void copy(int DIM, double *llbout, double *uubout, int *Dshape, double *DD, double *llbin, double *uubin,
int *shape, double *datain, double *llb, double *uub);
void Dump_CPU_Data(MyList<Block> *BlL, MyList<var> *DumpList, char *tag, double time, double dT);
void Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT);
void Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd);
double *Collect_Data(Patch *PP, var *VP);
void d2Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT);
void d2Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd);
void Dump_Data0(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT);
double global_interp(int DIM, int *ext, double **CoX, double *datain,
double *poX, int ordn, double *SoA, int Symmetry);
double global_interp(int DIM, int *ext, double **CoX, double *datain,
double *poX, int ordn);
double Lagrangian_Int(double x, int npts, double *xpts, double *funcvals);
double LagrangePoly(double x, int pt, int npts, double *xpts);
MyList<gridseg> *build_complete_gsl(Patch *Pat);
MyList<gridseg> *build_complete_gsl(MyList<Patch> *PatL);
MyList<gridseg> *build_complete_gsl_virtual(MyList<Patch> *PatL);
MyList<gridseg> *build_complete_gsl_virtual2(MyList<Patch> *PatL); // - buffer
MyList<gridseg> *build_owned_gsl0(Patch *Pat, int rank_in); // - ghost without extension, special for Sync usage
MyList<gridseg> *build_owned_gsl1(Patch *Pat, int rank_in); // - ghost, similar to build_owned_gsl0 but extend one point on left side for vertex grid
MyList<gridseg> *build_owned_gsl2(Patch *Pat, int rank_in); // - buffer - ghost
MyList<gridseg> *build_owned_gsl3(Patch *Pat, int rank_in, int Symmetry); // - ghost - BD ghost
MyList<gridseg> *build_owned_gsl4(Patch *Pat, int rank_in, int Symmetry); // - buffer - ghost - BD ghost
MyList<gridseg> *build_owned_gsl5(Patch *Pat, int rank_in); // similar to build_owned_gsl2 but no extension
MyList<gridseg> *build_owned_gsl(MyList<Patch> *PatL, int rank_in, int type, int Symmetry);
void build_gstl(MyList<gridseg> *srci, MyList<gridseg> *dsti, MyList<gridseg> **out_src, MyList<gridseg> **out_dst);
int data_packer(double *data, MyList<gridseg> *src, MyList<gridseg> *dst, int rank_in, int dir,
MyList<var> *VarLists, MyList<var> *VarListd, int Symmetry);
void transfer(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
int Symmetry);
int data_packermix(double *data, MyList<gridseg> *src, MyList<gridseg> *dst, int rank_in, int dir,
MyList<var> *VarLists, MyList<var> *VarListd, int Symmetry);
void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
int Symmetry);
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
struct SyncCache {
bool valid;
int cpusize;
MyList<gridseg> **combined_src;
MyList<gridseg> **combined_dst;
int *send_lengths;
int *recv_lengths;
double **send_bufs;
double **recv_bufs;
int *send_buf_caps;
int *recv_buf_caps;
MPI_Request *reqs;
MPI_Status *stats;
int max_reqs;
bool lengths_valid;
SyncCache();
void invalidate();
void destroy();
};
void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
struct AsyncSyncState {
int req_no;
bool active;
AsyncSyncState() : req_no(0), active(false) {}
};
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
SyncCache &cache, AsyncSyncState &state);
void Sync_finish(SyncCache &cache, AsyncSyncState &state,
MyList<var> *VarList, int Symmetry);
void OutBdLow2Hi(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void OutBdLow2Hi(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void OutBdLow2Himix(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
void OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
void OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
void Prolong(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void Prolongint(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void Restrict(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void Restrict_after(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry); // for -ghost - BDghost
MyList<Parallel::gridseg> *build_PhysBD_gsl(Patch *Pat);
MyList<Parallel::gridseg> *build_ghost_gsl(MyList<Patch> *PatL);
MyList<Parallel::gridseg> *build_ghost_gsl(Patch *Pat);
MyList<Parallel::gridseg> *build_buffer_gsl(Patch *Pat);
MyList<Parallel::gridseg> *build_buffer_gsl(MyList<Patch> *PatL);
MyList<Parallel::gridseg> *gsl_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
MyList<Parallel::gridseg> *gs_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
MyList<Parallel::gridseg> *gsl_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
MyList<Parallel::gridseg> *gs_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only);
MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue
MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat);
void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
double L2Norm(Patch *Pat, var *vf);
void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
void checkvarl(MyList<var> *pp, bool first_only);
MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat);
void prepare_inter_time_level(Patch *Pat,
MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
MyList<var> *VarList3 /* target (t+a*dt) */, int tindex);
void prepare_inter_time_level(Patch *Pat,
MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex);
void prepare_inter_time_level(MyList<Patch> *PatL,
MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
MyList<var> *VarList3 /* target (t+a*dt) */, int tindex);
void prepare_inter_time_level(MyList<Patch> *Pat,
MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex);
void merge_gsl(MyList<gridseg> *&A, const double ratio);
bool merge_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C, const double ratio);
// Add ghost region to tangent plane
// we assume the grids have the same resolution
void add_ghost_touch(MyList<gridseg> *&A);
void cut_gsl(MyList<gridseg> *&A);
bool cut_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C);
MyList<Parallel::gridseg> *gs_subtract_virtual(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
void fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyList<Patch> *PatcL,
MyList<var> *OldList, MyList<var> *StateList, MyList<var> *FutureList,
MyList<var> *tmList, int Symmetry, bool BB, bool CC);
bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry);
void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape);
bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl);
void checkpatchlist(MyList<Patch> *PatL, bool buflog);
double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here);
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0);
// Redistribute blocks with time statistics for load balancing
MyList<Block> *distribute(MyList<Patch> *PatchLIST, MyList<Block> *OldBlockL,
int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0);
#endif
// Dynamic load balancing: split blocks for heavy ranks
void split_heavy_blocks(MyList<Patch> *PatL, int *heavy_ranks, int num_heavy,
int split_factor, int cpusize, int ingfsi, int fngfsi);
// Check if load balancing is needed based on interpolation times
bool check_load_balance_need(double *rank_times, int nprocs, int &num_heavy, int *heavy_ranks);
}
#endif /*PARALLEL_H */
#ifndef PARALLEL_H
#define PARALLEL_H
#include <iostream>
#include <iomanip>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <string>
#include <cmath>
#include <new>
using namespace std;
#include "Parallel_bam.h"
#include "var.h"
#include "MPatch.h"
#include "Block.h"
#include "MyList.h"
#include "macrodef.h" //need dim; ghost_width; CONTRACT
namespace Parallel
{
struct gridseg
{
double llb[dim];
double uub[dim];
int shape[dim];
double illb[dim], iuub[dim]; // only use for OutBdLow2Hi
Block *Bg;
};
int partition1(int &nx, int split_size, int min_width, int cpusize, int shape); // special for 1 diemnsion
int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
void KillBlocks(MyList<Patch> *PatchLIST);
void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
void setfunction(int rank, MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
void writefile(double time, int nx, int ny, int nz, double xmin, double xmax, double ymin, double ymax,
double zmin, double zmax, char *filename, double *data_out);
void writefile(double time, int nx, int ny, double xmin, double xmax, double ymin, double ymax,
char *filename, double *datain);
void getarrayindex(int DIM, int *shape, int *index, int n);
int getarraylocation(int DIM, int *shape, int *index);
void copy(int DIM, double *llbout, double *uubout, int *Dshape, double *DD, double *llbin, double *uubin,
int *shape, double *datain, double *llb, double *uub);
void Dump_CPU_Data(MyList<Block> *BlL, MyList<var> *DumpList, char *tag, double time, double dT);
void Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT);
void Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd);
double *Collect_Data(Patch *PP, var *VP);
void d2Dump_Data(MyList<Patch> *PL, MyList<var> *DumpList, char *tag, double time, double dT);
void d2Dump_Data(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT, int grd);
void Dump_Data0(Patch *PP, MyList<var> *DumpList, char *tag, double time, double dT);
double global_interp(int DIM, int *ext, double **CoX, double *datain,
double *poX, int ordn, double *SoA, int Symmetry);
double global_interp(int DIM, int *ext, double **CoX, double *datain,
double *poX, int ordn);
double Lagrangian_Int(double x, int npts, double *xpts, double *funcvals);
double LagrangePoly(double x, int pt, int npts, double *xpts);
MyList<gridseg> *build_complete_gsl(Patch *Pat);
MyList<gridseg> *build_complete_gsl(MyList<Patch> *PatL);
MyList<gridseg> *build_complete_gsl_virtual(MyList<Patch> *PatL);
MyList<gridseg> *build_complete_gsl_virtual2(MyList<Patch> *PatL); // - buffer
MyList<gridseg> *build_owned_gsl0(Patch *Pat, int rank_in); // - ghost without extension, special for Sync usage
MyList<gridseg> *build_owned_gsl1(Patch *Pat, int rank_in); // - ghost, similar to build_owned_gsl0 but extend one point on left side for vertex grid
MyList<gridseg> *build_owned_gsl2(Patch *Pat, int rank_in); // - buffer - ghost
MyList<gridseg> *build_owned_gsl3(Patch *Pat, int rank_in, int Symmetry); // - ghost - BD ghost
MyList<gridseg> *build_owned_gsl4(Patch *Pat, int rank_in, int Symmetry); // - buffer - ghost - BD ghost
MyList<gridseg> *build_owned_gsl5(Patch *Pat, int rank_in); // similar to build_owned_gsl2 but no extension
MyList<gridseg> *build_owned_gsl(MyList<Patch> *PatL, int rank_in, int type, int Symmetry);
void build_gstl(MyList<gridseg> *srci, MyList<gridseg> *dsti, MyList<gridseg> **out_src, MyList<gridseg> **out_dst);
int data_packer(double *data, MyList<gridseg> *src, MyList<gridseg> *dst, int rank_in, int dir,
MyList<var> *VarLists, MyList<var> *VarListd, int Symmetry);
void transfer(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
int Symmetry);
int data_packermix(double *data, MyList<gridseg> *src, MyList<gridseg> *dst, int rank_in, int dir,
MyList<var> *VarLists, MyList<var> *VarListd, int Symmetry);
void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
int Symmetry);
void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
void OutBdLow2Hi(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void OutBdLow2Hi(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void OutBdLow2Himix(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void Prolong(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void Prolongint(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void Restrict(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void Restrict_after(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry); // for -ghost - BDghost
MyList<Parallel::gridseg> *build_PhysBD_gsl(Patch *Pat);
MyList<Parallel::gridseg> *build_ghost_gsl(MyList<Patch> *PatL);
MyList<Parallel::gridseg> *build_ghost_gsl(Patch *Pat);
MyList<Parallel::gridseg> *build_buffer_gsl(Patch *Pat);
MyList<Parallel::gridseg> *build_buffer_gsl(MyList<Patch> *PatL);
MyList<Parallel::gridseg> *gsl_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
MyList<Parallel::gridseg> *gs_subtract(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
MyList<Parallel::gridseg> *gsl_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
MyList<Parallel::gridseg> *gs_and(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only);
MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue
MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat);
void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
double L2Norm(Patch *Pat, var *vf);
void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
void checkvarl(MyList<var> *pp, bool first_only);
MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat);
void prepare_inter_time_level(Patch *Pat,
MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
MyList<var> *VarList3 /* target (t+a*dt) */, int tindex);
void prepare_inter_time_level(Patch *Pat,
MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex);
void prepare_inter_time_level(MyList<Patch> *PatL,
MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
MyList<var> *VarList3 /* target (t+a*dt) */, int tindex);
void prepare_inter_time_level(MyList<Patch> *Pat,
MyList<var> *VarList1 /* source (t+dt) */, MyList<var> *VarList2 /* source (t) */,
MyList<var> *VarList3 /* source (t-dt) */, MyList<var> *VarList4 /* target (t+a*dt) */, int tindex);
void merge_gsl(MyList<gridseg> *&A, const double ratio);
bool merge_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C, const double ratio);
// Add ghost region to tangent plane
// we assume the grids have the same resolution
void add_ghost_touch(MyList<gridseg> *&A);
void cut_gsl(MyList<gridseg> *&A);
bool cut_gs(MyList<gridseg> *D, MyList<gridseg> *B, MyList<gridseg> *&C);
MyList<Parallel::gridseg> *gs_subtract_virtual(MyList<Parallel::gridseg> *A, MyList<Parallel::gridseg> *B);
void fill_level_data(MyList<Patch> *PatLd, MyList<Patch> *PatLs, MyList<Patch> *PatcL,
MyList<var> *OldList, MyList<var> *StateList, MyList<var> *FutureList,
MyList<var> *tmList, int Symmetry, bool BB, bool CC);
bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry);
void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape);
bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl);
void checkpatchlist(MyList<Patch> *PatL, bool buflog);
double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here);
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0);
#endif
}
#endif /*PARALLEL_H */

File diff suppressed because it is too large Load Diff

View File

@@ -1,8 +1,7 @@
#ifndef TWO_PUNCTURES_H
#define TWO_PUNCTURES_H
#include <omp.h>
#define StencilSize 19
#define N_PlaneRelax 1
#define NRELAX 200
@@ -33,7 +32,7 @@ private:
int npoints_A, npoints_B, npoints_phi;
double target_M_plus, target_M_minus;
double admMass;
double adm_tol;
@@ -43,17 +42,32 @@ private:
int ntotal;
// ===== Precomputed spectral derivative matrices =====
double *D1_A, *D2_A;
double *D1_B, *D2_B;
double *DF1_phi, *DF2_phi;
// ===== Pre-allocated workspace for LineRelax (per-thread) =====
int max_threads;
double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be;
double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be;
double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al;
double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al;
// Pre-allocated workspace buffers for hot-path allocation elimination
// LineRelax_be workspace (sized for n2)
double *ws_diag_be, *ws_e_be, *ws_f_be, *ws_b_be, *ws_x_be;
// LineRelax_al workspace (sized for n1)
double *ws_diag_al, *ws_e_al, *ws_f_al, *ws_b_al, *ws_x_al;
// ThomasAlgorithm workspace (sized for max(n1,n2))
double *ws_thomas_y;
// JFD_times_dv workspace (sized for nvar)
double *ws_jfd_values;
derivs ws_jfd_dU, ws_jfd_U;
// chebft_Zeros workspace (sized for max(n1,n2,n3)+1)
double *ws_cheb_c;
// fourft workspace (sized for max(n1,n2,n3)/2+1 each)
double *ws_four_a, *ws_four_b;
// Derivatives_AB3 workspace
double *ws_deriv_p, *ws_deriv_dp, *ws_deriv_d2p;
double *ws_deriv_q, *ws_deriv_dq;
double *ws_deriv_r, *ws_deriv_dr;
int *ws_deriv_indx;
// F_of_v workspace
double *ws_fov_sources;
double *ws_fov_values;
derivs ws_fov_U;
// J_times_dv workspace
double *ws_jtdv_values;
derivs ws_jtdv_dU, ws_jtdv_U;
struct parameters
{
@@ -71,28 +85,6 @@ public:
int Newtonmaxit);
~TwoPunctures();
// 02/07: New/modified methods
void allocate_workspace();
void free_workspace();
void precompute_derivative_matrices();
void build_cheb_deriv_matrices(int n, double *D1, double *D2);
void build_fourier_deriv_matrices(int N, double *DF1, double *DF2);
void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v);
void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q,
double *l, double *u_ws, double *d, double *y);
void LineRelax_be_omp(double *dv,
int const i, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols,
double **JFD, int tid);
void LineRelax_al_omp(double *dv,
int const j, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols,
int **cols, double **JFD, int tid);
void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols, double **JFD);
void Solve();
void set_initial_guess(derivs v);
int index(int i, int j, int k, int l, int a, int b, int c, int d);
@@ -151,11 +143,23 @@ public:
double BY_KKofxyz(double x, double y, double z);
void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix);
void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u);
void relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols, double **JFD);
void LineRelax_be(double *dv,
int const i, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols, int **cols,
double **JFD);
void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
int n3, derivs dv, derivs u, double *values);
void LinEquations(double A, double B, double X, double R,
double x, double r, double phi,
double y, double z, derivs dU, derivs U, double *values);
void LineRelax_al(double *dv,
int const j, int const k, int const nvar,
int const n1, int const n2, int const n3,
double const *rhs, int const *ncols,
int **cols, double **JFD);
void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q);
void Save(char *fname);
// provided by Vasileios Paschalidis (vpaschal@illinois.edu)
@@ -164,4 +168,4 @@ public:
void SpecCoef(parameters par, int ivar, double *v, double *cf);
};
#endif /* TWO_PUNCTURES_H */
#endif /* TWO_PUNCTURES_H */

View File

@@ -730,12 +730,6 @@ void bssn_class::Initialize()
PhysTime = StartTime;
Setup_Black_Hole_position();
}
// Initialize sync caches (per-level, for predictor and corrector)
sync_cache_pre = new Parallel::SyncCache[GH->levels];
sync_cache_cor = new Parallel::SyncCache[GH->levels];
sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
}
//================================================================================================
@@ -987,32 +981,6 @@ bssn_class::~bssn_class()
delete Azzz;
#endif
// Destroy sync caches before GH
if (sync_cache_pre)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_pre[i].destroy();
delete[] sync_cache_pre;
}
if (sync_cache_cor)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_cor[i].destroy();
delete[] sync_cache_cor;
}
if (sync_cache_rp_coarse)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_rp_coarse[i].destroy();
delete[] sync_cache_rp_coarse;
}
if (sync_cache_rp_fine)
{
for (int i = 0; i < GH->levels; i++)
sync_cache_rp_fine[i].destroy();
delete[] sync_cache_rp_fine;
}
delete GH;
#ifdef WithShell
delete SH;
@@ -2213,7 +2181,6 @@ void bssn_class::Evolve(int Steps)
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2429,7 +2396,6 @@ void bssn_class::RecursiveStep(int lev)
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif
}
@@ -2608,7 +2574,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
#endif
}
@@ -2775,7 +2740,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear();
// a_stream.str("");
@@ -2790,7 +2754,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear();
// a_stream.str("");
@@ -2809,7 +2772,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear();
// a_stream.str("");
@@ -2825,7 +2787,6 @@ void bssn_class::ParallelStep()
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
// a_stream.clear();
// a_stream.str("");
@@ -3197,7 +3158,21 @@ void bssn_class::Step(int lev, int YN)
}
Pp = Pp->next;
}
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
// check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell
// evolve Shell Patches
@@ -3215,9 +3190,9 @@ void bssn_class::Step(int lev, int YN)
{
#if (AGM == 0)
f_enforce_ga(cg->shape,
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif
@@ -3341,16 +3316,25 @@ void bssn_class::Step(int lev, int YN)
#endif
}
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
MPI_Request err_req;
// check error information
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
Parallel::AsyncSyncState async_pre;
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
#ifdef WithShell
if (lev == 0)
@@ -3363,29 +3347,12 @@ void bssn_class::Step(int lev, int YN)
{
prev_clock = curr_clock;
curr_clock = clock();
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl;
}
}
#endif
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
#if (MAPBH == 0)
// for black hole position
@@ -3561,7 +3528,24 @@ void bssn_class::Step(int lev, int YN)
Pp = Pp->next;
}
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
// check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell
// evolve Shell Patches
@@ -3579,9 +3563,9 @@ void bssn_class::Step(int lev, int YN)
{
#if (AGM == 0)
f_enforce_ga(cg->shape,
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#elif (AGM == 1)
if (iter_count == 3)
@@ -3701,16 +3685,26 @@ void bssn_class::Step(int lev, int YN)
sPp = sPp->next;
}
}
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
MPI_Request err_req_cor;
// check error information
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#"
<< iter_count << " variables at t = "
<< PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
Parallel::AsyncSyncState async_cor;
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
#ifdef WithShell
if (lev == 0)
@@ -3723,31 +3717,12 @@ void bssn_class::Step(int lev, int YN)
{
prev_clock = curr_clock;
curr_clock = clock();
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl;
}
}
#endif
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
#if (MAPBH == 0)
// for black hole position
@@ -4059,7 +4034,22 @@ void bssn_class::Step(int lev, int YN)
}
Pp = Pp->next;
}
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
// check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell
// evolve Shell Patches
@@ -4077,15 +4067,15 @@ void bssn_class::Step(int lev, int YN)
{
#if (AGM == 0)
f_enforce_ga(cg->shape,
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif
if (f_compute_rhs_bssn_ss(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[fngfs + ShellPatch::gx],
cg->fgfs[fngfs + ShellPatch::gy],
cg->fgfs[fngfs + ShellPatch::gx],
cg->fgfs[fngfs + ShellPatch::gy],
cg->fgfs[fngfs + ShellPatch::gz],
cg->fgfs[fngfs + ShellPatch::drhodx],
cg->fgfs[fngfs + ShellPatch::drhody],
@@ -4200,16 +4190,25 @@ void bssn_class::Step(int lev, int YN)
}
#endif
}
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
MPI_Request err_req;
// check error information
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = "
<< PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
Parallel::AsyncSyncState async_pre;
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
#ifdef WithShell
if (lev == 0)
@@ -4222,27 +4221,9 @@ void bssn_class::Step(int lev, int YN)
{
prev_clock = curr_clock;
curr_clock = clock();
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl;
}
}
#endif
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl;
}
}
#endif
@@ -4405,7 +4386,23 @@ void bssn_class::Step(int lev, int YN)
Pp = Pp->next;
}
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
// check error information
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#ifdef WithShell
// evolve Shell Patches
@@ -4423,9 +4420,9 @@ void bssn_class::Step(int lev, int YN)
{
#if (AGM == 0)
f_enforce_ga(cg->shape,
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#elif (AGM == 1)
if (iter_count == 3)
@@ -4545,16 +4542,25 @@ void bssn_class::Step(int lev, int YN)
sPp = sPp->next;
}
}
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
MPI_Request err_req_cor;
// check error information
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
Parallel::AsyncSyncState async_cor;
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
#ifdef WithShell
if (lev == 0)
@@ -4567,30 +4573,11 @@ void bssn_class::Step(int lev, int YN)
{
prev_clock = curr_clock;
curr_clock = clock();
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl;
}
}
#endif
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
#ifdef WithShell
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
#endif
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
@@ -4956,19 +4943,11 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation");
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
MPI_Request err_req;
// check error information
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req);
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
}
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
@@ -4980,6 +4959,10 @@ void bssn_class::Step(int lev, int YN)
}
}
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
#if (MAPBH == 0)
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
@@ -5157,34 +5140,30 @@ void bssn_class::Step(int lev, int YN)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check");
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
MPI_Request err_req_cor;
// check error information
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req_cor);
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
}
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
#if (MAPBH == 0)
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
@@ -5468,11 +5447,21 @@ void bssn_class::SHStep()
#if (PSTR == 1 || PSTR == 2)
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor's error check");
#endif
// Non-blocking error reduction overlapped with Synch to hide Allreduce latency
MPI_Request err_req;
// check error information
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
{
@@ -5484,25 +5473,12 @@ void bssn_class::SHStep()
{
prev_clock = curr_clock;
curr_clock = clock();
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl;
}
}
// Complete non-blocking error reduction and check
MPI_Wait(&err_req, MPI_STATUS_IGNORE);
if (ERROR)
{
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
// corrector
for (iter_count = 1; iter_count < 4; iter_count++)
{
@@ -5645,11 +5621,21 @@ void bssn_class::SHStep()
sPp = sPp->next;
}
}
// Non-blocking error reduction overlapped with Synch to hide Allreduce latency
MPI_Request err_req_cor;
// check error information
{
int erh = ERROR;
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
{
@@ -5661,26 +5647,12 @@ void bssn_class::SHStep()
{
prev_clock = curr_clock;
curr_clock = clock();
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl;
}
}
// Complete non-blocking error reduction and check
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
if (ERROR)
{
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
if (myrank == 0)
{
if (ErrorMonitor->outfile)
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
<< " variables at t = " << PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
sPp = SH->PatL;
while (sPp)
{
@@ -5809,7 +5781,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#if (PSTR == 1 || PSTR == 2)
// a_stream.clear();
@@ -5819,11 +5791,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5860,7 +5842,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
#if (PSTR == 1 || PSTR == 2)
// a_stream.clear();
@@ -5870,11 +5852,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -5888,7 +5880,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
}
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
Parallel::Sync(GH->PatL[lev], SL, Symmetry);
#if (PSTR == 1 || PSTR == 2)
// a_stream.clear();
@@ -5946,14 +5938,24 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
#endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5968,21 +5970,31 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
#endif
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
#endif
}
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
Parallel::Sync(GH->PatL[lev], SL, Symmetry);
}
}
@@ -6033,14 +6045,24 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
#endif
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6057,21 +6079,31 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
#endif
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
#endif
}
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
}
}
@@ -6101,11 +6133,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
}
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6114,11 +6156,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
else // no time refinement levels and for all same time levels
{
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6134,10 +6186,10 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
#else
Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
#endif
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
}
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
}
}
#undef MIXOUTB

View File

@@ -126,11 +126,6 @@ public:
MyList<var> *OldStateList, *DumpList;
MyList<var> *ConstraintList;
Parallel::SyncCache *sync_cache_pre; // per-level cache for predictor sync
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor;
surface_integral *Waveshell;

View File

@@ -83,6 +83,10 @@
real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz
real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz
! shared work arrays (memory pool) to avoid repeated allocation in subroutines
real*8, dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh_work2 ! for fderivs_fh/fdderivs_fh (ghost_width=2)
real*8, dimension(-2:ex(1),-2:ex(2),-2:ex(3)) :: fh_work3 ! for kodis_fh/lopsided_fh (ghost_width=3)
real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
real*8 :: dX, dY, dZ, PI
real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
@@ -151,22 +155,22 @@
gyy = dyy + ONE
gzz = dzz + ONE
call fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev)
call fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev)
call fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev)
call fderivs_fh(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev,fh_work2)
call fderivs_fh(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev,fh_work2)
call fderivs_fh(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev,fh_work2)
div_beta = betaxx + betayy + betazz
call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
call fderivs_fh(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev,fh_work2)
chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi
call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs_fh(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev,fh_work2)
call fderivs_fh(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev,fh_work2)
call fderivs_fh(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev,fh_work2)
call fderivs_fh(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev,fh_work2)
call fderivs_fh(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev,fh_work2)
call fderivs_fh(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev,fh_work2)
gxx_rhs = - TWO * alpn1 * Axx - F2o3 * gxx * div_beta + &
TWO *( gxx * betaxx + gxy * betayx + gxz * betazx)
@@ -284,8 +288,8 @@
(gupyy * gupzz + gupyz * gupyz)* Ayz
! Right hand side for Gam^i without shift terms...
call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
call fderivs_fh(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev,fh_work2)
call fderivs_fh(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev,fh_work2)
Gamx_rhs = - TWO * ( Lapx * Rxx + Lapy * Rxy + Lapz * Rxz ) + &
TWO * alpn1 * ( &
@@ -314,12 +318,12 @@
Gamzxx * Rxx + Gamzyy * Ryy + Gamzzz * Rzz + &
TWO * ( Gamzxy * Rxy + Gamzxz * Rxz + Gamzyz * Ryz ) )
call fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,&
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev)
call fdderivs(ex,betay,gxxy,gxyy,gxzy,gyyy,gyzy,gzzy,&
X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
call fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,&
X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev)
call fdderivs_fh(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,&
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev,fh_work2)
call fdderivs_fh(ex,betay,gxxy,gxyy,gxzy,gyyy,gyzy,gzzy,&
X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev,fh_work2)
call fdderivs_fh(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,&
X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev,fh_work2)
fxx = gxxx + gxyy + gxzz
fxy = gxyx + gyyy + gyzz
@@ -332,9 +336,9 @@
Gamza = gupxx * Gamzxx + gupyy * Gamzyy + gupzz * Gamzzz + &
TWO*( gupxy * Gamzxy + gupxz * Gamzxz + gupyz * Gamzyz )
call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
call fderivs_fh(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev,fh_work2)
call fderivs_fh(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev,fh_work2)
call fderivs_fh(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev,fh_work2)
Gamx_rhs = Gamx_rhs + F2o3 * Gamxa * div_beta - &
Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz + &
@@ -377,27 +381,27 @@
gzzz = gxz * Gamxzz + gyz * Gamyzz + gzz * Gamzzz
!compute Ricci tensor for tilted metric
call fdderivs(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,symmetry,Lev)
call fdderivs_fh(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,symmetry,Lev,fh_work2)
Rxx = gupxx * fxx + gupyy * fyy + gupzz * fzz + &
( gupxy * fxy + gupxz * fxz + gupyz * fyz ) * TWO
call fdderivs(ex,dyy,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,symmetry,Lev)
call fdderivs_fh(ex,dyy,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,symmetry,Lev,fh_work2)
Ryy = gupxx * fxx + gupyy * fyy + gupzz * fzz + &
( gupxy * fxy + gupxz * fxz + gupyz * fyz ) * TWO
call fdderivs(ex,dzz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,symmetry,Lev)
call fdderivs_fh(ex,dzz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,symmetry,Lev,fh_work2)
Rzz = gupxx * fxx + gupyy * fyy + gupzz * fzz + &
( gupxy * fxy + gupxz * fxz + gupyz * fyz ) * TWO
call fdderivs(ex,gxy,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,ANTI, ANTI,SYM ,symmetry,Lev)
call fdderivs_fh(ex,gxy,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,ANTI, ANTI,SYM ,symmetry,Lev,fh_work2)
Rxy = gupxx * fxx + gupyy * fyy + gupzz * fzz + &
( gupxy * fxy + gupxz * fxz + gupyz * fyz ) * TWO
call fdderivs(ex,gxz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,ANTI ,SYM ,ANTI,symmetry,Lev)
call fdderivs_fh(ex,gxz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,ANTI ,SYM ,ANTI,symmetry,Lev,fh_work2)
Rxz = gupxx * fxx + gupyy * fyy + gupzz * fzz + &
( gupxy * fxy + gupxz * fxz + gupyz * fyz ) * TWO
call fdderivs(ex,gyz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,ANTI ,ANTI,symmetry,Lev)
call fdderivs_fh(ex,gyz,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,ANTI ,ANTI,symmetry,Lev,fh_work2)
Ryz = gupxx * fxx + gupyy * fyy + gupzz * fzz + &
( gupxy * fxy + gupxz * fxz + gupyz * fyz ) * TWO
@@ -602,7 +606,7 @@
Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy + &
Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz )
!covariant second derivative of chi respect to tilted metric
call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
call fdderivs_fh(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev,fh_work2)
fxx = fxx - Gamxxx * chix - Gamyxx * chiy - Gamzxx * chiz
fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz
@@ -628,8 +632,8 @@
Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO
! covariant second derivatives of the lapse respect to physical metric
call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
SYM,SYM,SYM,symmetry,Lev)
call fdderivs_fh(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
SYM,SYM,SYM,symmetry,Lev,fh_work2)
gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
@@ -812,7 +816,7 @@
betay_rhs = FF*dtSfy
betaz_rhs = FF*dtSfz
call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
call fderivs_fh(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev,fh_work2)
reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
reta = 1.31d0/2*dsqrt(reta/chin1)/(1-dsqrt(chin1))**2
@@ -824,7 +828,7 @@
betay_rhs = FF*dtSfy
betaz_rhs = FF*dtSfz
call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
call fderivs_fh(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev,fh_work2)
reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
reta = 1.31d0/2*dsqrt(reta/chin1)/(1-chin1)**2
@@ -832,7 +836,7 @@
dtSfy_rhs = Gamy_rhs - reta*dtSfy
dtSfz_rhs = Gamz_rhs - reta*dtSfz
#elif (GAUGE == 4)
call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
call fderivs_fh(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev,fh_work2)
reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
reta = 1.31d0/2*dsqrt(reta/chin1)/(1-dsqrt(chin1))**2
@@ -844,7 +848,7 @@
dtSfy_rhs = ZEO
dtSfz_rhs = ZEO
#elif (GAUGE == 5)
call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
call fderivs_fh(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev,fh_work2)
reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
reta = 1.31d0/2*dsqrt(reta/chin1)/(1-chin1)**2
@@ -945,61 +949,104 @@
SSA(2)=SYM
SSA(3)=ANTI
!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
! lopsided_kodis shares the symmetry_bd buffer between advection and
! dissipation, eliminating redundant full-grid copies. For metric variables
! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
! so the constant offset has no effect on dissipation.
!!!!!!!!!advection term part
call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_fh(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,fh_work3)
call lopsided_fh(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,fh_work3)
call lopsided_fh(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,fh_work3)
call lopsided_fh(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,fh_work3)
call lopsided_fh(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,fh_work3)
call lopsided_fh(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,fh_work3)
call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
call lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
call lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
call lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_fh(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,fh_work3)
call lopsided_fh(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,fh_work3)
call lopsided_fh(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,fh_work3)
call lopsided_fh(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,fh_work3)
call lopsided_fh(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,fh_work3)
call lopsided_fh(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,fh_work3)
call lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_fh(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,fh_work3)
call lopsided_fh(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,fh_work3)
call lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
call lopsided_fh(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,fh_work3)
call lopsided_fh(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,fh_work3)
call lopsided_fh(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,fh_work3)
!!
call lopsided_fh(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,fh_work3)
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
call lopsided_fh(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,fh_work3)
call lopsided_fh(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,fh_work3)
call lopsided_fh(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,fh_work3)
#endif
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call lopsided_fh(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,fh_work3)
call lopsided_fh(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,fh_work3)
call lopsided_fh(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,fh_work3)
#endif
if(eps>0)then
! usual Kreiss-Oliger dissipation
call kodis_fh(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps,fh_work3)
#if 0
#define i 42
#define j 40
#define k 40
if(Lev == 1)then
write(*,*) X(i),Y(j),Z(k)
write(*,*) "before",Axx_rhs(i,j,k)
endif
#undef i
#undef j
#undef k
!!stop
#endif
call kodis_fh(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps,fh_work3)
#if 0
#define i 42
#define j 40
#define k 40
if(Lev == 1)then
write(*,*) X(i),Y(j),Z(k)
write(*,*) "after",Axx_rhs(i,j,k)
endif
#undef i
#undef j
#undef k
!!stop
#endif
call kodis_fh(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps,fh_work3)
#if 1
!! bam does not apply dissipation on gauge variables
call lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps)
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
call lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
#endif
call kodis_fh(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps,fh_work3)
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
#endif
#else
! No dissipation on gauge variables (advection only)
call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS)
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA)
#endif
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA)
call kodis_fh(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps,fh_work3)
call kodis_fh(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps,fh_work3)
#endif
#endif
endif
if(co == 0)then
! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho
! here trR is respect to physical metric
@@ -1035,12 +1082,12 @@
! mov_Res_j = gupkj*(-1/chi d_k chi*A_ij + D_k A_ij) - 2/3 d_j trK - 8 PI s_j where D respect to physical metric
! store D_i A_jk - 1/chi d_i chi*A_jk in gjk_i
call fderivs(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
call fderivs(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0)
call fderivs(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0)
call fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
call fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0)
call fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
call fderivs_fh(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0,fh_work2)
call fderivs_fh(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0,fh_work2)
call fderivs_fh(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0,fh_work2)
call fderivs_fh(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0,fh_work2)
call fderivs_fh(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0,fh_work2)
call fderivs_fh(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0,fh_work2)
gxxx = gxxx - ( Gamxxx * Axx + Gamyxx * Axy + Gamzxx * Axz &
+ Gamxxx * Axx + Gamyxx * Axy + Gamzxx * Axz) - chix*Axx/chin1

File diff suppressed because it is too large Load Diff

View File

@@ -1,107 +1,92 @@
#ifndef CGH_H
#define CGH_H
#include <mpi.h>
#include "MyList.h"
#include "MPatch.h"
#include "macrodef.h"
#include "monitor.h"
#include "Parallel.h"
class cgh
{
public:
int levels, movls, BH_num_in;
// information of boxes
int *grids;
double ***bbox;
int ***shape;
double ***handle;
double ***Porgls;
double *Lt;
// information of Patch list
MyList<Patch> **PatL;
// information of OutBdLow2Hi point list and Restrict point list
#if (RPB == 1)
MyList<Parallel::pointstru_bam> **bdsul, **rsul;
#endif
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
int mylev;
int *start_rank, *end_rank;
MPI_Comm *Commlev;
#endif
protected:
int ingfs, fngfs;
static constexpr double ratio = 0.75;
int trfls;
public:
cgh(int ingfsi, int fngfsi, int Symmetry, char *filename, int checkrun, monitor *ErrorMonitor);
~cgh();
void compose_cgh(int nprocs);
void sethandle(monitor *ErrorMonitor);
void checkPatchList(MyList<Patch> *PatL, bool buflog);
void Regrid(int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor);
void Regrid_fake(int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor);
void recompose_cgh(int nprocs, bool *lev_flag,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB);
void recompose_cgh_fake(int nprocs, bool *lev_flag,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB);
void read_bbox(int Symmetry, char *filename);
MyList<Patch> *construct_patchlist(int lev, int Symmetry);
bool Interp_One_Point(MyList<var> *VarList,
double *XX, /*input global Cartesian coordinate*/
double *Shellf, int Symmetry);
void recompose_cgh_Onelevel(int nprocs, int lev,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB);
void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor);
void Regrid_Onelevel_aux(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor);
void settrfls(const int lev);
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
void construct_mylev(int nprocs);
#endif
// Load balancing support
bool enable_load_balance; // Enable load balancing
int load_balance_check_interval; // Check interval (in time steps)
int current_time_step; // Current time step counter
double *rank_interp_times; // Store interpolation times for each rank
int *heavy_ranks; // Store heavy rank numbers
int num_heavy_ranks; // Number of heavy ranks
void init_load_balance(int nprocs);
void update_interp_time(int rank, double time);
bool check_and_rebalance(int nprocs, int lev,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB);
};
#endif /* CGH_H */
#ifndef CGH_H
#define CGH_H
#include <mpi.h>
#include "MyList.h"
#include "MPatch.h"
#include "macrodef.h"
#include "monitor.h"
#include "Parallel.h"
class cgh
{
public:
int levels, movls, BH_num_in;
// information of boxes
int *grids;
double ***bbox;
int ***shape;
double ***handle;
double ***Porgls;
double *Lt;
// information of Patch list
MyList<Patch> **PatL;
// information of OutBdLow2Hi point list and Restrict point list
#if (RPB == 1)
MyList<Parallel::pointstru_bam> **bdsul, **rsul;
#endif
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
int mylev;
int *start_rank, *end_rank;
MPI_Comm *Commlev;
#endif
protected:
int ingfs, fngfs;
static constexpr double ratio = 0.75;
int trfls;
public:
cgh(int ingfsi, int fngfsi, int Symmetry, char *filename, int checkrun, monitor *ErrorMonitor);
~cgh();
void compose_cgh(int nprocs);
void sethandle(monitor *ErrorMonitor);
void checkPatchList(MyList<Patch> *PatL, bool buflog);
void Regrid(int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor);
void Regrid_fake(int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor);
void recompose_cgh(int nprocs, bool *lev_flag,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB);
void recompose_cgh_fake(int nprocs, bool *lev_flag,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB);
void read_bbox(int Symmetry, char *filename);
MyList<Patch> *construct_patchlist(int lev, int Symmetry);
bool Interp_One_Point(MyList<var> *VarList,
double *XX, /*input global Cartesian coordinate*/
double *Shellf, int Symmetry);
void recompose_cgh_Onelevel(int nprocs, int lev,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB);
void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor);
void Regrid_Onelevel_aux(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor);
void settrfls(const int lev);
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
void construct_mylev(int nprocs);
#endif
};
#endif /* CGH_H */

View File

@@ -69,12 +69,10 @@
fy = ZEO
fz = ZEO
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
! x direction
! x direction
if(i+1 <= imax .and. i-1 >= imin)then
!
! - f(i-1) + f(i+1)
@@ -373,8 +371,6 @@
fxz = ZEO
fyz = ZEO
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1107,6 +1103,103 @@
end subroutine fderivs
!-----------------------------------------------------------------------------
! fderivs variant: reuses caller-provided fh work array (memory pool)
!-----------------------------------------------------------------------------
subroutine fderivs_fh(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3, &
symmetry,onoff,fh)
implicit none
integer, intent(in ):: ex(1:3),symmetry,onoff
real*8, dimension(ex(1),ex(2),ex(3)), intent(in ):: f
real*8, dimension(ex(1),ex(2),ex(3)), intent(out):: fx,fy,fz
real*8, intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
real*8, intent(in ):: SYM1,SYM2,SYM3
real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)),intent(inout):: fh
real*8 :: dX,dY,dZ
real*8, dimension(3) :: SoA
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
real*8, parameter :: ZEO=0.d0,ONE=1.d0
real*8, parameter :: TWO=2.d0,EIT=8.d0
real*8, parameter :: F12=1.2d1
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
SoA(1) = SYM1
SoA(2) = SYM2
SoA(3) = SYM3
call symmetry_bd(2,ex,f,fh,SoA)
d12dx = ONE/F12/dX
d12dy = ONE/F12/dY
d12dz = ONE/F12/dZ
d2dx = ONE/TWO/dX
d2dy = ONE/TWO/dY
d2dz = ONE/TWO/dZ
fx = ZEO
fy = ZEO
fz = ZEO
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
#if 0
if(i+2 <= imax .and. i-2 >= imin)then
fx(i,j,k)=d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i+1 <= imax .and. i-1 >= imin)then
fx(i,j,k)=d2dx*(-fh(i-1,j,k)+fh(i+1,j,k))
endif
if(j+2 <= jmax .and. j-2 >= jmin)then
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j+1 <= jmax .and. j-1 >= jmin)then
fy(i,j,k)=d2dy*(-fh(i,j-1,k)+fh(i,j+1,k))
endif
if(k+2 <= kmax .and. k-2 >= kmin)then
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k+1 <= kmax .and. k-1 >= kmin)then
fz(i,j,k)=d2dz*(-fh(i,j,k-1)+fh(i,j,k+1))
endif
#else
if(i+2 <= imax .and. i-2 >= imin .and. &
j+2 <= jmax .and. j-2 >= jmin .and. &
k+2 <= kmax .and. k-2 >= kmin) then
fx(i,j,k)=d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
fy(i,j,k)=d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
fz(i,j,k)=d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(i+1 <= imax .and. i-1 >= imin .and. &
j+1 <= jmax .and. j-1 >= jmin .and. &
k+1 <= kmax .and. k-1 >= kmin) then
fx(i,j,k)=d2dx*(-fh(i-1,j,k)+fh(i+1,j,k))
fy(i,j,k)=d2dy*(-fh(i,j-1,k)+fh(i,j+1,k))
fz(i,j,k)=d2dz*(-fh(i,j,k-1)+fh(i,j,k+1))
endif
#endif
enddo
enddo
enddo
return
end subroutine fderivs_fh
!-----------------------------------------------------------------------------
!
! single derivatives dx
!
@@ -1944,6 +2037,162 @@
end subroutine fddyz
!-----------------------------------------------------------------------------
! fdderivs variant: reuses caller-provided fh work array (memory pool)
!-----------------------------------------------------------------------------
subroutine fdderivs_fh(ex,f,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
SYM1,SYM2,SYM3,symmetry,onoff,fh)
implicit none
integer, intent(in ):: ex(1:3),symmetry,onoff
real*8, dimension(ex(1),ex(2),ex(3)),intent(in ):: f
real*8, dimension(ex(1),ex(2),ex(3)),intent(out):: fxx,fxy,fxz,fyy,fyz,fzz
real*8, intent(in ):: X(ex(1)),Y(ex(2)),Z(ex(3))
real*8, intent(in ):: SYM1,SYM2,SYM3
real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)),intent(inout):: fh
real*8 :: dX,dY,dZ
real*8, dimension(3) :: SoA
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
real*8 :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
real*8, parameter :: ZEO=0.d0, ONE=1.d0, TWO=2.d0, F1o4=2.5d-1
real*8, parameter :: F8=8.d0, F16=1.6d1, F30=3.d1
real*8, parameter :: F1o12=ONE/1.2d1, F1o144=ONE/1.44d2
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
SoA(1) = SYM1
SoA(2) = SYM2
SoA(3) = SYM3
call symmetry_bd(2,ex,f,fh,SoA)
Sdxdx = ONE /( dX * dX )
Sdydy = ONE /( dY * dY )
Sdzdz = ONE /( dZ * dZ )
Fdxdx = F1o12 /( dX * dX )
Fdydy = F1o12 /( dY * dY )
Fdzdz = F1o12 /( dZ * dZ )
Sdxdy = F1o4 /( dX * dY )
Sdxdz = F1o4 /( dX * dZ )
Sdydz = F1o4 /( dY * dZ )
Fdxdy = F1o144 /( dX * dY )
Fdxdz = F1o144 /( dX * dZ )
Fdydz = F1o144 /( dY * dZ )
fxx = ZEO
fyy = ZEO
fzz = ZEO
fxy = ZEO
fxz = ZEO
fyz = ZEO
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
#if 0
if(i+2 <= imax .and. i-2 >= imin)then
fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
-fh(i+2,j,k)+F16*fh(i+1,j,k) )
elseif(i+1 <= imax .and. i-1 >= imin)then
fxx(i,j,k) = Sdxdx*(fh(i-1,j,k)-TWO*fh(i,j,k)+fh(i+1,j,k))
endif
if(j+2 <= jmax .and. j-2 >= jmin)then
fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
-fh(i,j+2,k)+F16*fh(i,j+1,k) )
elseif(j+1 <= jmax .and. j-1 >= jmin)then
fyy(i,j,k) = Sdydy*(fh(i,j-1,k)-TWO*fh(i,j,k)+fh(i,j+1,k))
endif
if(k+2 <= kmax .and. k-2 >= kmin)then
fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
-fh(i,j,k+2)+F16*fh(i,j,k+1) )
elseif(k+1 <= kmax .and. k-1 >= kmin)then
fzz(i,j,k) = Sdzdz*(fh(i,j,k-1)-TWO*fh(i,j,k)+fh(i,j,k+1))
endif
if(i+2 <= imax .and. i-2 >= imin .and. j+2 <= jmax .and. j-2 >= jmin)then
fxy(i,j,k) = Fdxdy*( (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k)) &
-F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k)) &
+F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k)) &
- (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
elseif(i+1 <= imax .and. i-1 >= imin .and. j+1 <= jmax .and. j-1 >= jmin)then
fxy(i,j,k) = Sdxdy*(fh(i-1,j-1,k)-fh(i+1,j-1,k)-fh(i-1,j+1,k)+fh(i+1,j+1,k))
endif
if(i+2 <= imax .and. i-2 >= imin .and. k+2 <= kmax .and. k-2 >= kmin)then
fxz(i,j,k) = Fdxdz*( (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2)) &
-F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1)) &
+F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1)) &
- (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
elseif(i+1 <= imax .and. i-1 >= imin .and. k+1 <= kmax .and. k-1 >= kmin)then
fxz(i,j,k) = Sdxdz*(fh(i-1,j,k-1)-fh(i+1,j,k-1)-fh(i-1,j,k+1)+fh(i+1,j,k+1))
endif
if(j+2 <= jmax .and. j-2 >= jmin .and. k+2 <= kmax .and. k-2 >= kmin)then
fyz(i,j,k) = Fdydz*( (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2)) &
-F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1)) &
+F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1)) &
- (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
elseif(j+1 <= jmax .and. j-1 >= jmin .and. k+1 <= kmax .and. k-1 >= kmin)then
fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1))
endif
#else
! for bam comparison
if(i+2 <= imax .and. i-2 >= imin .and. &
j+2 <= jmax .and. j-2 >= jmin .and. &
k+2 <= kmax .and. k-2 >= kmin) then
fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
-fh(i+2,j,k)+F16*fh(i+1,j,k) )
fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
-fh(i,j+2,k)+F16*fh(i,j+1,k) )
fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
-fh(i,j,k+2)+F16*fh(i,j,k+1) )
fxy(i,j,k) = Fdxdy*( (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k)) &
-F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k)) &
+F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k)) &
- (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
fxz(i,j,k) = Fdxdz*( (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2)) &
-F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1)) &
+F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1)) &
- (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
fyz(i,j,k) = Fdydz*( (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2)) &
-F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1)) &
+F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1)) &
- (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
elseif(i+1 <= imax .and. i-1 >= imin .and. &
j+1 <= jmax .and. j-1 >= jmin .and. &
k+1 <= kmax .and. k-1 >= kmin) then
fxx(i,j,k) = Sdxdx*(fh(i-1,j,k)-TWO*fh(i,j,k)+fh(i+1,j,k))
fyy(i,j,k) = Sdydy*(fh(i,j-1,k)-TWO*fh(i,j,k)+fh(i,j+1,k))
fzz(i,j,k) = Sdzdz*(fh(i,j,k-1)-TWO*fh(i,j,k)+fh(i,j,k+1))
fxy(i,j,k) = Sdxdy*(fh(i-1,j-1,k)-fh(i+1,j-1,k)-fh(i-1,j+1,k)+fh(i+1,j+1,k))
fxz(i,j,k) = Sdxdz*(fh(i-1,j,k-1)-fh(i+1,j,k-1)-fh(i-1,j,k+1)+fh(i+1,j,k+1))
fyz(i,j,k) = Sdydz*(fh(i,j-1,k-1)-fh(i,j+1,k-1)-fh(i,j-1,k+1)+fh(i,j+1,k+1))
endif
#endif
enddo
enddo
enddo
return
end subroutine fdderivs_fh
#elif (ghost_width == 4)
! sixth order code

View File

@@ -883,17 +883,13 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
integer::i
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
enddo
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1
funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
enddo
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1
funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
enddo
@@ -1116,7 +1112,6 @@ end subroutine d2dump
! Lagrangian polynomial interpolation
!------------------------------------------------------------------------------
!DIR$ ATTRIBUTES FORCEINLINE :: polint
subroutine polint(xa, ya, x, y, dy, ordn)
implicit none

View File

@@ -65,8 +65,6 @@ real*8,intent(in) :: eps
! dx^4
! note the sign (-1)^r-1, now r=2
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -217,6 +215,99 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
end subroutine kodis
!-----------------------------------------------------------------------------
! kodis variant: reuses caller-provided fh work array (memory pool)
!-----------------------------------------------------------------------------
subroutine kodis_fh(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps,fh)
implicit none
! argument variables
integer,intent(in) :: Symmetry
integer,dimension(3),intent(in)::ex
real*8, dimension(1:3), intent(in) :: SoA
double precision,intent(in),dimension(ex(1))::X
double precision,intent(in),dimension(ex(2))::Y
double precision,intent(in),dimension(ex(3))::Z
double precision,intent(in),dimension(ex(1),ex(2),ex(3))::f
double precision,intent(inout),dimension(ex(1),ex(2),ex(3))::f_rhs
real*8,intent(in) :: eps
real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3)),intent(inout):: fh
! local variables
integer :: imin,jmin,kmin,imax,jmax,kmax
integer :: i,j,k
real*8 :: dX,dY,dZ
real*8, parameter :: ONE=1.d0,SIX=6.d0,FIT=1.5d1,TWT=2.d1
real*8,parameter::cof=6.4d1 ! 2^6
integer, parameter :: NO_SYMM=0, OCTANT=2
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
if(Symmetry == OCTANT .and. dabs(X(1)) < dX) imin = -2
if(Symmetry == OCTANT .and. dabs(Y(1)) < dY) jmin = -2
call symmetry_bd(3,ex,f,fh,SoA)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
if(i-3 >= imin .and. i+3 <= imax .and. &
j-3 >= jmin .and. j+3 <= jmax .and. &
k-3 >= kmin .and. k+3 <= kmax) then
#if 0
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dX/cof * ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
TWT* fh(i,j,k) )
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dY/cof * ( &
(fh(i,j-3,k)+fh(i,j+3,k)) - &
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
TWT* fh(i,j,k) )
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/dZ/cof * ( &
(fh(i,j,k-3)+fh(i,j,k+3)) - &
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )
#else
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
TWT* fh(i,j,k) )/dX + &
( &
(fh(i,j-3,k)+fh(i,j+3,k)) - &
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
TWT* fh(i,j,k) )/dY + &
( &
(fh(i,j,k-3)+fh(i,j,k+3)) - &
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )/dZ )
#endif
endif
enddo
enddo
enddo
return
end subroutine kodis_fh
#elif (ghost_width == 4)
! sixth order code
!------------------------------------------------------------------------------------------------------------------------------

View File

@@ -488,11 +488,9 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
end subroutine lopsided
!-----------------------------------------------------------------------------
! Combined advection (lopsided) + Kreiss-Oliger dissipation (kodis)
! Shares the symmetry_bd buffer fh, eliminating one full-grid copy per call.
! Mathematically identical to calling lopsided then kodis separately.
! lopsided variant: reuses caller-provided fh work array (memory pool)
!-----------------------------------------------------------------------------
subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
subroutine lopsided_fh(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,fh)
implicit none
!~~~~~~> Input parameters:
@@ -503,11 +501,9 @@ subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
real*8,dimension(3),intent(in) ::SoA
real*8,intent(in) :: eps
real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3)),intent(inout):: fh
!~~~~~~> local variables:
! note index -2,-1,0, so we have 3 extra points
real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3)) :: fh
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: dX,dY,dZ
real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
@@ -515,9 +511,6 @@ subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
real*8, parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
real*8, parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
! kodis parameters
real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
real*8, parameter :: cof=6.4d1 ! 2^6
dX = X(2)-X(1)
dY = Y(2)-Y(1)
@@ -542,16 +535,18 @@ subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
! Single symmetry_bd call shared by both advection and dissipation
call symmetry_bd(3,ex,f,fh,SoA)
! ---- Advection (lopsided) loop ----
! upper bound set ex-1 only for efficiency,
! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
! x direction
#if 0
!! old code - same as original lopsided
#else
!! new code, 2012dec27, based on bam
! x direction
if(Sfx(i,j,k) > ZEO)then
if(i+3 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
@@ -560,7 +555,6 @@ subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
elseif(i+2 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i+1 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
@@ -574,7 +568,6 @@ subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
elseif(i-2 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i-1 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
@@ -582,7 +575,7 @@ subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
endif
endif
! y direction
! y direction
if(Sfy(i,j,k) > ZEO)then
if(j+3 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
@@ -591,7 +584,6 @@ subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
elseif(j+2 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j+1 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
@@ -605,7 +597,6 @@ subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
elseif(j-2 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j-1 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
@@ -613,7 +604,7 @@ subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
endif
endif
! z direction
! z direction
if(Sfz(i,j,k) > ZEO)then
if(k+3 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
@@ -622,7 +613,6 @@ subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
elseif(k+2 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k+1 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
@@ -636,51 +626,20 @@ subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
elseif(k-2 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k-1 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
-F6*fh(i,j,k+2)+ fh(i,j,k+3))
endif
endif
#endif
enddo
enddo
enddo
! ---- Dissipation (kodis) loop ----
if(eps > ZEO) then
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
if(i-3 >= imin .and. i+3 <= imax .and. &
j-3 >= jmin .and. j+3 <= jmax .and. &
k-3 >= kmin .and. k+3 <= kmax) then
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
TWT* fh(i,j,k) )/dX + &
( &
(fh(i,j-3,k)+fh(i,j+3,k)) - &
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
TWT* fh(i,j,k) )/dY + &
( &
(fh(i,j,k-3)+fh(i,j,k+3)) - &
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )/dZ )
endif
enddo
enddo
enddo
endif
return
end subroutine lopsided_kodis
end subroutine lopsided_fh
#elif (ghost_width == 4)
! sixth order code

View File

@@ -16,12 +16,6 @@ include makefile.inc
.cu.o:
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
TwoPunctures.o: TwoPunctures.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
# Input files
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o bssn_class.o surface_integral.o ShellPatch.o\
@@ -102,7 +96,7 @@ ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f

View File

@@ -10,15 +10,15 @@ filein = -I/usr/include/ -I${MKLROOT}/include
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization)
## -fprofile-instr-use: use collected profile data to guide optimization decisions
## (branch prediction, basic block layout, inlining, loop unrolling)
PROFDATA = ../../pgo_profile/default.profdata
## Aggressive optimization flags:
## -O3: Maximum optimization
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
## -fp-model fast=2: Aggressive floating-point optimizations
## -fma: Enable fused multiply-add instructions
## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(PROFDATA) \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(PROFDATA) \
-align array64byte -fpp -I${MKLROOT}/include
f90 = ifx
f77 = ifx

File diff suppressed because it is too large Load Diff

View File

@@ -10,18 +10,18 @@
import AMSS_NCKU_Input as input_data
import subprocess
import time
## CPU core binding configuration using taskset
## taskset ensures all child processes inherit the CPU affinity mask
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
#NUMACTL_CPU_BIND = "taskset -c 0-111"
NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
#NUMACTL_CPU_BIND = "taskset -c 0-111"
## Build parallelism configuration
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 96
BUILD_JOBS = 64
##################################################################
@@ -118,7 +118,6 @@ def run_ABE():
if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
@@ -154,14 +153,13 @@ def run_ABE():
## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE
def run_TwoPunctureABE():
tp_time1=time.time()
print( )
print( " Running the AMSS-NCKU executable file TwoPunctureABE " )
print( )
## Define the command to run
#TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
TwoPuncture_command = " ./TwoPunctureABE"
TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
## Execute the command with subprocess.Popen and stream output
@@ -182,9 +180,7 @@ def run_TwoPunctureABE():
print( )
print( " The TwoPunctureABE simulation is finished " )
print( )
tp_time2=time.time()
et=tp_time2-tp_time1
print(f"Used time: {et}")
return
##################################################################

View File

@@ -1,29 +0,0 @@
import multiprocessing
def run_plot_task(task):
"""Execute a single plotting task.
Parameters
----------
task : tuple
A tuple of (function, args_tuple) where function is a callable
plotting function and args_tuple contains its arguments.
"""
func, args = task
return func(*args)
def run_plot_tasks_parallel(plot_tasks):
"""Execute a list of independent plotting tasks in parallel.
Uses the 'fork' context to create worker processes so that the main
script is NOT re-imported/re-executed in child processes.
Parameters
----------
plot_tasks : list of tuples
Each element is (function, args_tuple).
"""
ctx = multiprocessing.get_context('fork')
with ctx.Pool() as pool:
pool.map(run_plot_task, plot_tasks)

View File

@@ -1,97 +0,0 @@
# AMSS-NCKU PGO Profile Analysis Report
## 1. Profiling Environment
| Item | Value |
|------|-------|
| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
| Instrumentation Flag | `-fprofile-instr-generate` |
| Optimization Level (instrumented) | `-O2 -xHost -fma` |
| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
| Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
| Merged Profile | `default.profdata` (394 KB) |
| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
## 2. Reduced Simulation Parameters (for profiling run)
| Parameter | Production Value | Profiling Value |
|-----------|-----------------|-----------------|
| MPI_processes | 64 | 1 |
| grid_level | 9 | 4 |
| static_grid_level | 5 | 3 |
| static_grid_number | 96 | 24 |
| moving_grid_number | 48 | 16 |
| largest_box_xyz_max | 320^3 | 160^3 |
| Final_Evolution_Time | 1000.0 | 10.0 |
| Evolution_Step_Number | 10,000,000 | 1,000 |
| Detector_Number | 12 | 2 |
## 3. Profile Summary
| Metric | Value |
|--------|-------|
| Total instrumented functions | 1,392 |
| Functions with non-zero counts | 117 (8.4%) |
| Functions with zero counts | 1,275 (91.6%) |
| Maximum function entry count | 386,459,248 |
| Maximum internal block count | 370,477,680 |
| Total block count | 4,198,023,118 |
## 4. Top 20 Hotspot Functions
| Rank | Total Count | Max Block Count | Function | Category |
|------|------------|-----------------|----------|----------|
| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
## 5. Hotspot Category Breakdown
Top 20 functions account for ~98% of total execution counts:
| Category | Functions | Combined Count | Share |
|----------|-----------|---------------|-------|
| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
| Time integration | rungekutta4_rout_ | ~119M | ~3% |
| Dissipation | kodis_ | ~92M | ~2% |
| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% |
## 6. Conclusions
1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
2. **Hotspot concentration is high**: Top 5 functions alone account for ~76% of all counts, which is ideal for PGO — the compiler has strong branch/layout optimization targets.
3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
## 7. PGO Phase 2 Usage
To apply the profile, use the following flags in `makefile.inc`:
```makefile
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
-align array64byte -fpp -I${MKLROOT}/include
```

Binary file not shown.

Binary file not shown.

View File

@@ -11,8 +11,6 @@
import numpy ## numpy for array operations
import scipy ## scipy for interpolation and signal processing
import math
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt ## matplotlib for plotting
import os ## os for system/file operations

View File

@@ -8,23 +8,16 @@
##
#################################################
## Restrict OpenMP to one thread per process so that running
## many workers in parallel does not create an O(workers * BLAS_threads)
## thread explosion. The variable MUST be set before numpy/scipy
## are imported, because the BLAS library reads them only at load time.
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
import numpy
import scipy
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from mpl_toolkits.mplot3d import Axes3D
## import torch
import AMSS_NCKU_Input as input_data
import os
#########################################################################################
@@ -199,19 +192,3 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
####################################################################################
####################################################################################
## Allow this module to be run as a standalone script so that each
## binary-data plot can be executed in a fresh subprocess whose BLAS
## environment variables (set above) take effect before numpy loads.
##
## Usage: python3 plot_binary_data.py <filename> <binary_outdir> <figure_outdir>
####################################################################################
if __name__ == '__main__':
import sys
if len(sys.argv) != 4:
print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
sys.exit(1)
plot_binary_data(sys.argv[1], sys.argv[2], sys.argv[3])

View File

@@ -8,8 +8,6 @@
#################################################
import numpy ## numpy for array operations
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt ## matplotlib for plotting
from mpl_toolkits.mplot3d import Axes3D ## needed for 3D plots
import glob
@@ -17,9 +15,6 @@ import os ## operating system utilities
import plot_binary_data
import AMSS_NCKU_Input as input_data
import subprocess
import sys
import multiprocessing
# plt.rcParams['text.usetex'] = True ## enable LaTeX fonts in plots
@@ -55,40 +50,10 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
file_list.append(x)
print(x)
## Plot each file in parallel using subprocesses.
## Each subprocess is a fresh Python process where the BLAS thread-count
## environment variables (set at the top of plot_binary_data.py) take
## effect before numpy is imported. This avoids the thread explosion
## that occurs when multiprocessing.Pool with 'fork' context inherits
## already-initialized multi-threaded BLAS from the parent.
script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
running = []
failed = []
## Plot each file in the list
for filename in file_list:
print(filename)
proc = subprocess.Popen(
[sys.executable, script, filename, binary_outdir, figure_outdir],
)
running.append( (proc, filename) )
## Keep at most max_workers subprocesses active at a time
if len(running) >= max_workers:
p, fn = running.pop(0)
p.wait()
if p.returncode != 0:
failed.append(fn)
## Wait for all remaining subprocesses to finish
for p, fn in running:
p.wait()
if p.returncode != 0:
failed.append(fn)
if failed:
print( " WARNING: the following binary data plots failed:" )
for fn in failed:
print( " ", fn )
plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
print( )
print( " Binary Data Plot Has been Finished " )