Compare commits


2 Commits

Author SHA1 Message Date
714c6e90c6 Add OpenMP parallelization to Fortran compute kernels
Add !$omp parallel do collapse(2) directives to all triple-loop
stencil kernels (fderivs, fdderivs, fdx/fdy/fdz, kodis, lopsided,
enforce_ag/enforce_ga) across all ghost_width variants. Add !$omp
parallel workshare to RK4/ICN/Euler whole-array update routines.

Build system: add -qopenmp to compile and link flags, switch MKL
from sequential to threaded (-lmkl_intel_thread -liomp5).

Runtime: set OMP_NUM_THREADS=96, OMP_STACKSIZE=16M, OMP_PROC_BIND=close,
OMP_PLACES=cores for 96-core server target.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-02-10 23:40:17 +08:00
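A minimal sketch of the directive pattern this commit describes, written in C/C++ OpenMP syntax for illustration (the actual kernels are Fortran and use `!$omp parallel do collapse(2)`); the array extents and kernel body are placeholders, not code from the repository:

```cpp
#include <omp.h>
#include <cstddef>

// Hypothetical stencil driver: collapse(2) parallelizes the k and j loops
// together, so all 96 threads get work even when ex[2] alone is small.
void stencil_rhs(const int ex[3], const double *f, double *rhs)
{
    #pragma omp parallel for collapse(2)
    for (int k = 1; k < ex[2] - 1; ++k)
        for (int j = 1; j < ex[1] - 1; ++j)
            for (int i = 1; i < ex[0] - 1; ++i)   // innermost loop left to the vectorizer
            {
                const std::size_t c = ((std::size_t)k * ex[1] + j) * ex[0] + i;
                rhs[c] += f[c];                   // placeholder for the real stencil body
            }
}
```

With OMP_PROC_BIND=close and OMP_PLACES=cores, each of the 96 threads is pinned to its own core, so the scheduler does not migrate threads between iterations of the time loop.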
caf192b2e4 Remove MPI dependency, replace with single-process stub for non-MPI builds
- Add mpi_stub.h providing all MPI types/constants/functions as no-ops
  (nprocs=1, myrank=0) with memcpy-based Allreduce and clock_gettime Wtime
- Replace #include <mpi.h> with conditional #ifdef MPI_STUB in 31 files
  (19 headers + 12 source files) preserving ability to build with real MPI
- Change makefile.inc: CLINKER mpiicpx->icpx, add -DMPI_STUB to CXXAPPFLAGS
- Update makefile_and_run.py: run ./ABE directly instead of mpirun -np N
- Set MPI_processes=1 in AMSS_NCKU_Input.py

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-02-10 22:51:11 +08:00
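A minimal sketch of how an existing MPI call site behaves once built with -DMPI_STUB, based on the commit description and the mpi_stub.h shown at the end of this diff; the function and variable names below are illustrative only:

```cpp
#ifdef MPI_STUB
#include "mpi_stub.h"   // single-process shim: rank 0, size 1
#else
#include <mpi.h>
#endif

// Unchanged MPI-style code: under the stub, Allreduce is a memcpy,
// so the "global" sum is simply the local value.
double global_sum(double local)
{
    int myrank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);   // stub: always 0
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);   // stub: always 1
    double total = 0.0;
    MPI_Allreduce(&local, &total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    return total;
}
```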
57 changed files with 14347 additions and 15347 deletions

View File

@@ -16,7 +16,7 @@ import numpy
File_directory = "GW150914" ## output file directory
Output_directory = "binary_output" ## binary data file directory
## The file directory name should not be too long
MPI_processes = 64 ## number of mpi processes used in the simulation
MPI_processes = 1 ## number of processes (MPI removed, single-process mode)
GPU_Calculation = "no" ## Use GPU or not
## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)

View File

@@ -8,14 +8,6 @@
##
##################################################################
## Guard against re-execution by multiprocessing child processes.
## Without this, using 'spawn' or 'forkserver' context would cause every
## worker to re-run the entire script, spawning exponentially more
## workers (fork bomb).
if __name__ != '__main__':
import sys as _sys
_sys.exit(0)
##################################################################
@@ -66,8 +58,7 @@ if os.path.exists(File_directory):
## Prompt whether to overwrite the existing directory
while True:
try:
## inputvalue = input()
inputvalue = "continue"
inputvalue = input()
## If the user agrees to overwrite, proceed and remove the existing directory
if ( inputvalue == "continue" ):
print( " Continue the calculation !!! " )
@@ -433,31 +424,26 @@ print(
import plot_xiaoqu
import plot_GW_strain_amplitude_xiaoqu
from parallel_plot_helper import run_plot_tasks_parallel
plot_tasks = []
## Plot black hole trajectory
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot, (binary_results_directory, figure_directory) ) )
plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) )
plot_xiaoqu.generate_puncture_orbit_plot( binary_results_directory, figure_directory )
plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
## Plot black hole separation vs. time
plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) )
plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
## Plot gravitational waveforms (psi4 and strain amplitude)
for i in range(input_data.Detector_Number):
plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) )
plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) )
plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
## Plot ADM mass evolution
for i in range(input_data.Detector_Number):
plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) )
plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
## Plot Hamiltonian constraint violation over time
for i in range(input_data.grid_level):
plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) )
run_plot_tasks_parallel(plot_tasks)
plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
## Plot stored binary data
plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )

View File

@@ -20,7 +20,11 @@ using namespace std;
#include <map.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "misc.h"
#include "macrodef.h"

View File

@@ -19,7 +19,11 @@ using namespace std;
#include <math.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#define PI M_PI

View File

@@ -2,7 +2,11 @@
#ifndef BLOCK_H
#define BLOCK_H
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "macrodef.h" //need dim here; Vertex or Cell
#include "var.h"
#include "MyList.h"

View File

@@ -4,7 +4,11 @@
#include <stdarg.h>
#include <string.h>
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "myglobal.h"

View File

@@ -442,6 +442,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
Bp = Bp->next;
}
}
// Replace MPI_Allreduce with per-owner MPI_Bcast:
// Group consecutive points by owner rank and broadcast each group.
// Since each point's data is non-zero only on the owner rank,
@@ -506,9 +507,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
// Targeted point-to-point overload: each owner sends each point only to
// the one rank that needs it for integration (consumer), reducing
// communication volume by ~nprocs times compared to the Bcast version.
/*
double t_calc_end, t_calc_total = 0;
double t_calc_start = MPI_Wtime();*/
int myrank, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
@@ -609,15 +607,12 @@ void Patch::Interp_Points(MyList<var> *VarList,
Bp = Bp->next;
}
}
/*
t_calc_end = MPI_Wtime();
t_calc_total = t_calc_end - t_calc_start;*/
// --- Error check for unfound points ---
for (int j = 0; j < NN; j++)
{
if (owner_rank[j] < 0 && myrank == 0)
{
cout<<owner_rank[j-1]<<endl;
cout << "ERROR: Patch::Interp_Points fails to find point (";
for (int d = 0; d < dim; d++)
{
@@ -769,63 +764,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
delete[] recv_count;
delete[] consumer_rank;
delete[] owner_rank;
/*
// 4. Gather and report the slowest genuinely-working ranks (Top 4)
struct RankStats {
int rank;
double calc_time; // net computation time
};
// Build this rank's statistics
RankStats local_stat;
local_stat.rank = myrank;
local_stat.calc_time = t_calc_total;
// Allocate storage for every rank's statistics
RankStats *all_stats = nullptr;
if (myrank == 0) {
all_stats = new RankStats[nprocs];
}
// Use MPI_Gather to collect every rank's data on rank 0
MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
all_stats, sizeof(RankStats), MPI_BYTE,
0, MPI_COMM_WORLD);
// Prepare the top-rank info for output; all ranks participate so the data is consistent after the broadcast
int top10_ranks[10] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
double top10_times[10] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
int num_top10 = 0;
if (myrank == 0) {
// Sort by calc_time (net computation time)
std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
return a.calc_time > b.calc_time;
});
// Take the top entries (up to 10)
num_top10 = (nprocs < 10) ? nprocs : 10;
for (int i = 0; i < num_top10; i++) {
top10_ranks[i] = all_stats[i].rank;
top10_times[i] = all_stats[i].calc_time;
}
printf("\n--- Top %d Ranks by ACTIVE COMPUTATION (CPU Time) ---\n", num_top10);
for (int i = 0; i < num_top10; i++) {
printf("Rank [%4d]: Calc %.6f s\n", top10_ranks[i], top10_times[i]);
}
// Free the allocated memory
delete[] all_stats;
}
// Broadcast the top-rank info to all processes
MPI_Bcast(&num_top10, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (num_top10 > 0) {
MPI_Bcast(top10_ranks, 10, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(top10_times, 10, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
*/
}
void Patch::Interp_Points(MyList<var> *VarList,
int NN, double **XX,

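A hedged sketch of the per-owner broadcast strategy described in the comments above (group consecutive points by owner rank and MPI_Bcast each contiguous run); the array names, the layout of one block of nvar doubles per point, and the function signature are assumptions, not the repository's code:

```cpp
#include <mpi.h>
#include <cstddef>

// Broadcast each run of consecutive points that share an owner rank.
// Points whose owner was never found (owner_rank[j] < 0) are skipped.
void bcast_by_owner(double *data, const int *owner_rank, int NN, int nvar)
{
    int j = 0;
    while (j < NN)
    {
        const int owner = owner_rank[j];
        const int start = j;
        while (j < NN && owner_rank[j] == owner) ++j;     // extend the run
        if (owner >= 0)
            MPI_Bcast(data + (std::size_t)start * nvar,
                      (j - start) * nvar, MPI_DOUBLE, owner, MPI_COMM_WORLD);
    }
}
```

The targeted point-to-point overload mentioned above replaces each broadcast with a single send/receive pair between the owner and the one consumer rank, which is where the roughly nprocs-fold reduction in communication volume comes from.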
View File

@@ -2,7 +2,11 @@
#ifndef PATCH_H
#define PATCH_H
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "MyList.h"
#include "Block.h"
#include "var.h"

View File

@@ -8,7 +8,11 @@
#include <limits.h>
#include <float.h>
#include <math.h>
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "util_Table.h"
#include "cctk.h"

View File

@@ -23,8 +23,11 @@ using namespace std;
#include <complex.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#include <memory.h>
#endif
#include "MyList.h"
#include "Block.h"
#include "Parallel.h"

View File

@@ -23,7 +23,11 @@ using namespace std;
#include <complex.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "MyList.h"
#include "Block.h"
#include "Parallel.h"

File diff suppressed because it is too large

View File

@@ -11,7 +11,7 @@
#include <cmath>
#include <new>
using namespace std;
#include <memory.h>
#include "Parallel_bam.h"
#include "var.h"
#include "MPatch.h"
@@ -32,25 +32,6 @@ namespace Parallel
int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 dimensions
int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
MyList<Block> *distribute_hard(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
int ib0_orig, int ib3_orig,
int jb1_orig, int jb4_orig,
int kb2_orig, int kb5_orig,
Patch* PP, int r_1, int r_2,
int ingfsi, int fngfsi, bool periodic,
Block* &split_first_block, Block* &split_last_block);
Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
int ib0_orig, int ib3_orig,
int jb1_orig, int jb4_orig,
int kb2_orig, int kb5_orig,
Patch* PP, int r_1, int r_2, int r_3, int r_4,
int ingfsi, int fngfsi, bool periodic,
Block* &split_first_block, Block* &split_last_block);
Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
int block_id, int ingfsi, int fngfsi, int lev);
void KillBlocks(MyList<Patch> *PatchLIST);
void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
@@ -116,7 +97,6 @@ Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
MPI_Request *reqs;
MPI_Status *stats;
int max_reqs;
bool lengths_valid;
SyncCache();
void invalidate();
void destroy();
@@ -149,15 +129,6 @@ Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
void Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
void OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
void OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
void Prolong(Patch *Patc, Patch *Patf,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
int Symmetry);
@@ -227,18 +198,6 @@ Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0);
// Redistribute blocks with time statistics for load balancing
MyList<Block> *distribute(MyList<Patch> *PatchLIST, MyList<Block> *OldBlockL,
int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0);
#endif
// Dynamic load balancing: split blocks for heavy ranks
void split_heavy_blocks(MyList<Patch> *PatL, int *heavy_ranks, int num_heavy,
int split_factor, int cpusize, int ingfsi, int fngfsi);
// Check if load balancing is needed based on interpolation times
bool check_load_balance_need(double *rank_times, int nprocs, int &num_heavy, int *heavy_ranks);
}
#endif /*PARALLEL_H */
}
#endif /*PARALLEL_H */

View File

@@ -2,7 +2,11 @@
#ifndef SHELLPATCH_H
#define SHELLPATCH_H
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "MyList.h"
#include "Block.h"
#include "Parallel.h"

View File

@@ -19,7 +19,11 @@ using namespace std;
#include <math.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "cgh.h"
#include "ShellPatch.h"

View File

@@ -19,7 +19,11 @@ using namespace std;
#include <math.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "cgh.h"
#include "ShellPatch.h"

View File

@@ -19,7 +19,11 @@ using namespace std;
#include <math.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "cgh.h"
#include "ShellPatch.h"

View File

@@ -5819,11 +5819,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5870,11 +5880,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -5949,11 +5969,21 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5971,11 +6001,21 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -6036,11 +6076,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6060,11 +6110,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6101,11 +6161,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
}
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6114,11 +6184,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
else // no time refinement levels and for all same time levels
{
#if (RPB == 0)
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
Pp = GH->PatL[lev];
while (Pp)
{
#if (MIXOUTB == 0)
Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
#endif
Pp = Pp->next;
}
Ppc = Ppc->next;
}
#elif (RPB == 1)
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);

View File

@@ -19,7 +19,11 @@ using namespace std;
#include <math.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "macrodef.h"
#include "cgh.h"

View File

@@ -19,7 +19,11 @@ using namespace std;
#include <math.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "macrodef.h"
#include "cgh.h"

View File

@@ -945,60 +945,103 @@
SSA(2)=SYM
SSA(3)=ANTI
!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
! lopsided_kodis shares the symmetry_bd buffer between advection and
! dissipation, eliminating redundant full-grid copies. For metric variables
! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
! so the constant offset has no effect on dissipation.
!!!!!!!!!advection term part
call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS)
call lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA)
call lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA)
call lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
call lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
call lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
call lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS)
call lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA)
call lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA)
call lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS)
call lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
#if 1
!! bam does not apply dissipation on gauge variables
call lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps)
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
call lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
#endif
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
call lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
call lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
#endif
#else
! No dissipation on gauge variables (advection only)
call lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA)
!!
call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS)
#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA)
#endif
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS)
call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS)
call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA)
#endif
if(eps>0)then
! usual Kreiss-Oliger dissipation
call kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps)
call kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps)
call kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps)
call kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps)
#if 0
#define i 42
#define j 40
#define k 40
if(Lev == 1)then
write(*,*) X(i),Y(j),Z(k)
write(*,*) "before",Axx_rhs(i,j,k)
endif
#undef i
#undef j
#undef k
!!stop
#endif
call kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps)
#if 0
#define i 42
#define j 40
#define k 40
if(Lev == 1)then
write(*,*) X(i),Y(j),Z(k)
write(*,*) "after",Axx_rhs(i,j,k)
endif
#undef i
#undef j
#undef k
!!stop
#endif
call kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps)
call kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps)
call kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps)
call kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps)
#if 1
!! bam does not apply dissipation on gauge variables
call kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps)
call kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps)
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
call kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps)
call kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps)
call kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps)
#endif
#endif
endif
if(co == 0)then
! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho

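A one-line check of the claim in the removed comment above, that the kodis stencil coefficients sum to zero so the constant offset in gxx = dxx + 1 cannot contribute to the dissipation: with the per-direction weights used in lopsided_kodis (1, -6, 15, -20, 15, -6, 1),

1 - 6 + 15 - 20 + 15 - 6 + 1 = 0,

so applying the dissipation operator to f + 1 gives exactly the same result as applying it to f.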
View File

@@ -20,7 +20,11 @@ using namespace std;
#include <map.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "macrodef.h"
#include "misc.h"
@@ -43,14 +47,6 @@ cgh::cgh(int ingfsi, int fngfsi, int Symmetry, char *filename, int checkrun,
end_rank = 0;
#endif
// Initialize load balancing variables
enable_load_balance = false;
load_balance_check_interval = 10; // Check every 10 time steps
current_time_step = 0;
rank_interp_times = nullptr;
heavy_ranks = nullptr;
num_heavy_ranks = 0;
if (!checkrun)
{
read_bbox(Symmetry, filename);
@@ -121,12 +117,6 @@ cgh::~cgh()
delete[] Porgls[lev];
}
delete[] Porgls;
// Clean up load balancing memory
if (rank_interp_times)
delete[] rank_interp_times;
if (heavy_ranks)
delete[] heavy_ranks;
}
//================================================================================================
@@ -144,7 +134,7 @@ void cgh::compose_cgh(int nprocs)
for (int lev = 0; lev < levels; lev++)
{
checkPatchList(PatL[lev], false);
Parallel::distribute_hard(PatL[lev], nprocs, ingfs, fngfs, false);
Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false);
#if (RPB == 1)
// we need distributed box of PatL[lev] and PatL[lev-1]
if (lev > 0)
@@ -1719,121 +1709,3 @@ void cgh::settrfls(const int lev)
{
trfls = lev;
}
//================================================================================================
// Load Balancing Functions
//================================================================================================
// Initialize load balancing
void cgh::init_load_balance(int nprocs)
{
if (rank_interp_times)
delete[] rank_interp_times;
if (heavy_ranks)
delete[] heavy_ranks;
rank_interp_times = new double[nprocs];
heavy_ranks = new int[4]; // Maximum 4 heavy ranks
num_heavy_ranks = 0;
for (int i = 0; i < nprocs; i++)
rank_interp_times[i] = 0.0;
}
// Update interpolation time for a rank
void cgh::update_interp_time(int rank, double time)
{
if (rank_interp_times && rank >= 0)
{
rank_interp_times[rank] = time;
}
}
// Check and perform load balancing if needed
bool cgh::check_and_rebalance(int nprocs, int lev,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
// Only check at specified intervals
current_time_step++;
if (current_time_step % load_balance_check_interval != 0)
return false;
if (myrank == 0)
{
cout << "\n=== Checking load balance at time step " << current_time_step << " ===" << endl;
}
// Collect all rank times on rank 0
double *all_times = nullptr;
if (myrank == 0)
{
all_times = new double[nprocs];
}
MPI_Gather(rank_interp_times, 1, MPI_DOUBLE, all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
bool need_rebalance = false;
if (myrank == 0)
{
// Check if load balancing is needed
need_rebalance = Parallel::check_load_balance_need(all_times, nprocs, num_heavy_ranks, heavy_ranks);
if (need_rebalance)
{
cout << "=== Load imbalance detected! Need to rebalance ===" << endl;
cout << "Top " << num_heavy_ranks << " heavy ranks: ";
for (int i = 0; i < num_heavy_ranks; i++)
{
cout << heavy_ranks[i] << " (" << all_times[heavy_ranks[i]] << " s) ";
}
cout << endl;
// Analyze blocks that need to be split
Parallel::split_heavy_blocks(PatL[lev], heavy_ranks, num_heavy_ranks, 2, nprocs, ingfs, fngfs);
// Set lev_flag to trigger recompose_cgh
cout << "=== Triggering recompose_cgh for level " << lev << " ===" << endl;
}
else
{
cout << "=== Load is balanced, no rebalancing needed ===" << endl;
}
delete[] all_times;
}
// Broadcast the decision to all ranks
MPI_Bcast(&need_rebalance, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
if (need_rebalance)
{
// Broadcast heavy ranks information
MPI_Bcast(&num_heavy_ranks, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(heavy_ranks, num_heavy_ranks, MPI_INT, 0, MPI_COMM_WORLD);
// Perform recompose_cgh on the specified level
if (myrank == 0)
{
cout << "=== Performing recompose_cgh ===" << endl;
}
// Call recompose_cgh_Onelevel for the specified level
bool *lev_flag = new bool[1];
lev_flag[0] = true;
recompose_cgh_Onelevel(nprocs, lev, OldList, StateList, FutureList, tmList, Symmetry, BB);
delete[] lev_flag;
// Reset time counter after rebalancing
current_time_step = 0;
return true;
}
return false;
}

View File

@@ -2,7 +2,11 @@
#ifndef CGH_H
#define CGH_H
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "MyList.h"
#include "MPatch.h"
#include "macrodef.h"
@@ -87,21 +91,6 @@ public:
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
void construct_mylev(int nprocs);
#endif
// Load balancing support
bool enable_load_balance; // Enable load balancing
int load_balance_check_interval; // Check interval (in time steps)
int current_time_step; // Current time step counter
double *rank_interp_times; // Store interpolation times for each rank
int *heavy_ranks; // Store heavy rank numbers
int num_heavy_ranks; // Number of heavy ranks
void init_load_balance(int nprocs);
void update_interp_time(int rank, double time);
bool check_and_rebalance(int nprocs, int lev,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB);
};
#endif /* CGH_H */

View File

@@ -19,7 +19,11 @@ using namespace std;
#include <time.h>
#include <stdlib.h>
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "var.h"
#include "MyList.h"

View File

@@ -69,8 +69,7 @@
fy = ZEO
fz = ZEO
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -153,6 +152,7 @@
fx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -220,6 +220,7 @@
fy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -284,6 +285,7 @@
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -373,8 +375,7 @@
fxz = ZEO
fyz = ZEO
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -473,6 +474,7 @@
fxx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -535,6 +537,7 @@
fyy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -598,6 +601,7 @@
fzz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -661,6 +665,7 @@
fxy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -723,6 +728,7 @@
fxz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -784,6 +790,7 @@
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -870,6 +877,7 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1001,6 +1009,7 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1155,6 +1164,7 @@
fx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1231,6 +1241,7 @@
fy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1301,6 +1312,7 @@
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1405,6 +1417,7 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1580,6 +1593,7 @@
fxx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1647,6 +1661,7 @@
fyy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1716,6 +1731,7 @@
fzz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1785,6 +1801,7 @@
fxy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1855,6 +1872,7 @@
fxz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1923,6 +1941,7 @@
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2015,6 +2034,7 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2131,6 +2151,7 @@
fx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2216,6 +2237,7 @@
fy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2292,6 +2314,7 @@
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2410,6 +2433,7 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2597,6 +2621,7 @@
fxx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2669,6 +2694,7 @@
fyy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2744,6 +2770,7 @@
fzz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2819,6 +2846,7 @@
fxy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2899,6 +2927,7 @@
fxz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2977,6 +3006,7 @@
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3084,6 +3114,7 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3220,6 +3251,7 @@
fx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3315,6 +3347,7 @@
fy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3399,6 +3432,7 @@
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3534,6 +3568,7 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3806,6 +3841,7 @@
fxx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3887,6 +3923,7 @@
fyy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3971,6 +4008,7 @@
fzz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -4055,6 +4093,7 @@
fxy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -4157,6 +4196,7 @@
fxz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -4257,6 +4297,7 @@
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1

View File

@@ -81,6 +81,7 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -179,6 +180,7 @@
fx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -262,6 +264,7 @@
fy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -342,6 +345,7 @@
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -443,6 +447,7 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -553,6 +558,7 @@
fxx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -627,6 +633,7 @@
fyy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -702,6 +709,7 @@
fzz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -777,6 +785,7 @@
fxy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -851,6 +860,7 @@
fxz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -924,6 +934,7 @@
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1019,6 +1030,7 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1134,6 +1146,7 @@
fx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1227,6 +1240,7 @@
fy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1314,6 +1328,7 @@
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1430,6 +1445,7 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1580,6 +1596,7 @@
fxx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1659,6 +1676,7 @@
fyy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1740,6 +1758,7 @@
fzz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1821,6 +1840,7 @@
fxy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1903,6 +1923,7 @@
fxz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1983,6 +2004,7 @@
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -2087,6 +2109,7 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -2219,6 +2242,7 @@
fx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -2321,6 +2345,7 @@
fy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -2414,6 +2439,7 @@
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -2544,6 +2570,7 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -2743,6 +2770,7 @@
fxx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -2827,6 +2855,7 @@
fyy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -2914,6 +2943,7 @@
fzz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -3001,6 +3031,7 @@
fxy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -3093,6 +3124,7 @@
fxz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -3183,6 +3215,7 @@
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -3302,6 +3335,7 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -3454,6 +3488,7 @@
fx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -3566,6 +3601,7 @@
fy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -3667,6 +3703,7 @@
fz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -3814,6 +3851,7 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -4098,6 +4136,7 @@
fxx = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -4191,6 +4230,7 @@
fyy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -4287,6 +4327,7 @@
fzz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -4383,6 +4424,7 @@
fxy = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -4497,6 +4539,7 @@
fxz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -4609,6 +4652,7 @@
fyz = ZEO
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -4679,6 +4723,7 @@ subroutine fderivs_shc(ex,f,fx,fy,fz,crho,sigma,R,SYM1,SYM2,SYM3,Symmetry,Lev,ss
#if 0
integer :: i,j,k
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -4729,6 +4774,7 @@ subroutine fdderivs_shc(ex,f,fxx,fxy,fxz,fyy,fyz,fzz,crho,sigma,R,SYM1,SYM2,SYM3
#if 0
integer :: i,j,k
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)

View File

@@ -27,6 +27,7 @@
!~~~~~~>
!$omp parallel do collapse(2) private(i,j,k,lgxx,lgyy,lgzz,ldetg,lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz,ltrA,lscale)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -104,6 +105,7 @@
!~~~~~~>
!$omp parallel do collapse(2) private(i,j,k,lgxx,lgyy,lgzz,lscale,lgxy,lgxz,lgyz,lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz,ltrA)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)

View File

@@ -6,7 +6,11 @@
#include <stdio.h>
#include <assert.h>
#include <math.h>
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "util_Table.h"
#include "cctk.h"

View File

@@ -6,7 +6,11 @@
#include <stdio.h>
#include <assert.h>
#include <math.h>
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "cctk.h"

View File

@@ -883,17 +883,13 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
integer::i
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
enddo
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1
funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
enddo
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
do i=0,ord-1
funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
enddo
@@ -1116,7 +1112,6 @@ end subroutine d2dump
! Lagrangian polynomial interpolation
!------------------------------------------------------------------------------
!DIR$ ATTRIBUTES FORCEINLINE :: polint
subroutine polint(xa, ya, x, y, dy, ordn)
implicit none

View File

@@ -65,8 +65,7 @@ real*8,intent(in) :: eps
! dx^4
! note the sign (-1)^r-1, now r=2
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
!DIR$ UNROLL PARTIAL(4)
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -161,6 +160,7 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
call symmetry_bd(3,ex,f,fh,SoA)
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -275,6 +275,7 @@ real*8,intent(in) :: eps
! dx^8
! note the sign (-1)^r-1, now r=4
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -387,6 +388,7 @@ real*8,intent(in) :: eps
! dx^10
! note the sign (-1)^r-1, now r=5
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)

View File

@@ -80,6 +80,7 @@ real*8,intent(in) :: eps
! dx^4
! note the sign (-1)^r-1, now r=2
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -178,6 +179,7 @@ real*8,intent(in) :: eps
! dx^4
! note the sign (-1)^r-1, now r=2
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -273,6 +275,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
call symmetry_stbd(2,ex,f,fh,SoA)
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -369,6 +372,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
call symmetry_stbd(3,ex,f,fh,SoA)
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -510,6 +514,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
call symmetry_stbd(3,ex,f,fh,SoA)
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -598,6 +603,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
call symmetry_stbd(3,ex,f,fh,SoA)
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -694,6 +700,7 @@ real*8,intent(in) :: eps
! dx^8
! note the sign (-1)^r-1, now r=4
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -794,6 +801,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
call symmetry_stbd(4,ex,f,fh,SoA)
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -903,6 +911,7 @@ real*8,intent(in) :: eps
! dx^10
! note the sign (-1)^r-1, now r=5
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
@@ -1006,6 +1015,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
call symmetry_stbd(5,ex,f,fh,SoA)
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)

View File

@@ -68,6 +68,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -233,6 +234,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -487,201 +489,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
end subroutine lopsided
!-----------------------------------------------------------------------------
! Combined advection (lopsided) + Kreiss-Oliger dissipation (kodis)
! Shares the symmetry_bd buffer fh, eliminating one full-grid copy per call.
! Mathematically identical to calling lopsided then kodis separately.
!-----------------------------------------------------------------------------
subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
implicit none
!~~~~~~> Input parameters:
integer, intent(in) :: ex(1:3),Symmetry
real*8, intent(in) :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
real*8,dimension(ex(1),ex(2),ex(3)),intent(in) :: f,Sfx,Sfy,Sfz
real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
real*8,dimension(3),intent(in) ::SoA
real*8,intent(in) :: eps
!~~~~~~> local variables:
! note index -2,-1,0, so we have 3 extra points
real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3)) :: fh
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
real*8 :: dX,dY,dZ
real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
real*8, parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
real*8, parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
real*8, parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
! kodis parameters
real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
real*8, parameter :: cof=6.4d1 ! 2^6
dX = X(2)-X(1)
dY = Y(2)-Y(1)
dZ = Z(2)-Z(1)
d12dx = ONE/F12/dX
d12dy = ONE/F12/dY
d12dz = ONE/F12/dZ
d2dx = ONE/TWO/dX
d2dy = ONE/TWO/dY
d2dz = ONE/TWO/dZ
imax = ex(1)
jmax = ex(2)
kmax = ex(3)
imin = 1
jmin = 1
kmin = 1
if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
! Single symmetry_bd call shared by both advection and dissipation
call symmetry_bd(3,ex,f,fh,SoA)
! ---- Advection (lopsided) loop ----
! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
! x direction
if(Sfx(i,j,k) > ZEO)then
if(i+3 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
-F6*fh(i+2,j,k)+ fh(i+3,j,k))
elseif(i+2 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i+1 <= imax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
-F6*fh(i-2,j,k)+ fh(i-3,j,k))
endif
elseif(Sfx(i,j,k) < ZEO)then
if(i-3 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
-F6*fh(i-2,j,k)+ fh(i-3,j,k))
elseif(i-2 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
elseif(i-1 >= imin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
-F6*fh(i+2,j,k)+ fh(i+3,j,k))
endif
endif
! y direction
if(Sfy(i,j,k) > ZEO)then
if(j+3 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
-F6*fh(i,j+2,k)+ fh(i,j+3,k))
elseif(j+2 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j+1 <= jmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
-F6*fh(i,j-2,k)+ fh(i,j-3,k))
endif
elseif(Sfy(i,j,k) < ZEO)then
if(j-3 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
-F6*fh(i,j-2,k)+ fh(i,j-3,k))
elseif(j-2 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
elseif(j-1 >= jmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
-F6*fh(i,j+2,k)+ fh(i,j+3,k))
endif
endif
! z direction
if(Sfz(i,j,k) > ZEO)then
if(k+3 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
-F6*fh(i,j,k+2)+ fh(i,j,k+3))
elseif(k+2 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k+1 <= kmax)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
-F6*fh(i,j,k-2)+ fh(i,j,k-3))
endif
elseif(Sfz(i,j,k) < ZEO)then
if(k-3 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)- &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
-F6*fh(i,j,k-2)+ fh(i,j,k-3))
elseif(k-2 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
elseif(k-1 >= kmin)then
f_rhs(i,j,k)=f_rhs(i,j,k)+ &
Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
-F6*fh(i,j,k+2)+ fh(i,j,k+3))
endif
endif
enddo
enddo
enddo
! ---- Dissipation (kodis) loop ----
if(eps > ZEO) then
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
if(i-3 >= imin .and. i+3 <= imax .and. &
j-3 >= jmin .and. j+3 <= jmax .and. &
k-3 >= kmin .and. k+3 <= kmax) then
f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof *( ( &
(fh(i-3,j,k)+fh(i+3,j,k)) - &
SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
TWT* fh(i,j,k) )/dX + &
( &
(fh(i,j-3,k)+fh(i,j+3,k)) - &
SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
TWT* fh(i,j,k) )/dY + &
( &
(fh(i,j,k-3)+fh(i,j,k+3)) - &
SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
TWT* fh(i,j,k) )/dZ )
endif
enddo
enddo
enddo
endif
return
end subroutine lopsided_kodis
#elif (ghost_width == 4)
! sixth order code
! Compute advection terms in right hand sides of field equations
@@ -753,6 +560,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -969,6 +777,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
! upper bound set ex-1 only for efficiency,
! the loop body will set ex 0 also
!$omp parallel do collapse(2) private(i,j,k)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1

View File

@@ -96,7 +96,7 @@ misc.o : zbesh.o
# projects
ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)

View File

@@ -6,25 +6,23 @@
## Intel oneAPI version with oneMKL (Optimized for performance)
filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Using OpenMP-threaded MKL for parallel performance
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lifcore -limf -lpthread -lm -ldl
## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization)
## -fprofile-instr-use: use collected profile data to guide optimization decisions
## (branch prediction, basic block layout, inlining, loop unrolling)
PROFDATA = ../../pgo_profile/default.profdata
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(PROFDATA) \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(PROFDATA) \
PROFDATA = /home/amss/AMSS-NCKU/pgo_profile/default.profdata
CXXAPPFLAGS = -O3 -march=native -fp-model fast=2 -fma -ipo -qopenmp \
-DMPI_STUB -Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -march=native -fp-model fast=2 -fma -ipo -qopenmp \
-align array64byte -fpp -I${MKLROOT}/include
f90 = ifx
f77 = ifx
CXX = icpx
CC = icx
CLINKER = mpiicpx
CLINKER = icpx
Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include

View File

@@ -14,7 +14,11 @@ using namespace std;
#include <string.h>
#include <math.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "misc.h"
#include "macrodef.h"

View File

@@ -24,7 +24,11 @@ using namespace std;
#include <complex.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
namespace misc
{

View File

@@ -20,7 +20,11 @@ using namespace std;
#endif
#include <time.h>
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
class monitor
{

AMSS_NCKU_source/mpi_stub.h (new file, 153 lines)
View File

@@ -0,0 +1,153 @@
#ifndef MPI_STUB_H
#define MPI_STUB_H
/*
* MPI Stub Header — single-process shim for AMSS-NCKU ABE solver.
* Provides all MPI types, constants, and functions used in the codebase
* as no-ops or trivial implementations for nprocs=1, myrank=0.
*/
#include <cstring>
#include <cstdlib>
#include <cstdio>
#include <time.h>
/* ── Types ─────────────────────────────────────────────────────────── */
typedef int MPI_Comm;
typedef int MPI_Datatype;
typedef int MPI_Op;
typedef int MPI_Request;
typedef int MPI_Group;
typedef struct MPI_Status {
int MPI_SOURCE;
int MPI_TAG;
int MPI_ERROR;
} MPI_Status;
/* ── Constants ─────────────────────────────────────────────────────── */
#define MPI_COMM_WORLD 0
#define MPI_INT 1
#define MPI_DOUBLE 2
#define MPI_DOUBLE_PRECISION 2
#define MPI_DOUBLE_INT 3
#define MPI_SUM 1
#define MPI_MAX 2
#define MPI_MAXLOC 3
#define MPI_STATUS_IGNORE ((MPI_Status *)0)
#define MPI_STATUSES_IGNORE ((MPI_Status *)0)
#define MPI_MAX_PROCESSOR_NAME 256
/* ── Helper: sizeof for MPI_Datatype ──────────────────────────────── */
static inline size_t mpi_stub_sizeof(MPI_Datatype type) {
switch (type) {
case MPI_INT: return sizeof(int);
case MPI_DOUBLE: return sizeof(double);
case MPI_DOUBLE_INT: return sizeof(double) + sizeof(int);
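/* note: this assumes a tightly packed {double,int} pair; if callers pass an array of C structs {double; int;} (typically padded to 16 bytes), count > 1 would need sizeof(that struct) here instead */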
default: return 0;
}
}
/* ── Init / Finalize ──────────────────────────────────────────────── */
static inline int MPI_Init(int *, char ***) { return 0; }
static inline int MPI_Finalize() { return 0; }
/* ── Communicator queries ─────────────────────────────────────────── */
static inline int MPI_Comm_rank(MPI_Comm, int *rank) { *rank = 0; return 0; }
static inline int MPI_Comm_size(MPI_Comm, int *size) { *size = 1; return 0; }
static inline int MPI_Comm_split(MPI_Comm comm, int, int, MPI_Comm *newcomm) {
*newcomm = comm;
return 0;
}
static inline int MPI_Comm_free(MPI_Comm *) { return 0; }
/* ── Group operations ─────────────────────────────────────────────── */
static inline int MPI_Comm_group(MPI_Comm, MPI_Group *group) {
*group = 0;
return 0;
}
static inline int MPI_Group_translate_ranks(MPI_Group, int n,
const int *ranks1, MPI_Group, int *ranks2) {
for (int i = 0; i < n; ++i) ranks2[i] = ranks1[i];
return 0;
}
static inline int MPI_Group_free(MPI_Group *) { return 0; }
/* ── Collective operations ────────────────────────────────────────── */
static inline int MPI_Allreduce(const void *sendbuf, void *recvbuf,
int count, MPI_Datatype datatype, MPI_Op, MPI_Comm) {
std::memcpy(recvbuf, sendbuf, count * mpi_stub_sizeof(datatype));
return 0;
}
static inline int MPI_Iallreduce(const void *sendbuf, void *recvbuf,
int count, MPI_Datatype datatype, MPI_Op, MPI_Comm,
MPI_Request *request) {
std::memcpy(recvbuf, sendbuf, count * mpi_stub_sizeof(datatype));
*request = 0;
return 0;
}
static inline int MPI_Bcast(void *, int, MPI_Datatype, int, MPI_Comm) {
return 0;
}
static inline int MPI_Barrier(MPI_Comm) { return 0; }
/* ── Point-to-point (never reached with nprocs=1) ─────────────────── */
static inline int MPI_Send(const void *, int, MPI_Datatype, int, int, MPI_Comm) {
return 0;
}
static inline int MPI_Recv(void *, int, MPI_Datatype, int, int, MPI_Comm, MPI_Status *) {
return 0;
}
static inline int MPI_Isend(const void *, int, MPI_Datatype, int, int, MPI_Comm,
MPI_Request *req) {
*req = 0;
return 0;
}
static inline int MPI_Irecv(void *, int, MPI_Datatype, int, int, MPI_Comm,
MPI_Request *req) {
*req = 0;
return 0;
}
/* ── Completion ───────────────────────────────────────────────────── */
static inline int MPI_Wait(MPI_Request *, MPI_Status *) { return 0; }
static inline int MPI_Waitall(int, MPI_Request *, MPI_Status *) { return 0; }
/* ── Utility ──────────────────────────────────────────────────────── */
static inline int MPI_Abort(MPI_Comm, int error_code) {
std::fprintf(stderr, "MPI_Abort called with error code %d\n", error_code);
std::exit(error_code);
return 0;
}
static inline double MPI_Wtime() {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (double)ts.tv_sec + (double)ts.tv_nsec * 1.0e-9;
}
static inline int MPI_Get_processor_name(char *name, int *resultlen) {
const char *stub_name = "localhost";
std::strcpy(name, stub_name);
*resultlen = (int)std::strlen(stub_name);
return 0;
}
#endif /* MPI_STUB_H */
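The stub's Allreduce can be a plain copy because with a single rank every reduction over the set containing only the local contribution returns that contribution unchanged. A tiny Python model of that contract, purely illustrative and not part of the build:

def allreduce_single_rank(sendbuf):
    """Single-rank model of the stub's MPI_Allreduce.

    With nprocs == 1 the reduction (SUM, MAX, MAXLOC, ...) of the set
    containing only the local value is the local value itself, so a
    plain copy (memcpy in mpi_stub.h) reproduces the MPI result.
    """
    return list(sendbuf)

local = [1.0, 2.5, -3.0]
assert allreduce_single_rank(local) == local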

View File

@@ -24,7 +24,11 @@ using namespace std;
#include <map.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
namespace parameters
{

View File

@@ -30,7 +30,11 @@ using namespace std;
#include <sys/time.h>
#include <sys/resource.h>
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
/* Real time */
#define TimerSignal SIGALRM

View File

@@ -109,23 +109,33 @@
if( RK4 == 0 ) then
!$omp parallel workshare
f1 = f0 + HLF * dT * f_rhs
!$omp end parallel workshare
elseif(RK4 == 1 ) then
!$omp parallel workshare
f_rhs = f_rhs + TWO * f1
!$omp end parallel workshare
!$omp parallel workshare
f1 = f0 + HLF * dT * f1
!$omp end parallel workshare
elseif(RK4 == 2 ) then
!$omp parallel workshare
f_rhs = f_rhs + TWO * f1
!$omp end parallel workshare
!$omp parallel workshare
f1 = f0 + dT * f1
!$omp end parallel workshare
elseif( RK4 == 3 ) then
!$omp parallel workshare
f1 = f0 +F1o6 * dT *(f1 + f_rhs)
!$omp end parallel workshare
else
@@ -215,11 +225,15 @@
if( RK4 == 0 ) then
!$omp parallel workshare
f1 = f0 + dT * f_rhs
!$omp end parallel workshare
else
!$omp parallel workshare
f1 = f0 + HLF * dT * (f1+f_rhs)
!$omp end parallel workshare
endif
@@ -239,7 +253,9 @@
real*8, dimension(ex(1),ex(2),ex(3)),intent(in) ::f_rhs
real*8, dimension(ex(1),ex(2),ex(3)),intent(out) ::f1
!$omp parallel workshare
f1 = f0 + dT * f_rhs
!$omp end parallel workshare
return
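Read together, the four branches above appear to implement classic fourth-order Runge-Kutta, with f_rhs acting as an accumulator for k1 + 2*k2 + 2*k3 and f1 holding, in turn, the freshly evaluated stage derivative and then the next stage state. A hedged Python sketch of that staging, assuming (as the formulas suggest) that the RHS routine overwrites f1 with the stage derivative between calls; here that role is played by 'k':

import math

def rk4_step(f0, dt, rhs):
    """Classic RK4 written in the accumulator style of the branches above."""
    k = rhs(f0)                  # stage 0: k1
    acc = k                      # f_rhs holds k1
    f1 = f0 + 0.5 * dt * k

    k = rhs(f1)                  # stage 1: k2
    acc = acc + 2.0 * k          # f_rhs = k1 + 2*k2
    f1 = f0 + 0.5 * dt * k

    k = rhs(f1)                  # stage 2: k3
    acc = acc + 2.0 * k          # f_rhs = k1 + 2*k2 + 2*k3
    f1 = f0 + dt * k

    k = rhs(f1)                  # stage 3: k4
    return f0 + dt / 6.0 * (k + acc)

# quick check against y' = y, whose exact one-step factor is exp(dt)
assert abs(rk4_step(1.0, 0.1, lambda y: y) - math.exp(0.1)) < 1e-6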

View File

@@ -19,7 +19,11 @@ using namespace std;
#include <math.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "cgh.h"
#include "ShellPatch.h"

View File

@@ -18,7 +18,11 @@ using namespace std;
#include <math.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "misc.h"
#include "microdef.h"

View File

@@ -3,7 +3,11 @@
#include <math.h>
#include <string.h>
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "util_Table.h"
#include "cctk.h"

View File

@@ -11,8 +11,6 @@
#include <strstream>
#include <cmath>
#include <map>
#include <vector>
#include <algorithm>
using namespace std;
#else
#include <iostream.h>
@@ -22,7 +20,11 @@ using namespace std;
#include <math.h>
#include <map.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "misc.h"
#include "cgh.h"
@@ -240,6 +242,9 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
shellf = new double[n_tot * InList];
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
//|~~~~~> Integrate the dot product of Dphi with the surface normal.
double *RP_out, *IP_out;
RP_out = new double[NN];
IP_out = new double[NN];
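For orientation, the quadrature in surf_Wave amounts to a surface integral over a coordinate sphere of radius rex. The sketch below is only a generic midpoint-rule version of such an integral; the real point list (pox, shellf), the weights and the integrand all come from Interp_Points and are not reproduced here, so every name in the sketch is an assumption.

import numpy as np

def sphere_surface_integral(func, rex, n_theta=64, n_phi=128):
    """Midpoint-rule integral of func(theta, phi) over a sphere of radius rex."""
    dtheta = np.pi / n_theta
    dphi = 2.0 * np.pi / n_phi
    theta = (np.arange(n_theta) + 0.5) * dtheta
    phi = (np.arange(n_phi) + 0.5) * dphi
    th, ph = np.meshgrid(theta, phi, indexing="ij")
    # area element on the coordinate sphere: rex^2 sin(theta) dtheta dphi
    return np.sum(func(th, ph) * np.sin(th)) * rex**2 * dtheta * dphi

# sanity check: the integral of 1 is the sphere area 4*pi*rex^2
area = sphere_surface_integral(lambda t, p: np.ones_like(t), rex=2.0)
assert abs(area - 4.0 * np.pi * 2.0**2) < 0.05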

View File

@@ -18,7 +18,11 @@ using namespace std;
#include <math.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "misc.h"
#include "macrodef.h"

View File

@@ -20,7 +20,11 @@ using namespace std;
#include <map.h>
#endif
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "misc.h"
#include "macrodef.h"

View File

@@ -9,7 +9,11 @@
using namespace std;
#include <time.h>
#ifdef MPI_STUB
#include "mpi_stub.h"
#else
#include <mpi.h>
#endif
#include "var.h"

View File

@@ -11,17 +11,30 @@
import AMSS_NCKU_Input as input_data
import subprocess
import time
import os
## OpenMP configuration for threaded Fortran kernels
## OMP_NUM_THREADS: set to number of physical cores (not hyperthreads)
## OMP_PROC_BIND: bind threads to cores to avoid migration overhead
## OMP_STACKSIZE: each thread needs stack space for fh arrays (~3.6MB)
if "OMP_NUM_THREADS" not in os.environ:
os.environ["OMP_NUM_THREADS"] = "96"
os.environ["OMP_STACKSIZE"] = "16M"
os.environ["OMP_PROC_BIND"] = "close"
os.environ["OMP_PLACES"] = "cores"
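The hard-coded 96 matches the 96-core target machine named in the commit message. If portability mattered, a hedged alternative would be to derive the default from the process's CPU affinity mask; this is illustrative only and not what the script does:

import os

if "OMP_NUM_THREADS" not in os.environ:
    # os.sched_getaffinity(0) (Linux-only) returns the set of cores this
    # process may run on, so it already reflects any taskset/cgroup limit.
    os.environ["OMP_NUM_THREADS"] = str(len(os.sched_getaffinity(0)))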
## CPU core binding configuration using taskset
## taskset ensures all child processes inherit the CPU affinity mask
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
#NUMACTL_CPU_BIND = "taskset -c 0-111"
NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
#NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
#NUMACTL_CPU_BIND = "taskset -c 8-15"
NUMACTL_CPU_BIND = ""
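Since the same core list would appear both in the taskset prefix and, implicitly, in OMP_NUM_THREADS, a small helper that counts the cores in a "16-47,64-95"-style specification could keep the two consistent. The helper below is hypothetical and not part of the repository:

def count_cores(spec):
    """Count the cores in a taskset-style list such as '16-47,64-95'."""
    if not spec:
        return 0
    total = 0
    for part in spec.split(","):
        if "-" in part:
            lo, hi = part.split("-")
            total += int(hi) - int(lo) + 1
        else:
            total += 1
    return total

assert count_cores("16-47,64-95") == 64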
## Build parallelism configuration
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 96
BUILD_JOBS = 16
##################################################################
@@ -117,27 +130,26 @@ def run_ABE():
## Define the command to run; cast other values to strings as needed
if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log"
run_command = NUMACTL_CPU_BIND + " ./ABE"
run_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log"
run_command = NUMACTL_CPU_BIND + " ./ABEGPU"
run_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output
mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
## Execute the command and stream output
run_process = subprocess.Popen(run_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
## Write ABE run output to file while printing to stdout
with open(mpi_command_outfile, 'w') as file0:
with open(run_command_outfile, 'w') as file0:
## Read and print output lines; also write each line to file
for line in mpi_process.stdout:
for line in run_process.stdout:
print(line, end='') # stream output in real time
file0.write(line) # write the line to file
file0.flush() # flush to ensure each line is written immediately (optional)
file0.close()
## Wait for the process to finish
mpi_return_code = mpi_process.wait()
run_return_code = run_process.wait()
print( )
print( " The ABE/ABEGPU simulation is finished " )

View File

@@ -1,29 +0,0 @@
import multiprocessing
def run_plot_task(task):
"""Execute a single plotting task.
Parameters
----------
task : tuple
A tuple of (function, args_tuple) where function is a callable
plotting function and args_tuple contains its arguments.
"""
func, args = task
return func(*args)
def run_plot_tasks_parallel(plot_tasks):
"""Execute a list of independent plotting tasks in parallel.
Uses the 'fork' context to create worker processes so that the main
script is NOT re-imported/re-executed in child processes.
Parameters
----------
plot_tasks : list of tuples
Each element is (function, args_tuple).
"""
ctx = multiprocessing.get_context('fork')
with ctx.Pool() as pool:
pool.map(run_plot_task, plot_tasks)

Binary file not shown.

Binary file not shown.

View File

@@ -11,8 +11,6 @@
import numpy ## numpy for array operations
import scipy ## scipy for interpolation and signal processing
import math
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt ## matplotlib for plotting
import os ## os for system/file operations

View File

@@ -8,23 +8,16 @@
##
#################################################
## Restrict OpenMP to one thread per process so that running
## many workers in parallel does not create an O(workers * BLAS_threads)
## thread explosion. The variable MUST be set before numpy/scipy
## are imported, because the BLAS library reads them only at load time.
import os
os.environ.setdefault("OMP_NUM_THREADS", "1")
import numpy
import scipy
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from mpl_toolkits.mplot3d import Axes3D
## import torch
import AMSS_NCKU_Input as input_data
import os
#########################################################################################
@@ -199,19 +192,3 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
####################################################################################
####################################################################################
## Allow this module to be run as a standalone script so that each
## binary-data plot can be executed in a fresh subprocess whose BLAS
## environment variables (set above) take effect before numpy loads.
##
## Usage: python3 plot_binary_data.py <filename> <binary_outdir> <figure_outdir>
####################################################################################
if __name__ == '__main__':
import sys
if len(sys.argv) != 4:
print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
sys.exit(1)
plot_binary_data(sys.argv[1], sys.argv[2], sys.argv[3])

View File

@@ -8,8 +8,6 @@
#################################################
import numpy ## numpy for array operations
import matplotlib
matplotlib.use('Agg') ## use non-interactive backend for multiprocessing safety
import matplotlib.pyplot as plt ## matplotlib for plotting
from mpl_toolkits.mplot3d import Axes3D ## needed for 3D plots
import glob
@@ -17,9 +15,6 @@ import os ## operating system utilities
import plot_binary_data
import AMSS_NCKU_Input as input_data
import subprocess
import sys
import multiprocessing
# plt.rcParams['text.usetex'] = True ## enable LaTeX fonts in plots
@@ -55,40 +50,10 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
file_list.append(x)
print(x)
## Plot each file in parallel using subprocesses.
## Each subprocess is a fresh Python process where the BLAS thread-count
## environment variables (set at the top of plot_binary_data.py) take
## effect before numpy is imported. This avoids the thread explosion
## that occurs when multiprocessing.Pool with 'fork' context inherits
## already-initialized multi-threaded BLAS from the parent.
script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
running = []
failed = []
## Plot each file in the list
for filename in file_list:
print(filename)
proc = subprocess.Popen(
[sys.executable, script, filename, binary_outdir, figure_outdir],
)
running.append( (proc, filename) )
## Keep at most max_workers subprocesses active at a time
if len(running) >= max_workers:
p, fn = running.pop(0)
p.wait()
if p.returncode != 0:
failed.append(fn)
## Wait for all remaining subprocesses to finish
for p, fn in running:
p.wait()
if p.returncode != 0:
failed.append(fn)
if failed:
print( " WARNING: the following binary data plots failed:" )
for fn in failed:
print( " ", fn )
plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
print( )
print( " Binary Data Plot Has been Finished " )