Add OpenMP parallelization to Fortran compute kernels

Add !$omp parallel do collapse(2) directives to all triple-loop stencil kernels (fderivs, fdderivs, fdx/fdy/fdz, kodis, lopsided, enforce_ag/enforce_ga) across all ghost_width variants. Add !$omp parallel workshare to RK4/ICN/Euler whole-array update routines. Build system: add -qopenmp to compile and link flags, switch MKL from sequential to threaded (-lmkl_intel_thread -liomp5). Runtime: set OMP_NUM_THREADS=96, OMP_STACKSIZE=16M, OMP_PROC_BIND=close, OMP_PLACES=cores for 96-core server target. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Remove MPI dependency, replace with single-process stub for non-MPI builds
2026-02-10 23:40:17 +08:00 · 2026-02-10 22:51:11 +08:00
56 changed files with 789 additions and 748 deletions
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,7 +16,7 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 64                             ## number of mpi processes used in the simulation
+MPI_processes    = 1                              ## number of processes (MPI removed, single-process mode)
 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -8,14 +8,6 @@
 ##
 ##################################################################
 ## Guard against re-execution by multiprocessing child processes.
 ## Without this, using 'spawn' or 'forkserver' context would cause every
 ## worker to re-run the entire script, spawning exponentially more
 ## workers (fork bomb).
 if __name__ != '__main__':
    import sys as _sys
    _sys.exit(0)
 ##################################################################
@@ -432,31 +424,26 @@ print(
 import plot_xiaoqu
 import plot_GW_strain_amplitude_xiaoqu
 from parallel_plot_helper import run_plot_tasks_parallel
 plot_tasks = []
 ## Plot black hole trajectory
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot,   (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_orbit_plot(   binary_results_directory, figure_directory )
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
 ## Plot black hole separation vs. time
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
 ## Plot gravitational waveforms (psi4 and strain amplitude)
 for i in range(input_data.Detector_Number):
-    plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
-    plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
 ## Plot ADM mass evolution
 for i in range(input_data.Detector_Number):
-    plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
 ## Plot Hamiltonian constraint violation over time
 for i in range(input_data.grid_level):
-    plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
 run_plot_tasks_parallel(plot_tasks)
 ## Plot stored binary data
 plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
--- a/AMSS_NCKU_source/ABE.C
+++ b/AMSS_NCKU_source/ABE.C
@@ -20,7 +20,11 @@ using namespace std;
 #include <map.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "misc.h"
 #include "macrodef.h"
--- a/AMSS_NCKU_source/Ansorg.h
+++ b/AMSS_NCKU_source/Ansorg.h
@@ -19,7 +19,11 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #define PI M_PI
--- a/AMSS_NCKU_source/Block.h
+++ b/AMSS_NCKU_source/Block.h
@@ -2,7 +2,11 @@
 #ifndef BLOCK_H
 #define BLOCK_H
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "macrodef.h" //need dim here; Vertex or Cell
 #include "var.h"
 #include "MyList.h"
--- a/AMSS_NCKU_source/IntPnts0.C
+++ b/AMSS_NCKU_source/IntPnts0.C
@@ -4,7 +4,11 @@
 #include <stdarg.h>
 #include <string.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "myglobal.h"
--- a/AMSS_NCKU_source/MPatch.h
+++ b/AMSS_NCKU_source/MPatch.h
@@ -2,7 +2,11 @@
 #ifndef PATCH_H
 #define PATCH_H
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "MyList.h"
 #include "Block.h"
 #include "var.h"
--- a/AMSS_NCKU_source/Newton.C
+++ b/AMSS_NCKU_source/Newton.C
@@ -8,7 +8,11 @@
 #include <limits.h>
 #include <float.h>
 #include <math.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "util_Table.h"
 #include "cctk.h"
--- a/AMSS_NCKU_source/NullShellPatch.h
+++ b/AMSS_NCKU_source/NullShellPatch.h
@@ -23,7 +23,11 @@ using namespace std;
 #include <complex.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "MyList.h"
 #include "Block.h"
 #include "Parallel.h"
--- a/AMSS_NCKU_source/NullShellPatch2.h
+++ b/AMSS_NCKU_source/NullShellPatch2.h
@@ -23,7 +23,11 @@ using namespace std;
 #include <complex.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "MyList.h"
 #include "Block.h"
 #include "Parallel.h"
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -3853,8 +3853,7 @@ void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmet
 Parallel::SyncCache::SyncCache()
    : valid(false), cpusize(0), combined_src(0), combined_dst(0),
      send_lengths(0), recv_lengths(0), send_bufs(0), recv_bufs(0),
-      send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0),
+      send_buf_caps(0), recv_buf_caps(0), reqs(0), stats(0), max_reqs(0)
      lengths_valid(false)
 {
 }
 // SyncCache invalidate: free grid segment lists but keep buffers
@@ -3872,7 +3871,6 @@ void Parallel::SyncCache::invalidate()
    send_lengths[i] = recv_lengths[i] = 0;
  }
  valid = false;
  lengths_valid = false;
 }
 // SyncCache destroy: free everything
 void Parallel::SyncCache::destroy()
@@ -4174,13 +4172,8 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
  {
    if (node == myrank)
    {
-      int length;
+      int length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
      if (!cache.lengths_valid) {
        length = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
      cache.recv_lengths[node] = length;
      } else {
        length = cache.recv_lengths[node];
      }
      if (length > 0)
      {
        if (length > cache.recv_buf_caps[node])
@@ -4194,13 +4187,8 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
    }
    else
    {
-      int slength;
+      int slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
      if (!cache.lengths_valid) {
        slength = data_packer(0, src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
      cache.send_lengths[node] = slength;
      } else {
        slength = cache.send_lengths[node];
      }
      if (slength > 0)
      {
        if (slength > cache.send_buf_caps[node])
@@ -4212,13 +4200,8 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
        data_packer(cache.send_bufs[node], src[myrank], dst[myrank], node, PACK, VarList, VarList, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 2, MPI_COMM_WORLD, cache.reqs + state.req_no++);
      }
-      int rlength;
+      int rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
      if (!cache.lengths_valid) {
        rlength = data_packer(0, src[node], dst[node], node, UNPACK, VarList, VarList, Symmetry);
      cache.recv_lengths[node] = rlength;
      } else {
        rlength = cache.recv_lengths[node];
      }
      if (rlength > 0)
      {
        if (rlength > cache.recv_buf_caps[node])
@@ -4231,7 +4214,6 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
      }
    }
  }
  cache.lengths_valid = true;
 }
 // Sync_finish: wait for async MPI operations and unpack
 void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
@@ -5286,203 +5268,6 @@ void Parallel::OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
  delete[] transfer_src;
  delete[] transfer_dst;
 }
 // Restrict_cached: cache grid segment lists, reuse buffers via transfer_cached
 void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                               MyList<var> *VarList1, MyList<var> *VarList2,
                               int Symmetry, SyncCache &cache)
 {
  if (!cache.valid)
  {
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;
    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }
    MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatfL, node, 2, Symmetry);
      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
      if (src_owned) src_owned->destroyList();
    }
    if (dst) dst->destroyList();
    cache.valid = true;
  }
  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
 }
 // OutBdLow2Hi_cached: cache grid segment lists, reuse buffers via transfer_cached
 void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                                  MyList<var> *VarList1, MyList<var> *VarList2,
                                  int Symmetry, SyncCache &cache)
 {
  if (!cache.valid)
  {
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;
    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }
    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
      if (src_owned) src_owned->destroyList();
    }
    if (dst) dst->destroyList();
    cache.valid = true;
  }
  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
 }
 // OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking
 void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                                     MyList<var> *VarList1, MyList<var> *VarList2,
                                     int Symmetry, SyncCache &cache)
 {
  if (!cache.valid)
  {
    int cpusize;
    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
    cache.cpusize = cpusize;
    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
      cache.send_lengths = new int[cpusize];
      cache.recv_lengths = new int[cpusize];
      cache.send_bufs = new double *[cpusize];
      cache.recv_bufs = new double *[cpusize];
      cache.send_buf_caps = new int[cpusize];
      cache.recv_buf_caps = new int[cpusize];
      for (int i = 0; i < cpusize; i++)
      {
        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
      }
      cache.max_reqs = 2 * cpusize;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }
    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
    for (int node = 0; node < cpusize; node++)
    {
      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
      if (src_owned) src_owned->destroyList();
    }
    if (dst) dst->destroyList();
    cache.valid = true;
  }
  // Use transfermix instead of transfer for mix-mode interpolation
  int myrank;
  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int cpusize = cache.cpusize;
  int req_no = 0;
  for (int node = 0; node < cpusize; node++)
  {
    if (node == myrank)
    {
      int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = length;
      if (length > 0)
      {
        if (length > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[length];
          cache.recv_buf_caps[node] = length;
        }
        data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      }
    }
    else
    {
      int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
      cache.send_lengths[node] = slength;
      if (slength > 0)
      {
        if (slength > cache.send_buf_caps[node])
        {
          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
          cache.send_bufs[node] = new double[slength];
          cache.send_buf_caps[node] = slength;
        }
        data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
      int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = rlength;
      if (rlength > 0)
      {
        if (rlength > cache.recv_buf_caps[node])
        {
          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
          cache.recv_bufs[node] = new double[rlength];
          cache.recv_buf_caps[node] = rlength;
        }
        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
      }
    }
  }
  MPI_Waitall(req_no, cache.reqs, cache.stats);
  for (int node = 0; node < cpusize; node++)
    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
      data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
 }
 // collect all buffer grid segments or blocks for given patch
 MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(Patch *Pat)
 {
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -97,7 +97,6 @@ namespace Parallel
    MPI_Request *reqs;
    MPI_Status *stats;
    int max_reqs;
    bool lengths_valid;
    SyncCache();
    void invalidate();
    void destroy();
@@ -130,15 +129,6 @@ namespace Parallel
  void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                      int Symmetry);
  void Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);
  void OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                          MyList<var> *VarList1, MyList<var> *VarList2,
                          int Symmetry, SyncCache &cache);
  void OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                             MyList<var> *VarList1, MyList<var> *VarList2,
                             int Symmetry, SyncCache &cache);
  void Prolong(Patch *Patc, Patch *Patf,
               MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
               int Symmetry);
--- a/AMSS_NCKU_source/ShellPatch.h
+++ b/AMSS_NCKU_source/ShellPatch.h
@@ -2,7 +2,11 @@
 #ifndef SHELLPATCH_H
 #define SHELLPATCH_H
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "MyList.h"
 #include "Block.h"
 #include "Parallel.h"
--- a/AMSS_NCKU_source/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4c_class.C
@@ -321,7 +321,22 @@ void Z4c_class::Step(int lev, int YN)
    }
    Pp = Pp->next;
  }
-  // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+  // check error information
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime 
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #ifdef WithShell
  // evolve Shell Patches
@@ -453,16 +468,24 @@ void Z4c_class::Step(int lev, int YN)
      sPp = sPp->next;
    }
  }
-  // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+  // check error information
  MPI_Request err_req_pre;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
-  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -481,24 +504,6 @@ void Z4c_class::Step(int lev, int YN)
    }
  }
 #endif
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #ifdef WithShell
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -688,7 +693,23 @@ void Z4c_class::Step(int lev, int YN)
      Pp = Pp->next;
    }
-    // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+    // check error information
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime 
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #ifdef WithShell
    // evolve Shell Patches
@@ -829,16 +850,25 @@ void Z4c_class::Step(int lev, int YN)
        sPp = sPp->next;
      }
    }
-    // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
-    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -856,25 +886,6 @@ void Z4c_class::Step(int lev, int YN)
             << " seconds! " << endl;
      }
    }
 #endif
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #ifdef WithShell
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
@@ -1241,7 +1252,22 @@ void Z4c_class::Step(int lev, int YN)
 	 }
  }
 #endif
-  // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+  // check error information
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime 
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
  // evolve Shell Patches
  if (lev == 0)
@@ -1516,15 +1542,23 @@ void Z4c_class::Step(int lev, int YN)
  }
 #endif
  }
-  // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+  // check error information
  MPI_Request err_req_pre;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
-  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
  if (lev == 0)
  {
@@ -1586,22 +1620,6 @@ void Z4c_class::Step(int lev, int YN)
  }
 #endif
  }
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -1823,7 +1841,23 @@ void Z4c_class::Step(int lev, int YN)
      Pp = Pp->next;
    }
-    // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+    // check error information
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime 
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    // evolve Shell Patches
    if (lev == 0)
@@ -2069,15 +2103,24 @@ void Z4c_class::Step(int lev, int YN)
        sPp = sPp->next;
      }
    }
-    // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
-    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
    if (lev == 0)
    {
@@ -2127,23 +2170,6 @@ void Z4c_class::Step(int lev, int YN)
    }
 // end smooth
 #endif
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
--- a/AMSS_NCKU_source/Z4c_class.h
+++ b/AMSS_NCKU_source/Z4c_class.h
@@ -19,7 +19,11 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "cgh.h"
 #include "ShellPatch.h"
--- a/AMSS_NCKU_source/bssnEM_class.h
+++ b/AMSS_NCKU_source/bssnEM_class.h
@@ -19,7 +19,11 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "cgh.h"
 #include "ShellPatch.h"
--- a/AMSS_NCKU_source/bssnEScalar_class.h
+++ b/AMSS_NCKU_source/bssnEScalar_class.h
@@ -19,7 +19,11 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "cgh.h"
 #include "ShellPatch.h"
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -5819,11 +5819,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5870,11 +5880,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -5949,11 +5969,21 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5971,11 +6001,21 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -6036,11 +6076,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6060,11 +6110,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6101,11 +6161,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
      }
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6114,11 +6184,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
    else // no time refinement levels and for all same time levels
    {
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -19,7 +19,11 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "macrodef.h"
 #include "cgh.h"
--- a/AMSS_NCKU_source/bssn_gpu_class.h
+++ b/AMSS_NCKU_source/bssn_gpu_class.h
@@ -19,7 +19,11 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "macrodef.h"
 #include "cgh.h"
--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
@@ -945,60 +945,103 @@
  SSA(2)=SYM
  SSA(3)=ANTI
-!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
+!!!!!!!!!advection term part
 ! lopsided_kodis shares the symmetry_bd buffer between advection and
 ! dissipation, eliminating redundant full-grid copies. For metric variables
 ! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
 ! so the constant offset has no effect on dissipation.
-  call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
+  call lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS)
-  call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
+  call lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA)
-  call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
+  call lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA)
-  call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
+  call lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS)
-  call lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
+  call lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA)
-  call lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
+  call lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA)
-  call lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
+  call lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS)
-  call lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
+  call lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS)
-  call lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
+  call lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA)
-
+!!
 #if 1 
 !! bam does not apply dissipation on gauge variables
  call lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps)
 #if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
  call lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps)
  call lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps)
  call lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
 #endif
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
  call lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
  call lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
  call lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
 #endif
 #else
 ! No dissipation on gauge variables (advection only)
  call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS)
 #if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
  call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS)
  call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS)
  call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA)
 #endif
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
  call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS)
  call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS)
  call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA)
 #endif
  if(eps>0)then 
 ! usual Kreiss-Oliger dissipation      
  call kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps)
  call kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps)
  call kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps)
 #if 0
 #define i 42
 #define j 40
 #define k 40
 if(Lev == 1)then
 write(*,*) X(i),Y(j),Z(k)
 write(*,*) "before",Axx_rhs(i,j,k)
 endif
 #undef i
 #undef j
 #undef k
 !!stop
 #endif
  call kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps)
 #if 0
 #define i 42
 #define j 40
 #define k 40
 if(Lev == 1)then
 write(*,*) X(i),Y(j),Z(k)
 write(*,*) "after",Axx_rhs(i,j,k)
 endif
 #undef i
 #undef j
 #undef k
 !!stop
 #endif
  call kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps)
  call kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps)
  call kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps)
 #if 1 
 !! bam does not apply dissipation on gauge variables
  call kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps)
  call kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps)
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
  call kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps)
  call kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps)
 #endif
 #endif
  endif
  if(co == 0)then
 ! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho
--- a/AMSS_NCKU_source/cgh.C
+++ b/AMSS_NCKU_source/cgh.C
@@ -20,7 +20,11 @@ using namespace std;
 #include <map.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "macrodef.h"
 #include "misc.h"
--- a/AMSS_NCKU_source/cgh.h
+++ b/AMSS_NCKU_source/cgh.h
@@ -2,7 +2,11 @@
 #ifndef CGH_H
 #define CGH_H
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "MyList.h"
 #include "MPatch.h"
 #include "macrodef.h"
--- a/AMSS_NCKU_source/checkpoint.h
+++ b/AMSS_NCKU_source/checkpoint.h
@@ -19,7 +19,11 @@ using namespace std;
 #include <time.h>
 #include <stdlib.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "var.h"
 #include "MyList.h"
--- a/AMSS_NCKU_source/diff_new.f90
+++ b/AMSS_NCKU_source/diff_new.f90
@@ -69,6 +69,7 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -151,6 +152,7 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -218,6 +220,7 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -282,6 +285,7 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -371,6 +375,7 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -469,6 +474,7 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -531,6 +537,7 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -594,6 +601,7 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -657,6 +665,7 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -719,6 +728,7 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -780,6 +790,7 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -866,6 +877,7 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -997,6 +1009,7 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1151,6 +1164,7 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1227,6 +1241,7 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1297,6 +1312,7 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1401,6 +1417,7 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1576,6 +1593,7 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1643,6 +1661,7 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1712,6 +1731,7 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1781,6 +1801,7 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1851,6 +1872,7 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1919,6 +1941,7 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2011,6 +2034,7 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2127,6 +2151,7 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2212,6 +2237,7 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2288,6 +2314,7 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2406,6 +2433,7 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2593,6 +2621,7 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2665,6 +2694,7 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2740,6 +2770,7 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2815,6 +2846,7 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2895,6 +2927,7 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2973,6 +3006,7 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3080,6 +3114,7 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3216,6 +3251,7 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3311,6 +3347,7 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3395,6 +3432,7 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3530,6 +3568,7 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3802,6 +3841,7 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3883,6 +3923,7 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3967,6 +4008,7 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -4051,6 +4093,7 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -4153,6 +4196,7 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -4253,6 +4297,7 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
--- a/AMSS_NCKU_source/diff_new_sh.f90
+++ b/AMSS_NCKU_source/diff_new_sh.f90
@@ -81,6 +81,7 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -179,6 +180,7 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -262,6 +264,7 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -342,6 +345,7 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -443,6 +447,7 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -553,6 +558,7 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -627,6 +633,7 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -702,6 +709,7 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -777,6 +785,7 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -851,6 +860,7 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -924,6 +934,7 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1019,6 +1030,7 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1134,6 +1146,7 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1227,6 +1240,7 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1314,6 +1328,7 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1430,6 +1445,7 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1580,6 +1596,7 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1659,6 +1676,7 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1740,6 +1758,7 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1821,6 +1840,7 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1903,6 +1923,7 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1983,6 +2004,7 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2087,6 +2109,7 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2219,6 +2242,7 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2321,6 +2345,7 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2414,6 +2439,7 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2544,6 +2570,7 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2743,6 +2770,7 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2827,6 +2855,7 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2914,6 +2943,7 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3001,6 +3031,7 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3093,6 +3124,7 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3183,6 +3215,7 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3302,6 +3335,7 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3454,6 +3488,7 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3566,6 +3601,7 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3667,6 +3703,7 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3814,6 +3851,7 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4098,6 +4136,7 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4191,6 +4230,7 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4287,6 +4327,7 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4383,6 +4424,7 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4497,6 +4539,7 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4609,6 +4652,7 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4679,6 +4723,7 @@ subroutine fderivs_shc(ex,f,fx,fy,fz,crho,sigma,R,SYM1,SYM2,SYM3,Symmetry,Lev,ss
 #if 0  
  integer :: i,j,k
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4729,6 +4774,7 @@ subroutine fdderivs_shc(ex,f,fxx,fxy,fxz,fyy,fyz,fzz,crho,sigma,R,SYM1,SYM2,SYM3
 #if 0  
  integer :: i,j,k
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
--- a/AMSS_NCKU_source/enforce_algebra.f90
+++ b/AMSS_NCKU_source/enforce_algebra.f90
@@ -27,6 +27,7 @@
 !~~~~~~>
  !$omp parallel do collapse(2) private(i,j,k,lgxx,lgyy,lgzz,ldetg,lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz,ltrA,lscale)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -104,6 +105,7 @@
 !~~~~~~>
  !$omp parallel do collapse(2) private(i,j,k,lgxx,lgyy,lgzz,lscale,lgxy,lgxz,lgyz,lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz,ltrA)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
--- a/AMSS_NCKU_source/expansion.C
+++ b/AMSS_NCKU_source/expansion.C
@@ -6,7 +6,11 @@
 #include <stdio.h>
 #include <assert.h>
 #include <math.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "util_Table.h"
 #include "cctk.h"
--- a/AMSS_NCKU_source/find_horizons.C
+++ b/AMSS_NCKU_source/find_horizons.C
@@ -6,7 +6,11 @@
 #include <stdio.h>
 #include <assert.h>
 #include <math.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "cctk.h"
--- a/AMSS_NCKU_source/kodiss.f90
+++ b/AMSS_NCKU_source/kodiss.f90
@@ -65,6 +65,7 @@ real*8,intent(in) :: eps
 !                       dx^4
 !  note the sign (-1)^r-1, now r=2
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -159,6 +160,7 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
  call symmetry_bd(3,ex,f,fh,SoA)
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -273,6 +275,7 @@ real*8,intent(in) :: eps
 !                                              dx^8
 !  note the sign (-1)^r-1, now r=4
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -385,6 +388,7 @@ real*8,intent(in) :: eps
 !                                                              dx^10
 !  note the sign (-1)^r-1, now r=5
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
--- a/AMSS_NCKU_source/kodiss_sh.f90
+++ b/AMSS_NCKU_source/kodiss_sh.f90
@@ -80,6 +80,7 @@ real*8,intent(in) :: eps
 !                       dx^4
 !  note the sign (-1)^r-1, now r=2
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -178,6 +179,7 @@ real*8,intent(in) :: eps
 !                       dx^4
 !  note the sign (-1)^r-1, now r=2
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -273,6 +275,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
  call symmetry_stbd(2,ex,f,fh,SoA)
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -369,6 +372,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
  call symmetry_stbd(3,ex,f,fh,SoA)
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -510,6 +514,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
  call symmetry_stbd(3,ex,f,fh,SoA)
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -598,6 +603,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
  call symmetry_stbd(3,ex,f,fh,SoA)
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -694,6 +700,7 @@ real*8,intent(in) :: eps
 !                                              dx^8
 !  note the sign (-1)^r-1, now r=4
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -794,6 +801,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
  call symmetry_stbd(4,ex,f,fh,SoA)
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -903,6 +911,7 @@ real*8,intent(in) :: eps
 !                                                              dx^10
 !  note the sign (-1)^r-1, now r=5
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1006,6 +1015,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
  call symmetry_stbd(5,ex,f,fh,SoA)
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
--- a/AMSS_NCKU_source/lopsidediff.f90
+++ b/AMSS_NCKU_source/lopsidediff.f90
@@ -68,6 +68,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
 ! upper bound set ex-1 only for efficiency, 
 ! the loop body will set ex 0 also
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -233,6 +234,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
 ! upper bound set ex-1 only for efficiency, 
 ! the loop body will set ex 0 also
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -487,201 +489,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
  end subroutine lopsided
 !-----------------------------------------------------------------------------
 ! Combined advection (lopsided) + Kreiss-Oliger dissipation (kodis)
 ! Shares the symmetry_bd buffer fh, eliminating one full-grid copy per call.
 ! Mathematically identical to calling lopsided then kodis separately.
 !-----------------------------------------------------------------------------
 subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
  implicit none
 !~~~~~~> Input parameters:
  integer, intent(in)  :: ex(1:3),Symmetry
  real*8,  intent(in)  :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
  real*8,dimension(ex(1),ex(2),ex(3)),intent(in)   :: f,Sfx,Sfy,Sfz
  real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
  real*8,dimension(3),intent(in) ::SoA
  real*8,intent(in) :: eps
 !~~~~~~> local variables:
 ! note index -2,-1,0, so we have 3 extra points
  real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3))   :: fh
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: dX,dY,dZ
  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
  real*8,  parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
  real*8,  parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
 ! kodis parameters
  real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
  real*8, parameter :: cof=6.4d1   ! 2^6
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
  d12dx = ONE/F12/dX
  d12dy = ONE/F12/dY
  d12dz = ONE/F12/dZ
  d2dx = ONE/TWO/dX
  d2dy = ONE/TWO/dY
  d2dz = ONE/TWO/dZ
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
  imin = 1
  jmin = 1
  kmin = 1
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
 ! Single symmetry_bd call shared by both advection and dissipation
  call symmetry_bd(3,ex,f,fh,SoA)
 ! ---- Advection (lopsided) loop ----
 ! upper bound set ex-1 only for efficiency, 
 ! the loop body will set ex 0 also
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 ! x direction   
    if(Sfx(i,j,k) > ZEO)then
      if(i+3 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
     elseif(i+2 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
     elseif(i+1 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
     endif
   elseif(Sfx(i,j,k) < ZEO)then
      if(i-3 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
     elseif(i-2 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
     elseif(i-1 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
     endif
   endif
 ! y direction   
    if(Sfy(i,j,k) > ZEO)then
      if(j+3 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
     elseif(j+2 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
     elseif(j+1 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
     endif
   elseif(Sfy(i,j,k) < ZEO)then
      if(j-3 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
     elseif(j-2 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
     elseif(j-1 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
     endif
   endif
 ! z direction   
    if(Sfz(i,j,k) > ZEO)then
      if(k+3 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
     elseif(k+2 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
     elseif(k+1 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
     endif
   elseif(Sfz(i,j,k) < ZEO)then
      if(k-3 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
     elseif(k-2 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
     elseif(k-1 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
     endif
   endif
  enddo
  enddo
  enddo
 ! ---- Dissipation (kodis) loop ----
  if(eps > ZEO) then
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
  if(i-3 >= imin .and. i+3 <= imax .and. &
     j-3 >= jmin .and. j+3 <= jmax .and. &
     k-3 >= kmin .and. k+3 <= kmax) then
   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/cof *( (     &
                              (fh(i-3,j,k)+fh(i+3,j,k)) - &
                          SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
                          FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
                          TWT* fh(i,j,k)            )/dX + &
                                                  (     &
                              (fh(i,j-3,k)+fh(i,j+3,k)) - &
                          SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
                          FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
                          TWT* fh(i,j,k)            )/dY + &
                                                  (     &
                              (fh(i,j,k-3)+fh(i,j,k+3)) - &
                          SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
                          FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
                          TWT* fh(i,j,k)            )/dZ )
  endif
  enddo
  enddo
  enddo
  endif
  return
  end subroutine lopsided_kodis
 #elif (ghost_width == 4)
 ! sixth order code
 ! Compute advection terms in right hand sides of field equations
@@ -753,6 +560,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
 ! upper bound set ex-1 only for efficiency, 
 ! the loop body will set ex 0 also
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -969,6 +777,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
 ! upper bound set ex-1 only for efficiency, 
 ! the loop body will set ex 0 also
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -96,7 +96,7 @@ misc.o : zbesh.o
 # projects
 ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
+	$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
 ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
 	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -6,25 +6,23 @@
 ## Intel oneAPI version with oneMKL (Optimized for performance)
 filein  = -I/usr/include/ -I${MKLROOT}/include
-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
+## Using OpenMP-threaded MKL for parallel performance
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
+LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lifcore -limf -lpthread -lm -ldl
 ## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization)
 ## -fprofile-instr-use: use collected profile data to guide optimization decisions
 ##   (branch prediction, basic block layout, inlining, loop unrolling)
-PROFDATA     = ../../pgo_profile/default.profdata
+PROFDATA     = /home/amss/AMSS-NCKU/pgo_profile/default.profdata
-CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -ipo \
+CXXAPPFLAGS  = -O3 -march=native -fp-model fast=2 -fma -ipo -qopenmp \
-               -fprofile-instr-use=$(PROFDATA) \
+               -DMPI_STUB -Dfortran3 -Dnewc -I${MKLROOT}/include
-               -Dfortran3 -Dnewc -I${MKLROOT}/include
+f90appflags  = -O3 -march=native -fp-model fast=2 -fma -ipo -qopenmp \
 f90appflags  = -O3 -xHost -fp-model fast=2 -fma -ipo \
               -fprofile-instr-use=$(PROFDATA) \
               -align array64byte -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx
 CXX          = icpx
 CC           = icx
-CLINKER      = mpiicpx
+CLINKER      = icpx
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
--- a/AMSS_NCKU_source/misc.C
+++ b/AMSS_NCKU_source/misc.C
@@ -14,7 +14,11 @@ using namespace std;
 #include <string.h>
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "misc.h"
 #include "macrodef.h"
--- a/AMSS_NCKU_source/misc.h
+++ b/AMSS_NCKU_source/misc.h
@@ -24,7 +24,11 @@ using namespace std;
 #include <complex.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 namespace misc
 {
--- a/AMSS_NCKU_source/monitor.h
+++ b/AMSS_NCKU_source/monitor.h
@@ -20,7 +20,11 @@ using namespace std;
 #endif
 #include <time.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 class monitor
 {
--- a/AMSS_NCKU_source/mpi_stub.h
+++ b/AMSS_NCKU_source/mpi_stub.h
@@ -0,0 +1,153 @@
 #ifndef MPI_STUB_H
 #define MPI_STUB_H
 /*
 * MPI Stub Header — single-process shim for AMSS-NCKU ABE solver.
 * Provides all MPI types, constants, and functions used in the codebase
 * as no-ops or trivial implementations for nprocs=1, myrank=0.
 */
 #include <cstring>
 #include <cstdlib>
 #include <cstdio>
 #include <time.h>
 /* ── Types ─────────────────────────────────────────────────────────── */
 typedef int MPI_Comm;
 typedef int MPI_Datatype;
 typedef int MPI_Op;
 typedef int MPI_Request;
 typedef int MPI_Group;
 typedef struct MPI_Status {
    int MPI_SOURCE;
    int MPI_TAG;
    int MPI_ERROR;
 } MPI_Status;
 /* ── Constants ─────────────────────────────────────────────────────── */
 #define MPI_COMM_WORLD  0
 #define MPI_INT              1
 #define MPI_DOUBLE           2
 #define MPI_DOUBLE_PRECISION 2
 #define MPI_DOUBLE_INT       3
 #define MPI_SUM    1
 #define MPI_MAX    2
 #define MPI_MAXLOC 3
 #define MPI_STATUS_IGNORE   ((MPI_Status *)0)
 #define MPI_STATUSES_IGNORE ((MPI_Status *)0)
 #define MPI_MAX_PROCESSOR_NAME 256
 /* ── Helper: sizeof for MPI_Datatype ──────────────────────────────── */
 static inline size_t mpi_stub_sizeof(MPI_Datatype type) {
    switch (type) {
        case MPI_INT:        return sizeof(int);
        case MPI_DOUBLE:     return sizeof(double);
        case MPI_DOUBLE_INT: return sizeof(double) + sizeof(int);
        default:             return 0;
    }
 }
 /* ── Init / Finalize ──────────────────────────────────────────────── */
 static inline int MPI_Init(int *, char ***) { return 0; }
 static inline int MPI_Finalize() { return 0; }
 /* ── Communicator queries ─────────────────────────────────────────── */
 static inline int MPI_Comm_rank(MPI_Comm, int *rank) { *rank = 0; return 0; }
 static inline int MPI_Comm_size(MPI_Comm, int *size) { *size = 1; return 0; }
 static inline int MPI_Comm_split(MPI_Comm comm, int, int, MPI_Comm *newcomm) {
    *newcomm = comm;
    return 0;
 }
 static inline int MPI_Comm_free(MPI_Comm *) { return 0; }
 /* ── Group operations ─────────────────────────────────────────────── */
 static inline int MPI_Comm_group(MPI_Comm, MPI_Group *group) {
    *group = 0;
    return 0;
 }
 static inline int MPI_Group_translate_ranks(MPI_Group, int n,
        const int *ranks1, MPI_Group, int *ranks2) {
    for (int i = 0; i < n; ++i) ranks2[i] = ranks1[i];
    return 0;
 }
 static inline int MPI_Group_free(MPI_Group *) { return 0; }
 /* ── Collective operations ────────────────────────────────────────── */
 static inline int MPI_Allreduce(const void *sendbuf, void *recvbuf,
        int count, MPI_Datatype datatype, MPI_Op, MPI_Comm) {
    std::memcpy(recvbuf, sendbuf, count * mpi_stub_sizeof(datatype));
    return 0;
 }
 static inline int MPI_Iallreduce(const void *sendbuf, void *recvbuf,
        int count, MPI_Datatype datatype, MPI_Op, MPI_Comm,
        MPI_Request *request) {
    std::memcpy(recvbuf, sendbuf, count * mpi_stub_sizeof(datatype));
    *request = 0;
    return 0;
 }
 static inline int MPI_Bcast(void *, int, MPI_Datatype, int, MPI_Comm) {
    return 0;
 }
 static inline int MPI_Barrier(MPI_Comm) { return 0; }
 /* ── Point-to-point (never reached with nprocs=1) ─────────────────── */
 static inline int MPI_Send(const void *, int, MPI_Datatype, int, int, MPI_Comm) {
    return 0;
 }
 static inline int MPI_Recv(void *, int, MPI_Datatype, int, int, MPI_Comm, MPI_Status *) {
    return 0;
 }
 static inline int MPI_Isend(const void *, int, MPI_Datatype, int, int, MPI_Comm,
        MPI_Request *req) {
    *req = 0;
    return 0;
 }
 static inline int MPI_Irecv(void *, int, MPI_Datatype, int, int, MPI_Comm,
        MPI_Request *req) {
    *req = 0;
    return 0;
 }
 /* ── Completion ───────────────────────────────────────────────────── */
 static inline int MPI_Wait(MPI_Request *, MPI_Status *) { return 0; }
 static inline int MPI_Waitall(int, MPI_Request *, MPI_Status *) { return 0; }
 /* ── Utility ──────────────────────────────────────────────────────── */
 static inline int MPI_Abort(MPI_Comm, int error_code) {
    std::fprintf(stderr, "MPI_Abort called with error code %d\n", error_code);
    std::exit(error_code);
    return 0;
 }
 static inline double MPI_Wtime() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1.0e-9;
 }
 static inline int MPI_Get_processor_name(char *name, int *resultlen) {
    const char *stub_name = "localhost";
    std::strcpy(name, stub_name);
    *resultlen = (int)std::strlen(stub_name);
    return 0;
 }
 #endif /* MPI_STUB_H */
--- a/AMSS_NCKU_source/parameters.h
+++ b/AMSS_NCKU_source/parameters.h
@@ -24,7 +24,11 @@ using namespace std;
 #include <map.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 namespace parameters
 {
--- a/AMSS_NCKU_source/perf.h
+++ b/AMSS_NCKU_source/perf.h
@@ -30,7 +30,11 @@ using namespace std;
 #include <sys/time.h>
 #include <sys/resource.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 /* Real time */
 #define TimerSignal SIGALRM
--- a/AMSS_NCKU_source/rungekutta4_rout.f90
+++ b/AMSS_NCKU_source/rungekutta4_rout.f90
@@ -109,23 +109,33 @@
  if( RK4 == 0 ) then
   !$omp parallel workshare
   f1 = f0 + HLF * dT * f_rhs
   !$omp end parallel workshare
  elseif(RK4 == 1 ) then
   !$omp parallel workshare
   f_rhs = f_rhs + TWO * f1
-
+   !$omp end parallel workshare
   !$omp parallel workshare
   f1 = f0 + HLF * dT * f1
   !$omp end parallel workshare
  elseif(RK4 == 2 ) then
   !$omp parallel workshare
   f_rhs = f_rhs + TWO * f1
-
+   !$omp end parallel workshare
   !$omp parallel workshare
   f1 = f0 +       dT * f1
   !$omp end parallel workshare
  elseif( RK4 == 3 ) then
   !$omp parallel workshare
   f1 = f0 +F1o6 * dT *(f1 + f_rhs)
   !$omp end parallel workshare
  else
@@ -215,11 +225,15 @@
  if( RK4 == 0 ) then
   !$omp parallel workshare
   f1 = f0 + dT * f_rhs
   !$omp end parallel workshare
  else
   !$omp parallel workshare
   f1 = f0 + HLF * dT * (f1+f_rhs)
   !$omp end parallel workshare
  endif
@@ -239,7 +253,9 @@
  real*8, dimension(ex(1),ex(2),ex(3)),intent(in) ::f_rhs
  real*8, dimension(ex(1),ex(2),ex(3)),intent(out) ::f1
  !$omp parallel workshare
   f1 = f0 + dT * f_rhs
  !$omp end parallel workshare
  return
--- a/AMSS_NCKU_source/scalar_class.h
+++ b/AMSS_NCKU_source/scalar_class.h
@@ -19,7 +19,11 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "cgh.h"
 #include "ShellPatch.h"
--- a/AMSS_NCKU_source/scalarwaves.C
+++ b/AMSS_NCKU_source/scalarwaves.C
@@ -18,7 +18,11 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "misc.h"
 #include "microdef.h"
--- a/AMSS_NCKU_source/setup.C
+++ b/AMSS_NCKU_source/setup.C
@@ -3,7 +3,11 @@
 #include <math.h>
 #include <string.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "util_Table.h"
 #include "cctk.h"
--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
@@ -20,7 +20,11 @@ using namespace std;
 #include <math.h>
 #include <map.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "misc.h"
 #include "cgh.h"
--- a/AMSS_NCKU_source/testNull.C
+++ b/AMSS_NCKU_source/testNull.C
@@ -18,7 +18,11 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "misc.h"
 #include "macrodef.h"
--- a/AMSS_NCKU_source/testNull2.C
+++ b/AMSS_NCKU_source/testNull2.C
@@ -20,7 +20,11 @@ using namespace std;
 #include <map.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "misc.h"
 #include "macrodef.h"
--- a/AMSS_NCKU_source/var.C
+++ b/AMSS_NCKU_source/var.C
@@ -9,7 +9,11 @@
 using namespace std;
 #include <time.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "var.h"
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -11,17 +11,30 @@
 import AMSS_NCKU_Input as input_data
 import subprocess
 import time
 import os
 ## OpenMP configuration for threaded Fortran kernels
 ## OMP_NUM_THREADS: set to number of physical cores (not hyperthreads)
 ## OMP_PROC_BIND: bind threads to cores to avoid migration overhead
 ## OMP_STACKSIZE: each thread needs stack space for fh arrays (~3.6MB)
 if "OMP_NUM_THREADS" not in os.environ:
    os.environ["OMP_NUM_THREADS"] = "96"
 os.environ["OMP_STACKSIZE"] = "16M"
 os.environ["OMP_PROC_BIND"] = "close"
 os.environ["OMP_PLACES"] = "cores"
 ## CPU core binding configuration using taskset
 ## taskset ensures all child processes inherit the CPU affinity mask
 ## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
 ## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
 #NUMACTL_CPU_BIND = "taskset -c 0-111"
-NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
+#NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
 #NUMACTL_CPU_BIND = "taskset -c 8-15"
 NUMACTL_CPU_BIND = ""
 ## Build parallelism configuration
 ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
 ## Set make -j to utilize available cores for faster builds
-BUILD_JOBS = 96
+BUILD_JOBS = 16
 ##################################################################
@@ -117,27 +130,26 @@ def run_ABE():
    ## Define the command to run; cast other values to strings as needed
    if (input_data.GPU_Calculation == "no"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        run_command         = NUMACTL_CPU_BIND + " ./ABE"
-        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        run_command_outfile = "ABE_out.log"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        run_command         = NUMACTL_CPU_BIND + " ./ABEGPU"
-        mpi_command_outfile = "ABEGPU_out.log"
+        run_command_outfile = "ABEGPU_out.log"
-    ## Execute the MPI command and stream output
+    ## Execute the command and stream output
-    mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+    run_process = subprocess.Popen(run_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    ## Write ABE run output to file while printing to stdout
-    with open(mpi_command_outfile, 'w') as file0:  
+    with open(run_command_outfile, 'w') as file0:
        ## Read and print output lines; also write each line to file
-        for line in mpi_process.stdout:
+        for line in run_process.stdout:
            print(line, end='')  # stream output in real time
            file0.write(line)    # write the line to file
            file0.flush()        # flush to ensure each line is written immediately (optional)            
    file0.close()
    ## Wait for the process to finish
-    mpi_return_code = mpi_process.wait()
+    run_return_code = run_process.wait()
    print(                                           )
    print( " The ABE/ABEGPU simulation is finished " ) 
--- a/parallel_plot_helper.py
+++ b/parallel_plot_helper.py
@@ -1,29 +0,0 @@
 import multiprocessing
 def run_plot_task(task):
    """Execute a single plotting task.
    Parameters
    ----------
    task : tuple
        A tuple of (function, args_tuple) where function is a callable
        plotting function and args_tuple contains its arguments.
    """
    func, args = task
    return func(*args)
 def run_plot_tasks_parallel(plot_tasks):
    """Execute a list of independent plotting tasks in parallel.
    Uses the 'fork' context to create worker processes so that the main
    script is NOT re-imported/re-executed in child processes.
    Parameters
    ----------
    plot_tasks : list of tuples
        Each element is (function, args_tuple).
    """
    ctx = multiprocessing.get_context('fork')
    with ctx.Pool() as pool:
        pool.map(run_plot_task, plot_tasks)
--- a/pgo_profile/default.profdata
+++ b/pgo_profile/default.profdata
--- a/pgo_profile/default.profdata.backup
+++ b/pgo_profile/default.profdata.backup
--- a/pgo_profile/default_15874826282416242821_0_58277.profraw
+++ b/pgo_profile/default_15874826282416242821_0_58277.profraw
--- a/plot_GW_strain_amplitude_xiaoqu.py
+++ b/plot_GW_strain_amplitude_xiaoqu.py
@@ -11,8 +11,6 @@
 import numpy                               ## numpy for array operations
 import scipy                               ## scipy for interpolation and signal processing
 import math
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 import os                                  ## os for system/file operations
--- a/plot_binary_data.py
+++ b/plot_binary_data.py
@@ -8,23 +8,16 @@
 ##
 #################################################
 ## Restrict OpenMP to one thread per process so that running
 ## many workers in parallel does not create an O(workers * BLAS_threads)
 ## thread explosion.  The variable MUST be set before numpy/scipy
 ## are imported, because the BLAS library reads them only at load time.
 import os
 os.environ.setdefault("OMP_NUM_THREADS",        "1")
 import numpy
 import scipy
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt
 from   matplotlib.colors    import LogNorm
 from   mpl_toolkits.mplot3d import Axes3D
 ## import torch
 import AMSS_NCKU_Input      as input_data
 import os
 #########################################################################################
@@ -199,19 +192,3 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
 ####################################################################################
 ####################################################################################
 ## Allow this module to be run as a standalone script so that each
 ## binary-data plot can be executed in a fresh subprocess whose BLAS
 ## environment variables (set above) take effect before numpy loads.
 ##
 ## Usage:  python3 plot_binary_data.py <filename> <binary_outdir> <figure_outdir>
 ####################################################################################
 if __name__ == '__main__':
    import sys
    if len(sys.argv) != 4:
        print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
        sys.exit(1)
    plot_binary_data(sys.argv[1], sys.argv[2], sys.argv[3])
--- a/plot_xiaoqu.py
+++ b/plot_xiaoqu.py
@@ -8,8 +8,6 @@
 #################################################
 import numpy                               ## numpy for array operations
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 from   mpl_toolkits.mplot3d import Axes3D  ## needed for 3D plots
 import glob
@@ -17,9 +15,6 @@ import os                                  ## operating system utilities
 import plot_binary_data
 import AMSS_NCKU_Input as input_data
 import subprocess
 import sys
 import multiprocessing
 # plt.rcParams['text.usetex'] = True  ## enable LaTeX fonts in plots
@@ -55,40 +50,10 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
        file_list.append(x)
        print(x)
-    ## Plot each file in parallel using subprocesses.
+    ## Plot each file in the list
    ## Each subprocess is a fresh Python process where the BLAS thread-count
    ## environment variables (set at the top of plot_binary_data.py) take
    ## effect before numpy is imported.  This avoids the thread explosion
    ## that occurs when multiprocessing.Pool with 'fork' context inherits
    ## already-initialized multi-threaded BLAS from the parent.
    script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
    max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
    running = []
    failed  = []
    for filename in file_list:
        print(filename)
-        proc = subprocess.Popen(
+        plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
            [sys.executable, script, filename, binary_outdir, figure_outdir],
        )
        running.append( (proc, filename) )
        ## Keep at most max_workers subprocesses active at a time
        if len(running) >= max_workers:
            p, fn = running.pop(0)
            p.wait()
            if p.returncode != 0:
                failed.append(fn)
    ## Wait for all remaining subprocesses to finish
    for p, fn in running:
        p.wait()
        if p.returncode != 0:
            failed.append(fn)
    if failed:
        print( " WARNING: the following binary data plots failed:" )
        for fn in failed:
            print( "   ", fn )
    print(                        )
    print( " Binary Data Plot Has been Finished " )