Trigger-Discipline: parallelize result plotting

Trigger-Discipline: port TwoPuncture OpenMP optimizations
.gitignore updated
2026-04-24 10:04:57 +08:00 · 2026-04-24 09:25:13 +08:00 · 2026-04-24 09:10:12 +08:00 · 2026-04-24 09:09:50 +08:00 · 2026-02-05 19:53:55 +08:00
64 changed files with 1522 additions and 2681 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
 __pycache__
 GW150914
-GW150914-origin
+GW150914*
-docs
+.codex
-*.tmp
+docs/
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,7 +16,7 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 1                              ## number of processes (MPI removed, single-process mode)
+MPI_processes    = 64                             ## number of mpi processes used in the simulation
 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -9,9 +9,19 @@
 ##################################################################
-##################################################################
+##################################################################
-
+
-## Print program introduction
+## Guard against re-execution by multiprocessing child processes.
 ## Without this, using 'spawn' or 'forkserver' context would cause every
 ## worker to re-run the entire script.
 if __name__ != '__main__':
    import sys as _sys
    _sys.exit(0)
 ##################################################################
 ## Print program introduction
 import print_information
@@ -422,31 +432,36 @@ print( " Plotting the txt and binary results data from the AMSS-NCKU simulation
 print(                                                                          )
-import plot_xiaoqu
+import plot_xiaoqu
-import plot_GW_strain_amplitude_xiaoqu
+import plot_GW_strain_amplitude_xiaoqu
-
+from parallel_plot_helper import run_plot_tasks_parallel
-## Plot black hole trajectory
+
-plot_xiaoqu.generate_puncture_orbit_plot(   binary_results_directory, figure_directory )
+plot_tasks = []
-plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
+
-
+## Plot black hole trajectory
-## Plot black hole separation vs. time
+plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot,   (binary_results_directory, figure_directory) ) )
-plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
+plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) )
-
+
-## Plot gravitational waveforms (psi4 and strain amplitude)
+## Plot black hole separation vs. time
-for i in range(input_data.Detector_Number):
+plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) )
-    plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
+
-    plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
+## Plot gravitational waveforms (psi4 and strain amplitude)
-
+for i in range(input_data.Detector_Number):
-## Plot ADM mass evolution
+    plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) )
-for i in range(input_data.Detector_Number):
+    plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) )
-    plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
+
-
+## Plot ADM mass evolution
-## Plot Hamiltonian constraint violation over time
+for i in range(input_data.Detector_Number):
-for i in range(input_data.grid_level):
+    plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) )
-    plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
+
-
+## Plot Hamiltonian constraint violation over time
-## Plot stored binary data
+for i in range(input_data.grid_level):
-plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
+    plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) )
 run_plot_tasks_parallel(plot_tasks)
 ## Plot stored binary data
 plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
 print(                                                 )
 print( f" This Program Cost = {elapsed_time} Seconds " )
--- a/AMSS_NCKU_Verify_ASC26.py
+++ b/AMSS_NCKU_Verify_ASC26.py
@@ -1,279 +0,0 @@
 #!/usr/bin/env python3
 """
 AMSS-NCKU GW150914 Simulation Regression Test Script
 Verification Requirements:
 1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
 2. ADM constraint violation < 2 (Grid Level 0)
 RMS Calculation Method:
 - Computes trajectory deviation on the XY plane independently for BH1 and BH2
 - For each black hole: RMS = sqrt((1/M) * sum((Δr_i / r_i^max)^2)) × 100%
 - Final RMS = max(RMS_BH1, RMS_BH2)
 Usage: python3 AMSS_NCKU_Verify_ASC26.py [output_dir]
 Default: output_dir = GW150914/AMSS_NCKU_output
 Reference: GW150914-origin (baseline simulation)
 """
 import numpy as np
 import sys
 import os
 # ANSI Color Codes
 class Color:
    """ANSI terminal escape sequences used to style the report output."""
    # Foreground colors
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    # Text attributes: BOLD switches bold on, RESET clears all styling
    BOLD = '\033[1m'
    RESET = '\033[0m'
 def get_status_text(passed):
    """Return a bold, colorized status label: green "PASS" if *passed* is truthy, red "FAIL" otherwise."""
    tint, label = (Color.GREEN, "PASS") if passed else (Color.RED, "FAIL")
    return f"{tint}{Color.BOLD}{label}{Color.RESET}"
 def load_bh_trajectory(filepath):
    """Load black hole trajectory data from a whitespace-delimited text file.

    Columns are read by position: column 0 is stored as 'time', columns 1-3
    as 'x1', 'y1', 'z1' and columns 4-6 as 'x2', 'y2', 'z2'.

    Args:
        filepath: Path of the trajectory file, parsed with ``numpy.loadtxt``.

    Returns:
        dict mapping each of the seven names above to a 1-D array holding
        that column.
    """
    # ndmin=2 keeps the table two-dimensional even when the file contains a
    # single data row; without it loadtxt returns a 1-D array and the column
    # slicing below raises IndexError.
    data = np.loadtxt(filepath, ndmin=2)
    column_names = ('time', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2')
    return {name: data[:, col] for col, name in enumerate(column_names)}
 def load_constraint_data(filepath):
    """Read constraint violation records from a text file.

    Lines starting with '#' are skipped, as are lines with fewer than eight
    whitespace-separated fields. For every remaining line the first eight
    fields are converted to float.

    Args:
        filepath: Path of the constraint data file.

    Returns:
        numpy array built from the accepted rows (one row of eight floats
        per accepted line).
    """
    rows = []
    with open(filepath, 'r') as handle:
        for raw_line in handle:
            if raw_line.startswith('#'):
                continue
            fields = raw_line.split()
            if len(fields) < 8:
                continue
            rows.append(list(map(float, fields[:8])))
    return np.array(rows)
 def calculate_rms_error(bh_data_ref, bh_data_target):
    """
    Compute the XY-plane trajectory RMS error between a reference and a target run.

    Both datasets are truncated to the length of the shorter one (M samples).
    For each black hole separately:
        RMS = sqrt( mean( (Δr_i / r_i^max)^2 ) ) × 100%
    with
        Δr_i    = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
        r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
    Samples whose r_i^max is not above 1e-15 are excluded from the mean to
    avoid dividing by zero. The reported value is the larger of the two
    per-hole results.

    Args:
        bh_data_ref: Reference (baseline) trajectory dict with keys
            'time', 'x1', 'y1', 'x2', 'y2'
        bh_data_target: Target (optimized) trajectory dict with the same keys

    Returns:
        (rms_value, error): rms_value is the RMS error in percent and error
        is None on success; on failure rms_value is None and error is a
        message (fewer than 10 common samples, or fewer than 10 usable
        samples for one of the black holes).
    """
    n_common = min(len(bh_data_ref['time']), len(bh_data_target['time']))
    if n_common < 10:
        return None, "Insufficient data points for comparison"

    def xy_deviation(suffix):
        """Return (Δr, r_max) arrays for the black hole whose keys end in *suffix*."""
        x_ref = bh_data_ref['x' + suffix][:n_common]
        y_ref = bh_data_ref['y' + suffix][:n_common]
        x_new = bh_data_target['x' + suffix][:n_common]
        y_new = bh_data_target['y' + suffix][:n_common]
        separation = np.sqrt((x_ref - x_new)**2 + (y_ref - y_new)**2)
        radius_ref = np.sqrt(x_ref**2 + y_ref**2)
        radius_new = np.sqrt(x_new**2 + y_new**2)
        return separation, np.maximum(radius_ref, radius_new)

    delta_r1, r1_max = xy_deviation('1')
    delta_r2, r2_max = xy_deviation('2')

    per_hole_rms = []
    for separation, radius, failure in (
        (delta_r1, r1_max, "Insufficient valid data points for BH1"),
        (delta_r2, r2_max, "Insufficient valid data points for BH2"),
    ):
        # Keep only samples with a usable normalisation radius
        usable = radius > 1e-15
        if np.sum(usable) < 10:
            return None, failure
        squared_ratio = (separation[usable] / radius[usable])**2
        per_hole_rms.append(np.sqrt(np.mean(squared_ratio)) * 100)

    # Final figure is the worse (larger) of the two black holes
    return max(per_hole_rms), None
 def analyze_constraint_violation(constraint_data, n_levels=9):
    """
    Summarise the ADM constraint violation for Grid Level 0.

    Every n_levels-th row of constraint_data, starting with row 0, is taken
    as the Grid Level 0 record of a time step. For columns 1-7 of those rows
    the maximum absolute value is computed.

    Args:
        constraint_data: 2-D array with at least eight columns
        n_levels: row stride between successive Grid Level 0 records

    Returns:
        dict with keys 'Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz' (columns
        1-7 respectively) plus 'max_violation', the largest of those seven.
    """
    level0_rows = constraint_data[::n_levels]
    labels = ('Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz')
    results = {
        label: np.max(np.abs(level0_rows[:, column]))
        for column, label in enumerate(labels, start=1)
    }
    results['max_violation'] = max(results.values())
    return results
 def print_header():
    """Print the report banner: the title framed by two bold blue 65-character rules."""
    rule = Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET
    title = Color.BOLD + "   AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET
    print("\n" + rule)
    print(title)
    print(rule)
 def print_rms_results(rms_rel, error, threshold=1.0):
    """Print the RMS error section of the report.

    Args:
        rms_rel: RMS relative error in percent (ignored when *error* is set)
        error: Error message from the RMS calculation, or a falsy value
        threshold: Upper bound (exclusive) the RMS error must stay below

    Returns:
        False if *error* is set; otherwise the result of ``rms_rel < threshold``.
    """
    print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
    print("-" * 45)

    if not error:
        passed = rms_rel < threshold
        print(f"   RMS relative error: {rms_rel:.4f}%")
        print(f"   Requirement:        < {threshold}%")
        print(f"   Status:             {get_status_text(passed)}")
        return passed

    # The calculation failed: report why and count the check as failed
    print(f"   {Color.RED}Error: {error}{Color.RESET}")
    return False
 def print_constraint_results(results, threshold=2.0):
    """Print the constraint violation section of the report.

    The seven per-constraint maxima are laid out two per line, followed by
    the overall maximum and the pass/fail status.

    Args:
        results: dict with keys 'Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz'
            and 'max_violation'
        threshold: Upper bound (exclusive) for 'max_violation'

    Returns:
        Result of ``results['max_violation'] < threshold``.
    """
    print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
    print("-" * 45)

    names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
    for position, name in enumerate(names, start=1):
        print(f"   Max |{name:3}|: {results[name]:.6f}", end="   ")
        # Break the line after every second entry
        if position % 2 == 0:
            print()
    # An odd number of entries leaves the last line unterminated
    if len(names) % 2:
        print()

    passed = results['max_violation'] < threshold
    print(f"\n   Maximum violation:  {results['max_violation']:.6f}")
    print(f"   Requirement:        < {threshold}")
    print(f"   Status:             {get_status_text(passed)}")
    return passed
 def print_summary(rms_passed, constraint_passed):
    """Print the verification summary and return the combined verdict.

    Args:
        rms_passed: Outcome of the RMS trajectory check
        constraint_passed: Outcome of the ADM constraint check

    Returns:
        ``rms_passed and constraint_passed``.
    """
    rule = Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET
    print("\n" + rule)
    print(Color.BOLD + "Verification Summary" + Color.RESET)
    print(rule)

    all_passed = rms_passed and constraint_passed
    rms_label = get_status_text(rms_passed)
    constraint_label = get_status_text(constraint_passed)
    print(f"   [1] RMS trajectory check:         {rms_label}")
    print(f"   [2] ADM constraint check:         {constraint_label}")

    if all_passed:
        final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}"
    else:
        final_status = f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
    print(f"\n   Overall result: {final_status}")
    print(rule + "\n")
    return all_passed
 def main():
    """Run the regression checks and exit with status 0 if all pass, 1 otherwise.

    The target output directory is taken from the first command-line
    argument, defaulting to GW150914/AMSS_NCKU_output next to this script.
    The baseline is always GW150914-origin/AMSS_NCKU_output next to this
    script. The process exits with status 1 as soon as a required data file
    is found to be missing.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # Target (optimized) directory: CLI argument wins over the default
    if len(sys.argv) > 1:
        target_dir = sys.argv[1]
    else:
        target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
    # Reference (baseline) directory
    reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")

    bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
    bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
    constraint_file = os.path.join(target_dir, "bssn_constraint.dat")

    # Bail out on the first required input that is missing
    required_inputs = (
        ("Baseline trajectory file", bh_file_ref),
        ("Target trajectory file", bh_file_target),
        ("Constraint data file", constraint_file),
    )
    for description, path in required_inputs:
        if not os.path.exists(path):
            print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} {description} not found: {path}")
            sys.exit(1)

    print_header()
    print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
    print(f"{Color.BOLD}Target (Optimized):  {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")

    bh_data_ref = load_bh_trajectory(bh_file_ref)
    bh_data_target = load_bh_trajectory(bh_file_target)
    constraint_data = load_constraint_data(constraint_file)

    # Check 1: trajectory RMS error against the baseline
    rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
    rms_passed = print_rms_results(rms_rel, error)

    # Check 2: constraint violation of the target run
    constraint_passed = print_constraint_results(
        analyze_constraint_violation(constraint_data)
    )

    all_passed = print_summary(rms_passed, constraint_passed)
    sys.exit(0 if all_passed else 1)
 if __name__ == "__main__":
    main()
--- a/AMSS_NCKU_source/ABE.C
+++ b/AMSS_NCKU_source/ABE.C
@@ -20,11 +20,7 @@ using namespace std;
 #include <map.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "misc.h"
 #include "macrodef.h"
--- a/AMSS_NCKU_source/Ansorg.h
+++ b/AMSS_NCKU_source/Ansorg.h
@@ -19,11 +19,7 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #define PI M_PI
--- a/AMSS_NCKU_source/Block.h
+++ b/AMSS_NCKU_source/Block.h
@@ -2,11 +2,7 @@
 #ifndef BLOCK_H
 #define BLOCK_H
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "macrodef.h" //need dim here; Vertex or Cell
 #include "var.h"
 #include "MyList.h"
--- a/AMSS_NCKU_source/FFT.f90
+++ b/AMSS_NCKU_source/FFT.f90
@@ -37,51 +37,57 @@ close(77)
 end program checkFFT
 #endif
 !-------------
 ! Optimized FFT using Intel oneMKL DFTI
 ! Mathematical equivalence: Standard DFT definition
 !   Forward (isign=1):  X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
 !   Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
 ! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
 !-------------
 SUBROUTINE four1(dataa,nn,isign)
 use MKL_DFTI
 implicit none
-INTEGER, intent(in) :: isign, nn
+INTEGER::isign,nn
-DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa
+double precision,dimension(2*nn)::dataa
-
+INTEGER::i,istep,j,m,mmax,n
-type(DFTI_DESCRIPTOR), pointer :: desc
+double precision::tempi,tempr
-integer :: status
+DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
-
+n=2*nn
-! Create DFTI descriptor for 1D complex-to-complex transform
+j=1
-status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn)
+do i=1,n,2
-if (status /= 0) return
+  if(j.gt.i)then
-
+     tempr=dataa(j)
-! Set input/output storage as interleaved complex (default)
+     tempi=dataa(j+1)
-status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
+     dataa(j)=dataa(i)
-if (status /= 0) then
+     dataa(j+1)=dataa(i+1)
-   status = DftiFreeDescriptor(desc)
+     dataa(i)=tempr
-   return
+     dataa(i+1)=tempi
  endif
  m=nn
 1 if ((m.ge.2).and.(j.gt.m)) then
  j=j-m
  m=m/2
 goto 1
  endif
 j=j+m
 enddo
 mmax=2
 2  if (n.gt.mmax) then
     istep=2*mmax
     theta=6.28318530717959d0/(isign*mmax)
     wpr=-2.d0*sin(0.5d0*theta)**2
     wpi=sin(theta)
     wr=1.d0
     wi=0.d0
     do m=1,mmax,2
       do i=m,n,istep
         j=i+mmax
         tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
         tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
         dataa(j)=dataa(i)-tempr
         dataa(j+1)=dataa(i+1)-tempi
         dataa(i)=dataa(i)+tempr
         dataa(i+1)=dataa(i+1)+tempi
       enddo
          wtemp=wr
          wr=wr*wpr-wi*wpi+wr
          wi=wi*wpr+wtemp*wpi+wi
     enddo
 mmax=istep
 goto 2
 endif
 ! Commit the descriptor
 status = DftiCommitDescriptor(desc)
 if (status /= 0) then
   status = DftiFreeDescriptor(desc)
   return
 endif
 ! Execute FFT based on direction
 if (isign == 1) then
   ! Forward FFT: exp(-2*pi*i*k*n/N)
   status = DftiComputeForward(desc, dataa)
 else
   ! Backward FFT: exp(+2*pi*i*k*n/N)
   status = DftiComputeBackward(desc, dataa)
 endif
 ! Free descriptor
 status = DftiFreeDescriptor(desc)
 return
 END SUBROUTINE four1
--- a/AMSS_NCKU_source/IntPnts0.C
+++ b/AMSS_NCKU_source/IntPnts0.C
@@ -4,11 +4,7 @@
 #include <stdarg.h>
 #include <string.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "myglobal.h"
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -341,9 +341,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
                          double *Shellf, int Symmetry)
 {
  // NOTE: we do not Synchnize variables here, make sure of that before calling this routine
-  int myrank, nprocs;
+  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  int ordn = 2 * ghost_width;
  MyList<var> *varl;
@@ -355,18 +354,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
    varl = varl->next;
  }
-  memset(Shellf, 0, sizeof(double) * NN * num_var);
+  double *shellf;
  shellf = new double[NN * num_var];
  memset(shellf, 0, sizeof(double) * NN * num_var);
-  // owner_rank[j] records which MPI rank owns point j
+  // we use weight to monitor code, later some day we can move it for optimization
-  // All ranks traverse the same block list so they all agree on ownership
+  int *weight;
-  int *owner_rank;
+  weight = new int[NN];
-  owner_rank = new int[NN];
+  memset(weight, 0, sizeof(int) * NN);
-  for (int j = 0; j < NN; j++)
+
-    owner_rank[j] = -1;
+  double *DH, *llb, *uub;
  DH = new double[dim];
  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
  {
    DH[i] = getdX(i);
  }
  llb = new double[dim];
  uub = new double[dim];
  for (int j = 0; j < NN; j++) // run along points
  {
@@ -398,6 +403,12 @@ void Patch::Interp_Points(MyList<var> *VarList,
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
 // NOTE: our dividing structure is (exclude ghost)
 // -1 0
 //       1  2
 // so (0,1) does not belong to any part for vertex structure
 // here we put (0,0.5) to left part and (0.5,1) to right part
 // BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
@@ -422,7 +433,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          //---> interpolation
@@ -430,11 +440,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
          int k = 0;
          while (varl) // run along variables
          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+            //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
            //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
          weight[j] = 1;
        }
      }
      if (Bp == ble)
@@ -443,327 +456,103 @@ void Patch::Interp_Points(MyList<var> *VarList,
    }
  }
-  // Replace MPI_Allreduce with per-owner MPI_Bcast:
+  MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-  // Group consecutive points by owner rank and broadcast each group.
+  int *Weight;
-  // Since each point's data is non-zero only on the owner rank,
+  Weight = new int[NN];
-  // Bcast from owner is equivalent to Allreduce(MPI_SUM) but much cheaper.
+  MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  //  misc::tillherecheck("print me");
  for (int i = 0; i < NN; i++)
  {
-    int j = 0;
+    if (Weight[i] > 1)
    while (j < NN)
    {
-      int cur_owner = owner_rank[j];
+      if (myrank == 0)
-      if (cur_owner < 0)
+        cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
-      {
+      for (int j = 0; j < num_var; j++)
-        if (myrank == 0)
+        Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
        {
          cout << "ERROR: Patch::Interp_Points fails to find point (";
          for (int d = 0; d < dim; d++)
          {
            cout << XX[d][j];
            if (d < dim - 1)
              cout << ",";
            else
              cout << ")";
          }
          cout << " on Patch (";
          for (int d = 0; d < dim; d++)
          {
            cout << bbox[d] << "+" << lli[d] * DH[d];
            if (d < dim - 1)
              cout << ",";
            else
              cout << ")--";
          }
          cout << "(";
          for (int d = 0; d < dim; d++)
          {
            cout << bbox[dim + d] << "-" << uui[d] * DH[d];
            if (d < dim - 1)
              cout << ",";
            else
              cout << ")" << endl;
          }
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        j++;
        continue;
      }
      // Find contiguous run of points with the same owner
      int jstart = j;
      while (j < NN && owner_rank[j] == cur_owner)
        j++;
      int count = (j - jstart) * num_var;
      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
    }
-  }
+    else if (Weight[i] == 0 && myrank == 0)
  delete[] owner_rank;
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
                          double *Shellf, int Symmetry,
                          int Nmin_consumer, int Nmax_consumer)
 {
  // Targeted point-to-point overload: each owner sends each point only to
  // the one rank that needs it for integration (consumer), reducing
  // communication volume by ~nprocs times compared to the Bcast version.
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  int ordn = 2 * ghost_width;
  MyList<var> *varl;
  int num_var = 0;
  varl = VarList;
  while (varl)
  {
    num_var++;
    varl = varl->next;
  }
  memset(Shellf, 0, sizeof(double) * NN * num_var);
  // owner_rank[j] records which MPI rank owns point j
  int *owner_rank;
  owner_rank = new int[NN];
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;
  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
  // --- Interpolation phase (identical to original) ---
  for (int j = 0; j < NN; j++)
  {
    double pox[dim];
    for (int i = 0; i < dim; i++)
    {
      pox[i] = XX[i][j];
      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
      {
        cout << "Patch::Interp_Points: point (";
        for (int k = 0; k < dim; k++)
        {
          cout << XX[k][j];
          if (k < dim - 1)
            cout << ",";
          else
            cout << ") is out of current Patch." << endl;
        }
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    MyList<Block> *Bp = blb;
    bool notfind = true;
    while (notfind && Bp)
    {
      Block *BP = Bp->data;
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
 #else
 #ifdef Cell
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
        {
          flag = false;
          break;
        }
      }
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          varl = VarList;
          int k = 0;
          while (varl)
          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
      }
      if (Bp == ble)
        break;
      Bp = Bp->next;
    }
  }
  // --- Error check for unfound points ---
  for (int j = 0; j < NN; j++)
  {
    if (owner_rank[j] < 0 && myrank == 0)
    {
      cout << "ERROR: Patch::Interp_Points fails to find point (";
-      for (int d = 0; d < dim; d++)
+      for (int j = 0; j < dim; j++)
      {
-        cout << XX[d][j];
+        cout << XX[j][i];
-        if (d < dim - 1)
+        if (j < dim - 1)
          cout << ",";
        else
          cout << ")";
      }
      cout << " on Patch (";
-      for (int d = 0; d < dim; d++)
+      for (int j = 0; j < dim; j++)
      {
-        cout << bbox[d] << "+" << lli[d] * DH[d];
+        cout << bbox[j] << "+" << lli[j] * getdX(j);
-        if (d < dim - 1)
+        if (j < dim - 1)
          cout << ",";
        else
          cout << ")--";
      }
      cout << "(";
-      for (int d = 0; d < dim; d++)
+      for (int j = 0; j < dim; j++)
      {
-        cout << bbox[dim + d] << "-" << uui[d] * DH[d];
+        cout << bbox[dim + j] << "-" << uui[j] * getdX(j);
-        if (d < dim - 1)
+        if (j < dim - 1)
          cout << ",";
        else
          cout << ")" << endl;
      }
 #if 0
       checkBlock();
 #else
      cout << "splited domains:" << endl;
      {
        MyList<Block> *Bp = blb;
        while (Bp)
        {
          Block *BP = Bp->data;
          for (int i = 0; i < dim; i++)
          {
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
            llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
            uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
 #else
 #ifdef Cell
            llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
            uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
          }
          cout << "(";
          for (int j = 0; j < dim; j++)
          {
            cout << llb[j] << ":" << uub[j];
            if (j < dim - 1)
              cout << ",";
            else
              cout << ")" << endl;
          }
          if (Bp == ble)
            break;
          Bp = Bp->next;
        }
      }
 #endif
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
-  // --- Targeted point-to-point communication phase ---
+  delete[] shellf;
-  // Compute consumer_rank[j] using the same deterministic formula as surface_integral
+  delete[] weight;
-  int *consumer_rank = new int[NN];
+  delete[] Weight;
-  {
+  delete[] DH;
-    int mp = NN / nprocs;
+  delete[] llb;
-    int Lp = NN - nprocs * mp;
+  delete[] uub;
    for (int j = 0; j < NN; j++)
    {
      if (j < Lp * (mp + 1))
        consumer_rank[j] = j / (mp + 1);
      else
        consumer_rank[j] = Lp + (j - Lp * (mp + 1)) / mp;
    }
  }
  // Count sends and recvs per rank
  int *send_count = new int[nprocs];
  int *recv_count = new int[nprocs];
  memset(send_count, 0, sizeof(int) * nprocs);
  memset(recv_count, 0, sizeof(int) * nprocs);
  for (int j = 0; j < NN; j++)
  {
    int own = owner_rank[j];
    int con = consumer_rank[j];
    if (own == con)
      continue; // local — no communication needed
    if (own == myrank)
      send_count[con]++;
    if (con == myrank)
      recv_count[own]++;
  }
  // Build send buffers: for each destination rank, pack (index, data) pairs
  // Each entry: 1 int (point index j) + num_var doubles
  int total_send = 0, total_recv = 0;
  int *send_offset = new int[nprocs];
  int *recv_offset = new int[nprocs];
  for (int r = 0; r < nprocs; r++)
  {
    send_offset[r] = total_send;
    total_send += send_count[r];
    recv_offset[r] = total_recv;
    total_recv += recv_count[r];
  }
  // Pack send buffers: each message contains (j, data[0..num_var-1]) per point
  int stride = 1 + num_var; // 1 double for index + num_var doubles for data
  double *sendbuf = new double[total_send * stride];
  double *recvbuf = new double[total_recv * stride];
  // Temporary counters for packing
  int *pack_pos = new int[nprocs];
  memset(pack_pos, 0, sizeof(int) * nprocs);
  for (int j = 0; j < NN; j++)
  {
    int own = owner_rank[j];
    int con = consumer_rank[j];
    if (own != myrank || con == myrank)
      continue;
    int pos = (send_offset[con] + pack_pos[con]) * stride;
    sendbuf[pos] = (double)j; // point index
    for (int v = 0; v < num_var; v++)
      sendbuf[pos + 1 + v] = Shellf[j * num_var + v];
    pack_pos[con]++;
  }
  // Post non-blocking recvs and sends
  int n_req = 0;
  for (int r = 0; r < nprocs; r++)
  {
    if (recv_count[r] > 0) n_req++;
    if (send_count[r] > 0) n_req++;
  }
  MPI_Request *reqs = new MPI_Request[n_req];
  int req_idx = 0;
  for (int r = 0; r < nprocs; r++)
  {
    if (recv_count[r] > 0)
    {
      MPI_Irecv(recvbuf + recv_offset[r] * stride,
                recv_count[r] * stride, MPI_DOUBLE,
                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
    }
  }
  for (int r = 0; r < nprocs; r++)
  {
    if (send_count[r] > 0)
    {
      MPI_Isend(sendbuf + send_offset[r] * stride,
                send_count[r] * stride, MPI_DOUBLE,
                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
    }
  }
  if (n_req > 0)
    MPI_Waitall(n_req, reqs, MPI_STATUSES_IGNORE);
  // Unpack recv buffers into Shellf
  for (int i = 0; i < total_recv; i++)
  {
    int pos = i * stride;
    int j = (int)recvbuf[pos];
    for (int v = 0; v < num_var; v++)
      Shellf[j * num_var + v] = recvbuf[pos + 1 + v];
  }
  delete[] reqs;
  delete[] sendbuf;
  delete[] recvbuf;
  delete[] pack_pos;
  delete[] send_offset;
  delete[] recv_offset;
  delete[] send_count;
  delete[] recv_count;
  delete[] consumer_rank;
  delete[] owner_rank;
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
@@ -784,22 +573,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
    varl = varl->next;
  }
-  memset(Shellf, 0, sizeof(double) * NN * num_var);
+  double *shellf;
  shellf = new double[NN * num_var];
  memset(shellf, 0, sizeof(double) * NN * num_var);
-  // owner_rank[j] stores the global rank that owns point j
+  // we use weight to monitor code, later some day we can move it for optimization
-  int *owner_rank;
+  int *weight;
-  owner_rank = new int[NN];
+  weight = new int[NN];
-  for (int j = 0; j < NN; j++)
+  memset(weight, 0, sizeof(int) * NN);
    owner_rank[j] = -1;
-  // Build global-to-local rank translation for Comm_here
+  double *DH, *llb, *uub;
-  MPI_Group world_group, local_group;
+  DH = new double[dim];
  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
  MPI_Comm_group(Comm_here, &local_group);
  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
  {
    DH[i] = getdX(i);
  }
  llb = new double[dim];
  uub = new double[dim];
  for (int j = 0; j < NN; j++) // run along points
  {
@@ -831,6 +622,12 @@ void Patch::Interp_Points(MyList<var> *VarList,
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
 // NOTE: our dividing structure is (exclude ghost)
 // -1 0
 //       1  2
 // so (0,1) does not belong to any part for vertex structure
 // here we put (0,0.5) to left part and (0.5,1) to right part
 // BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
@@ -855,7 +652,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          //---> interpolation
@@ -863,11 +659,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
          int k = 0;
          while (varl) // run along variables
          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+            //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
            //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
          weight[j] = 1;
        }
      }
      if (Bp == ble)
@@ -876,35 +675,97 @@ void Patch::Interp_Points(MyList<var> *VarList,
    }
  }
-  // Collect unique global owner ranks and translate to local ranks in Comm_here
+  MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, Comm_here);
-  // Then broadcast each owner's points via MPI_Bcast on Comm_here
+  int *Weight;
-  {
+  Weight = new int[NN];
-    int j = 0;
+  MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, Comm_here);
    while (j < NN)
    {
      int cur_owner_global = owner_rank[j];
      if (cur_owner_global < 0)
      {
        // Point not found — skip (error check disabled for sub-communicator levels)
        j++;
        continue;
      }
      // Translate global rank to local rank in Comm_here
      int cur_owner_local;
      MPI_Group_translate_ranks(world_group, 1, &cur_owner_global, local_group, &cur_owner_local);
-      // Find contiguous run of points with the same owner
+  //  misc::tillherecheck("print me");
-      int jstart = j;
+  //  if(lmyrank == 0) cout<<"myrank = "<<myrank<<"print me"<<endl;
-      while (j < NN && owner_rank[j] == cur_owner_global)
+
-        j++;
+  for (int i = 0; i < NN; i++)
-      int count = (j - jstart) * num_var;
+  {
-      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner_local, Comm_here);
+    if (Weight[i] > 1)
    {
      if (lmyrank == 0)
        cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
      for (int j = 0; j < num_var; j++)
        Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
    }
 #if 0 // for not involved levels, this may fail     
     else if(Weight[i] == 0 && lmyrank == 0)
     {
       cout<<"ERROR: Patch::Interp_Points fails to find point (";
       for(int j=0;j<dim;j++)
       {
 	  cout<<XX[j][i];
 	  if(j<dim-1) cout<<",";
 	  else        cout<<")";
       }
       cout<<" on Patch (";
       for(int j=0;j<dim;j++)
       {
 	  cout<<bbox[j]<<"+"<<lli[j]*getdX(j);
 	  if(j<dim-1) cout<<",";
 	  else        cout<<")--";
       }
       cout<<"(";
       for(int j=0;j<dim;j++)
       {
 	  cout<<bbox[dim+j]<<"-"<<uui[j]*getdX(j);
 	  if(j<dim-1) cout<<",";
 	  else        cout<<")"<<endl;
       }
 #if 0
       checkBlock();
 #else
  cout<<"splited domains:"<<endl;
  {
     MyList<Block> *Bp=blb;
     while(Bp)
     {
 	Block *BP=Bp->data;
 	for(int i=0;i<dim;i++)
 	{
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif    
          llb[i] = (feq(BP->bbox[i]    ,bbox[i]    ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i]     : BP->bbox[i]    +(ghost_width-0.5)*DH[i];
          uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-(ghost_width-0.5)*DH[i];
 #else
 #ifdef Cell
          llb[i] = (feq(BP->bbox[i]    ,bbox[i]    ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i]     : BP->bbox[i]    +ghost_width*DH[i];
          uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-ghost_width*DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif 
 	}       
       cout<<"(";
       for(int j=0;j<dim;j++)
       {
 	  cout<<llb[j]<<":"<<uub[j];
 	  if(j<dim-1) cout<<",";
 	  else        cout<<")"<<endl;
       }
 	if(Bp == ble) break;
 	Bp=Bp->next;
     }
  }
 #endif       
       MPI_Abort(MPI_COMM_WORLD,1);
     }
 #endif
  }
-  MPI_Group_free(&world_group);
+  delete[] shellf;
-  MPI_Group_free(&local_group);
+  delete[] weight;
-  delete[] owner_rank;
+  delete[] Weight;
  delete[] DH;
  delete[] llb;
  delete[] uub;
 }
 void Patch::checkBlock()
 {
--- a/AMSS_NCKU_source/MPatch.h
+++ b/AMSS_NCKU_source/MPatch.h
@@ -2,11 +2,7 @@
 #ifndef PATCH_H
 #define PATCH_H
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "MyList.h"
 #include "Block.h"
 #include "var.h"
@@ -43,10 +39,6 @@ public:
   bool Find_Point(double *XX);
   void Interp_Points(MyList<var> *VarList,
                      int NN, double **XX,
                      double *Shellf, int Symmetry,
                      int Nmin_consumer, int Nmax_consumer);
   void Interp_Points(MyList<var> *VarList,
                      int NN, double **XX,
                      double *Shellf, int Symmetry, MPI_Comm Comm_here);
--- a/AMSS_NCKU_source/Newton.C
+++ b/AMSS_NCKU_source/Newton.C
@@ -8,11 +8,7 @@
 #include <limits.h>
 #include <float.h>
 #include <math.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "util_Table.h"
 #include "cctk.h"
--- a/AMSS_NCKU_source/NullShellPatch.h
+++ b/AMSS_NCKU_source/NullShellPatch.h
@@ -23,11 +23,7 @@ using namespace std;
 #include <complex.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "MyList.h"
 #include "Block.h"
 #include "Parallel.h"
--- a/AMSS_NCKU_source/NullShellPatch2.h
+++ b/AMSS_NCKU_source/NullShellPatch2.h
@@ -23,11 +23,7 @@ using namespace std;
 #include <complex.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "MyList.h"
 #include "Block.h"
 #include "Parallel.h"
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -3756,484 +3756,6 @@ void Parallel::Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
  delete[] transfer_src;
  delete[] transfer_dst;
 }
 // Sync_merged: gather the grid segment lists of every intra-patch ghost
 // exchange and of the inter-patch buffer exchange into one source list and
 // one destination list per rank, then move all of the data with a single
 // transfer() call instead of one call per exchange.
 //   PatL     - list of patches to synchronize
 //   VarList  - variables to exchange (used as both source and target)
 //   Symmetry - symmetry flag forwarded to build_owned_gsl() and transfer()
 void Parallel::Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
 {
  int nranks;
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  // Per-rank heads of the accumulated source / destination segment lists.
  MyList<Parallel::gridseg> **all_src = new MyList<Parallel::gridseg> *[nranks];
  MyList<Parallel::gridseg> **all_dst = new MyList<Parallel::gridseg> *[nranks];
  for (int r = 0; r < nranks; r++)
  {
    all_src[r] = 0;
    all_dst[r] = 0;
  }
  // Phase A: ghost exchange segments inside each patch.
  for (MyList<Patch> *pl = PatL; pl; pl = pl->next)
  {
    Patch *patch = pl->data;
    MyList<Parallel::gridseg> *ghost = build_ghost_gsl(patch);
    for (int r = 0; r < nranks; r++)
    {
      MyList<Parallel::gridseg> *owned = build_owned_gsl0(patch, r);
      MyList<Parallel::gridseg> *piece_src = 0;
      MyList<Parallel::gridseg> *piece_dst = 0;
      build_gstl(owned, ghost, &piece_src, &piece_dst);
      // The first piece for a rank becomes the list head; later pieces are
      // attached with catList().
      if (piece_src)
      {
        if (!all_src[r])
          all_src[r] = piece_src;
        else
          all_src[r]->catList(piece_src);
      }
      if (piece_dst)
      {
        if (!all_dst[r])
          all_dst[r] = piece_dst;
        else
          all_dst[r]->catList(piece_dst);
      }
      if (owned)
        owned->destroyList();
    }
    if (ghost)
      ghost->destroyList();
  }
  // Phase B: buffer exchange segments between patches.
  MyList<Parallel::gridseg> *buffer = build_buffer_gsl(PatL);
  for (int r = 0; r < nranks; r++)
  {
    MyList<Parallel::gridseg> *owned = build_owned_gsl(PatL, r, 5, Symmetry);
    MyList<Parallel::gridseg> *piece_src = 0;
    MyList<Parallel::gridseg> *piece_dst = 0;
    build_gstl(owned, buffer, &piece_src, &piece_dst);
    if (piece_src)
    {
      if (!all_src[r])
        all_src[r] = piece_src;
      else
        all_src[r]->catList(piece_src);
    }
    if (piece_dst)
    {
      if (!all_dst[r])
        all_dst[r] = piece_dst;
      else
        all_dst[r]->catList(piece_dst);
    }
    if (owned)
      owned->destroyList();
  }
  if (buffer)
    buffer->destroyList();
  // Phase C: one transfer for everything collected above.
  transfer(all_src, all_dst, VarList, VarList, Symmetry);
  // Phase D: release the merged lists and the two head arrays.
  for (int r = 0; r < nranks; r++)
  {
    if (all_src[r])
      all_src[r]->destroyList();
    if (all_dst[r])
      all_dst[r]->destroyList();
  }
  delete[] all_src;
  delete[] all_dst;
 }
 // SyncCache constructor: start as an empty, invalid cache -- no lists, no
 // buffers, no request/status arrays; every pointer is null and every count
 // is zero until Sync_cached() or Sync_start() allocates them.
 Parallel::SyncCache::SyncCache()
    : valid(false),
      cpusize(0),
      combined_src(0),
      combined_dst(0),
      send_lengths(0),
      recv_lengths(0),
      send_bufs(0),
      recv_bufs(0),
      send_buf_caps(0),
      recv_buf_caps(0),
      reqs(0),
      stats(0),
      max_reqs(0)
 {
 }
 // SyncCache invalidate: free grid segment lists but keep buffers
 void Parallel::SyncCache::invalidate()
 {
  if (!valid)
    return;
  for (int i = 0; i < cpusize; i++)
  {
    if (combined_src[i])
      combined_src[i]->destroyList();
    if (combined_dst[i])
      combined_dst[i]->destroyList();
    combined_src[i] = combined_dst[i] = 0;
    send_lengths[i] = recv_lengths[i] = 0;
  }
  valid = false;
 }
 // SyncCache::destroy: release everything the cache owns -- the segment lists
 // (through invalidate()), the per-rank message buffers and all bookkeeping
 // arrays -- and reset the pointers and counts to zero.
 void Parallel::SyncCache::destroy()
 {
  invalidate();
  // delete[] applied to a null pointer is a no-op, so no guards are needed.
  delete[] combined_src;
  delete[] combined_dst;
  delete[] send_lengths;
  delete[] recv_lengths;
  delete[] send_buf_caps;
  delete[] recv_buf_caps;
  // Free the per-rank payload buffers before the tables that point at them.
  for (int r = 0; r < cpusize; r++)
  {
    if (send_bufs)
      delete[] send_bufs[r];
    if (recv_bufs)
      delete[] recv_bufs[r];
  }
  delete[] send_bufs;
  delete[] recv_bufs;
  delete[] reqs;
  delete[] stats;
  combined_src = 0;
  combined_dst = 0;
  send_lengths = 0;
  recv_lengths = 0;
  send_buf_caps = 0;
  recv_buf_caps = 0;
  send_bufs = 0;
  recv_bufs = 0;
  reqs = 0;
  stats = 0;
  cpusize = 0;
  max_reqs = 0;
 }
 // transfer_cached: move data between ranks along the given per-rank segment
 // lists, reusing the message buffers and the request/status arrays kept in
 // `cache`.  A buffer is reallocated only when the needed length exceeds its
 // recorded capacity.  Messages use tag 1 on MPI_COMM_WORLD.
 //   src, dst           - per-rank source / destination segment lists
 //   VarList1, VarList2 - variable lists forwarded to data_packer()
 //   Symmetry           - symmetry flag forwarded to data_packer()
 //   cache              - provides buffers, lengths, requests and statuses;
 //                        its arrays must already be allocated
 void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel::gridseg> **dst,
                                MyList<var> *VarList1, MyList<var> *VarList2,
                                int Symmetry, SyncCache &cache)
 {
  int me;
  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  const int nranks = cache.cpusize;
  int pending = 0; // number of non-blocking requests posted so far
  for (int peer = 0; peer < nranks; peer++)
  {
    if (peer != me)
    {
      // Outgoing message: the first data_packer() call (null buffer) gives
      // the length, the second one fills the buffer.
      const int out_len = data_packer(0, src[me], dst[me], peer, PACK, VarList1, VarList2, Symmetry);
      cache.send_lengths[peer] = out_len;
      if (out_len > 0)
      {
        if (cache.send_buf_caps[peer] < out_len)
        {
          if (cache.send_bufs[peer])
            delete[] cache.send_bufs[peer];
          cache.send_bufs[peer] = new double[out_len];
          cache.send_buf_caps[peer] = out_len;
        }
        data_packer(cache.send_bufs[peer], src[me], dst[me], peer, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)cache.send_bufs[peer], out_len, MPI_DOUBLE, peer, 1, MPI_COMM_WORLD, &cache.reqs[pending]);
        pending++;
      }
      // Incoming message: only the length is needed before posting the recv.
      const int in_len = data_packer(0, src[peer], dst[peer], peer, UNPACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[peer] = in_len;
      if (in_len > 0)
      {
        if (cache.recv_buf_caps[peer] < in_len)
        {
          if (cache.recv_bufs[peer])
            delete[] cache.recv_bufs[peer];
          cache.recv_bufs[peer] = new double[in_len];
          cache.recv_buf_caps[peer] = in_len;
        }
        MPI_Irecv((void *)cache.recv_bufs[peer], in_len, MPI_DOUBLE, peer, 1, MPI_COMM_WORLD, &cache.reqs[pending]);
        pending++;
      }
    }
    else
    {
      // Data staying on this rank: pack it straight into the receive buffer
      // so the unpack loop below treats it like any other message.
      const int self_len = data_packer(0, src[me], dst[me], me, PACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[me] = self_len;
      if (self_len > 0)
      {
        if (cache.recv_buf_caps[me] < self_len)
        {
          if (cache.recv_bufs[me])
            delete[] cache.recv_bufs[me];
          cache.recv_bufs[me] = new double[self_len];
          cache.recv_buf_caps[me] = self_len;
        }
        data_packer(cache.recv_bufs[me], src[me], dst[me], me, PACK, VarList1, VarList2, Symmetry);
      }
    }
  }
  // Complete all posted sends and receives before touching the buffers.
  MPI_Waitall(pending, cache.reqs, cache.stats);
  // Unpack whatever arrived (including the local copy).
  for (int peer = 0; peer < nranks; peer++)
  {
    if (!cache.recv_bufs[peer] || cache.recv_lengths[peer] <= 0)
      continue;
    data_packer(cache.recv_bufs[peer], src[peer], dst[peer], peer, UNPACK, VarList1, VarList2, Symmetry);
  }
 }
 // Sync_cached: synchronize the patches in PatL for the variables in VarList.
 // The per-rank grid segment lists (intra-patch ghost segments plus
 // inter-patch buffer segments) are built only while cache.valid is false and
 // are then kept in `cache`; the data movement always goes through
 // transfer_cached(), which reuses the cache's buffers.
 void Parallel::Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache)
 {
  if (!cache.valid)
  {
    int nranks;
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
    cache.cpusize = nranks;
    // First use of this cache: create the per-rank bookkeeping arrays.
    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[nranks];
      cache.combined_dst = new MyList<Parallel::gridseg> *[nranks];
      cache.send_lengths = new int[nranks];
      cache.recv_lengths = new int[nranks];
      cache.send_bufs = new double *[nranks];
      cache.recv_bufs = new double *[nranks];
      cache.send_buf_caps = new int[nranks];
      cache.recv_buf_caps = new int[nranks];
      for (int r = 0; r < nranks; r++)
      {
        cache.send_bufs[r] = 0;
        cache.recv_bufs[r] = 0;
        cache.send_buf_caps[r] = 0;
        cache.recv_buf_caps[r] = 0;
      }
      // At most one send and one receive request per rank.
      cache.max_reqs = 2 * nranks;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }
    for (int r = 0; r < nranks; r++)
    {
      cache.combined_src[r] = 0;
      cache.combined_dst[r] = 0;
      cache.send_lengths[r] = 0;
      cache.recv_lengths[r] = 0;
    }
    MyList<Parallel::gridseg> **all_src = cache.combined_src;
    MyList<Parallel::gridseg> **all_dst = cache.combined_dst;
    // Intra-patch ghost exchange segments, one patch at a time.
    for (MyList<Patch> *pl = PatL; pl; pl = pl->next)
    {
      Patch *patch = pl->data;
      MyList<Parallel::gridseg> *ghost = build_ghost_gsl(patch);
      for (int r = 0; r < nranks; r++)
      {
        MyList<Parallel::gridseg> *owned = build_owned_gsl0(patch, r);
        MyList<Parallel::gridseg> *piece_src = 0;
        MyList<Parallel::gridseg> *piece_dst = 0;
        build_gstl(owned, ghost, &piece_src, &piece_dst);
        // The first piece for a rank becomes the list head; later pieces are
        // attached with catList().
        if (piece_src)
        {
          if (!all_src[r])
            all_src[r] = piece_src;
          else
            all_src[r]->catList(piece_src);
        }
        if (piece_dst)
        {
          if (!all_dst[r])
            all_dst[r] = piece_dst;
          else
            all_dst[r]->catList(piece_dst);
        }
        if (owned)
          owned->destroyList();
      }
      if (ghost)
        ghost->destroyList();
    }
    // Inter-patch buffer exchange segments.
    MyList<Parallel::gridseg> *buffer = build_buffer_gsl(PatL);
    for (int r = 0; r < nranks; r++)
    {
      MyList<Parallel::gridseg> *owned = build_owned_gsl(PatL, r, 5, Symmetry);
      MyList<Parallel::gridseg> *piece_src = 0;
      MyList<Parallel::gridseg> *piece_dst = 0;
      build_gstl(owned, buffer, &piece_src, &piece_dst);
      if (piece_src)
      {
        if (!all_src[r])
          all_src[r] = piece_src;
        else
          all_src[r]->catList(piece_src);
      }
      if (piece_dst)
      {
        if (!all_dst[r])
          all_dst[r] = piece_dst;
        else
          all_dst[r]->catList(piece_dst);
      }
      if (owned)
        owned->destroyList();
    }
    if (buffer)
      buffer->destroyList();
    cache.valid = true;
  }
  // Exchange the data along the cached lists, reusing the cached buffers.
  transfer_cached(cache.combined_src, cache.combined_dst, VarList, VarList, Symmetry, cache);
 }
 // Sync_start: first half of a split synchronization.  Builds the cached
 // per-rank segment lists if cache.valid is false, then packs the outgoing
 // data, posts MPI_Isend / MPI_Irecv (tag 2, MPI_COMM_WORLD) and returns
 // without waiting.  The number of posted requests is recorded in
 // state.req_no and state.active is set; Sync_finish() waits and unpacks.
 void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
                           SyncCache &cache, AsyncSyncState &state)
 {
  // Make sure the cached segment lists exist.
  if (!cache.valid)
  {
    int nranks;
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
    cache.cpusize = nranks;
    // First use of this cache: create the per-rank bookkeeping arrays.
    if (!cache.combined_src)
    {
      cache.combined_src = new MyList<Parallel::gridseg> *[nranks];
      cache.combined_dst = new MyList<Parallel::gridseg> *[nranks];
      cache.send_lengths = new int[nranks];
      cache.recv_lengths = new int[nranks];
      cache.send_bufs = new double *[nranks];
      cache.recv_bufs = new double *[nranks];
      cache.send_buf_caps = new int[nranks];
      cache.recv_buf_caps = new int[nranks];
      for (int r = 0; r < nranks; r++)
      {
        cache.send_bufs[r] = 0;
        cache.recv_bufs[r] = 0;
        cache.send_buf_caps[r] = 0;
        cache.recv_buf_caps[r] = 0;
      }
      // At most one send and one receive request per rank.
      cache.max_reqs = 2 * nranks;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }
    for (int r = 0; r < nranks; r++)
    {
      cache.combined_src[r] = 0;
      cache.combined_dst[r] = 0;
      cache.send_lengths[r] = 0;
      cache.recv_lengths[r] = 0;
    }
    // Intra-patch ghost exchange segments, one patch at a time.
    for (MyList<Patch> *pl = PatL; pl; pl = pl->next)
    {
      Patch *patch = pl->data;
      MyList<Parallel::gridseg> *ghost = build_ghost_gsl(patch);
      for (int r = 0; r < nranks; r++)
      {
        MyList<Parallel::gridseg> *owned = build_owned_gsl0(patch, r);
        MyList<Parallel::gridseg> *piece_src = 0;
        MyList<Parallel::gridseg> *piece_dst = 0;
        build_gstl(owned, ghost, &piece_src, &piece_dst);
        // The first piece for a rank becomes the list head; later pieces are
        // attached with catList().
        if (piece_src)
        {
          if (!cache.combined_src[r])
            cache.combined_src[r] = piece_src;
          else
            cache.combined_src[r]->catList(piece_src);
        }
        if (piece_dst)
        {
          if (!cache.combined_dst[r])
            cache.combined_dst[r] = piece_dst;
          else
            cache.combined_dst[r]->catList(piece_dst);
        }
        if (owned)
          owned->destroyList();
      }
      if (ghost)
        ghost->destroyList();
    }
    // Inter-patch buffer exchange segments.
    MyList<Parallel::gridseg> *buffer = build_buffer_gsl(PatL);
    for (int r = 0; r < nranks; r++)
    {
      MyList<Parallel::gridseg> *owned = build_owned_gsl(PatL, r, 5, Symmetry);
      MyList<Parallel::gridseg> *piece_src = 0;
      MyList<Parallel::gridseg> *piece_dst = 0;
      build_gstl(owned, buffer, &piece_src, &piece_dst);
      if (piece_src)
      {
        if (!cache.combined_src[r])
          cache.combined_src[r] = piece_src;
        else
          cache.combined_src[r]->catList(piece_src);
      }
      if (piece_dst)
      {
        if (!cache.combined_dst[r])
          cache.combined_dst[r] = piece_dst;
        else
          cache.combined_dst[r]->catList(piece_dst);
      }
      if (owned)
        owned->destroyList();
    }
    if (buffer)
      buffer->destroyList();
    cache.valid = true;
  }
  // Pack and post the non-blocking MPI operations.
  int me;
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  const int nranks = cache.cpusize;
  state.req_no = 0;
  state.active = true;
  MyList<Parallel::gridseg> **src = cache.combined_src;
  MyList<Parallel::gridseg> **dst = cache.combined_dst;
  for (int peer = 0; peer < nranks; peer++)
  {
    if (peer != me)
    {
      // Outgoing message: the first data_packer() call (null buffer) gives
      // the length, the second one fills the buffer.
      const int out_len = data_packer(0, src[me], dst[me], peer, PACK, VarList, VarList, Symmetry);
      cache.send_lengths[peer] = out_len;
      if (out_len > 0)
      {
        if (cache.send_buf_caps[peer] < out_len)
        {
          if (cache.send_bufs[peer])
            delete[] cache.send_bufs[peer];
          cache.send_bufs[peer] = new double[out_len];
          cache.send_buf_caps[peer] = out_len;
        }
        data_packer(cache.send_bufs[peer], src[me], dst[me], peer, PACK, VarList, VarList, Symmetry);
        MPI_Isend((void *)cache.send_bufs[peer], out_len, MPI_DOUBLE, peer, 2, MPI_COMM_WORLD, &cache.reqs[state.req_no]);
        state.req_no++;
      }
      // Incoming message: only the length is needed before posting the recv.
      const int in_len = data_packer(0, src[peer], dst[peer], peer, UNPACK, VarList, VarList, Symmetry);
      cache.recv_lengths[peer] = in_len;
      if (in_len > 0)
      {
        if (cache.recv_buf_caps[peer] < in_len)
        {
          if (cache.recv_bufs[peer])
            delete[] cache.recv_bufs[peer];
          cache.recv_bufs[peer] = new double[in_len];
          cache.recv_buf_caps[peer] = in_len;
        }
        MPI_Irecv((void *)cache.recv_bufs[peer], in_len, MPI_DOUBLE, peer, 2, MPI_COMM_WORLD, &cache.reqs[state.req_no]);
        state.req_no++;
      }
    }
    else
    {
      // Data staying on this rank: pack it straight into the receive buffer
      // so that Sync_finish() unpacks it like any other message.
      const int self_len = data_packer(0, src[me], dst[me], me, PACK, VarList, VarList, Symmetry);
      cache.recv_lengths[me] = self_len;
      if (self_len > 0)
      {
        if (cache.recv_buf_caps[me] < self_len)
        {
          if (cache.recv_bufs[me])
            delete[] cache.recv_bufs[me];
          cache.recv_bufs[me] = new double[self_len];
          cache.recv_buf_caps[me] = self_len;
        }
        data_packer(cache.recv_bufs[me], src[me], dst[me], me, PACK, VarList, VarList, Symmetry);
      }
    }
  }
 }
 // Sync_finish: second half of a split synchronization.  If `state` is
 // active, wait for the state.req_no requests stored in cache.reqs, unpack
 // every non-empty receive buffer along the cached segment lists, and mark
 // the state inactive.  Does nothing when no exchange is in flight.
 void Parallel::Sync_finish(SyncCache &cache, AsyncSyncState &state,
                            MyList<var> *VarList, int Symmetry)
 {
  if (state.active)
  {
    MPI_Waitall(state.req_no, cache.reqs, cache.stats);
    const int nranks = cache.cpusize;
    for (int peer = 0; peer < nranks; peer++)
    {
      // Skip ranks that sent nothing (no buffer or zero length).
      if (!cache.recv_bufs[peer] || cache.recv_lengths[peer] <= 0)
        continue;
      data_packer(cache.recv_bufs[peer], cache.combined_src[peer], cache.combined_dst[peer],
                  peer, UNPACK, VarList, VarList, Symmetry);
    }
    state.active = false;
  }
 }
 // collect buffer grid segments or blocks for the periodic boundary condition of given patch
 // ---------------------------------------------------
 // |con |                                       |con |
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -81,42 +81,6 @@ namespace Parallel
                   int Symmetry);
  void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
  void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
  void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
  // SyncCache: state kept between calls of Sync_cached() / Sync_start() so
  // that the per-rank grid segment lists and the MPI message buffers do not
  // have to be rebuilt on every synchronization.  The struct declares no
  // destructor; destroy() has to be called explicitly to release its memory.
  struct SyncCache {
    bool valid;                     // true once the segment lists have been built; cleared by invalidate()
    int cpusize;                    // size of MPI_COMM_WORLD; length of all per-rank arrays below
    MyList<gridseg> **combined_src; // per-rank head of the merged source segment list
    MyList<gridseg> **combined_dst; // per-rank head of the merged destination segment list
    int *send_lengths;              // per-rank length (in doubles) of the last packed outgoing message
    int *recv_lengths;              // per-rank length (in doubles) of the last expected incoming message
    double **send_bufs;             // per-rank outgoing message buffer (null until first needed)
    double **recv_bufs;             // per-rank incoming message buffer (null until first needed)
    int *send_buf_caps;             // allocated size (in doubles) of each send_bufs entry
    int *recv_buf_caps;             // allocated size (in doubles) of each recv_bufs entry
    MPI_Request *reqs;              // request array with max_reqs entries
    MPI_Status *stats;              // status array with max_reqs entries
    int max_reqs;                   // capacity of reqs / stats (set to 2 * cpusize on allocation)
    SyncCache();                    // empty, invalid cache; nothing allocated
    void invalidate();              // free the segment lists but keep the buffers
    void destroy();                 // free everything and reset all members
  };
  void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
  void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);
  // AsyncSyncState: bookkeeping for one in-flight Sync_start() / Sync_finish()
  // pair.
  struct AsyncSyncState {
    int req_no;  // number of MPI requests posted by Sync_start()
    bool active; // set by Sync_start(), cleared by Sync_finish()
    AsyncSyncState() : req_no(0), active(false) {}
  };
  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
                  SyncCache &cache, AsyncSyncState &state);
  void Sync_finish(SyncCache &cache, AsyncSyncState &state,
                   MyList<var> *VarList, int Symmetry);
  void OutBdLow2Hi(Patch *Patc, Patch *Patf,
                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                   int Symmetry);
--- a/AMSS_NCKU_source/ShellPatch.h
+++ b/AMSS_NCKU_source/ShellPatch.h
@@ -2,11 +2,7 @@
 #ifndef SHELLPATCH_H
 #define SHELLPATCH_H
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "MyList.h"
 #include "Block.h"
 #include "Parallel.h"
--- a/AMSS_NCKU_source/Z4c_class.h
+++ b/AMSS_NCKU_source/Z4c_class.h
@@ -19,11 +19,7 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "cgh.h"
 #include "ShellPatch.h"
--- a/AMSS_NCKU_source/bssnEM_class.h
+++ b/AMSS_NCKU_source/bssnEM_class.h
@@ -19,11 +19,7 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "cgh.h"
 #include "ShellPatch.h"
--- a/AMSS_NCKU_source/bssnEScalar_class.h
+++ b/AMSS_NCKU_source/bssnEScalar_class.h
@@ -19,11 +19,7 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "cgh.h"
 #include "ShellPatch.h"
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -730,12 +730,6 @@ void bssn_class::Initialize()
    PhysTime = StartTime;
    Setup_Black_Hole_position();
  }
  // Initialize sync caches (per-level, for predictor and corrector)
  sync_cache_pre = new Parallel::SyncCache[GH->levels];
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
 }
 //================================================================================================
@@ -987,32 +981,6 @@ bssn_class::~bssn_class()
  delete Azzz;
 #endif
  // Destroy sync caches before GH
  if (sync_cache_pre)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_pre[i].destroy();
    delete[] sync_cache_pre;
  }
  if (sync_cache_cor)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_cor[i].destroy();
    delete[] sync_cache_cor;
  }
  if (sync_cache_rp_coarse)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_rp_coarse[i].destroy();
    delete[] sync_cache_rp_coarse;
  }
  if (sync_cache_rp_fine)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_rp_fine[i].destroy();
    delete[] sync_cache_rp_fine;
  }
  delete GH;
 #ifdef WithShell
  delete SH;
@@ -2213,7 +2181,6 @@ void bssn_class::Evolve(int Steps)
    GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
               SynchList_cor, OldStateList, StateList, SynchList_pre,
               fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
    for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2429,7 +2396,6 @@ void bssn_class::RecursiveStep(int lev)
  GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 }
@@ -2608,7 +2574,6 @@ void bssn_class::ParallelStep()
  GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 }
@@ -2775,7 +2740,6 @@ void bssn_class::ParallelStep()
        GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
                            SynchList_cor, OldStateList, StateList, SynchList_pre,
                            fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
        //               a_stream.clear();
        //               a_stream.str("");
@@ -2790,7 +2754,6 @@ void bssn_class::ParallelStep()
      GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                          SynchList_cor, OldStateList, StateList, SynchList_pre,
                          fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
      //               a_stream.clear();
      //               a_stream.str("");
@@ -2809,7 +2772,6 @@ void bssn_class::ParallelStep()
          GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
          //               a_stream.clear();
          //               a_stream.str("");
@@ -2825,7 +2787,6 @@ void bssn_class::ParallelStep()
          GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
          //               a_stream.clear();
          //               a_stream.str("");
@@ -3197,7 +3158,21 @@ void bssn_class::Step(int lev, int YN)
    }
    Pp = Pp->next;
  }
-  // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+  // check error information
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #ifdef WithShell
  // evolve Shell Patches
@@ -3215,9 +3190,9 @@ void bssn_class::Step(int lev, int YN)
        {
 #if (AGM == 0)
          f_enforce_ga(cg->shape,
-                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
+                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                       cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
-                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
+                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                       cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
@@ -3341,16 +3316,25 @@ void bssn_class::Step(int lev, int YN)
 #endif
  }
-  // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+  // check error information
  MPI_Request err_req;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
-  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -3363,29 +3347,12 @@ void bssn_class::Step(int lev, int YN)
    {
      prev_clock = curr_clock;
      curr_clock = clock();
-      cout << " Shell stuff synchronization used "
+      cout << " Shell stuff synchronization used " 
-           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
           << " seconds! " << endl;
    }
  }
 #endif
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #ifdef WithShell
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
 #if (MAPBH == 0)
  // for black hole position
@@ -3561,7 +3528,24 @@ void bssn_class::Step(int lev, int YN)
      Pp = Pp->next;
    }
-    // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+    // check error information
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime 
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #ifdef WithShell
    // evolve Shell Patches
@@ -3579,9 +3563,9 @@ void bssn_class::Step(int lev, int YN)
          {
 #if (AGM == 0)
            f_enforce_ga(cg->shape,
-                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
+                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
+                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #elif (AGM == 1)
            if (iter_count == 3)
@@ -3701,16 +3685,26 @@ void bssn_class::Step(int lev, int YN)
        sPp = sPp->next;
      }
    }
-    // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" 
                                << iter_count << " variables at t = " 
                                << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
-    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -3723,31 +3717,12 @@ void bssn_class::Step(int lev, int YN)
      {
        prev_clock = curr_clock;
        curr_clock = clock();
-        cout << " Shell stuff synchronization used "
+        cout << " Shell stuff synchronization used " 
-             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
             << " seconds! " << endl;
      }
    }
 #endif
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #ifdef WithShell
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
 #if (MAPBH == 0)
    // for black hole position
@@ -4059,7 +4034,22 @@ void bssn_class::Step(int lev, int YN)
    }
    Pp = Pp->next;
  }
-  // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+  // check error information
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime 
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #ifdef WithShell
  // evolve Shell Patches
@@ -4077,15 +4067,15 @@ void bssn_class::Step(int lev, int YN)
        {
 #if (AGM == 0)
          f_enforce_ga(cg->shape,
-                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
+                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                       cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
-                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
+                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                       cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
          if (f_compute_rhs_bssn_ss(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
-                                    cg->fgfs[fngfs + ShellPatch::gx],
+                                    cg->fgfs[fngfs + ShellPatch::gx], 
-                                    cg->fgfs[fngfs + ShellPatch::gy],
+                                    cg->fgfs[fngfs + ShellPatch::gy], 
                                    cg->fgfs[fngfs + ShellPatch::gz],
                                    cg->fgfs[fngfs + ShellPatch::drhodx], 
                                    cg->fgfs[fngfs + ShellPatch::drhody], 
@@ -4200,16 +4190,25 @@ void bssn_class::Step(int lev, int YN)
  }
 #endif
  }
-  // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+  // check error information
  MPI_Request err_req;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " 
                              << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
-  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -4222,27 +4221,9 @@ void bssn_class::Step(int lev, int YN)
    {
      prev_clock = curr_clock;
      curr_clock = clock();
-      cout << " Shell stuff synchronization used "
+      cout << " Shell stuff synchronization used " 
-           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+      << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
-           << " seconds! " << endl;
+      << " seconds! " << endl;
    }
  }
 #endif
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #ifdef WithShell
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
@@ -4405,7 +4386,23 @@ void bssn_class::Step(int lev, int YN)
      Pp = Pp->next;
    }
-    // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+    // check error information
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime 
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #ifdef WithShell
    // evolve Shell Patches
@@ -4423,9 +4420,9 @@ void bssn_class::Step(int lev, int YN)
          {
 #if (AGM == 0)
            f_enforce_ga(cg->shape,
-                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
+                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
+                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #elif (AGM == 1)
            if (iter_count == 3)
@@ -4545,16 +4542,25 @@ void bssn_class::Step(int lev, int YN)
        sPp = sPp->next;
      }
    }
-    // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
-    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -4567,30 +4573,11 @@ void bssn_class::Step(int lev, int YN)
      {
        prev_clock = curr_clock;
        curr_clock = clock();
-        cout << " Shell stuff synchronization used "
+        cout << " Shell stuff synchronization used " 
-             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
             << " seconds! " << endl;
      }
    }
 #endif
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #ifdef WithShell
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
@@ -4956,19 +4943,11 @@ void bssn_class::Step(int lev, int YN)
  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation");
-  // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+  // check error information
  MPI_Request err_req;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
  }
  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
@@ -4980,6 +4959,10 @@ void bssn_class::Step(int lev, int YN)
    }
  }
  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
 #if (MAPBH == 0)
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -5157,34 +5140,30 @@ void bssn_class::Step(int lev, int YN)
    //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check");
-    // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
    }
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
-          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
+          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count 
-                                << " variables at t = " << PhysTime
+                                << " variables at t = " << PhysTime 
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
 #if (MAPBH == 0)
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
@@ -5468,11 +5447,21 @@ void bssn_class::SHStep()
 #if (PSTR == 1 || PSTR == 2)
 //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor's error check");
 #endif
-  // Non-blocking error reduction overlapped with Synch to hide Allreduce latency
+  // check error information
  MPI_Request err_req;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
  {
@@ -5484,25 +5473,12 @@ void bssn_class::SHStep()
    {
      prev_clock = curr_clock;
      curr_clock = clock();
-      cout << " Shell stuff synchronization used "
+      cout << " Shell stuff synchronization used " 
-           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
           << " seconds! " << endl;
    }
  }
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
  // corrector
  for (iter_count = 1; iter_count < 4; iter_count++)
  {
@@ -5645,11 +5621,21 @@ void bssn_class::SHStep()
        sPp = sPp->next;
      }
    }
-    // Non-blocking error reduction overlapped with Synch to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    {
@@ -5661,26 +5647,12 @@ void bssn_class::SHStep()
      {
        prev_clock = curr_clock;
        curr_clock = clock();
-        cout << " Shell stuff synchronization used "
+        cout << " Shell stuff synchronization used " 
-             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
+             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
             << " seconds! " << endl;
      }
    }
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    sPp = SH->PatL;
    while (sPp)
    {
@@ -5809,7 +5781,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //       misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #if (PSTR == 1 || PSTR == 2)
 //       a_stream.clear();
@@ -5870,7 +5842,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //       misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
 #if (PSTR == 1 || PSTR == 2)
 //       a_stream.clear();
@@ -5908,7 +5880,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
    }
-    Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
+    Parallel::Sync(GH->PatL[lev], SL, Symmetry);
 #if (PSTR == 1 || PSTR == 2)
 //    a_stream.clear();
@@ -5966,7 +5938,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
@@ -5998,7 +5970,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
@@ -6022,7 +5994,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
 #endif
    }
-    Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
+    Parallel::Sync(GH->PatL[lev], SL, Symmetry);
  }
 }
@@ -6073,7 +6045,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
@@ -6107,7 +6079,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
@@ -6131,7 +6103,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
 #endif
    }
-    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
  }
 }
@@ -6214,10 +6186,10 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
 #else
      Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
    }
-    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
  }
 }
 #undef MIXOUTB
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -19,11 +19,7 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "macrodef.h"
 #include "cgh.h"
@@ -130,11 +126,6 @@ public:
       MyList<var> *OldStateList, *DumpList;
       MyList<var> *ConstraintList;
       Parallel::SyncCache *sync_cache_pre;  // per-level cache for predictor sync
       Parallel::SyncCache *sync_cache_cor;  // per-level cache for corrector sync
       Parallel::SyncCache *sync_cache_rp_coarse;  // RestrictProlong sync on PatL[lev-1]
       Parallel::SyncCache *sync_cache_rp_fine;    // RestrictProlong sync on PatL[lev]
       monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
       monitor *ConVMonitor;
       surface_integral *Waveshell;
--- a/AMSS_NCKU_source/bssn_gpu_class.h
+++ b/AMSS_NCKU_source/bssn_gpu_class.h
@@ -19,11 +19,7 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "macrodef.h"
 #include "cgh.h"
--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
@@ -106,8 +106,7 @@
  call getpbh(BHN,Porg,Mass)
 #endif
-!!! sanity check (disabled in production builds for performance)
+!!! sanity check
 #ifdef DEBUG
  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
@@ -137,7 +136,6 @@
     gont = 1
     return
  endif
 #endif
  PI = dacos(-ONE)
--- a/AMSS_NCKU_source/cgh.C
+++ b/AMSS_NCKU_source/cgh.C
@@ -20,11 +20,7 @@ using namespace std;
 #include <map.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "macrodef.h"
 #include "misc.h"
--- a/AMSS_NCKU_source/cgh.h
+++ b/AMSS_NCKU_source/cgh.h
@@ -2,11 +2,7 @@
 #ifndef CGH_H
 #define CGH_H
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "MyList.h"
 #include "MPatch.h"
 #include "macrodef.h"
--- a/AMSS_NCKU_source/checkpoint.h
+++ b/AMSS_NCKU_source/checkpoint.h
@@ -19,11 +19,7 @@ using namespace std;
 #include <time.h>
 #include <stdlib.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "var.h"
 #include "MyList.h"
--- a/AMSS_NCKU_source/diff_new.f90
+++ b/AMSS_NCKU_source/diff_new.f90
@@ -69,7 +69,6 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -152,7 +151,6 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -220,7 +218,6 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -285,7 +282,6 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -375,7 +371,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -474,7 +469,6 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -537,7 +531,6 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -601,7 +594,6 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -665,7 +657,6 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -728,7 +719,6 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -790,7 +780,6 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -877,7 +866,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1009,7 +997,6 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1164,7 +1151,6 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1241,7 +1227,6 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1312,7 +1297,6 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1417,7 +1401,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1593,7 +1576,6 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1661,7 +1643,6 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1731,7 +1712,6 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1801,7 +1781,6 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1872,7 +1851,6 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1941,7 +1919,6 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2034,7 +2011,6 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2151,7 +2127,6 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2237,7 +2212,6 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2314,7 +2288,6 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2433,7 +2406,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2621,7 +2593,6 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2694,7 +2665,6 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2770,7 +2740,6 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2846,7 +2815,6 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -2927,7 +2895,6 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3006,7 +2973,6 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3114,7 +3080,6 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3251,7 +3216,6 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3347,7 +3311,6 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3432,7 +3395,6 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3568,7 +3530,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3841,7 +3802,6 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -3923,7 +3883,6 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -4008,7 +3967,6 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -4093,7 +4051,6 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -4196,7 +4153,6 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -4297,7 +4253,6 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
--- a/AMSS_NCKU_source/diff_new_sh.f90
+++ b/AMSS_NCKU_source/diff_new_sh.f90
@@ -81,7 +81,6 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -180,7 +179,6 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -264,7 +262,6 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -345,7 +342,6 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -447,7 +443,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -558,7 +553,6 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -633,7 +627,6 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -709,7 +702,6 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -785,7 +777,6 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -860,7 +851,6 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -934,7 +924,6 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1030,7 +1019,6 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1146,7 +1134,6 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1240,7 +1227,6 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1328,7 +1314,6 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1445,7 +1430,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1596,7 +1580,6 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1676,7 +1659,6 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1758,7 +1740,6 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1840,7 +1821,6 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1923,7 +1903,6 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2004,7 +1983,6 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2109,7 +2087,6 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2242,7 +2219,6 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2345,7 +2321,6 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2439,7 +2414,6 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2570,7 +2544,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2770,7 +2743,6 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2855,7 +2827,6 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -2943,7 +2914,6 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3031,7 +3001,6 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3124,7 +3093,6 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3215,7 +3183,6 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3335,7 +3302,6 @@
  fy = ZEO
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3488,7 +3454,6 @@
  fx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3601,7 +3566,6 @@
  fy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3703,7 +3667,6 @@
  fz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -3851,7 +3814,6 @@
  fxz = ZEO
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4136,7 +4098,6 @@
  fxx = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4230,7 +4191,6 @@
  fyy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4327,7 +4287,6 @@
  fzz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4424,7 +4383,6 @@
  fxy = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4539,7 +4497,6 @@
  fxz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4652,7 +4609,6 @@
  fyz = ZEO
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4723,7 +4679,6 @@ subroutine fderivs_shc(ex,f,fx,fy,fz,crho,sigma,R,SYM1,SYM2,SYM3,Symmetry,Lev,ss
 #if 0  
  integer :: i,j,k
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -4774,7 +4729,6 @@ subroutine fdderivs_shc(ex,f,fxx,fxy,fxz,fyy,fyz,fzz,crho,sigma,R,SYM1,SYM2,SYM3
 #if 0  
  integer :: i,j,k
  !$omp parallel do collapse(2) private(i,j,k)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
--- a/AMSS_NCKU_source/enforce_algebra.f90
+++ b/AMSS_NCKU_source/enforce_algebra.f90
@@ -17,63 +17,62 @@
  real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Axx,Axy,Axz
  real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz
-!~~~~~~~> Local variable:
+!~~~~~~~> Local variable:
-
+
-  integer :: i,j,k
+  integer :: i,j,k
-  real*8 :: lgxx,lgyy,lgzz,ldetg
+  real*8 :: lgxx,lgyy,lgzz,ldetg
-  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
+  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
-  real*8 :: ltrA,lscale
+  real*8 :: ltrA,lscale
-  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
+  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
-
+
-!~~~~~~>
+!~~~~~~>
-
+
-  !$omp parallel do collapse(2) private(i,j,k,lgxx,lgyy,lgzz,ldetg,lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz,ltrA,lscale)
+  do k=1,ex(3)
-  do k=1,ex(3)
+  do j=1,ex(2)
-  do j=1,ex(2)
+  do i=1,ex(1)
-  do i=1,ex(1)
+
-
+    lgxx = dxx(i,j,k) + ONE
-    lgxx = dxx(i,j,k) + ONE
+    lgyy = dyy(i,j,k) + ONE
-    lgyy = dyy(i,j,k) + ONE
+    lgzz = dzz(i,j,k) + ONE
-    lgzz = dzz(i,j,k) + ONE
+
-
+    ldetg =  lgxx * lgyy * lgzz &
-    ldetg =  lgxx * lgyy * lgzz &
+           + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) &
-           + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) &
+           + gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
-           + gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
+           - gxz(i,j,k) * lgyy * gxz(i,j,k) &
-           - gxz(i,j,k) * lgyy * gxz(i,j,k) &
+           - gxy(i,j,k) * gxy(i,j,k) * lgzz &
-           - gxy(i,j,k) * gxy(i,j,k) * lgzz &
+           - lgxx * gyz(i,j,k) * gyz(i,j,k)
-           - lgxx * gyz(i,j,k) * gyz(i,j,k)
+
-
+    lgupxx =   ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg
-    lgupxx =   ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg
+    lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg
-    lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg
+    lgupxz =   ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg
-    lgupxz =   ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg
+    lgupyy =   ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg
-    lgupyy =   ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg
+    lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg
-    lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg
+    lgupzz =   ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg
-    lgupzz =   ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg
+
-
+    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
-    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
+                 + lgupzz * Azz(i,j,k) &
-                 + lgupzz * Azz(i,j,k) &
+         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
-         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
+                 + lgupyz * Ayz(i,j,k))
-                 + lgupyz * Ayz(i,j,k))
+
-
+    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
-    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
+    Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA
-    Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA
+    Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA
-    Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA
+    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
-    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
+    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA
-    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA
+    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
-    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
+
-
+    lscale = ONE / ( ldetg ** F1o3 )
-    lscale = ONE / ( ldetg ** F1o3 )
+
-
+    dxx(i,j,k) = lgxx * lscale - ONE
-    dxx(i,j,k) = lgxx * lscale - ONE
+    gxy(i,j,k) = gxy(i,j,k) * lscale
-    gxy(i,j,k) = gxy(i,j,k) * lscale
+    gxz(i,j,k) = gxz(i,j,k) * lscale
-    gxz(i,j,k) = gxz(i,j,k) * lscale
+    dyy(i,j,k) = lgyy * lscale - ONE
-    dyy(i,j,k) = lgyy * lscale - ONE
+    gyz(i,j,k) = gyz(i,j,k) * lscale
-    gyz(i,j,k) = gyz(i,j,k) * lscale
+    dzz(i,j,k) = lgzz * lscale - ONE
-    dzz(i,j,k) = lgzz * lscale - ONE
+
-
+  enddo
-  enddo
+  enddo
-  enddo
+  enddo
  enddo
  return
@@ -94,73 +93,72 @@
  real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Axx,Axy,Axz
  real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz
-!~~~~~~~> Local variable:
+!~~~~~~~> Local variable:
-
+
-  integer :: i,j,k
+  integer :: i,j,k
-  real*8 :: lgxx,lgyy,lgzz,lscale
+  real*8 :: lgxx,lgyy,lgzz,lscale
-  real*8 :: lgxy,lgxz,lgyz
+  real*8 :: lgxy,lgxz,lgyz
-  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
+  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
-  real*8 :: ltrA
+  real*8 :: ltrA
-  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
+  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
-
+
-!~~~~~~>
+!~~~~~~>
-
+
-  !$omp parallel do collapse(2) private(i,j,k,lgxx,lgyy,lgzz,lscale,lgxy,lgxz,lgyz,lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz,ltrA)
+  do k=1,ex(3)
-  do k=1,ex(3)
+  do j=1,ex(2)
-  do j=1,ex(2)
+  do i=1,ex(1)
-  do i=1,ex(1)
+
-
+! for g: normalize determinant first
-! for g: normalize determinant first
+    lgxx = dxx(i,j,k) + ONE
-    lgxx = dxx(i,j,k) + ONE
+    lgyy = dyy(i,j,k) + ONE
-    lgyy = dyy(i,j,k) + ONE
+    lgzz = dzz(i,j,k) + ONE
-    lgzz = dzz(i,j,k) + ONE
+    lgxy = gxy(i,j,k)
-    lgxy = gxy(i,j,k)
+    lgxz = gxz(i,j,k)
-    lgxz = gxz(i,j,k)
+    lgyz = gyz(i,j,k)
-    lgyz = gyz(i,j,k)
+
-
+    lscale =  lgxx * lgyy * lgzz + lgxy * lgyz * lgxz &
-    lscale =  lgxx * lgyy * lgzz + lgxy * lgyz * lgxz &
+            + lgxz * lgxy * lgyz - lgxz * lgyy * lgxz &
-            + lgxz * lgxy * lgyz - lgxz * lgyy * lgxz &
+            - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz
-            - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz
+
-
+    lscale = ONE / ( lscale ** F1o3 )
-    lscale = ONE / ( lscale ** F1o3 )
+
-
+    lgxx = lgxx * lscale
-    lgxx = lgxx * lscale
+    lgxy = lgxy * lscale
-    lgxy = lgxy * lscale
+    lgxz = lgxz * lscale
-    lgxz = lgxz * lscale
+    lgyy = lgyy * lscale
-    lgyy = lgyy * lscale
+    lgyz = lgyz * lscale
-    lgyz = lgyz * lscale
+    lgzz = lgzz * lscale
-    lgzz = lgzz * lscale
+
-
+    dxx(i,j,k) = lgxx - ONE
-    dxx(i,j,k) = lgxx - ONE
+    gxy(i,j,k) = lgxy
-    gxy(i,j,k) = lgxy
+    gxz(i,j,k) = lgxz
-    gxz(i,j,k) = lgxz
+    dyy(i,j,k) = lgyy - ONE
-    dyy(i,j,k) = lgyy - ONE
+    gyz(i,j,k) = lgyz
-    gyz(i,j,k) = lgyz
+    dzz(i,j,k) = lgzz - ONE
-    dzz(i,j,k) = lgzz - ONE
+
-
+! for A: trace-free using normalized metric (det=1, no division needed)
-! for A: trace-free using normalized metric (det=1, no division needed)
+    lgupxx =   ( lgyy * lgzz - lgyz * lgyz )
-    lgupxx =   ( lgyy * lgzz - lgyz * lgyz )
+    lgupxy = - ( lgxy * lgzz - lgyz * lgxz )
-    lgupxy = - ( lgxy * lgzz - lgyz * lgxz )
+    lgupxz =   ( lgxy * lgyz - lgyy * lgxz )
-    lgupxz =   ( lgxy * lgyz - lgyy * lgxz )
+    lgupyy =   ( lgxx * lgzz - lgxz * lgxz )
-    lgupyy =   ( lgxx * lgzz - lgxz * lgxz )
+    lgupyz = - ( lgxx * lgyz - lgxy * lgxz )
-    lgupyz = - ( lgxx * lgyz - lgxy * lgxz )
+    lgupzz =   ( lgxx * lgyy - lgxy * lgxy )
-    lgupzz =   ( lgxx * lgyy - lgxy * lgxy )
+
-
+    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
-    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
+                 + lgupzz * Azz(i,j,k) &
-                 + lgupzz * Azz(i,j,k) &
+         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
-         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
+                 + lgupyz * Ayz(i,j,k))
-                 + lgupyz * Ayz(i,j,k))
+
-
+    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
-    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
+    Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
-    Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
+    Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
-    Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
+    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
-    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
+    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
-    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
+    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
-    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
+
-
+  enddo
-  enddo
+  enddo
-  enddo
+  enddo
  enddo
  return
--- a/AMSS_NCKU_source/expansion.C
+++ b/AMSS_NCKU_source/expansion.C
@@ -6,11 +6,7 @@
 #include <stdio.h>
 #include <assert.h>
 #include <math.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "util_Table.h"
 #include "cctk.h"
--- a/AMSS_NCKU_source/find_horizons.C
+++ b/AMSS_NCKU_source/find_horizons.C
@@ -6,11 +6,7 @@
 #include <stdio.h>
 #include <assert.h>
 #include <math.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "cctk.h"
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -324,7 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
-  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
+  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
   enddo
@@ -349,7 +349,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
  integer::i
-  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
+  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
      funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1)
@@ -377,7 +377,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
  integer::i
-  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
+  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
      funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1)
@@ -883,16 +883,20 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
-  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
+!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
-   do i=0,ord-1
+  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
-      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
+!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
-   enddo
+   do i=0,ord-1
-   do i=0,ord-1
+      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
-      funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
+   enddo
-   enddo
+!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
-   do i=0,ord-1
+   do i=0,ord-1
-      funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
+      funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
-   enddo
+   enddo
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
   do i=0,ord-1
      funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
   enddo
 end subroutine symmetry_bd
@@ -908,7 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
  integer::i
-  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
+  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
      funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-i,1:extc(2),1:extc(3))*SoA(1)
@@ -936,7 +940,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
  integer::i
-  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
+  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
      funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-i,1:extc(2),1:extc(3))*SoA(1)
@@ -1107,162 +1111,355 @@ end subroutine d2dump
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-! common code for cell and vertex
+! common code for cell and vertex
-!------------------------------------------------------------------------------
+!------------------------------------------------------------------------------
-! Lagrangian polynomial interpolation
+! Lagrangian polynomial interpolation
-!------------------------------------------------------------------------------
+!------------------------------------------------------------------------------
-
+#ifndef POLINT6_USE_BARYCENTRIC
-  subroutine polint(xa, ya, x, y, dy, ordn)
+#define POLINT6_USE_BARYCENTRIC 1
-  implicit none
+#endif
-
+
-  integer, intent(in) :: ordn
+!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
-  real*8, dimension(ordn), intent(in) :: xa, ya
+  subroutine polint6_neville(xa, ya, x, y, dy)
-  real*8, intent(in) :: x
+  implicit none
-  real*8, intent(out) :: y, dy
+
-
+  real*8, dimension(6), intent(in) :: xa, ya
-  integer :: i, m, ns, n_m
+  real*8, intent(in) :: x
-  real*8, dimension(ordn) :: c, d, ho
+  real*8, intent(out) :: y, dy
-  real*8 :: dif, dift, hp, h, den_val
+
-
+  integer :: i, m, ns, n_m
-  c = ya
+  real*8, dimension(6) :: c, d, ho
-  d = ya
+  real*8 :: dif, dift, hp, h, den_val
-  ho = xa - x
+
-
+  c = ya
-  ns = 1
+  d = ya
-  dif = abs(x - xa(1))
+  ho = xa - x
-
+
-  do i = 2, ordn
+  ns = 1
-    dift = abs(x - xa(i))
+  dif = abs(x - xa(1))
-    if (dift < dif) then
+
-      ns = i
+  do i = 2, 6
-      dif = dift
+    dift = abs(x - xa(i))
-    end if
+    if (dift < dif) then
-  end do
+      ns = i
-
+      dif = dift
-  y = ya(ns)
+    end if
-  ns = ns - 1
+  end do
-
+
-  do m = 1, ordn - 1
+  y = ya(ns)
-    n_m = ordn - m
+  ns = ns - 1
-    do i = 1, n_m
+
-      hp = ho(i)
+  do m = 1, 5
-      h  = ho(i+m)
+    n_m = 6 - m
-      den_val = hp - h
+    do i = 1, n_m
-
+      hp = ho(i)
-      if (den_val == 0.0d0) then
+      h  = ho(i+m)
-        write(*,*) 'failure in polint for point',x
+      den_val = hp - h
-        write(*,*) 'with input points: ',xa
+
-        stop
+      if (den_val == 0.0d0) then
-      end if
+        write(*,*) 'failure in polint for point',x
-
+        write(*,*) 'with input points: ',xa
-      den_val = (c(i+1) - d(i)) / den_val
+        stop
-
+      end if
-      d(i) = h * den_val
+
-      c(i) = hp * den_val
+      den_val = (c(i+1) - d(i)) / den_val
-    end do
+
-
+      d(i) = h * den_val
-    if (2 * ns < n_m) then
+      c(i) = hp * den_val
-      dy = c(ns + 1)
+    end do
-    else
+
-      dy = d(ns)
+    if (2 * ns < n_m) then
-      ns = ns - 1
+      dy = c(ns + 1)
-    end if
+    else
-    y = y + dy
+      dy = d(ns)
-  end do
+      ns = ns - 1
-
+    end if
-  return
+    y = y + dy
-  end subroutine polint
+  end do
-!------------------------------------------------------------------------------
+
-!
+  return
-! interpolation in 2 dimensions, follow yx order
+  end subroutine polint6_neville
-!
+
-!------------------------------------------------------------------------------
+!DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric
-  subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
+  subroutine polint6_barycentric(xa, ya, x, y, dy)
-  implicit none
+  implicit none
-
+
-  integer,intent(in) :: ordn
+  real*8, dimension(6), intent(in) :: xa, ya
-  real*8, dimension(1:ordn), intent(in) :: x1a,x2a
+  real*8, intent(in) :: x
-  real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
+  real*8, intent(out) :: y, dy
-  real*8, intent(in) :: x1,x2
+
-  real*8, intent(out) :: y,dy
+  integer :: i, j
-
+  logical :: is_uniform
-#ifdef POLINT_LEGACY_ORDER
+  real*8, dimension(6) :: lambda
-  integer  :: i,m
+  real*8 :: dx, den_i, term, num, den, step, tol
-  real*8, dimension(ordn) :: ymtmp
+  real*8, parameter :: c_uniform(6) = (/ -1.d0, 5.d0, -10.d0, 10.d0, -5.d0, 1.d0 /)
-  real*8, dimension(ordn) :: yntmp
+
-
+  do i = 1, 6
-  m=size(x1a)
+    if (x == xa(i)) then
-  do i=1,m
+      y = ya(i)
-    yntmp=ya(i,:)
+      dy = 0.d0
-    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
+      return
-  end do
+    end if
-  call polint(x1a,ymtmp,x1,y,dy,ordn)
+  end do
-#else
+
-  integer  :: j
+  step = xa(2) - xa(1)
-  real*8, dimension(ordn) :: ymtmp
+  is_uniform = (step /= 0.d0)
-  real*8 :: dy_temp
+  if (is_uniform) then
-
+    tol = 64.d0 * epsilon(1.d0) * max(1.d0, abs(step))
-  do j=1,ordn
+    do i = 3, 6
-    call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
+      if (abs((xa(i) - xa(i-1)) - step) > tol) then
-  end do
+        is_uniform = .false.
-  call polint(x2a, ymtmp, x2, y, dy, ordn)
+        exit
-#endif
+      end if
-
+    end do
-  return
+  end if
-  end subroutine polin2
+
-!------------------------------------------------------------------------------
+  if (is_uniform) then
-!
+    num = 0.d0
-! interpolation in 3 dimensions, follow zyx order
+    den = 0.d0
-!
+    do i = 1, 6
-!------------------------------------------------------------------------------
+      term = c_uniform(i) / (x - xa(i))
-  subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
+      num = num + term * ya(i)
-  implicit none
+      den = den + term
-
+    end do
-  integer,intent(in) :: ordn
+    y = num / den
-  real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
+    dy = 0.d0
-  real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
+    return
-  real*8, intent(in) :: x1,x2,x3
+  end if
-  real*8, intent(out) :: y,dy
+
-
+  do i = 1, 6
-#ifdef POLINT_LEGACY_ORDER
+    den_i = 1.d0
-  integer  :: i,j,m,n
+    do j = 1, 6
-  real*8, dimension(ordn,ordn) :: yatmp
+      if (j /= i) then
-  real*8, dimension(ordn) :: ymtmp
+        dx = xa(i) - xa(j)
-  real*8, dimension(ordn) :: yntmp
+        if (dx == 0.0d0) then
-  real*8, dimension(ordn) :: yqtmp
+          write(*,*) 'failure in polint for point',x
-
+          write(*,*) 'with input points: ',xa
-  m=size(x1a)
+          stop
-  n=size(x2a)
+        end if
-  do i=1,m
+        den_i = den_i * dx
-   do j=1,n
+      end if
-    yqtmp=ya(i,j,:)
+    end do
-    call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
+    lambda(i) = 1.d0 / den_i
-   end do
+  end do
-    yntmp=yatmp(i,:)
+
-    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
+  num = 0.d0
-  end do
+  den = 0.d0
-  call polint(x1a,ymtmp,x1,y,dy,ordn)
+  do i = 1, 6
-#else
+    term = lambda(i) / (x - xa(i))
-  integer  :: j, k
+    num = num + term * ya(i)
-  real*8, dimension(ordn,ordn) :: yatmp
+    den = den + term
-  real*8, dimension(ordn) :: ymtmp
+  end do
-  real*8 :: dy_temp
+
-
+  y = num / den
-  do k=1,ordn
+  dy = 0.d0
-    do j=1,ordn
+
-      call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
+  return
-    end do
+  end subroutine polint6_barycentric
-  end do
+
-  do k=1,ordn
+!DIR$ ATTRIBUTES FORCEINLINE :: polint
-    call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
+  subroutine polint(xa, ya, x, y, dy, ordn)
-  end do
+  implicit none
-  call polint(x3a, ymtmp, x3, y, dy, ordn)
+
-#endif
+  integer, intent(in) :: ordn
-
+  real*8, dimension(ordn), intent(in) :: xa, ya
-  return
+  real*8, intent(in) :: x
-  end subroutine polin3
+  real*8, intent(out) :: y, dy
  ! One-dimensional polynomial interpolation through the ordn points
  ! (xa(i), ya(i)), evaluated at x.
  !   xa, ya (in)  : abscissas and tabulated values, ordn entries each
  !   x      (in)  : evaluation point
  !   y      (out) : interpolated value
  !   dy     (out) : in the general path below, the last correction added
  !                  to y; in the 6-point path, whatever the called
  !                  routine stores in dy
  !   ordn   (in)  : number of interpolation points
  ! Prints a diagnostic and stops the program when two abscissas coincide.
  integer :: i, m, ns, n_m
  real*8, dimension(ordn) :: c, d, ho
  real*8 :: dif, dift, hp, h, den_val
  ! 6-point stencils are handed to a fixed-size routine; which one is
  ! used is decided at compile time by the preprocessor test below.
  if (ordn == 6) then
 #if POLINT6_USE_BARYCENTRIC
    call polint6_barycentric(xa, ya, x, y, dy)
 #else
    call polint6_neville(xa, ya, x, y, dy)
 #endif
    return
  end if
  ! General ordn: tableau recurrence on the correction arrays c and d
  ! (same statements as the fixed-size polint6_neville routine).
  c = ya
  d = ya
  ! ho(i) is the signed distance from x to node i.
  ho = xa - x
  ! Find ns, the index of the node closest to x.
  ns = 1
  dif = abs(x - xa(1))
  do i = 2, ordn
    dift = abs(x - xa(i))
    if (dift < dif) then
      ns = i
      dif = dift
    end if
  end do
  ! Start from the tabulated value at the closest node, then refine.
  y = ya(ns)
  ns = ns - 1
  ! Column m of the tableau combines entries that are m nodes apart.
  do m = 1, ordn - 1
    n_m = ordn - m
    do i = 1, n_m
      hp = ho(i)
      h  = ho(i+m)
      ! hp - h equals xa(i) - xa(i+m); zero means duplicated abscissas.
      den_val = hp - h
      if (den_val == 0.0d0) then
        write(*,*) 'failure in polint for point',x
        write(*,*) 'with input points: ',xa
        stop
      end if
      den_val = (c(i+1) - d(i)) / den_val
      d(i) = h * den_val
      c(i) = hp * den_val
    end do
    ! Take the next correction from c or from d depending on where ns
    ! sits relative to the remaining n_m entries; ns is stepped down
    ! whenever the d entry is used.
    if (2 * ns < n_m) then
      dy = c(ns + 1)
    else
      dy = d(ns)
      ns = ns - 1
    end if
    y = y + dy
  end do
  return
  end subroutine polint
 !------------------------------------------------------------------------------
 ! Compute Lagrange interpolation basis weights for one target point.
 !------------------------------------------------------------------------------
 !DIR$ ATTRIBUTES FORCEINLINE :: polint_lagrange_weights
  subroutine polint_lagrange_weights(xa, x, w, ordn)
  ! Evaluate the ordn Lagrange basis polynomials of the nodes xa at x:
  !   w(i) = product over j /= i of (x - xa(j)) / (xa(i) - xa(j))
  !   xa   (in)  : interpolation nodes, ordn entries, pairwise distinct
  !   x    (in)  : evaluation point
  !   w    (out) : basis weights, one per node
  !   ordn (in)  : number of nodes
  ! Prints a diagnostic and stops the program when two nodes coincide.
  implicit none
  integer, intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: xa
  real*8, intent(in) :: x
  real*8, dimension(1:ordn), intent(out) :: w
  integer :: i, j
  real*8 :: num, den, dx
  do i = 1, ordn
    ! Numerator and denominator are accumulated separately so that only
    ! one division is performed per weight.
    num = 1.d0
    den = 1.d0
    do j = 1, ordn
      if (j /= i) then
        dx = xa(i) - xa(j)
        ! Coincident nodes would give a zero denominator.
        if (dx == 0.0d0) then
          write(*,*) 'failure in polint for point',x
          write(*,*) 'with input points: ',xa
          stop
        end if
        num = num * (x - xa(j))
        den = den * dx
      end if
    end do
    w(i) = num / den
  end do
  return
  end subroutine polint_lagrange_weights
 !------------------------------------------------------------------------------
 !
 ! interpolation in 2 dimensions, follow yx order
 !
 !------------------------------------------------------------------------------
  subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
  ! Two-dimensional polynomial interpolation of the table ya, given on
  ! the tensor grid x1a by x2a, at the point (x1,x2), built from
  ! repeated one-dimensional polint calls.
  !   x1a, x2a (in)  : grid coordinates along each dimension, ordn entries
  !   ya       (in)  : tabulated values, ya(i,j) at (x1a(i), x2a(j))
  !   x1, x2   (in)  : evaluation point
  !   y        (out) : interpolated value
  !   dy       (out) : dy returned by the final polint call only
  !   ordn     (in)  : number of points per dimension
  implicit none
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a
  real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2
  real*8, intent(out) :: y,dy
 #ifdef POLINT_LEGACY_ORDER
  ! Legacy branch: interpolate every row ya(i,:) along x2 first, then
  ! interpolate the ordn row results along x1.
  integer  :: i,m
  real*8, dimension(ordn) :: ymtmp
  real*8, dimension(ordn) :: yntmp
  m=size(x1a)
  do i=1,m
    ! Copy the strided row into a contiguous work array.
    yntmp=ya(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
  end do
  call polint(x1a,ymtmp,x1,y,dy,ordn)
 #else
  ! Default branch: interpolate every column ya(:,j) along x1 first,
  ! then interpolate the ordn column results along x2. The intermediate
  ! dy values are discarded in dy_temp.
  integer  :: j
  real*8, dimension(ordn) :: ymtmp
  real*8 :: dy_temp
  do j=1,ordn
    call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
  end do
  call polint(x2a, ymtmp, x2, y, dy, ordn)
 #endif
  return
  end subroutine polin2
 !------------------------------------------------------------------------------
 !
 ! interpolation in 3 dimensions, follow zyx order
 !
 !------------------------------------------------------------------------------
  subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
  ! Three-dimensional polynomial interpolation of the table ya, given on
  ! the tensor grid x1a by x2a by x3a, at the point (x1,x2,x3).
  !   x1a, x2a, x3a (in)  : grid coordinates along each dimension
  !   ya            (in)  : tabulated values, ya(i,j,k) at
  !                         (x1a(i), x2a(j), x3a(k))
  !   x1, x2, x3    (in)  : evaluation point
  !   y             (out) : interpolated value
  !   dy            (out) : dy returned by the final polint call only
  !   ordn          (in)  : number of points per dimension
  implicit none
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
  real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2,x3
  real*8, intent(out) :: y,dy
 #ifdef POLINT_LEGACY_ORDER
  ! Legacy branch: nested polint calls, reducing along x3, then x2,
  ! then x1.
  integer  :: i,j,m,n
  real*8, dimension(ordn,ordn) :: yatmp
  real*8, dimension(ordn) :: ymtmp
  real*8, dimension(ordn) :: yntmp
  real*8, dimension(ordn) :: yqtmp
  m=size(x1a)
  n=size(x2a)
  do i=1,m
   do j=1,n
    ! Copy the strided pencil ya(i,j,:) into a contiguous work array.
    yqtmp=ya(i,j,:)
    call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
   end do
    yntmp=yatmp(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
  end do
  call polint(x1a,ymtmp,x1,y,dy,ordn)
 #else
  ! Default branch: the x1 and x2 directions are reduced with Lagrange
  ! weights computed once per call, so polint is called only once, for
  ! the final reduction along x3.
  integer  :: i, j, k
  real*8, dimension(ordn) :: w1, w2
  real*8, dimension(ordn) :: ymtmp
  real*8 :: yx_sum, x_sum
  call polint_lagrange_weights(x1a, x1, w1, ordn)
  call polint_lagrange_weights(x2a, x2, w2, ordn)
  do k = 1, ordn
    ! ymtmp(k) = sum over j of w2(j) * ( sum over i of w1(i)*ya(i,j,k) )
    yx_sum = 0.d0
    do j = 1, ordn
      x_sum = 0.d0
      ! Innermost loop runs over the first index of ya.
      do i = 1, ordn
        x_sum = x_sum + w1(i) * ya(i,j,k)
      end do
      yx_sum = yx_sum + w2(j) * x_sum
    end do
    ymtmp(k) = yx_sum
  end do
  call polint(x3a, ymtmp, x3, y, dy, ordn)
 #endif
  return
  end subroutine polin3
 !--------------------------------------------------------------------------------------
-! calculate L2norm
+! calculate L2norm  
  subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
                          f,f_out,gw)
@@ -1279,9 +1476,9 @@ end subroutine d2dump
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k,n_elements
+  integer::i,j,k,n_elements
-  real*8, dimension(:), allocatable :: f_flat
+  real*8, dimension(:), allocatable :: f_flat
-  real*8, external :: DDOT
+  real*8, external :: DDOT
  dX = X(2) - X(1)
  dY = Y(2) - Y(1)
@@ -1305,20 +1502,91 @@ if(dabs(X(1)-xmin) < dX) imin = 1
 if(dabs(Y(1)-ymin) < dY) jmin = 1
 if(dabs(Z(1)-zmin) < dZ) kmin = 1
-! Optimized with oneMKL BLAS DDOT for dot product
+  n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
-n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
+  allocate(f_flat(n_elements))
-allocate(f_flat(n_elements))
+  f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
-f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
+  f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
-f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
+  deallocate(f_flat)
 deallocate(f_flat)
 f_out = f_out*dX*dY*dZ
  return
  end subroutine l2normhelper
-!--------------------------------------------------------------------------------------
+!--------------------------------------------------------------------------------------
-! calculate L2norm especially for shell Blocks
+  subroutine l2normhelper7(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
                           f1,f2,f3,f4,f5,f6,f7,f_out,gw)
 ! Sum-of-squares helper for seven grid functions in a single sweep.
 ! For each field fn the routine returns
 !     f_out(n) = dX*dY*dZ * sum( fn(i,j,k)**2 )
 ! over the index box [imin:imax] x [jmin:jmax] x [kmin:kmax].
 ! No square root is taken here and nothing is reduced across blocks.
 !
 !   ex                  : array extents in the three directions
 !   X, Y, Z             : grid coordinates; only the first two entries of
 !                         each are used for the spacing, so uniform spacing
 !                         and ex(:) >= 2 are assumed
 !   xmin..zmax          : bounds compared against the first/last coordinate
 !                         of this block to decide whether an edge is kept
 !   f1..f7              : the seven fields
 !   f_out               : the seven scaled sums
 !   gw                  : number of points dropped on each side by default
  implicit none
 !~~~~~~> Input parameters:
  integer,intent(in ):: ex(1:3)
  real*8, intent(in ):: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3)),xmin,ymin,zmin,xmax,ymax,zmax
  integer,intent(in)::gw
  real*8, dimension(ex(1),ex(2),ex(3)),intent(in) :: f1,f2,f3,f4,f5,f6,f7
  real*8, intent(out) :: f_out(7)
 !~~~~~~> Other variables:
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
  integer::i,j,k
  real*8 :: s1,s2,s3,s4,s5,s6,s7
 ! grid spacing taken from the first two points in each direction
  dX = X(2) - X(1)
  dY = Y(2) - Y(1)
  dZ = Z(2) - Z(1)
 ! default index box: skip gw points on every side
   imin = gw+1
   jmin = gw+1
   kmin = gw+1
   imax = ex(1) - gw
   jmax = ex(2) - gw
   kmax = ex(3) - gw
 ! where the first/last coordinate lies within one spacing of the supplied
 ! bound, extend the box to the array edge on that side
 if(dabs(X(ex(1))-xmax) < dX) imax = ex(1)
 if(dabs(Y(ex(2))-ymax) < dY) jmax = ex(2)
 if(dabs(Z(ex(3))-zmax) < dZ) kmax = ex(3)
 if(dabs(X(1)-xmin) < dX) imin = 1
 if(dabs(Y(1)-ymin) < dY) jmin = 1
 if(dabs(Z(1)-zmin) < dZ) kmin = 1
 ! one scalar accumulator per field
  s1 = 0.d0
  s2 = 0.d0
  s3 = 0.d0
  s4 = 0.d0
  s5 = 0.d0
  s6 = 0.d0
  s7 = 0.d0
 ! i is the innermost loop (first array index); the directive below must stay
 ! immediately above that loop
  do k=kmin,kmax
    do j=jmin,jmax
 !DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7)
      do i=imin,imax
        s1 = s1 + f1(i,j,k)*f1(i,j,k)
        s2 = s2 + f2(i,j,k)*f2(i,j,k)
        s3 = s3 + f3(i,j,k)*f3(i,j,k)
        s4 = s4 + f4(i,j,k)*f4(i,j,k)
        s5 = s5 + f5(i,j,k)*f5(i,j,k)
        s6 = s6 + f6(i,j,k)*f6(i,j,k)
        s7 = s7 + f7(i,j,k)*f7(i,j,k)
      enddo
    enddo
  enddo
 ! scale each sum by the cell volume
  f_out(1) = s1*dX*dY*dZ
  f_out(2) = s2*dX*dY*dZ
  f_out(3) = s3*dX*dY*dZ
  f_out(4) = s4*dX*dY*dZ
  f_out(5) = s5*dX*dY*dZ
  f_out(6) = s6*dX*dY*dZ
  f_out(7) = s7*dX*dY*dZ
  return
  end subroutine l2normhelper7
 !--------------------------------------------------------------------------------------
 ! calculate L2norm especially for shell Blocks
  subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
                          f,f_out,gw,ogw,Symmetry)
@@ -1335,9 +1603,9 @@ f_out = f_out*dX*dY*dZ
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k,n_elements
+  integer::i,j,k,n_elements
-  real*8, dimension(:), allocatable :: f_flat
+  real*8, dimension(:), allocatable :: f_flat
-  real*8, external :: DDOT
+  real*8, external :: DDOT
  real*8 :: PIo4
@@ -1400,12 +1668,11 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
-! Optimized with oneMKL BLAS DDOT for dot product
+  n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
-n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
+  allocate(f_flat(n_elements))
-allocate(f_flat(n_elements))
+  f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
-f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
+  f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
-f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
+  deallocate(f_flat)
 deallocate(f_flat)
 f_out = f_out*dX*dY*dZ
@@ -1432,9 +1699,9 @@ f_out = f_out*dX*dY*dZ
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k
+  integer::i,j,k
-  real*8, dimension(:), allocatable :: f_flat
+  real*8, dimension(:), allocatable :: f_flat
-  real*8, external :: DDOT
+  real*8, external :: DDOT
  real*8 :: PIo4
@@ -1497,12 +1764,11 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
-! Optimized with oneMKL BLAS DDOT for dot product
+Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
-Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
+  allocate(f_flat(Nout))
-allocate(f_flat(Nout))
+  f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
-f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
+  f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
-f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
+  deallocate(f_flat)
 deallocate(f_flat)
  return
@@ -1603,9 +1869,12 @@ deallocate(f_flat)
 !       ^
 ! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3
-  real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
+  real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
-
+  integer :: i,j,k
-  fout = C1*f1+C2*f2+C3*f3
+
  do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1))
    fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
  end do
  return
@@ -1699,8 +1968,8 @@ deallocate(f_flat)
  real*8, dimension(ORDN,ORDN,ORDN) :: ya
  real*8, dimension(ORDN,ORDN) :: tmp2
  real*8, dimension(ORDN) :: tmp1
-  real*8, dimension(3) :: SoAh
+  real*8, dimension(3) :: SoAh
-  real*8, external :: DDOT
+  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  cxB = inds+1  
@@ -1736,21 +2005,17 @@ deallocate(f_flat)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
  endif 
  ! Optimized with BLAS operations for better performance
  ! First dimension: z-direction weighted sum
  tmp2=0
  do m=1,ORDN
    tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
  enddo
  ! Second dimension: y-direction weighted sum
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
  enddo
-  ! Third dimension: x-direction weighted sum using BLAS DDOT
+  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
  return
@@ -1779,8 +2044,8 @@ deallocate(f_flat)
  integer,dimension(2) :: cxB,cxT
  real*8, dimension(ORDN,ORDN) :: ya
  real*8, dimension(ORDN) :: tmp1
-  real*8, dimension(2) :: SoAh
+  real*8, dimension(2) :: SoAh
-  real*8, external :: DDOT
+  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  cxB = inds(1:2)+1  
@@ -1810,14 +2075,12 @@ deallocate(f_flat)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
  endif 
  ! Optimized with BLAS operations
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
  enddo
-  ! Use BLAS DDOT for final weighted sum
+  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
  return
@@ -1843,12 +2106,12 @@ deallocate(f_flat)
 !~~~~~~> Other parameters:
  real*8, dimension(-ORDN+1:ex(1)+ORDN,-ORDN+1:ex(2)+ORDN,ex(3)) :: fh
-  integer :: m
+  integer :: m
-  integer :: cxB,cxT
+  integer :: cxB,cxT
-  real*8, dimension(ORDN) :: ya
+  real*8, dimension(ORDN) :: ya
-  real*8 :: SoAh
+  real*8 :: SoAh
-  integer,dimension(3) :: inds
+  integer,dimension(3) :: inds
-  real*8, external :: DDOT
+  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  inds = indsi + 1
@@ -1909,8 +2172,7 @@ deallocate(f_flat)
          write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
  endif
-  ! Optimized with BLAS DDOT for weighted sum
+  f_int = DDOT(ORDN, coef, 1, ya, 1)
  f_int = DDOT(ORDN, coef, 1, ya, 1)
  return
@@ -2142,38 +2404,32 @@ deallocate(f_flat)
  end function fWigner_d_function
 !----------------------------------
 ! Optimized factorial function using lookup table for small N
 ! and log-gamma for large N to avoid overflow
  function ffact(N) result(gont)
  implicit none
  integer,intent(in) :: N
-  real*8 :: gont
+  real*8 :: gont
-  integer :: i
+
-
+  integer :: i
-  ! Lookup table for factorials 0! to 20! (precomputed)
+  real*8, parameter, dimension(0:20) :: fact_table = [ &
-  real*8, parameter, dimension(0:20) :: fact_table = [ &
+    1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
-    1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
+    362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
-    362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
+    87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
-    87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
+    355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
-    355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
+    2432902008176640000.d0 ]
    2432902008176640000.d0 ]
 ! sanity check
-  if(N < 0)then
+  if(N < 0)then
-     write(*,*) "ffact: error input for factorial"
+     write(*,*) "ffact: error input for factorial"
-     gont = 1.d0
+     gont = 1.d0
-     return
+     return
-  endif
+  endif
-
+
-  ! Use lookup table for small N (fast path)
+  if(N <= 20)then
-  if(N <= 20)then
+     gont = fact_table(N)
-     gont = fact_table(N)
+  else
-  else
+     gont = exp(log_gamma(dble(N+1)))
-     ! Use log-gamma function for large N: N! = exp(log_gamma(N+1))
+  endif
     ! This avoids overflow and is computed efficiently
     gont = exp(log_gamma(dble(N+1)))
  endif
  return
--- a/AMSS_NCKU_source/fmisc.h
+++ b/AMSS_NCKU_source/fmisc.h
@@ -12,9 +12,10 @@
 #define f_global_interpind global_interpind
 #define f_global_interpind2d global_interpind2d
 #define f_global_interpind1d global_interpind1d
-#define f_l2normhelper l2normhelper
+#define f_l2normhelper l2normhelper
-#define f_l2normhelper_sh l2normhelper_sh
+#define f_l2normhelper7 l2normhelper7
-#define f_l2normhelper_sh_rms l2normhelper_sh_rms
+#define f_l2normhelper_sh l2normhelper_sh
 #define f_l2normhelper_sh_rms l2normhelper_sh_rms
 #define f_average average
 #define f_average3 average3
 #define f_average2 average2
@@ -41,9 +42,10 @@
 #define f_global_interpind GLOBAL_INTERPIND
 #define f_global_interpind2d GLOBAL_INTERPIND2D
 #define f_global_interpind1d GLOBAL_INTERPIND1D
-#define f_l2normhelper L2NORMHELPER
+#define f_l2normhelper L2NORMHELPER
-#define f_l2normhelper_sh L2NORMHELPER_SH
+#define f_l2normhelper7 L2NORMHELPER7
-#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
+#define f_l2normhelper_sh L2NORMHELPER_SH
 #define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
 #define f_average AVERAGE
 #define f_average3 AVERAGE3
 #define f_average2 AVERAGE2
@@ -70,9 +72,10 @@
 #define f_global_interpind global_interpind_
 #define f_global_interpind2d global_interpind2d_
 #define f_global_interpind1d global_interpind1d_
-#define f_l2normhelper l2normhelper_
+#define f_l2normhelper l2normhelper_
-#define f_l2normhelper_sh l2normhelper_sh_
+#define f_l2normhelper7 l2normhelper7_
-#define f_l2normhelper_sh_rms l2normhelper_sh_rms_
+#define f_l2normhelper_sh l2normhelper_sh_
 #define f_l2normhelper_sh_rms l2normhelper_sh_rms_
 #define f_average average_
 #define f_average3 average3_
 #define f_average2 average2_
@@ -156,21 +159,30 @@ extern "C"
 							  int *, double *, int &, int &);
 }
-extern "C"
+extern "C"
-{
+{
-	void f_l2normhelper(int *, double *, double *, double *,
+	void f_l2normhelper(int *, double *, double *, double *,
-						double &, double &, double &,
+						double &, double &, double &,
-						double &, double &, double &,
+						double &, double &, double &,
-						double *, double &, int &);
+						double *, double &, int &);
-}
+}
-
+
-extern "C"
+extern "C"
-{
+{
-	void f_l2normhelper_sh(int *, double *, double *, double *,
+	void f_l2normhelper7(int *, double *, double *, double *,
-						   double &, double &, double &,
+						 double &, double &, double &,
-						   double &, double &, double &,
+						 double &, double &, double &,
-						   double *, double &, int &, int &, int &);
+						 double *, double *, double *, double *,
-}
+						 double *, double *, double *, double *, int &);
 }
 extern "C"
 {
 	void f_l2normhelper_sh(int *, double *, double *, double *,
 						   double &, double &, double &,
 						   double &, double &, double &,
 						   double *, double &, int &, int &, int &);
 }
 extern "C"
 {
--- a/AMSS_NCKU_source/gaussj.C
+++ b/AMSS_NCKU_source/gaussj.C
@@ -16,66 +16,115 @@ using namespace std;
 #include <string.h>
 #include <math.h>
 #endif
-
+/* Linear equation solution by Gauss-Jordan elimination.
 // Intel oneMKL LAPACK interface
 #include <mkl_lapacke.h>
 /* Linear equation solution using Intel oneMKL LAPACK.
 a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
 containing the right-hand side vectors. On output a is
 replaced by its matrix inverse, and b is replaced by the
-corresponding set of solution vectors.
+corresponding set of solution vectors */
 Mathematical equivalence:
  Solves: A * x = b  =>  x = A^(-1) * b
  Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
  within numerical precision. */
 int gaussj(double *a, double *b, int n)
 {
-  // Allocate pivot array and workspace
+  double swap;
  lapack_int *ipiv = new lapack_int[n];
  lapack_int info;
-  // Make a copy of matrix a for solving (dgesv modifies it to LU form)
+  int *indxc, *indxr, *ipiv;
-  double *a_copy = new double[n * n];
+  indxc = new int[n];
-  for (int i = 0; i < n * n; i++) {
+  indxr = new int[n];
-    a_copy[i] = a[i];
+  ipiv = new int[n];
  int i, icol, irow, j, k, l, ll;
  double big, dum, pivinv, temp;
  for (j = 0; j < n; j++)
    ipiv[j] = 0;
  for (i = 0; i < n; i++)
  {
    big = 0.0;
    for (j = 0; j < n; j++)
      if (ipiv[j] != 1)
        for (k = 0; k < n; k++)
        {
          if (ipiv[k] == 0)
          {
            if (fabs(a[j * n + k]) >= big)
            {
              big = fabs(a[j * n + k]);
              irow = j;
              icol = k;
            }
          }
          else if (ipiv[k] > 1)
          {
            cout << "gaussj: Singular Matrix-1" << endl;
            for (int ii = 0; ii < n; ii++)
            {
              for (int jj = 0; jj < n; jj++)
                cout << a[ii * n + jj] << " ";
              cout << endl;
            }
            return 1; // error return
          }
        }
    ipiv[icol] = ipiv[icol] + 1;
    if (irow != icol)
    {
      for (l = 0; l < n; l++)
      {
        swap = a[irow * n + l];
        a[irow * n + l] = a[icol * n + l];
        a[icol * n + l] = swap;
      }
      swap = b[irow];
      b[irow] = b[icol];
      b[icol] = swap;
    }
    indxr[i] = irow;
    indxc[i] = icol;
    if (a[icol * n + icol] == 0.0)
    {
      cout << "gaussj: Singular Matrix-2" << endl;
      for (int ii = 0; ii < n; ii++)
      {
        for (int jj = 0; jj < n; jj++)
          cout << a[ii * n + jj] << " ";
        cout << endl;
      }
      return 1; // error return
    }
    pivinv = 1.0 / a[icol * n + icol];
    a[icol * n + icol] = 1.0;
    for (l = 0; l < n; l++)
      a[icol * n + l] *= pivinv;
    b[icol] *= pivinv;
    for (ll = 0; ll < n; ll++)
      if (ll != icol)
      {
        dum = a[ll * n + icol];
        a[ll * n + icol] = 0.0;
        for (l = 0; l < n; l++)
          a[ll * n + l] -= a[icol * n + l] * dum;
        b[ll] -= b[icol] * dum;
      }
  }
-  // Step 1: Solve linear system A*x = b using LU decomposition
+  for (l = n - 1; l >= 0; l--)
-  // LAPACKE_dgesv uses column-major by default, but we use row-major
+  {
-  info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1);
+    if (indxr[l] != indxc[l])
-
+      for (k = 0; k < n; k++)
-  if (info != 0) {
+      {
-    cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl;
+        swap = a[k * n + indxr[l]];
-    delete[] ipiv;
+        a[k * n + indxr[l]] = a[k * n + indxc[l]];
-    delete[] a_copy;
+        a[k * n + indxc[l]] = swap;
-    return 1;
+      }
  }
  // Step 2: Compute matrix inverse A^(-1) using LU factorization
  // First do LU factorization of original matrix a
  info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
  if (info != 0) {
    cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
    delete[] ipiv;
    delete[] a_copy;
    return 1;
  }
  // Then compute inverse from LU factorization
  info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
  if (info != 0) {
    cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
    delete[] ipiv;
    delete[] a_copy;
    return 1;
  }
  delete[] indxc;
  delete[] indxr;
  delete[] ipiv;
  delete[] a_copy;
  return 0;
 }
--- a/AMSS_NCKU_source/ilucg.f90
+++ b/AMSS_NCKU_source/ilucg.f90
@@ -512,10 +512,11 @@
      IMPLICIT DOUBLE PRECISION (A-H,O-Z)
      DIMENSION V(N),W(N)
 !     SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT.
 !     Optimized using Intel oneMKL BLAS ddot
 !     Mathematical equivalence: DGVV = sum_{i=1}^{N} V(i)*W(i)
-      DOUBLE PRECISION, EXTERNAL :: DDOT
+      SUM = 0.0D0
-      DGVV = DDOT(N, V, 1, W, 1)
+            DO 10 I = 1,N
            SUM = SUM + V(I)*W(I)
 10          CONTINUE
      DGVV = SUM
      RETURN
      END
--- a/AMSS_NCKU_source/kodiss.f90
+++ b/AMSS_NCKU_source/kodiss.f90
@@ -65,8 +65,7 @@ real*8,intent(in) :: eps
 !                       dx^4
 !  note the sign (-1)^r-1, now r=2
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -160,8 +159,7 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
  call symmetry_bd(3,ex,f,fh,SoA)
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -275,8 +273,7 @@ real*8,intent(in) :: eps
 !                                              dx^8
 !  note the sign (-1)^r-1, now r=4
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -388,8 +385,7 @@ real*8,intent(in) :: eps
 !                                                              dx^10
 !  note the sign (-1)^r-1, now r=5
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
--- a/AMSS_NCKU_source/kodiss_sh.f90
+++ b/AMSS_NCKU_source/kodiss_sh.f90
@@ -80,8 +80,7 @@ real*8,intent(in) :: eps
 !                       dx^4
 !  note the sign (-1)^r-1, now r=2
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -179,8 +178,7 @@ real*8,intent(in) :: eps
 !                       dx^4
 !  note the sign (-1)^r-1, now r=2
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -275,8 +273,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
  call symmetry_stbd(2,ex,f,fh,SoA)
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -372,8 +369,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
  call symmetry_stbd(3,ex,f,fh,SoA)
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -514,8 +510,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
  call symmetry_stbd(3,ex,f,fh,SoA)
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -603,8 +598,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
  call symmetry_stbd(3,ex,f,fh,SoA)
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -700,8 +694,7 @@ real*8,intent(in) :: eps
 !                                              dx^8
 !  note the sign (-1)^r-1, now r=4
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -801,8 +794,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
  call symmetry_stbd(4,ex,f,fh,SoA)
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -911,8 +903,7 @@ real*8,intent(in) :: eps
 !                                                              dx^10
 !  note the sign (-1)^r-1, now r=5
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
@@ -1015,8 +1006,7 @@ integer, parameter :: NO_SYMM=0, EQ_SYMM=1, OCTANT=2
  call symmetry_stbd(5,ex,f,fh,SoA)
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
--- a/AMSS_NCKU_source/lopsidediff.f90
+++ b/AMSS_NCKU_source/lopsidediff.f90
@@ -68,8 +68,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
 ! upper bound set ex-1 only for efficiency, 
 ! the loop body will set ex 0 also
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)-1
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 ! x direction   
@@ -234,8 +233,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
 ! upper bound set ex-1 only for efficiency, 
 ! the loop body will set ex 0 also
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)-1
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 #if 0  
@@ -560,8 +558,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
 ! upper bound set ex-1 only for efficiency, 
 ! the loop body will set ex 0 also
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)-1
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 ! x direction   
@@ -777,8 +774,7 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
 ! upper bound set ex-1 only for efficiency, 
 ! the loop body will set ex 0 also
-  !$omp parallel do collapse(2) private(i,j,k)
+  do k=1,ex(3)-1
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 ! x direction   
--- a/AMSS_NCKU_source/macrodef.h
+++ b/AMSS_NCKU_source/macrodef.h
@@ -2,7 +2,7 @@
 #ifndef MICRODEF_H
 #define MICRODEF_H
-#include "macrodef.fh"
+#include "macrodef.fh"
 // application parameters
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -1,11 +1,25 @@
-
+
-
+
-include makefile.inc
+include makefile.inc
-
+
-.SUFFIXES: .o .f90 .C .for .cu
+## polint(ordn=6) kernel selector:
-
+##   1 (default): barycentric fast path
-.f90.o:
+##   0          : fallback to Neville path
-	$(f90) $(f90appflags) -c $< -o $@
+POLINT6_USE_BARY ?= 1
 POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
 ARCH_OPT = -march=x86-64-v4
 CXXAPPFLAGS = -O3 $(ARCH_OPT) -fp-model fast=2 -fma -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
 f90appflags = -O3 $(ARCH_OPT) -fp-model fast=2 -fma -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 TP_OPTFLAGS = -O3 $(ARCH_OPT) -fp-model fast=2 -fma -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
 .SUFFIXES: .o .f90 .C .for .cu
 .f90.o:
 	$(f90) $(f90appflags) -c $< -o $@
 .C.o:
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
@@ -13,14 +27,14 @@ include makefile.inc
 .for.o:
 	$(f77) -c $< -o $@
-.cu.o:
+.cu.o:
-	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
+	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
-
+
-TwoPunctures.o: TwoPunctures.C
+TwoPunctures.o: TwoPunctures.C
-	${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
-
+
-TwoPunctureABE.o: TwoPunctureABE.C
+TwoPunctureABE.o: TwoPunctureABE.C
-	${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
 # Input files
 C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
@@ -95,14 +109,14 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
 misc.o : zbesh.o
 # projects
-ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
+ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) 
-	$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
+	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
 ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
 	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
-TwoPunctureABE: $(TwoPunctureFILES)
+TwoPunctureABE: $(TwoPunctureFILES)
-	$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
+	$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
 clean:
 	rm *.o ABE ABEGPU TwoPunctureABE make.log -f
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,28 +1,32 @@
 ## GCC version (commented out)
 ## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
-## Intel oneAPI version with oneMKL (Optimized for performance)
+## Intel oneAPI version with oneMKL
 filein  = -I/usr/include/ -I${MKLROOT}/include
-## Using OpenMP-threaded MKL for parallel performance
+## Use sequential oneMKL to avoid introducing extra OpenMP behavior into ABE.
-## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
+LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lifcore -limf -lpthread -lm -ldl
+
 ## Optional Intel oneTBB allocator, kept aligned with main's build environment.
 USE_TBBMALLOC ?= 1
 TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
 ifneq ($(wildcard $(TBBMALLOC_SO)),)
 TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
 else
 TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
 endif
 ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
 ## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization)
 ## -fprofile-instr-use: use collected profile data to guide optimization decisions
 ##   (branch prediction, basic block layout, inlining, loop unrolling)
 PROFDATA     = /home/amss/AMSS-NCKU/pgo_profile/default.profdata
 CXXAPPFLAGS  = -O3 -march=native -fp-model fast=2 -fma -ipo -qopenmp \
               -DMPI_STUB -Dfortran3 -Dnewc -I${MKLROOT}/include
 f90appflags  = -O3 -march=native -fp-model fast=2 -fma -ipo -qopenmp \
               -align array64byte -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx
 CXX          = icpx
 CC           = icx
-CLINKER      = icpx
+CLINKER      = mpiicpx
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
--- a/AMSS_NCKU_source/misc.C
+++ b/AMSS_NCKU_source/misc.C
@@ -14,11 +14,7 @@ using namespace std;
 #include <string.h>
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "misc.h"
 #include "macrodef.h"
--- a/AMSS_NCKU_source/misc.h
+++ b/AMSS_NCKU_source/misc.h
@@ -24,11 +24,7 @@ using namespace std;
 #include <complex.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 namespace misc
 {
--- a/AMSS_NCKU_source/monitor.h
+++ b/AMSS_NCKU_source/monitor.h
@@ -20,11 +20,7 @@ using namespace std;
 #endif
 #include <time.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 class monitor
 {
--- a/AMSS_NCKU_source/mpi_stub.h
+++ b/AMSS_NCKU_source/mpi_stub.h
@@ -1,153 +0,0 @@
 #ifndef MPI_STUB_H
 #define MPI_STUB_H
 /*
  * MPI Stub Header — single-process shim for AMSS-NCKU ABE solver.
  * Provides all MPI types, constants, and functions used in the codebase
  * as no-ops or trivial implementations for nprocs=1, myrank=0.
  *
  * Every function returns 0 (success). Reductions degenerate to a copy of
  * the send buffer into the receive buffer, since one rank is the whole
  * communicator. Point-to-point calls transfer nothing.
  */
 #include <cstring>
 #include <cstdlib>
 #include <cstdio>
 #include <time.h>
 /* ── Types ─────────────────────────────────────────────────────────── */
 typedef int MPI_Comm;
 typedef int MPI_Datatype;
 typedef int MPI_Op;
 typedef int MPI_Request;
 typedef int MPI_Group;
 typedef struct MPI_Status {
     int MPI_SOURCE;
     int MPI_TAG;
     int MPI_ERROR;
 } MPI_Status;
 /* Memory layout of one MPI_DOUBLE_INT element (value + location pair used
  * with MPI_MAXLOC). Declared as a struct so that sizeof() includes the
  * trailing padding the compiler adds to arrays of such pairs. */
 struct mpi_stub_double_int {
     double value;
     int index;
 };
 /* ── Constants ─────────────────────────────────────────────────────── */
 #define MPI_COMM_WORLD  0
 #define MPI_INT              1
 #define MPI_DOUBLE           2
 #define MPI_DOUBLE_PRECISION 2
 #define MPI_DOUBLE_INT       3
 #define MPI_SUM    1
 #define MPI_MAX    2
 #define MPI_MAXLOC 3
 #define MPI_STATUS_IGNORE   ((MPI_Status *)0)
 #define MPI_STATUSES_IGNORE ((MPI_Status *)0)
 #define MPI_MAX_PROCESSOR_NAME 256
 /* ── Helper: sizeof for MPI_Datatype ──────────────────────────────── */
 /* Returns the per-element stride in bytes, or 0 for an unknown datatype.
  * MPI_DOUBLE_INT uses the padded struct size: summing the member sizes
  * would under-copy every element after the first when count > 1. */
 static inline size_t mpi_stub_sizeof(MPI_Datatype type) {
     switch (type) {
         case MPI_INT:        return sizeof(int);
         case MPI_DOUBLE:     return sizeof(double);
         case MPI_DOUBLE_INT: return sizeof(mpi_stub_double_int);
         default:             return 0;
     }
 }
 /* Copies count elements of datatype from sendbuf to recvbuf.
  * Does nothing for non-positive counts, null buffers, or when both
  * pointers are the same (memcpy must not be given overlapping ranges). */
 static inline void mpi_stub_copy(const void *sendbuf, void *recvbuf,
         int count, MPI_Datatype datatype) {
     if (count <= 0 || !sendbuf || !recvbuf || sendbuf == recvbuf) return;
     std::memcpy(recvbuf, sendbuf, (size_t)count * mpi_stub_sizeof(datatype));
 }
 /* ── Init / Finalize ──────────────────────────────────────────────── */
 static inline int MPI_Init(int *, char ***) { return 0; }
 static inline int MPI_Finalize() { return 0; }
 /* ── Communicator queries ─────────────────────────────────────────── */
 /* Always rank 0 of 1. */
 static inline int MPI_Comm_rank(MPI_Comm, int *rank) { *rank = 0; return 0; }
 static inline int MPI_Comm_size(MPI_Comm, int *size) { *size = 1; return 0; }
 /* Splitting hands back the input communicator unchanged. */
 static inline int MPI_Comm_split(MPI_Comm comm, int, int, MPI_Comm *newcomm) {
     *newcomm = comm;
     return 0;
 }
 static inline int MPI_Comm_free(MPI_Comm *) { return 0; }
 /* ── Group operations ─────────────────────────────────────────────── */
 static inline int MPI_Comm_group(MPI_Comm, MPI_Group *group) {
     *group = 0;
     return 0;
 }
 /* Rank translation is the identity mapping. */
 static inline int MPI_Group_translate_ranks(MPI_Group, int n,
         const int *ranks1, MPI_Group, int *ranks2) {
     for (int i = 0; i < n; ++i) ranks2[i] = ranks1[i];
     return 0;
 }
 static inline int MPI_Group_free(MPI_Group *) { return 0; }
 /* ── Collective operations ────────────────────────────────────────── */
 /* With one rank the reduction result equals the local contribution,
  * whatever the operation is. */
 static inline int MPI_Allreduce(const void *sendbuf, void *recvbuf,
         int count, MPI_Datatype datatype, MPI_Op, MPI_Comm) {
     mpi_stub_copy(sendbuf, recvbuf, count, datatype);
     return 0;
 }
 /* Completes immediately; the request is set to 0 and a later MPI_Wait
  * on it is a no-op. */
 static inline int MPI_Iallreduce(const void *sendbuf, void *recvbuf,
         int count, MPI_Datatype datatype, MPI_Op, MPI_Comm,
         MPI_Request *request) {
     mpi_stub_copy(sendbuf, recvbuf, count, datatype);
     *request = 0;
     return 0;
 }
 /* The root already holds the data, so there is nothing to broadcast. */
 static inline int MPI_Bcast(void *, int, MPI_Datatype, int, MPI_Comm) {
     return 0;
 }
 static inline int MPI_Barrier(MPI_Comm) { return 0; }
 /* ── Point-to-point (never reached with nprocs=1) ─────────────────── */
 /* NOTE: these do not move any data; a self-send would be silently lost. */
 static inline int MPI_Send(const void *, int, MPI_Datatype, int, int, MPI_Comm) {
     return 0;
 }
 static inline int MPI_Recv(void *, int, MPI_Datatype, int, int, MPI_Comm, MPI_Status *) {
     return 0;
 }
 static inline int MPI_Isend(const void *, int, MPI_Datatype, int, int, MPI_Comm,
         MPI_Request *req) {
     *req = 0;
     return 0;
 }
 static inline int MPI_Irecv(void *, int, MPI_Datatype, int, int, MPI_Comm,
         MPI_Request *req) {
     *req = 0;
     return 0;
 }
 /* ── Completion ───────────────────────────────────────────────────── */
 static inline int MPI_Wait(MPI_Request *, MPI_Status *) { return 0; }
 static inline int MPI_Waitall(int, MPI_Request *, MPI_Status *) { return 0; }
 /* ── Utility ──────────────────────────────────────────────────────── */
 /* Reports the code on stderr and terminates the process with it. */
 static inline int MPI_Abort(MPI_Comm, int error_code) {
     std::fprintf(stderr, "MPI_Abort called with error code %d\n", error_code);
     std::exit(error_code);
     return 0;
 }
 /* Wall-clock seconds from the monotonic clock; only differences between
  * two calls are meaningful. */
 static inline double MPI_Wtime() {
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
     return (double)ts.tv_sec + (double)ts.tv_nsec * 1.0e-9;
 }
 /* Writes the fixed name "localhost"; name must hold at least
  * MPI_MAX_PROCESSOR_NAME characters as in real MPI. */
 static inline int MPI_Get_processor_name(char *name, int *resultlen) {
     const char *stub_name = "localhost";
     std::strcpy(name, stub_name);
     *resultlen = (int)std::strlen(stub_name);
     return 0;
 }
 #endif /* MPI_STUB_H */
--- a/AMSS_NCKU_source/parameters.h
+++ b/AMSS_NCKU_source/parameters.h
@@ -24,11 +24,7 @@ using namespace std;
 #include <map.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 namespace parameters
 {
--- a/AMSS_NCKU_source/perf.h
+++ b/AMSS_NCKU_source/perf.h
@@ -30,11 +30,7 @@ using namespace std;
 #include <sys/time.h>
 #include <sys/resource.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 /* Real time */
 #define TimerSignal SIGALRM
--- a/AMSS_NCKU_source/rungekutta4_rout.f90
+++ b/AMSS_NCKU_source/rungekutta4_rout.f90
@@ -109,33 +109,23 @@
  if( RK4 == 0 ) then
   !$omp parallel workshare
   f1 = f0 + HLF * dT * f_rhs
   !$omp end parallel workshare
  elseif(RK4 == 1 ) then
   !$omp parallel workshare
   f_rhs = f_rhs + TWO * f1
-   !$omp end parallel workshare
+
   !$omp parallel workshare
   f1 = f0 + HLF * dT * f1
   !$omp end parallel workshare
  elseif(RK4 == 2 ) then
   !$omp parallel workshare
   f_rhs = f_rhs + TWO * f1
-   !$omp end parallel workshare
+
   !$omp parallel workshare
   f1 = f0 +       dT * f1
   !$omp end parallel workshare
  elseif( RK4 == 3 ) then
-
+ 
   !$omp parallel workshare
   f1 = f0 +F1o6 * dT *(f1 + f_rhs)
   !$omp end parallel workshare
  else
@@ -144,7 +134,7 @@
  endif
-  return
+  return   
  end subroutine rungekutta4_rout
 !-----------------------------------------------------------------------------
@@ -225,19 +215,15 @@
  if( RK4 == 0 ) then
   !$omp parallel workshare
   f1 = f0 + dT * f_rhs
   !$omp end parallel workshare
  else
   !$omp parallel workshare
   f1 = f0 + HLF * dT * (f1+f_rhs)
   !$omp end parallel workshare
  endif
-  return
+  return   
  end subroutine icn_rout
 !~~~~~~~~~~~~~~~~~~  
@@ -253,10 +239,8 @@
  real*8, dimension(ex(1),ex(2),ex(3)),intent(in) ::f_rhs
  real*8, dimension(ex(1),ex(2),ex(3)),intent(out) ::f1
  !$omp parallel workshare
   f1 = f0 + dT * f_rhs
  !$omp end parallel workshare
-  return
+  return   
  end subroutine euler_rout
--- a/AMSS_NCKU_source/scalar_class.h
+++ b/AMSS_NCKU_source/scalar_class.h
@@ -19,11 +19,7 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "cgh.h"
 #include "ShellPatch.h"
--- a/AMSS_NCKU_source/scalarwaves.C
+++ b/AMSS_NCKU_source/scalarwaves.C
@@ -18,11 +18,7 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "misc.h"
 #include "microdef.h"
--- a/AMSS_NCKU_source/setup.C
+++ b/AMSS_NCKU_source/setup.C
@@ -3,11 +3,7 @@
 #include <math.h>
 #include <string.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "util_Table.h"
 #include "cctk.h"
--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
@@ -20,11 +20,7 @@ using namespace std;
 #include <math.h>
 #include <map.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "misc.h"
 #include "cgh.h"
@@ -224,9 +220,16 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
    pox[2][n] = rex * nz_g[n];
  }
  double *shellf;
  shellf = new double[n_tot * InList];
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
  int mp, Lp, Nmin, Nmax;
  mp = n_tot / cpusize;
  Lp = n_tot - cpusize * mp;
  if (Lp > myrank)
  {
    Nmin = myrank * mp + myrank;
@@ -238,11 +241,6 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
    Nmax = Nmin + mp - 1;
  }
  double *shellf;
  shellf = new double[n_tot * InList];
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
  //|~~~~~> Integrate the dot product of Dphi with the surface normal.
  double *RP_out, *IP_out;
@@ -365,17 +363,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -567,17 +556,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, Comm_here);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -755,17 +735,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -1013,17 +984,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -1457,17 +1419,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -1901,17 +1854,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH,
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -2096,17 +2040,8 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch2 *GH, var *
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -2291,17 +2226,8 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch *GH, var *R
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -2388,9 +2314,25 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
    pox[2][n] = rex * nz_g[n];
  }
  double *shellf;
  shellf = new double[n_tot * InList];
  // we have assumed there is only one box on this level,
  // so we do not need loop boxes
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
  double Mass_out = 0;
  double ang_outx, ang_outy, ang_outz;
  double p_outx, p_outy, p_outz;
  ang_outx = ang_outy = ang_outz = 0.0;
  p_outx = p_outy = p_outz = 0.0;
  const double f1o8 = 0.125;
  int mp, Lp, Nmin, Nmax;
  mp = n_tot / cpusize;
  Lp = n_tot - cpusize * mp;
  if (Lp > myrank)
  {
    Nmin = myrank * mp + myrank;
@@ -2402,20 +2344,6 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
    Nmax = Nmin + mp - 1;
  }
  double *shellf;
  shellf = new double[n_tot * InList];
  // we have assumed there is only one box on this level,
  // so we do not need loop boxes
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
  double Mass_out = 0;
  double ang_outx, ang_outy, ang_outz;
  double p_outx, p_outy, p_outz;
  ang_outx = ang_outy = ang_outz = 0.0;
  p_outx = p_outy = p_outz = 0.0;
  const double f1o8 = 0.125;
  double Chi, Psi;
  double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
  double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;
@@ -2536,13 +2464,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
    }
  }
-  {
+  MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
+
-    double scalar_in[7];
+  MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
+  MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
+
-  }
+  MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 #ifdef GaussInt
  mass = mass * rex * rex * dphi * factor;
@@ -2805,13 +2735,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
    }
  }
-  {
+  MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
+
-    double scalar_in[7];
+  MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, Comm_here);
+  MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
+  MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
+
-  }
+  MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
  MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
  MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
 #ifdef GaussInt
  mass = mass * rex * rex * dphi * factor;
@@ -3088,13 +3020,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *c
    }
  }
-  {
+  MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
+
-    double scalar_in[7];
+  MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
+  MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
+
-  }
+  MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 #ifdef GaussInt
  mass = mass * rex * rex * dphi * factor;
@@ -3673,17 +3607,8 @@ void surface_integral::surf_Wave(double rex, cgh *GH, ShellPatch *SH,
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
--- a/AMSS_NCKU_source/testNull.C
+++ b/AMSS_NCKU_source/testNull.C
@@ -18,11 +18,7 @@ using namespace std;
 #include <math.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "misc.h"
 #include "macrodef.h"
--- a/AMSS_NCKU_source/testNull2.C
+++ b/AMSS_NCKU_source/testNull2.C
@@ -20,11 +20,7 @@ using namespace std;
 #include <map.h>
 #endif
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "misc.h"
 #include "macrodef.h"
--- a/AMSS_NCKU_source/var.C
+++ b/AMSS_NCKU_source/var.C
@@ -9,11 +9,7 @@
 using namespace std;
 #include <time.h>
 #ifdef MPI_STUB
 #include "mpi_stub.h"
 #else
 #include <mpi.h>
 #endif
 #include "var.h"
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -10,31 +10,6 @@
 import AMSS_NCKU_Input as input_data
 import subprocess
 import time
 import os
 ## OpenMP configuration for threaded Fortran kernels
 ## OMP_NUM_THREADS: set to number of physical cores (not hyperthreads)
 ## OMP_PROC_BIND: bind threads to cores to avoid migration overhead
 ## OMP_STACKSIZE: each thread needs stack space for fh arrays (~3.6MB)
 if "OMP_NUM_THREADS" not in os.environ:
    os.environ["OMP_NUM_THREADS"] = "96"
 os.environ["OMP_STACKSIZE"] = "16M"
 os.environ["OMP_PROC_BIND"] = "close"
 os.environ["OMP_PLACES"] = "cores"
 ## CPU core binding configuration using taskset
 ## taskset ensures all child processes inherit the CPU affinity mask
 ## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
 ## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
 #NUMACTL_CPU_BIND = "taskset -c 0-111"
 #NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
 #NUMACTL_CPU_BIND = "taskset -c 8-15"
 NUMACTL_CPU_BIND = ""
 ## Build parallelism configuration
 ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
 ## Set make -j to utilize available cores for faster builds
 BUILD_JOBS = 16
 ##################################################################
@@ -51,11 +26,11 @@ def makefile_ABE():
    print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) 
    print(                                                        )
-    ## Build command with CPU binding to nohz_full cores
+    ## Build command
    if (input_data.GPU_Calculation == "no"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
+        makefile_command  = "make -j96" + " ABE"
    elif (input_data.GPU_Calculation == "yes"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
+        makefile_command  = "make -j4" + " ABEGPU"
    else:
        print( " CPU/GPU numerical calculation setting is wrong " )
        print(                                                    )
@@ -92,8 +67,8 @@ def makefile_TwoPunctureABE():
    print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
    print(                                                            )
-    ## Build command with CPU binding to nohz_full cores
+    ## Build command
-    makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
+    makefile_command = "make" + " TwoPunctureABE"
    ## Execute the command with subprocess.Popen and stream output
    makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) 
@@ -128,28 +103,28 @@ def run_ABE():
    print(                                                      )
    ## Define the command to run; cast other values to strings as needed
-
+    
    if (input_data.GPU_Calculation == "no"):
-        run_command         = NUMACTL_CPU_BIND + " ./ABE"
+        mpi_command         = "mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
-        run_command_outfile = "ABE_out.log"
+        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        run_command         = NUMACTL_CPU_BIND + " ./ABEGPU"
+        mpi_command         = "mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
-        run_command_outfile = "ABEGPU_out.log"
+        mpi_command_outfile = "ABEGPU_out.log"
-
+ 
-    ## Execute the command and stream output
+    ## Execute the MPI command and stream output
-    run_process = subprocess.Popen(run_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+    mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    ## Write ABE run output to file while printing to stdout
-    with open(run_command_outfile, 'w') as file0:
+    with open(mpi_command_outfile, 'w') as file0:  
        ## Read and print output lines; also write each line to file
-        for line in run_process.stdout:
+        for line in mpi_process.stdout:
            print(line, end='')  # stream output in real time
            file0.write(line)    # write the line to file
            file0.flush()        # flush to ensure each line is written immediately (optional)            
    file0.close()
    ## Wait for the process to finish
-    run_return_code = run_process.wait()
+    mpi_return_code = mpi_process.wait()
    print(                                           )
    print( " The ABE/ABEGPU simulation is finished " ) 
@@ -166,14 +141,13 @@ def run_ABE():
 ## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE
 def run_TwoPunctureABE():
-    tp_time1=time.time()
+
    print(                                                          )
    print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) 
    print(                                                          )
    ## Define the command to run
-    #TwoPuncture_command         = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
+    TwoPuncture_command         = "./TwoPunctureABE"
    TwoPuncture_command         = " ./TwoPunctureABE"
    TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
    ## Execute the command with subprocess.Popen and stream output
@@ -194,9 +168,7 @@ def run_TwoPunctureABE():
    print(                                               )
    print( " The TwoPunctureABE simulation is finished " ) 
    print(                                               )
-    tp_time2=time.time()
+    
    et=tp_time2-tp_time1
    print(f"Used time: {et}")
    return
 ##################################################################
--- a/parallel_plot_helper.py
+++ b/parallel_plot_helper.py
@@ -0,0 +1,12 @@
 import multiprocessing
 def run_plot_task(task):
    """Execute one plotting task and return its result.

    Parameters
    ----------
    task : tuple
        A ``(func, args)`` pair; ``func`` is called as ``func(*args)``.

    Returns
    -------
    Whatever ``func(*args)`` returns.
    """
    func, args = task
    return func(*args)


 def run_plot_tasks_parallel(plot_tasks):
    """Run plotting tasks concurrently in a pool of forked worker processes.

    Parameters
    ----------
    plot_tasks : iterable
        ``(func, args)`` pairs as accepted by ``run_plot_task``.  Every pair
        must be picklable because it is shipped to a worker process.

    Returns
    -------
    None.  The values returned by the individual tasks are discarded.

    Notes
    -----
    The call blocks until every task has finished; an exception raised in a
    worker is re-raised here by ``Pool.map``.  The 'fork' start method is not
    available on every platform; ``get_context`` raises ``ValueError`` where
    it is missing.
    """
    ## Materialise once so that generators are accepted and can be counted.
    tasks = list(plot_tasks)
    if not tasks:
        ## Nothing to plot: do not fork a pool just to tear it down again.
        return

    ## Forked workers inherit the parent's imported modules instead of
    ## re-importing the main script.
    ctx = multiprocessing.get_context('fork')

    ## Never start more workers than there are tasks to hand out.
    n_workers = min(len(tasks), multiprocessing.cpu_count())
    with ctx.Pool(processes=n_workers) as pool:
        pool.map(run_plot_task, tasks)
--- a/pgo_profile/PGO_Profile_Analysis.md
+++ b/pgo_profile/PGO_Profile_Analysis.md
@@ -1,97 +0,0 @@
 # AMSS-NCKU PGO Profile Analysis Report
 ## 1. Profiling Environment
 | Item | Value |
 |------|-------|
 | Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
 | Instrumentation Flag | `-fprofile-instr-generate` |
 | Optimization Level (instrumented) | `-O2 -xHost -fma` |
 | MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
 | Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
 | Merged Profile | `default.profdata` (394 KB) |
 | llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
 ## 2. Reduced Simulation Parameters (for profiling run)
 | Parameter | Production Value | Profiling Value |
 |-----------|-----------------|-----------------|
 | MPI_processes | 64 | 1 |
 | grid_level | 9 | 4 |
 | static_grid_level | 5 | 3 |
 | static_grid_number | 96 | 24 |
 | moving_grid_number | 48 | 16 |
 | largest_box_xyz_max | 320^3 | 160^3 |
 | Final_Evolution_Time | 1000.0 | 10.0 |
 | Evolution_Step_Number | 10,000,000 | 1,000 |
 | Detector_Number | 12 | 2 |
 ## 3. Profile Summary
 | Metric | Value |
 |--------|-------|
 | Total instrumented functions | 1,392 |
 | Functions with non-zero counts | 117 (8.4%) |
 | Functions with zero counts | 1,275 (91.6%) |
 | Maximum function entry count | 386,459,248 |
 | Maximum internal block count | 370,477,680 |
 | Total block count | 4,198,023,118 |
 ## 4. Top 20 Hotspot Functions
 | Rank | Total Count | Max Block Count | Function | Category |
 |------|------------|-----------------|----------|----------|
 | 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
 | 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
 | 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
 | 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
 | 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
 | 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
 | 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
 | 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
 | 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
 | 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
 | 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
 | 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
 | 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
 | 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
 | 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
 | 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
 | 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
 | 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
 | 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
 | 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
 ## 5. Hotspot Category Breakdown
 Top 20 functions account for ~98% of total execution counts:
 | Category | Functions | Combined Count | Share |
 |----------|-----------|---------------|-------|
 | Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
 | BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
 | Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
 | Time integration | rungekutta4_rout_ | ~119M | ~3% |
 | Dissipation | kodis_ | ~92M | ~2% |
 | Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% |
 ## 6. Conclusions
 1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
 2. **Hotspot concentration is high**: Top 5 functions alone account for ~83% of all counts (~3.48 billion of ~4.20 billion), which is ideal for PGO — the compiler has strong branch/layout optimization targets.
 3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
 4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
 5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
 ## 7. PGO Phase 2 Usage
 To apply the profile, use the following flags in `makefile.inc`:
 ```makefile
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
              -align array64byte -fpp -I${MKLROOT}/include
 ```
--- a/pgo_profile/default.profdata
+++ b/pgo_profile/default.profdata
--- a/pgo_profile/default_9725750769337483397_0.profraw
+++ b/pgo_profile/default_9725750769337483397_0.profraw
--- a/plot_GW_strain_amplitude_xiaoqu.py
+++ b/plot_GW_strain_amplitude_xiaoqu.py
@@ -8,11 +8,13 @@
 ##
 #################################################
-import numpy                               ## numpy for array operations
+import numpy                               ## numpy for array operations
-import scipy                               ## scipy for interpolation and signal processing
+import scipy                               ## scipy for interpolation and signal processing
-import math
+import math
-import matplotlib.pyplot    as     plt     ## matplotlib for plotting
+import matplotlib
-import os                                  ## os for system/file operations
+matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 import os                                  ## os for system/file operations
 import AMSS_NCKU_Input as input_data
--- a/plot_binary_data.py
+++ b/plot_binary_data.py
@@ -6,17 +6,22 @@
 ## Author: Xiaoqu
 ## Dates: 2024/10/01 --- 2025/09/14
 ##
-#################################################
+#################################################
-
+
-import numpy
+## Restrict OpenMP to one thread per process so that parallel
-import scipy
+## subprocess plotting does not multiply BLAS thread counts.
-import matplotlib.pyplot    as     plt
+import os
-from   matplotlib.colors    import LogNorm
+os.environ.setdefault("OMP_NUM_THREADS", "1")
-from   mpl_toolkits.mplot3d import Axes3D
+
-## import torch
+import numpy
-import AMSS_NCKU_Input      as input_data
+import scipy
-
+import matplotlib
-import os
+matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt
 from   matplotlib.colors    import LogNorm
 from   mpl_toolkits.mplot3d import Axes3D
 ## import torch
 import AMSS_NCKU_Input      as input_data
 #########################################################################################
@@ -92,9 +97,9 @@ def plot_binary_data( filename, binary_outdir, figure_outdir ):
-####################################################################################
+####################################################################################
-
+
-# Plot a single binary dataset (2D slices and 3D surface)
+# Plot a single binary dataset (2D slices and 3D surface)
 def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
@@ -188,7 +193,15 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
    plt.savefig( os.path.join(figure_surfaceplot_outdir, figure_title + " time = " + str(time) + " surface_plot.pdf") )   # save figure
    plt.close()
-    return
+    return
-
+
-####################################################################################
+####################################################################################
 ## Command-line entry point: lets a parent process launch this module as a
 ## separate interpreter to plot one binary data file.
 if __name__ == '__main__':
    import sys
    cli_args = sys.argv[1:]
    ## Exactly three positional arguments are required.
    if len(cli_args) != 3:
        print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
        sys.exit(1)
    plot_binary_data(*cli_args)
--- a/plot_xiaoqu.py
+++ b/plot_xiaoqu.py
@@ -6,15 +6,20 @@
 ## 2024/10/01 --- 2025/09/14
 ##
 #################################################
-
+
-import numpy                               ## numpy for array operations
+import numpy                               ## numpy for array operations
-import matplotlib.pyplot    as     plt     ## matplotlib for plotting
+import matplotlib
-from   mpl_toolkits.mplot3d import Axes3D  ## needed for 3D plots
+matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
-import glob
+import matplotlib.pyplot    as     plt     ## matplotlib for plotting
-import os                                  ## operating system utilities
+from   mpl_toolkits.mplot3d import Axes3D  ## needed for 3D plots
-
+import glob
-import plot_binary_data
+import os                                  ## operating system utilities
-import AMSS_NCKU_Input as input_data
+
 import plot_binary_data
 import AMSS_NCKU_Input as input_data
 import subprocess
 import sys
 import multiprocessing
 # plt.rcParams['text.usetex'] = True  ## enable LaTeX fonts in plots
@@ -50,13 +55,37 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
        file_list.append(x)
        print(x)
-    ## Plot each file in the list
+    ## Plot each file in parallel using subprocesses.
-    for filename in file_list:
+    ## Each subprocess starts with BLAS thread limits in plot_binary_data.py.
-        print(filename)
+    script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
-        plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
+    max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
-
+
-    print(                        )
+    running = []
-    print( " Binary Data Plot Has been Finished " )
+    failed  = []
    for filename in file_list:
        print(filename)
        proc = subprocess.Popen(
            [sys.executable, script, filename, binary_outdir, figure_outdir],
        )
        running.append( (proc, filename) )
        if len(running) >= max_workers:
            p, fn = running.pop(0)
            p.wait()
            if p.returncode != 0:
                failed.append(fn)
    for p, fn in running:
        p.wait()
        if p.returncode != 0:
            failed.append(fn)
    if failed:
        print( " WARNING: the following binary data plots failed:" )
        for fn in failed:
            print( "   ", fn )
    print(                        )
    print( " Binary Data Plot Has been Finished " )
    print(                                        )
    return
Author	SHA1	Message	Date
CGH0S7	45e3c725f9	Trigger-Discipline: parallelize result plotting	2026-04-24 10:04:57 +08:00
CGH0S7	7f603f189b	Trigger-Discipline: port TwoPuncture OpenMP optimizations	2026-04-24 09:25:13 +08:00
CGH0S7	a821f21a23	.gitignore updated	2026-04-24 09:10:12 +08:00
CGH0S7	34fe3e6aa5	Trigger-Discipline: port conservative build and fmisc optimizations	2026-04-24 09:09:50 +08:00
CGH0S7	79af79d471	baseline updated	2026-02-05 19:53:55 +08:00