Trigger-Discipline: parallelize result plotting

Trigger-Discipline: port TwoPuncture OpenMP optimizations
.gitignore updated
2026-04-24 10:04:57 +08:00 · 2026-04-24 09:25:13 +08:00 · 2026-04-24 09:10:12 +08:00 · 2026-04-24 09:09:50 +08:00 · 2026-02-05 19:53:55 +08:00
26 changed files with 3133 additions and 3087 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,5 @@
 __pycache__
 GW150914
-GW150914-origin
+GW150914*
-GW150914-mini
+.codex
-docs
+docs/
 *.tmp
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,14 +16,12 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 8                             ## number of mpi processes used in the simulation
+MPI_processes    = 64                             ## number of mpi processes used in the simulation
 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
 CPU_Part         = 1.0
 GPU_Part         = 0.0
 Debug_NaN_Check          = 0                       ## enable NaN checks in compute_rhs_bssn: 0 (off) or 1 (on)
 #################################################
--- a/AMSS_NCKU_Input_Mini.py
+++ b/AMSS_NCKU_Input_Mini.py
@@ -1,233 +0,0 @@
 #################################################
 ##
 ## This file provides the input parameters required for numerical relativity.
 ## XIAOQU
 ## 2024/03/19 --- 2025/09/14
 ## Modified for GW150914-mini test case
 ##
 #################################################
 import numpy    
 #################################################
 ## Setting MPI processes and the output file directory
 File_directory   = "GW150914-mini"               ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
 MPI_processes    = 4                             ## number of mpi processes used in the simulation (Reduced for laptop)
 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
 CPU_Part         = 1.0
 GPU_Part         = 0.0
 #################################################
 #################################################
 ## Setting the physical system and numerical method
 Symmetry                 = "equatorial-symmetry"   ## Symmetry of System: choose equatorial-symmetry、no-symmetry、octant-symmetry
 Equation_Class           = "BSSN"                  ## Evolution Equation: choose "BSSN", "BSSN-EScalar", "BSSN-EM", "Z4C" 
                                                   ## If "BSSN-EScalar" is chosen, it is necessary to set other parameters below
 Initial_Data_Method      = "Ansorg-TwoPuncture"    ## initial data method: choose "Ansorg-TwoPuncture", "Lousto-Analytical", "Cao-Analytical", "KerrSchild-Analytical"
 Time_Evolution_Method    = "runge-kutta-45"        ## time evolution method: choose "runge-kutta-45"
 Finite_Diffenence_Method = "4th-order"             ## finite-difference method: choose "2nd-order", "4th-order", "6th-order", "8th-order"
 Debug_NaN_Check          = 0                       ## enable NaN checks in compute_rhs_bssn: 0 (off) or 1 (on)
 #################################################
 #################################################
 ## Setting the time evolutionary information
 Start_Evolution_Time     = 0.0                    ## start evolution time t0
 Final_Evolution_Time     = 100.0                  ## final evolution time t1 (Reduced for quick test)
 Check_Time               = 10.0
 Dump_Time                = 10.0                   ## time interval dT for dumping binary data
 D2_Dump_Time             = 10.0                   ## dump the ascii data for 2d surface after dT'
 Analysis_Time            = 1.0                    ## dump the puncture position and GW psi4 after dT"
 Evolution_Step_Number    = 10000000               ## stop the calculation after the maximal step number
 Courant_Factor           = 0.5                    ## Courant Factor
 Dissipation              = 0.15                   ## Kreiss-Oliger Dissipation Strength
 #################################################
 #################################################
 ## Setting the grid structure
 basic_grid_set    = "Patch"                          ## grid structure: choose "Patch" or "Shell-Patch"
 grid_center_set   = "Cell"                           ## grid center: chose "Cell" or "Vertex"
 grid_level        = 7                                ## total number of AMR grid levels (Reduced from 9)
 static_grid_level = 4                                ## number of AMR static grid levels (Reduced from 5)
 moving_grid_level = grid_level - static_grid_level   ## number of AMR moving grid levels
 analysis_level    = 0
 refinement_level  = 3                                ## time refinement start from this grid level
 largest_box_xyz_max = [320.0, 320.0, 320.0]          ## scale of the largest box
                                                     ## not necessary to be cubic for "Patch" grid structure
                                                     ## need to be a cubic box for "Shell-Patch" grid structure
 largest_box_xyz_min = - numpy.array(largest_box_xyz_max)  
 static_grid_number = 48                              ## grid points of each static AMR grid (in x direction) (Reduced from 96)
                                                     ## (grid points in y and z directions are automatically adjusted)
 moving_grid_number = 24                              ## grid points of each moving AMR grid (Reduced from 48)
 shell_grid_number  = [32, 32, 100]                   ## grid points of Shell-Patch grid
                                                     ## in (phi, theta, r) direction
 devide_factor      = 2.0                             ## resolution between different grid levels dh0/dh1, only support 2.0 now
 static_grid_type   = 'Linear'                        ## AMR static grid structure , only supports "Linear"
 moving_grid_type   = 'Linear'                        ## AMR moving grid structure , only supports "Linear"
 quarter_sphere_number = 48                           ## grid number of 1/4 spherical surface (Reduced from 96)
                                                     ## (which is needed for evaluating the spherical surface integral)
 #################################################
 #################################################
 ## Setting the puncture information
 puncture_number       = 2                                     
 position_BH           = numpy.zeros( (puncture_number, 3) )   
 parameter_BH          = numpy.zeros( (puncture_number, 3) )   
 dimensionless_spin_BH = numpy.zeros( (puncture_number, 3) )   
 momentum_BH           = numpy.zeros( (puncture_number, 3) )   
 puncture_data_set     = "Manually"                       ## Method to give Puncture’s positions and momentum
                                                         ## choose "Manually" or "Automatically-BBH"
                                                         ## Prefer to choose "Manually", because "Automatically-BBH" is developing now
 ## initial orbital distance and ellipticity for BBHs system
 ## ( needed for "Automatically-BBH" case , not affect the "Manually" case )
 Distance = 10.0
 e0       = 0.0
 ## black hole parameter (M Q* a*)
 parameter_BH[0] = [ 36.0/(36.0+29.0),  0.0,  +0.31 ]   
 parameter_BH[1] = [ 29.0/(36.0+29.0),  0.0,  -0.46 ]  
 ## dimensionless spin in each direction
 dimensionless_spin_BH[0] = [ 0.0,  0.0,  +0.31 ]   
 dimensionless_spin_BH[1] = [ 0.0,  0.0,  -0.46 ]  
 ## use Brugmann's convention
 ##  -----0-----> y
 ##   -      +     
 #---------------------------------------------
 ## If puncture_data_set is chosen to be "Manually", it is necessary to set the position and momentum of each puncture manually
 ## initial position for each puncture
 position_BH[0]  = [  0.0,  10.0*29.0/(36.0+29.0), 0.0 ]  
 position_BH[1]  = [  0.0, -10.0*36.0/(36.0+29.0), 0.0 ] 
 ## initial momentum for each puncture
 ## (needed for "Manually" case, does not affect the "Automatically-BBH" case)
 momentum_BH[0]  = [ -0.09530152296974252,  -0.00084541526517121,   0.0 ]
 momentum_BH[1]  = [ +0.09530152296974252,  +0.00084541526517121,   0.0 ]
 #################################################
 #################################################
 ## Setting the gravitational wave information
 GW_L_max        = 4                      ## maximal L number in gravitational wave
 GW_M_max        = 4                      ## maximal M number in gravitational wave
 Detector_Number = 12                     ## number of detectors
 Detector_Rmin   = 50.0                   ## nearest detector distance
 Detector_Rmax   = 160.0                  ## farthest detector distance
 #################################################
 #################################################
 ## Setting the apparent horizon
 AHF_Find       = "no"                    ## whether to find the apparent horizon: choose "yes" or "no"
 AHF_Find_Every = 24
 AHF_Dump_Time  = 20.0
 #################################################
 #################################################
 ## Other parameters (testing)
 ## Only influence the Equation_Class = "BSSN-EScalar" case
 FR_a2     = 3.0        ## f(R) = R + a2 * R^2    
 FR_l2     = 10000.0
 FR_phi0   = 0.00005
 FR_r0     = 120.0
 FR_sigma0 = 8.0
 FR_Choice = 2          ## Choice options: 1 2 3 4 5
                       ## 1: phi(r) = phi0 * Exp(-(r-r0)**2/sigma0)   
                       ##    V(r)   = 0
                       ## 2: phi(r) =  phi0 * a2^2/(1+a2^2)  
                       ##    V(r)   = Exp(-8*Sqrt(PI/3)*phi(r)) * (1-Exp(4*Sqrt(PI/3)*phi(r)))**2 / (32*PI*a2)
                       ## 3: Schrodinger-Newton gived by system phi(r) 
                       ##    V(r)   = Exp(-8*Sqrt(PI/3)*phi(r)) * (1-Exp(4*Sqrt(PI/3)*phi(r)))**2 / (32*PI*a2)
                       ## 4: phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma0) - tanh((r-r0)/sigma0) )  
                       ##    V(r)   = 0
                       ##    f(R)   = R + a2*R^2  with a2 = +oo
                       ## 5: phi(r) = phi0 * Exp(-(r-r0)**2/sigma)   
                       ##    V(r)   = 0
 #################################################
 #################################################
 ## Other parameters (testing)
 ## (please do not change if not necessary)
 boundary_choice = "BAM-choice"     ## Sommerfeld boundary condition : choose "BAM-choice" or "Shibata-choice" 
                                   ## prefer "BAM-choice"
 gauge_choice  = 0                  ## gauge choice
                                   ## 0: B^i gauge
                                   ## 1: David's puncture gauge
                                   ## 2: MB B^i gauge               
                                   ## 3: RIT B^i gauge
                                   ## 4: MB beta gauge 
                                   ## 5: RIT beta gauge 
                                   ## 6: MGB1 B^i gauge
                                   ## 7: MGB2 B^i gauge
                                   ## prefer 0 or 1
 tetrad_type  = 2                   ## tetradtype 
                                   ##  v:r; u: phi; w: theta
                                   ##      v^a = (x,y,z)
                                   ## 0: orthonormal order: v,u,w
                                   ##    v^a = (x,y,z)   
                                   ##    m = (phi - i theta)/sqrt(2) 
                                   ##    following Frans, Eq.(8) of  PRD 75, 124018(2007)
                                   ## 1: orthonormal order: w,u,v
                                   ##    m = (theta + i phi)/sqrt(2) 
                                   ##    following Sperhake, Eq.(3.2) of  PRD 85, 124062(2012)    
                                   ## 2: orthonormal order: v,u,w
                                   ##    v_a = (x,y,z)
                                   ##    m = (phi - i theta)/sqrt(2) 
                                   ##    following Frans, Eq.(8) of  PRD 75, 124018(2007)
                                   ## this version recommend set to 2
                                   ## prefer 2
 #################################################
--- a/AMSS_NCKU_MiniProgram.py
+++ b/AMSS_NCKU_MiniProgram.py
@@ -1,224 +0,0 @@
 ##################################################################
 ##
 ## AMSS-NCKU Numerical Relativity Mini Test Program
 ## Author: Assistant (based on Xiaoqu's code)
 ## 2026/01/20
 ##
 ## This script runs a scaled-down version of the GW150914 test case
 ## suitable for laptop testing.
 ##
 ##################################################################
 import os
 import shutil
 import sys
 import time
 # --- Context Manager for Input File Swapping ---
 class InputFileSwapper:
    """Context manager that temporarily replaces the main input file.

    On entry the current ``target_file`` (if any) is moved to
    ``target_file + ".bak"`` and ``mini_file`` is copied into its place.
    On exit the copy is removed and the backup is moved back.

    Both paths are resolved to absolute paths when the object is created,
    so the restore step still acts on the right files if the process
    working directory is changed while the swap is active.

    Args:
        mini_file: Path of the replacement input file.
        target_file: Path of the input file to replace temporarily.
    """
    def __init__(self, mini_file="AMSS_NCKU_Input_Mini.py", target_file="AMSS_NCKU_Input.py"):
        self.mini_file = mini_file
        self.target_file = target_file
        self.backup_file = target_file + ".bak"
        self.swapped = False
        # Absolute twins of the public attributes; the public ones are kept
        # exactly as given because they are echoed in the progress messages.
        self._mini_path = os.path.abspath(mini_file)
        self._target_path = os.path.abspath(target_file)
        self._backup_path = self._target_path + ".bak"
    def __enter__(self):
        print(f"[MiniProgram] Swapping {self.target_file} with {self.mini_file}...")
        moved_original = False
        if os.path.exists(self._target_path):
            shutil.move(self._target_path, self._backup_path)
            moved_original = True
        try:
            shutil.copy(self._mini_path, self._target_path)
        except Exception:
            # __exit__ is not called when __enter__ raises, so undo the move
            # here; otherwise the original input would stay stranded as .bak.
            if moved_original:
                if os.path.exists(self._target_path):
                    os.remove(self._target_path)
                shutil.move(self._backup_path, self._target_path)
            raise
        self.swapped = True
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        if self.swapped:
            print(f"[MiniProgram] Restoring original {self.target_file}...")
            # Tolerate an already-missing copy so the backup is still restored.
            if os.path.exists(self._target_path):
                os.remove(self._target_path)
            if os.path.exists(self._backup_path):
                shutil.move(self._backup_path, self._target_path)
 def main():
    """Run the mini test case: set up, compile, execute, collect and plot.

    All work happens while InputFileSwapper is active. The steps, in order:
    create the output directory tree, generate the input and macro files,
    copy and compile the sources, run TwoPunctureABE (only for the
    "Ansorg-TwoPuncture" initial data method) and ABE, copy selected result
    files next to the executables, and produce the figures.

    The function returns early, after printing a message, if a project
    module cannot be imported, if the source directory is missing, or if
    an expected executable was not produced.
    """
    # Use the swapper to ensure all imported modules see the mini configuration
    with InputFileSwapper():
        # Import modules AFTER swapping input file
        try:
            import AMSS_NCKU_Input as input_data
            import print_information
            import setup
            import numerical_grid
            import generate_macrodef
            import makefile_and_run
            import generate_TwoPuncture_input
            import renew_puncture_parameter
            import plot_xiaoqu
            import plot_GW_strain_amplitude_xiaoqu
        except ImportError as e:
            print(f"Error importing modules: {e}")
            return
        print_information.print_program_introduction()
        print("\n" + "#"*60)
        print(" RUNNING MINI TEST CASE: GW150914-mini")
        print("#"*60 + "\n")
        # --- Directory Setup ---
        # Any existing output directory is deleted so the run starts clean.
        File_directory = os.path.join(input_data.File_directory)
        if os.path.exists(File_directory):
            print(f" Output directory '{File_directory}' exists. Removing for mini test...")
            shutil.rmtree(File_directory, ignore_errors=True)
        os.mkdir(File_directory)
        shutil.copy("AMSS_NCKU_Input.py", File_directory) # Copies the current (mini) input
        output_directory = os.path.join(File_directory, "AMSS_NCKU_output")
        os.mkdir(output_directory)
        binary_results_directory = os.path.join(output_directory, input_data.Output_directory)
        os.mkdir(binary_results_directory)
        figure_directory = os.path.join(File_directory, "figure")
        os.mkdir(figure_directory)
        print(" Output directories generated.\n")
        # --- Setup and Input Generation ---
        setup.print_input_data(File_directory)
        setup.generate_AMSSNCKU_input()
        setup.print_puncture_information()
        print("\n Generating AMSS-NCKU input parfile...")
        numerical_grid.append_AMSSNCKU_cgh_input()
        print("\n Plotting initial grid...")
        numerical_grid.plot_initial_grid()
        print("\n Generating macro files...")
        generate_macrodef.generate_macrodef_h()
        generate_macrodef.generate_macrodef_fh()
        # --- Compilation Preparation ---
        # The sources are compiled in a private copy inside the output tree,
        # with the generated macro headers copied in alongside them.
        print("\n Preparing to compile and run...")
        AMSS_NCKU_source_path = "AMSS_NCKU_source"
        AMSS_NCKU_source_copy = os.path.join(File_directory, "AMSS_NCKU_source_copy")
        if not os.path.exists(AMSS_NCKU_source_path):
             print(" Error: AMSS_NCKU_source not found! Please run in the project root.")
             return
        shutil.copytree(AMSS_NCKU_source_path, AMSS_NCKU_source_copy)
        macrodef_h_path  = os.path.join(File_directory, "macrodef.h") 
        macrodef_fh_path = os.path.join(File_directory, "macrodef.fh") 
        shutil.copy2(macrodef_h_path,  AMSS_NCKU_source_copy)
        shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)
        # --- Compilation ---
        # NOTE(review): the working directory is restored only on the success
        # path; an exception raised between chdir calls leaves it changed.
        cwd = os.getcwd()
        os.chdir(AMSS_NCKU_source_copy)
        print(" Compiling ABE...")
        makefile_and_run.makefile_ABE()
        if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ): 
            print(" Compiling TwoPunctureABE...")
            makefile_and_run.makefile_TwoPunctureABE()
        os.chdir(cwd)
        # --- Copy Executables ---
        # The executable name depends on the GPU setting: "ABE" or "ABEGPU".
        if (input_data.GPU_Calculation == "no"):
            ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE")
        else:
            ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABEGPU")
        if not os.path.exists(ABE_file):
            print(" Error: ABE executable compilation failed.")
            return
        shutil.copy2(ABE_file, output_directory)
        TwoPuncture_file = os.path.join(AMSS_NCKU_source_copy, "TwoPunctureABE")
        if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
            if not os.path.exists(TwoPuncture_file):
                print(" Error: TwoPunctureABE compilation failed.")
                return
            shutil.copy2(TwoPuncture_file, output_directory)
        # --- Execution ---
        # The timer starts here, so the reported cost excludes compilation.
        start_time = time.time()
        if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
             print("\n Generating TwoPuncture input...")
             generate_TwoPuncture_input.generate_AMSSNCKU_TwoPuncture_input()
             AMSS_NCKU_TwoPuncture_inputfile = 'AMSS-NCKU-TwoPuncture.input'
             AMSS_NCKU_TwoPuncture_inputfile_path = os.path.join( File_directory, AMSS_NCKU_TwoPuncture_inputfile )
             shutil.copy2( AMSS_NCKU_TwoPuncture_inputfile_path, os.path.join(output_directory, 'TwoPunctureinput.par') )
             print(" Running TwoPunctureABE...")
             os.chdir(output_directory)
             makefile_and_run.run_TwoPunctureABE()
             os.chdir(cwd)
        # Update Puncture Parameter
        renew_puncture_parameter.append_AMSSNCKU_BSSN_input(File_directory, output_directory)
        AMSS_NCKU_inputfile = 'AMSS-NCKU.input'
        AMSS_NCKU_inputfile_path = os.path.join(File_directory, AMSS_NCKU_inputfile)
        shutil.copy2( AMSS_NCKU_inputfile_path, os.path.join(output_directory, 'input.par') )
        print("\n Input files ready. Launching ABE...")
        os.chdir(output_directory)
        makefile_and_run.run_ABE()
        os.chdir(cwd)
        end_time = time.time()
        elapsed_time = end_time - start_time
        # --- Post-processing ---
        # Each result file is copied only if it exists; missing files are skipped silently.
        print("\n Copying output files for inspection...")
        AMSS_NCKU_error_file_path = os.path.join(binary_results_directory, "setting.par")
        if os.path.exists(AMSS_NCKU_error_file_path):
            shutil.copy( AMSS_NCKU_error_file_path, os.path.join(output_directory, "AMSSNCKU_setting_parameter") )
        AMSS_NCKU_error_file_path = os.path.join(binary_results_directory, "Error.log")
        if os.path.exists(AMSS_NCKU_error_file_path):
            shutil.copy( AMSS_NCKU_error_file_path, os.path.join(output_directory, "Error.log") )
        for fname in ["bssn_BH.dat", "bssn_ADMQs.dat", "bssn_psi4.dat", "bssn_constraint.dat"]:
            fpath = os.path.join(binary_results_directory, fname)
            if os.path.exists(fpath):
                shutil.copy(fpath, os.path.join(output_directory, fname))
        # --- Plotting ---
        # Plotting is best-effort: the first failure is reported as a warning
        # and the remaining plots are skipped.
        print("\n Plotting results...")
        try:
            plot_xiaoqu.generate_puncture_orbit_plot(   binary_results_directory, figure_directory )
            plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
            plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
            for i in range(input_data.Detector_Number):
                plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
                plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
            for i in range(input_data.Detector_Number):
                plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
            for i in range(input_data.grid_level):
                plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
            plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
        except Exception as e:
            print(f"Warning: Plotting failed: {e}")
        print(f"\n Program Cost = {elapsed_time:.2f} Seconds \n")
        print(" AMSS-NCKU-Python simulation finished (Mini Test).\n")
 if __name__ == "__main__":
    main()
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -9,9 +9,19 @@
 ##################################################################
-##################################################################
+##################################################################
-
+
-## Print program introduction
+## Guard against re-execution by multiprocessing child processes.
 ## Without this, using 'spawn' or 'forkserver' context would cause every
 ## worker to re-run the entire script.
 if __name__ != '__main__':
    import sys as _sys
    _sys.exit(0)
 ##################################################################
 ## Print program introduction
 import print_information
@@ -422,31 +432,36 @@ print( " Plotting the txt and binary results data from the AMSS-NCKU simulation
 print(                                                                          )
-import plot_xiaoqu
+import plot_xiaoqu
-import plot_GW_strain_amplitude_xiaoqu
+import plot_GW_strain_amplitude_xiaoqu
-
+from parallel_plot_helper import run_plot_tasks_parallel
-## Plot black hole trajectory
+
-plot_xiaoqu.generate_puncture_orbit_plot(   binary_results_directory, figure_directory )
+plot_tasks = []
-plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
+
-
+## Plot black hole trajectory
-## Plot black hole separation vs. time
+plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot,   (binary_results_directory, figure_directory) ) )
-plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
+plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) )
-
+
-## Plot gravitational waveforms (psi4 and strain amplitude)
+## Plot black hole separation vs. time
-for i in range(input_data.Detector_Number):
+plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) )
-    plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
+
-    plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
+## Plot gravitational waveforms (psi4 and strain amplitude)
-
+for i in range(input_data.Detector_Number):
-## Plot ADM mass evolution
+    plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) )
-for i in range(input_data.Detector_Number):
+    plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) )
-    plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
+
-
+## Plot ADM mass evolution
-## Plot Hamiltonian constraint violation over time
+for i in range(input_data.Detector_Number):
-for i in range(input_data.grid_level):
+    plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) )
-    plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
+
-
+## Plot Hamiltonian constraint violation over time
-## Plot stored binary data
+for i in range(input_data.grid_level):
-plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
+    plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) )
 run_plot_tasks_parallel(plot_tasks)
 ## Plot stored binary data
 plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
 print(                                                 )
 print( f" This Program Cost = {elapsed_time} Seconds " )
--- a/AMSS_NCKU_Verify_ASC26.py
+++ b/AMSS_NCKU_Verify_ASC26.py
@@ -1,279 +0,0 @@
 #!/usr/bin/env python3
 """
 AMSS-NCKU GW150914 Simulation Regression Test Script
 Verification Requirements:
 1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
 2. ADM constraint violation < 2 (Grid Level 0)
 RMS Calculation Method:
 - Computes trajectory deviation on the XY plane independently for BH1 and BH2
 - For each black hole: RMS = sqrt((1/M) * sum((Δr_i / r_i^max)^2)) × 100%
 - Final RMS = max(RMS_BH1, RMS_BH2)
 Usage: python3 AMSS_NCKU_Verify_ASC26.py [output_dir]
 Default: output_dir = GW150914/AMSS_NCKU_output
 Reference: GW150914-origin (baseline simulation)
 """
 import numpy as np
 import sys
 import os
 # ANSI Color Codes
 class Color:
    # ANSI escape sequences used to style the text of the printed report.
    # Each styled fragment is expected to be terminated with RESET.
    GREEN = '\033[92m'
    RED = '\033[91m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    BOLD = '\033[1m'
    RESET = '\033[0m'
 def get_status_text(passed):
    """Return a bold, colored "PASS" (green) or "FAIL" (red) label."""
    tint, word = (Color.GREEN, "PASS") if passed else (Color.RED, "FAIL")
    return f"{tint}{Color.BOLD}{word}{Color.RESET}"
 def load_bh_trajectory(filepath):
    """Load black hole trajectory data from a whitespace-separated text file.

    The first seven columns are read as: time, then x/y/z of the first
    black hole, then x/y/z of the second black hole.

    Args:
        filepath: Path of the trajectory file.

    Returns:
        Dict mapping 'time', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2' to 1-D
        arrays holding one value per row of the file.

    Raises:
        ValueError: If the file provides fewer than seven columns.
    """
    # ndmin=2 keeps a single-row file two-dimensional; without it loadtxt
    # returns a 1-D array and the column slicing below fails.
    data = np.loadtxt(filepath, ndmin=2)
    column_names = ('time', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2')
    if data.shape[1] < len(column_names):
        raise ValueError(
            f"Expected at least {len(column_names)} columns in {filepath}, found {data.shape[1]}"
        )
    return {name: data[:, index] for index, name in enumerate(column_names)}
 def load_constraint_data(filepath):
    """Read constraint violation rows from a text file into an array.

    Lines that start with '#' are ignored, as are lines with fewer than
    eight whitespace-separated fields. Only the first eight fields of each
    remaining line are kept, converted to float.
    """
    with open(filepath, 'r') as handle:
        tokenized = (raw.split() for raw in handle if not raw.startswith('#'))
        rows = [
            [float(token) for token in fields[:8]]
            for fields in tokenized
            if len(fields) >= 8
        ]
    return np.array(rows)
 def calculate_rms_error(bh_data_ref, bh_data_target):
    """
    Compare two trajectories on the XY plane and return a relative RMS error.

    Both datasets are truncated to the length of the shorter one and compared
    sample by sample. For each black hole:

        RMS = sqrt( mean( (dr_i / r_i^max)^2 ) ) * 100

    where dr_i is the XY distance between the reference and target positions
    and r_i^max is the larger of the two XY distances from the origin.
    Samples with r_i^max <= 1e-15 are left out of the mean.

    Args:
        bh_data_ref: Reference (baseline) trajectory data
        bh_data_target: Target (optimized) trajectory data
    Returns:
        (rms, None) with rms the larger of the two per-hole values in
        percent, or (None, message) when fewer than 10 samples are usable.
    """
    n_samples = min(len(bh_data_ref['time']), len(bh_data_target['time']))
    if n_samples < 10:
        return None, "Insufficient data points for comparison"
    coordinate_keys = ('x1', 'y1', 'x2', 'y2')
    ref = {key: bh_data_ref[key][:n_samples] for key in coordinate_keys}
    new = {key: bh_data_target[key][:n_samples] for key in coordinate_keys}

    def percent_rms(x_key, y_key):
        # Returns None when too few samples have a non-zero normalization radius.
        offset = np.sqrt((ref[x_key] - new[x_key])**2 + (ref[y_key] - new[y_key])**2)
        radius_ref = np.sqrt(ref[x_key]**2 + ref[y_key]**2)
        radius_new = np.sqrt(new[x_key]**2 + new[y_key]**2)
        scale = np.maximum(radius_ref, radius_new)
        usable = scale > 1e-15
        if np.sum(usable) < 10:
            return None
        return np.sqrt(np.mean((offset[usable] / scale[usable])**2)) * 100

    holes = (
        ('x1', 'y1', "Insufficient valid data points for BH1"),
        ('x2', 'y2', "Insufficient valid data points for BH2"),
    )
    per_hole = []
    for x_key, y_key, failure_message in holes:
        value = percent_rms(x_key, y_key)
        if value is None:
            return None, failure_message
        per_hole.append(value)
    # The reported figure is the worse of the two black holes
    return max(per_hole[0], per_hole[1]), None
 def analyze_constraint_violation(constraint_data, n_levels=9):
    """
    Summarize the constraint violation of every n_levels-th row.

    Rows 0, n_levels, 2*n_levels, ... of constraint_data are taken as the
    Grid Level 0 entries. For columns 1..7 (Ham, Px, Py, Pz, Gx, Gy, Gz) the
    largest absolute value over those rows is returned, together with the
    overall maximum under the key 'max_violation'.
    """
    level0_rows = constraint_data[::n_levels]
    labels = ('Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz')
    results = {}
    for column, label in enumerate(labels, start=1):
        results[label] = np.max(np.abs(level0_rows[:, column]))
    results['max_violation'] = max(results.values())
    return results
 def print_header():
    """Print the report title framed by two blue rules."""
    rule = Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET
    title = Color.BOLD + "   AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET
    print("\n" + rule)
    print(title)
    print(rule)
 def print_rms_results(rms_rel, error, threshold=1.0):
    """Print the RMS section of the report and return whether it passed.

    When error is truthy it is printed and the check counts as failed;
    otherwise the check passes if rms_rel is below threshold.
    """
    print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
    print("-" * 45)
    if not error:
        passed = rms_rel < threshold
        report_lines = (
            f"   RMS relative error: {rms_rel:.4f}%",
            f"   Requirement:        < {threshold}%",
            f"   Status:             {get_status_text(passed)}",
        )
        for report_line in report_lines:
            print(report_line)
        return passed
    print(f"   {Color.RED}Error: {error}{Color.RESET}")
    return False
 def print_constraint_results(results, threshold=2.0):
    """Print the constraint section of the report and return whether it passed.

    The seven per-constraint maxima are laid out two per row; the check
    passes if results['max_violation'] is below threshold.
    """
    print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
    print("-" * 45)
    names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
    last_position = len(names)
    for position, name in enumerate(names, start=1):
        # A row ends after every second entry, and after the final entry
        closes_row = position % 2 == 0 or position == last_position
        print(f"   Max |{name:3}|: {results[name]:.6f}", end="   \n" if closes_row else "   ")
    worst = results['max_violation']
    passed = worst < threshold
    print(f"\n   Maximum violation:  {worst:.6f}")
    print(f"   Requirement:        < {threshold}")
    print(f"   Status:             {get_status_text(passed)}")
    return passed
 def print_summary(rms_passed, constraint_passed):
    """Print the closing summary and return the combined pass/fail result."""
    rule = Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET
    print("\n" + rule)
    print(Color.BOLD + "Verification Summary" + Color.RESET)
    print(rule)
    all_passed = rms_passed and constraint_passed
    print(f"   [1] RMS trajectory check:         {get_status_text(rms_passed)}")
    print(f"   [2] ADM constraint check:         {get_status_text(constraint_passed)}")
    if all_passed:
        final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}"
    else:
        final_status = f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
    print(f"\n   Overall result: {final_status}")
    print(rule + "\n")
    return all_passed
 def main():
    """Run both checks and exit with status 0 if all passed, 1 otherwise.

    The target directory is the first command-line argument when given,
    else GW150914/AMSS_NCKU_output next to this script. The baseline is
    always GW150914-origin/AMSS_NCKU_output next to this script. The
    program exits with status 1 as soon as a required data file is missing.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    cli_args = sys.argv[1:]
    target_dir = cli_args[0] if cli_args else os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
    reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
    bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
    bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
    constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
    # Stop at the first missing input, checked in this fixed order
    required_inputs = (
        ("Baseline trajectory file", bh_file_ref),
        ("Target trajectory file", bh_file_target),
        ("Constraint data file", constraint_file),
    )
    for description, path in required_inputs:
        if not os.path.exists(path):
            print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} {description} not found: {path}")
            sys.exit(1)
    print_header()
    print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
    print(f"{Color.BOLD}Target (Optimized):  {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
    # All three files are loaded before any section of the report is printed
    reference_track = load_bh_trajectory(bh_file_ref)
    target_track = load_bh_trajectory(bh_file_target)
    constraint_rows = load_constraint_data(constraint_file)
    rms_rel, error = calculate_rms_error(reference_track, target_track)
    rms_passed = print_rms_results(rms_rel, error)
    constraint_passed = print_constraint_results(analyze_constraint_violation(constraint_rows))
    all_passed = print_summary(rms_passed, constraint_passed)
    sys.exit(0 if all_passed else 1)
 if __name__ == "__main__":
    main()
--- a/AMSS_NCKU_source/FFT.f90
+++ b/AMSS_NCKU_source/FFT.f90
@@ -37,51 +37,57 @@ close(77)
 end program checkFFT
 #endif
 !-------------
 ! Optimized FFT using Intel oneMKL DFTI
 ! Mathematical equivalence: Standard DFT definition
 !   Forward (isign=1):  X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
 !   Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
 ! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
 !-------------
 SUBROUTINE four1(dataa,nn,isign)
 use MKL_DFTI
 implicit none
-INTEGER, intent(in) :: isign, nn
+INTEGER::isign,nn
-DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa
+double precision,dimension(2*nn)::dataa
-
+INTEGER::i,istep,j,m,mmax,n
-type(DFTI_DESCRIPTOR), pointer :: desc
+double precision::tempi,tempr
-integer :: status
+DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
-
+n=2*nn
-! Create DFTI descriptor for 1D complex-to-complex transform
+j=1
-status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn)
+do i=1,n,2
-if (status /= 0) return
+  if(j.gt.i)then
-
+     tempr=dataa(j)
-! Set input/output storage as interleaved complex (default)
+     tempi=dataa(j+1)
-status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
+     dataa(j)=dataa(i)
-if (status /= 0) then
+     dataa(j+1)=dataa(i+1)
-   status = DftiFreeDescriptor(desc)
+     dataa(i)=tempr
-   return
+     dataa(i+1)=tempi
  endif
  m=nn
 1 if ((m.ge.2).and.(j.gt.m)) then
  j=j-m
  m=m/2
 goto 1
  endif
 j=j+m
 enddo
 mmax=2
 2  if (n.gt.mmax) then
     istep=2*mmax
     theta=6.28318530717959d0/(isign*mmax)
     wpr=-2.d0*sin(0.5d0*theta)**2
     wpi=sin(theta)
     wr=1.d0
     wi=0.d0
     do m=1,mmax,2
       do i=m,n,istep
         j=i+mmax
 ! NOTE(review): dataa, wr and wi are declared double precision above, but sngl()
 ! rounds the twiddle factors to single precision before each multiply.
 ! Confirm that this loss of precision is intended.
         tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
         tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
         dataa(j)=dataa(i)-tempr
         dataa(j+1)=dataa(i+1)-tempi
         dataa(i)=dataa(i)+tempr
         dataa(i+1)=dataa(i+1)+tempi
       enddo
          wtemp=wr
          wr=wr*wpr-wi*wpi+wr
          wi=wi*wpr+wtemp*wpi+wi
     enddo
 mmax=istep
 goto 2
 endif
 ! NOTE(review): the DFTI calls below use desc and status, which are declared only
 ! on removed (-) lines of this hunk. Confirm which of the two implementations
 ! (in-place butterfly loop above, or the DFTI calls below) the file actually keeps.
 ! Commit the descriptor
 status = DftiCommitDescriptor(desc)
 if (status /= 0) then
   status = DftiFreeDescriptor(desc)
   return
 endif
 ! Execute FFT based on direction
 if (isign == 1) then
   ! Forward FFT: exp(-2*pi*i*k*n/N)
   status = DftiComputeForward(desc, dataa)
 else
   ! Backward FFT: exp(+2*pi*i*k*n/N)
   status = DftiComputeBackward(desc, dataa)
 endif
 ! Free descriptor
 status = DftiFreeDescriptor(desc)
 return
 END SUBROUTINE four1
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
--- a/AMSS_NCKU_source/TwoPunctures.h
+++ b/AMSS_NCKU_source/TwoPunctures.h
@@ -1,7 +1,8 @@
 #ifndef TWO_PUNCTURES_H
 #define TWO_PUNCTURES_H
 #include <omp.h>
 #define StencilSize 19
 #define N_PlaneRelax 1
 #define NRELAX 200
@@ -32,7 +33,7 @@ private:
       int npoints_A, npoints_B, npoints_phi;
       double target_M_plus, target_M_minus;
-       
+
       double admMass;
       double adm_tol;
@@ -42,6 +43,18 @@ private:
       int ntotal;
       // ===== Precomputed spectral derivative matrices =====
       double *D1_A, *D2_A;
       double *D1_B, *D2_B;
       double *DF1_phi, *DF2_phi;
       // ===== Pre-allocated workspace for LineRelax (per-thread) =====
       int max_threads;
       double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be;
       double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be;
       double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al;
       double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al;
       struct parameters
       {
              int nvar, n1, n2, n3;
@@ -58,6 +71,28 @@ public:
                    int Newtonmaxit);
       ~TwoPunctures();
       // 02/07: New/modified methods
       void allocate_workspace();
       void free_workspace();
       void precompute_derivative_matrices();
       void build_cheb_deriv_matrices(int n, double *D1, double *D2);
       void build_fourier_deriv_matrices(int N, double *DF1, double *DF2);
       void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v);
       void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q,
                                double *l, double *u_ws, double *d, double *y);
       void LineRelax_be_omp(double *dv,
                         int const i, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols, int **cols,
                         double **JFD, int tid);
       void LineRelax_al_omp(double *dv,
                         int const j, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols,
                         int **cols, double **JFD, int tid);
       void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
                  double const *rhs, int const *ncols, int **cols, double **JFD);
       void Solve();
       void set_initial_guess(derivs v);
       int index(int i, int j, int k, int l, int a, int b, int c, int d);
@@ -116,23 +151,11 @@ public:
       double BY_KKofxyz(double x, double y, double z);
       void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix);
       void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u);
       void relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
                  double const *rhs, int const *ncols, int **cols, double **JFD);
       void LineRelax_be(double *dv,
                         int const i, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols, int **cols,
                         double **JFD);
       void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
                         int n3, derivs dv, derivs u, double *values);
       void LinEquations(double A, double B, double X, double R,
                         double x, double r, double phi,
                         double y, double z, derivs dU, derivs U, double *values);
       void LineRelax_al(double *dv,
                         int const j, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols,
                         int **cols, double **JFD);
       void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q);
       void Save(char *fname);
       // provided by Vasileios Paschalidis (vpaschal@illinois.edu)
@@ -141,4 +164,4 @@ public:
       void SpecCoef(parameters par, int ivar, double *v, double *cf);
 };
-#endif /* TWO_PUNCTURES_H */
+#endif /* TWO_PUNCTURES_H */
--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
--- a/AMSS_NCKU_source/diff_new.f90
+++ b/AMSS_NCKU_source/diff_new.f90
@@ -1939,309 +1939,6 @@
  return
  end subroutine fddyz
  subroutine fderivs_batch4(ex,f1,f2,f3,f4, &
                            f1x,f1y,f1z,f2x,f2y,f2z,f3x,f3y,f3z,f4x,f4y,f4z, &
                            X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
 !
 ! Centered first derivatives of four fields that share one symmetry setup.
 !
 !   ex             : number of grid points in x, y and z
 !   f1..f4         : input fields
 !   fNx,fNy,fNz    : x, y and z derivative of field N (output)
 !   X,Y,Z          : grid coordinates; the spacing is taken from the first two
 !                    entries of each array (uniform spacing is assumed)
 !   SYM1,SYM2,SYM3 : parity factors handed to symmetry_bd for all four fields
 !   symmetry       : symmetry mode, compared against NO_SYMM and EQ_SYMM below
 !   onoff          : not referenced in this routine
 !
 ! A point with at least two usable layers on every side in all three
 ! directions gets the five-point stencil
 !   (f(-2) - 8 f(-1) + 8 f(+1) - f(+2)) / (12 h),
 ! a point with exactly one usable layer on its tightest side gets
 !   (f(+1) - f(-1)) / (2 h),
 ! and every other point keeps the value zero.
 !
  implicit none
 !~~~~~~ arguments
  integer, intent(in) :: ex(1:3)
  integer, intent(in) :: symmetry,onoff
  real*8, dimension(ex(1),ex(2),ex(3)), intent(in)  :: f1,f2,f3,f4
  real*8, dimension(ex(1),ex(2),ex(3)), intent(out) :: f1x,f1y,f1z
  real*8, dimension(ex(1),ex(2),ex(3)), intent(out) :: f2x,f2y,f2z
  real*8, dimension(ex(1),ex(2),ex(3)), intent(out) :: f3x,f3y,f3z
  real*8, dimension(ex(1),ex(2),ex(3)), intent(out) :: f4x,f4y,f4z
  real*8, intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
  real*8, intent(in) :: SYM1,SYM2,SYM3
 !~~~~~~ constants
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0, ONE=1.d0, TWO=2.d0, EIT=8.d0, F12=1.2d1
 !~~~~~~ work variables
  real*8 :: hx,hy,hz
  real*8 :: c4x,c4y,c4z,c2x,c2y,c2z
  real*8, dimension(3) :: parity
 ! copies of the fields extended by two extra layers (index 0 and -1) per direction
  real*8, dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: g1,g2,g3,g4
  integer :: ilo,jlo,klo,ihi,jhi,khi
  integer :: i,j,k,margin
 ! grid spacing
  hx = X(2)-X(1)
  hy = Y(2)-Y(1)
  hz = Z(2)-Z(1)
 ! highest usable index per direction
  ihi = ex(1)
  jhi = ex(2)
  khi = ex(3)
 ! lowest usable index per direction: -1 when the symmetry mode is active for that
 ! direction and the first coordinate lies within one spacing of zero, otherwise 1
  klo = merge(-1, 1, symmetry > NO_SYMM .and. dabs(Z(1)) < hz)
  ilo = merge(-1, 1, symmetry > EQ_SYMM .and. dabs(X(1)) < hx)
  jlo = merge(-1, 1, symmetry > EQ_SYMM .and. dabs(Y(1)) < hy)
 ! build the extended copies (symmetry_bd is defined elsewhere)
  parity = (/ SYM1, SYM2, SYM3 /)
  call symmetry_bd(2,ex,f1,g1,parity)
  call symmetry_bd(2,ex,f2,g2,parity)
  call symmetry_bd(2,ex,f3,g3,parity)
  call symmetry_bd(2,ex,f4,g4,parity)
 ! stencil prefactors: 1/(12 h) and 1/(2 h)
  c4x = ONE/F12/hx
  c4y = ONE/F12/hy
  c4z = ONE/F12/hz
  c2x = ONE/TWO/hx
  c2y = ONE/TWO/hy
  c2z = ONE/TWO/hz
 ! points that receive no stencil below stay zero
  f1x = ZEO
  f1y = ZEO
  f1z = ZEO
  f2x = ZEO
  f2y = ZEO
  f2z = ZEO
  f3x = ZEO
  f3y = ZEO
  f3z = ZEO
  f4x = ZEO
  f4y = ZEO
  f4z = ZEO
  do k=1,khi-1
  do j=1,jhi-1
  do i=1,ihi-1
 ! number of usable layers on the tightest side of point (i,j,k)
   margin = min(ihi-i, i-ilo, jhi-j, j-jlo, khi-k, k-klo)
   select case (margin)
   case (2:)
 ! five-point stencil: x direction, then y, then z
      f1x(i,j,k)=c4x*(g1(i-2,j,k)-EIT*g1(i-1,j,k)+EIT*g1(i+1,j,k)-g1(i+2,j,k))
      f2x(i,j,k)=c4x*(g2(i-2,j,k)-EIT*g2(i-1,j,k)+EIT*g2(i+1,j,k)-g2(i+2,j,k))
      f3x(i,j,k)=c4x*(g3(i-2,j,k)-EIT*g3(i-1,j,k)+EIT*g3(i+1,j,k)-g3(i+2,j,k))
      f4x(i,j,k)=c4x*(g4(i-2,j,k)-EIT*g4(i-1,j,k)+EIT*g4(i+1,j,k)-g4(i+2,j,k))
      f1y(i,j,k)=c4y*(g1(i,j-2,k)-EIT*g1(i,j-1,k)+EIT*g1(i,j+1,k)-g1(i,j+2,k))
      f2y(i,j,k)=c4y*(g2(i,j-2,k)-EIT*g2(i,j-1,k)+EIT*g2(i,j+1,k)-g2(i,j+2,k))
      f3y(i,j,k)=c4y*(g3(i,j-2,k)-EIT*g3(i,j-1,k)+EIT*g3(i,j+1,k)-g3(i,j+2,k))
      f4y(i,j,k)=c4y*(g4(i,j-2,k)-EIT*g4(i,j-1,k)+EIT*g4(i,j+1,k)-g4(i,j+2,k))
      f1z(i,j,k)=c4z*(g1(i,j,k-2)-EIT*g1(i,j,k-1)+EIT*g1(i,j,k+1)-g1(i,j,k+2))
      f2z(i,j,k)=c4z*(g2(i,j,k-2)-EIT*g2(i,j,k-1)+EIT*g2(i,j,k+1)-g2(i,j,k+2))
      f3z(i,j,k)=c4z*(g3(i,j,k-2)-EIT*g3(i,j,k-1)+EIT*g3(i,j,k+1)-g3(i,j,k+2))
      f4z(i,j,k)=c4z*(g4(i,j,k-2)-EIT*g4(i,j,k-1)+EIT*g4(i,j,k+1)-g4(i,j,k+2))
   case (1)
 ! three-point stencil: x direction, then y, then z
      f1x(i,j,k)=c2x*(-g1(i-1,j,k)+g1(i+1,j,k))
      f2x(i,j,k)=c2x*(-g2(i-1,j,k)+g2(i+1,j,k))
      f3x(i,j,k)=c2x*(-g3(i-1,j,k)+g3(i+1,j,k))
      f4x(i,j,k)=c2x*(-g4(i-1,j,k)+g4(i+1,j,k))
      f1y(i,j,k)=c2y*(-g1(i,j-1,k)+g1(i,j+1,k))
      f2y(i,j,k)=c2y*(-g2(i,j-1,k)+g2(i,j+1,k))
      f3y(i,j,k)=c2y*(-g3(i,j-1,k)+g3(i,j+1,k))
      f4y(i,j,k)=c2y*(-g4(i,j-1,k)+g4(i,j+1,k))
      f1z(i,j,k)=c2z*(-g1(i,j,k-1)+g1(i,j,k+1))
      f2z(i,j,k)=c2z*(-g2(i,j,k-1)+g2(i,j,k+1))
      f3z(i,j,k)=c2z*(-g3(i,j,k-1)+g3(i,j,k+1))
      f4z(i,j,k)=c2z*(-g4(i,j,k-1)+g4(i,j,k+1))
   end select
  enddo
  enddo
  enddo
  return
  end subroutine fderivs_batch4
 !-----------------------------------------------------------------------------
 ! batch first derivatives (3 fields), same symmetry setup
 !-----------------------------------------------------------------------------
  subroutine fderivs_batch3(ex,f1,f2,f3, &
                            f1x,f1y,f1z,f2x,f2y,f2z,f3x,f3y,f3z, &
                            X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
 !
 ! Centered first derivatives of three fields that share one symmetry setup.
 !
 !   ex             : number of grid points in x, y and z
 !   f1..f3         : input fields
 !   fNx,fNy,fNz    : x, y and z derivative of field N (output)
 !   X,Y,Z          : grid coordinates; the spacing is taken from the first two
 !                    entries of each array (uniform spacing is assumed)
 !   SYM1,SYM2,SYM3 : parity factors handed to symmetry_bd for all three fields
 !   symmetry       : symmetry mode, compared against NO_SYMM and EQ_SYMM below
 !   onoff          : not referenced in this routine
 !
 ! A point with at least two usable layers on every side in all three
 ! directions gets the five-point stencil
 !   (f(-2) - 8 f(-1) + 8 f(+1) - f(+2)) / (12 h),
 ! a point with exactly one usable layer on its tightest side gets
 !   (f(+1) - f(-1)) / (2 h),
 ! and every other point keeps the value zero.
 !
  implicit none
 !~~~~~~ arguments
  integer, intent(in) :: ex(1:3)
  integer, intent(in) :: symmetry,onoff
  real*8, dimension(ex(1),ex(2),ex(3)), intent(in)  :: f1,f2,f3
  real*8, dimension(ex(1),ex(2),ex(3)), intent(out) :: f1x,f1y,f1z
  real*8, dimension(ex(1),ex(2),ex(3)), intent(out) :: f2x,f2y,f2z
  real*8, dimension(ex(1),ex(2),ex(3)), intent(out) :: f3x,f3y,f3z
  real*8, intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
  real*8, intent(in) :: SYM1,SYM2,SYM3
 !~~~~~~ constants
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0, ONE=1.d0, TWO=2.d0, EIT=8.d0, F12=1.2d1
 !~~~~~~ work variables
  real*8 :: hx,hy,hz
  real*8 :: c4x,c4y,c4z,c2x,c2y,c2z
  real*8, dimension(3) :: parity
 ! copies of the fields extended by two extra layers (index 0 and -1) per direction
  real*8, dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: g1,g2,g3
  integer :: ilo,jlo,klo,ihi,jhi,khi
  integer :: i,j,k,margin
 ! grid spacing
  hx = X(2)-X(1)
  hy = Y(2)-Y(1)
  hz = Z(2)-Z(1)
 ! highest usable index per direction
  ihi = ex(1)
  jhi = ex(2)
  khi = ex(3)
 ! lowest usable index per direction: -1 when the symmetry mode is active for that
 ! direction and the first coordinate lies within one spacing of zero, otherwise 1
  klo = merge(-1, 1, symmetry > NO_SYMM .and. dabs(Z(1)) < hz)
  ilo = merge(-1, 1, symmetry > EQ_SYMM .and. dabs(X(1)) < hx)
  jlo = merge(-1, 1, symmetry > EQ_SYMM .and. dabs(Y(1)) < hy)
 ! build the extended copies (symmetry_bd is defined elsewhere)
  parity = (/ SYM1, SYM2, SYM3 /)
  call symmetry_bd(2,ex,f1,g1,parity)
  call symmetry_bd(2,ex,f2,g2,parity)
  call symmetry_bd(2,ex,f3,g3,parity)
 ! stencil prefactors: 1/(12 h) and 1/(2 h)
  c4x = ONE/F12/hx
  c4y = ONE/F12/hy
  c4z = ONE/F12/hz
  c2x = ONE/TWO/hx
  c2y = ONE/TWO/hy
  c2z = ONE/TWO/hz
 ! points that receive no stencil below stay zero
  f1x = ZEO
  f1y = ZEO
  f1z = ZEO
  f2x = ZEO
  f2y = ZEO
  f2z = ZEO
  f3x = ZEO
  f3y = ZEO
  f3z = ZEO
  do k=1,khi-1
  do j=1,jhi-1
  do i=1,ihi-1
 ! number of usable layers on the tightest side of point (i,j,k)
   margin = min(ihi-i, i-ilo, jhi-j, j-jlo, khi-k, k-klo)
   select case (margin)
   case (2:)
 ! five-point stencil: x direction, then y, then z
      f1x(i,j,k)=c4x*(g1(i-2,j,k)-EIT*g1(i-1,j,k)+EIT*g1(i+1,j,k)-g1(i+2,j,k))
      f2x(i,j,k)=c4x*(g2(i-2,j,k)-EIT*g2(i-1,j,k)+EIT*g2(i+1,j,k)-g2(i+2,j,k))
      f3x(i,j,k)=c4x*(g3(i-2,j,k)-EIT*g3(i-1,j,k)+EIT*g3(i+1,j,k)-g3(i+2,j,k))
      f1y(i,j,k)=c4y*(g1(i,j-2,k)-EIT*g1(i,j-1,k)+EIT*g1(i,j+1,k)-g1(i,j+2,k))
      f2y(i,j,k)=c4y*(g2(i,j-2,k)-EIT*g2(i,j-1,k)+EIT*g2(i,j+1,k)-g2(i,j+2,k))
      f3y(i,j,k)=c4y*(g3(i,j-2,k)-EIT*g3(i,j-1,k)+EIT*g3(i,j+1,k)-g3(i,j+2,k))
      f1z(i,j,k)=c4z*(g1(i,j,k-2)-EIT*g1(i,j,k-1)+EIT*g1(i,j,k+1)-g1(i,j,k+2))
      f2z(i,j,k)=c4z*(g2(i,j,k-2)-EIT*g2(i,j,k-1)+EIT*g2(i,j,k+1)-g2(i,j,k+2))
      f3z(i,j,k)=c4z*(g3(i,j,k-2)-EIT*g3(i,j,k-1)+EIT*g3(i,j,k+1)-g3(i,j,k+2))
   case (1)
 ! three-point stencil: x direction, then y, then z
      f1x(i,j,k)=c2x*(-g1(i-1,j,k)+g1(i+1,j,k))
      f2x(i,j,k)=c2x*(-g2(i-1,j,k)+g2(i+1,j,k))
      f3x(i,j,k)=c2x*(-g3(i-1,j,k)+g3(i+1,j,k))
      f1y(i,j,k)=c2y*(-g1(i,j-1,k)+g1(i,j+1,k))
      f2y(i,j,k)=c2y*(-g2(i,j-1,k)+g2(i,j+1,k))
      f3y(i,j,k)=c2y*(-g3(i,j-1,k)+g3(i,j+1,k))
      f1z(i,j,k)=c2z*(-g1(i,j,k-1)+g1(i,j,k+1))
      f2z(i,j,k)=c2z*(-g2(i,j,k-1)+g2(i,j,k+1))
      f3z(i,j,k)=c2z*(-g3(i,j,k-1)+g3(i,j,k+1))
   end select
  enddo
  enddo
  enddo
  return
  end subroutine fderivs_batch3
 !-----------------------------------------------------------------------------
 ! batch first derivatives (2 fields), same symmetry setup
 !-----------------------------------------------------------------------------
  subroutine fderivs_batch2(ex,f1,f2, &
                            f1x,f1y,f1z,f2x,f2y,f2z, &
                            X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
 !
 ! Centered first derivatives of two fields that share one symmetry setup.
 !
 !   ex             : number of grid points in x, y and z
 !   f1,f2          : input fields
 !   fNx,fNy,fNz    : x, y and z derivative of field N (output)
 !   X,Y,Z          : grid coordinates; the spacing is taken from the first two
 !                    entries of each array (uniform spacing is assumed)
 !   SYM1,SYM2,SYM3 : parity factors handed to symmetry_bd for both fields
 !   symmetry       : symmetry mode, compared against NO_SYMM and EQ_SYMM below
 !   onoff          : not referenced in this routine
 !
 ! A point with at least two usable layers on every side in all three
 ! directions gets the five-point stencil
 !   (f(-2) - 8 f(-1) + 8 f(+1) - f(+2)) / (12 h),
 ! a point with exactly one usable layer on its tightest side gets
 !   (f(+1) - f(-1)) / (2 h),
 ! and every other point keeps the value zero.
 !
  implicit none
 !~~~~~~ arguments
  integer, intent(in) :: ex(1:3)
  integer, intent(in) :: symmetry,onoff
  real*8, dimension(ex(1),ex(2),ex(3)), intent(in)  :: f1,f2
  real*8, dimension(ex(1),ex(2),ex(3)), intent(out) :: f1x,f1y,f1z
  real*8, dimension(ex(1),ex(2),ex(3)), intent(out) :: f2x,f2y,f2z
  real*8, intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
  real*8, intent(in) :: SYM1,SYM2,SYM3
 !~~~~~~ constants
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0, ONE=1.d0, TWO=2.d0, EIT=8.d0, F12=1.2d1
 !~~~~~~ work variables
  real*8 :: hx,hy,hz
  real*8 :: c4x,c4y,c4z,c2x,c2y,c2z
  real*8, dimension(3) :: parity
 ! copies of the fields extended by two extra layers (index 0 and -1) per direction
  real*8, dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: g1,g2
  integer :: ilo,jlo,klo,ihi,jhi,khi
  integer :: i,j,k,margin
 ! grid spacing
  hx = X(2)-X(1)
  hy = Y(2)-Y(1)
  hz = Z(2)-Z(1)
 ! highest usable index per direction
  ihi = ex(1)
  jhi = ex(2)
  khi = ex(3)
 ! lowest usable index per direction: -1 when the symmetry mode is active for that
 ! direction and the first coordinate lies within one spacing of zero, otherwise 1
  klo = merge(-1, 1, symmetry > NO_SYMM .and. dabs(Z(1)) < hz)
  ilo = merge(-1, 1, symmetry > EQ_SYMM .and. dabs(X(1)) < hx)
  jlo = merge(-1, 1, symmetry > EQ_SYMM .and. dabs(Y(1)) < hy)
 ! build the extended copies (symmetry_bd is defined elsewhere)
  parity = (/ SYM1, SYM2, SYM3 /)
  call symmetry_bd(2,ex,f1,g1,parity)
  call symmetry_bd(2,ex,f2,g2,parity)
 ! stencil prefactors: 1/(12 h) and 1/(2 h)
  c4x = ONE/F12/hx
  c4y = ONE/F12/hy
  c4z = ONE/F12/hz
  c2x = ONE/TWO/hx
  c2y = ONE/TWO/hy
  c2z = ONE/TWO/hz
 ! points that receive no stencil below stay zero
  f1x = ZEO
  f1y = ZEO
  f1z = ZEO
  f2x = ZEO
  f2y = ZEO
  f2z = ZEO
  do k=1,khi-1
  do j=1,jhi-1
  do i=1,ihi-1
 ! number of usable layers on the tightest side of point (i,j,k)
   margin = min(ihi-i, i-ilo, jhi-j, j-jlo, khi-k, k-klo)
   select case (margin)
   case (2:)
 ! five-point stencil: x direction, then y, then z
      f1x(i,j,k)=c4x*(g1(i-2,j,k)-EIT*g1(i-1,j,k)+EIT*g1(i+1,j,k)-g1(i+2,j,k))
      f2x(i,j,k)=c4x*(g2(i-2,j,k)-EIT*g2(i-1,j,k)+EIT*g2(i+1,j,k)-g2(i+2,j,k))
      f1y(i,j,k)=c4y*(g1(i,j-2,k)-EIT*g1(i,j-1,k)+EIT*g1(i,j+1,k)-g1(i,j+2,k))
      f2y(i,j,k)=c4y*(g2(i,j-2,k)-EIT*g2(i,j-1,k)+EIT*g2(i,j+1,k)-g2(i,j+2,k))
      f1z(i,j,k)=c4z*(g1(i,j,k-2)-EIT*g1(i,j,k-1)+EIT*g1(i,j,k+1)-g1(i,j,k+2))
      f2z(i,j,k)=c4z*(g2(i,j,k-2)-EIT*g2(i,j,k-1)+EIT*g2(i,j,k+1)-g2(i,j,k+2))
   case (1)
 ! three-point stencil: x direction, then y, then z
      f1x(i,j,k)=c2x*(-g1(i-1,j,k)+g1(i+1,j,k))
      f2x(i,j,k)=c2x*(-g2(i-1,j,k)+g2(i+1,j,k))
      f1y(i,j,k)=c2y*(-g1(i,j-1,k)+g1(i,j+1,k))
      f2y(i,j,k)=c2y*(-g2(i,j-1,k)+g2(i,j+1,k))
      f1z(i,j,k)=c2z*(-g1(i,j,k-1)+g1(i,j,k+1))
      f2z(i,j,k)=c2z*(-g2(i,j,k-1)+g2(i,j,k+1))
   end select
  enddo
  enddo
  enddo
  return
  end subroutine fderivs_batch2
 #elif (ghost_width == 4)
 ! sixth order code
@@ -2380,9 +2077,6 @@
  end subroutine fderivs
 !-----------------------------------------------------------------------------
 ! batch first derivatives (4 fields), same symmetry setup
 !-----------------------------------------------------------------------------
 !-----------------------------------------------------------------------------
 !
 ! single derivatives dx
 !
--- a/AMSS_NCKU_source/enforce_algebra.f90
+++ b/AMSS_NCKU_source/enforce_algebra.f90
@@ -17,50 +17,62 @@
  real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Axx,Axy,Axz
  real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz
-!~~~~~~~> Local variable:
+!~~~~~~~> Local variable:
-  
+
-  real*8, dimension(ex(1),ex(2),ex(3)) :: trA,detg
+  integer :: i,j,k
-  real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz 
+  real*8 :: lgxx,lgyy,lgzz,ldetg
-  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
+  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
-  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
+  real*8 :: ltrA,lscale
-
+  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
-!~~~~~~>
+
-
+!~~~~~~>
-  gxx = dxx + ONE
+
-  gyy = dyy + ONE
+  do k=1,ex(3)
-  gzz = dzz + ONE
+  do j=1,ex(2)
-
+  do i=1,ex(1)
-  detg =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
+
-          gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
+    lgxx = dxx(i,j,k) + ONE
-  gupxx =   ( gyy * gzz - gyz * gyz ) / detg
+    lgyy = dyy(i,j,k) + ONE
-  gupxy = - ( gxy * gzz - gyz * gxz ) / detg
+    lgzz = dzz(i,j,k) + ONE
-  gupxz =   ( gxy * gyz - gyy * gxz ) / detg
+
-  gupyy =   ( gxx * gzz - gxz * gxz ) / detg
+    ldetg =  lgxx * lgyy * lgzz &
-  gupyz = - ( gxx * gyz - gxy * gxz ) / detg
+           + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) &
-  gupzz =   ( gxx * gyy - gxy * gxy ) / detg
+           + gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
-
+           - gxz(i,j,k) * lgyy * gxz(i,j,k) &
-  trA =         gupxx * Axx + gupyy * Ayy + gupzz * Azz &
+           - gxy(i,j,k) * gxy(i,j,k) * lgzz &
-       + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
+           - lgxx * gyz(i,j,k) * gyz(i,j,k)
-
+
-  Axx = Axx - F1o3 * gxx * trA
+    lgupxx =   ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg
-  Axy = Axy - F1o3 * gxy * trA
+    lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg
-  Axz = Axz - F1o3 * gxz * trA
+    lgupxz =   ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg
-  Ayy = Ayy - F1o3 * gyy * trA
+    lgupyy =   ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg
-  Ayz = Ayz - F1o3 * gyz * trA
+    lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg
-  Azz = Azz - F1o3 * gzz * trA
+    lgupzz =   ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg
-
+
-  detg = ONE / ( detg ** F1o3 ) 
+    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
-  
+                 + lgupzz * Azz(i,j,k) &
-  gxx = gxx * detg
+         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
-  gxy = gxy * detg
+                 + lgupyz * Ayz(i,j,k))
-  gxz = gxz * detg
+
-  gyy = gyy * detg
+    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
-  gyz = gyz * detg
+    Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA
-  gzz = gzz * detg
+    Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA
-
+    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
-  dxx = gxx - ONE
+    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA
-  dyy = gyy - ONE
+    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
-  dzz = gzz - ONE
+
    lscale = ONE / ( ldetg ** F1o3 )
    dxx(i,j,k) = lgxx * lscale - ONE
    gxy(i,j,k) = gxy(i,j,k) * lscale
    gxz(i,j,k) = gxz(i,j,k) * lscale
    dyy(i,j,k) = lgyy * lscale - ONE
    gyz(i,j,k) = gyz(i,j,k) * lscale
    dzz(i,j,k) = lgzz * lscale - ONE
  enddo
  enddo
  enddo
  return
@@ -81,52 +93,72 @@
  real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Axx,Axy,Axz
  real*8, dimension(ex(1),ex(2),ex(3)), intent(inout) :: Ayy,Ayz,Azz
-!~~~~~~~> Local variable:
+!~~~~~~~> Local variable:
-  
+
-  real*8, dimension(ex(1),ex(2),ex(3)) :: trA
+  integer :: i,j,k
-  real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz 
+  real*8 :: lgxx,lgyy,lgzz,lscale
-  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
+  real*8 :: lgxy,lgxz,lgyz
-  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
+  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
-
+  real*8 :: ltrA
-!~~~~~~>
+  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
-
+
-  gxx = dxx + ONE
+!~~~~~~>
-  gyy = dyy + ONE
+
-  gzz = dzz + ONE
+  do k=1,ex(3)
-! for g
+  do j=1,ex(2)
-  gupzz =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
+  do i=1,ex(1)
-           gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
+
-
+! for g: normalize determinant first
-  gupzz = ONE / ( gupzz ** F1o3 ) 
+    lgxx = dxx(i,j,k) + ONE
-  
+    lgyy = dyy(i,j,k) + ONE
-  gxx = gxx * gupzz
+    lgzz = dzz(i,j,k) + ONE
-  gxy = gxy * gupzz
+    lgxy = gxy(i,j,k)
-  gxz = gxz * gupzz
+    lgxz = gxz(i,j,k)
-  gyy = gyy * gupzz
+    lgyz = gyz(i,j,k)
-  gyz = gyz * gupzz
+
-  gzz = gzz * gupzz
+    lscale =  lgxx * lgyy * lgzz + lgxy * lgyz * lgxz &
-
+            + lgxz * lgxy * lgyz - lgxz * lgyy * lgxz &
-  dxx = gxx - ONE
+            - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz
-  dyy = gyy - ONE
+
-  dzz = gzz - ONE
+    lscale = ONE / ( lscale ** F1o3 )
-! for A  
+
-
+    lgxx = lgxx * lscale
-  gupxx =   ( gyy * gzz - gyz * gyz )
+    lgxy = lgxy * lscale
-  gupxy = - ( gxy * gzz - gyz * gxz )
+    lgxz = lgxz * lscale
-  gupxz =   ( gxy * gyz - gyy * gxz )
+    lgyy = lgyy * lscale
-  gupyy =   ( gxx * gzz - gxz * gxz )
+    lgyz = lgyz * lscale
-  gupyz = - ( gxx * gyz - gxy * gxz )
+    lgzz = lgzz * lscale
-  gupzz =   ( gxx * gyy - gxy * gxy )
+
-
+    dxx(i,j,k) = lgxx - ONE
-  trA =         gupxx * Axx + gupyy * Ayy + gupzz * Azz &
+    gxy(i,j,k) = lgxy
-       + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
+    gxz(i,j,k) = lgxz
-
+    dyy(i,j,k) = lgyy - ONE
-  Axx = Axx - F1o3 * gxx * trA
+    gyz(i,j,k) = lgyz
-  Axy = Axy - F1o3 * gxy * trA
+    dzz(i,j,k) = lgzz - ONE
-  Axz = Axz - F1o3 * gxz * trA
+
-  Ayy = Ayy - F1o3 * gyy * trA
+! for A: trace-free using normalized metric (det=1, no division needed)
-  Ayz = Ayz - F1o3 * gyz * trA
+    lgupxx =   ( lgyy * lgzz - lgyz * lgyz )
-  Azz = Azz - F1o3 * gzz * trA
+    lgupxy = - ( lgxy * lgzz - lgyz * lgxz )
    lgupxz =   ( lgxy * lgyz - lgyy * lgxz )
    lgupyy =   ( lgxx * lgzz - lgxz * lgxz )
    lgupyz = - ( lgxx * lgyz - lgxy * lgxz )
    lgupzz =   ( lgxx * lgyy - lgxy * lgxy )
    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
                 + lgupzz * Azz(i,j,k) &
         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
                 + lgupyz * Ayz(i,j,k))
    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
    Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
    Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
  enddo
  enddo
  enddo
  return
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -324,8 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
-  funcc = 0.d0
+  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
   enddo
@@ -350,8 +349,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
  integer::i
-  funcc = 0.d0
+  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
      funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1)
@@ -379,8 +377,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
  integer::i
-  funcc = 0.d0
+  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
      funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1)
@@ -886,17 +883,20 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
-  funcc = 0.d0
+!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
-  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
+  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
-   do i=0,ord-1
+!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
-      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
+   do i=0,ord-1
-   enddo
+      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
-   do i=0,ord-1
+   enddo
-      funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
+!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
-   enddo
+   do i=0,ord-1
-   do i=0,ord-1
+      funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
-      funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
+   enddo
-   enddo
+!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
   do i=0,ord-1
      funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
   enddo
 end subroutine symmetry_bd
@@ -912,8 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
  integer::i
-  funcc = 0.d0
+  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
      funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-i,1:extc(2),1:extc(3))*SoA(1)
@@ -941,8 +940,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
  integer::i
-  funcc = 0.d0
+  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
      funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-i,1:extc(2),1:extc(3))*SoA(1)
@@ -1113,153 +1111,355 @@ end subroutine d2dump
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-! common code for cell and vertex
+! common code for cell and vertex
-!------------------------------------------------------------------------------
+!------------------------------------------------------------------------------
-! Lagrangian polynomial interpolation
+! Lagrangian polynomial interpolation
-!------------------------------------------------------------------------------
+!------------------------------------------------------------------------------
-
+#ifndef POLINT6_USE_BARYCENTRIC
-  subroutine polint(xa,ya,x,y,dy,ordn)
+#define POLINT6_USE_BARYCENTRIC 1
-
+#endif
-  implicit none
+
-
+!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
-!~~~~~~> Input Parameter:
+  subroutine polint6_neville(xa, ya, x, y, dy)
-  integer,intent(in) :: ordn
+  implicit none
-  real*8, dimension(ordn), intent(in) :: xa,ya
+
-  real*8, intent(in) :: x
+  real*8, dimension(6), intent(in) :: xa, ya
-  real*8, intent(out) :: y,dy
+  real*8, intent(in) :: x
-
+  real*8, intent(out) :: y, dy
-!~~~~~~> Other parameter:
+
-
+  integer :: i, m, ns, n_m
-  integer :: m,n,ns
+  real*8, dimension(6) :: c, d, ho
-  real*8, dimension(ordn) :: c,d,den,ho
+  real*8 :: dif, dift, hp, h, den_val
-  real*8 :: dif,dift
+
-
+  c = ya
-!~~~~~~>
+  d = ya
-
+  ho = xa - x
-  n=ordn
+
-  m=ordn
+  ns = 1
-
+  dif = abs(x - xa(1))
-  c=ya
+
-  d=ya
+  do i = 2, 6
-  ho=xa-x
+    dift = abs(x - xa(i))
-
+    if (dift < dif) then
-  ns=1
+      ns = i
-  dif=abs(x-xa(1))
+      dif = dift
-  do m=1,n
+    end if
-   dift=abs(x-xa(m))
+  end do
-   if(dift < dif) then
+
-    ns=m
+  y = ya(ns)
-    dif=dift
+  ns = ns - 1
-   end if
+
-  end do
+  do m = 1, 5
-
+    n_m = 6 - m
-  y=ya(ns)
+    do i = 1, n_m
-  ns=ns-1
+      hp = ho(i)
-  do m=1,n-1
+      h  = ho(i+m)
-    den(1:n-m)=ho(1:n-m)-ho(1+m:n)
+      den_val = hp - h
-    if (any(den(1:n-m) == 0.0))then
+
-      write(*,*) 'failure in polint for point',x
+      if (den_val == 0.0d0) then
-      write(*,*) 'with input points: ',xa
+        write(*,*) 'failure in polint for point',x
-      stop
+        write(*,*) 'with input points: ',xa
-    endif
+        stop
-    den(1:n-m)=(c(2:n-m+1)-d(1:n-m))/den(1:n-m)
+      end if
-    d(1:n-m)=ho(1+m:n)*den(1:n-m)
+
-    c(1:n-m)=ho(1:n-m)*den(1:n-m)
+      den_val = (c(i+1) - d(i)) / den_val
-    if (2*ns < n-m) then
+
-      dy=c(ns+1)
+      d(i) = h * den_val
-    else
+      c(i) = hp * den_val
-      dy=d(ns)
+    end do
-      ns=ns-1
+
-    end if
+    if (2 * ns < n_m) then
-    y=y+dy
+      dy = c(ns + 1)
-  end do
+    else
-
+      dy = d(ns)
-  return
+      ns = ns - 1
-
+    end if
-  end subroutine polint
+    y = y + dy
-!------------------------------------------------------------------------------
+  end do
-!
+
-! interpolation in 2 dimensions, follow yx order
+  return
-!
+  end subroutine polint6_neville
-!------------------------------------------------------------------------------
+
-  subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
+!DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric
-
+  subroutine polint6_barycentric(xa, ya, x, y, dy)
-  implicit none
+  implicit none
-
+
-!~~~~~~> Input parameters:
+  real*8, dimension(6), intent(in) :: xa, ya
-  integer,intent(in) :: ordn
+  real*8, intent(in) :: x
-  real*8, dimension(1:ordn), intent(in) :: x1a,x2a
+  real*8, intent(out) :: y, dy
-  real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
+
-  real*8, intent(in) :: x1,x2
+  integer :: i, j
-  real*8, intent(out) :: y,dy
+  logical :: is_uniform
-
+  real*8, dimension(6) :: lambda
-!~~~~~~> Other parameters:
+  real*8 :: dx, den_i, term, num, den, step, tol
-
+  real*8, parameter :: c_uniform(6) = (/ -1.d0, 5.d0, -10.d0, 10.d0, -5.d0, 1.d0 /)
-  integer  :: i,m
+
-  real*8, dimension(ordn) :: ymtmp
+  do i = 1, 6
-  real*8, dimension(ordn) :: yntmp
+    if (x == xa(i)) then
-
+      y = ya(i)
-  m=size(x1a)
+      dy = 0.d0
-  
+      return
-  do i=1,m
+    end if
-
+  end do
-    yntmp=ya(i,:)
+
-    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
+  step = xa(2) - xa(1)
-
+  is_uniform = (step /= 0.d0)
-  end do
+  if (is_uniform) then
-
+    tol = 64.d0 * epsilon(1.d0) * max(1.d0, abs(step))
-  call polint(x1a,ymtmp,x1,y,dy,ordn)
+    do i = 3, 6
-
+      if (abs((xa(i) - xa(i-1)) - step) > tol) then
-  return
+        is_uniform = .false.
-
+        exit
-  end subroutine polin2
+      end if
-!------------------------------------------------------------------------------
+    end do
-!
+  end if
-! interpolation in 3 dimensions, follow zyx order
+
-!
+  if (is_uniform) then
-!------------------------------------------------------------------------------
+    num = 0.d0
-  subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
+    den = 0.d0
-
+    do i = 1, 6
-  implicit none
+      term = c_uniform(i) / (x - xa(i))
-
+      num = num + term * ya(i)
-!~~~~~~> Input parameters:
+      den = den + term
-  integer,intent(in) :: ordn
+    end do
-  real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
+    y = num / den
-  real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
+    dy = 0.d0
-  real*8, intent(in) :: x1,x2,x3
+    return
-  real*8, intent(out) :: y,dy
+  end if
-
+
-!~~~~~~> Other parameters:
+  do i = 1, 6
-
+    den_i = 1.d0
-  integer  :: i,j,m,n
+    do j = 1, 6
-  real*8, dimension(ordn,ordn) :: yatmp
+      if (j /= i) then
-  real*8, dimension(ordn) :: ymtmp
+        dx = xa(i) - xa(j)
-  real*8, dimension(ordn) :: yntmp
+        if (dx == 0.0d0) then
-  real*8, dimension(ordn) :: yqtmp
+          write(*,*) 'failure in polint for point',x
-
+          write(*,*) 'with input points: ',xa
-  m=size(x1a)
+          stop
-  n=size(x2a)
+        end if
-  
+        den_i = den_i * dx
-  do i=1,m
+      end if
-   do j=1,n
+    end do
-
+    lambda(i) = 1.d0 / den_i
-    yqtmp=ya(i,j,:)
+  end do
-    call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
+
-
+  num = 0.d0
-   end do
+  den = 0.d0
-
+  do i = 1, 6
-    yntmp=yatmp(i,:)
+    term = lambda(i) / (x - xa(i))
-    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
+    num = num + term * ya(i)
-
+    den = den + term
-  end do
+  end do
-
+
-  call polint(x1a,ymtmp,x1,y,dy,ordn)
+  y = num / den
-
+  dy = 0.d0
-  return
+
-
+  return
-  end subroutine polin3
+  end subroutine polint6_barycentric
 !------------------------------------------------------------------------------
 ! polint: 1D polynomial interpolation through the ordn support points
 ! (xa(i), ya(i)), evaluated at x.
 !
 !   xa, ya : abscissas and ordinates of the support points      (input)
 !   x      : evaluation point                                   (input)
 !   y      : interpolated value at x                            (output)
 !   dy     : on the general path, the last correction that was added to y;
 !            for ordn == 6 it is whatever the selected 6-point kernel
 !            returns                                            (output)
 !   ordn   : number of support points                           (input)
 !
 ! ordn == 6 is dispatched to a dedicated kernel that is selected at
 ! preprocessing time through POLINT6_USE_BARYCENTRIC. Every other order
 ! runs the Neville-style c/d difference tableau below.
 ! The general path stops the program when two abscissas coincide.
 !------------------------------------------------------------------------------
 !DIR$ ATTRIBUTES FORCEINLINE :: polint
  subroutine polint(xa, ya, x, y, dy, ordn)
  implicit none
  integer, intent(in) :: ordn
  real*8, dimension(ordn), intent(in) :: xa, ya
  real*8, intent(in) :: x
  real*8, intent(out) :: y, dy
  integer :: i, m, ns, n_m
  real*8, dimension(ordn) :: c, d, ho
  real*8 :: dif, dift, hp, h, den_val
  ! Fast path: fixed-size 6-point kernels, chosen by the preprocessor.
  if (ordn == 6) then
 #if POLINT6_USE_BARYCENTRIC
    call polint6_barycentric(xa, ya, x, y, dy)
 #else
    call polint6_neville(xa, ya, x, y, dy)
 #endif
    return
  end if
  ! Initialise both tableau columns with the ordinates and store the
  ! offsets ho(i) = xa(i) - x.
  c = ya
  d = ya
  ho = xa - x
  ! Locate ns, the index of the support point closest to x.
  ns = 1
  dif = abs(x - xa(1))
  do i = 2, ordn
    dift = abs(x - xa(i))
    if (dift < dif) then
      ns = i
      dif = dift
    end if
  end do
  ! Start from the nearest ordinate; ns is then shifted down by one so that
  ! c(ns+1) / d(ns) address the neighbouring tableau entries.
  y = ya(ns)
  ns = ns - 1
  ! Column m of the tableau: update c and d in place for the n_m remaining
  ! entries, then add exactly one correction to y.
  do m = 1, ordn - 1
    n_m = ordn - m
    do i = 1, n_m
      hp = ho(i)
      h  = ho(i+m)
      ! hp - h equals xa(i) - xa(i+m); zero means duplicated abscissas.
      den_val = hp - h
      if (den_val == 0.0d0) then
        write(*,*) 'failure in polint for point',x
        write(*,*) 'with input points: ',xa
        stop
      end if
      den_val = (c(i+1) - d(i)) / den_val
      d(i) = h * den_val
      c(i) = hp * den_val
    end do
    ! Take the c correction while ns sits in the lower half of the remaining
    ! column, otherwise take the d correction and move ns one entry down.
    if (2 * ns < n_m) then
      dy = c(ns + 1)
    else
      dy = d(ns)
      ns = ns - 1
    end if
    y = y + dy
  end do
  return
  end subroutine polint
 !------------------------------------------------------------------------------
 ! Compute Lagrange interpolation basis weights for one target point.
 !
 !   xa   : the ordn interpolation nodes                         (input)
 !   x    : target point                                         (input)
 !   w    : w(i) = product over j /= i of
 !          (x - xa(j)) / (xa(i) - xa(j))                        (output)
 !   ordn : number of nodes                                      (input)
 !
 ! With these weights the interpolated value of data f(1:ordn) at x is
 ! sum_i w(i)*f(i), so one set of weights can be reused for many data lines
 ! that share the same nodes and target point.
 ! The routine stops the program when two nodes coincide.
 !------------------------------------------------------------------------------
 !DIR$ ATTRIBUTES FORCEINLINE :: polint_lagrange_weights
  subroutine polint_lagrange_weights(xa, x, w, ordn)
  implicit none
  integer, intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: xa
  real*8, intent(in) :: x
  real*8, dimension(1:ordn), intent(out) :: w
  integer :: i, j
  real*8 :: num, den, dx
  do i = 1, ordn
    ! Numerator and denominator are accumulated separately so that only one
    ! division is needed per weight.
    num = 1.d0
    den = 1.d0
    do j = 1, ordn
      if (j /= i) then
        dx = xa(i) - xa(j)
        ! Identical nodes would make the denominator vanish.
        if (dx == 0.0d0) then
          write(*,*) 'failure in polint for point',x
          write(*,*) 'with input points: ',xa
          stop
        end if
        num = num * (x - xa(j))
        den = den * dx
      end if
    end do
    w(i) = num / den
  end do
  return
  end subroutine polint_lagrange_weights
 !------------------------------------------------------------------------------
 !
 ! Interpolation in 2 dimensions on an ordn x ordn stencil, built from
 ! repeated 1D calls to polint.
 !
 !   x1a, x2a : stencil coordinates along the first / second index of ya
 !   ya       : stencil data, ya(i,j) belongs to (x1a(i), x2a(j))
 !   x1, x2   : target point
 !   y        : interpolated value
 !   dy       : dy returned by the last polint call (final 1D pass only)
 !   ordn     : number of stencil points per direction
 !
 ! Sweep order:
 !   POLINT_LEGACY_ORDER defined : interpolate along x2 for every row i,
 !                                 then along x1 (the original yx order).
 !   otherwise                   : interpolate along x1 for every column j
 !                                 (slice ya(:,j), first index varying),
 !                                 then along x2.
 !
 !------------------------------------------------------------------------------
  subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
  implicit none
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a
  real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2
  real*8, intent(out) :: y,dy
 #ifdef POLINT_LEGACY_ORDER
  integer  :: i,m
  real*8, dimension(ordn) :: ymtmp
  real*8, dimension(ordn) :: yntmp
  m=size(x1a)
  ! Reduce each row ya(i,:) to its value at x2; dy is overwritten each time.
  do i=1,m
    yntmp=ya(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
  end do
  ! Interpolate the reduced values along x1.
  call polint(x1a,ymtmp,x1,y,dy,ordn)
 #else
  integer  :: j
  real*8, dimension(ordn) :: ymtmp
  real*8 :: dy_temp
  ! Reduce each column ya(:,j) to its value at x1; the per-column error
  ! output goes to dy_temp and is discarded.
  do j=1,ordn
    call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
  end do
  ! Interpolate the reduced values along x2; this call supplies dy.
  call polint(x2a, ymtmp, x2, y, dy, ordn)
 #endif
  return
  end subroutine polin2
 !------------------------------------------------------------------------------
 !
 ! Interpolation in 3 dimensions on an ordn x ordn x ordn stencil.
 !
 !   x1a, x2a, x3a : stencil coordinates along the 1st / 2nd / 3rd index of ya
 !   ya            : stencil data, ya(i,j,k) belongs to
 !                   (x1a(i), x2a(j), x3a(k))
 !   x1, x2, x3    : target point
 !   y             : interpolated value
 !   dy            : dy returned by the last polint call (final 1D pass only)
 !   ordn          : number of stencil points per direction
 !
 ! Sweep order:
 !   POLINT_LEGACY_ORDER defined : polint along x3, then x2, then x1
 !                                 (the original zyx order).
 !   otherwise                   : the x1 and x2 directions are contracted
 !                                 with Lagrange weights that are computed
 !                                 once and reused for every k plane; a
 !                                 single polint call along x3 finishes.
 !
 !------------------------------------------------------------------------------
  subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
  implicit none
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
  real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2,x3
  real*8, intent(out) :: y,dy
 #ifdef POLINT_LEGACY_ORDER
  integer  :: i,j,m,n
  real*8, dimension(ordn,ordn) :: yatmp
  real*8, dimension(ordn) :: ymtmp
  real*8, dimension(ordn) :: yntmp
  real*8, dimension(ordn) :: yqtmp
  m=size(x1a)
  n=size(x2a)
  do i=1,m
   ! Reduce every line ya(i,j,:) to its value at x3.
   do j=1,n
    yqtmp=ya(i,j,:)
    call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
   end do
    ! Reduce row i of the x3-interpolated plane to its value at x2.
    yntmp=yatmp(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
  end do
  ! Final pass along x1.
  call polint(x1a,ymtmp,x1,y,dy,ordn)
 #else
  integer  :: i, j, k
  real*8, dimension(ordn) :: w1, w2
  real*8, dimension(ordn) :: ymtmp
  real*8 :: yx_sum, x_sum
  ! The weights depend only on the nodes and the target point, so they are
  ! computed once per direction.
  call polint_lagrange_weights(x1a, x1, w1, ordn)
  call polint_lagrange_weights(x2a, x2, w2, ordn)
  ! For each k plane: ymtmp(k) = sum_j w2(j) * sum_i w1(i) * ya(i,j,k).
  ! The innermost loop runs over the first (fastest varying) index of ya.
  do k = 1, ordn
    yx_sum = 0.d0
    do j = 1, ordn
      x_sum = 0.d0
      do i = 1, ordn
        x_sum = x_sum + w1(i) * ya(i,j,k)
      end do
      yx_sum = yx_sum + w2(j) * x_sum
    end do
    ymtmp(k) = yx_sum
  end do
  ! Final pass along x3; this call supplies dy.
  call polint(x3a, ymtmp, x3, y, dy, ordn)
 #endif
  return
  end subroutine polin3
 !--------------------------------------------------------------------------------------
-! calculate L2norm
+! calculate L2norm  
  subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
                          f,f_out,gw)
@@ -1276,9 +1476,9 @@ end subroutine d2dump
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k,n_elements
+  integer::i,j,k,n_elements
-  real*8, dimension(:), allocatable :: f_flat
+  real*8, dimension(:), allocatable :: f_flat
-  real*8, external :: DDOT
+  real*8, external :: DDOT
  dX = X(2) - X(1)
  dY = Y(2) - Y(1)
@@ -1302,20 +1502,91 @@ if(dabs(X(1)-xmin) < dX) imin = 1
 if(dabs(Y(1)-ymin) < dY) jmin = 1
 if(dabs(Z(1)-zmin) < dZ) kmin = 1
-! Optimized with oneMKL BLAS DDOT for dot product
+  n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
-n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
+  allocate(f_flat(n_elements))
-allocate(f_flat(n_elements))
+  f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
-f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
+  f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
-f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
+  deallocate(f_flat)
 deallocate(f_flat)
 f_out = f_out*dX*dY*dZ
  return
  end subroutine l2normhelper
-!--------------------------------------------------------------------------------------
+!--------------------------------------------------------------------------------------
-! calculate L2norm especially for shell Blocks
+  subroutine l2normhelper7(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
                           f1,f2,f3,f4,f5,f6,f7,f_out,gw)
 ! Sum-of-squares helper for seven grid functions in a single sweep.
 !
 ! For n = 1..7 this returns
 !   f_out(n) = dX*dY*dZ * sum of fn(i,j,k)**2 over the index box
 !              imin:imax, jmin:jmax, kmin:kmax.
 ! No square root is taken here.
 !
 !   ex                  : array extents in the three directions
 !   X, Y, Z             : 1D coordinate arrays; the spacing is taken from
 !                         the first two entries of each
 !   xmin..zmax          : bounds compared against the first / last
 !                         coordinate of this array
 !   f1..f7              : the seven fields to be summed
 !   f_out               : the seven weighted sums
 !   gw                  : number of layers skipped at each array edge
 !
 ! By default gw layers are excluded on every side. A side is extended to the
 ! array edge when the corresponding edge coordinate lies within one grid
 ! spacing of the matching bound (xmin, xmax, ...).
  implicit none
 !~~~~~~> Input parameters:
  integer,intent(in ):: ex(1:3)
  real*8, intent(in ):: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3)),xmin,ymin,zmin,xmax,ymax,zmax
  integer,intent(in)::gw
  real*8, dimension(ex(1),ex(2),ex(3)),intent(in) :: f1,f2,f3,f4,f5,f6,f7
  real*8, intent(out) :: f_out(7)
 !~~~~~~> Other variables:
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
  integer::i,j,k
  real*8 :: s1,s2,s3,s4,s5,s6,s7
 ! Grid spacing from the first two coordinates in each direction.
  dX = X(2) - X(1)
  dY = Y(2) - Y(1)
  dZ = Z(2) - Z(1)
 ! Default index box: drop gw layers on every side.
   imin = gw+1
   jmin = gw+1
   kmin = gw+1
   imax = ex(1) - gw
   jmax = ex(2) - gw
   kmax = ex(3) - gw
 ! Extend a side to the array edge when that edge is within one spacing of
 ! the given bound.
 if(dabs(X(ex(1))-xmax) < dX) imax = ex(1)
 if(dabs(Y(ex(2))-ymax) < dY) jmax = ex(2)
 if(dabs(Z(ex(3))-zmax) < dZ) kmax = ex(3)
 if(dabs(X(1)-xmin) < dX) imin = 1
 if(dabs(Y(1)-ymin) < dY) jmin = 1
 if(dabs(Z(1)-zmin) < dZ) kmin = 1
 ! Seven scalar accumulators so that all fields are summed in one pass.
  s1 = 0.d0
  s2 = 0.d0
  s3 = 0.d0
  s4 = 0.d0
  s5 = 0.d0
  s6 = 0.d0
  s7 = 0.d0
 ! The innermost loop runs over the first array index and carries the
 ! compiler reduction directive.
  do k=kmin,kmax
    do j=jmin,jmax
 !DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7)
      do i=imin,imax
        s1 = s1 + f1(i,j,k)*f1(i,j,k)
        s2 = s2 + f2(i,j,k)*f2(i,j,k)
        s3 = s3 + f3(i,j,k)*f3(i,j,k)
        s4 = s4 + f4(i,j,k)*f4(i,j,k)
        s5 = s5 + f5(i,j,k)*f5(i,j,k)
        s6 = s6 + f6(i,j,k)*f6(i,j,k)
        s7 = s7 + f7(i,j,k)*f7(i,j,k)
      enddo
    enddo
  enddo
 ! Weight each sum with the cell volume.
  f_out(1) = s1*dX*dY*dZ
  f_out(2) = s2*dX*dY*dZ
  f_out(3) = s3*dX*dY*dZ
  f_out(4) = s4*dX*dY*dZ
  f_out(5) = s5*dX*dY*dZ
  f_out(6) = s6*dX*dY*dZ
  f_out(7) = s7*dX*dY*dZ
  return
  end subroutine l2normhelper7
 !--------------------------------------------------------------------------------------
 ! calculate L2norm especially for shell Blocks
  subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
                          f,f_out,gw,ogw,Symmetry)
@@ -1332,9 +1603,9 @@ f_out = f_out*dX*dY*dZ
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k,n_elements
+  integer::i,j,k,n_elements
-  real*8, dimension(:), allocatable :: f_flat
+  real*8, dimension(:), allocatable :: f_flat
-  real*8, external :: DDOT
+  real*8, external :: DDOT
  real*8 :: PIo4
@@ -1397,12 +1668,11 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
-! Optimized with oneMKL BLAS DDOT for dot product
+  n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
-n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
+  allocate(f_flat(n_elements))
-allocate(f_flat(n_elements))
+  f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
-f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
+  f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
-f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
+  deallocate(f_flat)
 deallocate(f_flat)
 f_out = f_out*dX*dY*dZ
@@ -1429,9 +1699,9 @@ f_out = f_out*dX*dY*dZ
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k
+  integer::i,j,k
-  real*8, dimension(:), allocatable :: f_flat
+  real*8, dimension(:), allocatable :: f_flat
-  real*8, external :: DDOT
+  real*8, external :: DDOT
  real*8 :: PIo4
@@ -1494,12 +1764,11 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
-! Optimized with oneMKL BLAS DDOT for dot product
+Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
-Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
+  allocate(f_flat(Nout))
-allocate(f_flat(Nout))
+  f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
-f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
+  f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
-f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
+  deallocate(f_flat)
 deallocate(f_flat)
  return
@@ -1600,9 +1869,12 @@ deallocate(f_flat)
 !       ^
 ! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3
-  real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
+  real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
-
+  integer :: i,j,k
-  fout = C1*f1+C2*f2+C3*f3
+
  do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1))
    fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
  end do
  return
@@ -1696,8 +1968,8 @@ deallocate(f_flat)
  real*8, dimension(ORDN,ORDN,ORDN) :: ya
  real*8, dimension(ORDN,ORDN) :: tmp2
  real*8, dimension(ORDN) :: tmp1
-  real*8, dimension(3) :: SoAh
+  real*8, dimension(3) :: SoAh
-  real*8, external :: DDOT
+  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  cxB = inds+1  
@@ -1733,21 +2005,17 @@ deallocate(f_flat)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
  endif 
  ! Optimized with BLAS operations for better performance
  ! First dimension: z-direction weighted sum
  tmp2=0
  do m=1,ORDN
    tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
  enddo
  ! Second dimension: y-direction weighted sum
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
  enddo
-  ! Third dimension: x-direction weighted sum using BLAS DDOT
+  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
  return
@@ -1776,8 +2044,8 @@ deallocate(f_flat)
  integer,dimension(2) :: cxB,cxT
  real*8, dimension(ORDN,ORDN) :: ya
  real*8, dimension(ORDN) :: tmp1
-  real*8, dimension(2) :: SoAh
+  real*8, dimension(2) :: SoAh
-  real*8, external :: DDOT
+  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  cxB = inds(1:2)+1  
@@ -1807,14 +2075,12 @@ deallocate(f_flat)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
  endif 
  ! Optimized with BLAS operations
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
  enddo
-  ! Use BLAS DDOT for final weighted sum
+  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
  return
@@ -1840,12 +2106,12 @@ deallocate(f_flat)
 !~~~~~~> Other parameters:
  real*8, dimension(-ORDN+1:ex(1)+ORDN,-ORDN+1:ex(2)+ORDN,ex(3)) :: fh
-  integer :: m
+  integer :: m
-  integer :: cxB,cxT
+  integer :: cxB,cxT
-  real*8, dimension(ORDN) :: ya
+  real*8, dimension(ORDN) :: ya
-  real*8 :: SoAh
+  real*8 :: SoAh
-  integer,dimension(3) :: inds
+  integer,dimension(3) :: inds
-  real*8, external :: DDOT
+  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  inds = indsi + 1
@@ -1906,8 +2172,7 @@ deallocate(f_flat)
          write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
  endif
-  ! Optimized with BLAS DDOT for weighted sum
+  f_int = DDOT(ORDN, coef, 1, ya, 1)
  f_int = DDOT(ORDN, coef, 1, ya, 1)
  return
@@ -2139,38 +2404,32 @@ deallocate(f_flat)
  end function fWigner_d_function
 !----------------------------------
 ! Optimized factorial function using lookup table for small N
 ! and log-gamma for large N to avoid overflow
  function ffact(N) result(gont)
  implicit none
  integer,intent(in) :: N
-  real*8 :: gont
+  real*8 :: gont
-  integer :: i
+
-
+  integer :: i
-  ! Lookup table for factorials 0! to 20! (precomputed)
+  real*8, parameter, dimension(0:20) :: fact_table = [ &
-  real*8, parameter, dimension(0:20) :: fact_table = [ &
+    1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
-    1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
+    362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
-    362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
+    87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
-    87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
+    355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
-    355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
+    2432902008176640000.d0 ]
    2432902008176640000.d0 ]
 ! sanity check
-  if(N < 0)then
+  if(N < 0)then
-     write(*,*) "ffact: error input for factorial"
+     write(*,*) "ffact: error input for factorial"
-     gont = 1.d0
+     gont = 1.d0
-     return
+     return
-  endif
+  endif
-
+
-  ! Use lookup table for small N (fast path)
+  if(N <= 20)then
-  if(N <= 20)then
+     gont = fact_table(N)
-     gont = fact_table(N)
+  else
-  else
+     gont = exp(log_gamma(dble(N+1)))
-     ! Use log-gamma function for large N: N! = exp(log_gamma(N+1))
+  endif
     ! This avoids overflow and is computed efficiently
     gont = exp(log_gamma(dble(N+1)))
  endif
  return
--- a/AMSS_NCKU_source/fmisc.h
+++ b/AMSS_NCKU_source/fmisc.h
@@ -12,9 +12,10 @@
 #define f_global_interpind global_interpind
 #define f_global_interpind2d global_interpind2d
 #define f_global_interpind1d global_interpind1d
-#define f_l2normhelper l2normhelper
+#define f_l2normhelper l2normhelper
-#define f_l2normhelper_sh l2normhelper_sh
+#define f_l2normhelper7 l2normhelper7
-#define f_l2normhelper_sh_rms l2normhelper_sh_rms
+#define f_l2normhelper_sh l2normhelper_sh
 #define f_l2normhelper_sh_rms l2normhelper_sh_rms
 #define f_average average
 #define f_average3 average3
 #define f_average2 average2
@@ -41,9 +42,10 @@
 #define f_global_interpind GLOBAL_INTERPIND
 #define f_global_interpind2d GLOBAL_INTERPIND2D
 #define f_global_interpind1d GLOBAL_INTERPIND1D
-#define f_l2normhelper L2NORMHELPER
+#define f_l2normhelper L2NORMHELPER
-#define f_l2normhelper_sh L2NORMHELPER_SH
+#define f_l2normhelper7 L2NORMHELPER7
-#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
+#define f_l2normhelper_sh L2NORMHELPER_SH
 #define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
 #define f_average AVERAGE
 #define f_average3 AVERAGE3
 #define f_average2 AVERAGE2
@@ -70,9 +72,10 @@
 #define f_global_interpind global_interpind_
 #define f_global_interpind2d global_interpind2d_
 #define f_global_interpind1d global_interpind1d_
-#define f_l2normhelper l2normhelper_
+#define f_l2normhelper l2normhelper_
-#define f_l2normhelper_sh l2normhelper_sh_
+#define f_l2normhelper7 l2normhelper7_
-#define f_l2normhelper_sh_rms l2normhelper_sh_rms_
+#define f_l2normhelper_sh l2normhelper_sh_
 #define f_l2normhelper_sh_rms l2normhelper_sh_rms_
 #define f_average average_
 #define f_average3 average3_
 #define f_average2 average2_
@@ -156,21 +159,30 @@ extern "C"
 							  int *, double *, int &, int &);
 }
-extern "C"
+extern "C"
-{
+{
-	void f_l2normhelper(int *, double *, double *, double *,
+	void f_l2normhelper(int *, double *, double *, double *,
-						double &, double &, double &,
+						double &, double &, double &,
-						double &, double &, double &,
+						double &, double &, double &,
-						double *, double &, int &);
+						double *, double &, int &);
-}
+}
-
+
-extern "C"
+extern "C"
-{
+{
-	void f_l2normhelper_sh(int *, double *, double *, double *,
+	void f_l2normhelper7(int *, double *, double *, double *,
-						   double &, double &, double &,
+						 double &, double &, double &,
-						   double &, double &, double &,
+						 double &, double &, double &,
-						   double *, double &, int &, int &, int &);
+						 double *, double *, double *, double *,
-}
+						 double *, double *, double *, double *, int &);
 }
 extern "C"
 {
 	void f_l2normhelper_sh(int *, double *, double *, double *,
 						   double &, double &, double &,
 						   double &, double &, double &,
 						   double *, double &, int &, int &, int &);
 }
 extern "C"
 {
--- a/AMSS_NCKU_source/gaussj.C
+++ b/AMSS_NCKU_source/gaussj.C
@@ -16,66 +16,115 @@ using namespace std;
 #include <string.h>
 #include <math.h>
 #endif
-
+/* Linear equation solution by Gauss-Jordan elimination.
 // Intel oneMKL LAPACK interface
 #include <mkl_lapacke.h>
 /* Linear equation solution using Intel oneMKL LAPACK.
 a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
 containing the right-hand side vectors. On output a is
 replaced by its matrix inverse, and b is replaced by the
-corresponding set of solution vectors.
+corresponding set of solution vectors */
 Mathematical equivalence:
  Solves: A * x = b  =>  x = A^(-1) * b
  Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
  within numerical precision. */
 int gaussj(double *a, double *b, int n)
 {
-  // Allocate pivot array and workspace
+  double swap;
  lapack_int *ipiv = new lapack_int[n];
  lapack_int info;
-  // Make a copy of matrix a for solving (dgesv modifies it to LU form)
+  int *indxc, *indxr, *ipiv;
-  double *a_copy = new double[n * n];
+  indxc = new int[n];
-  for (int i = 0; i < n * n; i++) {
+  indxr = new int[n];
-    a_copy[i] = a[i];
+  ipiv = new int[n];
  int i, icol, irow, j, k, l, ll;
  double big, dum, pivinv, temp;
  for (j = 0; j < n; j++)
    ipiv[j] = 0;
  for (i = 0; i < n; i++)
  {
    big = 0.0;
    for (j = 0; j < n; j++)
      if (ipiv[j] != 1)
        for (k = 0; k < n; k++)
        {
          if (ipiv[k] == 0)
          {
            if (fabs(a[j * n + k]) >= big)
            {
              big = fabs(a[j * n + k]);
              irow = j;
              icol = k;
            }
          }
          else if (ipiv[k] > 1)
          {
            cout << "gaussj: Singular Matrix-1" << endl;
            for (int ii = 0; ii < n; ii++)
            {
              for (int jj = 0; jj < n; jj++)
                cout << a[ii * n + jj] << " ";
              cout << endl;
            }
            return 1; // error return
          }
        }
    ipiv[icol] = ipiv[icol] + 1;
    if (irow != icol)
    {
      for (l = 0; l < n; l++)
      {
        swap = a[irow * n + l];
        a[irow * n + l] = a[icol * n + l];
        a[icol * n + l] = swap;
      }
      swap = b[irow];
      b[irow] = b[icol];
      b[icol] = swap;
    }
    indxr[i] = irow;
    indxc[i] = icol;
    if (a[icol * n + icol] == 0.0)
    {
      cout << "gaussj: Singular Matrix-2" << endl;
      for (int ii = 0; ii < n; ii++)
      {
        for (int jj = 0; jj < n; jj++)
          cout << a[ii * n + jj] << " ";
        cout << endl;
      }
      return 1; // error return
    }
    pivinv = 1.0 / a[icol * n + icol];
    a[icol * n + icol] = 1.0;
    for (l = 0; l < n; l++)
      a[icol * n + l] *= pivinv;
    b[icol] *= pivinv;
    for (ll = 0; ll < n; ll++)
      if (ll != icol)
      {
        dum = a[ll * n + icol];
        a[ll * n + icol] = 0.0;
        for (l = 0; l < n; l++)
          a[ll * n + l] -= a[icol * n + l] * dum;
        b[ll] -= b[icol] * dum;
      }
  }
-  // Step 1: Solve linear system A*x = b using LU decomposition
+  for (l = n - 1; l >= 0; l--)
-  // LAPACKE_dgesv uses column-major by default, but we use row-major
+  {
-  info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1);
+    if (indxr[l] != indxc[l])
-
+      for (k = 0; k < n; k++)
-  if (info != 0) {
+      {
-    cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl;
+        swap = a[k * n + indxr[l]];
-    delete[] ipiv;
+        a[k * n + indxr[l]] = a[k * n + indxc[l]];
-    delete[] a_copy;
+        a[k * n + indxc[l]] = swap;
-    return 1;
+      }
  }
  // Step 2: Compute matrix inverse A^(-1) using LU factorization
  // First do LU factorization of original matrix a
  info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
  if (info != 0) {
    cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
    delete[] ipiv;
    delete[] a_copy;
    return 1;
  }
  // Then compute inverse from LU factorization
  info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
  if (info != 0) {
    cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
    delete[] ipiv;
    delete[] a_copy;
    return 1;
  }
  delete[] indxc;
  delete[] indxr;
  delete[] ipiv;
  delete[] a_copy;
  return 0;
 }
--- a/AMSS_NCKU_source/ilucg.f90
+++ b/AMSS_NCKU_source/ilucg.f90
@@ -512,10 +512,11 @@
      IMPLICIT DOUBLE PRECISION (A-H,O-Z)
      DIMENSION V(N),W(N)
 !     SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT.
 !     Optimized using Intel oneMKL BLAS ddot
 !     Mathematical equivalence: DGVV = sum_{i=1}^{N} V(i)*W(i)
-      DOUBLE PRECISION, EXTERNAL :: DDOT
+      SUM = 0.0D0
-      DGVV = DDOT(N, V, 1, W, 1)
+            DO 10 I = 1,N
            SUM = SUM + V(I)*W(I)
 10          CONTINUE
      DGVV = SUM
      RETURN
      END
--- a/AMSS_NCKU_source/macrodef.h
+++ b/AMSS_NCKU_source/macrodef.h
@@ -2,7 +2,7 @@
 #ifndef MICRODEF_H
 #define MICRODEF_H
-#include "macrodef.fh"
+#include "macrodef.fh"
 // application parameters
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -1,11 +1,25 @@
-
+
-
+
-include makefile.inc
+include makefile.inc
-
+
-.SUFFIXES: .o .f90 .C .for .cu
+## polint(ordn=6) kernel selector:
-
+##   1 (default): barycentric fast path
-.f90.o:
+##   0          : fallback to Neville path
-	$(f90) $(f90appflags) -c $< -o $@
+POLINT6_USE_BARY ?= 1
 POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
 ARCH_OPT = -march=x86-64-v4
 CXXAPPFLAGS = -O3 $(ARCH_OPT) -fp-model fast=2 -fma -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
 f90appflags = -O3 $(ARCH_OPT) -fp-model fast=2 -fma -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 TP_OPTFLAGS = -O3 $(ARCH_OPT) -fp-model fast=2 -fma -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
 .SUFFIXES: .o .f90 .C .for .cu
 .f90.o:
 	$(f90) $(f90appflags) -c $< -o $@
 .C.o:
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
@@ -13,8 +27,14 @@ include makefile.inc
 .for.o:
 	$(f77) -c $< -o $@
-.cu.o:
+.cu.o:
-	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
+	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
 TwoPunctures.o: TwoPunctures.C
 	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
 TwoPunctureABE.o: TwoPunctureABE.C
 	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
 # Input files
 C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
@@ -95,8 +115,8 @@ ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
 ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
 	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
-TwoPunctureABE: $(TwoPunctureFILES)
+TwoPunctureABE: $(TwoPunctureFILES)
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
+	$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
 clean:
 	rm *.o ABE ABEGPU TwoPunctureABE make.log -f
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,30 +1,32 @@
 ## GCC version (commented out)
 ## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
-## Intel oneAPI version with oneMKL (Optimized for performance)
+## Intel oneAPI version with oneMKL
 filein  = -I/usr/include/ -I${MKLROOT}/include
-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
+## Use sequential oneMKL to avoid introducing extra OpenMP behavior into ABE.
-## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
+LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
+
 ## Optional Intel oneTBB allocator, kept aligned with main's build environment.
 USE_TBBMALLOC ?= 1
 TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
 ifneq ($(wildcard $(TBBMALLOC_SO)),)
 TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
 else
 TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
 endif
 ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
 ## Aggressive optimization flags:
 ## -O3: Maximum optimization
 ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
 ## -fp-model fast=2: Aggressive floating-point optimizations
 ## -fma: Enable fused multiply-add instructions
 ## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
 CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma \
               -Dfortran3 -Dnewc -I${MKLROOT}/include
 f90appflags  = -O3 -xHost -fp-model fast=2 -fma \
               -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx
 CXX          = icpx
 CC           = icx
-CLINKER      = mpiicpx 
+CLINKER      = mpiicpx
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
--- a/generate_macrodef.py
+++ b/generate_macrodef.py
@@ -392,17 +392,6 @@ def generate_macrodef_fh():
        print( "# Finite_Difference_Method #define ghost_width setting error!!!",   file=file1 )
        print(                                                   file=file1 )
    # Define macro DEBUG_NAN_CHECK
    # 0: off (default), 1: on
    debug_nan_check = getattr(input_data, "Debug_NaN_Check", 0)
    if debug_nan_check:
        print( "#define DEBUG_NAN_CHECK 1", file=file1 )
        print(                             file=file1 )
    else:
        print( "#define DEBUG_NAN_CHECK 0", file=file1 )
        print(                             file=file1 )
    # Whether to use a shell-patch grid
    # use shell or not
@@ -525,9 +514,6 @@ def generate_macrodef_fh():
    print( "    6th order: 4",                                                                      file=file1 )
    print( "    8th order: 5",                                                                      file=file1 )
    print(                                                                                          file=file1 )
    print( "define DEBUG_NAN_CHECK",                                                                file=file1 )
    print( "    0: off (default), 1: on",                                                           file=file1 )
    print(                                                                                          file=file1 )
    print( "define WithShell",                                                                      file=file1 )
    print( "    use shell or not",                                                                  file=file1 )
    print(                                                                                          file=file1 )
--- a/inputfile_example/AMSS_NCKU_Input.py
+++ b/inputfile_example/AMSS_NCKU_Input.py
@@ -35,8 +35,7 @@ Equation_Class           = "BSSN"                  ## Evolution Equation: choose
                                                   ## If "BSSN-EScalar" is chosen, it is necessary to set other parameters below
 Initial_Data_Method      = "Ansorg-TwoPuncture"    ## initial data method: choose "Ansorg-TwoPuncture", "Lousto-Analytical", "Cao-Analytical", "KerrSchild-Analytical"
 Time_Evolution_Method    = "runge-kutta-45"        ## time evolution method: choose "runge-kutta-45"
-Finite_Diffenence_Method = "4th-order"             ## finite-difference method: choose "2nd-order", "4th-order", "6th-order", "8th-order"
+Finite_Diffenence_Method = "4th-order"             ## finite-difference method: choose "2nd-order", "4th-order", "6th-order", "8th-order"
 Debug_NaN_Check          = 0                       ## enable NaN checks in compute_rhs_bssn: 0 (off) or 1 (on)
 #################################################
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -11,18 +11,6 @@
 import AMSS_NCKU_Input as input_data
 import subprocess
 ## CPU core binding configuration using taskset
 ## taskset ensures all child processes inherit the CPU affinity mask
 ## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
 ## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
 #NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
 NUMACTL_CPU_BIND = ""
 ## Build parallelism configuration
 ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
 ## Set make -j to utilize available cores for faster builds
 BUILD_JOBS = 14
 ##################################################################
@@ -38,11 +26,11 @@ def makefile_ABE():
    print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) 
    print(                                                        )
-    ## Build command with CPU binding to nohz_full cores
+    ## Build command
    if (input_data.GPU_Calculation == "no"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
+        makefile_command  = "make -j96" + " ABE"
    elif (input_data.GPU_Calculation == "yes"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
+        makefile_command  = "make -j4" + " ABEGPU"
    else:
        print( " CPU/GPU numerical calculation setting is wrong " )
        print(                                                    )
@@ -79,8 +67,8 @@ def makefile_TwoPunctureABE():
    print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
    print(                                                            )
-    ## Build command with CPU binding to nohz_full cores
+    ## Build command
-    makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
+    makefile_command = "make" + " TwoPunctureABE"
    ## Execute the command with subprocess.Popen and stream output
    makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) 
@@ -117,10 +105,10 @@ def run_ABE():
    ## Define the command to run; cast other values to strings as needed
    if (input_data.GPU_Calculation == "no"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        mpi_command         = "mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command         = "mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
        mpi_command_outfile = "ABEGPU_out.log"
    ## Execute the MPI command and stream output
@@ -159,7 +147,7 @@ def run_TwoPunctureABE():
    print(                                                          )
    ## Define the command to run
-    TwoPuncture_command         = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
+    TwoPuncture_command         = "./TwoPunctureABE"
    TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
    ## Execute the command with subprocess.Popen and stream output
--- a/parallel_plot_helper.py
+++ b/parallel_plot_helper.py
@@ -0,0 +1,12 @@
 import multiprocessing
 def run_plot_task(task):
    """Execute a single plotting task and hand back the plot function's result.

    Parameters
    ----------
    task : a two-item sequence ``(func, args)`` holding the callable to invoke
           and the iterable of positional arguments to pass to it.

    Returns
    -------
    Whatever ``func(*args)`` returns.
    """
    ## Split the task into its callable and its positional arguments, then dispatch.
    plot_func, plot_args = task
    outcome = plot_func(*plot_args)
    return outcome
 def run_plot_tasks_parallel(plot_tasks):
    """Run a batch of plotting tasks concurrently in a pool of worker processes.

    Parameters
    ----------
    plot_tasks : iterable of ``(func, args)`` pairs; each pair is passed to
                 run_plot_task inside a worker process.

    Returns
    -------
    None. The values returned by the individual tasks are discarded.
    """
    ## Materialize once so the tasks can be counted (the argument may be a generator).
    task_list = list(plot_tasks)
    ## Nothing to plot: skip starting and tearing down a process pool.
    if not task_list:
        return
    ## Never start more workers than there are tasks to run.
    worker_count = min(multiprocessing.cpu_count(), len(task_list))
    ## The 'fork' start method is requested explicitly for the worker pool.
    ctx = multiprocessing.get_context('fork')
    with ctx.Pool(processes=worker_count) as pool:
        pool.map(run_plot_task, task_list)
--- a/plot_GW_strain_amplitude_xiaoqu.py
+++ b/plot_GW_strain_amplitude_xiaoqu.py
@@ -8,11 +8,13 @@
 ##
 #################################################
-import numpy                               ## numpy for array operations
+import numpy                               ## numpy for array operations
-import scipy                               ## scipy for interpolation and signal processing
+import scipy                               ## scipy for interpolation and signal processing
-import math
+import math
-import matplotlib.pyplot    as     plt     ## matplotlib for plotting
+import matplotlib
-import os                                  ## os for system/file operations
+matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 import os                                  ## os for system/file operations
 import AMSS_NCKU_Input as input_data
--- a/plot_binary_data.py
+++ b/plot_binary_data.py
@@ -6,17 +6,22 @@
 ## Author: Xiaoqu
 ## Dates: 2024/10/01 --- 2025/09/14
 ##
-#################################################
+#################################################
-
+
-import numpy
+## Restrict OpenMP to one thread per process so that parallel
-import scipy
+## subprocess plotting does not multiply BLAS thread counts.
-import matplotlib.pyplot    as     plt
+import os
-from   matplotlib.colors    import LogNorm
+os.environ.setdefault("OMP_NUM_THREADS", "1")
-from   mpl_toolkits.mplot3d import Axes3D
+
-## import torch
+import numpy
-import AMSS_NCKU_Input      as input_data
+import scipy
-
+import matplotlib
-import os
+matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt
 from   matplotlib.colors    import LogNorm
 from   mpl_toolkits.mplot3d import Axes3D
 ## import torch
 import AMSS_NCKU_Input      as input_data
 #########################################################################################
@@ -92,9 +97,9 @@ def plot_binary_data( filename, binary_outdir, figure_outdir ):
-####################################################################################
+####################################################################################
-
+
-# Plot a single binary dataset (2D slices and 3D surface)
+# Plot a single binary dataset (2D slices and 3D surface)
 def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
@@ -188,7 +193,15 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
    plt.savefig( os.path.join(figure_surfaceplot_outdir, figure_title + " time = " + str(time) + " surface_plot.pdf") )   # save figure
    plt.close()
-    return
+    return
-
+
-####################################################################################
+####################################################################################
 ## Command-line entry point: lets this module be launched as its own process
 ## so that binary-data files can be plotted in parallel.
 ## Expects exactly three arguments: <filename> <binary_outdir> <figure_outdir>.
 if __name__ == '__main__':
    import sys
    if len(sys.argv) == 4:
        ## Forward the three command-line arguments straight to the plotting routine.
        plot_binary_data(*sys.argv[1:4])
    else:
        ## Wrong argument count: show the usage line and exit with status 1.
        print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
        sys.exit(1)
--- a/plot_xiaoqu.py
+++ b/plot_xiaoqu.py
@@ -6,15 +6,20 @@
 ## 2024/10/01 --- 2025/09/14
 ##
 #################################################
-
+
-import numpy                               ## numpy for array operations
+import numpy                               ## numpy for array operations
-import matplotlib.pyplot    as     plt     ## matplotlib for plotting
+import matplotlib
-from   mpl_toolkits.mplot3d import Axes3D  ## needed for 3D plots
+matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
-import glob
+import matplotlib.pyplot    as     plt     ## matplotlib for plotting
-import os                                  ## operating system utilities
+from   mpl_toolkits.mplot3d import Axes3D  ## needed for 3D plots
-
+import glob
-import plot_binary_data
+import os                                  ## operating system utilities
-import AMSS_NCKU_Input as input_data
+
 import plot_binary_data
 import AMSS_NCKU_Input as input_data
 import subprocess
 import sys
 import multiprocessing
 # plt.rcParams['text.usetex'] = True  ## enable LaTeX fonts in plots
@@ -50,13 +55,37 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
        file_list.append(x)
        print(x)
-    ## Plot each file in the list
+    ## Plot each file in parallel using subprocesses.
-    for filename in file_list:
+    ## Each subprocess starts with BLAS thread limits in plot_binary_data.py.
-        print(filename)
+    script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
-        plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
+    max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
-
+
-    print(                        )
+    running = []
-    print( " Binary Data Plot Has been Finished " )
+    failed  = []
    for filename in file_list:
        print(filename)
        proc = subprocess.Popen(
            [sys.executable, script, filename, binary_outdir, figure_outdir],
        )
        running.append( (proc, filename) )
        if len(running) >= max_workers:
            p, fn = running.pop(0)
            p.wait()
            if p.returncode != 0:
                failed.append(fn)
    for p, fn in running:
        p.wait()
        if p.returncode != 0:
            failed.append(fn)
    if failed:
        print( " WARNING: the following binary data plots failed:" )
        for fn in failed:
            print( "   ", fn )
    print(                        )
    print( " Binary Data Plot Has been Finished " )
    print(                                        )
    return
Author	SHA1	Message	Date
CGH0S7	45e3c725f9	Trigger-Discipline: parallelize result plotting	2026-04-24 10:04:57 +08:00
CGH0S7	7f603f189b	Trigger-Discipline: port TwoPuncture OpenMP optimizations	2026-04-24 09:25:13 +08:00
CGH0S7	a821f21a23	.gitignore updated	2026-04-24 09:10:12 +08:00
CGH0S7	34fe3e6aa5	Trigger-Discipline: port conservative build and fmisc optimizations	2026-04-24 09:09:50 +08:00
CGH0S7	79af79d471	baseline updated	2026-02-05 19:53:55 +08:00