Optimize bssn_rhs.f90: Fuse loops for metric inversion and Christoffel symbols to improve cache locality

优化 compute_rhs_bssn 热点路径并加入 NaN 检查开关
- 用 DEBUG_NAN_CHECK 宏按需启用 NaN 检查，并在输入/宏生成器中新增 Debug_NaN_Check 配置 - 逆度量改为先求行列式再乘法展开，减少除法；并在 Gam^i/Christoffel 处提取公共子表达式 - 预置批量 fderivs 辅助例程，便于后续矢量化/合并导数计算 - 将默认 MPI_processes 调整为 8 变更涉及： - AMSS_NCKU_source/bssn_rhs.f90 - generate_macrodef.py - AMSS_NCKU_Input.py - AMSS_NCKU_Input_Mini.py - inputfile_example/AMSS_NCKU_Input.py - AMSS_NCKU_source/diff_new.f90 TODO: fmisc.f90 polint()
2026-01-21 11:22:33 +08:00 · 2026-01-20 19:37:26 +08:00 · 2026-01-20 00:31:40 +08:00
53 changed files with 3488 additions and 8455 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 __pycache__
 GW150914
 GW150914-origin
 GW150914-mini
 docs
 *.tmp
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,12 +16,14 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 64                             ## number of mpi processes used in the simulation
+MPI_processes    = 8                             ## number of mpi processes used in the simulation
 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
 CPU_Part         = 1.0
 GPU_Part         = 0.0
 Debug_NaN_Check          = 0                       ## enable NaN checks in compute_rhs_bssn: 0 (off) or 1 (on)
 #################################################
--- a/AMSS_NCKU_Input_Mini.py
+++ b/AMSS_NCKU_Input_Mini.py
@@ -0,0 +1,233 @@
 #################################################
 ##
 ## This file provides the input parameters required for numerical relativity.
 ## XIAOQU
 ## 2024/03/19 --- 2025/09/14
 ## Modified for GW150914-mini test case
 ##
 #################################################
 import numpy    
 #################################################
 ## Setting MPI processes and the output file directory
 File_directory   = "GW150914-mini"               ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
 MPI_processes    = 4                             ## number of mpi processes used in the simulation (Reduced for laptop)
 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
 CPU_Part         = 1.0
 GPU_Part         = 0.0
 #################################################
 #################################################
 ## Setting the physical system and numerical method
 Symmetry                 = "equatorial-symmetry"   ## Symmetry of System: choose equatorial-symmetry、no-symmetry、octant-symmetry
 Equation_Class           = "BSSN"                  ## Evolution Equation: choose "BSSN", "BSSN-EScalar", "BSSN-EM", "Z4C" 
                                                   ## If "BSSN-EScalar" is chosen, it is necessary to set other parameters below
 Initial_Data_Method      = "Ansorg-TwoPuncture"    ## initial data method: choose "Ansorg-TwoPuncture", "Lousto-Analytical", "Cao-Analytical", "KerrSchild-Analytical"
 Time_Evolution_Method    = "runge-kutta-45"        ## time evolution method: choose "runge-kutta-45"
 Finite_Diffenence_Method = "4th-order"             ## finite-difference method: choose "2nd-order", "4th-order", "6th-order", "8th-order"
 Debug_NaN_Check          = 0                       ## enable NaN checks in compute_rhs_bssn: 0 (off) or 1 (on)
 #################################################
 #################################################
 ## Setting the time evolutionary information
 Start_Evolution_Time     = 0.0                    ## start evolution time t0
 Final_Evolution_Time     = 100.0                  ## final evolution time t1 (Reduced for quick test)
 Check_Time               = 10.0
 Dump_Time                = 10.0                   ## time inteval dT for dumping binary data
 D2_Dump_Time             = 10.0                   ## dump the ascii data for 2d surface after dT'
 Analysis_Time            = 1.0                    ## dump the puncture position and GW psi4 after dT"
 Evolution_Step_Number    = 10000000               ## stop the calculation after the maximal step number
 Courant_Factor           = 0.5                    ## Courant Factor
 Dissipation              = 0.15                   ## Kreiss-Oliger Dissipation Strength
 #################################################
 #################################################
 ## Setting the grid structure
 basic_grid_set    = "Patch"                          ## grid structure: choose "Patch" or "Shell-Patch"
 grid_center_set   = "Cell"                           ## grid center: chose "Cell" or "Vertex"
 grid_level        = 7                                ## total number of AMR grid levels (Reduced from 9)
 static_grid_level = 4                                ## number of AMR static grid levels (Reduced from 5)
 moving_grid_level = grid_level - static_grid_level   ## number of AMR moving grid levels
 analysis_level    = 0
 refinement_level  = 3                                ## time refinement start from this grid level
 largest_box_xyz_max = [320.0, 320.0, 320.0]          ## scale of the largest box
                                                     ## not necessary to be cubic for "Patch" grid structure
                                                     ## need to be a cubic box for "Shell-Patch" grid structure
 largest_box_xyz_min = - numpy.array(largest_box_xyz_max)  
 static_grid_number = 48                              ## grid points of each static AMR grid (in x direction) (Reduced from 96)
                                                     ## (grid points in y and z directions are automatically adjusted)
 moving_grid_number = 24                              ## grid points of each moving AMR grid (Reduced from 48)
 shell_grid_number  = [32, 32, 100]                   ## grid points of Shell-Patch grid
                                                     ## in (phi, theta, r) direction
 devide_factor      = 2.0                             ## resolution between different grid levels dh0/dh1, only support 2.0 now
 static_grid_type   = 'Linear'                        ## AMR static grid structure , only supports "Linear"
 moving_grid_type   = 'Linear'                        ## AMR moving grid structure , only supports "Linear"
 quarter_sphere_number = 48                           ## grid number of 1/4 s pher ical surface (Reduced from 96)
                                                     ## (which is needed for evaluating the spherical surface integral)
 #################################################
 #################################################
 ## Setting the puncture information
 puncture_number       = 2                                     
 position_BH           = numpy.zeros( (puncture_number, 3) )   
 parameter_BH          = numpy.zeros( (puncture_number, 3) )   
 dimensionless_spin_BH = numpy.zeros( (puncture_number, 3) )   
 momentum_BH           = numpy.zeros( (puncture_number, 3) )   
 puncture_data_set     = "Manually"                       ## Method to give Puncture’s positions and momentum
                                                         ## choose "Manually" or "Automatically-BBH"
                                                         ## Prefer to choose "Manually", because "Automatically-BBH" is developing now
 ## initial orbital distance and ellipticity for BBHs system
 ## (needed for the "Automatically-BBH" case; does not affect the "Manually" case)
 Distance = 10.0
 e0       = 0.0
 ## black hole parameter (M Q* a*)
 parameter_BH[0] = [ 36.0/(36.0+29.0),  0.0,  +0.31 ]   
 parameter_BH[1] = [ 29.0/(36.0+29.0),  0.0,  -0.46 ]  
 ## dimensionless spin in each direction
 dimensionless_spin_BH[0] = [ 0.0,  0.0,  +0.31 ]   
 dimensionless_spin_BH[1] = [ 0.0,  0.0,  -0.46 ]  
 ## use Brugmann's convention
 ##  -----0-----> y
 ##   -      +     
 #---------------------------------------------
 ## If puncture_data_set is chosen to be "Manually", it is necessary to set the position and momentum of each puncture manually
 ## initial position for each puncture
 position_BH[0]  = [  0.0,  10.0*29.0/(36.0+29.0), 0.0 ]  
 position_BH[1]  = [  0.0, -10.0*36.0/(36.0+29.0), 0.0 ] 
 ## initial momentum for each puncture
 ## (needed for "Manually" case, does not affect the "Automatically-BBH" case)
 momentum_BH[0]  = [ -0.09530152296974252,  -0.00084541526517121,   0.0 ]
 momentum_BH[1]  = [ +0.09530152296974252,  +0.00084541526517121,   0.0 ]
 #################################################
 #################################################
 ## Setting the gravitational wave information
 GW_L_max        = 4                      ## maximal L number in gravitational wave
 GW_M_max        = 4                      ## maximal M number in gravitational wave
 Detector_Number = 12                     ## number of dector
 Detector_Rmin   = 50.0                   ## nearest dector distance
 Detector_Rmax   = 160.0                  ## farest dector distance
 #################################################
 #################################################
 ## Setting the apparent horizon
 AHF_Find       = "no"                    ## whether to find the apparent horizon: choose "yes" or "no"
 AHF_Find_Every = 24
 AHF_Dump_Time  = 20.0
 #################################################
 #################################################
 ## Other parameters (testing)
 ## Only influence the Equation_Class = "BSSN-EScalar" case
 FR_a2     = 3.0        ## f(R) = R + a2 * R^2    
 FR_l2     = 10000.0
 FR_phi0   = 0.00005
 FR_r0     = 120.0
 FR_sigma0 = 8.0
 FR_Choice = 2          ## Choice options: 1 2 3 4 5
                       ## 1: phi(r) = phi0 * Exp(-(r-r0)**2/sigma0)   
                       ##    V(r)   = 0
                       ## 2: phi(r) =  phi0 * a2^2/(1+a2^2)  
                       ##    V(r)   = Exp(-8*Sqrt(PI/3)*phi(r)) * (1-Exp(4*Sqrt(PI/3)*phi(r)))**2 / (32*PI*a2)
                       ## 3: Schrodinger-Newton gived by system phi(r) 
                       ##    V(r)   = Exp(-8*Sqrt(PI/3)*phi(r)) * (1-Exp(4*Sqrt(PI/3)*phi(r)))**2 / (32*PI*a2)
                       ## 4: phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma0) - tanh((r-r0)/sigma0) )  
                       ##    V(r)   = 0
                       ##    f(R)   = R + a2*R^2  with a2 = +oo
                       ## 5: phi(r) = phi0 * Exp(-(r-r0)**2/sigma)   
                       ##    V(r)   = 0
 #################################################
 #################################################
 ## Other parameters (testing)
 ## (please do not change if not necessary)
 boundary_choice = "BAM-choice"     ## Sommerfeld boundary condition : choose "BAM-choice" or "Shibata-choice" 
                                   ## prefer "BAM-choice"
 gauge_choice  = 0                  ## gauge choice
                                   ## 0: B^i gauge
                                   ## 1: David's puncture gauge
                                   ## 2: MB B^i gauge               
                                   ## 3: RIT B^i gauge
                                   ## 4: MB beta gauge 
                                   ## 5: RIT beta gauge 
                                   ## 6: MGB1 B^i gauge
                                   ## 7: MGB2 B^i gauge
                                   ## prefer 0 or 1
 tetrad_type  = 2                   ## tetradtype 
                                   ##  v:r; u: phi; w: theta
                                   ##      v^a = (x,y,z)
                                   ## 0: orthonormal order: v,u,w
                                   ##    v^a = (x,y,z)   
                                   ##    m = (phi - i theta)/sqrt(2) 
                                   ##    following Frans, Eq.(8) of  PRD 75, 124018(2007)
                                   ## 1: orthonormal order: w,u,v
                                   ##    m = (theta + i phi)/sqrt(2) 
                                   ##    following Sperhake, Eq.(3.2) of  PRD 85, 124062(2012)    
                                   ## 2: orthonormal order: v,u,w
                                   ##    v_a = (x,y,z)
                                   ##    m = (phi - i theta)/sqrt(2) 
                                   ##    following Frans, Eq.(8) of  PRD 75, 124018(2007)
                                   ## this version recommend set to 2
                                   ## prefer 2
 #################################################
--- a/AMSS_NCKU_MiniProgram.py
+++ b/AMSS_NCKU_MiniProgram.py
@@ -0,0 +1,224 @@
 ##################################################################
 ##
 ## AMSS-NCKU Numerical Relativity Mini Test Program
 ## Author: Assistant (based on Xiaoqu's code)
 ## 2026/01/20
 ##
 ## This script runs a scaled-down version of the GW150914 test case
 ## suitable for laptop testing.
 ##
 ##################################################################
 import os
 import shutil
 import sys
 import time
 # --- Context Manager for Input File Swapping ---
 class InputFileSwapper:
    """Temporarily replace the main input file with the mini-test input file.

    On entry, an existing target file is moved to ``<target_file>.bak`` and the
    mini file is copied into its place. On exit, the copy is removed and the
    backup (if one was made) is moved back.

    The paths are resolved to absolute paths when the context is entered, so
    the restore step still finds the files when the body of the ``with`` block
    changes the working directory.

    Args:
        mini_file: Path of the input file to put in place for the duration.
        target_file: Path of the input file being temporarily replaced.

    Raises:
        FileNotFoundError: On entry, if ``mini_file`` is not an existing file.
            Nothing on disk is modified in that case.
    """
    def __init__(self, mini_file="AMSS_NCKU_Input_Mini.py", target_file="AMSS_NCKU_Input.py"):
        self.mini_file = mini_file
        self.target_file = target_file
        self.backup_file = target_file + ".bak"
        self.swapped = False
        # Absolute paths captured by __enter__ and reused by __exit__.
        self._target_path = None
        self._backup_path = None
    def __enter__(self):
        mini_path = os.path.abspath(self.mini_file)
        target_path = os.path.abspath(self.target_file)
        backup_path = os.path.abspath(self.backup_file)
        # Validate before touching anything: once the target has been moved
        # aside, a failure here would leave the real input file displaced.
        if not os.path.isfile(mini_path):
            raise FileNotFoundError(f"[MiniProgram] Mini input file not found: {mini_path}")
        print(f"[MiniProgram] Swapping {self.target_file} with {self.mini_file}...")
        backed_up = False
        if os.path.exists(target_path):
            shutil.move(target_path, backup_path)
            backed_up = True
        try:
            shutil.copy(mini_path, target_path)
        except BaseException:
            # __exit__ is not called when __enter__ raises, so undo the move here.
            if backed_up:
                if os.path.exists(target_path):
                    os.remove(target_path)  # drop a partially written copy
                shutil.move(backup_path, target_path)
            raise
        self._target_path = target_path
        self._backup_path = backup_path
        self.swapped = True
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        if not self.swapped:
            return
        print(f"[MiniProgram] Restoring original {self.target_file}...")
        try:
            os.remove(self._target_path)
        except FileNotFoundError:
            # The copy is already gone; the backup must still be restored.
            pass
        if os.path.exists(self._backup_path):
            shutil.move(self._backup_path, self._target_path)
 def main():
    """Run the mini test case from input generation through plotting.

    With the mini input file swapped in for ``AMSS_NCKU_Input.py``, this
    creates the output directory tree, generates the input and macro files,
    copies the source tree, calls the build and run helpers in
    ``makefile_and_run``, copies selected result files next to the executable,
    and finally calls the plotting helpers.

    Returns:
        None. The function returns early, after printing a message, when a
        module import fails, when the ``AMSS_NCKU_source`` directory is
        missing, or when an expected executable is not found after the build.
    """
    # Use the swapper to ensure all imported modules see the mini configuration
    with InputFileSwapper():
        # Import modules AFTER swapping input file
        try:
            import AMSS_NCKU_Input as input_data
            import print_information
            import setup
            import numerical_grid
            import generate_macrodef
            import makefile_and_run
            import generate_TwoPuncture_input
            import renew_puncture_parameter
            import plot_xiaoqu
            import plot_GW_strain_amplitude_xiaoqu
        except ImportError as e:
            print(f"Error importing modules: {e}")
            return
        print_information.print_program_introduction()
        print("\n" + "#"*60)
        print(" RUNNING MINI TEST CASE: GW150914-mini")
        print("#"*60 + "\n")
        # --- Directory Setup ---
        File_directory = os.path.join(input_data.File_directory)
        if os.path.exists(File_directory):
            print(f" Output directory '{File_directory}' exists. Removing for mini test...")
            # ignore_errors=True hides a failed removal; os.mkdir below would
            # then raise because the directory still exists.
            shutil.rmtree(File_directory, ignore_errors=True)
        os.mkdir(File_directory)
        shutil.copy("AMSS_NCKU_Input.py", File_directory) # Copies the current (mini) input
        output_directory = os.path.join(File_directory, "AMSS_NCKU_output")
        os.mkdir(output_directory)
        binary_results_directory = os.path.join(output_directory, input_data.Output_directory)
        os.mkdir(binary_results_directory)
        figure_directory = os.path.join(File_directory, "figure")
        os.mkdir(figure_directory)
        print(" Output directories generated.\n")
        # --- Setup and Input Generation ---
        setup.print_input_data(File_directory)
        setup.generate_AMSSNCKU_input()
        setup.print_puncture_information()
        print("\n Generating AMSS-NCKU input parfile...")
        numerical_grid.append_AMSSNCKU_cgh_input()
        print("\n Plotting initial grid...")
        numerical_grid.plot_initial_grid()
        print("\n Generating macro files...")
        generate_macrodef.generate_macrodef_h()
        generate_macrodef.generate_macrodef_fh()
        # --- Compilation Preparation ---
        print("\n Preparing to compile and run...")
        AMSS_NCKU_source_path = "AMSS_NCKU_source"
        AMSS_NCKU_source_copy = os.path.join(File_directory, "AMSS_NCKU_source_copy")
        if not os.path.exists(AMSS_NCKU_source_path):
             print(" Error: AMSS_NCKU_source not found! Please run in the project root.")
             return
        shutil.copytree(AMSS_NCKU_source_path, AMSS_NCKU_source_copy)
        # The macro headers are read from File_directory and placed in the
        # copied source tree before building.
        macrodef_h_path  = os.path.join(File_directory, "macrodef.h") 
        macrodef_fh_path = os.path.join(File_directory, "macrodef.fh") 
        shutil.copy2(macrodef_h_path,  AMSS_NCKU_source_copy)
        shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)
        # --- Compilation ---
        # NOTE(review): none of the os.chdir() pairs in this function are wrapped
        # in try/finally, so an exception raised in between leaves the working
        # directory changed.
        cwd = os.getcwd()
        os.chdir(AMSS_NCKU_source_copy)
        print(" Compiling ABE...")
        makefile_and_run.makefile_ABE()
        if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ): 
            print(" Compiling TwoPunctureABE...")
            makefile_and_run.makefile_TwoPunctureABE()
        os.chdir(cwd)
        # --- Copy Executables ---
        # The executable name depends on GPU_Calculation: "ABE" when it is "no",
        # "ABEGPU" otherwise.
        if (input_data.GPU_Calculation == "no"):
            ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE")
        else:
            ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABEGPU")
        if not os.path.exists(ABE_file):
            print(" Error: ABE executable compilation failed.")
            return
        shutil.copy2(ABE_file, output_directory)
        TwoPuncture_file = os.path.join(AMSS_NCKU_source_copy, "TwoPunctureABE")
        if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
            if not os.path.exists(TwoPuncture_file):
                print(" Error: TwoPunctureABE compilation failed.")
                return
            shutil.copy2(TwoPuncture_file, output_directory)
        # --- Execution ---
        # The timer starts after compilation, so the reported cost excludes the
        # build steps above.
        start_time = time.time()
        if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
             print("\n Generating TwoPuncture input...")
             generate_TwoPuncture_input.generate_AMSSNCKU_TwoPuncture_input()
             AMSS_NCKU_TwoPuncture_inputfile = 'AMSS-NCKU-TwoPuncture.input'
             AMSS_NCKU_TwoPuncture_inputfile_path = os.path.join( File_directory, AMSS_NCKU_TwoPuncture_inputfile )
             shutil.copy2( AMSS_NCKU_TwoPuncture_inputfile_path, os.path.join(output_directory, 'TwoPunctureinput.par') )
             print(" Running TwoPunctureABE...")
             os.chdir(output_directory)
             makefile_and_run.run_TwoPunctureABE()
             os.chdir(cwd)
        # Update Puncture Parameter
        renew_puncture_parameter.append_AMSSNCKU_BSSN_input(File_directory, output_directory)
        AMSS_NCKU_inputfile = 'AMSS-NCKU.input'
        AMSS_NCKU_inputfile_path = os.path.join(File_directory, AMSS_NCKU_inputfile)
        shutil.copy2( AMSS_NCKU_inputfile_path, os.path.join(output_directory, 'input.par') )
        print("\n Input files ready. Launching ABE...")
        os.chdir(output_directory)
        makefile_and_run.run_ABE()
        os.chdir(cwd)
        end_time = time.time()
        elapsed_time = end_time - start_time
        # --- Post-processing ---
        # Each file is copied only if it exists; a missing file is skipped silently.
        print("\n Copying output files for inspection...")
        # The variable name is reused: here it points at setting.par, not an error log.
        AMSS_NCKU_error_file_path = os.path.join(binary_results_directory, "setting.par")
        if os.path.exists(AMSS_NCKU_error_file_path):
            shutil.copy( AMSS_NCKU_error_file_path, os.path.join(output_directory, "AMSSNCKU_setting_parameter") )
        AMSS_NCKU_error_file_path = os.path.join(binary_results_directory, "Error.log")
        if os.path.exists(AMSS_NCKU_error_file_path):
            shutil.copy( AMSS_NCKU_error_file_path, os.path.join(output_directory, "Error.log") )
        for fname in ["bssn_BH.dat", "bssn_ADMQs.dat", "bssn_psi4.dat", "bssn_constraint.dat"]:
            fpath = os.path.join(binary_results_directory, fname)
            if os.path.exists(fpath):
                shutil.copy(fpath, os.path.join(output_directory, fname))
        # --- Plotting ---
        # Plotting is best-effort: the first exception aborts the remaining
        # plots, is reported as a warning, and the run still finishes.
        print("\n Plotting results...")
        try:
            plot_xiaoqu.generate_puncture_orbit_plot(   binary_results_directory, figure_directory )
            plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
            plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
            for i in range(input_data.Detector_Number):
                plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
                plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
            for i in range(input_data.Detector_Number):
                plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
            for i in range(input_data.grid_level):
                plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
            plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
        except Exception as e:
            print(f"Warning: Plotting failed: {e}")
        print(f"\n Program Cost = {elapsed_time:.2f} Seconds \n")
        print(" AMSS-NCKU-Python simulation finished (Mini Test).\n")
 if __name__ == "__main__":
    main()
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -8,14 +8,6 @@
 ##
 ##################################################################
 ## Guard against re-execution by multiprocessing child processes.
 ## Without this, using 'spawn' or 'forkserver' context would cause every
 ## worker to re-run the entire script, spawning exponentially more
 ## workers (fork bomb).
 if __name__ != '__main__':
    import sys as _sys
    _sys.exit(0)
 ##################################################################
@@ -270,12 +262,6 @@ if not os.path.exists( ABE_file ):
 ## Copy the executable ABE (or ABEGPU) into the run directory
 shutil.copy2(ABE_file, output_directory)
 ## Copy interp load balance profile if present (for optimize pass)
 interp_lb_profile = os.path.join(AMSS_NCKU_source_copy, "interp_lb_profile.bin")
 if os.path.exists(interp_lb_profile):
    shutil.copy2(interp_lb_profile, output_directory)
    print( " Copied interp_lb_profile.bin to run directory " )
 ###########################
 ## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory
@@ -438,31 +424,26 @@ print(
 import plot_xiaoqu
 import plot_GW_strain_amplitude_xiaoqu
 from parallel_plot_helper import run_plot_tasks_parallel
 plot_tasks = []
 ## Plot black hole trajectory
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot,   (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_orbit_plot(   binary_results_directory, figure_directory )
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
 ## Plot black hole separation vs. time
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
 ## Plot gravitational waveforms (psi4 and strain amplitude)
 for i in range(input_data.Detector_Number):
-    plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
-    plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
 ## Plot ADM mass evolution
 for i in range(input_data.Detector_Number):
-    plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
 ## Plot Hamiltonian constraint violation over time
 for i in range(input_data.grid_level):
-    plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
 run_plot_tasks_parallel(plot_tasks)
 ## Plot stored binary data
 plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
--- a/AMSS_NCKU_Verify_ASC26.py
+++ b/AMSS_NCKU_Verify_ASC26.py
@@ -1,13 +1,9 @@
 #!/usr/bin/env python3
 """
-AMSS-NCKU GW150914 Simulation Regression Test Script (Comprehensive Version)
+AMSS-NCKU GW150914 Simulation Regression Test Script
 Verification Requirements:
-1. RMS errors < 1% for:
+1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
   - 3D Vector Total RMS
   - X Component RMS
   - Y Component RMS
   - Z Component RMS
 2. ADM constraint violation < 2 (Grid Level 0)
 RMS Calculation Method:
@@ -61,62 +57,79 @@ def load_constraint_data(filepath):
                data.append([float(x) for x in parts[:8]])
    return np.array(data)
-def calculate_all_rms_errors(bh_data_ref, bh_data_target):
+
 def calculate_rms_error(bh_data_ref, bh_data_target):
    """
-    Calculate 3D Vector RMS and component-wise RMS (X, Y, Z) independently.
+    Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
-    Uses r = sqrt(x^2 + y^2) as the denominator for all error normalizations.
+
-    Returns the maximum error between BH1 and BH2 for each category.
+    This function computes the RMS error independently for BH1 and BH2 trajectories,
    then returns the maximum of the two as the final RMS error metric.
    For each black hole, the RMS is calculated as:
        RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
    where:
        Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
        r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
    Args:
        bh_data_ref: Reference (baseline) trajectory data
        bh_data_target: Target (optimized) trajectory data
    Returns:
        rms_value: Final RMS error as a percentage (max of BH1 and BH2)
        error: Error message if any
    """
    # Align data: truncate to the length of the shorter dataset
    M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
    if M < 10:
        return None, "Insufficient data points for comparison"
-    results = {}
+    # Extract XY coordinates for both black holes
    x1_ref = bh_data_ref['x1'][:M]
    y1_ref = bh_data_ref['y1'][:M]
    x2_ref = bh_data_ref['x2'][:M]
    y2_ref = bh_data_ref['y2'][:M]
-    for bh in ['1', '2']:
+    x1_new = bh_data_target['x1'][:M]
-        x_r, y_r, z_r = bh_data_ref[f'x{bh}'][:M], bh_data_ref[f'y{bh}'][:M], bh_data_ref[f'z{bh}'][:M]
+    y1_new = bh_data_target['y1'][:M]
-        x_n, y_n, z_n = bh_data_target[f'x{bh}'][:M], bh_data_target[f'y{bh}'][:M], bh_data_target[f'z{bh}'][:M]
+    x2_new = bh_data_target['x2'][:M]
    y2_new = bh_data_target['y2'][:M]
-        # 核心修改：根据组委会的邮件指示，分母统一使用 r = sqrt(x^2 + y^2)
+    # Calculate RMS for BH1
-        r_ref = np.sqrt(x_r**2 + y_r**2)
+    delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
-        r_new = np.sqrt(x_n**2 + y_n**2)
+    r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
-        denom_max = np.maximum(r_ref, r_new)
+    r1_new = np.sqrt(x1_new**2 + y1_new**2)
    r1_max = np.maximum(r1_ref, r1_new)
-        valid = denom_max > 1e-15
+    # Calculate RMS for BH2
-        if np.sum(valid) < 10:
+    delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
-            results[f'BH{bh}'] = { '3D_Vector': 0.0, 'X_Component': 0.0, 'Y_Component': 0.0, 'Z_Component': 0.0 }
+    r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
-            continue
+    r2_new = np.sqrt(x2_new**2 + y2_new**2)
    r2_max = np.maximum(r2_ref, r2_new)
-        def calc_rms(delta):
+    # Avoid division by zero for BH1
-            # 将对应分量的偏差除以统一的轨道半径分母 denom_max
+    valid_mask1 = r1_max > 1e-15
-            return np.sqrt(np.mean((delta[valid] / denom_max[valid])**2)) * 100
+    if np.sum(valid_mask1) < 10:
        return None, "Insufficient valid data points for BH1"
-        # 1. Total 3D Vector RMS
+    terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
-        delta_vec = np.sqrt((x_r - x_n)**2 + (y_r - y_n)**2 + (z_r - z_n)**2)
+    rms_bh1 = np.sqrt(np.mean(terms1)) * 100
        rms_3d = calc_rms(delta_vec)
-        # 2. Component-wise RMS (分离计算各轴，但共用半径分母)
+    # Avoid division by zero for BH2
-        rms_x = calc_rms(np.abs(x_r - x_n))
+    valid_mask2 = r2_max > 1e-15
-        rms_y = calc_rms(np.abs(y_r - y_n))
+    if np.sum(valid_mask2) < 10:
-        rms_z = calc_rms(np.abs(z_r - z_n))
+        return None, "Insufficient valid data points for BH2"
-        results[f'BH{bh}'] = {
+    terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
-            '3D_Vector': rms_3d,
+    rms_bh2 = np.sqrt(np.mean(terms2)) * 100
            'X_Component': rms_x,
            'Y_Component': rms_y,
            'Z_Component': rms_z
        }
-    # 获取 BH1 和 BH2 中的最大误差
+    # Final RMS is the maximum of BH1 and BH2
-    max_rms = {
+    rms_final = max(rms_bh1, rms_bh2)
-        '3D_Vector': max(results['BH1']['3D_Vector'], results['BH2']['3D_Vector']),
+
-        'X_Component': max(results['BH1']['X_Component'], results['BH2']['X_Component']),
+    return rms_final, None
        'Y_Component': max(results['BH1']['Y_Component'], results['BH2']['Y_Component']),
        'Z_Component': max(results['BH1']['Z_Component'], results['BH2']['Z_Component'])
    }
    return max_rms, None
 def analyze_constraint_violation(constraint_data, n_levels=9):
    """
@@ -142,32 +155,34 @@ def analyze_constraint_violation(constraint_data, n_levels=9):
 def print_header():
    """Print report header"""
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
-    print(Color.BOLD + "   AMSS-NCKU GW150914 Comprehensive Regression Test" + Color.RESET)
+    print(Color.BOLD + "   AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
-def print_rms_results(rms_dict, error, threshold=1.0):
+
-    print(f"\n{Color.BOLD}1. RMS Error Analysis (Maximums of BH1 & BH2){Color.RESET}")
+def print_rms_results(rms_rel, error, threshold=1.0):
-    print("-" * 65)
+    """Print RMS error results"""
    print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
    print("-" * 45)
    if error:
        print(f"   {Color.RED}Error: {error}{Color.RESET}")
        return False
-    all_passed = True
+    passed = rms_rel < threshold
    print(f"   Requirement: < {threshold}%\n")
-    for key, val in rms_dict.items():
+    print(f"   RMS relative error: {rms_rel:.4f}%")
-        passed = val < threshold
+    print(f"   Requirement:        < {threshold}%")
-        all_passed = all_passed and passed
+    print(f"   Status:             {get_status_text(passed)}")
-        status = get_status_text(passed)
+
-        print(f"   {key:15}: {val:8.4f}%   |   Status: {status}")
+    return passed
    return all_passed
 def print_constraint_results(results, threshold=2.0):
    """Print constraint violation results"""
    print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
-    print("-" * 65)
+    print("-" * 45)
    names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
    for i, name in enumerate(names):
@@ -185,6 +200,7 @@ def print_constraint_results(results, threshold=2.0):
 def print_summary(rms_passed, constraint_passed):
    """Print summary"""
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
    print(Color.BOLD + "Verification Summary" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
@@ -194,7 +210,7 @@ def print_summary(rms_passed, constraint_passed):
    res_rms = get_status_text(rms_passed)
    res_con = get_status_text(constraint_passed)
-    print(f"   [1] Comprehensive RMS check:      {res_rms}")
+    print(f"   [1] RMS trajectory check:         {res_rms}")
    print(f"   [2] ADM constraint check:         {res_con}")
    final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
@@ -203,48 +219,61 @@ def print_summary(rms_passed, constraint_passed):
    return all_passed
 def main():
    # Determine target (optimized) output directory
    if len(sys.argv) > 1:
        target_dir = sys.argv[1]
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
    # Determine reference (baseline) directory
    script_dir = os.path.dirname(os.path.abspath(__file__))
    reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
    # Data file paths
    bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
    bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
    constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
    # Check if files exist
    if not os.path.exists(bh_file_ref):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
        sys.exit(1)
    if not os.path.exists(bh_file_target):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
        sys.exit(1)
    if not os.path.exists(constraint_file):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
        sys.exit(1)
    # Print header
    print_header()
    print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
    print(f"{Color.BOLD}Target (Optimized):  {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
    # Load data
    bh_data_ref = load_bh_trajectory(bh_file_ref)
    bh_data_target = load_bh_trajectory(bh_file_target)
    constraint_data = load_constraint_data(constraint_file)
-    # Output modified RMS results
+    # Calculate RMS error
-    rms_dict, error = calculate_all_rms_errors(bh_data_ref, bh_data_target)
+    rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
-    rms_passed = print_rms_results(rms_dict, error)
+    rms_passed = print_rms_results(rms_rel, error)
-    # Output constraint results
+    # Analyze constraint violation
    constraint_results = analyze_constraint_violation(constraint_data)
    constraint_passed = print_constraint_results(constraint_results)
    # Print summary
    all_passed = print_summary(rms_passed, constraint_passed)
    # Return exit code
    sys.exit(0 if all_passed else 1)
 if __name__ == "__main__":
    main()
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -13,9 +13,6 @@ using namespace std;
 #include "MPatch.h"
 #include "Parallel.h"
 #include "fmisc.h"
 #ifdef INTERP_LB_PROFILE
 #include "interp_lb_profile.h"
 #endif
 Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
 {
@@ -344,9 +341,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
                          double *Shellf, int Symmetry)
 {
  // NOTE: we do not Synchnize variables here, make sure of that before calling this routine
-  int myrank, nprocs;
+  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  int ordn = 2 * ghost_width;
  MyList<var> *varl;
@@ -358,18 +354,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
    varl = varl->next;
  }
-  memset(Shellf, 0, sizeof(double) * NN * num_var);
+  double *shellf;
  shellf = new double[NN * num_var];
  memset(shellf, 0, sizeof(double) * NN * num_var);
-  // owner_rank[j] records which MPI rank owns point j
+  // we use weight to monitor code, later some day we can move it for optimization
-  // All ranks traverse the same block list so they all agree on ownership
+  int *weight;
-  int *owner_rank;
+  weight = new int[NN];
-  owner_rank = new int[NN];
+  memset(weight, 0, sizeof(int) * NN);
-  for (int j = 0; j < NN; j++)
+
-    owner_rank[j] = -1;
+  double *DH, *llb, *uub;
  DH = new double[dim];
  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
  {
    DH[i] = getdX(i);
  }
  llb = new double[dim];
  uub = new double[dim];
  for (int j = 0; j < NN; j++) // run along points
  {
@@ -401,6 +403,12 @@ void Patch::Interp_Points(MyList<var> *VarList,
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
 // NOTE: our dividing structure is (exclude ghost)
 // -1 0
 //       1  2
 // so (0,1) does not belong to any part for vertex structure
 // here we put (0,0.5) to left part and (0.5,1) to right part
 // BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
@@ -425,7 +433,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          //---> interpolation
@@ -433,11 +440,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
          int k = 0;
          while (varl) // run along variables
          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+            //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
            //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
          weight[j] = 1;
        }
      }
      if (Bp == ble)
@@ -446,128 +456,61 @@ void Patch::Interp_Points(MyList<var> *VarList,
    }
  }
-  // Replace MPI_Allreduce with per-owner MPI_Bcast:
+  MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-  // Group consecutive points by owner rank and broadcast each group.
+  int *Weight;
-  // Since each point's data is non-zero only on the owner rank,
+  Weight = new int[NN];
-  // Bcast from owner is equivalent to Allreduce(MPI_SUM) but much cheaper.
+  MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  //  misc::tillherecheck("print me");
  for (int i = 0; i < NN; i++)
  {
-    int j = 0;
+    if (Weight[i] > 1)
    while (j < NN)
    {
      int cur_owner = owner_rank[j];
      if (cur_owner < 0)
    {
      if (myrank == 0)
        cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
      for (int j = 0; j < num_var; j++)
        Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
    }
    else if (Weight[i] == 0 && myrank == 0)
    {
      cout << "ERROR: Patch::Interp_Points fails to find point (";
-          for (int d = 0; d < dim; d++)
+      for (int j = 0; j < dim; j++)
      {
-            cout << XX[d][j];
+        cout << XX[j][i];
-            if (d < dim - 1)
+        if (j < dim - 1)
          cout << ",";
        else
          cout << ")";
      }
      cout << " on Patch (";
-          for (int d = 0; d < dim; d++)
+      for (int j = 0; j < dim; j++)
      {
-            cout << bbox[d] << "+" << lli[d] * DH[d];
+        cout << bbox[j] << "+" << lli[j] * getdX(j);
-            if (d < dim - 1)
+        if (j < dim - 1)
          cout << ",";
        else
          cout << ")--";
      }
      cout << "(";
-          for (int d = 0; d < dim; d++)
+      for (int j = 0; j < dim; j++)
      {
-            cout << bbox[dim + d] << "-" << uui[d] * DH[d];
+        cout << bbox[dim + j] << "-" << uui[j] * getdX(j);
-            if (d < dim - 1)
+        if (j < dim - 1)
          cout << ",";
        else
          cout << ")" << endl;
      }
-          MPI_Abort(MPI_COMM_WORLD, 1);
+#if 0
-        }
+       checkBlock();
-        j++;
+#else
-        continue;
+      cout << "splited domains:" << endl;
      }
      // Find contiguous run of points with the same owner
      int jstart = j;
      while (j < NN && owner_rank[j] == cur_owner)
        j++;
      int count = (j - jstart) * num_var;
      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
    }
  }
  delete[] owner_rank;
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
                          double *Shellf, int Symmetry,
                          int Nmin_consumer, int Nmax_consumer)
      {
  // Targeted point-to-point overload: each owner sends each point only to
  // the one rank that needs it for integration (consumer), reducing
  // communication volume by ~nprocs times compared to the Bcast version.
 #ifdef INTERP_LB_PROFILE
  double t_interp_start = MPI_Wtime();
 #endif
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  int ordn = 2 * ghost_width;
  MyList<var> *varl;
  int num_var = 0;
  varl = VarList;
  while (varl)
  {
    num_var++;
    varl = varl->next;
  }
  memset(Shellf, 0, sizeof(double) * NN * num_var);
  // owner_rank[j] records which MPI rank owns point j
  int *owner_rank;
  owner_rank = new int[NN];
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;
  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
  // --- Interpolation phase (identical to original) ---
  for (int j = 0; j < NN; j++)
  {
    double pox[dim];
    for (int i = 0; i < dim; i++)
    {
      pox[i] = XX[i][j];
      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
      {
        cout << "Patch::Interp_Points: point (";
        for (int k = 0; k < dim; k++)
        {
          cout << XX[k][j];
          if (k < dim - 1)
            cout << ",";
          else
            cout << ") is out of current Patch." << endl;
        }
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
        MyList<Block> *Bp = blb;
-    bool notfind = true;
+        while (Bp)
    while (notfind && Bp)
        {
          Block *BP = Bp->data;
      bool flag = true;
          for (int i = 0; i < dim; i++)
          {
 #ifdef Vertex
@@ -584,222 +527,32 @@ void Patch::Interp_Points(MyList<var> *VarList,
 #error Not define Vertex nor Cell
 #endif
 #endif
        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
        {
          flag = false;
          break;
          }
-      }
+          cout << "(";
-
+          for (int j = 0; j < dim; j++)
      if (flag)
          {
-        notfind = false;
+            cout << llb[j] << ":" << uub[j];
-        owner_rank[j] = BP->rank;
+            if (j < dim - 1)
-        if (myrank == BP->rank)
+              cout << ",";
-        {
+            else
-          varl = VarList;
+              cout << ")" << endl;
          int k = 0;
          while (varl)
          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
          }
          if (Bp == ble)
            break;
          Bp = Bp->next;
        }
      }
 #ifdef INTERP_LB_PROFILE
  double t_interp_end = MPI_Wtime();
  double t_interp_local = t_interp_end - t_interp_start;
 #endif
  // --- Error check for unfound points ---
  for (int j = 0; j < NN; j++)
  {
    if (owner_rank[j] < 0 && myrank == 0)
    {
      cout << "ERROR: Patch::Interp_Points fails to find point (";
      for (int d = 0; d < dim; d++)
      {
        cout << XX[d][j];
        if (d < dim - 1)
          cout << ",";
        else
          cout << ")";
      }
      cout << " on Patch (";
      for (int d = 0; d < dim; d++)
      {
        cout << bbox[d] << "+" << lli[d] * DH[d];
        if (d < dim - 1)
          cout << ",";
        else
          cout << ")--";
      }
      cout << "(";
      for (int d = 0; d < dim; d++)
      {
        cout << bbox[dim + d] << "-" << uui[d] * DH[d];
        if (d < dim - 1)
          cout << ",";
        else
          cout << ")" << endl;
      }
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
-  // --- Targeted point-to-point communication phase ---
+  delete[] shellf;
-  // Compute consumer_rank[j] using the same deterministic formula as surface_integral
+  delete[] weight;
-  int *consumer_rank = new int[NN];
+  delete[] Weight;
-  {
+  delete[] DH;
-    int mp = NN / nprocs;
+  delete[] llb;
-    int Lp = NN - nprocs * mp;
+  delete[] uub;
    for (int j = 0; j < NN; j++)
    {
      if (j < Lp * (mp + 1))
        consumer_rank[j] = j / (mp + 1);
      else
        consumer_rank[j] = Lp + (j - Lp * (mp + 1)) / mp;
    }
  }
  // Count sends and recvs per rank
  int *send_count = new int[nprocs];
  int *recv_count = new int[nprocs];
  memset(send_count, 0, sizeof(int) * nprocs);
  memset(recv_count, 0, sizeof(int) * nprocs);
  for (int j = 0; j < NN; j++)
  {
    int own = owner_rank[j];
    int con = consumer_rank[j];
    if (own == con)
      continue; // local — no communication needed
    if (own == myrank)
      send_count[con]++;
    if (con == myrank)
      recv_count[own]++;
  }
  // Build send buffers: for each destination rank, pack (index, data) pairs
  // Each entry: 1 int (point index j) + num_var doubles
  int total_send = 0, total_recv = 0;
  int *send_offset = new int[nprocs];
  int *recv_offset = new int[nprocs];
  for (int r = 0; r < nprocs; r++)
  {
    send_offset[r] = total_send;
    total_send += send_count[r];
    recv_offset[r] = total_recv;
    total_recv += recv_count[r];
  }
  // Pack send buffers: each message contains (j, data[0..num_var-1]) per point
  int stride = 1 + num_var; // 1 double for index + num_var doubles for data
  double *sendbuf = new double[total_send * stride];
  double *recvbuf = new double[total_recv * stride];
  // Temporary counters for packing
  int *pack_pos = new int[nprocs];
  memset(pack_pos, 0, sizeof(int) * nprocs);
  for (int j = 0; j < NN; j++)
  {
    int own = owner_rank[j];
    int con = consumer_rank[j];
    if (own != myrank || con == myrank)
      continue;
    int pos = (send_offset[con] + pack_pos[con]) * stride;
    sendbuf[pos] = (double)j; // point index
    for (int v = 0; v < num_var; v++)
      sendbuf[pos + 1 + v] = Shellf[j * num_var + v];
    pack_pos[con]++;
  }
  // Post non-blocking recvs and sends
  int n_req = 0;
  for (int r = 0; r < nprocs; r++)
  {
    if (recv_count[r] > 0) n_req++;
    if (send_count[r] > 0) n_req++;
  }
  MPI_Request *reqs = new MPI_Request[n_req];
  int req_idx = 0;
  for (int r = 0; r < nprocs; r++)
  {
    if (recv_count[r] > 0)
    {
      MPI_Irecv(recvbuf + recv_offset[r] * stride,
                recv_count[r] * stride, MPI_DOUBLE,
                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
    }
  }
  for (int r = 0; r < nprocs; r++)
  {
    if (send_count[r] > 0)
    {
      MPI_Isend(sendbuf + send_offset[r] * stride,
                send_count[r] * stride, MPI_DOUBLE,
                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
    }
  }
  if (n_req > 0)
    MPI_Waitall(n_req, reqs, MPI_STATUSES_IGNORE);
  // Unpack recv buffers into Shellf
  for (int i = 0; i < total_recv; i++)
  {
    int pos = i * stride;
    int j = (int)recvbuf[pos];
    for (int v = 0; v < num_var; v++)
      Shellf[j * num_var + v] = recvbuf[pos + 1 + v];
  }
  delete[] reqs;
  delete[] sendbuf;
  delete[] recvbuf;
  delete[] pack_pos;
  delete[] send_offset;
  delete[] recv_offset;
  delete[] send_count;
  delete[] recv_count;
  delete[] consumer_rank;
  delete[] owner_rank;
 #ifdef INTERP_LB_PROFILE
  {
    static bool profile_written = false;
    if (!profile_written) {
      double *all_times = nullptr;
      if (myrank == 0) all_times = new double[nprocs];
      MPI_Gather(&t_interp_local, 1, MPI_DOUBLE,
                 all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
      if (myrank == 0) {
        int heavy[64];
        int nh = InterpLBProfile::identify_heavy_ranks(
            all_times, nprocs, 2.5, heavy, 64);
        InterpLBProfile::write_profile(
            "interp_lb_profile.bin", nprocs,
            all_times, heavy, nh, 2.5);
        printf("[InterpLB] Profile written: %d heavy ranks\n", nh);
        for (int i = 0; i < nh; i++)
          printf("  Heavy rank %d: %.6f s\n", heavy[i], all_times[heavy[i]]);
        delete[] all_times;
      }
      profile_written = true;
    }
  }
 #endif
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
@@ -820,22 +573,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
    varl = varl->next;
  }
-  memset(Shellf, 0, sizeof(double) * NN * num_var);
+  double *shellf;
  shellf = new double[NN * num_var];
  memset(shellf, 0, sizeof(double) * NN * num_var);
-  // owner_rank[j] stores the global rank that owns point j
+  // we use weight to monitor code, later some day we can move it for optimization
-  int *owner_rank;
+  int *weight;
-  owner_rank = new int[NN];
+  weight = new int[NN];
-  for (int j = 0; j < NN; j++)
+  memset(weight, 0, sizeof(int) * NN);
    owner_rank[j] = -1;
-  // Build global-to-local rank translation for Comm_here
+  double *DH, *llb, *uub;
-  MPI_Group world_group, local_group;
+  DH = new double[dim];
  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
  MPI_Comm_group(Comm_here, &local_group);
  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
  {
    DH[i] = getdX(i);
  }
  llb = new double[dim];
  uub = new double[dim];
  for (int j = 0; j < NN; j++) // run along points
  {
@@ -867,6 +622,12 @@ void Patch::Interp_Points(MyList<var> *VarList,
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
 // NOTE: our dividing structure is (exclude ghost)
 // -1 0
 //       1  2
 // so (0,1) does not belong to any part for vertex structure
 // here we put (0,0.5) to left part and (0.5,1) to right part
 // BUT for cell structure the bbox is (-1.5,0.5) and (0.5,2.5), there is no missing region at all
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
@@ -891,7 +652,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          //---> interpolation
@@ -899,11 +659,14 @@ void Patch::Interp_Points(MyList<var> *VarList,
          int k = 0;
          while (varl) // run along variables
          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+            //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
            //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
          weight[j] = 1;
        }
      }
      if (Bp == ble)
@@ -912,35 +675,97 @@ void Patch::Interp_Points(MyList<var> *VarList,
    }
  }
-  // Collect unique global owner ranks and translate to local ranks in Comm_here
+  MPI_Allreduce(shellf, Shellf, NN * num_var, MPI_DOUBLE, MPI_SUM, Comm_here);
-  // Then broadcast each owner's points via MPI_Bcast on Comm_here
+  int *Weight;
-  {
+  Weight = new int[NN];
-    int j = 0;
+  MPI_Allreduce(weight, Weight, NN, MPI_INT, MPI_SUM, Comm_here);
    while (j < NN)
    {
      int cur_owner_global = owner_rank[j];
      if (cur_owner_global < 0)
      {
        // Point not found — skip (error check disabled for sub-communicator levels)
        j++;
        continue;
      }
      // Translate global rank to local rank in Comm_here
      int cur_owner_local;
      MPI_Group_translate_ranks(world_group, 1, &cur_owner_global, local_group, &cur_owner_local);
-      // Find contiguous run of points with the same owner
+  //  misc::tillherecheck("print me");
-      int jstart = j;
+  //  if(lmyrank == 0) cout<<"myrank = "<<myrank<<"print me"<<endl;
-      while (j < NN && owner_rank[j] == cur_owner_global)
+
-        j++;
+  for (int i = 0; i < NN; i++)
-      int count = (j - jstart) * num_var;
+  {
-      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner_local, Comm_here);
+    if (Weight[i] > 1)
    {
      if (lmyrank == 0)
        cout << "WARNING: Patch::Interp_Points meets multiple weight" << endl;
      for (int j = 0; j < num_var; j++)
        Shellf[j + i * num_var] = Shellf[j + i * num_var] / Weight[i];
    }
 #if 0 // for not involved levels, this may fail     
     else if(Weight[i] == 0 && lmyrank == 0)
     {
       cout<<"ERROR: Patch::Interp_Points fails to find point (";
       for(int j=0;j<dim;j++)
       {
 	  cout<<XX[j][i];
 	  if(j<dim-1) cout<<",";
 	  else        cout<<")";
       }
       cout<<" on Patch (";
       for(int j=0;j<dim;j++)
       {
 	  cout<<bbox[j]<<"+"<<lli[j]*getdX(j);
 	  if(j<dim-1) cout<<",";
 	  else        cout<<")--";
       }
       cout<<"(";
       for(int j=0;j<dim;j++)
       {
 	  cout<<bbox[dim+j]<<"-"<<uui[j]*getdX(j);
 	  if(j<dim-1) cout<<",";
 	  else        cout<<")"<<endl;
       }
 #if 0
       checkBlock();
 #else
  cout<<"splited domains:"<<endl;
  {
     MyList<Block> *Bp=blb;
     while(Bp)
     {
 	Block *BP=Bp->data;
 	for(int i=0;i<dim;i++)
 	{
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif    
          llb[i] = (feq(BP->bbox[i]    ,bbox[i]    ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i]     : BP->bbox[i]    +(ghost_width-0.5)*DH[i];
          uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-(ghost_width-0.5)*DH[i];
 #else
 #ifdef Cell
          llb[i] = (feq(BP->bbox[i]    ,bbox[i]    ,DH[i]/2)) ? BP->bbox[i]+lli[i]*DH[i]     : BP->bbox[i]    +ghost_width*DH[i];
          uub[i] = (feq(BP->bbox[dim+i],bbox[dim+i],DH[i]/2)) ? BP->bbox[dim+i]-uui[i]*DH[i] : BP->bbox[dim+i]-ghost_width*DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif 
 	}       
       cout<<"(";
       for(int j=0;j<dim;j++)
       {
 	  cout<<llb[j]<<":"<<uub[j];
 	  if(j<dim-1) cout<<",";
 	  else        cout<<")"<<endl;
       }
 	if(Bp == ble) break;
 	Bp=Bp->next;
     }
  }
 #endif       
       MPI_Abort(MPI_COMM_WORLD,1);
     }
 #endif
  }
-  MPI_Group_free(&world_group);
+  delete[] shellf;
-  MPI_Group_free(&local_group);
+  delete[] weight;
-  delete[] owner_rank;
+  delete[] Weight;
  delete[] DH;
  delete[] llb;
  delete[] uub;
 }
 void Patch::checkBlock()
 {
--- a/AMSS_NCKU_source/MPatch.h
+++ b/AMSS_NCKU_source/MPatch.h
@@ -39,10 +39,6 @@ public:
   bool Find_Point(double *XX);
   void Interp_Points(MyList<var> *VarList,
                      int NN, double **XX,
                      double *Shellf, int Symmetry,
                      int Nmin_consumer, int Nmax_consumer);
   void Interp_Points(MyList<var> *VarList,
                      int NN, double **XX,
                      double *Shellf, int Symmetry, MPI_Comm Comm_here);
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -32,16 +32,6 @@ namespace Parallel
  int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
  int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
  MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
  MyList<Block> *distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0);
  Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
                           int ib0_orig, int ib3_orig,
                           int jb1_orig, int jb4_orig,
                           int kb2_orig, int kb5_orig,
                           Patch* PP, int r_left, int r_right,
                           int ingfsi, int fngfsi, bool periodic,
                           Block* &split_first_block, Block* &split_last_block);
  Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
                           int block_id, int ingfsi, int fngfsi, int lev);
  void KillBlocks(MyList<Patch> *PatchLIST);
  void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
@@ -91,43 +81,6 @@ namespace Parallel
                   int Symmetry);
  void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
  void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
  void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
  struct SyncCache {
    bool valid;
    int cpusize;
    MyList<gridseg> **combined_src;
    MyList<gridseg> **combined_dst;
    int *send_lengths;
    int *recv_lengths;
    double **send_bufs;
    double **recv_bufs;
    int *send_buf_caps;
    int *recv_buf_caps;
    MPI_Request *reqs;
    MPI_Status *stats;
    int max_reqs;
    bool lengths_valid;
    SyncCache();
    void invalidate();
    void destroy();
  };
  void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
  void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);
  struct AsyncSyncState {
    int req_no;
    bool active;
    AsyncSyncState() : req_no(0), active(false) {}
  };
  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
                  SyncCache &cache, AsyncSyncState &state);
  void Sync_finish(SyncCache &cache, AsyncSyncState &state,
                   MyList<var> *VarList, int Symmetry);
  void OutBdLow2Hi(Patch *Patc, Patch *Patf,
                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                   int Symmetry);
@@ -140,15 +93,6 @@ namespace Parallel
  void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                      int Symmetry);
  void Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);
  void OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                          MyList<var> *VarList1, MyList<var> *VarList2,
                          int Symmetry, SyncCache &cache);
  void OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                             MyList<var> *VarList1, MyList<var> *VarList2,
                             int Symmetry, SyncCache &cache);
  void Prolong(Patch *Patc, Patch *Patf,
               MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
               int Symmetry);
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
--- a/AMSS_NCKU_source/TwoPunctures.h
+++ b/AMSS_NCKU_source/TwoPunctures.h
@@ -1,8 +1,7 @@
 #ifndef TWO_PUNCTURES_H
 #define TWO_PUNCTURES_H
 #include <omp.h>
 #define StencilSize 19
 #define N_PlaneRelax 1
 #define NRELAX 200
@@ -43,18 +42,6 @@ private:
       int ntotal;
       // ===== Precomputed spectral derivative matrices =====
       double *D1_A, *D2_A;
       double *D1_B, *D2_B;
       double *DF1_phi, *DF2_phi;
       // ===== Pre-allocated workspace for LineRelax (per-thread) =====
       int max_threads;
       double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be;
       double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be;
       double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al;
       double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al;
       struct parameters
       {
              int nvar, n1, n2, n3;
@@ -71,28 +58,6 @@ public:
                    int Newtonmaxit);
       ~TwoPunctures();
       // 02/07: New/modified methods
       void allocate_workspace();
       void free_workspace();
       void precompute_derivative_matrices();
       void build_cheb_deriv_matrices(int n, double *D1, double *D2);
       void build_fourier_deriv_matrices(int N, double *DF1, double *DF2);
       void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v);
       void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q,
                                double *l, double *u_ws, double *d, double *y);
       void LineRelax_be_omp(double *dv,
                         int const i, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols, int **cols,
                         double **JFD, int tid);
       void LineRelax_al_omp(double *dv,
                         int const j, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols,
                         int **cols, double **JFD, int tid);
       void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
                  double const *rhs, int const *ncols, int **cols, double **JFD);
       void Solve();
       void set_initial_guess(derivs v);
       int index(int i, int j, int k, int l, int a, int b, int c, int d);
@@ -151,11 +116,23 @@ public:
       double BY_KKofxyz(double x, double y, double z);
       void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix);
       void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u);
       void relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
                  double const *rhs, int const *ncols, int **cols, double **JFD);
       void LineRelax_be(double *dv,
                         int const i, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols, int **cols,
                         double **JFD);
       void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
                         int n3, derivs dv, derivs u, double *values);
       void LinEquations(double A, double B, double X, double R,
                         double x, double r, double phi,
                         double y, double z, derivs dU, derivs U, double *values);
       void LineRelax_al(double *dv,
                         int const j, int const k, int const nvar,
                         int const n1, int const n2, int const n3,
                         double const *rhs, int const *ncols,
                         int **cols, double **JFD);
       void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q);
       void Save(char *fname);
       // provided by Vasileios Paschalidis (vpaschal@illinois.edu)
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -730,12 +730,6 @@ void bssn_class::Initialize()
    PhysTime = StartTime;
    Setup_Black_Hole_position();
  }
  // Initialize sync caches (per-level, for predictor and corrector)
  sync_cache_pre = new Parallel::SyncCache[GH->levels];
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
 }
 //================================================================================================
@@ -987,32 +981,6 @@ bssn_class::~bssn_class()
  delete Azzz;
 #endif
  // Destroy sync caches before GH
  if (sync_cache_pre)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_pre[i].destroy();
    delete[] sync_cache_pre;
  }
  if (sync_cache_cor)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_cor[i].destroy();
    delete[] sync_cache_cor;
  }
  if (sync_cache_rp_coarse)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_rp_coarse[i].destroy();
    delete[] sync_cache_rp_coarse;
  }
  if (sync_cache_rp_fine)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_rp_fine[i].destroy();
    delete[] sync_cache_rp_fine;
  }
  delete GH;
 #ifdef WithShell
  delete SH;
@@ -2213,7 +2181,6 @@ void bssn_class::Evolve(int Steps)
    GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
               SynchList_cor, OldStateList, StateList, SynchList_pre,
               fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
    for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2426,10 +2393,9 @@ void bssn_class::RecursiveStep(int lev)
 #endif
 #if (REGLEV == 0)
-  if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
+  GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
-                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
+                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 }
@@ -2605,10 +2571,9 @@ void bssn_class::ParallelStep()
  delete[] tporg;
  delete[] tporgo;
 #if (REGLEV == 0)
-  if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
+  GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
-                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
+                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 }
@@ -2772,10 +2737,9 @@ void bssn_class::ParallelStep()
      if (lev + 1 >= GH->movls)
      {
        //	       GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0,
-        if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
+        GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
                            SynchList_cor, OldStateList, StateList, SynchList_pre,
-                            fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
+                            fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
        //               a_stream.clear();
        //               a_stream.str("");
@@ -2787,10 +2751,9 @@ void bssn_class::ParallelStep()
    // for this level
    if (YN == 1)
    {
-      if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
+      GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                          SynchList_cor, OldStateList, StateList, SynchList_pre,
-                          fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
+                          fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
      //               a_stream.clear();
      //               a_stream.str("");
@@ -2806,10 +2769,9 @@ void bssn_class::ParallelStep()
        if (YN == 1)
        {
          //	   GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
-          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
+          GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
-                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
+                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
          //               a_stream.clear();
          //               a_stream.str("");
@@ -2822,10 +2784,9 @@ void bssn_class::ParallelStep()
        if (i % 4 == 3)
        {
          //	   GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
-          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
+          GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
-                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
+                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
          //               a_stream.clear();
          //               a_stream.str("");
@@ -3197,7 +3158,21 @@ void bssn_class::Step(int lev, int YN)
    }
    Pp = Pp->next;
  }
-  // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+  // check error information
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #ifdef WithShell
  // evolve Shell Patches
@@ -3341,16 +3316,25 @@ void bssn_class::Step(int lev, int YN)
 #endif
  }
-  // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+  // check error information
  MPI_Request err_req;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
-  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -3369,23 +3353,6 @@ void bssn_class::Step(int lev, int YN)
    }
  }
 #endif
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #ifdef WithShell
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
 #if (MAPBH == 0)
  // for black hole position
@@ -3561,7 +3528,24 @@ void bssn_class::Step(int lev, int YN)
      Pp = Pp->next;
    }
-    // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+    // check error information
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime 
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #ifdef WithShell
    // evolve Shell Patches
@@ -3701,16 +3685,26 @@ void bssn_class::Step(int lev, int YN)
        sPp = sPp->next;
      }
    }
-    // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" 
                                << iter_count << " variables at t = " 
                                << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
-    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -3729,25 +3723,6 @@ void bssn_class::Step(int lev, int YN)
      }
    }
 #endif
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #ifdef WithShell
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
 #if (MAPBH == 0)
    // for black hole position
@@ -4059,7 +4034,22 @@ void bssn_class::Step(int lev, int YN)
    }
    Pp = Pp->next;
  }
-  // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+  // check error information
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime 
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #ifdef WithShell
  // evolve Shell Patches
@@ -4200,16 +4190,25 @@ void bssn_class::Step(int lev, int YN)
  }
 #endif
  }
-  // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+  // check error information
  MPI_Request err_req;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " 
                              << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
-  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -4228,24 +4227,6 @@ void bssn_class::Step(int lev, int YN)
    }
  }
 #endif
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #ifdef WithShell
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
 #endif
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -4405,7 +4386,23 @@ void bssn_class::Step(int lev, int YN)
      Pp = Pp->next;
    }
-    // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
+    // check error information
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime 
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #ifdef WithShell
    // evolve Shell Patches
@@ -4545,16 +4542,25 @@ void bssn_class::Step(int lev, int YN)
        sPp = sPp->next;
      }
    }
-    // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
-    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -4572,25 +4578,6 @@ void bssn_class::Step(int lev, int YN)
             << " seconds! " << endl;
      }
    }
 #endif
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #ifdef WithShell
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime
                                << ", lev = " << lev << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
 #endif
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
@@ -4956,19 +4943,11 @@ void bssn_class::Step(int lev, int YN)
  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Predictor rhs calculation");
-  // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+  // check error information
  MPI_Request err_req;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
  }
  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
@@ -4980,6 +4959,10 @@ void bssn_class::Step(int lev, int YN)
    }
  }
  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
 #if (MAPBH == 0)
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -5157,21 +5140,11 @@ void bssn_class::Step(int lev, int YN)
    //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector error check");
-    // Non-blocking error reduction overlapped with Sync to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev], &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, GH->Commlev[lev]);
    }
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
@@ -5185,6 +5158,12 @@ void bssn_class::Step(int lev, int YN)
      }
    }
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
 #if (MAPBH == 0)
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
@@ -5468,11 +5447,21 @@ void bssn_class::SHStep()
 #if (PSTR == 1 || PSTR == 2)
 //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor's error check");
 #endif
-  // Non-blocking error reduction overlapped with Synch to hide Allreduce latency
+  // check error information
  MPI_Request err_req;
  {
    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
+    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
  {
@@ -5490,19 +5479,6 @@ void bssn_class::SHStep()
    }
  }
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
  if (ERROR)
  {
    SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
    if (myrank == 0)
    {
      if (ErrorMonitor->outfile)
        ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
  // corrector
  for (iter_count = 1; iter_count < 4; iter_count++)
  {
@@ -5645,11 +5621,21 @@ void bssn_class::SHStep()
        sPp = sPp->next;
      }
    }
-    // Non-blocking error reduction overlapped with Synch to hide Allreduce latency
+    // check error information
    MPI_Request err_req_cor;
    {
      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
+      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count 
                                << " variables at t = " << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    {
@@ -5667,20 +5653,6 @@ void bssn_class::SHStep()
      }
    }
    // Complete non-blocking error reduction and check
    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
    if (ERROR)
    {
      SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
      if (myrank == 0)
      {
        if (ErrorMonitor->outfile)
          ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
                                << " variables at t = " << PhysTime << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    sPp = SH->PatL;
    while (sPp)
    {
@@ -5809,7 +5781,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //       misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #if (PSTR == 1 || PSTR == 2)
 //       a_stream.clear();
@@ -5819,11 +5791,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5860,7 +5842,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //       misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
 #if (PSTR == 1 || PSTR == 2)
 //       a_stream.clear();
@@ -5870,11 +5852,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -5888,7 +5880,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
    }
-    Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
+    Parallel::Sync(GH->PatL[lev], SL, Symmetry);
 #if (PSTR == 1 || PSTR == 2)
 //    a_stream.clear();
@@ -5946,14 +5938,24 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5968,21 +5970,31 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
 #endif
    }
-    Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
+    Parallel::Sync(GH->PatL[lev], SL, Symmetry);
  }
 }
@@ -6033,14 +6045,24 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6057,21 +6079,31 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
 #endif
    }
-    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
  }
 }
@@ -6101,11 +6133,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
      }
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6114,11 +6156,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
    else // no time refinement levels and for all same time levels
    {
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6134,10 +6186,10 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
 #else
      Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
 #endif
-      Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
+      Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
    }
-    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
  }
 }
 #undef MIXOUTB
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -126,11 +126,6 @@ public:
       MyList<var> *OldStateList, *DumpList;
       MyList<var> *ConstraintList;
       Parallel::SyncCache *sync_cache_pre;  // per-level cache for predictor sync
       Parallel::SyncCache *sync_cache_cor;  // per-level cache for corrector sync
       Parallel::SyncCache *sync_cache_rp_coarse;  // RestrictProlong sync on PatL[lev-1]
       Parallel::SyncCache *sync_cache_rp_fine;    // RestrictProlong sync on PatL[lev]
       monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
       monitor *ConVMonitor;
       surface_integral *Waveshell;
--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
@@ -61,7 +61,9 @@
  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: ham_Res, movx_Res, movy_Res, movz_Res
  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
 !  gont = 0: success; gont = 1: something wrong
-  integer::gont
+  integer::gont,i,j,k
  real*8 :: val1, val2
  real*8 :: det, t_gupxx, t_gupxy, t_gupxz, t_gupyy, t_gupyz, t_gupzz
 !~~~~~~> Other variables:
@@ -84,7 +86,10 @@
  real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz
  real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
-  real*8            :: dX, dY, dZ, PI
+  real*8            :: PI
 #if (DEBUG_NAN_CHECK)
  real*8            :: dX
 #endif
  real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
  real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
  real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
@@ -106,8 +111,8 @@
  call getpbh(BHN,Porg,Mass)
 #endif
-!!! sanity check (disabled in production builds for performance)
+#if (DEBUG_NAN_CHECK)
-#ifdef DEBUG
+!!! sanity check
  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
@@ -141,10 +146,6 @@
  PI = dacos(-ONE)
  dX = X(2) - X(1)
  dY = Y(2) - Y(1)
  dZ = Z(2) - Z(1)
  alpn1 = Lap + ONE
  chin1 = chi + ONE
  gxx = dxx + ONE
@@ -158,15 +159,15 @@
  div_beta = betaxx + betayy + betazz
  call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
  chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi
  call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
  call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
  call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
  call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
  call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
  call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
  call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
-  call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
+
  chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi
  gxx_rhs = - TWO * alpn1 * Axx    -  F2o3 * gxx * div_beta          + &
              TWO *(  gxx * betaxx +   gxy * betayx +   gxz * betazx)
@@ -192,71 +193,99 @@
                                       gyz * betayx +   gzz * betazx   &
                                                    -   gxz * betayy     !rhs for gij
-! invert tilted metric
+! fused loop for metric inversion and connections
-  gupzz =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
+  !DIR$ SIMD
-           gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
+  do k=1,ex(3)
-  gupxx =   ( gyy * gzz - gyz * gyz ) / gupzz
+  do j=1,ex(2)
-  gupxy = - ( gxy * gzz - gyz * gxz ) / gupzz
+  do i=1,ex(1)
-  gupxz =   ( gxy * gyz - gyy * gxz ) / gupzz
+     ! 1. Metric Inversion
-  gupyy =   ( gxx * gzz - gxz * gxz ) / gupzz
+     det = ONE / ( &
-  gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz
+            gxx(i,j,k) * gyy(i,j,k) * gzz(i,j,k) + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) + &
-  gupzz =   ( gxx * gyy - gxy * gxy ) / gupzz
+            gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) - gxz(i,j,k) * gyy(i,j,k) * gxz(i,j,k) - &
            gxy(i,j,k) * gxy(i,j,k) * gzz(i,j,k) - gxx(i,j,k) * gyz(i,j,k) * gyz(i,j,k) )
     t_gupxx =   ( gyy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gyz(i,j,k) ) * det
     t_gupxy = - ( gxy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gxz(i,j,k) ) * det
     t_gupxz =   ( gxy(i,j,k) * gyz(i,j,k) - gyy(i,j,k) * gxz(i,j,k) ) * det
     t_gupyy =   ( gxx(i,j,k) * gzz(i,j,k) - gxz(i,j,k) * gxz(i,j,k) ) * det
     t_gupyz = - ( gxx(i,j,k) * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) * det
     t_gupzz =   ( gxx(i,j,k) * gyy(i,j,k) - gxy(i,j,k) * gxy(i,j,k) ) * det
     gupxx(i,j,k) = t_gupxx
     gupxy(i,j,k) = t_gupxy
     gupxz(i,j,k) = t_gupxz
     gupyy(i,j,k) = t_gupyy
     gupyz(i,j,k) = t_gupyz
     gupzz(i,j,k) = t_gupzz
     if(co == 0)then
-! Gam^i_Res = Gam^i + gup^ij_,j
+        Gmx_Res(i,j,k) = Gamx(i,j,k) - (t_gupxx*(t_gupxx*gxxx(i,j,k)+t_gupxy*gxyx(i,j,k)+t_gupxz*gxzx(i,j,k))&
-  Gmx_Res = Gamx - (gupxx*(gupxx*gxxx+gupxy*gxyx+gupxz*gxzx)&
+                         +t_gupxy*(t_gupxx*gxyx(i,j,k)+t_gupxy*gyyx(i,j,k)+t_gupxz*gyzx(i,j,k))&
-                   +gupxy*(gupxx*gxyx+gupxy*gyyx+gupxz*gyzx)&
+                         +t_gupxz*(t_gupxx*gxzx(i,j,k)+t_gupxy*gyzx(i,j,k)+t_gupxz*gzzx(i,j,k))&
-                   +gupxz*(gupxx*gxzx+gupxy*gyzx+gupxz*gzzx)&
+                         +t_gupxx*(t_gupxy*gxxy(i,j,k)+t_gupyy*gxyy(i,j,k)+t_gupyz*gxzy(i,j,k))&
-                   +gupxx*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
+                         +t_gupxy*(t_gupxy*gxyy(i,j,k)+t_gupyy*gyyy(i,j,k)+t_gupyz*gyzy(i,j,k))&
-                   +gupxy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+                         +t_gupxz*(t_gupxy*gxzy(i,j,k)+t_gupyy*gyzy(i,j,k)+t_gupyz*gzzy(i,j,k))&
-                   +gupxz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+                         +t_gupxx*(t_gupxz*gxxz(i,j,k)+t_gupyz*gxyz(i,j,k)+t_gupzz*gxzz(i,j,k))&
-                   +gupxx*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+                         +t_gupxy*(t_gupxz*gxyz(i,j,k)+t_gupyz*gyyz(i,j,k)+t_gupzz*gyzz(i,j,k))&
-                   +gupxy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+                         +t_gupxz*(t_gupxz*gxzz(i,j,k)+t_gupyz*gyzz(i,j,k)+t_gupzz*gzzz(i,j,k)))
-                   +gupxz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
+        Gmy_Res(i,j,k) = Gamy(i,j,k) - (t_gupxx*(t_gupxy*gxxx(i,j,k)+t_gupyy*gxyx(i,j,k)+t_gupyz*gxzx(i,j,k))&
-  Gmy_Res = Gamy - (gupxx*(gupxy*gxxx+gupyy*gxyx+gupyz*gxzx)&
+                         +t_gupxy*(t_gupxy*gxyx(i,j,k)+t_gupyy*gyyx(i,j,k)+t_gupyz*gyzx(i,j,k))&
-                   +gupxy*(gupxy*gxyx+gupyy*gyyx+gupyz*gyzx)&
+                         +t_gupxz*(t_gupxy*gxzx(i,j,k)+t_gupyy*gyzx(i,j,k)+t_gupyz*gzzx(i,j,k))&
-                   +gupxz*(gupxy*gxzx+gupyy*gyzx+gupyz*gzzx)&
+                         +t_gupxy*(t_gupxy*gxxy(i,j,k)+t_gupyy*gxyy(i,j,k)+t_gupyz*gxzy(i,j,k))&
-                   +gupxy*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
+                         +t_gupyy*(t_gupxy*gxyy(i,j,k)+t_gupyy*gyyy(i,j,k)+t_gupyz*gyzy(i,j,k))&
-                   +gupyy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+                         +t_gupyz*(t_gupxy*gxzy(i,j,k)+t_gupyy*gyzy(i,j,k)+t_gupyz*gzzy(i,j,k))&
-                   +gupyz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+                         +t_gupxy*(t_gupxz*gxxz(i,j,k)+t_gupyz*gxyz(i,j,k)+t_gupzz*gxzz(i,j,k))&
-                   +gupxy*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+                         +t_gupyy*(t_gupxz*gxyz(i,j,k)+t_gupyz*gyyz(i,j,k)+t_gupzz*gyzz(i,j,k))&
-                   +gupyy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+                         +t_gupyz*(t_gupxz*gxzz(i,j,k)+t_gupyz*gyzz(i,j,k)+t_gupzz*gzzz(i,j,k)))
-                   +gupyz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
+        Gmz_Res(i,j,k) = Gamz(i,j,k) - (t_gupxx*(t_gupxz*gxxx(i,j,k)+t_gupyz*gxyx(i,j,k)+t_gupzz*gxzx(i,j,k))&
-  Gmz_Res = Gamz - (gupxx*(gupxz*gxxx+gupyz*gxyx+gupzz*gxzx)&
+                         +t_gupxy*(t_gupxz*gxyx(i,j,k)+t_gupyz*gyyx(i,j,k)+t_gupzz*gyzx(i,j,k))&
-                   +gupxy*(gupxz*gxyx+gupyz*gyyx+gupzz*gyzx)&
+                         +t_gupxz*(t_gupxz*gxzx(i,j,k)+t_gupyz*gyzx(i,j,k)+t_gupzz*gzzx(i,j,k))&
-                   +gupxz*(gupxz*gxzx+gupyz*gyzx+gupzz*gzzx)&
+                         +t_gupxy*(t_gupxz*gxxy(i,j,k)+t_gupyz*gxyy(i,j,k)+t_gupzz*gxzy(i,j,k))&
-                   +gupxy*(gupxz*gxxy+gupyz*gxyy+gupzz*gxzy)&
+                         +t_gupyy*(t_gupxz*gxyy(i,j,k)+t_gupyz*gyyy(i,j,k)+t_gupzz*gyzy(i,j,k))&
-                   +gupyy*(gupxz*gxyy+gupyz*gyyy+gupzz*gyzy)&
+                         +t_gupyz*(t_gupxz*gxzy(i,j,k)+t_gupyz*gyzy(i,j,k)+t_gupzz*gzzy(i,j,k))&
-                   +gupyz*(gupxz*gxzy+gupyz*gyzy+gupzz*gzzy)&
+                         +t_gupxz*(t_gupxz*gxxz(i,j,k)+t_gupyz*gxyz(i,j,k)+t_gupzz*gxzz(i,j,k))&
-                   +gupxz*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+                         +t_gupyz*(t_gupxz*gxyz(i,j,k)+t_gupyz*gyyz(i,j,k)+t_gupzz*gyzz(i,j,k))&
-                   +gupyz*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+                         +t_gupzz*(t_gupxz*gxzz(i,j,k)+t_gupyz*gyzz(i,j,k)+t_gupzz*gzzz(i,j,k)))
                   +gupzz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
     endif
-! second kind of connection
+     ! 2. Christoffel Symbols
-  Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz ))
+     val1 = TWO * gxyx(i,j,k) - gxxy(i,j,k)
-  Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz ))
+     val2 = TWO * gxzx(i,j,k) - gxxz(i,j,k)
-  Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz ))
+     Gamxxx(i,j,k) =HALF*( t_gupxx*gxxx(i,j,k) + t_gupxy*val1 + t_gupxz*val2 )
     Gamyxx(i,j,k) =HALF*( t_gupxy*gxxx(i,j,k) + t_gupyy*val1 + t_gupyz*val2 )
     Gamzxx(i,j,k) =HALF*( t_gupxz*gxxx(i,j,k) + t_gupyz*val1 + t_gupzz*val2 )
-  Gamxyy =HALF*( gupxx*(TWO*gxyy - gyyx ) + gupxy*gyyy + gupxz*(TWO*gyzy - gyyz ))
+     val1 = TWO * gxyy(i,j,k) - gyyx(i,j,k)
-  Gamyyy =HALF*( gupxy*(TWO*gxyy - gyyx ) + gupyy*gyyy + gupyz*(TWO*gyzy - gyyz ))
+     val2 = TWO * gyzy(i,j,k) - gyyz(i,j,k)
-  Gamzyy =HALF*( gupxz*(TWO*gxyy - gyyx ) + gupyz*gyyy + gupzz*(TWO*gyzy - gyyz ))
+     Gamxyy(i,j,k) =HALF*( t_gupxx*val1 + t_gupxy*gyyy(i,j,k) + t_gupxz*val2 )
     Gamyyy(i,j,k) =HALF*( t_gupxy*val1 + t_gupyy*gyyy(i,j,k) + t_gupyz*val2 )
     Gamzyy(i,j,k) =HALF*( t_gupxz*val1 + t_gupyz*gyyy(i,j,k) + t_gupzz*val2 )
-  Gamxzz =HALF*( gupxx*(TWO*gxzz - gzzx ) + gupxy*(TWO*gyzz - gzzy ) + gupxz*gzzz)
+     val1 = TWO * gxzz(i,j,k) - gzzx(i,j,k)
-  Gamyzz =HALF*( gupxy*(TWO*gxzz - gzzx ) + gupyy*(TWO*gyzz - gzzy ) + gupyz*gzzz)
+     val2 = TWO * gyzz(i,j,k) - gzzy(i,j,k)
-  Gamzzz =HALF*( gupxz*(TWO*gxzz - gzzx ) + gupyz*(TWO*gyzz - gzzy ) + gupzz*gzzz)
+     Gamxzz(i,j,k) =HALF*( t_gupxx*val1 + t_gupxy*val2 + t_gupxz*gzzz(i,j,k) )
     Gamyzz(i,j,k) =HALF*( t_gupxy*val1 + t_gupyy*val2 + t_gupyz*gzzz(i,j,k) )
     Gamzzz(i,j,k) =HALF*( t_gupxz*val1 + t_gupyz*val2 + t_gupzz*gzzz(i,j,k) )
-  Gamxxy =HALF*( gupxx*gxxy + gupxy*gyyx + gupxz*( gxzy + gyzx - gxyz ) )
+     val1 = gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)
-  Gamyxy =HALF*( gupxy*gxxy + gupyy*gyyx + gupyz*( gxzy + gyzx - gxyz ) )
+     Gamxxy(i,j,k) =HALF*( t_gupxx*gxxy(i,j,k) + t_gupxy*gyyx(i,j,k) + t_gupxz*val1 )
-  Gamzxy =HALF*( gupxz*gxxy + gupyz*gyyx + gupzz*( gxzy + gyzx - gxyz ) )
+     Gamyxy(i,j,k) =HALF*( t_gupxy*gxxy(i,j,k) + t_gupyy*gyyx(i,j,k) + t_gupyz*val1 )
     Gamzxy(i,j,k) =HALF*( t_gupxz*gxxy(i,j,k) + t_gupyz*gyyx(i,j,k) + t_gupzz*val1 )
     val1 = gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)
     Gamxxz(i,j,k) =HALF*( t_gupxx*gxxz(i,j,k) + t_gupxy*val1 + t_gupxz*gzzx(i,j,k) )
     Gamyxz(i,j,k) =HALF*( t_gupxy*gxxz(i,j,k) + t_gupyy*val1 + t_gupyz*gzzx(i,j,k) )
     Gamzxz(i,j,k) =HALF*( t_gupxz*gxxz(i,j,k) + t_gupyz*val1 + t_gupzz*gzzx(i,j,k) )
     val1 = gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)
     Gamxyz(i,j,k) =HALF*( t_gupxx*val1 + t_gupxy*gyyz(i,j,k) + t_gupxz*gzzy(i,j,k) )
     Gamyyz(i,j,k) =HALF*( t_gupxy*val1 + t_gupyy*gyyz(i,j,k) + t_gupyz*gzzy(i,j,k) )
     Gamzyz(i,j,k) =HALF*( t_gupxz*val1 + t_gupyz*gyyz(i,j,k) + t_gupzz*gzzy(i,j,k) )
  enddo
  enddo
  enddo
  Gamxxz =HALF*( gupxx*gxxz + gupxy*( gxyz + gyzx - gxzy ) + gupxz*gzzx )
  Gamyxz =HALF*( gupxy*gxxz + gupyy*( gxyz + gyzx - gxzy ) + gupyz*gzzx )
  Gamzxz =HALF*( gupxz*gxxz + gupyz*( gxyz + gyzx - gxzy ) + gupzz*gzzx )
  Gamxyz =HALF*( gupxx*( gxyz + gxzy - gyzx ) + gupxy*gyyz + gupxz*gzzy )
  Gamyyz =HALF*( gupxy*( gxyz + gxzy - gyzx ) + gupyy*gyyz + gupyz*gzzy )
  Gamzyz =HALF*( gupxz*( gxyz + gxzy - gyzx ) + gupyz*gyyz + gupzz*gzzy )
 ! Raise indices of \tilde A_{ij} and store in R_ij
  Rxx =    gupxx * gupxx * Axx + gupxy * gupxy * Ayy + gupxz * gupxz * Azz + &
@@ -287,30 +316,40 @@
  call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
  ! reuse fxx/fxy/fxz as temporaries for matter-source combinations
  fxx = F2o3 * Kx + EIGHT * PI * Sx
  fxy = F2o3 * Ky + EIGHT * PI * Sy
  fxz = F2o3 * Kz + EIGHT * PI * Sz
  ! reuse Gamxa/Gamya/Gamza as temporaries for chix*R combinations
  Gamxa = chix * Rxx + chiy * Rxy + chiz * Rxz
  Gamya = chix * Rxy + chiy * Ryy + chiz * Ryz
  Gamza = chix * Rxz + chiy * Ryz + chiz * Rzz
   Gamx_rhs = - TWO * (   Lapx * Rxx +   Lapy * Rxy +   Lapz * Rxz ) + &
        TWO * alpn1 * (                                                &
-        -F3o2/chin1 * (   chix * Rxx +   chiy * Rxy +   chiz * Rxz ) - &
+        -F3o2 * ONE/chin1 * Gamxa - &
-              gupxx * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
+              gupxx * fxx - &
-              gupxy * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
+              gupxy * fxy - &
-              gupxz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
+              gupxz * fxz + &
                        Gamxxx * Rxx + Gamxyy * Ryy + Gamxzz * Rzz   + &
                TWO * ( Gamxxy * Rxy + Gamxxz * Rxz + Gamxyz * Ryz ) )
   Gamy_rhs = - TWO * (   Lapx * Rxy +   Lapy * Ryy +   Lapz * Ryz ) + &
        TWO * alpn1 * (                                                &
-        -F3o2/chin1 * (   chix * Rxy +  chiy * Ryy +    chiz * Ryz ) - &
+        -F3o2 * ONE/chin1 * Gamya - &
-              gupxy * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
+              gupxy * fxx - &
-              gupyy * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
+              gupyy * fxy - &
-              gupyz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
+              gupyz * fxz + &
                        Gamyxx * Rxx + Gamyyy * Ryy + Gamyzz * Rzz   + &
                TWO * ( Gamyxy * Rxy + Gamyxz * Rxz + Gamyyz * Ryz ) )
   Gamz_rhs = - TWO * (   Lapx * Rxz +   Lapy * Ryz +   Lapz * Rzz ) + &
        TWO * alpn1 * (                                                &
-        -F3o2/chin1 * (   chix * Rxz +  chiy * Ryz +    chiz * Rzz ) - &
+        -F3o2 * ONE/chin1 * Gamza - &
-              gupxz * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
+              gupxz * fxx - &
-              gupyz * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
+              gupyz * fxy - &
-              gupzz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
+              gupzz * fxz + &
                        Gamzxx * Rxx + Gamzyy * Ryy + Gamzzz * Rzz   + &
                TWO * ( Gamzxy * Rxy + Gamzxz * Rxz + Gamzyz * Ryz ) )
@@ -612,47 +651,47 @@
  fzz = fzz - Gamxzz * chix - Gamyzz * chiy - Gamzzz * chiz
 ! Store D^l D_l chi - 3/(2*chi) D^l chi D_l chi in f
-  f =        gupxx * ( fxx - F3o2/chin1 * chix * chix ) + &
+  f =        gupxx * ( fxx - F3o2 * ONE/chin1 * chix * chix ) + &
-             gupyy * ( fyy - F3o2/chin1 * chiy * chiy ) + &
+             gupyy * ( fyy - F3o2 * ONE/chin1 * chiy * chiy ) + &
-             gupzz * ( fzz - F3o2/chin1 * chiz * chiz ) + &
+             gupzz * ( fzz - F3o2 * ONE/chin1 * chiz * chiz ) + &
-       TWO * gupxy * ( fxy - F3o2/chin1 * chix * chiy ) + &
+       TWO * gupxy * ( fxy - F3o2 * ONE/chin1 * chix * chiy ) + &
-       TWO * gupxz * ( fxz - F3o2/chin1 * chix * chiz ) + &
+       TWO * gupxz * ( fxz - F3o2 * ONE/chin1 * chix * chiz ) + &
-       TWO * gupyz * ( fyz - F3o2/chin1 * chiy * chiz ) 
+       TWO * gupyz * ( fyz - F3o2 * ONE/chin1 * chiy * chiz ) 
 ! Add chi part to Ricci tensor:
-  Rxx = Rxx + (fxx - chix*chix/chin1/TWO + gxx * f)/chin1/TWO
+  Rxx = Rxx + (fxx - chix*chix*ONE/chin1*HALF + gxx * f) * ONE/chin1 * HALF
-  Ryy = Ryy + (fyy - chiy*chiy/chin1/TWO + gyy * f)/chin1/TWO
+  Ryy = Ryy + (fyy - chiy*chiy*ONE/chin1*HALF + gyy * f) * ONE/chin1 * HALF
-  Rzz = Rzz + (fzz - chiz*chiz/chin1/TWO + gzz * f)/chin1/TWO
+  Rzz = Rzz + (fzz - chiz*chiz*ONE/chin1*HALF + gzz * f) * ONE/chin1 * HALF
-  Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO
+  Rxy = Rxy + (fxy - chix*chiy*ONE/chin1*HALF + gxy * f) * ONE/chin1 * HALF
-  Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO
+  Rxz = Rxz + (fxz - chix*chiz*ONE/chin1*HALF + gxz * f) * ONE/chin1 * HALF
-  Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO
+  Ryz = Ryz + (fyz - chiy*chiz*ONE/chin1*HALF + gyz * f) * ONE/chin1 * HALF
 ! covariant second derivatives of the lapse with respect to the physical metric
  call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
                SYM,SYM,SYM,symmetry,Lev)
-  gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
+  gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz) * ONE/chin1
-  gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
+  gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz) * ONE/chin1
-  gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1
+  gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz) * ONE/chin1
 ! now get physical second kind of connection
-  Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF
+  Gamxxx = Gamxxx - ( TWO * chix * ONE/chin1 - gxx * gxxx )*HALF
  Gamyxx = Gamyxx - (                     - gxx * gxxy )*HALF
  Gamzxx = Gamzxx - (                     - gxx * gxxz )*HALF
  Gamxyy = Gamxyy - (                     - gyy * gxxx )*HALF
-  Gamyyy = Gamyyy - ( (chiy + chiy)/chin1 - gyy * gxxy )*HALF
+  Gamyyy = Gamyyy - ( TWO * chiy * ONE/chin1 - gyy * gxxy )*HALF
  Gamzyy = Gamzyy - (                     - gyy * gxxz )*HALF
  Gamxzz = Gamxzz - (                     - gzz * gxxx )*HALF
  Gamyzz = Gamyzz - (                     - gzz * gxxy )*HALF
-  Gamzzz = Gamzzz - ( (chiz + chiz)/chin1 - gzz * gxxz )*HALF
+  Gamzzz = Gamzzz - ( TWO * chiz * ONE/chin1 - gzz * gxxz )*HALF
-  Gamxxy = Gamxxy - (  chiy        /chin1 - gxy * gxxx )*HALF
+  Gamxxy = Gamxxy - (  chiy * ONE/chin1 - gxy * gxxx )*HALF
-  Gamyxy = Gamyxy - (         chix /chin1 - gxy * gxxy )*HALF
+  Gamyxy = Gamyxy - (  chix * ONE/chin1 - gxy * gxxy )*HALF
  Gamzxy = Gamzxy - (                     - gxy * gxxz )*HALF
-  Gamxxz = Gamxxz - (  chiz        /chin1 - gxz * gxxx )*HALF
+  Gamxxz = Gamxxz - (  chiz * ONE/chin1 - gxz * gxxx )*HALF
  Gamyxz = Gamyxz - (                     - gxz * gxxy )*HALF
-  Gamzxz = Gamzxz - (         chix /chin1 - gxz * gxxz )*HALF
+  Gamzxz = Gamzxz - (  chix * ONE/chin1 - gxz * gxxz )*HALF
  Gamxyz = Gamxyz - (                     - gyz * gxxx )*HALF
-  Gamyyz = Gamyyz - (  chiz        /chin1 - gyz * gxxy )*HALF
+  Gamyyz = Gamyyz - (  chiz * ONE/chin1 - gyz * gxxy )*HALF
-  Gamzyz = Gamzyz - (         chiy /chin1 - gyz * gxxz )*HALF
+  Gamzyz = Gamzyz - (  chiy * ONE/chin1 - gyz * gxxz )*HALF
  fxx = fxx - Gamxxx*Lapx - Gamyxx*Lapy - Gamzxx*Lapz
  fyy = fyy - Gamxyy*Lapx - Gamyyy*Lapy - Gamzyy*Lapz
@@ -695,7 +734,7 @@
       gupxz * (Axy * Azz + Ayz * Axz) + &
       gupyz * (Ayy * Azz + Ayz * Ayz) ) )) -1.6d1*PI*rho + EIGHT * PI * S
  f = - F1o3 *(  gupxx * fxx + gupyy * fyy + gupzz * fzz + &
-        TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1/chin1*f)
+        TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1 * ONE/chin1 * f)
  fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
  fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
@@ -815,7 +854,8 @@
  call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
       TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
-  reta = 1.31d0/2*dsqrt(reta/chin1)/(1-dsqrt(chin1))**2
+  fxx = dsqrt(chin1)
  reta = 1.31d0/2*dsqrt(reta*ONE/chin1)/(ONE-fxx)**2
  dtSfx_rhs = Gamx_rhs - reta*dtSfx
  dtSfy_rhs = Gamy_rhs - reta*dtSfy
  dtSfz_rhs = Gamz_rhs - reta*dtSfz
@@ -827,7 +867,7 @@
  call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
       TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
-  reta = 1.31d0/2*dsqrt(reta/chin1)/(1-chin1)**2
+  reta = 1.31d0/2*dsqrt(reta*ONE/chin1)/(ONE-chin1)**2
  dtSfx_rhs = Gamx_rhs - reta*dtSfx
  dtSfy_rhs = Gamy_rhs - reta*dtSfy
  dtSfz_rhs = Gamz_rhs - reta*dtSfz
@@ -835,7 +875,8 @@
  call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
       TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
-  reta = 1.31d0/2*dsqrt(reta/chin1)/(1-dsqrt(chin1))**2
+  fxx = dsqrt(chin1)
  reta = 1.31d0/2*dsqrt(reta*ONE/chin1)/(ONE-fxx)**2
  betax_rhs = FF*Gamx - reta*betax
  betay_rhs = FF*Gamy - reta*betay
  betaz_rhs = FF*Gamz - reta*betaz
@@ -847,7 +888,7 @@
  call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
       TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
-  reta = 1.31d0/2*dsqrt(reta/chin1)/(1-chin1)**2
+  reta = 1.31d0/2*dsqrt(reta*ONE/chin1)/(ONE-chin1)**2
  betax_rhs = FF*Gamx - reta*betax
  betay_rhs = FF*Gamy - reta*betay
  betaz_rhs = FF*Gamz - reta*betaz
@@ -945,60 +986,103 @@
  SSA(2)=SYM
  SSA(3)=ANTI
-!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
+!!!!!!!!!advection term part
 ! lopsided_kodis shares the symmetry_bd buffer between advection and
 ! dissipation, eliminating redundant full-grid copies. For metric variables
 ! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
 ! so the constant offset has no effect on dissipation.
-  call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
+  call lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS)
-  call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
+  call lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA)
-  call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
+  call lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA)
-  call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
+  call lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS)
-  call lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
+  call lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA)
-  call lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
+  call lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA)
-  call lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
+  call lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS)
-  call lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
+  call lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS)
-  call lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
+  call lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA)
-
+!!
 #if 1 
 !! bam does not apply dissipation on gauge variables
  call lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps)
 #if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
  call lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps)
  call lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps)
  call lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
 #endif
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
  call lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
  call lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
  call lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
 #endif
 #else
 ! No dissipation on gauge variables (advection only)
  call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS)
 #if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
  call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS)
  call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS)
  call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA)
 #endif
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
  call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS)
  call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS)
  call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA)
 #endif
  if(eps>0)then 
 ! usual Kreiss-Oliger dissipation      
  call kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps)
  call kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps)
  call kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps)
 #if 0
 #define i 42
 #define j 40
 #define k 40
 if(Lev == 1)then
 write(*,*) X(i),Y(j),Z(k)
 write(*,*) "before",Axx_rhs(i,j,k)
 endif
 #undef i
 #undef j
 #undef k
 !!stop
 #endif
  call kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps)
 #if 0
 #define i 42
 #define j 40
 #define k 40
 if(Lev == 1)then
 write(*,*) X(i),Y(j),Z(k)
 write(*,*) "after",Axx_rhs(i,j,k)
 endif
 #undef i
 #undef j
 #undef k
 !!stop
 #endif
  call kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps)
  call kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps)
  call kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps)
 #if 1 
 !! bam does not apply dissipation on gauge variables
  call kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps)
  call kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps)
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
  call kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps)
  call kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps)
 #endif
 #endif
  endif
  if(co == 0)then
 ! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho
@@ -1036,48 +1120,48 @@
 ! mov_Res_j = gupkj*(-1/chi d_k chi*A_ij + D_k A_ij) - 2/3 d_j trK - 8 PI s_j where D respect to physical metric
 ! store D_i A_jk - 1/chi d_i chi*A_jk in gjk_i
  call fderivs(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
  call fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
  call fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
  call fderivs(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0)
  call fderivs(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0)
  call fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
  call fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0)
  call fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
  gxxx = gxxx - (  Gamxxx * Axx + Gamyxx * Axy + Gamzxx * Axz &
-                 + Gamxxx * Axx + Gamyxx * Axy + Gamzxx * Axz) - chix*Axx/chin1
+                 + Gamxxx * Axx + Gamyxx * Axy + Gamzxx * Axz) - chix*Axx*ONE/chin1
  gxyx = gxyx - (  Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz &
-                 + Gamxxx * Axy + Gamyxx * Ayy + Gamzxx * Ayz) - chix*Axy/chin1
+                 + Gamxxx * Axy + Gamyxx * Ayy + Gamzxx * Ayz) - chix*Axy*ONE/chin1
  gxzx = gxzx - (  Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz &
-                 + Gamxxx * Axz + Gamyxx * Ayz + Gamzxx * Azz) - chix*Axz/chin1
+                 + Gamxxx * Axz + Gamyxx * Ayz + Gamzxx * Azz) - chix*Axz*ONE/chin1
  gyyx = gyyx - (  Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz &
-                 + Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz) - chix*Ayy/chin1
+                 + Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz) - chix*Ayy*ONE/chin1
  gyzx = gyzx - (  Gamxxz * Axy + Gamyxz * Ayy + Gamzxz * Ayz &
-                 + Gamxxy * Axz + Gamyxy * Ayz + Gamzxy * Azz) - chix*Ayz/chin1
+                 + Gamxxy * Axz + Gamyxy * Ayz + Gamzxy * Azz) - chix*Ayz*ONE/chin1
  gzzx = gzzx - (  Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz &
-                 + Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz) - chix*Azz/chin1
+                 + Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz) - chix*Azz*ONE/chin1
  gxxy = gxxy - (  Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz &
-                 + Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz) - chiy*Axx/chin1
+                 + Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz) - chiy*Axx*ONE/chin1
  gxyy = gxyy - (  Gamxyy * Axx + Gamyyy * Axy + Gamzyy * Axz &
-                 + Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz) - chiy*Axy/chin1
+                 + Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz) - chiy*Axy*ONE/chin1
  gxzy = gxzy - (  Gamxyz * Axx + Gamyyz * Axy + Gamzyz * Axz &
-                 + Gamxxy * Axz + Gamyxy * Ayz + Gamzxy * Azz) - chiy*Axz/chin1
+                 + Gamxxy * Axz + Gamyxy * Ayz + Gamzxy * Azz) - chiy*Axz*ONE/chin1
  gyyy = gyyy - (  Gamxyy * Axy + Gamyyy * Ayy + Gamzyy * Ayz &
-                 + Gamxyy * Axy + Gamyyy * Ayy + Gamzyy * Ayz) - chiy*Ayy/chin1
+                 + Gamxyy * Axy + Gamyyy * Ayy + Gamzyy * Ayz) - chiy*Ayy*ONE/chin1
  gyzy = gyzy - (  Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz &
-                 + Gamxyy * Axz + Gamyyy * Ayz + Gamzyy * Azz) - chiy*Ayz/chin1
+                 + Gamxyy * Axz + Gamyyy * Ayz + Gamzyy * Azz) - chiy*Ayz*ONE/chin1
  gzzy = gzzy - (  Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz &
-                 + Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz) - chiy*Azz/chin1
+                 + Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz) - chiy*Azz*ONE/chin1
  gxxz = gxxz - (  Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz &
-                 + Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz) - chiz*Axx/chin1
+                 + Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz) - chiz*Axx*ONE/chin1
  gxyz = gxyz - (  Gamxyz * Axx + Gamyyz * Axy + Gamzyz * Axz &
-                 + Gamxxz * Axy + Gamyxz * Ayy + Gamzxz * Ayz) - chiz*Axy/chin1
+                 + Gamxxz * Axy + Gamyxz * Ayy + Gamzxz * Ayz) - chiz*Axy*ONE/chin1
  gxzz = gxzz - (  Gamxzz * Axx + Gamyzz * Axy + Gamzzz * Axz &
-                 + Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz) - chiz*Axz/chin1
+                 + Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz) - chiz*Axz*ONE/chin1
  gyyz = gyyz - (  Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz &
-                 + Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz) - chiz*Ayy/chin1
+                 + Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz) - chiz*Ayy*ONE/chin1
  gyzz = gyzz - (  Gamxzz * Axy + Gamyzz * Ayy + Gamzzz * Ayz &
-                 + Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz) - chiz*Ayz/chin1
+                 + Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz) - chiz*Ayz*ONE/chin1
  gzzz = gzzz - (  Gamxzz * Axz + Gamyzz * Ayz + Gamzzz * Azz &
-                 + Gamxzz * Axz + Gamyzz * Ayz + Gamzzz * Azz) - chiz*Azz/chin1
+                 + Gamxzz * Axz + Gamyzz * Ayz + Gamzzz * Azz) - chiz*Azz*ONE/chin1
 movx_Res = gupxx*gxxx + gupyy*gxyy + gupzz*gxzz &
          +gupxy*gxyx + gupxz*gxzx + gupyz*gxzy &
          +gupxy*gxxy + gupxz*gxxz + gupyz*gxyz
--- a/AMSS_NCKU_source/bssn_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_rhs_c.C
--- a/AMSS_NCKU_source/cgh.C
+++ b/AMSS_NCKU_source/cgh.C
@@ -130,11 +130,7 @@ void cgh::compose_cgh(int nprocs)
  for (int lev = 0; lev < levels; lev++)
  {
    checkPatchList(PatL[lev], false);
 #ifdef INTERP_LB_OPTIMIZE
    Parallel::distribute_optimize(PatL[lev], nprocs, ingfs, fngfs, false);
 #else
    Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false);
 #endif
 #if (RPB == 1)
    // we need distributed box of PatL[lev] and PatL[lev-1]
    if (lev > 0)
@@ -1305,13 +1301,13 @@ bool cgh::Interp_One_Point(MyList<var> *VarList,
 }
-bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
+void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
                          MyList<var> *OldList, MyList<var> *StateList,
                          MyList<var> *FutureList, MyList<var> *tmList, bool BB,
                          monitor *ErrorMonitor)
 {
  if (lev < movls)
-    return false;
+    return;
 #if (0)
  // #if (PSTR == 1 || PSTR == 2)
@@ -1400,7 +1396,7 @@ bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
      for (bhi = 0; bhi < BH_num; bhi++)
        delete[] tmpPorg[bhi];
      delete[] tmpPorg;
-      return false;
+      return;
    }
    // x direction
    rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
@@ -1504,7 +1500,6 @@ bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
  for (int bhi = 0; bhi < BH_num; bhi++)
    delete[] tmpPorg[bhi];
  delete[] tmpPorg;
  return tot_flag;
 }
--- a/AMSS_NCKU_source/cgh.h
+++ b/AMSS_NCKU_source/cgh.h
@@ -74,7 +74,7 @@ public:
                               MyList<var> *OldList, MyList<var> *StateList,
                               MyList<var> *FutureList, MyList<var> *tmList,
                               int Symmetry, bool BB);
-   bool Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
+   void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
                        MyList<var> *OldList, MyList<var> *StateList,
                        MyList<var> *FutureList, MyList<var> *tmList, bool BB,
                        monitor *ErrorMonitor);
--- a/AMSS_NCKU_source/diff_new.f90
+++ b/AMSS_NCKU_source/diff_new.f90
@@ -69,8 +69,6 @@
  fy = ZEO
  fz = ZEO
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
 !DIR$ UNROLL PARTIAL(4)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -373,8 +371,6 @@
  fxz = ZEO
  fyz = ZEO
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
 !DIR$ UNROLL PARTIAL(4)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -1943,6 +1939,309 @@
  return
  end subroutine fddyz
  subroutine fderivs_batch4(ex,f1,f2,f3,f4, &
                            f1x,f1y,f1z,f2x,f2y,f2z,f3x,f3y,f3z,f4x,f4y,f4z, &
                            X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
 !-----------------------------------------------------------------------------
 ! batch first derivatives (4 fields), same symmetry setup
 !
 ! Computes d/dx, d/dy, d/dz of the four fields f1..f4 in one sweep over the
 ! grid. All four fields are extended with the SAME parity triple
 ! (SYM1,SYM2,SYM3), so only fields sharing one symmetry signature may be
 ! batched together.
 !
 !   ex            : number of grid points in x, y, z
 !   f1..f4        : input fields
 !   fNx,fNy,fNz   : output derivatives of field N
 !   X,Y,Z         : coordinate arrays; the spacing is taken from the first
 !                   two entries (uniform spacing assumed -- TODO confirm)
 !   SYM1..SYM3    : parity factors handed to symmetry_bd through SoA
 !   symmetry      : symmetry mode, compared against NO_SYMM / EQ_SYMM
 !   onoff         : accepted for interface compatibility, not used here
 !
 ! Points where neither stencil fits, and the last index in each direction
 ! (the loops stop at ex-1), keep the initial value ZEO.
 !-----------------------------------------------------------------------------
  implicit none
  integer,                               intent(in ):: ex(1:3),symmetry,onoff
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(in ):: f1,f2,f3,f4
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f1x,f1y,f1z
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f2x,f2y,f2z
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f3x,f3y,f3z
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f4x,f4y,f4z
  real*8,                                intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
  real*8,                                intent(in ):: SYM1,SYM2,SYM3
 !~~~~~~ other variables
  real*8 :: dX,dY,dZ
 ! work copies of the fields with two extra layers (indices -1 and 0) on the
 ! lower side of every direction; these are automatic arrays, one per field
  real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh1,fh2,fh3,fh4
  real*8, dimension(3) :: SoA
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
 ! NOTE: OCTANT is declared but not referenced in this routine
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0
  real*8,  parameter :: TWO=2.d0,EIT=8.d0
  real*8,  parameter :: F12=1.2d1
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
  imin = 1
  jmin = 1
  kmin = 1
 ! when a symmetry mode is active and the first coordinate lies within one
 ! spacing of zero, the stencil may reach into the extra layers (down to -1)
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
  SoA(1) = SYM1
  SoA(2) = SYM2
  SoA(3) = SYM3
 ! symmetry_bd is defined elsewhere; presumably it copies f into fh and fills
 ! the two extra layers using the parities in SoA -- TODO confirm
  call symmetry_bd(2,ex,f1,fh1,SoA)
  call symmetry_bd(2,ex,f2,fh2,SoA)
  call symmetry_bd(2,ex,f3,fh3,SoA)
  call symmetry_bd(2,ex,f4,fh4,SoA)
 ! prefactors: 1/(12 h) for the five-point stencil, 1/(2 h) for the three-point
  d12dx = ONE/F12/dX
  d12dy = ONE/F12/dY
  d12dz = ONE/F12/dZ
  d2dx = ONE/TWO/dX
  d2dy = ONE/TWO/dY
  d2dz = ONE/TWO/dZ
  f1x = ZEO; f1y = ZEO; f1z = ZEO
  f2x = ZEO; f2y = ZEO; f2z = ZEO
  f3x = ZEO; f3y = ZEO; f3z = ZEO
  f4x = ZEO; f4y = ZEO; f4z = ZEO
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 ! wide stencil (f(-2) - 8 f(-1) + 8 f(+1) - f(+2)) / (12 h): used only when
 ! it fits in all three directions at once
   if(i+2 <= imax .and. i-2 >= imin .and. &
      j+2 <= jmax .and. j-2 >= jmin .and. &
      k+2 <= kmax .and. k-2 >= kmin) then
      f1x(i,j,k)=d12dx*(fh1(i-2,j,k)-EIT*fh1(i-1,j,k)+EIT*fh1(i+1,j,k)-fh1(i+2,j,k))
      f1y(i,j,k)=d12dy*(fh1(i,j-2,k)-EIT*fh1(i,j-1,k)+EIT*fh1(i,j+1,k)-fh1(i,j+2,k))
      f1z(i,j,k)=d12dz*(fh1(i,j,k-2)-EIT*fh1(i,j,k-1)+EIT*fh1(i,j,k+1)-fh1(i,j,k+2))
      f2x(i,j,k)=d12dx*(fh2(i-2,j,k)-EIT*fh2(i-1,j,k)+EIT*fh2(i+1,j,k)-fh2(i+2,j,k))
      f2y(i,j,k)=d12dy*(fh2(i,j-2,k)-EIT*fh2(i,j-1,k)+EIT*fh2(i,j+1,k)-fh2(i,j+2,k))
      f2z(i,j,k)=d12dz*(fh2(i,j,k-2)-EIT*fh2(i,j,k-1)+EIT*fh2(i,j,k+1)-fh2(i,j,k+2))
      f3x(i,j,k)=d12dx*(fh3(i-2,j,k)-EIT*fh3(i-1,j,k)+EIT*fh3(i+1,j,k)-fh3(i+2,j,k))
      f3y(i,j,k)=d12dy*(fh3(i,j-2,k)-EIT*fh3(i,j-1,k)+EIT*fh3(i,j+1,k)-fh3(i,j+2,k))
      f3z(i,j,k)=d12dz*(fh3(i,j,k-2)-EIT*fh3(i,j,k-1)+EIT*fh3(i,j,k+1)-fh3(i,j,k+2))
      f4x(i,j,k)=d12dx*(fh4(i-2,j,k)-EIT*fh4(i-1,j,k)+EIT*fh4(i+1,j,k)-fh4(i+2,j,k))
      f4y(i,j,k)=d12dy*(fh4(i,j-2,k)-EIT*fh4(i,j-1,k)+EIT*fh4(i,j+1,k)-fh4(i,j+2,k))
      f4z(i,j,k)=d12dz*(fh4(i,j,k-2)-EIT*fh4(i,j,k-1)+EIT*fh4(i,j,k+1)-fh4(i,j,k+2))
 ! fallback: three-point centered difference (f(+1) - f(-1)) / (2 h), again
 ! only when it fits in all three directions
   elseif(i+1 <= imax .and. i-1 >= imin .and. &
          j+1 <= jmax .and. j-1 >= jmin .and. &
          k+1 <= kmax .and. k-1 >= kmin) then
      f1x(i,j,k)=d2dx*(-fh1(i-1,j,k)+fh1(i+1,j,k))
      f1y(i,j,k)=d2dy*(-fh1(i,j-1,k)+fh1(i,j+1,k))
      f1z(i,j,k)=d2dz*(-fh1(i,j,k-1)+fh1(i,j,k+1))
      f2x(i,j,k)=d2dx*(-fh2(i-1,j,k)+fh2(i+1,j,k))
      f2y(i,j,k)=d2dy*(-fh2(i,j-1,k)+fh2(i,j+1,k))
      f2z(i,j,k)=d2dz*(-fh2(i,j,k-1)+fh2(i,j,k+1))
      f3x(i,j,k)=d2dx*(-fh3(i-1,j,k)+fh3(i+1,j,k))
      f3y(i,j,k)=d2dy*(-fh3(i,j-1,k)+fh3(i,j+1,k))
      f3z(i,j,k)=d2dz*(-fh3(i,j,k-1)+fh3(i,j,k+1))
      f4x(i,j,k)=d2dx*(-fh4(i-1,j,k)+fh4(i+1,j,k))
      f4y(i,j,k)=d2dy*(-fh4(i,j-1,k)+fh4(i,j+1,k))
      f4z(i,j,k)=d2dz*(-fh4(i,j,k-1)+fh4(i,j,k+1))
   endif
  enddo
  enddo
  enddo
  return
  end subroutine fderivs_batch4
 !-----------------------------------------------------------------------------
 ! batch first derivatives (3 fields), same symmetry setup
 !
 ! Computes d/dx, d/dy, d/dz of the three fields f1..f3 in one sweep over the
 ! grid. All three fields are extended with the SAME parity triple
 ! (SYM1,SYM2,SYM3), so only fields sharing one symmetry signature may be
 ! batched together.
 !
 !   ex            : number of grid points in x, y, z
 !   f1..f3        : input fields
 !   fNx,fNy,fNz   : output derivatives of field N
 !   X,Y,Z         : coordinate arrays; the spacing is taken from the first
 !                   two entries (uniform spacing assumed -- TODO confirm)
 !   SYM1..SYM3    : parity factors handed to symmetry_bd through SoA
 !   symmetry      : symmetry mode, compared against NO_SYMM / EQ_SYMM
 !   onoff         : accepted for interface compatibility, not used here
 !
 ! Points where neither stencil fits, and the last index in each direction
 ! (the loops stop at ex-1), keep the initial value ZEO.
 !-----------------------------------------------------------------------------
  subroutine fderivs_batch3(ex,f1,f2,f3, &
                            f1x,f1y,f1z,f2x,f2y,f2z,f3x,f3y,f3z, &
                            X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
  implicit none
  integer,                               intent(in ):: ex(1:3),symmetry,onoff
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(in ):: f1,f2,f3
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f1x,f1y,f1z
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f2x,f2y,f2z
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f3x,f3y,f3z
  real*8,                                intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
  real*8,                                intent(in ):: SYM1,SYM2,SYM3
 !~~~~~~ other variables
  real*8 :: dX,dY,dZ
 ! work copies of the fields with two extra layers (indices -1 and 0) on the
 ! lower side of every direction; these are automatic arrays, one per field
  real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh1,fh2,fh3
  real*8, dimension(3) :: SoA
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
 ! NOTE: OCTANT is declared but not referenced in this routine
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0
  real*8,  parameter :: TWO=2.d0,EIT=8.d0
  real*8,  parameter :: F12=1.2d1
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
  imin = 1
  jmin = 1
  kmin = 1
 ! when a symmetry mode is active and the first coordinate lies within one
 ! spacing of zero, the stencil may reach into the extra layers (down to -1)
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
  SoA(1) = SYM1
  SoA(2) = SYM2
  SoA(3) = SYM3
 ! symmetry_bd is defined elsewhere; presumably it copies f into fh and fills
 ! the two extra layers using the parities in SoA -- TODO confirm
  call symmetry_bd(2,ex,f1,fh1,SoA)
  call symmetry_bd(2,ex,f2,fh2,SoA)
  call symmetry_bd(2,ex,f3,fh3,SoA)
 ! prefactors: 1/(12 h) for the five-point stencil, 1/(2 h) for the three-point
  d12dx = ONE/F12/dX
  d12dy = ONE/F12/dY
  d12dz = ONE/F12/dZ
  d2dx = ONE/TWO/dX
  d2dy = ONE/TWO/dY
  d2dz = ONE/TWO/dZ
  f1x = ZEO; f1y = ZEO; f1z = ZEO
  f2x = ZEO; f2y = ZEO; f2z = ZEO
  f3x = ZEO; f3y = ZEO; f3z = ZEO
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 ! wide stencil (f(-2) - 8 f(-1) + 8 f(+1) - f(+2)) / (12 h): used only when
 ! it fits in all three directions at once
   if(i+2 <= imax .and. i-2 >= imin .and. &
      j+2 <= jmax .and. j-2 >= jmin .and. &
      k+2 <= kmax .and. k-2 >= kmin) then
      f1x(i,j,k)=d12dx*(fh1(i-2,j,k)-EIT*fh1(i-1,j,k)+EIT*fh1(i+1,j,k)-fh1(i+2,j,k))
      f1y(i,j,k)=d12dy*(fh1(i,j-2,k)-EIT*fh1(i,j-1,k)+EIT*fh1(i,j+1,k)-fh1(i,j+2,k))
      f1z(i,j,k)=d12dz*(fh1(i,j,k-2)-EIT*fh1(i,j,k-1)+EIT*fh1(i,j,k+1)-fh1(i,j,k+2))
      f2x(i,j,k)=d12dx*(fh2(i-2,j,k)-EIT*fh2(i-1,j,k)+EIT*fh2(i+1,j,k)-fh2(i+2,j,k))
      f2y(i,j,k)=d12dy*(fh2(i,j-2,k)-EIT*fh2(i,j-1,k)+EIT*fh2(i,j+1,k)-fh2(i,j+2,k))
      f2z(i,j,k)=d12dz*(fh2(i,j,k-2)-EIT*fh2(i,j,k-1)+EIT*fh2(i,j,k+1)-fh2(i,j,k+2))
      f3x(i,j,k)=d12dx*(fh3(i-2,j,k)-EIT*fh3(i-1,j,k)+EIT*fh3(i+1,j,k)-fh3(i+2,j,k))
      f3y(i,j,k)=d12dy*(fh3(i,j-2,k)-EIT*fh3(i,j-1,k)+EIT*fh3(i,j+1,k)-fh3(i,j+2,k))
      f3z(i,j,k)=d12dz*(fh3(i,j,k-2)-EIT*fh3(i,j,k-1)+EIT*fh3(i,j,k+1)-fh3(i,j,k+2))
 ! fallback: three-point centered difference (f(+1) - f(-1)) / (2 h), again
 ! only when it fits in all three directions
   elseif(i+1 <= imax .and. i-1 >= imin .and. &
          j+1 <= jmax .and. j-1 >= jmin .and. &
          k+1 <= kmax .and. k-1 >= kmin) then
      f1x(i,j,k)=d2dx*(-fh1(i-1,j,k)+fh1(i+1,j,k))
      f1y(i,j,k)=d2dy*(-fh1(i,j-1,k)+fh1(i,j+1,k))
      f1z(i,j,k)=d2dz*(-fh1(i,j,k-1)+fh1(i,j,k+1))
      f2x(i,j,k)=d2dx*(-fh2(i-1,j,k)+fh2(i+1,j,k))
      f2y(i,j,k)=d2dy*(-fh2(i,j-1,k)+fh2(i,j+1,k))
      f2z(i,j,k)=d2dz*(-fh2(i,j,k-1)+fh2(i,j,k+1))
      f3x(i,j,k)=d2dx*(-fh3(i-1,j,k)+fh3(i+1,j,k))
      f3y(i,j,k)=d2dy*(-fh3(i,j-1,k)+fh3(i,j+1,k))
      f3z(i,j,k)=d2dz*(-fh3(i,j,k-1)+fh3(i,j,k+1))
   endif
  enddo
  enddo
  enddo
  return
  end subroutine fderivs_batch3
 !-----------------------------------------------------------------------------
 ! batch first derivatives (2 fields), same symmetry setup
 !
 ! Computes d/dx, d/dy, d/dz of the two fields f1 and f2 in one sweep over the
 ! grid. Both fields are extended with the SAME parity triple
 ! (SYM1,SYM2,SYM3), so only fields sharing one symmetry signature may be
 ! batched together.
 !
 !   ex            : number of grid points in x, y, z
 !   f1,f2         : input fields
 !   fNx,fNy,fNz   : output derivatives of field N
 !   X,Y,Z         : coordinate arrays; the spacing is taken from the first
 !                   two entries (uniform spacing assumed -- TODO confirm)
 !   SYM1..SYM3    : parity factors handed to symmetry_bd through SoA
 !   symmetry      : symmetry mode, compared against NO_SYMM / EQ_SYMM
 !   onoff         : accepted for interface compatibility, not used here
 !
 ! Points where neither stencil fits, and the last index in each direction
 ! (the loops stop at ex-1), keep the initial value ZEO.
 !-----------------------------------------------------------------------------
  subroutine fderivs_batch2(ex,f1,f2, &
                            f1x,f1y,f1z,f2x,f2y,f2z, &
                            X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
  implicit none
  integer,                               intent(in ):: ex(1:3),symmetry,onoff
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(in ):: f1,f2
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f1x,f1y,f1z
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f2x,f2y,f2z
  real*8,                                intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
  real*8,                                intent(in ):: SYM1,SYM2,SYM3
 !~~~~~~ other variables
  real*8 :: dX,dY,dZ
 ! work copies of the fields with two extra layers (indices -1 and 0) on the
 ! lower side of every direction; these are automatic arrays, one per field
  real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh1,fh2
  real*8, dimension(3) :: SoA
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
 ! NOTE: OCTANT is declared but not referenced in this routine
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0
  real*8,  parameter :: TWO=2.d0,EIT=8.d0
  real*8,  parameter :: F12=1.2d1
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
  imin = 1
  jmin = 1
  kmin = 1
 ! when a symmetry mode is active and the first coordinate lies within one
 ! spacing of zero, the stencil may reach into the extra layers (down to -1)
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
  SoA(1) = SYM1
  SoA(2) = SYM2
  SoA(3) = SYM3
 ! symmetry_bd is defined elsewhere; presumably it copies f into fh and fills
 ! the two extra layers using the parities in SoA -- TODO confirm
  call symmetry_bd(2,ex,f1,fh1,SoA)
  call symmetry_bd(2,ex,f2,fh2,SoA)
 ! prefactors: 1/(12 h) for the five-point stencil, 1/(2 h) for the three-point
  d12dx = ONE/F12/dX
  d12dy = ONE/F12/dY
  d12dz = ONE/F12/dZ
  d2dx = ONE/TWO/dX
  d2dy = ONE/TWO/dY
  d2dz = ONE/TWO/dZ
  f1x = ZEO; f1y = ZEO; f1z = ZEO
  f2x = ZEO; f2y = ZEO; f2z = ZEO
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 ! wide stencil (f(-2) - 8 f(-1) + 8 f(+1) - f(+2)) / (12 h): used only when
 ! it fits in all three directions at once
   if(i+2 <= imax .and. i-2 >= imin .and. &
      j+2 <= jmax .and. j-2 >= jmin .and. &
      k+2 <= kmax .and. k-2 >= kmin) then
      f1x(i,j,k)=d12dx*(fh1(i-2,j,k)-EIT*fh1(i-1,j,k)+EIT*fh1(i+1,j,k)-fh1(i+2,j,k))
      f1y(i,j,k)=d12dy*(fh1(i,j-2,k)-EIT*fh1(i,j-1,k)+EIT*fh1(i,j+1,k)-fh1(i,j+2,k))
      f1z(i,j,k)=d12dz*(fh1(i,j,k-2)-EIT*fh1(i,j,k-1)+EIT*fh1(i,j,k+1)-fh1(i,j,k+2))
      f2x(i,j,k)=d12dx*(fh2(i-2,j,k)-EIT*fh2(i-1,j,k)+EIT*fh2(i+1,j,k)-fh2(i+2,j,k))
      f2y(i,j,k)=d12dy*(fh2(i,j-2,k)-EIT*fh2(i,j-1,k)+EIT*fh2(i,j+1,k)-fh2(i,j+2,k))
      f2z(i,j,k)=d12dz*(fh2(i,j,k-2)-EIT*fh2(i,j,k-1)+EIT*fh2(i,j,k+1)-fh2(i,j,k+2))
 ! fallback: three-point centered difference (f(+1) - f(-1)) / (2 h), again
 ! only when it fits in all three directions
   elseif(i+1 <= imax .and. i-1 >= imin .and. &
          j+1 <= jmax .and. j-1 >= jmin .and. &
          k+1 <= kmax .and. k-1 >= kmin) then
      f1x(i,j,k)=d2dx*(-fh1(i-1,j,k)+fh1(i+1,j,k))
      f1y(i,j,k)=d2dy*(-fh1(i,j-1,k)+fh1(i,j+1,k))
      f1z(i,j,k)=d2dz*(-fh1(i,j,k-1)+fh1(i,j,k+1))
      f2x(i,j,k)=d2dx*(-fh2(i-1,j,k)+fh2(i+1,j,k))
      f2y(i,j,k)=d2dy*(-fh2(i,j-1,k)+fh2(i,j+1,k))
      f2z(i,j,k)=d2dz*(-fh2(i,j,k-1)+fh2(i,j,k+1))
   endif
  enddo
  enddo
  enddo
  return
  end subroutine fderivs_batch2
 #elif (ghost_width == 4)
 ! sixth order code
@@ -2081,6 +2380,9 @@
  end subroutine fderivs
 !-----------------------------------------------------------------------------
 ! batch first derivatives (4 fields), same symmetry setup
 !-----------------------------------------------------------------------------
 !-----------------------------------------------------------------------------
 !
 ! single derivatives dx
 !
--- a/AMSS_NCKU_source/enforce_algebra.f90
+++ b/AMSS_NCKU_source/enforce_algebra.f90
@@ -19,60 +19,48 @@
 !~~~~~~~> Local variable:
-  integer :: i,j,k
+  real*8, dimension(ex(1),ex(2),ex(3)) :: trA,detg
-  real*8 :: lgxx,lgyy,lgzz,ldetg
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz 
-  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
  real*8 :: ltrA,lscale
  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
 !~~~~~~>
-  do k=1,ex(3)
+  gxx = dxx + ONE
-  do j=1,ex(2)
+  gyy = dyy + ONE
-  do i=1,ex(1)
+  gzz = dzz + ONE
-    lgxx = dxx(i,j,k) + ONE
+  detg =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
-    lgyy = dyy(i,j,k) + ONE
+          gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
-    lgzz = dzz(i,j,k) + ONE
+  gupxx =   ( gyy * gzz - gyz * gyz ) / detg
  gupxy = - ( gxy * gzz - gyz * gxz ) / detg
  gupxz =   ( gxy * gyz - gyy * gxz ) / detg
  gupyy =   ( gxx * gzz - gxz * gxz ) / detg
  gupyz = - ( gxx * gyz - gxy * gxz ) / detg
  gupzz =   ( gxx * gyy - gxy * gxy ) / detg
-    ldetg =  lgxx * lgyy * lgzz &
+  trA =         gupxx * Axx + gupyy * Ayy + gupzz * Azz &
-           + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) &
+       + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
           + gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) &
           - gxz(i,j,k) * lgyy * gxz(i,j,k) &
           - gxy(i,j,k) * gxy(i,j,k) * lgzz &
           - lgxx * gyz(i,j,k) * gyz(i,j,k)
-    lgupxx =   ( lgyy * lgzz - gyz(i,j,k) * gyz(i,j,k) ) / ldetg
+  Axx = Axx - F1o3 * gxx * trA
-    lgupxy = - ( gxy(i,j,k) * lgzz - gyz(i,j,k) * gxz(i,j,k) ) / ldetg
+  Axy = Axy - F1o3 * gxy * trA
-    lgupxz =   ( gxy(i,j,k) * gyz(i,j,k) - lgyy * gxz(i,j,k) ) / ldetg
+  Axz = Axz - F1o3 * gxz * trA
-    lgupyy =   ( lgxx * lgzz - gxz(i,j,k) * gxz(i,j,k) ) / ldetg
+  Ayy = Ayy - F1o3 * gyy * trA
-    lgupyz = - ( lgxx * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / ldetg
+  Ayz = Ayz - F1o3 * gyz * trA
-    lgupzz =   ( lgxx * lgyy - gxy(i,j,k) * gxy(i,j,k) ) / ldetg
+  Azz = Azz - F1o3 * gzz * trA
-    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
+  detg = ONE / ( detg ** F1o3 ) 
                 + lgupzz * Azz(i,j,k) &
         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
                 + lgupyz * Ayz(i,j,k))
-    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
+  gxx = gxx * detg
-    Axy(i,j,k) = Axy(i,j,k) - F1o3 * gxy(i,j,k) * ltrA
+  gxy = gxy * detg
-    Axz(i,j,k) = Axz(i,j,k) - F1o3 * gxz(i,j,k) * ltrA
+  gxz = gxz * detg
-    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
+  gyy = gyy * detg
-    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * gyz(i,j,k) * ltrA
+  gyz = gyz * detg
-    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
+  gzz = gzz * detg
-    lscale = ONE / ( ldetg ** F1o3 )
+  dxx = gxx - ONE
-
+  dyy = gyy - ONE
-    dxx(i,j,k) = lgxx * lscale - ONE
+  dzz = gzz - ONE
    gxy(i,j,k) = gxy(i,j,k) * lscale
    gxz(i,j,k) = gxz(i,j,k) * lscale
    dyy(i,j,k) = lgyy * lscale - ONE
    gyz(i,j,k) = gyz(i,j,k) * lscale
    dzz(i,j,k) = lgzz * lscale - ONE
  enddo
  enddo
  enddo
  return
@@ -95,70 +83,50 @@
 !~~~~~~~> Local variable:
-  integer :: i,j,k
+  real*8, dimension(ex(1),ex(2),ex(3)) :: trA
-  real*8 :: lgxx,lgyy,lgzz,lscale
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gxx,gyy,gzz 
-  real*8 :: lgxy,lgxz,lgyz
+  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
  real*8 :: lgupxx,lgupxy,lgupxz,lgupyy,lgupyz,lgupzz
  real*8 :: ltrA
  real*8, parameter :: F1o3 = 1.D0 / 3.D0, ONE = 1.D0, TWO = 2.D0
 !~~~~~~>
-  do k=1,ex(3)
+  gxx = dxx + ONE
-  do j=1,ex(2)
+  gyy = dyy + ONE
-  do i=1,ex(1)
+  gzz = dzz + ONE
 ! for g
  gupzz =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
           gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
-! for g: normalize determinant first
+  gupzz = ONE / ( gupzz ** F1o3 ) 
    lgxx = dxx(i,j,k) + ONE
    lgyy = dyy(i,j,k) + ONE
    lgzz = dzz(i,j,k) + ONE
    lgxy = gxy(i,j,k)
    lgxz = gxz(i,j,k)
    lgyz = gyz(i,j,k)
-    lscale =  lgxx * lgyy * lgzz + lgxy * lgyz * lgxz &
+  gxx = gxx * gupzz
-            + lgxz * lgxy * lgyz - lgxz * lgyy * lgxz &
+  gxy = gxy * gupzz
-            - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz
+  gxz = gxz * gupzz
  gyy = gyy * gupzz
  gyz = gyz * gupzz
  gzz = gzz * gupzz
-    lscale = ONE / ( lscale ** F1o3 )
+  dxx = gxx - ONE
  dyy = gyy - ONE
  dzz = gzz - ONE
 ! for A  
-    lgxx = lgxx * lscale
+  gupxx =   ( gyy * gzz - gyz * gyz )
-    lgxy = lgxy * lscale
+  gupxy = - ( gxy * gzz - gyz * gxz )
-    lgxz = lgxz * lscale
+  gupxz =   ( gxy * gyz - gyy * gxz )
-    lgyy = lgyy * lscale
+  gupyy =   ( gxx * gzz - gxz * gxz )
-    lgyz = lgyz * lscale
+  gupyz = - ( gxx * gyz - gxy * gxz )
-    lgzz = lgzz * lscale
+  gupzz =   ( gxx * gyy - gxy * gxy )
-    dxx(i,j,k) = lgxx - ONE
+  trA =         gupxx * Axx + gupyy * Ayy + gupzz * Azz &
-    gxy(i,j,k) = lgxy
+       + TWO * (gupxy * Axy + gupxz * Axz + gupyz * Ayz)
    gxz(i,j,k) = lgxz
    dyy(i,j,k) = lgyy - ONE
    gyz(i,j,k) = lgyz
    dzz(i,j,k) = lgzz - ONE
-! for A: trace-free using normalized metric (det=1, no division needed)
+  Axx = Axx - F1o3 * gxx * trA
-    lgupxx =   ( lgyy * lgzz - lgyz * lgyz )
+  Axy = Axy - F1o3 * gxy * trA
-    lgupxy = - ( lgxy * lgzz - lgyz * lgxz )
+  Axz = Axz - F1o3 * gxz * trA
-    lgupxz =   ( lgxy * lgyz - lgyy * lgxz )
+  Ayy = Ayy - F1o3 * gyy * trA
-    lgupyy =   ( lgxx * lgzz - lgxz * lgxz )
+  Ayz = Ayz - F1o3 * gyz * trA
-    lgupyz = - ( lgxx * lgyz - lgxy * lgxz )
+  Azz = Azz - F1o3 * gzz * trA
    lgupzz =   ( lgxx * lgyy - lgxy * lgxy )
    ltrA =         lgupxx * Axx(i,j,k) + lgupyy * Ayy(i,j,k) &
                 + lgupzz * Azz(i,j,k) &
         + TWO * (lgupxy * Axy(i,j,k) + lgupxz * Axz(i,j,k) &
                 + lgupyz * Ayz(i,j,k))
    Axx(i,j,k) = Axx(i,j,k) - F1o3 * lgxx * ltrA
    Axy(i,j,k) = Axy(i,j,k) - F1o3 * lgxy * ltrA
    Axz(i,j,k) = Axz(i,j,k) - F1o3 * lgxz * ltrA
    Ayy(i,j,k) = Ayy(i,j,k) - F1o3 * lgyy * ltrA
    Ayz(i,j,k) = Ayz(i,j,k) - F1o3 * lgyz * ltrA
    Azz(i,j,k) = Azz(i,j,k) - F1o3 * lgzz * ltrA
  enddo
  enddo
  enddo
  return
--- a/AMSS_NCKU_source/fdderivs_c.C
+++ b/AMSS_NCKU_source/fdderivs_c.C
@@ -1,318 +0,0 @@
 #include "tool.h"
 void fdderivs(const int ex[3],
              const double *f,
              double *fxx, double *fxy, double *fxz,
              double *fyy, double *fyz, double *fzz,
              const double *X, const double *Y, const double *Z,
              double SYM1, double SYM2, double SYM3,
              int Symmetry, int onoff)
 {
    (void)onoff;
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
    const double F1o4   = 2.5e-1;          // 1/4
    const double F8     = 8.0;
    const double F16    = 16.0;
    const double F30    = 30.0;
    const double F1o12  = ONE / 12.0;
    const double F1o144 = ONE / 144.0;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
    const double SoA[3] = { SYM1, SYM2, SYM3 };
    /* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
    const size_t nx = (size_t)ex1 + 2;
    const size_t ny = (size_t)ex2 + 2;
    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;
    static double *fh = NULL;
    static size_t cap = 0;
    if (fh_size > cap) {
        free(fh);
        fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
        cap = fh_size;
    }
    // double *fh = (double*)malloc(fh_size * sizeof(double));
    if (!fh) return;
    symmetry_bd(2, ex, f, fh, SoA);
    /* 系数：按 Fortran 原式 */
    const double Sdxdx = ONE / (dX * dX);
    const double Sdydy = ONE / (dY * dY);
    const double Sdzdz = ONE / (dZ * dZ);
    const double Fdxdx = F1o12 / (dX * dX);
    const double Fdydy = F1o12 / (dY * dY);
    const double Fdzdz = F1o12 / (dZ * dZ);
    const double Sdxdy = F1o4 / (dX * dY);
    const double Sdxdz = F1o4 / (dX * dZ);
    const double Sdydz = F1o4 / (dY * dZ);
    const double Fdxdy = F1o144 / (dX * dY);
    const double Fdxdz = F1o144 / (dX * dZ);
    const double Fdydz = F1o144 / (dY * dZ);
    /* 只清零不被主循环覆盖的边界面 */
    {
        /* 高边界：k0=ex3-1 */
        for (int j0 = 0; j0 < ex2; ++j0)
            for (int i0 = 0; i0 < ex1; ++i0) {
                const size_t p = idx_ex(i0, j0, ex3 - 1, ex);
                fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
                fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
            }
        /* 高边界：j0=ex2-1 */
        for (int k0 = 0; k0 < ex3 - 1; ++k0)
            for (int i0 = 0; i0 < ex1; ++i0) {
                const size_t p = idx_ex(i0, ex2 - 1, k0, ex);
                fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
                fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
            }
        /* 高边界：i0=ex1-1 */
        for (int k0 = 0; k0 < ex3 - 1; ++k0)
            for (int j0 = 0; j0 < ex2 - 1; ++j0) {
                const size_t p = idx_ex(ex1 - 1, j0, k0, ex);
                fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
                fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
            }
        /* 低边界：当二阶模板也不可用时，对应 i0/j0/k0=0 面 */
        if (kminF == 1) {
            for (int j0 = 0; j0 < ex2; ++j0)
                for (int i0 = 0; i0 < ex1; ++i0) {
                    const size_t p = idx_ex(i0, j0, 0, ex);
                    fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
                    fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
                }
        }
        if (jminF == 1) {
            for (int k0 = 0; k0 < ex3; ++k0)
                for (int i0 = 0; i0 < ex1; ++i0) {
                    const size_t p = idx_ex(i0, 0, k0, ex);
                    fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
                    fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
                }
        }
        if (iminF == 1) {
            for (int k0 = 0; k0 < ex3; ++k0)
                for (int j0 = 0; j0 < ex2; ++j0) {
                    const size_t p = idx_ex(0, j0, k0, ex);
                    fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
                    fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
                }
        }
    }
    /*
     * 两段式：
     * 1) 二阶可用区域先计算二阶模板
     * 2) 高阶可用区域再覆盖四阶模板
     */
    const int i2_lo = (iminF > 0) ? iminF : 0;
    const int j2_lo = (jminF > 0) ? jminF : 0;
    const int k2_lo = (kminF > 0) ? kminF : 0;
    const int i2_hi = ex1 - 2;
    const int j2_hi = ex2 - 2;
    const int k2_hi = ex3 - 2;
    const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
    const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
    const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
    const int i4_hi = ex1 - 3;
    const int j4_hi = ex2 - 3;
    const int k4_hi = ex3 - 3;
    if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
        for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);
                    fxx[p] = Sdxdx * (
                        fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fyy[p] = Sdydy * (
                        fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fzz[p] = Sdzdz * (
                        fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                    fxy[p] = Sdxdy * (
                        fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
                        fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
                        fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
                    );
                    fxz[p] = Sdxdz * (
                        fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
                    );
                    fyz[p] = Sdydz * (
                        fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
                        fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
                    );
                }
            }
        }
    }
    if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
        for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);
                    fxx[p] = Fdxdx * (
                        -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fyy[p] = Fdydy * (
                        -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fzz[p] = Fdzdz * (
                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                    {
                        const double t_jm2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
                        const double t_jm1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
                        const double t_jp1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
                        const double t_jp2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
                        fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
                    }
                    {
                        const double t_km2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
                        const double t_km1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
                        const double t_kp1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
                        const double t_kp2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
                        fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
                    }
                    {
                        const double t_km2 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
                        const double t_km1 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
                        const double t_kp1 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
                        const double t_kp2 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
                        fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
                    }
                }
            }
        }
    }
    // free(fh);
 }
--- a/AMSS_NCKU_source/fderivs_c.C
+++ b/AMSS_NCKU_source/fderivs_c.C
@@ -1,167 +0,0 @@
 #include "tool.h"
 /*
  * C port of the Fortran routine
  *   subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
  *
  * Computes the first partial derivatives of f with centred finite
  * differences: a 2nd-order stencil wherever one neighbour on each side is
  * available, overwritten by a 4th-order stencil wherever two neighbours on
  * each side are available. Points where neither stencil fits keep the value 0.
  *
  * Conventions:
  *   f, fx, fy, fz : ex[0]*ex[1]*ex[2] values addressed through idx_ex
  *   X, Y, Z       : coordinate arrays of length ex[0], ex[1], ex[2]
  *                   (only the first two entries of each are read here)
  *   SYM1..SYM3    : per-axis factors forwarded to symmetry_bd
  *   Symmetry      : 0 = none, 1 = equatorial, larger values also enable the
  *                   x/y ghost zones
  *   onoff         : accepted for interface compatibility, not used
  *
  * NOTE: the padded work buffer is a function-local static that is reused
  * between calls, so this routine is not safe to call concurrently.
  * If the buffer cannot be allocated the routine returns without writing
  * fx, fy or fz.
  */
 void fderivs(const int ex[3],
             const double *f,
             double *fx, double *fy, double *fz,
             const double *X, const double *Y, const double *Z,
             double SYM1, double SYM2, double SYM3,
             int Symmetry, int onoff)
 {
    (void)onoff; // unused; kept so the signature matches the Fortran routine
    const double ZEO = 0.0, ONE = 1.0;
    const double TWO = 2.0, EIT = 8.0;
    const double F12 = 12.0;
    const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 is not referenced directly here
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // Grid spacing: Fortran X(2)-X(1) becomes X[1]-X[0] in C.
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    // Lowest usable 1-based (Fortran) index per axis. It drops to -1 when the
    // block touches a symmetry plane, so the ghost layers filled by
    // symmetry_bd may be used by the stencils.
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
    // SoA(1:3) = SYM1,SYM2,SYM3
    const double SoA[3] = { SYM1, SYM2, SYM3 };
    // fh holds (ex1+2)*(ex2+2)*(ex3+2) values because ord=2.
    const size_t nx = (size_t)ex1 + 2;
    const size_t ny = (size_t)ex2 + 2;
    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;
    static double *fh = NULL;
    static size_t cap = 0;
    if (fh_size > cap) {
        // Drop the old buffer and forget its capacity first, so that a failed
        // allocation is retried on the next call instead of leaving cap > 0
        // with fh == NULL.
        free(fh);
        fh = NULL;
        cap = 0;
        // aligned_alloc expects the byte count to be a multiple of the alignment.
        const size_t align = 64;
        const size_t bytes = ((fh_size * sizeof(double) + align - 1) / align) * align;
        fh = (double*)aligned_alloc(align, bytes);
        if (fh) cap = fh_size;
    }
    if (!fh) return;
    // call symmetry_bd(2,ex,f,fh,SoA)
    symmetry_bd(2, ex, f, fh, SoA);
    const double d12dx = ONE / F12 / dX;
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;
    const double d2dx  = ONE / TWO / dX;
    const double d2dy  = ONE / TWO / dY;
    const double d2dz  = ONE / TWO / dZ;
    // fx = fy = fz = 0
    const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
    for (size_t p = 0; p < all; ++p) {
        fx[p] = ZEO;
        fy[p] = ZEO;
        fz[p] = ZEO;
    }
    /*
     * Two passes:
     *   1) apply the 2nd-order stencil wherever it fits;
     *   2) overwrite with the 4th-order stencil wherever that fits.
     *
     * Per the original author's note this is equivalent to the per-point
     * if/elseif selection of the Fortran code, without the per-point branch.
     *
     * Bounds are 0-based and inclusive. With iF = i0 + 1, the 2nd-order pass
     * needs iF-1 >= iminF and iF+1 <= ex1; the 4th-order pass needs
     * iF-2 >= iminF and iF+2 <= ex1 (likewise for j and k).
     */
    const int i2_lo = (iminF > 0) ? iminF : 0;
    const int j2_lo = (jminF > 0) ? jminF : 0;
    const int k2_lo = (kminF > 0) ? kminF : 0;
    const int i2_hi = ex1 - 2;
    const int j2_hi = ex2 - 2;
    const int k2_hi = ex3 - 2;
    const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
    const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
    const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
    const int i4_hi = ex1 - 3;
    const int j4_hi = ex2 - 3;
    const int k4_hi = ex3 - 3;
    if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
        for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);
                    fx[p] = d2dx * (
                        -fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
                         fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fy[p] = d2dy * (
                        -fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
                         fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fz[p] = d2dz * (
                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
                         fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                }
            }
        }
    }
    if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
        for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);
                    fx[p] = d12dx * (
                        fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] -
                        EIT * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
                        EIT * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)]
                    );
                    fy[p] = d12dy * (
                        fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] -
                        EIT * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
                        EIT * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)]
                    );
                    fz[p] = d12dz * (
                        fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] -
                        EIT * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
                        EIT * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)] -
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)]
                    );
                }
            }
        }
    }
    // fh is deliberately kept alive for reuse by the next call.
 }
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -324,6 +324,7 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -349,6 +350,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -377,6 +379,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -883,17 +886,14 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
-!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
+  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
   enddo
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
   do i=0,ord-1
      funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
   enddo
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
   do i=0,ord-1
      funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
   enddo
@@ -912,6 +912,7 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -940,6 +941,7 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
  integer::i
  funcc = 0.d0
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
@@ -1115,169 +1117,27 @@ end subroutine d2dump
 !------------------------------------------------------------------------------
 ! Lagrangian polynomial interpolation
 !------------------------------------------------------------------------------
 #ifndef POLINT6_USE_BARYCENTRIC
 #define POLINT6_USE_BARYCENTRIC 1
 #endif
 !DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
  subroutine polint6_neville(xa, ya, x, y, dy)
  ! Interpolate the six samples (xa(i), ya(i)) at the point x.
  !
  ! The value is built up from the sample nearest to x by adding successive
  ! corrections taken from the c/d difference tables (this appears to follow
  ! the classic Neville-type "polint" scheme).
  !
  !   xa, ya : the six abscissae and ordinates
  !   x      : evaluation point
  !   y      : interpolated value
  !   dy     : the last correction that was added to y
  !
  ! Stops the program if two of the abscissae used in a difference coincide.
  implicit none
  real*8, dimension(6), intent(in) :: xa, ya
  real*8, intent(in) :: x
  real*8, intent(out) :: y, dy
  integer :: i, m, ns, n_m
  real*8, dimension(6) :: c, d, ho
  real*8 :: dif, dift, hp, h, den_val
  c = ya
  d = ya
  ! ho(i) is the signed offset of node i from the evaluation point
  ho = xa - x
  ! find the node closest to x
  ns = 1
  dif = abs(x - xa(1))
  do i = 2, 6
    dift = abs(x - xa(i))
    if (dift < dif) then
      ns = i
      dif = dift
    end if
  end do
  ! start from the nearest sample; ns now indexes the table entry below it
  y = ya(ns)
  ns = ns - 1
  ! one pass per table column; each pass shortens the tables by one entry
  do m = 1, 5
    n_m = 6 - m
    do i = 1, n_m
      hp = ho(i)
      h  = ho(i+m)
      ! hp - h equals xa(i) - xa(i+m): zero means duplicated nodes
      den_val = hp - h
      if (den_val == 0.0d0) then
        write(*,*) 'failure in polint for point',x
        write(*,*) 'with input points: ',xa
        stop
      end if
      den_val = (c(i+1) - d(i)) / den_val
      d(i) = h * den_val
      c(i) = hp * den_val
    end do
    ! choose the correction from c or d depending on where ns sits in the
    ! remaining table; taking d also moves ns down by one
    if (2 * ns < n_m) then
      dy = c(ns + 1)
    else
      dy = d(ns)
      ns = ns - 1
    end if
    y = y + dy
  end do
  return
  end subroutine polint6_neville
 !DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric
  subroutine polint6_barycentric(xa, ya, x, y, dy)
  ! Interpolate the six samples (xa(i), ya(i)) at x with the barycentric
  ! formula  y = sum_i (w_i/(x-xa(i))) * ya(i) / sum_i (w_i/(x-xa(i))).
  !
  !   xa, ya : the six abscissae and ordinates
  !   x      : evaluation point
  !   y      : interpolated value
  !   dy     : always set to 0 (this routine produces no error estimate)
  !
  ! Stops the program if two abscissae coincide (non-uniform path only).
  implicit none
  real*8, dimension(6), intent(in) :: xa, ya
  real*8, intent(in) :: x
  real*8, intent(out) :: y, dy
  integer :: i, j
  logical :: is_uniform
  real*8, dimension(6) :: lambda
  real*8 :: dx, den_i, term, num, den, step, tol
  ! fixed weights for equally spaced nodes: (-1)**i * C(5,i-1), i = 1..6
  real*8, parameter :: c_uniform(6) = (/ -1.d0, 5.d0, -10.d0, 10.d0, -5.d0, 1.d0 /)
  ! x exactly on a node: return that sample and avoid dividing by zero below
  do i = 1, 6
    if (x == xa(i)) then
      y = ya(i)
      dy = 0.d0
      return
    end if
  end do
  ! treat the nodes as uniform when every spacing matches the first one
  ! to within tol; a zero first spacing is never considered uniform
  step = xa(2) - xa(1)
  is_uniform = (step /= 0.d0)
  if (is_uniform) then
    tol = 64.d0 * epsilon(1.d0) * max(1.d0, abs(step))
    do i = 3, 6
      if (abs((xa(i) - xa(i-1)) - step) > tol) then
        is_uniform = .false.
        exit
      end if
    end do
  end if
  ! uniform nodes: use the precomputed weights
  if (is_uniform) then
    num = 0.d0
    den = 0.d0
    do i = 1, 6
      term = c_uniform(i) / (x - xa(i))
      num = num + term * ya(i)
      den = den + term
    end do
    y = num / den
    dy = 0.d0
    return
  end if
  ! general nodes: lambda(i) = 1 / prod_{j/=i} (xa(i) - xa(j))
  do i = 1, 6
    den_i = 1.d0
    do j = 1, 6
      if (j /= i) then
        dx = xa(i) - xa(j)
        if (dx == 0.0d0) then
          write(*,*) 'failure in polint for point',x
          write(*,*) 'with input points: ',xa
          stop
        end if
        den_i = den_i * dx
      end if
    end do
    lambda(i) = 1.d0 / den_i
  end do
  num = 0.d0
  den = 0.d0
  do i = 1, 6
    term = lambda(i) / (x - xa(i))
    num = num + term * ya(i)
    den = den + term
  end do
  y = num / den
  dy = 0.d0
  return
  end subroutine polint6_barycentric
 !DIR$ ATTRIBUTES FORCEINLINE :: polint
  subroutine polint(xa,ya,x,y,dy,ordn)
  implicit none
 !~~~~~~> Input Parameter:
  integer,intent(in) :: ordn
  real*8, dimension(ordn), intent(in) :: xa,ya
  real*8, intent(in) :: x
  real*8, intent(out) :: y,dy
-  integer :: i, m, ns, n_m
+!~~~~~~> Other parameter:
  real*8, dimension(ordn) :: c, d, ho
  real*8 :: dif, dift, hp, h, den_val
-  if (ordn == 6) then
+  integer :: m,n,ns
-#if POLINT6_USE_BARYCENTRIC
+  real*8, dimension(ordn) :: c,d,den,ho
-    call polint6_barycentric(xa, ya, x, y, dy)
+  real*8 :: dif,dift
-#else
+
-    call polint6_neville(xa, ya, x, y, dy)
+!~~~~~~>
-#endif
+
-    return
+  n=ordn
-  end if
+  m=ordn
  c=ya
  d=ya
@@ -1285,38 +1145,27 @@ end subroutine d2dump
  ns=1
  dif=abs(x-xa(1))
-
+  do m=1,n
-  do i = 2, ordn
+   dift=abs(x-xa(m))
    dift = abs(x - xa(i))
   if(dift < dif) then
-      ns = i
+    ns=m
    dif=dift
   end if
  end do
  y=ya(ns)
  ns=ns-1
-
+  do m=1,n-1
-  do m = 1, ordn - 1
+    den(1:n-m)=ho(1:n-m)-ho(1+m:n)
-    n_m = ordn - m
+    if (any(den(1:n-m) == 0.0))then
    do i = 1, n_m
      hp = ho(i)
      h  = ho(i+m)
      den_val = hp - h
      if (den_val == 0.0d0) then
      write(*,*) 'failure in polint for point',x
      write(*,*) 'with input points: ',xa
      stop
    endif
-
+    den(1:n-m)=(c(2:n-m+1)-d(1:n-m))/den(1:n-m)
-      den_val = (c(i+1) - d(i)) / den_val
+    d(1:n-m)=ho(1+m:n)*den(1:n-m)
-
+    c(1:n-m)=ho(1:n-m)*den(1:n-m)
-      d(i) = h * den_val
+    if (2*ns < n-m) then
      c(i) = hp * den_val
    end do
    if (2 * ns < n_m) then
      dy=c(ns+1)
    else
      dy=d(ns)
@@ -1326,79 +1175,43 @@ end subroutine d2dump
  end do
  return
  end subroutine polint
 !------------------------------------------------------------------------------
 ! Compute Lagrange interpolation basis weights for one target point.
 !------------------------------------------------------------------------------
 !DIR$ ATTRIBUTES FORCEINLINE :: polint_lagrange_weights
  subroutine polint_lagrange_weights(xa, x, w, ordn)
  ! Evaluate the ordn Lagrange basis polynomials of the nodes xa at x:
  !   w(i) = prod_{j/=i} (x - xa(j)) / (xa(i) - xa(j))
  !
  !   xa   : interpolation nodes
  !   x    : evaluation point
  !   w    : resulting basis weights, one per node
  !   ordn : number of nodes
  !
  ! Stops the program if two nodes coincide.
  implicit none
  integer, intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: xa
  real*8, intent(in) :: x
  real*8, dimension(1:ordn), intent(out) :: w
  integer :: i, j
  real*8 :: num, den, dx
  do i = 1, ordn
    ! numerator and denominator are accumulated separately and divided once
    num = 1.d0
    den = 1.d0
    do j = 1, ordn
      if (j /= i) then
        dx = xa(i) - xa(j)
        if (dx == 0.0d0) then
          write(*,*) 'failure in polint for point',x
          write(*,*) 'with input points: ',xa
          stop
        end if
        num = num * (x - xa(j))
        den = den * dx
      end if
    end do
    w(i) = num / den
  end do
  return
  end subroutine polint_lagrange_weights
 !------------------------------------------------------------------------------
 !
 ! interpolation in 2 dimensions, follow yx order
 !
 !------------------------------------------------------------------------------
  subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
  implicit none
 !~~~~~~> Input parameters:
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a
  real*8, dimension(1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2
  real*8, intent(out) :: y,dy
-#ifdef POLINT_LEGACY_ORDER
+!~~~~~~> Other parameters:
  integer  :: i,m
  real*8, dimension(ordn) :: ymtmp
  real*8, dimension(ordn) :: yntmp
  m=size(x1a)
  do i=1,m
    yntmp=ya(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
  end do
  call polint(x1a,ymtmp,x1,y,dy,ordn)
 #else
  integer  :: j
  real*8, dimension(ordn) :: ymtmp
  real*8 :: dy_temp
  do j=1,ordn
    call polint(x1a, ya(:,j), x1, ymtmp(j), dy_temp, ordn)
  end do
-  call polint(x2a, ymtmp, x2, y, dy, ordn)
+
-#endif
+  call polint(x1a,ymtmp,x1,y,dy,ordn)
  return
  end subroutine polin2
 !------------------------------------------------------------------------------
 !
@@ -1406,15 +1219,18 @@ end subroutine d2dump
 !
 !------------------------------------------------------------------------------
  subroutine polin3(x1a,x2a,x3a,ya,x1,x2,x3,y,dy,ordn)
  implicit none
 !~~~~~~> Input parameters:
  integer,intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: x1a,x2a,x3a
  real*8, dimension(1:ordn,1:ordn,1:ordn), intent(in) :: ya
  real*8, intent(in) :: x1,x2,x3
  real*8, intent(out) :: y,dy
-#ifdef POLINT_LEGACY_ORDER
+!~~~~~~> Other parameters:
  integer  :: i,j,m,n
  real*8, dimension(ordn,ordn) :: yatmp
  real*8, dimension(ordn) :: ymtmp
@@ -1423,40 +1239,24 @@ end subroutine d2dump
  m=size(x1a)
  n=size(x2a)
  do i=1,m
   do j=1,n
    yqtmp=ya(i,j,:)
    call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
   end do
    yntmp=yatmp(i,:)
    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
  end do
  call polint(x1a,ymtmp,x1,y,dy,ordn)
 #else
  integer  :: i, j, k
  real*8, dimension(ordn) :: w1, w2
  real*8, dimension(ordn) :: ymtmp
  real*8 :: yx_sum, x_sum
  call polint_lagrange_weights(x1a, x1, w1, ordn)
  call polint_lagrange_weights(x2a, x2, w2, ordn)
  do k = 1, ordn
    yx_sum = 0.d0
    do j = 1, ordn
      x_sum = 0.d0
      do i = 1, ordn
        x_sum = x_sum + w1(i) * ya(i,j,k)
      end do
      yx_sum = yx_sum + w2(j) * x_sum
    end do
    ymtmp(k) = yx_sum
  end do
  call polint(x3a, ymtmp, x3, y, dy, ordn)
 #endif
  return
  end subroutine polin3
 !--------------------------------------------------------------------------------------
 ! calculate L2norm
@@ -1801,11 +1601,8 @@ deallocate(f_flat)
 ! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3
  real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
  integer :: i,j,k
-  do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1))
+  fout = C1*f1+C2*f2+C3*f3
    fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
  end do
  return
--- a/AMSS_NCKU_source/interp_lb_profile.C
+++ b/AMSS_NCKU_source/interp_lb_profile.C
@@ -1,107 +0,0 @@
 #include "interp_lb_profile.h"
 #include <cstdio>
 #include <cstring>
 #include <algorithm>
 namespace InterpLBProfile {
 // Write a profile file: one ProfileHeader, then nprocs doubles (rank_times),
 // then num_heavy ints (heavy_ranks), all in native binary layout.
 //
 // Returns true only if the file was opened, every record was written in
 // full and the file was closed without error. Returns false on invalid
 // arguments or on any I/O failure; a partially written file may be left
 // behind in that case.
 bool write_profile(const char *filepath, int nprocs,
                   const double *rank_times,
                   const int *heavy_ranks, int num_heavy,
                   double threshold_ratio)
 {
    // Reject arguments that would turn into huge size_t counts or null reads.
    if (!filepath || nprocs < 0 || num_heavy < 0) return false;
    if ((nprocs > 0 && !rank_times) || (num_heavy > 0 && !heavy_ranks)) return false;
    FILE *fp = fopen(filepath, "wb");
    if (!fp) return false;
    ProfileHeader hdr;
    hdr.magic = MAGIC;
    hdr.version = VERSION;
    hdr.nprocs = nprocs;
    hdr.num_heavy = num_heavy;
    hdr.threshold_ratio = threshold_ratio;
    bool ok = (fwrite(&hdr, sizeof(hdr), 1, fp) == 1);
    if (ok && nprocs > 0)
        ok = (fwrite(rank_times, sizeof(double), (size_t)nprocs, fp) == (size_t)nprocs);
    if (ok && num_heavy > 0)
        ok = (fwrite(heavy_ranks, sizeof(int), (size_t)num_heavy, fp) == (size_t)num_heavy);
    // fclose flushes buffered data, so its result is part of the write status.
    if (fclose(fp) != 0) ok = false;
    return ok;
 }
 // Read a profile file on rank 0 of comm and broadcast its contents.
 //
 // The file is accepted only if the header's magic, version and nprocs match
 // (nprocs must equal current_nprocs), its num_heavy lies in
 // [0, current_nprocs], and both payload arrays can be read in full.
 //
 // On success every rank receives num_heavy, heavy_ranks[0..num_heavy) and
 // rank_times[0..current_nprocs) and the function returns true. Otherwise it
 // returns false on every rank.
 //
 // NOTE(review): heavy_ranks is assumed to have room for up to
 // current_nprocs entries; confirm against the callers.
 bool read_profile(const char *filepath, int current_nprocs,
                  int *heavy_ranks, int &num_heavy,
                  double *rank_times, MPI_Comm comm)
 {
    int myrank;
    MPI_Comm_rank(comm, &myrank);
    int valid = 0;
    ProfileHeader hdr;
    memset(&hdr, 0, sizeof(hdr));
    if (myrank == 0) {
        FILE *fp = fopen(filepath, "rb");
        if (fp) {
            const bool header_ok =
                fread(&hdr, sizeof(hdr), 1, fp) == 1 &&
                hdr.magic == MAGIC && hdr.version == VERSION &&
                hdr.nprocs == current_nprocs;
            if (!header_ok) {
                printf("[InterpLB] Profile rejected: magic=0x%X version=%u "
                       "nprocs=%d (current=%d)\n",
                       hdr.magic, hdr.version, hdr.nprocs, current_nprocs);
            } else if (hdr.num_heavy < 0 || hdr.num_heavy > current_nprocs) {
                // num_heavy comes straight from the file; using it unchecked
                // as a read count could overrun heavy_ranks.
                printf("[InterpLB] Profile rejected: num_heavy=%d out of range "
                       "(nprocs=%d)\n",
                       hdr.num_heavy, current_nprocs);
            } else if (fread(rank_times, sizeof(double), current_nprocs, fp)
                           == (size_t)current_nprocs &&
                       fread(heavy_ranks, sizeof(int), hdr.num_heavy, fp)
                           == (size_t)hdr.num_heavy) {
                num_heavy = hdr.num_heavy;
                valid = 1;
            }
            fclose(fp);
        }
    }
    // Every rank must learn the outcome before joining the payload broadcasts.
    MPI_Bcast(&valid, 1, MPI_INT, 0, comm);
    if (!valid) return false;
    MPI_Bcast(&num_heavy, 1, MPI_INT, 0, comm);
    MPI_Bcast(heavy_ranks, num_heavy, MPI_INT, 0, comm);
    MPI_Bcast(rank_times, current_nprocs, MPI_DOUBLE, 0, comm);
    return true;
 }
 // Select the ranks whose time exceeds threshold_ratio * mean(rank_times).
 //
 // The selected rank indices are written to heavy_ranks in order of
 // decreasing time (equal times are ordered by increasing rank, so the
 // result is deterministic). At most max_heavy entries are written.
 //
 // Returns the number of entries written. Returns 0 without touching
 // heavy_ranks when nprocs or max_heavy is not positive or a pointer is null.
 int identify_heavy_ranks(const double *rank_times, int nprocs,
                         double threshold_ratio,
                         int *heavy_ranks, int max_heavy)
 {
    if (nprocs <= 0 || max_heavy <= 0 || !rank_times || !heavy_ranks)
        return 0;
    double sum = 0;
    for (int i = 0; i < nprocs; i++) sum += rank_times[i];
    const double mean = sum / nprocs;
    const double threshold = threshold_ratio * mean;
    // Collect candidates
    struct RankTime { int rank; double time; };
    RankTime *candidates = new RankTime[nprocs];
    int ncand = 0;
    for (int i = 0; i < nprocs; i++) {
        if (rank_times[i] > threshold) {
            candidates[ncand].rank = i;
            candidates[ncand].time = rank_times[i];
            ++ncand;
        }
    }
    // Sort descending by time; break ties by rank so the order is reproducible
    std::sort(candidates, candidates + ncand,
              [](const RankTime &a, const RankTime &b) {
                  return a.time > b.time ||
                         (a.time == b.time && a.rank < b.rank);
              });
    const int count = (ncand < max_heavy) ? ncand : max_heavy;
    for (int i = 0; i < count; i++)
        heavy_ranks[i] = candidates[i].rank;
    delete[] candidates;
    return count;
 }
 } // namespace InterpLBProfile
--- a/AMSS_NCKU_source/interp_lb_profile.bin
+++ b/AMSS_NCKU_source/interp_lb_profile.bin
--- a/AMSS_NCKU_source/interp_lb_profile.h
+++ b/AMSS_NCKU_source/interp_lb_profile.h
@@ -1,38 +0,0 @@
 #ifndef INTERP_LB_PROFILE_H
 #define INTERP_LB_PROFILE_H
 #include <mpi.h>
 // Helpers for reading and writing a per-rank timing profile and for
 // picking out the "heavy" (slowest) ranks from it.
 namespace InterpLBProfile {
 // File signature stored in ProfileHeader::magic.
 static const unsigned int MAGIC   = 0x494C4250; // "ILBP"
 // File format version stored in ProfileHeader::version.
 static const unsigned int VERSION = 1;
 // Fixed-size record at the start of a profile file.
 struct ProfileHeader {
    unsigned int magic;        // expected to equal MAGIC
    unsigned int version;      // expected to equal VERSION
    int nprocs;                // number of rank_times entries that follow
    int num_heavy;             // number of heavy_ranks entries that follow
    double threshold_ratio;    // ratio passed to write_profile
 };
 // Write profile file (rank 0 only)
 bool write_profile(const char *filepath, int nprocs,
                   const double *rank_times,
                   const int *heavy_ranks, int num_heavy,
                   double threshold_ratio);
 // Read profile file (rank 0 reads, then broadcasts to all)
 // Returns true if file found and valid for current nprocs
 bool read_profile(const char *filepath, int current_nprocs,
                  int *heavy_ranks, int &num_heavy,
                  double *rank_times, MPI_Comm comm);
 // Identify heavy ranks: those with time > threshold_ratio * mean
 // Writes at most max_heavy rank indices and returns how many were written.
 int identify_heavy_ranks(const double *rank_times, int nprocs,
                         double threshold_ratio,
                         int *heavy_ranks, int max_heavy);
 } // namespace InterpLBProfile
 #endif /* INTERP_LB_PROFILE_H */
--- a/AMSS_NCKU_source/interp_lb_profile_data.h
+++ b/AMSS_NCKU_source/interp_lb_profile_data.h
@@ -1,29 +0,0 @@
 /* This header is generated automatically by the custom profiling framework; it is not a hand-written, case-specific hard-coded optimization. */
 /* Update: the load-balancing problem has since been solved by optimizing the interpolation routine. This profile-based static balancing scheme is deprecated and this header is no longer part of the build. */
 /* Auto-generated from interp_lb_profile.bin — do not edit */
 #ifndef INTERP_LB_PROFILE_DATA_H
 #define INTERP_LB_PROFILE_DATA_H
 #define INTERP_LB_NPROCS 64
 #define INTERP_LB_NUM_HEAVY 4
 static const int interp_lb_heavy_blocks[4] = {27, 35, 28, 36};
 /* Split table: {block_id, r_left, r_right} */
 static const int interp_lb_splits[4][3] = {
    {27, 26, 27},
    {35, 34, 35},
    {28, 28, 29},
    {36, 36, 37},
 };
 /* Rank remap for displaced neighbor blocks */
 static const int interp_lb_num_remaps = 4;
 static const int interp_lb_remaps[][2] = {
    {26, 25},
    {29, 30},
    {34, 33},
    {37, 38},
 };
 #endif /* INTERP_LB_PROFILE_DATA_H */
--- a/AMSS_NCKU_source/kodiss.f90
+++ b/AMSS_NCKU_source/kodiss.f90
@@ -65,8 +65,6 @@ real*8,intent(in) :: eps
 !                       dx^4
 !  note the sign (-1)^r-1, now r=2
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
 !DIR$ UNROLL PARTIAL(4)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
--- a/AMSS_NCKU_source/kodiss_c.C
+++ b/AMSS_NCKU_source/kodiss_c.C
@@ -1,117 +0,0 @@
 #include "tool.h"
 /*
  * C port of the Fortran routine
  *   subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps)
  *
  * Adds a seven-point dissipation term along each axis to f_rhs at every
  * point where the full stencil fits; other points are left untouched.
  *
  * Conventions:
  *   X, Y, Z  : coordinate arrays of length ex[0], ex[1], ex[2]
  *              (only the first two entries of each are read here)
  *   f, f_rhs : ex[0]*ex[1]*ex[2] values addressed through idx_ex
  *   SoA[3]   : per-axis factors forwarded to symmetry_bd
  *   eps      : dissipation strength
  *
  * If the padded work buffer cannot be allocated, f_rhs is left unchanged.
  */
 void kodis(const int ex[3],
           const double *X, const double *Y, const double *Z,
           const double *f, double *f_rhs,
           const double SoA[3],
           int Symmetry, double eps)
 {
    // Stencil weights 1, -6, 15, -20, 15, -6, 1 and the 2^6 normalisation.
    const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
    const double cof = 64.0;
    const int NO_SYMM = 0, OCTANT = 2;
    const int n1 = ex[0], n2 = ex[1], n3 = ex[2];

    // Grid spacing: Fortran X(2)-X(1) becomes X[1]-X[0] in C.
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];

    // Lowest usable 1-based (Fortran) index per axis: 1 by default, -2 when
    // the block touches a symmetry plane and the ghost layers may be used.
    const int iminF = (Symmetry == OCTANT && fabs(X[0]) < dX) ? -2 : 1;
    const int jminF = (Symmetry == OCTANT && fabs(Y[0]) < dY) ? -2 : 1;
    const int kminF = (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) ? -2 : 1;

    // Padded copy of f: (n1+3)*(n2+3)*(n3+3) values, matching ord=3.
    const size_t fh_size =
        ((size_t)n1 + 3) * ((size_t)n2 + 3) * ((size_t)n3 + 3);
    double *fh = (double*)malloc(fh_size * sizeof(double));
    if (!fh) return;

    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
    symmetry_bd(3, ex, f, fh, SoA);

    // Inclusive 0-based loop bounds. With iF = i0 + 1 the stencil needs
    //   iF - 3 >= iminF  =>  i0 >= iminF + 2
    //   iF + 3 <= n1     =>  i0 <= n1 - 4
    // (likewise for j and k). An empty range simply skips the loops.
    const int i0_lo = (iminF + 2 < 0) ? 0 : iminF + 2;
    const int j0_lo = (jminF + 2 < 0) ? 0 : jminF + 2;
    const int k0_lo = (kminF + 2 < 0) ? 0 : kminF + 2;
    const int i0_hi = n1 - 4;
    const int j0_hi = n2 - 4;
    const int k0_hi = n3 - 4;

    // Common prefactor of the update, identical for every point.
    const double scale = eps / cof;

    if (i0_lo <= i0_hi && j0_lo <= j0_hi && k0_lo <= k0_hi) {
        for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
                    const int iF = i0 + 1;
                    const double centre = fh[idx_fh_F(iF, jF, kF, ex)];

                    // Neighbours along x
                    const double xm3 = fh[idx_fh_F(iF - 3, jF, kF, ex)];
                    const double xm2 = fh[idx_fh_F(iF - 2, jF, kF, ex)];
                    const double xm1 = fh[idx_fh_F(iF - 1, jF, kF, ex)];
                    const double xp1 = fh[idx_fh_F(iF + 1, jF, kF, ex)];
                    const double xp2 = fh[idx_fh_F(iF + 2, jF, kF, ex)];
                    const double xp3 = fh[idx_fh_F(iF + 3, jF, kF, ex)];
                    const double Dx_term =
                        ( (xm3 + xp3) - SIX * (xm2 + xp2) +
                          FIT * (xm1 + xp1) - TWT * centre ) / dX;

                    // Neighbours along y
                    const double ym3 = fh[idx_fh_F(iF, jF - 3, kF, ex)];
                    const double ym2 = fh[idx_fh_F(iF, jF - 2, kF, ex)];
                    const double ym1 = fh[idx_fh_F(iF, jF - 1, kF, ex)];
                    const double yp1 = fh[idx_fh_F(iF, jF + 1, kF, ex)];
                    const double yp2 = fh[idx_fh_F(iF, jF + 2, kF, ex)];
                    const double yp3 = fh[idx_fh_F(iF, jF + 3, kF, ex)];
                    const double Dy_term =
                        ( (ym3 + yp3) - SIX * (ym2 + yp2) +
                          FIT * (ym1 + yp1) - TWT * centre ) / dY;

                    // Neighbours along z
                    const double zm3 = fh[idx_fh_F(iF, jF, kF - 3, ex)];
                    const double zm2 = fh[idx_fh_F(iF, jF, kF - 2, ex)];
                    const double zm1 = fh[idx_fh_F(iF, jF, kF - 1, ex)];
                    const double zp1 = fh[idx_fh_F(iF, jF, kF + 1, ex)];
                    const double zp2 = fh[idx_fh_F(iF, jF, kF + 2, ex)];
                    const double zp3 = fh[idx_fh_F(iF, jF, kF + 3, ex)];
                    const double Dz_term =
                        ( (zm3 + zp3) - SIX * (zm2 + zp2) +
                          FIT * (zm1 + zp1) - TWT * centre ) / dZ;

                    // Fortran:
                    // f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
                    f_rhs[idx_ex(i0, j0, k0, ex)] +=
                        scale * (Dx_term + Dy_term + Dz_term);
                }
            }
        }
    }
    free(fh);
 }
--- a/AMSS_NCKU_source/lopsided_c.C
+++ b/AMSS_NCKU_source/lopsided_c.C
@@ -1,255 +0,0 @@
 #include "tool.h"
 /*
 * lopsided: add the advection term
 *     Sfx * d(f)/dx + Sfy * d(f)/dy + Sfz * d(f)/dz
 * to f_rhs, using lopsided finite-difference stencils whose bias follows
 * the sign of the Sf* coefficient at each point.
 *
 * A C implementation of symmetry_bd (or a C binding to the Fortran routine)
 * must be available; it is called like the Fortran original:
 *     call symmetry_bd(3,ex,f,fh,SoA)
 *
 * Conventions:
 *   nghost = 3
 *   ex[3]  = {ex1,ex2,ex3}, the grid extents
 *   X,Y,Z  = coordinate arrays; only the first two entries of each are read
 *            here (grid spacing and distance of the first point from 0)
 *   f      = original grid, ex1*ex2*ex3 values
 *   fh     = extended grid, (ex1+3)*(ex2+3)*(ex3+3) values, corresponding to
 *            the Fortran bounds (-2:ex1, -2:ex2, -2:ex3); filled by symmetry_bd
 *   f_rhs  = updated in place
 *   SoA[3] = passed through to symmetry_bd unchanged
 *
 * Only points with 0-based index 0..ex-2 in each direction are updated,
 * matching the Fortran loops "do i=1,ex(1)-1".
 *
 * NOTE: if the temporary buffer cannot be allocated the function returns
 * without touching f_rhs and without reporting the failure.
 */
 void lopsided(const int ex[3],
              const double *X, const double *Y, const double *Z,
              const double *f, double *f_rhs,
              const double *Sfx, const double *Sfy, const double *Sfz,
              int Symmetry, const double SoA[3])
 {
    const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
    const double F6 = 6.0, F18 = 18.0;
    const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // Fortran: dX = X(2)-X(1) (1-based)  ->  C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    const double d12dx = ONE / F12 / dX;
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;
    // Fortran: imin=jmin=kmin=1; lowered to -2 when the symmetry condition
    // holds, so that the points filled in by symmetry_bd may be used.
    // These stay Fortran (1-based) index values.
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
    // Allocate fh: (ex1+3)*(ex2+3)*(ex3+3) doubles
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;
    double *fh = (double*)malloc(fh_size * sizeof(double));
    if (!fh) return; // out of memory: silent return, f_rhs left unchanged
    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
    symmetry_bd(3, ex, f, fh, SoA);
    /*
     * Fortran main loop:
     *   do k=1,ex(3)-1
     *   do j=1,ex(2)-1
     *   do i=1,ex(1)-1
     *
     * In 0-based C indices:
     *   k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2
     *
     * fh is still addressed with Fortran index values:
     *   iF=i0+1, jF=j0+1, kF=k0+1
     */
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                // ---------------- x direction ----------------
                const double sfx = Sfx[p];
                if (sfx > ZEO) {
                    // Fortran: if(i+3 <= imax)
                    // iF+3 <= ex1  <=> i0+4 <= ex1 <=> i0 <= ex1-4
                    if (i0 <= ex1 - 4) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
                    // elseif(i+2 <= imax)  <=> i0 <= ex1-3
                    else if (i0 <= ex1 - 3) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    }
                    // elseif(i+1 <= imax)  <=> i0 <= ex1-2 (always true inside this loop)
                    else if (i0 <= ex1 - 2) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                } else if (sfx < ZEO) {
                    // Fortran: if(i-3 >= imin)
                    // (iF-3) >= iminF  <=> (i0-2) >= iminF
                    if ((i0 - 2) >= iminF) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                    // elseif(i-2 >= imin) <=> (i0-1) >= iminF
                    else if ((i0 - 1) >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    }
                    // elseif(i-1 >= imin) <=> i0 >= iminF
                    else if (i0 >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
                }
                // ---------------- y direction ----------------
                const double sfy = Sfy[p];
                if (sfy > ZEO) {
                    // jF+3 <= ex2 <=> j0+4 <= ex2 <=> j0 <= ex2-4
                    if (j0 <= ex2 - 4) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    } else if (j0 <= ex2 - 3) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 <= ex2 - 2) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    }
                } else if (sfy < ZEO) {
                    if ((j0 - 2) >= jminF) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    } else if ((j0 - 1) >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    }
                }
                // ---------------- z direction ----------------
                const double sfz = Sfz[p];
                if (sfz > ZEO) {
                    if (k0 <= ex3 - 4) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    } else if (k0 <= ex3 - 3) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 <= ex3 - 2) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    }
                } else if (sfz < ZEO) {
                    if ((k0 - 2) >= kminF) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    } else if ((k0 - 1) >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    }
                }
            }
        }
    }
    free(fh);
 }
--- a/AMSS_NCKU_source/lopsided_kodis_c.C
+++ b/AMSS_NCKU_source/lopsided_kodis_c.C
@@ -1,248 +0,0 @@
 #include "tool.h"
 /*
 * Combined advection (lopsided) + KO dissipation (kodis).
 * Uses one shared symmetry_bd buffer per call.
 *
 * Parameters:
 *   ex[3]        grid extents {ex1,ex2,ex3}
 *   X,Y,Z        coordinate arrays; only entries [0] and [1] are read here
 *                (grid spacing and distance of the first point from 0)
 *   f            input field, handed to symmetry_bd to fill the buffer fh
 *   f_rhs        right-hand side, updated in place
 *   Sfx,Sfy,Sfz  per-point advection coefficients; their sign selects the stencil
 *   Symmetry     symmetry type, compared against NO_SYMM / EQ_SYMM below
 *   SoA[3]       passed through to symmetry_bd unchanged
 *   eps          dissipation strength; the KO term is added only when eps > 0
 *
 * NOTE: if the temporary buffer cannot be allocated the function returns
 * without touching f_rhs and without reporting the failure.
 */
 void lopsided_kodis(const int ex[3],
                    const double *X, const double *Y, const double *Z,
                    const double *f, double *f_rhs,
                    const double *Sfx, const double *Sfy, const double *Sfz,
                    int Symmetry, const double SoA[3], double eps)
 {
    // Stencil coefficients for the advection part
    const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
    const double F6 = 6.0, F18 = 18.0;
    const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
    // Stencil coefficients for the dissipation part
    const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
    const double cof = 64.0; // 2^6
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // Grid spacing taken from the first two coordinates in each direction
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    const double d12dx = ONE / F12 / dX;
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;
    // Upper index limits, as Fortran (1-based) index values
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
    // Lower index limits, as Fortran index values: 1 by default, lowered to -2
    // when the symmetry condition holds so the points filled in by symmetry_bd
    // may be used by the stencils.
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
    // fh for Fortran-style domain (-2:ex1,-2:ex2,-2:ex3)
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;
    double *fh = (double*)malloc(fh_size * sizeof(double));
    if (!fh) return; // out of memory: silent return, f_rhs left unchanged
    // Single symmetry_bd call; fh is shared by advection and dissipation
    symmetry_bd(3, ex, f, fh, SoA);
    // Advection (same stencil logic as lopsided_c.C)
    // 0-based loop indices i0/j0/k0 cover 0..ex-2; fh is addressed with the
    // Fortran index values iF=i0+1, jF=j0+1, kF=k0+1.
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                // ---- x direction ----
                const double sfx = Sfx[p];
                if (sfx > ZEO) {
                    // i0 <= ex1-4  <=>  Fortran i+3 <= imax
                    if (i0 <= ex1 - 4) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    } else if (i0 <= ex1 - 3) {
                        // Fortran i+2 <= imax: symmetric 4-point stencil
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    } else if (i0 <= ex1 - 2) {
                        // Fortran i+1 <= imax (always true inside this loop):
                        // mirrored stencil, hence the subtraction
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                } else if (sfx < ZEO) {
                    // (i0-2) >= iminF  <=>  Fortran i-3 >= imin
                    if ((i0 - 2) >= iminF) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    } else if ((i0 - 1) >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    } else if (i0 >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
                }
                // ---- y direction (same branch structure as x) ----
                const double sfy = Sfy[p];
                if (sfy > ZEO) {
                    if (j0 <= ex2 - 4) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    } else if (j0 <= ex2 - 3) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 <= ex2 - 2) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    }
                } else if (sfy < ZEO) {
                    if ((j0 - 2) >= jminF) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    } else if ((j0 - 1) >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    }
                }
                // ---- z direction (same branch structure as x) ----
                const double sfz = Sfz[p];
                if (sfz > ZEO) {
                    if (k0 <= ex3 - 4) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    } else if (k0 <= ex3 - 3) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 <= ex3 - 2) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    }
                } else if (sfz < ZEO) {
                    if ((k0 - 2) >= kminF) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    } else if ((k0 - 1) >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    }
                }
            }
        }
    }
    // KO dissipation (same domain restriction as kodiss_c.C)
    if (eps > ZEO) {
        // Lower bounds: (iF-3) >= iminF  <=>  i0 >= iminF+2, clamped to 0.
        const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
        const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
        const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
        // Upper bounds: (iF+3) <= imaxF  <=>  i0 <= imaxF-4.
        const int i0_hi = imaxF - 4; // inclusive
        const int j0_hi = jmaxF - 4;
        const int k0_hi = kmaxF - 4;
        // Skip entirely when the admissible index range is empty in any direction
        if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
            for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        // Symmetric 7-point combination (1,-6,15,-20,15,-6,1)
                        // in each direction, divided by the grid spacing
                        const double Dx_term =
                            ((fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
                             SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
                             FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
                             TWT *  fh[idx_fh_F(iF,     jF, kF, ex)]) / dX;
                        const double Dy_term =
                            ((fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
                             SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
                             FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
                             TWT *  fh[idx_fh_F(iF, jF,     kF, ex)]) / dY;
                        const double Dz_term =
                            ((fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
                             SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
                             FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
                             TWT *  fh[idx_fh_F(iF, jF, kF,     ex)]) / dZ;
                        // Fortran: f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx + Dy + Dz)
                        f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
                    }
                }
            }
        }
    }
    free(fh);
 }
--- a/AMSS_NCKU_source/lopsidediff.f90
+++ b/AMSS_NCKU_source/lopsidediff.f90
@@ -487,201 +487,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
  end subroutine lopsided
 !-----------------------------------------------------------------------------
 ! Combined advection (lopsided) + Kreiss-Oliger dissipation (kodis)
 ! Shares the symmetry_bd buffer fh, eliminating one full-grid copy per call.
 ! Mathematically identical to calling lopsided then kodis separately.
 !
 ! Arguments:
 !   ex           grid extents in x, y, z
 !   X,Y,Z        coordinate arrays; only elements 1 and 2 are read here
 !   f            input field, handed to symmetry_bd to fill fh
 !   f_rhs        right hand side, updated in place
 !   Sfx,Sfy,Sfz  per-point advection coefficients; their sign selects the stencil
 !   Symmetry     symmetry type, compared against NO_SYMM and EQ_SYMM
 !   SoA          passed through to symmetry_bd unchanged
 !   eps          dissipation strength; the kodis term is added only if eps > 0
 !-----------------------------------------------------------------------------
 subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
  implicit none
 !~~~~~~> Input parameters:
  integer, intent(in)  :: ex(1:3),Symmetry
  real*8,  intent(in)  :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
  real*8,dimension(ex(1),ex(2),ex(3)),intent(in)   :: f,Sfx,Sfy,Sfz
  real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
  real*8,dimension(3),intent(in) ::SoA
  real*8,intent(in) :: eps
 !~~~~~~> local variables:
 ! fh carries the extra indices -2,-1,0 in each direction (3 extra points)
  real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3))   :: fh
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: dX,dY,dZ
  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
  real*8,  parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
  real*8,  parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
 ! kodis parameters
  real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
  real*8, parameter :: cof=6.4d1   ! 2^6
 ! grid spacing from the first two coordinates in each direction
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
  d12dx = ONE/F12/dX
  d12dy = ONE/F12/dY
  d12dz = ONE/F12/dZ
 ! d2dx, d2dy, d2dz are computed but not used anywhere in this subroutine
  d2dx = ONE/TWO/dX
  d2dy = ONE/TWO/dY
  d2dz = ONE/TWO/dZ
 ! index limits that the stencils are allowed to reach
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
  imin = 1
  jmin = 1
  kmin = 1
 ! with a symmetry and the first grid point within one spacing of 0,
 ! the lower limit drops to -2 so the extra points of fh can be used
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
 ! Single symmetry_bd call shared by both advection and dissipation
  call symmetry_bd(3,ex,f,fh,SoA)
 ! ---- Advection (lopsided) loop ----
 ! The loops stop at ex-1. Original author note: upper bound set ex-1 only
 ! for efficiency, the loop body will set ex 0 also.
 ! NOTE(review): the second half of that note is unclear - confirm intent.
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 ! x direction: the stencil is chosen by the sign of Sfx and by how many
 ! points are available before imax (Sfx > 0) or after imin (Sfx < 0)
    if(Sfx(i,j,k) > ZEO)then
      if(i+3 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
     elseif(i+2 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
     elseif(i+1 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
     endif
   elseif(Sfx(i,j,k) < ZEO)then
      if(i-3 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
     elseif(i-2 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
     elseif(i-1 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
     endif
   endif
 ! y direction (same branch structure as x)
    if(Sfy(i,j,k) > ZEO)then
      if(j+3 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
     elseif(j+2 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
     elseif(j+1 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
     endif
   elseif(Sfy(i,j,k) < ZEO)then
      if(j-3 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
     elseif(j-2 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
     elseif(j-1 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
     endif
   endif
 ! z direction (same branch structure as x)
    if(Sfz(i,j,k) > ZEO)then
      if(k+3 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
     elseif(k+2 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
     elseif(k+1 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
     endif
   elseif(Sfz(i,j,k) < ZEO)then
      if(k-3 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
     elseif(k-2 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
     elseif(k-1 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
     endif
   endif
  enddo
  enddo
  enddo
 ! ---- Dissipation (kodis) loop ----
 ! Applied only where the full 7-point stencil fits between the min and max
 ! limits in all three directions; other points are left unchanged.
  if(eps > ZEO) then
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
  if(i-3 >= imin .and. i+3 <= imax .and. &
     j-3 >= jmin .and. j+3 <= jmax .and. &
     k-3 >= kmin .and. k+3 <= kmax) then
   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/cof *( (     &
                              (fh(i-3,j,k)+fh(i+3,j,k)) - &
                          SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
                          FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
                          TWT* fh(i,j,k)            )/dX + &
                                                  (     &
                              (fh(i,j-3,k)+fh(i,j+3,k)) - &
                          SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
                          FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
                          TWT* fh(i,j,k)            )/dY + &
                                                  (     &
                              (fh(i,j,k-3)+fh(i,j,k+3)) - &
                          SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
                          FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
                          TWT* fh(i,j,k)            )/dZ )
  endif
  enddo
  enddo
  enddo
  endif
  return
  end subroutine lopsided_kodis
 #elif (ghost_width == 4)
 ! sixth order code
 ! Compute advection terms in right hand sides of field equations
--- a/AMSS_NCKU_source/macrodef.fh
+++ b/AMSS_NCKU_source/macrodef.fh
@@ -1,23 +1,7 @@
 #define tetradtype 2
 #define Cell
 #define ghost_width 3
 #define GAUGE 0
 #define CPBC_ghost_width  (ghost_width)
 #define ABV 0
 #define EScalar_CC 2
 #if 0
-
+note here
 define tetradtype
 v:r; u: phi; w: theta
 tetradtype 0
 v^a = (x,y,z)
@@ -30,48 +14,70 @@ define tetradtype
 v_a = (x,y,z)
 orthonormal order: v,u,w
 m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of  PRD 75, 124018(2007)
 #endif
 #define tetradtype 2
-define Cell or Vertex
+#if 0
 note here
 Cell center or Vertex center
 #endif
 #define Cell
-define ghost_width
+#if 0
 note here
 2nd order: 2
 4th order: 3
 6th order: 4
 8th order: 5
 #endif
 #define ghost_width 3
-define WithShell
+#if 0
 note here
 use shell or not
 #endif
 #define WithShell
-define CPBC
+#if 0
 note here
 use constraint preserving boundary condition or not
 only affect Z4c
-    CPBC only supports WithShell
+#endif
 #define CPBC
-define GAUGE
+#if 0
 note here
 Gauge condition type
 0: B^i gauge
-    1: David puncture gauge
+1: David's puncture gauge
 2: MB B^i gauge
 3: RIT B^i gauge
 4: MB beta gauge (beta gauge here does not mean Eq.(3) of PRD 84, 124006)
 5: RIT beta gauge (beta gauge here does not mean Eq.(3) of PRD 84, 124006)
 6: MGB1 B^i gauge
 7: MGB2 B^i gauge
 #endif
 #define GAUGE 2
-define CPBC_ghost_width  (ghost_width)
+#if 0
 buffer points for CPBC boundary
 #endif
 #define CPBC_ghost_width  (ghost_width)
-define ABV
+#if 0
-    0: using BSSN variable for constraint violation and psi4 calculation
+using BSSN variable for constraint violation and psi4 calculation: 0
-    1: using ADM variable for constraint violation and psi4 calculation
+using ADM variable for constraint violation and psi4 calculation: 1
 #endif
 #define ABV 0
-define EScalar_CC
+#if 0
 Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
 1: Case C of 1112.3928, V=0
-    2: shell with   phi(r) = phi0 * a2^2/(1+a2^2), f(R) = R+a2*R^2 induced V
+2: shell with a2^2*phi0/(1+a2^2), f(R) = R+a2*R^2 induced V
 3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
-    4: a2 = +oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
+4: a2 = oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
 5: shell with phi(r) = phi0*Exp(-(r-r0)**2/sigma), V = 0
 #endif
 #define EScalar_CC 2
--- a/AMSS_NCKU_source/macrodef.h
+++ b/AMSS_NCKU_source/macrodef.h
@@ -6,124 +6,92 @@
 // application parameters
 /// ****
 // sommerfeld boundary type
 // 0: bam, 1: shibata
 #define SommerType 0
 /// ****
 // for Using Gauss-Legendre quadrature in theta direction
 #define GaussInt
-#define ABEtype 0
+/// ****
 //#define With_AHF
 #define Psi4type 0
 //#define Point_Psi4
 #define RPS 1
 #define AGM 0
 #define RPB 0
 #define MAPBH 1
 #define PSTR 0
 #define REGLEV 0
 //#define USE_GPU
 //#define CHECKDETAIL
 //#define FAKECHECK
 //
 // define SommerType
 //     sommerfeld boundary type
 //     0: bam
 //     1: shibata
 //
 // define GaussInt
 //     for Using Gauss-Legendre quadrature in theta direction
 //
 // define ABEtype
 // 0: BSSN vacuum
 // 1: coupled to scalar field
 // 2: Z4c vacuum
 // 3: coupled to Maxwell field
 //
-// define With_AHF
+#define ABEtype 2
 /// ****
 // using Apparent Horizon Finder
-//
+//#define With_AHF
-// define Psi4type
+
 /// ****
 // Psi4 calculation method
 // 0: EB method
 // 1: 4-D method
 //
-// define Point_Psi4
+#define Psi4type 0
 /// ****
 // for Using point psi4 or not
-//
+//#define Point_Psi4
-// define RPS
+
 /// ****
 // RestrictProlong in Step (0) or after Step (1)
-//
+#define RPS 1
-// define AGM
+
 /// ****
 // Enforce algebra constraint
 // for every RK4 sub step: 0
 // only when iter_count == 3: 1
 // after routine Step: 2
-//
+#define AGM 0
 // define RPB
 //     Restrict Prolong using BAM style 1 or old style 0
 //
 // define MAPBH
 //     1: move Analysis out of 4 sub steps and treat PBH with Euler method
 //
 // define PSTR
 //     parallel structure
 //     0: level by level
 //     1: considering all levels
 //     2: as 1 but reverse the CPU order
 //     3: Frank's scheme
 //
 // define REGLEV
 //     regrid for every level or for all levels at a time
 //     0: for every level;
 //     1: for all
 //
 // define USE_GPU
 //     use gpu or not
 //
 // define CHECKDETAIL
 //     use checkpoint for every process
 //
 // define FAKECHECK
 //     use FakeCheckPrepare to write CheckPoint
 //
 /// ****
 // Restrict Prolong using BAM style 1 or old style 0
 #define RPB 0
 /// ****
 // 1: move Analysis out of 4 sub steps and treat PBH with Euler method
 #define MAPBH 1
 /// ****
 // parallel structure, 0: level by level, 1: considering all levels, 2: as 1 but reverse the CPU order, 3: Frank's scheme
 #define PSTR 0
 /// ****
 // regrid for every level or for all levels at a time
 // 0: for every level; 1: for all
 #define REGLEV 0
 /// ****
 // use gpu or not
 //#define USE_GPU
 /// ****
 // use checkpoint for every process
 //#define CHECKDETAIL
 /// ****
 // use FakeCheckPrepare to write CheckPoint
 //#define FAKECHECK
 ////================================================================
 //  some basic parameters for numerical calculation
 ////================================================================
 #define dim 3
-//#define Cell or Vertex in "macrodef.fh" 
+//#define Cell or Vertex in "macrodef.fh"
 // ******
 // buffer point number for mesh refinement interface
 #define buffer_width 6
-#define SC_width buffer_width
+// ******
 #define CS_width (2*buffer_width)
 //
 // define Cell or Vertex in "macrodef.fh" 
 //
 // define buffer_width
 //     buffer point number for mesh refinement interface
 //
 // define SC_width buffer_width
 // buffer point number shell-box interface, on shell
-//
+#define SC_width buffer_width
 // define CS_width
 // buffer point number shell-box interface, on box
-//
+#define CS_width (2*buffer_width)
 #if(buffer_width < ghost_width)
 #error we always assume buffer_width>ghost_width
@@ -142,4 +110,3 @@
 #define TINY 1e-10
 #endif   /* MICRODEF_H */
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -2,35 +2,6 @@
 include makefile.inc
 ## polint(ordn=6) kernel selector:
 ##   1 (default): barycentric fast path
 ##   0          : fallback to Neville path
 POLINT6_USE_BARY ?= 1
 POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
 ## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
 ##   make                        -> opt  (PGO-guided, maximum performance)
 ##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)
 PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
 ifeq ($(PGO_MODE),instrument)
 ## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
 CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 else
 ## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
 ## PGO has been turned off, now tested and found to be negative optimization
 ## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 endif
 .SUFFIXES: .o .f90 .C .for .cu
 .f90.o:
@@ -45,65 +16,13 @@ endif
 .cu.o:
 	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
 # C rewrite of BSSN RHS kernel and helpers
 bssn_rhs_c.o: bssn_rhs_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 fderivs_c.o: fderivs_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 fdderivs_c.o: fdderivs_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 kodiss_c.o: kodiss_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 lopsided_c.o: lopsided_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 lopsided_kodis_c.o: lopsided_kodis_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
 TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
 TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=$(TP_PROFDATA) \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
 TwoPunctures.o: TwoPunctures.C
 	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
 TwoPunctureABE.o: TwoPunctureABE.C
 	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
 # Input files
 ## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
 ifeq ($(USE_CXX_KERNELS),0)
 # Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
 CFILES =
 else
 # C++ mode (default): C rewrite of bssn_rhs and helper kernels
 CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
 endif
 ## RK4 kernel switch (independent from USE_CXX_KERNELS)
 ifeq ($(USE_CXX_RK4),1)
 CFILES += rungekutta4_rout_c.o
 RK4_F90_OBJ =
 else
 RK4_F90_OBJ = rungekutta4_rout.o
 endif
 C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
 	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
 	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
-	   NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
+	   NullShellPatch2_Evo.o writefile_f.o
 C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o surface_integral.o ShellPatch.o\
@@ -113,9 +32,9 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
 	   NullShellPatch2_Evo.o \
 	   bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
-F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
+F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   prolongrestrict_cell.o prolongrestrict_vertex.o\
-	   $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
+	   rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
 	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
 	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
@@ -126,14 +45,6 @@ F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\
 	   NullNews2.o tool_f.o
 ifeq ($(USE_CXX_KERNELS),0)
 # Fortran mode: include original bssn_rhs.o
 F90FILES = $(F90FILES_BASE) bssn_rhs.o
 else
 # C++ mode (default): bssn_rhs.o replaced by C++ kernel
 F90FILES = $(F90FILES_BASE)
 endif
 F77FILES = zbesh.o
 AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \
@@ -146,7 +57,7 @@ TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
 CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
 # file dependences
-$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
+$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
 $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
 	     misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
@@ -169,7 +80,7 @@ $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h
 $(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
-$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
+$(C++FILES) $(C++FILES_GPU) $(AHFDOBJS) $(CUDAFILES): macrodef.h
 TwoPunctureFILES: TwoPunctures.h
@@ -178,14 +89,14 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
 misc.o : zbesh.o
 # projects
-ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
+ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) 
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
+	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
-ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
+ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
+	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
 TwoPunctureABE: $(TwoPunctureFILES)
-	$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
+	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
 clean:
 	rm *.o ABE ABEGPU TwoPunctureABE make.log -f
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -8,51 +8,18 @@ filein  = -I/usr/include/ -I${MKLROOT}/include
 ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
+LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
 ## Memory allocator switch
 ##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
 ##   0           : use system default allocator (ptmalloc)
 USE_TBBMALLOC ?= 1
 TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
 ifneq ($(wildcard $(TBBMALLOC_SO)),)
 TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
 else
 TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
 endif
 ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
 ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
 ##   opt        : (default) maximum performance with PGO profile-guided optimization
 ##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
 PGO_MODE ?= opt
 ## Interp_Points load balance profiling mode
 ##   off        : (default) no load balance instrumentation
 ##   profile    : Pass 1 — instrument Interp_Points to collect timing profile
 ##   optimize   : Pass 2 — read profile and apply block rebalancing
 INTERP_LB_MODE ?= off
 ifeq ($(INTERP_LB_MODE),profile)
 INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
 else ifeq ($(INTERP_LB_MODE),optimize)
 INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
 else
 INTERP_LB_FLAGS =
 endif
 ## Kernel implementation switch
 ##   1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
 ##   0           : fall back to original Fortran kernels
 USE_CXX_KERNELS ?= 1
 ## RK4 kernel implementation switch
 ##   1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
 ##   0           : use original Fortran rungekutta4_rout.o
 USE_CXX_RK4 ?= 1
 ## Aggressive optimization flags:
 ## -O3: Maximum optimization
 ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
 ## -fp-model fast=2: Aggressive floating-point optimizations
 ## -fma: Enable fused multiply-add instructions
 ## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
 CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma \
               -Dfortran3 -Dnewc -I${MKLROOT}/include
 f90appflags  = -O3 -xHost -fp-model fast=2 -fma \
               -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx
 CXX          = icpx
--- a/AMSS_NCKU_source/prolongrestrict_cell.f90
+++ b/AMSS_NCKU_source/prolongrestrict_cell.f90
@@ -1934,33 +1934,18 @@
 ! when if=1 -> ic=0, this is different to vertex center grid 
  real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3))   :: funcc
  integer,dimension(3) :: cxI
-  integer :: i,j,k,ii,jj,kk,px,py,pz
+  integer :: i,j,k,ii,jj,kk
  real*8, dimension(6,6) :: tmp2
  real*8, dimension(6) :: tmp1
  integer, dimension(extf(1)) :: cix
  integer, dimension(extf(2)) :: ciy
  integer, dimension(extf(3)) :: ciz
  integer, dimension(extf(1)) :: pix
  integer, dimension(extf(2)) :: piy
  integer, dimension(extf(3)) :: piz
  real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3
  real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3
  real*8, dimension(6,2), parameter :: WC = reshape((/&
      C1,C2,C3,C4,C5,C6,&
      C6,C5,C4,C3,C2,C1/), (/6,2/))
  integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi
  integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo
  integer::maxcx,maxcy,maxcz
  real*8,dimension(3) :: CD,FD
-  real*8 :: tmp_yz(extc(1), 6)      ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
+  
  real*8 :: tmp_xyz_line(extc(1))   ! 存储整条 X 线上完成 Y 向融合后的结果
  real*8 :: v1, v2, v3, v4, v5, v6
  integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max
  real*8 :: res_line
  real*8 :: tmp_z_slab(extc(1), extc(2))  ! 分配在 k 循环外
  if(wei.ne.3)then
     write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
     write(*,*)"dim = ",wei
@@ -2035,123 +2020,145 @@
          return
  endif
  do i = imino,imaxo
     ii = i + lbf(1) - 1
     cix(i) = ii/2 - lbc(1) + 1
     if(ii/2*2 == ii)then
        pix(i) = 1
     else
        pix(i) = 2
     endif
  enddo
  do j = jmino,jmaxo
     jj = j + lbf(2) - 1
     ciy(j) = jj/2 - lbc(2) + 1
     if(jj/2*2 == jj)then
        piy(j) = 1
     else
        piy(j) = 2
     endif
  enddo
  do k = kmino,kmaxo
     kk = k + lbf(3) - 1
     ciz(k) = kk/2 - lbc(3) + 1
     if(kk/2*2 == kk)then
        piz(k) = 1
     else
        piz(k) = 2
     endif
  enddo
  maxcx = maxval(cix(imino:imaxo))
  maxcy = maxval(ciy(jmino:jmaxo))
  maxcz = maxval(ciz(kmino:kmaxo))
  if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
     write(*,*)"error in prolong"
     return
  endif
  call symmetry_bd(3,extc,func,funcc,SoA)
     ! 对每个 k（pz, kc 固定）预计算 Z 向插值的 2D 切片
 jc_min = minval(ciy(jmino:jmaxo))
 jc_max = maxval(ciy(jmino:jmaxo))
 do k = kmino, kmaxo
    pz = piz(k); kc = ciz(k)
    ! --- Pass 1: Z 方向，只算一次 ---
    do iy = jc_min-3, jc_max+3   ! 仅需的 iy 范围
        do ii = imini-3, imaxi+3  ! 仅需的 ii 范围
            tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
        end do
    end do
    do j = jmino, jmaxo
        py = piy(j); jc = ciy(j)
        ! --- Pass 2: Y 方向 ---
        do ii = imini-3, imaxi+3
            tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
        end do
        ! --- Pass 3: X 方向 ---
        do i = imino, imaxo
            funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3))
        end do
    end do
 end do
 !~~~~~~> prolongation start...
 #if 0
  do k = kmino,kmaxo
     pz = piz(k)
     kc = ciz(k)
   do j = jmino,jmaxo
        py = piy(j)
        jc = ciy(j)
 ! --- 步骤 1 & 2 融合：分段处理 X 轴，提升 Cache 命中率 ---
        ! 我们将 ii 循环逻辑重组，减少对 funcc 的跨行重复访问
        do ii = 1, extc(1)
           ! 1. 先做 Z 方向的 6 条线插值（针对当前的 ii 和当前的 6 个 iy）
           ! 我们直接在这里把 Y 方向的加权也做了，省去 tmp_yz 数组
           ! 这样 funcc 的数据读进来后立即完成所有维度的贡献，不再写回内存
           res_line = 0.0d0
           do jj = 1, 6
              iy = jc - 3 + jj
              ! 这一行代码是核心：一次性完成 Z 插值并加上 Y 的权重
              ! 编译器会把 WC(jj, py) 存在寄存器里
              res_line = res_line + WC(jj, py) * ( &
                         WC(1, pz) * funcc(ii, iy, kc-2) + &
                         WC(2, pz) * funcc(ii, iy, kc-1) + &
                         WC(3, pz) * funcc(ii, iy, kc  ) + &
                         WC(4, pz) * funcc(ii, iy, kc+1) + &
                         WC(5, pz) * funcc(ii, iy, kc+2) + &
                         WC(6, pz) * funcc(ii, iy, kc+3) )
           end do
           tmp_xyz_line(ii) = res_line
        end do
        ! 3. 【降维：X 向】最后在最内层只处理 X 方向的 6 点加权
        ! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次
    do i = imino,imaxo
-           px = pix(i)
+       cxI(1) = i
-           ic = cix(i)
+       cxI(2) = j
       cxI(3) = k
 ! change to coarse level reference
 !|---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*---| 
 !|=======x===============x===============x===============x=======|
       cxI = (cxI+lbf-1)/2
 ! change to array index      
       cxI = cxI - lbc + 1
-           ! 直接从预计算好的 line 中读取连续的 6 个点
+       if(any(cxI+3 > extc)) write(*,*)"error in prolong"
-           ! ic-2 到 ic+3 对应原始 6 点算子
+       ii=i+lbf(1)-1
-           funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + &
+       jj=j+lbf(2)-1
-                         WC(2,px)*tmp_xyz_line(ic-1) + &
+       kk=k+lbf(3)-1
-                         WC(3,px)*tmp_xyz_line(ic  ) + &
+#if 0
-                         WC(4,px)*tmp_xyz_line(ic+1) + &
+       if(ii/2*2==ii)then
-                         WC(5,px)*tmp_xyz_line(ic+2) + &
+         if(jj/2*2==jj)then
-                         WC(6,px)*tmp_xyz_line(ic+3)
+           if(kk/2*2==kk)then
-        end do
+             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
-     end do
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
-  end do
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
           else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
             funf(i,j,k)=  C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
           endif
         else
           if(kk/2*2==kk)then
             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
           else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
             funf(i,j,k)=  C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
           endif
         endif
       else
         if(jj/2*2==jj)then
           if(kk/2*2==kk)then               
             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
           else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
             funf(i,j,k)=  C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
           endif
         else
           if(kk/2*2==kk)then
             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
           else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
             funf(i,j,k)=  C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
           endif
         endif
       endif
 #else 
       if(kk/2*2==kk)then
             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
       else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
       endif
       if(jj/2*2==jj)then
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
       else
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
       endif
       if(ii/2*2==ii)then
             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
       else
             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
       endif
 #endif
    enddo
   enddo
  enddo
  return
  end subroutine prolong3
@@ -2351,10 +2358,6 @@ end do
  real*8,dimension(3) :: CD,FD
  real*8 :: tmp_xz_plane(extf(1), 6) 
  real*8 :: tmp_x_line(extf(1))
  integer :: fi, fj, fk, ii, jj, kk
  if(wei.ne.3)then
     write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
     write(*,*)"dim = ",wei
@@ -2436,56 +2439,6 @@ end do
  call symmetry_bd(2,extf,funf,funff,SoA)
 !~~~~~~> restriction start...
 do k = kmino, kmaxo
    fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1
    do j = jmino, jmaxo
        fj = 2*(j + lbc(2) - 1) - 1 - lbf(2) + 1
        ! 优化点 1: 显式展开 Z 方向计算，减少循环开销
        ! 确保 ii 循环是最内层且连续访问
        !DIR$ VECTOR ALWAYS
        do ii = 1, extf(1)
            ! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
            ! 这里直接硬编码 jj 的偏移，彻底消除一层循环
            tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
                                  C2*(funff(ii,fj-2,fk-1)+funff(ii,fj-2,fk+2)) + &
                                  C3*(funff(ii,fj-2,fk  )+funff(ii,fj-2,fk+1))
            tmp_xz_plane(ii, 2) = C1*(funff(ii,fj-1,fk-2)+funff(ii,fj-1,fk+3)) + &
                                  C2*(funff(ii,fj-1,fk-1)+funff(ii,fj-1,fk+2)) + &
                                  C3*(funff(ii,fj-1,fk  )+funff(ii,fj-1,fk+1))
            tmp_xz_plane(ii, 3) = C1*(funff(ii,fj  ,fk-2)+funff(ii,fj  ,fk+3)) + &
                                  C2*(funff(ii,fj  ,fk-1)+funff(ii,fj  ,fk+2)) + &
                                  C3*(funff(ii,fj  ,fk  )+funff(ii,fj  ,fk+1))
            tmp_xz_plane(ii, 4) = C1*(funff(ii,fj+1,fk-2)+funff(ii,fj+1,fk+3)) + &
                                  C2*(funff(ii,fj+1,fk-1)+funff(ii,fj+1,fk+2)) + &
                                  C3*(funff(ii,fj+1,fk  )+funff(ii,fj+1,fk+1))
            tmp_xz_plane(ii, 5) = C1*(funff(ii,fj+2,fk-2)+funff(ii,fj+2,fk+3)) + &
                                  C2*(funff(ii,fj+2,fk-1)+funff(ii,fj+2,fk+2)) + &
                                  C3*(funff(ii,fj+2,fk  )+funff(ii,fj+2,fk+1))
            tmp_xz_plane(ii, 6) = C1*(funff(ii,fj+3,fk-2)+funff(ii,fj+3,fk+3)) + &
                                  C2*(funff(ii,fj+3,fk-1)+funff(ii,fj+3,fk+2)) + &
                                  C3*(funff(ii,fj+3,fk  )+funff(ii,fj+3,fk+1))
        end do
        ! 优化点 2: 同样向量化 Y 方向压缩
        !DIR$ VECTOR ALWAYS
        do ii = 1, extf(1)
            tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
                            C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
                            C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
        end do
        ! 优化点 3: 最终写入，利用已经缓存在 tmp_x_line 的数据
        do i = imino, imaxo
            fi = 2*(i + lbc(1) - 1) - 1 - lbf(1) + 1
            func(i, j, k) = C1*(tmp_x_line(fi-2) + tmp_x_line(fi+3)) + &
                            C2*(tmp_x_line(fi-1) + tmp_x_line(fi+2)) + &
                            C3*(tmp_x_line(fi  ) + tmp_x_line(fi+1))
        end do
    end do
 end do
 #if 0
  do k = kmino,kmaxo
   do j = jmino,jmaxo
    do i = imino,imaxo
@@ -2509,7 +2462,7 @@ end do
    enddo
   enddo
  enddo
-#endif
+  
  return
  end subroutine restrict3
--- a/AMSS_NCKU_source/rungekutta4_rout_c.C
+++ b/AMSS_NCKU_source/rungekutta4_rout_c.C
@@ -1,212 +0,0 @@
 #include "rungekutta4_rout.h"
 #include <cstdio>
 #include <cstdlib>
 #include <cstddef>
 #include <complex>
 #include <immintrin.h>
 namespace {
 // RK4 stage 0 kernel: f1[i] = f0[i] + c * frhs[i] for every i in [0, n).
 //
 //   n    - number of elements in each array
 //   f0   - input array (read only)
 //   frhs - input array (read only)
 //   f1   - output array (overwritten)
 //   c    - scalar multiplier applied to frhs
 //
 // The three arrays are declared __restrict, so they must not overlap.
 inline void rk4_stage0(std::size_t n,
                        const double *__restrict f0,
                        const double *__restrict frhs,
                        double *__restrict f1,
                        double c) {
    std::size_t i = 0;
 #if defined(__AVX512F__)
    // 8 doubles per iteration; _mm512_fmadd_pd is part of AVX-512F itself.
    const __m512d vc = _mm512_set1_pd(c);
    for (; i + 7 < n; i += 8) {
        const __m512d v0 = _mm512_loadu_pd(f0 + i);
        const __m512d vr = _mm512_loadu_pd(frhs + i);
        _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, vr, v0));
    }
 #elif defined(__AVX2__) && defined(__FMA__)
    // _mm256_fmadd_pd belongs to the FMA extension, not to AVX2, so the
    // 256-bit path is only compiled when FMA is enabled as well.
    const __m256d vc = _mm256_set1_pd(c);
    for (; i + 3 < n; i += 4) {
        const __m256d v0 = _mm256_loadu_pd(f0 + i);
        const __m256d vr = _mm256_loadu_pd(frhs + i);
        _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, vr, v0));
    }
 #endif
    // Scalar loop: handles the remainder, or everything when no SIMD path
    // was compiled in.
 #pragma ivdep
    for (; i < n; ++i) {
        f1[i] = f0[i] + c * frhs[i];
    }
 }
 // RHS accumulation kernel: frhs[i] = frhs[i] + 2 * f1[i] for every i in [0, n).
 //
 //   n    - number of elements in each array
 //   f1   - input array (read only)
 //   frhs - accumulator, updated in place
 //
 // f1 and frhs are declared __restrict, so they must not overlap.
 inline void rk4_rhs_accum(std::size_t n,
                           const double *__restrict f1,
                           double *__restrict frhs) {
    std::size_t i = 0;
 #if defined(__AVX512F__)
    // 8 doubles per iteration; _mm512_fmadd_pd is part of AVX-512F itself.
    const __m512d v2 = _mm512_set1_pd(2.0);
    for (; i + 7 < n; i += 8) {
        const __m512d v1 = _mm512_loadu_pd(f1 + i);
        const __m512d vrhs = _mm512_loadu_pd(frhs + i);
        _mm512_storeu_pd(frhs + i, _mm512_fmadd_pd(v2, v1, vrhs));
    }
 #elif defined(__AVX2__) && defined(__FMA__)
    // _mm256_fmadd_pd belongs to the FMA extension, not to AVX2, so the
    // 256-bit path is only compiled when FMA is enabled as well.
    const __m256d v2 = _mm256_set1_pd(2.0);
    for (; i + 3 < n; i += 4) {
        const __m256d v1 = _mm256_loadu_pd(f1 + i);
        const __m256d vrhs = _mm256_loadu_pd(frhs + i);
        _mm256_storeu_pd(frhs + i, _mm256_fmadd_pd(v2, v1, vrhs));
    }
 #endif
    // Scalar loop: handles the remainder, or everything when no SIMD path
    // was compiled in.
 #pragma ivdep
    for (; i < n; ++i) {
        frhs[i] = frhs[i] + 2.0 * f1[i];
    }
 }
 // In-place update kernel: f1[i] = f0[i] + c * f1[i] for every i in [0, n).
 //
 //   n  - number of elements in each array
 //   f0 - input array (read only)
 //   f1 - read and then overwritten element by element
 //   c  - scalar multiplier applied to the old f1 value
 //
 // f0 and f1 are declared __restrict, so they must not overlap.
 inline void rk4_f1_from_f0_f1(std::size_t n,
                               const double *__restrict f0,
                               double *__restrict f1,
                               double c) {
    std::size_t i = 0;
 #if defined(__AVX512F__)
    // 8 doubles per iteration; _mm512_fmadd_pd is part of AVX-512F itself.
    const __m512d vc = _mm512_set1_pd(c);
    for (; i + 7 < n; i += 8) {
        const __m512d v0 = _mm512_loadu_pd(f0 + i);
        const __m512d v1 = _mm512_loadu_pd(f1 + i);
        _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, v1, v0));
    }
 #elif defined(__AVX2__) && defined(__FMA__)
    // _mm256_fmadd_pd belongs to the FMA extension, not to AVX2, so the
    // 256-bit path is only compiled when FMA is enabled as well.
    const __m256d vc = _mm256_set1_pd(c);
    for (; i + 3 < n; i += 4) {
        const __m256d v0 = _mm256_loadu_pd(f0 + i);
        const __m256d v1 = _mm256_loadu_pd(f1 + i);
        _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, v1, v0));
    }
 #endif
    // Scalar loop: handles the remainder, or everything when no SIMD path
    // was compiled in.
 #pragma ivdep
    for (; i < n; ++i) {
        f1[i] = f0[i] + c * f1[i];
    }
 }
 // RK4 final-stage kernel: f1[i] = f0[i] + c * (f1[i] + frhs[i]) for every
 // i in [0, n).
 //
 //   n    - number of elements in each array
 //   f0   - input array (read only)
 //   f1   - read and then overwritten element by element
 //   frhs - input array (read only)
 //   c    - scalar multiplier applied to (f1 + frhs)
 //
 // The three arrays are declared __restrict, so they must not overlap.
 inline void rk4_stage3(std::size_t n,
                        const double *__restrict f0,
                        double *__restrict f1,
                        const double *__restrict frhs,
                        double c) {
    std::size_t i = 0;
 #if defined(__AVX512F__)
    // 8 doubles per iteration; _mm512_fmadd_pd is part of AVX-512F itself.
    const __m512d vc = _mm512_set1_pd(c);
    for (; i + 7 < n; i += 8) {
        const __m512d v0 = _mm512_loadu_pd(f0 + i);
        const __m512d v1 = _mm512_loadu_pd(f1 + i);
        const __m512d vr = _mm512_loadu_pd(frhs + i);
        _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, _mm512_add_pd(v1, vr), v0));
    }
 #elif defined(__AVX2__) && defined(__FMA__)
    // _mm256_fmadd_pd belongs to the FMA extension, not to AVX2, so the
    // 256-bit path is only compiled when FMA is enabled as well.
    const __m256d vc = _mm256_set1_pd(c);
    for (; i + 3 < n; i += 4) {
        const __m256d v0 = _mm256_loadu_pd(f0 + i);
        const __m256d v1 = _mm256_loadu_pd(f1 + i);
        const __m256d vr = _mm256_loadu_pd(frhs + i);
        _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, _mm256_add_pd(v1, vr), v0));
    }
 #endif
    // Scalar loop: handles the remainder, or everything when no SIMD path
    // was compiled in.
 #pragma ivdep
    for (; i < n; ++i) {
        f1[i] = f0[i] + c * (f1[i] + frhs[i]);
    }
 }
 } // namespace
 extern "C" {
 // Apply one RK4 sub-step, selected by RK4 (0..3), to a single real scalar.
 //   stage 0: f1 = f0 + dT/2 * f_rhs
 //   stage 1: f_rhs += 2*f1, then f1 = f0 + dT/2 * f1
 //   stage 2: f_rhs += 2*f1, then f1 = f0 + dT * f1
 //   stage 3: f1 = f0 + dT/6 * (f1 + f_rhs)
 // Any other stage value prints a diagnostic to stderr and aborts.
 void f_rungekutta4_scalar(double &dT, double &f0, double &f1, double &f_rhs, int &RK4) {
    constexpr double kSixth = 1.0 / 6.0;
    constexpr double kHalf = 0.5;
    constexpr double kTwo = 2.0;
    if (RK4 == 0) {
        f1 = f0 + kHalf * dT * f_rhs;
        return;
    }
    if (RK4 == 1) {
        f_rhs = f_rhs + kTwo * f1;
        f1 = f0 + kHalf * dT * f1;
        return;
    }
    if (RK4 == 2) {
        f_rhs = f_rhs + kTwo * f1;
        f1 = f0 + dT * f1;
        return;
    }
    if (RK4 == 3) {
        f1 = f0 + kSixth * dT * (f1 + f_rhs);
        return;
    }
    std::fprintf(stderr, "rungekutta4_scalar_c: invalid RK4 stage %d\n", RK4);
    std::abort();
 }
 // Complex-valued counterpart of the scalar RK4 sub-step: the same four
 // stage formulas applied to std::complex<double> operands with a real dT.
 // Any stage outside 0..3 prints a diagnostic to stderr and aborts.
 void rungekutta4_cplxscalar_(double &dT,
                              std::complex<double> &f0,
                              std::complex<double> &f1,
                              std::complex<double> &f_rhs,
                              int &RK4) {
    constexpr double kSixth = 1.0 / 6.0;
    constexpr double kHalf = 0.5;
    constexpr double kTwo = 2.0;
    if (RK4 == 0) {
        f1 = f0 + kHalf * dT * f_rhs;
    } else if (RK4 == 1) {
        f_rhs = f_rhs + kTwo * f1;
        f1 = f0 + kHalf * dT * f1;
    } else if (RK4 == 2) {
        f_rhs = f_rhs + kTwo * f1;
        f1 = f0 + dT * f1;
    } else if (RK4 == 3) {
        f1 = f0 + kSixth * dT * (f1 + f_rhs);
    } else {
        std::fprintf(stderr, "rungekutta4_cplxscalar_c: invalid RK4 stage %d\n", RK4);
        std::abort();
    }
 }
 // Array form of the RK4 sub-step: applies stage RK4 (0..3) to the
 // ex[0]*ex[1]*ex[2] elements of f0 / f1 / f_rhs using the rk4_* kernels.
 // Always returns 0; a stage outside 0..3 prints a diagnostic and aborts.
 int f_rungekutta4_rout(int *ex, double &dT,
                        double *f0, double *f1, double *f_rhs,
                        int &RK4) {
    // Flattened element count of the 3-D block.
    std::size_t n = static_cast<std::size_t>(ex[0]);
    n *= static_cast<std::size_t>(ex[1]);
    n *= static_cast<std::size_t>(ex[2]);
    const double *const __restrict src = f0;
    double *const __restrict work = f1;
    double *const __restrict rhs = f_rhs;
    // Reject anything outside 0..3 before dispatching.
    if (RK4 < 0 || RK4 > 3) {
        std::fprintf(stderr, "rungekutta4_rout_c: invalid RK4 stage %d\n", RK4);
        std::abort();
    }
    if (RK4 == 0) {
        rk4_stage0(n, src, rhs, work, 0.5 * dT);
    } else if (RK4 == 1) {
        rk4_rhs_accum(n, work, rhs);
        rk4_f1_from_f0_f1(n, src, work, 0.5 * dT);
    } else if (RK4 == 2) {
        rk4_rhs_accum(n, work, rhs);
        rk4_f1_from_f0_f1(n, src, work, dT);
    } else {
        // Only stage 3 can reach this branch after the range check above.
        rk4_stage3(n, src, work, rhs, (1.0 / 6.0) * dT);
    }
    return 0;
 }
 } // extern "C"
--- a/AMSS_NCKU_source/share_func.h
+++ b/AMSS_NCKU_source/share_func.h
@@ -1,246 +0,0 @@
 #ifndef SHARE_FUNC_H
 #define SHARE_FUNC_H
 #include <stdlib.h>
 #include <stddef.h>
 #include <math.h>
 #include <stdio.h>
 #include <string.h>
 /* Main grid: flatten a 0-based (i0, j0, k0) index into a 1-D offset, i fastest. */
 static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
    const size_t row = (size_t)ex[0];          /* elements per i-line   */
    const size_t plane = row * (size_t)ex[1];  /* elements per k-plane  */
    return (size_t)i0 + (size_t)j0 * row + (size_t)k0 * plane;
 }
 /*
 * fh corresponds to the Fortran array fh(-1:ex1, -1:ex2, -1:ex3).
 * ord = 2  =>  shift = 1.
 * iF/jF/kF are Fortran indices (may be -1, 0, 1..ex); the result is the
 * flat 0-based offset into that array, i fastest.
 */
 static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
    const int row_len = ex[0] + 2;   /* ex1 + ord */
    const int col_len = ex[1] + 2;   /* ex2 + ord */
    const size_t row = (size_t)row_len;
    const size_t plane = row * (size_t)col_len;
    /* Shift by 1 so that Fortran index -1 lands on storage index 0. */
    const int i0 = iF + 1;           /* 0..ex1+1 */
    const int j0 = jF + 1;           /* 0..ex2+1 */
    const int k0 = kF + 1;           /* 0..ex3+1 */
    return (size_t)i0 + (size_t)j0 * row + (size_t)k0 * plane;
 }
 /*
 * fh corresponds to the Fortran array fh(-2:ex1, -2:ex2, -2:ex3).
 * ord = 3  =>  shift = 2.
 * iF/jF/kF are Fortran indices (may be negative); the result is the flat
 * 0-based offset into that array, i fastest.
 */
 static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
    const int row_len = ex[0] + 3;   /* ex1 + ord */
    const int col_len = ex[1] + 3;   /* ex2 + ord */
    const size_t row = (size_t)row_len;
    const size_t plane = row * (size_t)col_len;
    /* Shift by 2 so that Fortran index -2 lands on storage index 0. */
    const int i0 = iF + 2;           /* 0..ex1+2 */
    const int j0 = jF + 2;           /* 0..ex2+2 */
    const int k0 = kF + 2;           /* 0..ex3+2 */
    return (size_t)i0 + (size_t)j0 * row + (size_t)k0 * plane;
 }
 /*
 * func:  (1..extc1, 1..extc2, 1..extc3)   1-based in Fortran
 * funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran
 *
 * On the C side:
 *   func  is treated as 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1
 *   funcc is stored as a 1-D array addressed with shifted indices:
 *     iF in [-ord+1..extc1]  -> ii = iF + (ord-1)  in [0..extc1+ord-1]
 *     total length nx = extc1 + ord
 *     likewise ny = extc2 + ord, nz = extc3 + ord
 */
 static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
    const size_t row = (size_t)extc[0];
    const size_t plane = row * (size_t)extc[1];
    return (size_t)i0 + (size_t)j0 * row + (size_t)k0 * plane;
 }
 static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
    const int shift = ord - 1;          // iF = -shift .. extc1
    const int nx = extc[0] + ord;       // [-shift..extc1] 共 extc1+ord 个
    const int ny = extc[1] + ord;
    const int ii = iF + shift;          // 0..extc1+shift
    const int jj = jF + shift;          // 0..extc2+shift
    const int kk = kF + shift;          // 0..extc3+shift
    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
 }
 /*
 * Copy func into the interior of the padded array funcc, then fill the
 * `ord` ghost layers on the low side of each axis by mirroring interior
 * data and scaling it with the per-axis factor SoA.
 *
 * Equivalent to the Fortran:
 * funcc(1:extc1,1:extc2,1:extc3)=func
 * do i=0,ord-1
 *   funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
 * enddo
 * do i=0,ord-1
 *   funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
 * enddo
 * do i=0,ord-1
 *   funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
 * enddo
 *
 * Parameters:
 *   ord   - number of ghost layers per axis; funcc has extc + ord entries
 *           along the first two axes (nx, ny below).
 *   shift - offset added to a Fortran index to obtain the 0-based storage
 *           index; symmetry_bd always passes ord - 1. The (shift - ii)
 *           expressions below are non-negative only if shift >= ord - 1.
 *   extc  - interior extents (extc1, extc2, extc3).
 *   func  - source array of extc1*extc2*extc3 doubles, i fastest.
 *   funcc - destination padded array; must not overlap func (__restrict).
 *   SoA   - per-axis multipliers applied to the mirrored values.
 *
 * Each axis has three branches (factor == 1, == -1, anything else) that
 * differ only in how the mirrored value is produced: plain copy, negation,
 * or multiplication.
 */
 static inline void symmetry_bd_impl(int ord,
                  int shift,
                  const int extc[3],
                  const double *__restrict func,
                  double *__restrict funcc,
                  const double SoA[3])
 {
    const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
    /* Padded extents of funcc along the first two axes. */
    const int nx = extc1 + ord;
    const int ny = extc2 + ord;
    const size_t snx = (size_t)nx;
    const size_t splane = (size_t)nx * (size_t)ny;
    /* Storage offsets of Fortran index 1 along each axis. */
    const size_t interior_i = (size_t)shift + 1u;          /* iF = 1 */
    const size_t interior_j = ((size_t)shift + 1u) * snx;  /* jF = 1 */
    const size_t interior_k = ((size_t)shift + 1u) * splane; /* kF = 1 */
    const size_t interior0 = interior_k + interior_j + interior_i;
    /* 1) funcc(1:extc1,1:extc2,1:extc3) = func */
    /* One memcpy per i-line, since rows of func and funcc have different strides. */
    for (int k0 = 0; k0 < extc3; ++k0) {
        const double *src_k = func + (size_t)k0 * (size_t)extc2 * (size_t)extc1;
        const size_t dst_k0 = interior0 + (size_t)k0 * splane;
        for (int j0 = 0; j0 < extc2; ++j0) {
            const double *src = src_k + (size_t)j0 * (size_t)extc1;
            double *dst = funcc + dst_k0 + (size_t)j0 * snx;
            memcpy(dst, src, (size_t)extc1 * sizeof(double));
        }
    }
    /* 2) funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1) */
    /* Only interior j and k are touched here; dst_i/src_i are storage columns
       for Fortran indices -ii and ii+1. */
    const double s1 = SoA[0];
    if (s1 == 1.0) {
        for (int ii = 0; ii < ord; ++ii) {
            const size_t dst_i = (size_t)(shift - ii);
            const size_t src_i = (size_t)(shift + ii + 1);
            for (int k0 = 0; k0 < extc3; ++k0) {
                const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
                for (int j0 = 0; j0 < extc2; ++j0) {
                    const size_t off = kbase + (size_t)j0 * snx;
                    funcc[off + dst_i] = funcc[off + src_i];
                }
            }
        }
    } else if (s1 == -1.0) {
        for (int ii = 0; ii < ord; ++ii) {
            const size_t dst_i = (size_t)(shift - ii);
            const size_t src_i = (size_t)(shift + ii + 1);
            for (int k0 = 0; k0 < extc3; ++k0) {
                const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
                for (int j0 = 0; j0 < extc2; ++j0) {
                    const size_t off = kbase + (size_t)j0 * snx;
                    funcc[off + dst_i] = -funcc[off + src_i];
                }
            }
        }
    } else {
        for (int ii = 0; ii < ord; ++ii) {
            const size_t dst_i = (size_t)(shift - ii);
            const size_t src_i = (size_t)(shift + ii + 1);
            for (int k0 = 0; k0 < extc3; ++k0) {
                const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
                for (int j0 = 0; j0 < extc2; ++j0) {
                    const size_t off = kbase + (size_t)j0 * snx;
                    funcc[off + dst_i] = funcc[off + src_i] * s1;
                }
            }
        }
    }
    /* 3) funcc(:,-j,1:extc3) = funcc(:,j+1,1:extc3)*SoA(2) */
    /* Whole i-lines (all nx entries, including the ghost columns written in
       step 2) are mirrored, for interior k only. */
    const double s2 = SoA[1];
    if (s2 == 1.0) {
        for (int jj = 0; jj < ord; ++jj) {
            const size_t dst_j = (size_t)(shift - jj) * snx;
            const size_t src_j = (size_t)(shift + jj + 1) * snx;
            for (int k0 = 0; k0 < extc3; ++k0) {
                const size_t kbase = interior_k + (size_t)k0 * splane;
                double *dst = funcc + kbase + dst_j;
                const double *src = funcc + kbase + src_j;
                for (int i = 0; i < nx; ++i) dst[i] = src[i];
            }
        }
    } else if (s2 == -1.0) {
        for (int jj = 0; jj < ord; ++jj) {
            const size_t dst_j = (size_t)(shift - jj) * snx;
            const size_t src_j = (size_t)(shift + jj + 1) * snx;
            for (int k0 = 0; k0 < extc3; ++k0) {
                const size_t kbase = interior_k + (size_t)k0 * splane;
                double *dst = funcc + kbase + dst_j;
                const double *src = funcc + kbase + src_j;
                for (int i = 0; i < nx; ++i) dst[i] = -src[i];
            }
        }
    } else {
        for (int jj = 0; jj < ord; ++jj) {
            const size_t dst_j = (size_t)(shift - jj) * snx;
            const size_t src_j = (size_t)(shift + jj + 1) * snx;
            for (int k0 = 0; k0 < extc3; ++k0) {
                const size_t kbase = interior_k + (size_t)k0 * splane;
                double *dst = funcc + kbase + dst_j;
                const double *src = funcc + kbase + src_j;
                for (int i = 0; i < nx; ++i) dst[i] = src[i] * s2;
            }
        }
    }
    /* 4) funcc(:,:,-k) = funcc(:,:,k+1)*SoA(3) */
    /* Whole k-planes (splane entries, ghost rows and columns included) are
       mirrored in one flat loop. */
    const double s3 = SoA[2];
    if (s3 == 1.0) {
        for (int kk = 0; kk < ord; ++kk) {
            const size_t dst_k = (size_t)(shift - kk) * splane;
            const size_t src_k = (size_t)(shift + kk + 1) * splane;
            double *dst = funcc + dst_k;
            const double *src = funcc + src_k;
            for (size_t p = 0; p < splane; ++p) dst[p] = src[p];
        }
    } else if (s3 == -1.0) {
        for (int kk = 0; kk < ord; ++kk) {
            const size_t dst_k = (size_t)(shift - kk) * splane;
            const size_t src_k = (size_t)(shift + kk + 1) * splane;
            double *dst = funcc + dst_k;
            const double *src = funcc + src_k;
            for (size_t p = 0; p < splane; ++p) dst[p] = -src[p];
        }
    } else {
        for (int kk = 0; kk < ord; ++kk) {
            const size_t dst_k = (size_t)(shift - kk) * splane;
            const size_t src_k = (size_t)(shift + kk + 1) * splane;
            double *dst = funcc + dst_k;
            const double *src = funcc + src_k;
            for (size_t p = 0; p < splane; ++p) dst[p] = src[p] * s3;
        }
    }
 }
 /* Public entry point: fill funcc from func with `ord` mirrored ghost layers
  * per axis (see symmetry_bd_impl). Does nothing when ord <= 0. */
 static inline void symmetry_bd(int ord,
                 const int extc[3],
                 const double *func,
                 double *funcc,
                 const double SoA[3])
 {
    switch (ord) {
    /* Fast paths used by current C kernels: ord=2 (derivs), ord=3 (lopsided/KO).
       Literal arguments are passed so the calls can be specialized. */
    case 2:
        symmetry_bd_impl(2, 1, extc, func, funcc, SoA);
        break;
    case 3:
        symmetry_bd_impl(3, 2, extc, func, funcc, SoA);
        break;
    default:
        /* Generic path for any other positive order; ord <= 0 is a no-op. */
        if (ord > 0) {
            symmetry_bd_impl(ord, ord - 1, extc, func, funcc, SoA);
        }
        break;
    }
 }
 #endif
--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
@@ -220,9 +220,16 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
    pox[2][n] = rex * nz_g[n];
  }
  double *shellf;
  shellf = new double[n_tot * InList];
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
  int mp, Lp, Nmin, Nmax;
  mp = n_tot / cpusize;
  Lp = n_tot - cpusize * mp;
  if (Lp > myrank)
  {
    Nmin = myrank * mp + myrank;
@@ -234,11 +241,6 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
    Nmax = Nmin + mp - 1;
  }
  double *shellf;
  shellf = new double[n_tot * InList];
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
  //|~~~~~> Integrate the dot product of Dphi with the surface normal.
  double *RP_out, *IP_out;
@@ -361,17 +363,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -563,17 +556,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, Comm_here);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, Comm_here);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -751,17 +735,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -1009,17 +984,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -1453,17 +1419,8 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH,
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -1897,17 +1854,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH,
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -2092,17 +2040,8 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch2 *GH, var *
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -2287,17 +2226,8 @@ void surface_integral::surf_Wave(double rex, int lev, NullShellPatch *GH, var *R
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
@@ -2384,9 +2314,25 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
    pox[2][n] = rex * nz_g[n];
  }
  double *shellf;
  shellf = new double[n_tot * InList];
  // we have assumed there is only one box on this level,
  // so we do not need loop boxes
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
  double Mass_out = 0;
  double ang_outx, ang_outy, ang_outz;
  double p_outx, p_outy, p_outz;
  ang_outx = ang_outy = ang_outz = 0.0;
  p_outx = p_outy = p_outz = 0.0;
  const double f1o8 = 0.125;
  int mp, Lp, Nmin, Nmax;
  mp = n_tot / cpusize;
  Lp = n_tot - cpusize * mp;
  if (Lp > myrank)
  {
    Nmin = myrank * mp + myrank;
@@ -2398,20 +2344,6 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
    Nmax = Nmin + mp - 1;
  }
  double *shellf;
  shellf = new double[n_tot * InList];
  // we have assumed there is only one box on this level,
  // so we do not need loop boxes
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
  double Mass_out = 0;
  double ang_outx, ang_outy, ang_outz;
  double p_outx, p_outy, p_outz;
  ang_outx = ang_outy = ang_outz = 0.0;
  p_outx = p_outy = p_outz = 0.0;
  const double f1o8 = 0.125;
  double Chi, Psi;
  double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
  double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;
@@ -2532,13 +2464,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
    }
  }
-  {
+  MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
+
-    double scalar_in[7];
+  MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
+  MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
+
-  }
+  MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 #ifdef GaussInt
  mass = mass * rex * rex * dphi * factor;
@@ -2801,13 +2735,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
    }
  }
-  {
+  MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
+
-    double scalar_in[7];
+  MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, Comm_here);
+  MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
+  MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
-    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
+
-  }
+  MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
  MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
  MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, Comm_here);
 #ifdef GaussInt
  mass = mass * rex * rex * dphi * factor;
@@ -3084,13 +3020,15 @@ void surface_integral::surf_MassPAng(double rex, int lev, ShellPatch *GH, var *c
    }
  }
-  {
+  MPI_Allreduce(&Mass_out, &mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double scalar_out[7] = {Mass_out, ang_outx, ang_outy, ang_outz, p_outx, p_outy, p_outz};
+
-    double scalar_in[7];
+  MPI_Allreduce(&ang_outx, &sx, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    MPI_Allreduce(scalar_out, scalar_in, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  MPI_Allreduce(&ang_outy, &sy, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    mass = scalar_in[0]; sx = scalar_in[1]; sy = scalar_in[2]; sz = scalar_in[3];
+  MPI_Allreduce(&ang_outz, &sz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    px = scalar_in[4]; py = scalar_in[5]; pz = scalar_in[6];
+
-  }
+  MPI_Allreduce(&p_outx, &px, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(&p_outy, &py, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  MPI_Allreduce(&p_outz, &pz, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 #ifdef GaussInt
  mass = mass * rex * rex * dphi * factor;
@@ -3669,17 +3607,8 @@ void surface_integral::surf_Wave(double rex, cgh *GH, ShellPatch *SH,
  }
  //|------+  Communicate and sum the results from each processor.
-  {
+  MPI_Allreduce(RP_out, RP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    double *RPIP_out = new double[2 * NN];
+  MPI_Allreduce(IP_out, IP, NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    double *RPIP = new double[2 * NN];
    memcpy(RPIP_out, RP_out, NN * sizeof(double));
    memcpy(RPIP_out + NN, IP_out, NN * sizeof(double));
    MPI_Allreduce(RPIP_out, RPIP, 2 * NN, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    memcpy(RP, RPIP, NN * sizeof(double));
    memcpy(IP, RPIP + NN, NN * sizeof(double));
    delete[] RPIP_out;
    delete[] RPIP;
  }
  //|------= Free memory.
--- a/AMSS_NCKU_source/tool.h
+++ b/AMSS_NCKU_source/tool.h
@@ -1,33 +0,0 @@
 #include "share_func.h"
 void fdderivs(const int ex[3],
              const double *f,
              double *fxx, double *fxy, double *fxz,
              double *fyy, double *fyz, double *fzz,
              const double *X, const double *Y, const double *Z,
              double SYM1, double SYM2, double SYM3,
              int Symmetry, int onoff);
 void fderivs(const int ex[3],
             const double *f,
             double *fx, double *fy, double *fz,
             const double *X, const double *Y, const double *Z,
             double SYM1, double SYM2, double SYM3,
             int Symmetry, int onoff);
 void kodis(const int ex[3],
           const double *X, const double *Y, const double *Z,
           const double *f, double *f_rhs,
           const double SoA[3],
           int Symmetry, double eps);
 void lopsided(const int ex[3],
              const double *X, const double *Y, const double *Z,
              const double *f, double *f_rhs,
              const double *Sfx, const double *Sfy, const double *Sfz,
              int Symmetry, const double SoA[3]);
 void lopsided_kodis(const int ex[3],
                    const double *X, const double *Y, const double *Z,
                    const double *f, double *f_rhs,
                    const double *Sfx, const double *Sfy, const double *Sfz,
                    int Symmetry, const double SoA[3], double eps);
--- a/generate_interp_lb_header.py
+++ b/generate_interp_lb_header.py
@@ -1,72 +0,0 @@
 #!/usr/bin/env python3
 """Convert interp_lb_profile.bin to a C header for compile-time embedding.

 Usage: <script> <profile.bin> <output.h>

 The input is read with native byte order as:
   uint32 magic, uint32 version, int32 nprocs, int32 num_heavy,
   double threshold, nprocs doubles (per-rank times),
   num_heavy int32 values (ids of the heavy blocks).

 The output header defines INTERP_LB_NPROCS, INTERP_LB_NUM_HEAVY and the
 tables interp_lb_heavy_blocks, interp_lb_splits and interp_lb_remaps.
 """
 import struct
 import sys
 if len(sys.argv) < 3:
    print(f"Usage: {sys.argv[0]} <profile.bin> <output.h>")
    sys.exit(1)
 with open(sys.argv[1], 'rb') as f:
    magic, version, nprocs, num_heavy = struct.unpack('IIii', f.read(16))
    # threshold is not used below, but it has to be consumed to reach the arrays.
    threshold = struct.unpack('d', f.read(8))[0]
    times = list(struct.unpack(f'{nprocs}d', f.read(nprocs * 8)))
    heavy = list(struct.unpack(f'{num_heavy}i', f.read(num_heavy * 4)))
 # For each heavy rank, compute split: left half -> lighter neighbor, right half -> heavy rank
 # (or vice versa depending on which neighbor is lighter)
 # A missing neighbor (first/last rank) is given a huge time so it is never preferred
 # over an existing one.
 splits = []
 for hr in heavy:
    prev_t = times[hr - 1] if hr > 0 else 1e30
    next_t = times[hr + 1] if hr < nprocs - 1 else 1e30
    if prev_t <= next_t:
        splits.append((hr, hr - 1, hr))  # (block_id, r_left, r_right)
    else:
        splits.append((hr, hr, hr + 1))
 # Also remap the displaced neighbor blocks
 # Ids of all split blocks, built once instead of on every loop iteration.
 heavy_ids = {s[0] for s in splits}
 remaps = {}
 for hr, r_l, r_r in splits:
    if r_l != hr:
        # We took r_l's slot, so remap block r_l to its other neighbor
        displaced = r_l
        if displaced > 0 and displaced - 1 not in heavy_ids:
            remaps[displaced] = displaced - 1
        elif displaced < nprocs - 1:
            remaps[displaced] = displaced + 1
    else:
        # We took r_r's slot, so remap block r_r to its other neighbor
        displaced = r_r
        if displaced < nprocs - 1 and displaced + 1 not in heavy_ids:
            remaps[displaced] = displaced + 1
        elif displaced > 0:
            remaps[displaced] = displaced - 1
 # The header comment contains a non-ASCII dash, so the encoding is fixed to
 # UTF-8 rather than left to the locale default.
 with open(sys.argv[2], 'w', encoding='utf-8') as out:
    out.write("/* Auto-generated from interp_lb_profile.bin — do not edit */\n")
    out.write("#ifndef INTERP_LB_PROFILE_DATA_H\n")
    out.write("#define INTERP_LB_PROFILE_DATA_H\n\n")
    out.write(f"#define INTERP_LB_NPROCS {nprocs}\n")
    out.write(f"#define INTERP_LB_NUM_HEAVY {num_heavy}\n\n")
    out.write(f"static const int interp_lb_heavy_blocks[{num_heavy}] = {{")
    out.write(", ".join(str(h) for h in heavy))
    out.write("};\n\n")
    out.write("/* Split table: {block_id, r_left, r_right} */\n")
    out.write(f"static const int interp_lb_splits[{num_heavy}][3] = {{\n")
    for bid, rl, rr in splits:
        out.write(f"    {{{bid}, {rl}, {rr}}},\n")
    out.write("};\n\n")
    out.write("/* Rank remap for displaced neighbor blocks */\n")
    out.write(f"static const int interp_lb_num_remaps = {len(remaps)};\n")
    out.write(f"static const int interp_lb_remaps[][2] = {{\n")
    for src, dst in sorted(remaps.items()):
        out.write(f"    {{{src}, {dst}}},\n")
    # Emit a sentinel row so the array is never empty.
    if not remaps:
        out.write("    {-1, -1},\n")
    out.write("};\n\n")
    out.write("#endif /* INTERP_LB_PROFILE_DATA_H */\n")
 # Human-readable summary of what was generated.
 print(f"Generated {sys.argv[2]}:")
 print(f"  {num_heavy} heavy blocks to split: {heavy}")
 for bid, rl, rr in splits:
    print(f"    block {bid}: split -> rank {rl} (left), rank {rr} (right)")
 for src, dst in sorted(remaps.items()):
    print(f"    block {src}: remap -> rank {dst}")
--- a/generate_macrodef.py
+++ b/generate_macrodef.py
@@ -392,6 +392,17 @@ def generate_macrodef_fh():
        print( "# Finite_Difference_Method #define ghost_width setting error!!!",   file=file1 )
        print(                                                   file=file1 )
    # Define macro DEBUG_NAN_CHECK
    # 0: off (default), 1: on
    debug_nan_check = getattr(input_data, "Debug_NaN_Check", 0)
    if debug_nan_check:
        print( "#define DEBUG_NAN_CHECK 1", file=file1 )
        print(                             file=file1 )
    else:
        print( "#define DEBUG_NAN_CHECK 0", file=file1 )
        print(                             file=file1 )
    # Whether to use a shell-patch grid
    # use shell or not
@@ -514,6 +525,9 @@ def generate_macrodef_fh():
    print( "    6th order: 4",                                                                      file=file1 )
    print( "    8th order: 5",                                                                      file=file1 )
    print(                                                                                          file=file1 )
    print( "define DEBUG_NAN_CHECK",                                                                file=file1 )
    print( "    0: off (default), 1: on",                                                           file=file1 )
    print(                                                                                          file=file1 )
    print( "define WithShell",                                                                      file=file1 )
    print( "    use shell or not",                                                                  file=file1 )
    print(                                                                                          file=file1 )
--- a/inputfile_example/AMSS_NCKU_Input.py
+++ b/inputfile_example/AMSS_NCKU_Input.py
@@ -36,6 +36,7 @@ Equation_Class           = "BSSN"                  ## Evolution Equation: choose
 Initial_Data_Method      = "Ansorg-TwoPuncture"    ## initial data method: choose "Ansorg-TwoPuncture", "Lousto-Analytical", "Cao-Analytical", "KerrSchild-Analytical"
 Time_Evolution_Method    = "runge-kutta-45"        ## time evolution method: choose "runge-kutta-45"
 Finite_Diffenence_Method = "4th-order"             ## finite-difference method: choose "2nd-order", "4th-order", "6th-order", "8th-order"
 Debug_NaN_Check          = 0                       ## enable NaN checks in compute_rhs_bssn: 0 (off) or 1 (on)
 #################################################
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -10,48 +10,18 @@
 import AMSS_NCKU_Input as input_data
 import subprocess
 import time
 ## CPU core binding configuration using taskset
 ## taskset ensures all child processes inherit the CPU affinity mask
 ## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
 ## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
 #NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
 NUMACTL_CPU_BIND = ""
-def get_last_n_cores_per_socket(n=32):
+## Build parallelism configuration
-    """
+## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
-    Read CPU topology via lscpu and return a taskset -c string
+## Set make -j to utilize available cores for faster builds
-    selecting the last `n` cores of each NUMA node (socket).
+BUILD_JOBS = 14
    Example: 2 sockets x 56 cores each, n=32 -> node0: 24-55, node1: 80-111
    -> "taskset -c 24-55,80-111"
    """
    result = subprocess.run(["lscpu", "--parse=NODE,CPU"], capture_output=True, text=True)
    # Build a dict: node_id -> sorted list of CPU ids
    node_cpus = {}
    for line in result.stdout.splitlines():
        if line.startswith("#") or not line.strip():
            continue
        parts = line.split(",")
        if len(parts) < 2:
            continue
        node_id, cpu_id = int(parts[0]), int(parts[1])
        node_cpus.setdefault(node_id, []).append(cpu_id)
    segments = []
    for node_id in sorted(node_cpus):
        cpus = sorted(node_cpus[node_id])
        selected = cpus[-n:]          # last n cores of this socket
        segments.append(f"{selected[0]}-{selected[-1]}")
    cpu_str = ",".join(segments)
    total = len(segments) * n
    print(f" CPU binding: taskset -c {cpu_str}  ({total} cores, last {n} per socket)")
    #return f"taskset -c {cpu_str}"
    return f""
 ## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total)
 NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
 ## Build parallelism: match the number of bound cores
 BUILD_JOBS = 64
 ##################################################################
@@ -70,7 +40,7 @@ def makefile_ABE():
    ## Build command with CPU binding to nohz_full cores
    if (input_data.GPU_Calculation == "no"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off ABE"
+        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
    elif (input_data.GPU_Calculation == "yes"):
        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
    else:
@@ -148,7 +118,6 @@ def run_ABE():
    if (input_data.GPU_Calculation == "no"):
        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
@@ -184,14 +153,13 @@ def run_ABE():
 ## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE
 def run_TwoPunctureABE():
-    tp_time1=time.time()
+
    print(                                                          )
    print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) 
    print(                                                          )
    ## Define the command to run
-    #TwoPuncture_command         = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
+    TwoPuncture_command         = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
    TwoPuncture_command         = " ./TwoPunctureABE"
    TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
    ## Execute the command with subprocess.Popen and stream output
@@ -212,9 +180,7 @@ def run_TwoPunctureABE():
    print(                                               )
    print( " The TwoPunctureABE simulation is finished " ) 
    print(                                               )
-    tp_time2=time.time()
+    
    et=tp_time2-tp_time1
    print(f"Used time: {et}")
    return
 ##################################################################
--- a/parallel_plot_helper.py
+++ b/parallel_plot_helper.py
@@ -1,29 +0,0 @@
 import multiprocessing
 def run_plot_task(task):
    """Run one plotting job and return whatever the plotting function returns.

    Parameters
    ----------
    task : tuple
        A two-element ``(function, args_tuple)`` pair: the callable to
        invoke and the positional arguments that are unpacked into it.

    Returns
    -------
    The result of ``function(*args_tuple)``.
    """
    plot_func, plot_args = task
    return plot_func(*plot_args)
 def run_plot_tasks_parallel(plot_tasks):
    """Run a list of independent plotting tasks concurrently in a process pool.

    The pool is built from the 'fork' multiprocessing context so that the
    worker processes do NOT re-import/re-execute the main script.  The
    per-task results are discarded and nothing is returned.

    Parameters
    ----------
    plot_tasks : list of tuples
        Each element is a ``(function, args_tuple)`` pair, as accepted by
        ``run_plot_task``.
    """
    with multiprocessing.get_context('fork').Pool() as workers:
        workers.map(run_plot_task, plot_tasks)
--- a/pgo_profile/TwoPunctureABE.profdata
+++ b/pgo_profile/TwoPunctureABE.profdata
--- a/pgo_profile/default.profdata
+++ b/pgo_profile/default.profdata
--- a/pgo_profile/default_9726853898452064389_0.profdata
+++ b/pgo_profile/default_9726853898452064389_0.profdata
--- a/pgo_profile/default_9726853898452064389_0.profraw
+++ b/pgo_profile/default_9726853898452064389_0.profraw
--- a/plot_GW_strain_amplitude_xiaoqu.py
+++ b/plot_GW_strain_amplitude_xiaoqu.py
@@ -11,8 +11,6 @@
 import numpy                               ## numpy for array operations
 import scipy                               ## scipy for interpolation and signal processing
 import math
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 import os                                  ## os for system/file operations
--- a/plot_binary_data.py
+++ b/plot_binary_data.py
@@ -8,23 +8,16 @@
 ##
 #################################################
 ## Restrict OpenMP to one thread per process so that running many
 ## workers in parallel does not cause an O(workers * BLAS_threads)
 ## thread explosion.  OMP_NUM_THREADS MUST be set before numpy/scipy
 ## are imported, because the BLAS library reads it only at load time.
 import os
 os.environ.setdefault("OMP_NUM_THREADS",        "1")
 import numpy
 import scipy
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt
 from   matplotlib.colors    import LogNorm
 from   mpl_toolkits.mplot3d import Axes3D
 ## import torch
 import AMSS_NCKU_Input      as input_data
 import os
 #########################################################################################
@@ -199,19 +192,3 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
 ####################################################################################
 ####################################################################################
 ## Allow this module to be run as a standalone script so that each
 ## binary-data plot can be executed in a fresh subprocess whose BLAS
 ## environment variables (set above) take effect before numpy loads.
 ##
 ## Usage:  python3 plot_binary_data.py <filename> <binary_outdir> <figure_outdir>
 ####################################################################################
 if __name__ == '__main__':
    import sys
    if len(sys.argv) != 4:
        print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
        sys.exit(1)
    plot_binary_data(sys.argv[1], sys.argv[2], sys.argv[3])
--- a/plot_xiaoqu.py
+++ b/plot_xiaoqu.py
@@ -8,8 +8,6 @@
 #################################################
 import numpy                               ## numpy for array operations
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 from   mpl_toolkits.mplot3d import Axes3D  ## needed for 3D plots
 import glob
@@ -17,9 +15,6 @@ import os                                  ## operating system utilities
 import plot_binary_data
 import AMSS_NCKU_Input as input_data
 import subprocess
 import sys
 import multiprocessing
 # plt.rcParams['text.usetex'] = True  ## enable LaTeX fonts in plots
@@ -55,40 +50,10 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
        file_list.append(x)
        print(x)
-    ## Plot each file in parallel using subprocesses.
+    ## Plot each file in the list
    ## Each subprocess is a fresh Python process where the BLAS thread-count
    ## environment variables (set at the top of plot_binary_data.py) take
    ## effect before numpy is imported.  This avoids the thread explosion
    ## that occurs when multiprocessing.Pool with 'fork' context inherits
    ## already-initialized multi-threaded BLAS from the parent.
    script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
    max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
    running = []
    failed  = []
    for filename in file_list:
        print(filename)
-        proc = subprocess.Popen(
+        plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
            [sys.executable, script, filename, binary_outdir, figure_outdir],
        )
        running.append( (proc, filename) )
        ## Keep at most max_workers subprocesses active at a time
        if len(running) >= max_workers:
            p, fn = running.pop(0)
            p.wait()
            if p.returncode != 0:
                failed.append(fn)
    ## Wait for all remaining subprocesses to finish
    for p, fn in running:
        p.wait()
        if p.returncode != 0:
            failed.append(fn)
    if failed:
        print( " WARNING: the following binary data plots failed:" )
        for fn in failed:
            print( "   ", fn )
    print(                        )
    print( " Binary Data Plot Has been Finished " )