fix: try to fix segfault at 240 steps by adding WithShell guard for writecheck_sh call

fix(build): update makefile.inc for debian 13
feat: port GPU code to CUDA 13 and enable GPU computation
2026-01-22 14:26:41 +08:00 · 2026-01-21 09:29:35 +08:00 · 2026-01-13 18:15:49 +00:00 · 2026-01-14 01:41:31 +08:00 · 2026-01-13 23:52:44 +08:00 · 2026-01-13 23:45:49 +08:00
23 changed files with 1485 additions and 2578 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,3 @@
 __pycache__
 GW150914
 GW150914-origin
 GW150914-mini
 docs
 *.tmp
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,14 +16,12 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 8                             ## number of mpi processes used in the simulation
+MPI_processes    = 96                             ## number of mpi processes used in the simulation
 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
 CPU_Part         = 1.0
 GPU_Part         = 0.0
 Debug_NaN_Check          = 0                       ## enable NaN checks in compute_rhs_bssn: 0 (off) or 1 (on)
 GPU_Calculation  = "yes"                         ## Use GPU or not
                                                 ## GPU support has been updated for CUDA 13
 CPU_Part         = 0.0
 GPU_Part         = 1.0
 #################################################
--- a/AMSS_NCKU_Input_Mini.py
+++ b/AMSS_NCKU_Input_Mini.py
@@ -1,233 +0,0 @@
 #################################################
 ##
 ## This file provides the input parameters required for numerical relativity.
 ## XIAOQU
 ## 2024/03/19 --- 2025/09/14
 ## Modified for GW150914-mini test case
 ##
 #################################################
 import numpy    
 #################################################
 ## Setting MPI processes and the output file directory
 File_directory   = "GW150914-mini"               ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
 MPI_processes    = 4                             ## number of mpi processes used in the simulation (Reduced for laptop)
 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
 CPU_Part         = 1.0
 GPU_Part         = 0.0
 #################################################
 #################################################
 ## Setting the physical system and numerical method
 Symmetry                 = "equatorial-symmetry"   ## Symmetry of System: choose equatorial-symmetry、no-symmetry、octant-symmetry
 Equation_Class           = "BSSN"                  ## Evolution Equation: choose "BSSN", "BSSN-EScalar", "BSSN-EM", "Z4C" 
                                                   ## If "BSSN-EScalar" is chosen, it is necessary to set other parameters below
 Initial_Data_Method      = "Ansorg-TwoPuncture"    ## initial data method: choose "Ansorg-TwoPuncture", "Lousto-Analytical", "Cao-Analytical", "KerrSchild-Analytical"
 Time_Evolution_Method    = "runge-kutta-45"        ## time evolution method: choose "runge-kutta-45"
 Finite_Diffenence_Method = "4th-order"             ## finite-difference method: choose "2nd-order", "4th-order", "6th-order", "8th-order"
 Debug_NaN_Check          = 0                       ## enable NaN checks in compute_rhs_bssn: 0 (off) or 1 (on)
 #################################################
 #################################################
 ## Setting the time evolutionary information
 Start_Evolution_Time     = 0.0                    ## start evolution time t0
 Final_Evolution_Time     = 100.0                  ## final evolution time t1 (Reduced for quick test)
 Check_Time               = 10.0
 Dump_Time                = 10.0                   ## time inteval dT for dumping binary data
 D2_Dump_Time             = 10.0                   ## dump the ascii data for 2d surface after dT'
 Analysis_Time            = 1.0                    ## dump the puncture position and GW psi4 after dT"
 Evolution_Step_Number    = 10000000               ## stop the calculation after the maximal step number
 Courant_Factor           = 0.5                    ## Courant Factor
 Dissipation              = 0.15                   ## Kreiss-Oliger Dissipation Strength
 #################################################
 #################################################
 ## Setting the grid structure
 basic_grid_set    = "Patch"                          ## grid structure: choose "Patch" or "Shell-Patch"
 grid_center_set   = "Cell"                           ## grid center: chose "Cell" or "Vertex"
 grid_level        = 7                                ## total number of AMR grid levels (Reduced from 9)
 static_grid_level = 4                                ## number of AMR static grid levels (Reduced from 5)
 moving_grid_level = grid_level - static_grid_level   ## number of AMR moving grid levels
 analysis_level    = 0
 refinement_level  = 3                                ## time refinement start from this grid level
 largest_box_xyz_max = [320.0, 320.0, 320.0]          ## scale of the largest box
                                                     ## not ne cess ary to be cubic for "Patch" grid s tructure
                                                     ## need to be a cubic box for "Shell-Patch" grid structure
 largest_box_xyz_min = - numpy.array(largest_box_xyz_max)  
 static_grid_number = 48                              ## grid points of each static AMR grid (in x direction) (Reduced from 96)
                                                     ## (grid points in y and z directions are automatically adjusted)
 moving_grid_number = 24                              ## grid points of each moving AMR grid (Reduced from 48)
 shell_grid_number  = [32, 32, 100]                   ## grid points of Shell-Patch grid
                                                     ## in (phi, theta, r) direction
 devide_factor      = 2.0                             ## resolution between different grid levels dh0/dh1, only support 2.0 now
 static_grid_type   = 'Linear'                        ## AMR static grid structure , only supports "Linear"
 moving_grid_type   = 'Linear'                        ## AMR moving grid structure , only supports "Linear"
 quarter_sphere_number = 48                           ## grid number of 1/4 s pher ical surface (Reduced from 96)
                                                     ## (which is needed for evaluating the spherical surface integral)
 #################################################
 #################################################
 ## Setting the puncture information
 puncture_number       = 2                                     
 position_BH           = numpy.zeros( (puncture_number, 3) )   
 parameter_BH          = numpy.zeros( (puncture_number, 3) )   
 dimensionless_spin_BH = numpy.zeros( (puncture_number, 3) )   
 momentum_BH           = numpy.zeros( (puncture_number, 3) )   
 puncture_data_set     = "Manually"                       ## Method to give Puncture’s positions and momentum
                                                         ## choose "Manually" or "Automatically-BBH"
                                                         ## Prefer to choose "Manually", because "Automatically-BBH" is developing now
 ## initial orbital distance and ellipticity for BBHs system
 ## ( needed for "Automatically-BBH" case , not affect the "Manually" case )
 Distance = 10.0
 e0       = 0.0
 ## black hole parameter (M Q* a*)
 parameter_BH[0] = [ 36.0/(36.0+29.0),  0.0,  +0.31 ]   
 parameter_BH[1] = [ 29.0/(36.0+29.0),  0.0,  -0.46 ]  
 ## dimensionless spin in each direction
 dimensionless_spin_BH[0] = [ 0.0,  0.0,  +0.31 ]   
 dimensionless_spin_BH[1] = [ 0.0,  0.0,  -0.46 ]  
 ## use Brugmann's convention
 ##  -----0-----> y
 ##   -      +     
 #---------------------------------------------
 ## If puncture_data_set is chosen to be "Manually", it is necessary to set the position and momentum of each puncture manually
 ## initial position for each puncture
 position_BH[0]  = [  0.0,  10.0*29.0/(36.0+29.0), 0.0 ]  
 position_BH[1]  = [  0.0, -10.0*36.0/(36.0+29.0), 0.0 ] 
 ## initial mumentum for each puncture
 ## (needed for "Manually" case, does not affect the "Automatically-BBH" case)
 momentum_BH[0]  = [ -0.09530152296974252,  -0.00084541526517121,   0.0 ]
 momentum_BH[1]  = [ +0.09530152296974252,  +0.00084541526517121,   0.0 ]
 #################################################
 #################################################
 ## Setting the gravitational wave information
 GW_L_max        = 4                      ## maximal L number in gravitational wave
 GW_M_max        = 4                      ## maximal M number in gravitational wave
 Detector_Number = 12                     ## number of dector
 Detector_Rmin   = 50.0                   ## nearest dector distance
 Detector_Rmax   = 160.0                  ## farest dector distance
 #################################################
 #################################################
 ## Setting the apprent horizon
 AHF_Find       = "no"                    ## whether to find the apparent horizon: choose "yes" or "no"
 AHF_Find_Every = 24
 AHF_Dump_Time  = 20.0
 #################################################
 #################################################
 ## Other parameters (testing)
 ## Only influence the Equation_Class = "BSSN-EScalar" case
 FR_a2     = 3.0        ## f(R) = R + a2 * R^2    
 FR_l2     = 10000.0
 FR_phi0   = 0.00005
 FR_r0     = 120.0
 FR_sigma0 = 8.0
 FR_Choice = 2          ## Choice options: 1 2 3 4 5
                       ## 1: phi(r) = phi0 * Exp(-(r-r0)**2/sigma0)   
                       ##    V(r)   = 0
                       ## 2: phi(r) =  phi0 * a2^2/(1+a2^2)  
                       ##    V(r)   = Exp(-8*Sqrt(PI/3)*phi(r)) * (1-Exp(4*Sqrt(PI/3)*phi(r)))**2 / (32*PI*a2)
                       ## 3: Schrodinger-Newton gived by system phi(r) 
                       ##    V(r)   = Exp(-8*Sqrt(PI/3)*phi(r)) * (1-Exp(4*Sqrt(PI/3)*phi(r)))**2 / (32*PI*a2)
                       ## 4: phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma0) - tanh((r-r0)/sigma0) )  
                       ##    V(r)   = 0
                       ##    f(R)   = R + a2*R^2  with a2 = +oo
                       ## 5: phi(r) = phi0 * Exp(-(r-r0)**2/sigma)   
                       ##    V(r)   = 0
 #################################################
 #################################################
 ## Other parameters (testing)
 ## (please do not change if not necessary)
 boundary_choice = "BAM-choice"     ## Sommerfeld boundary condition : choose "BAM-choice" or "Shibata-choice" 
                                   ## prefer "BAM-choice"
 gauge_choice  = 0                  ## gauge choice
                                   ## 0: B^i gauge
                                   ## 1: David's puncture gauge
                                   ## 2: MB B^i gauge               
                                   ## 3: RIT B^i gauge
                                   ## 4: MB beta gauge 
                                   ## 5: RIT beta gauge 
                                   ## 6: MGB1 B^i gauge
                                   ## 7: MGB2 B^i gauge
                                   ## prefer 0 or 1
 tetrad_type  = 2                   ## tetradtype 
                                   ##  v:r; u: phi; w: theta
                                   ##      v^a = (x,y,z)
                                   ## 0: orthonormal order: v,u,w
                                   ##    v^a = (x,y,z)   
                                   ##    m = (phi - i theta)/sqrt(2) 
                                   ##    following Frans, Eq.(8) of  PRD 75, 124018(2007)
                                   ## 1: orthonormal order: w,u,v
                                   ##    m = (theta + i phi)/sqrt(2) 
                                   ##    following Sperhake, Eq.(3.2) of  PRD 85, 124062(2012)    
                                   ## 2: orthonormal order: v,u,w
                                   ##    v_a = (x,y,z)
                                   ##    m = (phi - i theta)/sqrt(2) 
                                   ##    following Frans, Eq.(8) of  PRD 75, 124018(2007)
                                   ## this version recommend set to 2
                                   ## prefer 2
 #################################################
--- a/AMSS_NCKU_MiniProgram.py
+++ b/AMSS_NCKU_MiniProgram.py
@@ -1,224 +0,0 @@
 ##################################################################
 ##
 ## AMSS-NCKU Numerical Relativity Mini Test Program
 ## Author: Assistant (based on Xiaoqu's code)
 ## 2026/01/20
 ##
 ## This script runs a scaled-down version of the GW150914 test case
 ## suitable for laptop testing.
 ##
 ##################################################################
 import os
 import shutil
 import sys
 import time
 # --- Context Manager for Input File Swapping ---
 class InputFileSwapper:
    def __init__(self, mini_file="AMSS_NCKU_Input_Mini.py", target_file="AMSS_NCKU_Input.py"):
        self.mini_file = mini_file
        self.target_file = target_file
        self.backup_file = target_file + ".bak"
        self.swapped = False
    def __enter__(self):
        print(f"[MiniProgram] Swapping {self.target_file} with {self.mini_file}...")
        if os.path.exists(self.target_file):
            shutil.move(self.target_file, self.backup_file)
        shutil.copy(self.mini_file, self.target_file)
        self.swapped = True
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        if self.swapped:
            print(f"[MiniProgram] Restoring original {self.target_file}...")
            os.remove(self.target_file)
            if os.path.exists(self.backup_file):
                shutil.move(self.backup_file, self.target_file)
 def main():
    # Use the swapper to ensure all imported modules see the mini configuration
    with InputFileSwapper():
        # Import modules AFTER swapping input file
        try:
            import AMSS_NCKU_Input as input_data
            import print_information
            import setup
            import numerical_grid
            import generate_macrodef
            import makefile_and_run
            import generate_TwoPuncture_input
            import renew_puncture_parameter
            import plot_xiaoqu
            import plot_GW_strain_amplitude_xiaoqu
        except ImportError as e:
            print(f"Error importing modules: {e}")
            return
        print_information.print_program_introduction()
        print("\n" + "#"*60)
        print(" RUNNING MINI TEST CASE: GW150914-mini")
        print("#"*60 + "\n")
        # --- Directory Setup ---
        File_directory = os.path.join(input_data.File_directory)
        if os.path.exists(File_directory):
            print(f" Output directory '{File_directory}' exists. Removing for mini test...")
            shutil.rmtree(File_directory, ignore_errors=True)
        os.mkdir(File_directory)
        shutil.copy("AMSS_NCKU_Input.py", File_directory) # Copies the current (mini) input
        output_directory = os.path.join(File_directory, "AMSS_NCKU_output")
        os.mkdir(output_directory)
        binary_results_directory = os.path.join(output_directory, input_data.Output_directory)
        os.mkdir(binary_results_directory)
        figure_directory = os.path.join(File_directory, "figure")
        os.mkdir(figure_directory)
        print(" Output directories generated.\n")
        # --- Setup and Input Generation ---
        setup.print_input_data(File_directory)
        setup.generate_AMSSNCKU_input()
        setup.print_puncture_information()
        print("\n Generating AMSS-NCKU input parfile...")
        numerical_grid.append_AMSSNCKU_cgh_input()
        print("\n Plotting initial grid...")
        numerical_grid.plot_initial_grid()
        print("\n Generating macro files...")
        generate_macrodef.generate_macrodef_h()
        generate_macrodef.generate_macrodef_fh()
        # --- Compilation Preparation ---
        print("\n Preparing to compile and run...")
        AMSS_NCKU_source_path = "AMSS_NCKU_source"
        AMSS_NCKU_source_copy = os.path.join(File_directory, "AMSS_NCKU_source_copy")
        if not os.path.exists(AMSS_NCKU_source_path):
             print(" Error: AMSS_NCKU_source not found! Please run in the project root.")
             return
        shutil.copytree(AMSS_NCKU_source_path, AMSS_NCKU_source_copy)
        macrodef_h_path  = os.path.join(File_directory, "macrodef.h") 
        macrodef_fh_path = os.path.join(File_directory, "macrodef.fh") 
        shutil.copy2(macrodef_h_path,  AMSS_NCKU_source_copy)
        shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)
        # --- Compilation ---
        cwd = os.getcwd()
        os.chdir(AMSS_NCKU_source_copy)
        print(" Compiling ABE...")
        makefile_and_run.makefile_ABE()
        if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ): 
            print(" Compiling TwoPunctureABE...")
            makefile_and_run.makefile_TwoPunctureABE()
        os.chdir(cwd)
        # --- Copy Executables ---
        if (input_data.GPU_Calculation == "no"):
            ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE")
        else:
            ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABEGPU")
        if not os.path.exists(ABE_file):
            print(" Error: ABE executable compilation failed.")
            return
        shutil.copy2(ABE_file, output_directory)
        TwoPuncture_file = os.path.join(AMSS_NCKU_source_copy, "TwoPunctureABE")
        if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
            if not os.path.exists(TwoPuncture_file):
                print(" Error: TwoPunctureABE compilation failed.")
                return
            shutil.copy2(TwoPuncture_file, output_directory)
        # --- Execution ---
        start_time = time.time()
        if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
             print("\n Generating TwoPuncture input...")
             generate_TwoPuncture_input.generate_AMSSNCKU_TwoPuncture_input()
             AMSS_NCKU_TwoPuncture_inputfile = 'AMSS-NCKU-TwoPuncture.input'
             AMSS_NCKU_TwoPuncture_inputfile_path = os.path.join( File_directory, AMSS_NCKU_TwoPuncture_inputfile )
             shutil.copy2( AMSS_NCKU_TwoPuncture_inputfile_path, os.path.join(output_directory, 'TwoPunctureinput.par') )
             print(" Running TwoPunctureABE...")
             os.chdir(output_directory)
             makefile_and_run.run_TwoPunctureABE()
             os.chdir(cwd)
        # Update Puncture Parameter
        renew_puncture_parameter.append_AMSSNCKU_BSSN_input(File_directory, output_directory)
        AMSS_NCKU_inputfile = 'AMSS-NCKU.input'
        AMSS_NCKU_inputfile_path = os.path.join(File_directory, AMSS_NCKU_inputfile)
        shutil.copy2( AMSS_NCKU_inputfile_path, os.path.join(output_directory, 'input.par') )
        print("\n Input files ready. Launching ABE...")
        os.chdir(output_directory)
        makefile_and_run.run_ABE()
        os.chdir(cwd)
        end_time = time.time()
        elapsed_time = end_time - start_time
        # --- Post-processing ---
        print("\n Copying output files for inspection...")
        AMSS_NCKU_error_file_path = os.path.join(binary_results_directory, "setting.par")
        if os.path.exists(AMSS_NCKU_error_file_path):
            shutil.copy( AMSS_NCKU_error_file_path, os.path.join(output_directory, "AMSSNCKU_setting_parameter") )
        AMSS_NCKU_error_file_path = os.path.join(binary_results_directory, "Error.log")
        if os.path.exists(AMSS_NCKU_error_file_path):
            shutil.copy( AMSS_NCKU_error_file_path, os.path.join(output_directory, "Error.log") )
        for fname in ["bssn_BH.dat", "bssn_ADMQs.dat", "bssn_psi4.dat", "bssn_constraint.dat"]:
            fpath = os.path.join(binary_results_directory, fname)
            if os.path.exists(fpath):
                shutil.copy(fpath, os.path.join(output_directory, fname))
        # --- Plotting ---
        print("\n Plotting results...")
        try:
            plot_xiaoqu.generate_puncture_orbit_plot(   binary_results_directory, figure_directory )
            plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
            plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
            for i in range(input_data.Detector_Number):
                plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
                plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
            for i in range(input_data.Detector_Number):
                plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
            for i in range(input_data.grid_level):
                plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
            plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
        except Exception as e:
            print(f"Warning: Plotting failed: {e}")
        print(f"\n Program Cost = {elapsed_time:.2f} Seconds \n")
        print(" AMSS-NCKU-Python simulation finished (Mini Test).\n")
 if __name__ == "__main__":
    main()
--- a/AMSS_NCKU_Verify_ASC26.py
+++ b/AMSS_NCKU_Verify_ASC26.py
@@ -1,279 +0,0 @@
 #!/usr/bin/env python3
 """
 AMSS-NCKU GW150914 Simulation Regression Test Script
 Verification Requirements:
 1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
 2. ADM constraint violation < 2 (Grid Level 0)
 RMS Calculation Method:
 - Computes trajectory deviation on the XY plane independently for BH1 and BH2
 - For each black hole: RMS = sqrt((1/M) * sum((Δr_i / r_i^max)^2)) × 100%
 - Final RMS = max(RMS_BH1, RMS_BH2)
 Usage: python3 AMSS_NCKU_Verify_ASC26.py [output_dir]
 Default: output_dir = GW150914/AMSS_NCKU_output
 Reference: GW150914-origin (baseline simulation)
 """
 import numpy as np
 import sys
 import os
 # ANSI Color Codes
 class Color:
    GREEN = '\033[92m'
    RED = '\033[91m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    BOLD = '\033[1m'
    RESET = '\033[0m'
 def get_status_text(passed):
    if passed:
        return f"{Color.GREEN}{Color.BOLD}PASS{Color.RESET}"
    else:
        return f"{Color.RED}{Color.BOLD}FAIL{Color.RESET}"
 def load_bh_trajectory(filepath):
    """Load black hole trajectory data"""
    data = np.loadtxt(filepath)
    return {
        'time': data[:, 0],
        'x1': data[:, 1], 'y1': data[:, 2], 'z1': data[:, 3],
        'x2': data[:, 4], 'y2': data[:, 5], 'z2': data[:, 6]
    }
 def load_constraint_data(filepath):
    """Load constraint violation data"""
    data = []
    with open(filepath, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            parts = line.split()
            if len(parts) >= 8:
                data.append([float(x) for x in parts[:8]])
    return np.array(data)
 def calculate_rms_error(bh_data_ref, bh_data_target):
    """
    Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
    This function computes the RMS error independently for BH1 and BH2 trajectories,
    then returns the maximum of the two as the final RMS error metric.
    For each black hole, the RMS is calculated as:
        RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
    where:
        Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
        r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
    Args:
        bh_data_ref: Reference (baseline) trajectory data
        bh_data_target: Target (optimized) trajectory data
    Returns:
        rms_value: Final RMS error as a percentage (max of BH1 and BH2)
        error: Error message if any
    """
    # Align data: truncate to the length of the shorter dataset
    M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
    if M < 10:
        return None, "Insufficient data points for comparison"
    # Extract XY coordinates for both black holes
    x1_ref = bh_data_ref['x1'][:M]
    y1_ref = bh_data_ref['y1'][:M]
    x2_ref = bh_data_ref['x2'][:M]
    y2_ref = bh_data_ref['y2'][:M]
    x1_new = bh_data_target['x1'][:M]
    y1_new = bh_data_target['y1'][:M]
    x2_new = bh_data_target['x2'][:M]
    y2_new = bh_data_target['y2'][:M]
    # Calculate RMS for BH1
    delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
    r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
    r1_new = np.sqrt(x1_new**2 + y1_new**2)
    r1_max = np.maximum(r1_ref, r1_new)
    # Calculate RMS for BH2
    delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
    r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
    r2_new = np.sqrt(x2_new**2 + y2_new**2)
    r2_max = np.maximum(r2_ref, r2_new)
    # Avoid division by zero for BH1
    valid_mask1 = r1_max > 1e-15
    if np.sum(valid_mask1) < 10:
        return None, "Insufficient valid data points for BH1"
    terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
    rms_bh1 = np.sqrt(np.mean(terms1)) * 100
    # Avoid division by zero for BH2
    valid_mask2 = r2_max > 1e-15
    if np.sum(valid_mask2) < 10:
        return None, "Insufficient valid data points for BH2"
    terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
    rms_bh2 = np.sqrt(np.mean(terms2)) * 100
    # Final RMS is the maximum of BH1 and BH2
    rms_final = max(rms_bh1, rms_bh2)
    return rms_final, None
 def analyze_constraint_violation(constraint_data, n_levels=9):
    """
    Analyze ADM constraint violation
    Return maximum constraint violation for Grid Level 0
    """
    # Extract Grid Level 0 data (first entry for each time step)
    level0_data = constraint_data[::n_levels]
    # Calculate maximum absolute value for each constraint
    results = {
        'Ham': np.max(np.abs(level0_data[:, 1])),
        'Px': np.max(np.abs(level0_data[:, 2])),
        'Py': np.max(np.abs(level0_data[:, 3])),
        'Pz': np.max(np.abs(level0_data[:, 4])),
        'Gx': np.max(np.abs(level0_data[:, 5])),
        'Gy': np.max(np.abs(level0_data[:, 6])),
        'Gz': np.max(np.abs(level0_data[:, 7]))
    }
    results['max_violation'] = max(results.values())
    return results
 def print_header():
    """Print report header"""
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
    print(Color.BOLD + "   AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
 def print_rms_results(rms_rel, error, threshold=1.0):
    """Print RMS error results"""
    print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
    print("-" * 45)
    if error:
        print(f"   {Color.RED}Error: {error}{Color.RESET}")
        return False
    passed = rms_rel < threshold
    print(f"   RMS relative error: {rms_rel:.4f}%")
    print(f"   Requirement:        < {threshold}%")
    print(f"   Status:             {get_status_text(passed)}")
    return passed
 def print_constraint_results(results, threshold=2.0):
    """Print constraint violation results"""
    print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
    print("-" * 45)
    names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
    for i, name in enumerate(names):
        print(f"   Max |{name:3}|: {results[name]:.6f}", end="   ")
        if (i + 1) % 2 == 0: print()
    if len(names) % 2 != 0: print()
    passed = results['max_violation'] < threshold
    print(f"\n   Maximum violation:  {results['max_violation']:.6f}")
    print(f"   Requirement:        < {threshold}")
    print(f"   Status:             {get_status_text(passed)}")
    return passed
 def print_summary(rms_passed, constraint_passed):
    """Print summary"""
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
    print(Color.BOLD + "Verification Summary" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
    all_passed = rms_passed and constraint_passed
    res_rms = get_status_text(rms_passed)
    res_con = get_status_text(constraint_passed)
    print(f"   [1] RMS trajectory check:         {res_rms}")
    print(f"   [2] ADM constraint check:         {res_con}")
    final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
    print(f"\n   Overall result: {final_status}")
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET + "\n")
    return all_passed
 def main():
    # Determine target (optimized) output directory
    if len(sys.argv) > 1:
        target_dir = sys.argv[1]
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
    # Determine reference (baseline) directory
    script_dir = os.path.dirname(os.path.abspath(__file__))
    reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
    # Data file paths
    bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
    bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
    constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
    # Check if files exist
    if not os.path.exists(bh_file_ref):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
        sys.exit(1)
    if not os.path.exists(bh_file_target):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
        sys.exit(1)
    if not os.path.exists(constraint_file):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
        sys.exit(1)
    # Print header
    print_header()
    print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
    print(f"{Color.BOLD}Target (Optimized):  {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
    # Load data
    bh_data_ref = load_bh_trajectory(bh_file_ref)
    bh_data_target = load_bh_trajectory(bh_file_target)
    constraint_data = load_constraint_data(constraint_file)
    # Calculate RMS error
    rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
    rms_passed = print_rms_results(rms_rel, error)
    # Analyze constraint violation
    constraint_results = analyze_constraint_violation(constraint_data)
    constraint_passed = print_constraint_results(constraint_results)
    # Print summary
    all_passed = print_summary(rms_passed, constraint_passed)
    # Return exit code
    sys.exit(0 if all_passed else 1)
 if __name__ == "__main__":
    main()
--- a/AMSS_NCKU_source/FFT.f90
+++ b/AMSS_NCKU_source/FFT.f90
@@ -37,51 +37,57 @@ close(77)
 end program checkFFT
 #endif
 !-------------
 ! Optimized FFT using Intel oneMKL DFTI
 ! Mathematical equivalence: Standard DFT definition
 !   Forward (isign=1):  X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
 !   Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
 ! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
 !-------------
 SUBROUTINE four1(dataa,nn,isign)
 use MKL_DFTI
 implicit none
-INTEGER, intent(in) :: isign, nn
+INTEGER::isign,nn
-DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa
+double precision,dimension(2*nn)::dataa
-
+INTEGER::i,istep,j,m,mmax,n
-type(DFTI_DESCRIPTOR), pointer :: desc
+double precision::tempi,tempr
-integer :: status
+DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
-
+n=2*nn
-! Create DFTI descriptor for 1D complex-to-complex transform
+j=1
-status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn)
+do i=1,n,2
-if (status /= 0) return
+  if(j.gt.i)then
-
+     tempr=dataa(j)
-! Set input/output storage as interleaved complex (default)
+     tempi=dataa(j+1)
-status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
+     dataa(j)=dataa(i)
-if (status /= 0) then
+     dataa(j+1)=dataa(i+1)
-   status = DftiFreeDescriptor(desc)
+     dataa(i)=tempr
-   return
+     dataa(i+1)=tempi
  endif
  m=nn
 1 if ((m.ge.2).and.(j.gt.m)) then
  j=j-m
  m=m/2
 goto 1
  endif
 j=j+m
 enddo
 mmax=2
 2  if (n.gt.mmax) then
     istep=2*mmax
     theta=6.28318530717959d0/(isign*mmax)
     wpr=-2.d0*sin(0.5d0*theta)**2
     wpi=sin(theta)
     wr=1.d0
     wi=0.d0
     do m=1,mmax,2
       do i=m,n,istep
         j=i+mmax
         tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
         tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
         dataa(j)=dataa(i)-tempr
         dataa(j+1)=dataa(i+1)-tempi
         dataa(i)=dataa(i)+tempr
         dataa(i+1)=dataa(i+1)+tempi
       enddo
          wtemp=wr
          wr=wr*wpr-wi*wpi+wr
          wi=wi*wpr+wtemp*wpi+wi
     enddo
 mmax=istep
 goto 2
 endif
 ! Commit the descriptor
 status = DftiCommitDescriptor(desc)
 if (status /= 0) then
   status = DftiFreeDescriptor(desc)
   return
 endif
 ! Execute FFT based on direction
 if (isign == 1) then
   ! Forward FFT: exp(-2*pi*i*k*n/N)
   status = DftiComputeForward(desc, dataa)
 else
   ! Backward FFT: exp(+2*pi*i*k*n/N)
   status = DftiComputeBackward(desc, dataa)
 endif
 ! Free descriptor
 status = DftiFreeDescriptor(desc)
 return
 END SUBROUTINE four1
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -313,7 +313,7 @@ MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int i
  int split_size, min_size, block_size = 0;
-  int min_width = Mymax(2 * ghost_width + 2, buffer_width + 2);
+  int min_width = 2 * Mymax(ghost_width, buffer_width);
  int nxyz[dim], mmin_width[dim], min_shape[dim];
  MyList<Patch> *PLi = PatchLIST;
@@ -641,7 +641,7 @@ MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int i
  int split_size, min_size, block_size = 0;
-  int min_width = Mymax(2 * ghost_width + 2, buffer_width + 2);
+  int min_width = 2 * Mymax(ghost_width, buffer_width);
  int nxyz[dim], mmin_width[dim], min_shape[dim];
  MyList<Patch> *PLi = PatchLIST;
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
@@ -27,7 +27,6 @@ using namespace std;
 #endif
 #include "TwoPunctures.h"
 #include <mkl_cblas.h>
 TwoPunctures::TwoPunctures(double mp, double mm, double b,
                           double P_plusx, double P_plusy, double P_plusz,
@@ -892,17 +891,25 @@ double TwoPunctures::norm1(double *v, int n)
 /* -------------------------------------------------------------------------*/
 double TwoPunctures::norm2(double *v, int n)
 {
-  // Optimized with oneMKL BLAS DNRM2
+  int i;
-  // Computes: sqrt(sum(v[i]^2))
+  double result = 0;
-  return cblas_dnrm2(n, v, 1);
+
  for (i = 0; i < n; i++)
    result += v[i] * v[i];
  return sqrt(result);
 }
 /* -------------------------------------------------------------------------*/
 double TwoPunctures::scalarproduct(double *v, double *w, int n)
 {
-  // Optimized with oneMKL BLAS DDOT
+  int i;
-  // Computes: sum(v[i] * w[i])
+  double result = 0;
-  return cblas_ddot(n, v, 1, w, 1);
+
  for (i = 0; i < n; i++)
    result += v[i] * w[i];
  return result;
 }
 /* -------------------------------------------------------------------------*/
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -18,7 +18,7 @@ using namespace std;
 #include <fstream>
 #endif
-void compare_result_gpu(int ftag1,double * datac,int data_num){
+static void compare_result_gpu(int ftag1,double * datac,int data_num){
 	double * data = (double*)malloc(sizeof(double)*data_num);
 	cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
 	compare_result(ftag1,data,data_num);
@@ -83,7 +83,7 @@ inline void sub_enforce_ga(int matrix_size){
 	double * trA = M_ chin1;
 	enforce_ga<<<GRID_DIM,BLOCK_DIM>>>(trA);
 	cudaMemset(trA,0,matrix_size * sizeof(double));
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 	//cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double));
 	//trA gxx,gyy,gzz gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
@@ -273,13 +273,13 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub
 #endif //ifdef Vertex
 inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
 	sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
@@ -378,9 +378,9 @@ inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *f
 	cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
 	cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
 	cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 	sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 }
 __global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz  )
@@ -445,9 +445,9 @@ inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,
 	cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
 	cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 	sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void computeRicci_part1(double * dst)
@@ -465,9 +465,9 @@ __global__ void computeRicci_part1(double * dst)
 inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
 {
 	sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }/*Exception*/
@@ -524,9 +524,9 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
 inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
 {
 	sub_symmetry_bd(3,f,fh,SoA);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void  sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz)
@@ -617,9 +617,9 @@ __global__ void  sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S
 inline void  sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
 	sub_symmetry_bd(3,f,fh,SoA);
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 	sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
-	cudaThreadSynchronize(); 
+	cudaDeviceSynchronize(); 
 }
 __global__ void compute_rhs_bssn_part1() 
@@ -2656,13 +2656,13 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 #ifdef TIMING1
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"TIME USED"<<TimeBetween(tv1, tv2)<<endl; 
 #endif	
 	//cout<<"GPU meta data ready.\n";
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 //--------------test constant memory address & value--------------
 /*	double rank = mpi_rank;
@@ -2685,7 +2685,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	//sub_enforce_ga(matrix_size);
 	//4.1-----compute rhs---------
 	compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
 	sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -2701,7 +2701,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
  	compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
 	sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -2711,7 +2711,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
 	compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
 	computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -2720,20 +2720,20 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta);
 	computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
 	compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
 	compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
 	sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
@@ -2805,7 +2805,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	if(co == 0){
 		compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 		sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
 		sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -2814,7 +2814,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 		sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
 		sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
 		compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 	}
 #if (ABV == 1)
@@ -2895,7 +2895,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 //-------------------FOR GPU TEST----------------------
 //-----------------------------------------------------
 #ifdef TIMING
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl; 
 #endif
--- a/AMSS_NCKU_source/bssn_gpu.h
+++ b/AMSS_NCKU_source/bssn_gpu.h
@@ -4,6 +4,17 @@
 #include "bssn_macro.h"
 #include "macrodef.fh"
 // CUDA error checking macro for CUDA 13 compatibility
 #define CUDA_SAFE_CALL(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)
 #define DEVICE_ID 0
 // #define DEVICE_ID_BY_MPI_RANK
 #define GRID_DIM 256
--- a/AMSS_NCKU_source/bssn_gpu_class.C
+++ b/AMSS_NCKU_source/bssn_gpu_class.C
@@ -2134,7 +2134,9 @@ void bssn_class::Evolve(int Steps)
      CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
      CheckPoint->writecheck_cgh(PhysTime, GH);
 #ifdef WithShell
      CheckPoint->writecheck_sh(PhysTime, SH);
 #endif
      CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
    }
  }
--- a/AMSS_NCKU_source/bssn_gpu_rhs_ss.cu
+++ b/AMSS_NCKU_source/bssn_gpu_rhs_ss.cu
@@ -20,7 +20,7 @@ using namespace std;
 __device__ volatile unsigned int global_count = 0;
-void compare_result_gpu(int ftag1,double * datac,int data_num){
+static void compare_result_gpu(int ftag1,double * datac,int data_num){
 	double * data = (double*)malloc(sizeof(double)*data_num);
 	cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
 	compare_result(ftag1,data,data_num);
@@ -153,11 +153,11 @@ __global__ void sub_symmetry_bd_ss_partJ(int ord,double * func, double * funcc,d
 inline void sub_symmetry_bd_ss(int ord,double * func, double * funcc,double * SoA){
 	sub_symmetry_bd_ss_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_ss_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_ss_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void sub_fderivs_shc_part1(double *fx,double *fy,double *fz){
@@ -247,13 +247,13 @@ inline void sub_fderivs_shc(int& sst,double * f,double * fh,double *fx,double *f
 	//cudaMemset(Msh_ gy,0,h_3D_SIZE[0] * sizeof(double));
 	//cudaMemset(Msh_ gz,0,h_3D_SIZE[0] * sizeof(double));
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(0,fh,h_3D_SIZE[2]);	
 	sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fx,fy,fz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(1,fx,h_3D_SIZE[0]);
 			//compare_result_gpu(2,fy,h_3D_SIZE[0]);
 			//compare_result_gpu(3,fz,h_3D_SIZE[0]);
@@ -451,17 +451,17 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,
 	//fderivs_sh
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(1,fh,h_3D_SIZE[2]);	
 	sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	//fdderivs_sh
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(21,fh,h_3D_SIZE[2]);
 	sub_fdderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gxx,Msh_ gxy,Msh_ gxz,Msh_ gyy,Msh_ gyz,Msh_ gzz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			/*compare_result_gpu(11,Msh_ gx,h_3D_SIZE[0]);
 			compare_result_gpu(12,Msh_ gy,h_3D_SIZE[0]);
 			compare_result_gpu(13,Msh_ gz,h_3D_SIZE[0]);
@@ -472,7 +472,7 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,
 			compare_result_gpu(5,Msh_ gyz,h_3D_SIZE[0]);
 			compare_result_gpu(6,Msh_ gzz,h_3D_SIZE[0]);*/
 	sub_fdderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fxx,fxy,fxz,fyy,fyz,fzz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			/*compare_result_gpu(1,fxx,h_3D_SIZE[0]);
 			compare_result_gpu(2,fxy,h_3D_SIZE[0]);
 			compare_result_gpu(3,fxz,h_3D_SIZE[0]);
@@ -496,9 +496,9 @@ __global__ void computeRicci_ss_part1(double * dst)
 inline void computeRicci_ss(int &sst,double * src,double* dst,double * SoA, Meta* meta)
 {
 	sub_fdderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void sub_lopsided_ss_part1(double * dst)
@@ -516,9 +516,9 @@ __global__ void sub_lopsided_ss_part1(double * dst)
 inline void sub_lopsided_ss(int& sst,double *src,double* dst,double *SoA)
 {
 		sub_fderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,SoA);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 		sub_lopsided_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 }
 __global__ void sub_kodis_sh_part1(double *f,double *fh,double *f_rhs)
@@ -590,11 +590,11 @@ inline void sub_kodis_ss(int &sst,double *f,double *fh,double *f_rhs,double *SoA
 	}
 			//compare_result_gpu(10,f,h_3D_SIZE[0]);
 	sub_symmetry_bd_ss(3,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(0,fh,h_3D_SIZE[3]);
 	sub_kodis_sh_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(1,f_rhs,h_3D_SIZE[0]);
 }
@@ -2287,13 +2287,13 @@ int gpu_rhs_ss(RHS_SS_PARA)
 #ifdef TIMING1
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"TIME USED"<<TimeBetween(tv1, tv2)<<endl; 
 #endif	
 	//cout<<"GPU meta data ready.\n";
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 //-------------get device info-------------------------------------
@@ -2306,7 +2306,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	//sub_enforce_ga(matrix_size);
 	//4.1-----compute rhs---------
 	compute_rhs_ss_part1<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
 	sub_fderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -2322,7 +2322,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	sub_fderivs_shc(sst,Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
 	compute_rhs_ss_part2<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
 	sub_fdderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -2332,7 +2332,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	sub_fderivs_shc( sst,Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
 	compute_rhs_ss_part3<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci_ss(sst,Mh_ dxx,Mh_ Rxx,sss, meta);
 	computeRicci_ss(sst,Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -2340,25 +2340,25 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	computeRicci_ss(sst,Mh_ gxy,Mh_ Rxy,aas, meta);
 	computeRicci_ss(sst,Mh_ gxz,Mh_ Rxz,asa, meta);
 	computeRicci_ss(sst,Mh_ gyz,Mh_ Ryz,saa, meta);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	compute_rhs_ss_part4<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs_shc(sst,Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
-	//cudaThreadSynchronize();
+	//cudaDeviceSynchronize();
 	//compare_result_gpu(0,Mh_ chi,h_3D_SIZE[0]);
 	//compare_result_gpu(1,Mh_ chi,h_3D_SIZE[0]);
 	//compare_result_gpu(2,Mh_ fyz,h_3D_SIZE[0]);
 	compute_rhs_ss_part5<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs_shc(sst,Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
 	compute_rhs_ss_part6<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
 	sub_fderivs_shc(sst,Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
@@ -2423,7 +2423,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	}
 	if(co == 0){
 		compute_rhs_ss_part7<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 		sub_fderivs_shc(sst,Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
 		sub_fderivs_shc(sst,Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -2432,7 +2432,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 		sub_fderivs_shc(sst,Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
 		sub_fderivs_shc(sst,Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
 		compute_rhs_ss_part8<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 	}
 #if (ABV == 1)
@@ -2512,7 +2512,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	//test kodis
 	//sub_kodis_sh(sst,Msh_ drhodx,Mh_ fh2,Msh_ drhody,sss);
 #ifdef TIMING
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl; 
 #endif
--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
@@ -61,9 +61,7 @@
  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: ham_Res, movx_Res, movy_Res, movz_Res
  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
 !  gont = 0: success; gont = 1: something wrong
-  integer::gont,i,j,k
+  integer::gont
  real*8 :: val1, val2
  real*8 :: det, t_gupxx, t_gupxy, t_gupxz, t_gupyy, t_gupyz, t_gupzz
 !~~~~~~> Other variables:
@@ -86,10 +84,7 @@
  real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz
  real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
-  real*8            :: PI
+  real*8            :: dX, dY, dZ, PI
 #if (DEBUG_NAN_CHECK)
  real*8            :: dX
 #endif
  real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
  real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
  real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
@@ -111,7 +106,6 @@
  call getpbh(BHN,Porg,Mass)
 #endif
 #if (DEBUG_NAN_CHECK)
 !!! sanity check
  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
@@ -142,10 +136,13 @@
     gont = 1
     return
  endif
 #endif
  PI = dacos(-ONE)
  dX = X(2) - X(1)
  dY = Y(2) - Y(1)
  dZ = Z(2) - Z(1)
  alpn1 = Lap + ONE
  chin1 = chi + ONE
  gxx = dxx + ONE
@@ -159,16 +156,16 @@
  div_beta = betaxx + betayy + betazz
  call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
  call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
  call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
  call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
  call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
  call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
  call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
  chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi
  call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
  call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
  call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
  call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
  call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
  call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
  gxx_rhs = - TWO * alpn1 * Axx    -  F2o3 * gxx * div_beta          + &
              TWO *(  gxx * betaxx +   gxy * betayx +   gxz * betazx)
@@ -193,99 +190,71 @@
                                       gyz * betayx +   gzz * betazx   &
                                                    -   gxz * betayy     !rhs for gij
-! fused loop for metric inversion and connections
+! invert tilted metric
-  !DIR$ SIMD
+  gupzz =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
-  do k=1,ex(3)
+           gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
-  do j=1,ex(2)
+  gupxx =   ( gyy * gzz - gyz * gyz ) / gupzz
-  do i=1,ex(1)
+  gupxy = - ( gxy * gzz - gyz * gxz ) / gupzz
-     ! 1. Metric Inversion
+  gupxz =   ( gxy * gyz - gyy * gxz ) / gupzz
-     det = ONE / ( &
+  gupyy =   ( gxx * gzz - gxz * gxz ) / gupzz
-            gxx(i,j,k) * gyy(i,j,k) * gzz(i,j,k) + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) + &
+  gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz
-            gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) - gxz(i,j,k) * gyy(i,j,k) * gxz(i,j,k) - &
+  gupzz =   ( gxx * gyy - gxy * gxy ) / gupzz
            gxy(i,j,k) * gxy(i,j,k) * gzz(i,j,k) - gxx(i,j,k) * gyz(i,j,k) * gyz(i,j,k) )
     t_gupxx =   ( gyy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gyz(i,j,k) ) * det
     t_gupxy = - ( gxy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gxz(i,j,k) ) * det
     t_gupxz =   ( gxy(i,j,k) * gyz(i,j,k) - gyy(i,j,k) * gxz(i,j,k) ) * det
     t_gupyy =   ( gxx(i,j,k) * gzz(i,j,k) - gxz(i,j,k) * gxz(i,j,k) ) * det
     t_gupyz = - ( gxx(i,j,k) * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) * det
     t_gupzz =   ( gxx(i,j,k) * gyy(i,j,k) - gxy(i,j,k) * gxy(i,j,k) ) * det
     gupxx(i,j,k) = t_gupxx
     gupxy(i,j,k) = t_gupxy
     gupxz(i,j,k) = t_gupxz
     gupyy(i,j,k) = t_gupyy
     gupyz(i,j,k) = t_gupyz
     gupzz(i,j,k) = t_gupzz
  if(co == 0)then
-        Gmx_Res(i,j,k) = Gamx(i,j,k) - (t_gupxx*(t_gupxx*gxxx(i,j,k)+t_gupxy*gxyx(i,j,k)+t_gupxz*gxzx(i,j,k))&
+! Gam^i_Res = Gam^i + gup^ij_,j
-                         +t_gupxy*(t_gupxx*gxyx(i,j,k)+t_gupxy*gyyx(i,j,k)+t_gupxz*gyzx(i,j,k))&
+  Gmx_Res = Gamx - (gupxx*(gupxx*gxxx+gupxy*gxyx+gupxz*gxzx)&
-                         +t_gupxz*(t_gupxx*gxzx(i,j,k)+t_gupxy*gyzx(i,j,k)+t_gupxz*gzzx(i,j,k))&
+                   +gupxy*(gupxx*gxyx+gupxy*gyyx+gupxz*gyzx)&
-                         +t_gupxx*(t_gupxy*gxxy(i,j,k)+t_gupyy*gxyy(i,j,k)+t_gupyz*gxzy(i,j,k))&
+                   +gupxz*(gupxx*gxzx+gupxy*gyzx+gupxz*gzzx)&
-                         +t_gupxy*(t_gupxy*gxyy(i,j,k)+t_gupyy*gyyy(i,j,k)+t_gupyz*gyzy(i,j,k))&
+                   +gupxx*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
-                         +t_gupxz*(t_gupxy*gxzy(i,j,k)+t_gupyy*gyzy(i,j,k)+t_gupyz*gzzy(i,j,k))&
+                   +gupxy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
-                         +t_gupxx*(t_gupxz*gxxz(i,j,k)+t_gupyz*gxyz(i,j,k)+t_gupzz*gxzz(i,j,k))&
+                   +gupxz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
-                         +t_gupxy*(t_gupxz*gxyz(i,j,k)+t_gupyz*gyyz(i,j,k)+t_gupzz*gyzz(i,j,k))&
+                   +gupxx*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
-                         +t_gupxz*(t_gupxz*gxzz(i,j,k)+t_gupyz*gyzz(i,j,k)+t_gupzz*gzzz(i,j,k)))
+                   +gupxy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
-        Gmy_Res(i,j,k) = Gamy(i,j,k) - (t_gupxx*(t_gupxy*gxxx(i,j,k)+t_gupyy*gxyx(i,j,k)+t_gupyz*gxzx(i,j,k))&
+                   +gupxz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
-                         +t_gupxy*(t_gupxy*gxyx(i,j,k)+t_gupyy*gyyx(i,j,k)+t_gupyz*gyzx(i,j,k))&
+  Gmy_Res = Gamy - (gupxx*(gupxy*gxxx+gupyy*gxyx+gupyz*gxzx)&
-                         +t_gupxz*(t_gupxy*gxzx(i,j,k)+t_gupyy*gyzx(i,j,k)+t_gupyz*gzzx(i,j,k))&
+                   +gupxy*(gupxy*gxyx+gupyy*gyyx+gupyz*gyzx)&
-                         +t_gupxy*(t_gupxy*gxxy(i,j,k)+t_gupyy*gxyy(i,j,k)+t_gupyz*gxzy(i,j,k))&
+                   +gupxz*(gupxy*gxzx+gupyy*gyzx+gupyz*gzzx)&
-                         +t_gupyy*(t_gupxy*gxyy(i,j,k)+t_gupyy*gyyy(i,j,k)+t_gupyz*gyzy(i,j,k))&
+                   +gupxy*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
-                         +t_gupyz*(t_gupxy*gxzy(i,j,k)+t_gupyy*gyzy(i,j,k)+t_gupyz*gzzy(i,j,k))&
+                   +gupyy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
-                         +t_gupxy*(t_gupxz*gxxz(i,j,k)+t_gupyz*gxyz(i,j,k)+t_gupzz*gxzz(i,j,k))&
+                   +gupyz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
-                         +t_gupyy*(t_gupxz*gxyz(i,j,k)+t_gupyz*gyyz(i,j,k)+t_gupzz*gyzz(i,j,k))&
+                   +gupxy*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
-                         +t_gupyz*(t_gupxz*gxzz(i,j,k)+t_gupyz*gyzz(i,j,k)+t_gupzz*gzzz(i,j,k)))
+                   +gupyy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
-        Gmz_Res(i,j,k) = Gamz(i,j,k) - (t_gupxx*(t_gupxz*gxxx(i,j,k)+t_gupyz*gxyx(i,j,k)+t_gupzz*gxzx(i,j,k))&
+                   +gupyz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
-                         +t_gupxy*(t_gupxz*gxyx(i,j,k)+t_gupyz*gyyx(i,j,k)+t_gupzz*gyzx(i,j,k))&
+  Gmz_Res = Gamz - (gupxx*(gupxz*gxxx+gupyz*gxyx+gupzz*gxzx)&
-                         +t_gupxz*(t_gupxz*gxzx(i,j,k)+t_gupyz*gyzx(i,j,k)+t_gupzz*gzzx(i,j,k))&
+                   +gupxy*(gupxz*gxyx+gupyz*gyyx+gupzz*gyzx)&
-                         +t_gupxy*(t_gupxz*gxxy(i,j,k)+t_gupyz*gxyy(i,j,k)+t_gupzz*gxzy(i,j,k))&
+                   +gupxz*(gupxz*gxzx+gupyz*gyzx+gupzz*gzzx)&
-                         +t_gupyy*(t_gupxz*gxyy(i,j,k)+t_gupyz*gyyy(i,j,k)+t_gupzz*gyzy(i,j,k))&
+                   +gupxy*(gupxz*gxxy+gupyz*gxyy+gupzz*gxzy)&
-                         +t_gupyz*(t_gupxz*gxzy(i,j,k)+t_gupyz*gyzy(i,j,k)+t_gupzz*gzzy(i,j,k))&
+                   +gupyy*(gupxz*gxyy+gupyz*gyyy+gupzz*gyzy)&
-                         +t_gupxz*(t_gupxz*gxxz(i,j,k)+t_gupyz*gxyz(i,j,k)+t_gupzz*gxzz(i,j,k))&
+                   +gupyz*(gupxz*gxzy+gupyz*gyzy+gupzz*gzzy)&
-                         +t_gupyz*(t_gupxz*gxyz(i,j,k)+t_gupyz*gyyz(i,j,k)+t_gupzz*gyzz(i,j,k))&
+                   +gupxz*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
-                         +t_gupzz*(t_gupxz*gxzz(i,j,k)+t_gupyz*gyzz(i,j,k)+t_gupzz*gzzz(i,j,k)))
+                   +gupyz*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
                   +gupzz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
  endif
-     ! 2. Christoffel Symbols
+! second kind of connection
-     val1 = TWO * gxyx(i,j,k) - gxxy(i,j,k)
+  Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz ))
-     val2 = TWO * gxzx(i,j,k) - gxxz(i,j,k)
+  Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz ))
-     Gamxxx(i,j,k) =HALF*( t_gupxx*gxxx(i,j,k) + t_gupxy*val1 + t_gupxz*val2 )
+  Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz ))
     Gamyxx(i,j,k) =HALF*( t_gupxy*gxxx(i,j,k) + t_gupyy*val1 + t_gupyz*val2 )
     Gamzxx(i,j,k) =HALF*( t_gupxz*gxxx(i,j,k) + t_gupyz*val1 + t_gupzz*val2 )
-     val1 = TWO * gxyy(i,j,k) - gyyx(i,j,k)
+  Gamxyy =HALF*( gupxx*(TWO*gxyy - gyyx ) + gupxy*gyyy + gupxz*(TWO*gyzy - gyyz ))
-     val2 = TWO * gyzy(i,j,k) - gyyz(i,j,k)
+  Gamyyy =HALF*( gupxy*(TWO*gxyy - gyyx ) + gupyy*gyyy + gupyz*(TWO*gyzy - gyyz ))
-     Gamxyy(i,j,k) =HALF*( t_gupxx*val1 + t_gupxy*gyyy(i,j,k) + t_gupxz*val2 )
+  Gamzyy =HALF*( gupxz*(TWO*gxyy - gyyx ) + gupyz*gyyy + gupzz*(TWO*gyzy - gyyz ))
     Gamyyy(i,j,k) =HALF*( t_gupxy*val1 + t_gupyy*gyyy(i,j,k) + t_gupyz*val2 )
     Gamzyy(i,j,k) =HALF*( t_gupxz*val1 + t_gupyz*gyyy(i,j,k) + t_gupzz*val2 )
-     val1 = TWO * gxzz(i,j,k) - gzzx(i,j,k)
+  Gamxzz =HALF*( gupxx*(TWO*gxzz - gzzx ) + gupxy*(TWO*gyzz - gzzy ) + gupxz*gzzz)
-     val2 = TWO * gyzz(i,j,k) - gzzy(i,j,k)
+  Gamyzz =HALF*( gupxy*(TWO*gxzz - gzzx ) + gupyy*(TWO*gyzz - gzzy ) + gupyz*gzzz)
-     Gamxzz(i,j,k) =HALF*( t_gupxx*val1 + t_gupxy*val2 + t_gupxz*gzzz(i,j,k) )
+  Gamzzz =HALF*( gupxz*(TWO*gxzz - gzzx ) + gupyz*(TWO*gyzz - gzzy ) + gupzz*gzzz)
     Gamyzz(i,j,k) =HALF*( t_gupxy*val1 + t_gupyy*val2 + t_gupyz*gzzz(i,j,k) )
     Gamzzz(i,j,k) =HALF*( t_gupxz*val1 + t_gupyz*val2 + t_gupzz*gzzz(i,j,k) )
-     val1 = gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)
+  Gamxxy =HALF*( gupxx*gxxy + gupxy*gyyx + gupxz*( gxzy + gyzx - gxyz ) )
-     Gamxxy(i,j,k) =HALF*( t_gupxx*gxxy(i,j,k) + t_gupxy*gyyx(i,j,k) + t_gupxz*val1 )
+  Gamyxy =HALF*( gupxy*gxxy + gupyy*gyyx + gupyz*( gxzy + gyzx - gxyz ) )
-     Gamyxy(i,j,k) =HALF*( t_gupxy*gxxy(i,j,k) + t_gupyy*gyyx(i,j,k) + t_gupyz*val1 )
+  Gamzxy =HALF*( gupxz*gxxy + gupyz*gyyx + gupzz*( gxzy + gyzx - gxyz ) )
     Gamzxy(i,j,k) =HALF*( t_gupxz*gxxy(i,j,k) + t_gupyz*gyyx(i,j,k) + t_gupzz*val1 )
     val1 = gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)
     Gamxxz(i,j,k) =HALF*( t_gupxx*gxxz(i,j,k) + t_gupxy*val1 + t_gupxz*gzzx(i,j,k) )
     Gamyxz(i,j,k) =HALF*( t_gupxy*gxxz(i,j,k) + t_gupyy*val1 + t_gupyz*gzzx(i,j,k) )
     Gamzxz(i,j,k) =HALF*( t_gupxz*gxxz(i,j,k) + t_gupyz*val1 + t_gupzz*gzzx(i,j,k) )
     val1 = gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)
     Gamxyz(i,j,k) =HALF*( t_gupxx*val1 + t_gupxy*gyyz(i,j,k) + t_gupxz*gzzy(i,j,k) )
     Gamyyz(i,j,k) =HALF*( t_gupxy*val1 + t_gupyy*gyyz(i,j,k) + t_gupyz*gzzy(i,j,k) )
     Gamzyz(i,j,k) =HALF*( t_gupxz*val1 + t_gupyz*gyyz(i,j,k) + t_gupzz*gzzy(i,j,k) )
  enddo
  enddo
  enddo
  Gamxxz =HALF*( gupxx*gxxz + gupxy*( gxyz + gyzx - gxzy ) + gupxz*gzzx )
  Gamyxz =HALF*( gupxy*gxxz + gupyy*( gxyz + gyzx - gxzy ) + gupyz*gzzx )
  Gamzxz =HALF*( gupxz*gxxz + gupyz*( gxyz + gyzx - gxzy ) + gupzz*gzzx )
  Gamxyz =HALF*( gupxx*( gxyz + gxzy - gyzx ) + gupxy*gyyz + gupxz*gzzy )
  Gamyyz =HALF*( gupxy*( gxyz + gxzy - gyzx ) + gupyy*gyyz + gupyz*gzzy )
  Gamzyz =HALF*( gupxz*( gxyz + gxzy - gyzx ) + gupyz*gyyz + gupzz*gzzy )
 ! Raise indices of \tilde A_{ij} and store in R_ij
  Rxx =    gupxx * gupxx * Axx + gupxy * gupxy * Ayy + gupxz * gupxz * Azz + &
@@ -316,40 +285,30 @@
  call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
  ! reuse fxx/fxy/fxz as temporaries for matter-source combinations
  fxx = F2o3 * Kx + EIGHT * PI * Sx
  fxy = F2o3 * Ky + EIGHT * PI * Sy
  fxz = F2o3 * Kz + EIGHT * PI * Sz
  ! reuse Gamxa/Gamya/Gamza as temporaries for chix*R combinations
  Gamxa = chix * Rxx + chiy * Rxy + chiz * Rxz
  Gamya = chix * Rxy + chiy * Ryy + chiz * Ryz
  Gamza = chix * Rxz + chiy * Ryz + chiz * Rzz
   Gamx_rhs = - TWO * (   Lapx * Rxx +   Lapy * Rxy +   Lapz * Rxz ) + &
        TWO * alpn1 * (                                                &
-        -F3o2 * ONE/chin1 * Gamxa - &
+        -F3o2/chin1 * (   chix * Rxx +   chiy * Rxy +   chiz * Rxz ) - &
-              gupxx * fxx - &
+              gupxx * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
-              gupxy * fxy - &
+              gupxy * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
-              gupxz * fxz + &
+              gupxz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
                        Gamxxx * Rxx + Gamxyy * Ryy + Gamxzz * Rzz   + &
                TWO * ( Gamxxy * Rxy + Gamxxz * Rxz + Gamxyz * Ryz ) )
   Gamy_rhs = - TWO * (   Lapx * Rxy +   Lapy * Ryy +   Lapz * Ryz ) + &
        TWO * alpn1 * (                                                &
-        -F3o2 * ONE/chin1 * Gamya - &
+        -F3o2/chin1 * (   chix * Rxy +  chiy * Ryy +    chiz * Ryz ) - &
-              gupxy * fxx - &
+              gupxy * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
-              gupyy * fxy - &
+              gupyy * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
-              gupyz * fxz + &
+              gupyz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
                        Gamyxx * Rxx + Gamyyy * Ryy + Gamyzz * Rzz   + &
                TWO * ( Gamyxy * Rxy + Gamyxz * Rxz + Gamyyz * Ryz ) )
   Gamz_rhs = - TWO * (   Lapx * Rxz +   Lapy * Ryz +   Lapz * Rzz ) + &
        TWO * alpn1 * (                                                &
-        -F3o2 * ONE/chin1 * Gamza - &
+        -F3o2/chin1 * (   chix * Rxz +  chiy * Ryz +    chiz * Rzz ) - &
-              gupxz * fxx - &
+              gupxz * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
-              gupyz * fxy - &
+              gupyz * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
-              gupzz * fxz + &
+              gupzz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
                        Gamzxx * Rxx + Gamzyy * Ryy + Gamzzz * Rzz   + &
                TWO * ( Gamzxy * Rxy + Gamzxz * Rxz + Gamzyz * Ryz ) )
@@ -651,47 +610,47 @@
  fzz = fzz - Gamxzz * chix - Gamyzz * chiy - Gamzzz * chiz
 ! Store D^l D_l chi - 3/(2*chi) D^l chi D_l chi in f
-  f =        gupxx * ( fxx - F3o2 * ONE/chin1 * chix * chix ) + &
+  f =        gupxx * ( fxx - F3o2/chin1 * chix * chix ) + &
-             gupyy * ( fyy - F3o2 * ONE/chin1 * chiy * chiy ) + &
+             gupyy * ( fyy - F3o2/chin1 * chiy * chiy ) + &
-             gupzz * ( fzz - F3o2 * ONE/chin1 * chiz * chiz ) + &
+             gupzz * ( fzz - F3o2/chin1 * chiz * chiz ) + &
-       TWO * gupxy * ( fxy - F3o2 * ONE/chin1 * chix * chiy ) + &
+       TWO * gupxy * ( fxy - F3o2/chin1 * chix * chiy ) + &
-       TWO * gupxz * ( fxz - F3o2 * ONE/chin1 * chix * chiz ) + &
+       TWO * gupxz * ( fxz - F3o2/chin1 * chix * chiz ) + &
-       TWO * gupyz * ( fyz - F3o2 * ONE/chin1 * chiy * chiz ) 
+       TWO * gupyz * ( fyz - F3o2/chin1 * chiy * chiz ) 
 ! Add chi part to Ricci tensor:
-  Rxx = Rxx + (fxx - chix*chix*ONE/chin1*HALF + gxx * f) * ONE/chin1 * HALF
+  Rxx = Rxx + (fxx - chix*chix/chin1/TWO + gxx * f)/chin1/TWO
-  Ryy = Ryy + (fyy - chiy*chiy*ONE/chin1*HALF + gyy * f) * ONE/chin1 * HALF
+  Ryy = Ryy + (fyy - chiy*chiy/chin1/TWO + gyy * f)/chin1/TWO
-  Rzz = Rzz + (fzz - chiz*chiz*ONE/chin1*HALF + gzz * f) * ONE/chin1 * HALF
+  Rzz = Rzz + (fzz - chiz*chiz/chin1/TWO + gzz * f)/chin1/TWO
-  Rxy = Rxy + (fxy - chix*chiy*ONE/chin1*HALF + gxy * f) * ONE/chin1 * HALF
+  Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO
-  Rxz = Rxz + (fxz - chix*chiz*ONE/chin1*HALF + gxz * f) * ONE/chin1 * HALF
+  Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO
-  Ryz = Ryz + (fyz - chiy*chiz*ONE/chin1*HALF + gyz * f) * ONE/chin1 * HALF
+  Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO
 ! covariant second derivatives of the lapse respect to physical metric
  call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
                SYM,SYM,SYM,symmetry,Lev)
-  gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz) * ONE/chin1
+  gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
-  gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz) * ONE/chin1
+  gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
-  gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz) * ONE/chin1
+  gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1
 ! now get physical second kind of connection
-  Gamxxx = Gamxxx - ( TWO * chix * ONE/chin1 - gxx * gxxx )*HALF
+  Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF
  Gamyxx = Gamyxx - (                     - gxx * gxxy )*HALF
  Gamzxx = Gamzxx - (                     - gxx * gxxz )*HALF
  Gamxyy = Gamxyy - (                     - gyy * gxxx )*HALF
-  Gamyyy = Gamyyy - ( TWO * chiy * ONE/chin1 - gyy * gxxy )*HALF
+  Gamyyy = Gamyyy - ( (chiy + chiy)/chin1 - gyy * gxxy )*HALF
  Gamzyy = Gamzyy - (                     - gyy * gxxz )*HALF
  Gamxzz = Gamxzz - (                     - gzz * gxxx )*HALF
  Gamyzz = Gamyzz - (                     - gzz * gxxy )*HALF
-  Gamzzz = Gamzzz - ( TWO * chiz * ONE/chin1 - gzz * gxxz )*HALF
+  Gamzzz = Gamzzz - ( (chiz + chiz)/chin1 - gzz * gxxz )*HALF
-  Gamxxy = Gamxxy - (  chiy * ONE/chin1 - gxy * gxxx )*HALF
+  Gamxxy = Gamxxy - (  chiy        /chin1 - gxy * gxxx )*HALF
-  Gamyxy = Gamyxy - (  chix * ONE/chin1 - gxy * gxxy )*HALF
+  Gamyxy = Gamyxy - (         chix /chin1 - gxy * gxxy )*HALF
  Gamzxy = Gamzxy - (                     - gxy * gxxz )*HALF
-  Gamxxz = Gamxxz - (  chiz * ONE/chin1 - gxz * gxxx )*HALF
+  Gamxxz = Gamxxz - (  chiz        /chin1 - gxz * gxxx )*HALF
  Gamyxz = Gamyxz - (                     - gxz * gxxy )*HALF
-  Gamzxz = Gamzxz - (  chix * ONE/chin1 - gxz * gxxz )*HALF
+  Gamzxz = Gamzxz - (         chix /chin1 - gxz * gxxz )*HALF
  Gamxyz = Gamxyz - (                     - gyz * gxxx )*HALF
-  Gamyyz = Gamyyz - (  chiz * ONE/chin1 - gyz * gxxy )*HALF
+  Gamyyz = Gamyyz - (  chiz        /chin1 - gyz * gxxy )*HALF
-  Gamzyz = Gamzyz - (  chiy * ONE/chin1 - gyz * gxxz )*HALF
+  Gamzyz = Gamzyz - (         chiy /chin1 - gyz * gxxz )*HALF
  fxx = fxx - Gamxxx*Lapx - Gamyxx*Lapy - Gamzxx*Lapz
  fyy = fyy - Gamxyy*Lapx - Gamyyy*Lapy - Gamzyy*Lapz
@@ -734,7 +693,7 @@
       gupxz * (Axy * Azz + Ayz * Axz) + &
       gupyz * (Ayy * Azz + Ayz * Ayz) ) )) -1.6d1*PI*rho + EIGHT * PI * S
  f = - F1o3 *(  gupxx * fxx + gupyy * fyy + gupzz * fzz + &
-        TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1 * ONE/chin1 * f)
+        TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1/chin1*f)
  fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
  fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
@@ -854,8 +813,7 @@
  call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
       TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
-  fxx = dsqrt(chin1)
+  reta = 1.31d0/2*dsqrt(reta/chin1)/(1-dsqrt(chin1))**2
  reta = 1.31d0/2*dsqrt(reta*ONE/chin1)/(ONE-fxx)**2
  dtSfx_rhs = Gamx_rhs - reta*dtSfx
  dtSfy_rhs = Gamy_rhs - reta*dtSfy
  dtSfz_rhs = Gamz_rhs - reta*dtSfz
@@ -867,7 +825,7 @@
  call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
       TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
-  reta = 1.31d0/2*dsqrt(reta*ONE/chin1)/(ONE-chin1)**2
+  reta = 1.31d0/2*dsqrt(reta/chin1)/(1-chin1)**2
  dtSfx_rhs = Gamx_rhs - reta*dtSfx
  dtSfy_rhs = Gamy_rhs - reta*dtSfy
  dtSfz_rhs = Gamz_rhs - reta*dtSfz
@@ -875,8 +833,7 @@
  call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
       TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
-  fxx = dsqrt(chin1)
+  reta = 1.31d0/2*dsqrt(reta/chin1)/(1-dsqrt(chin1))**2
  reta = 1.31d0/2*dsqrt(reta*ONE/chin1)/(ONE-fxx)**2
  betax_rhs = FF*Gamx - reta*betax
  betay_rhs = FF*Gamy - reta*betay
  betaz_rhs = FF*Gamz - reta*betaz
@@ -888,7 +845,7 @@
  call fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
  reta = gupxx * dtSfx_rhs * dtSfx_rhs + gupyy * dtSfy_rhs * dtSfy_rhs + gupzz * dtSfz_rhs * dtSfz_rhs + &
       TWO * (gupxy * dtSfx_rhs * dtSfy_rhs + gupxz * dtSfx_rhs * dtSfz_rhs + gupyz * dtSfy_rhs * dtSfz_rhs)
-  reta = 1.31d0/2*dsqrt(reta*ONE/chin1)/(ONE-chin1)**2
+  reta = 1.31d0/2*dsqrt(reta/chin1)/(1-chin1)**2
  betax_rhs = FF*Gamx - reta*betax
  betay_rhs = FF*Gamy - reta*betay
  betaz_rhs = FF*Gamz - reta*betaz
@@ -1120,48 +1077,48 @@ endif
 ! mov_Res_j = gupkj*(-1/chi d_k chi*A_ij + D_k A_ij) - 2/3 d_j trK - 8 PI s_j where D respect to physical metric
 ! store D_i A_jk - 1/chi d_i chi*A_jk in gjk_i
  call fderivs(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
  call fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
  call fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
  call fderivs(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0)
  call fderivs(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0)
  call fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
  call fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0)
  call fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0)
  gxxx = gxxx - (  Gamxxx * Axx + Gamyxx * Axy + Gamzxx * Axz &
-                 + Gamxxx * Axx + Gamyxx * Axy + Gamzxx * Axz) - chix*Axx*ONE/chin1
+                 + Gamxxx * Axx + Gamyxx * Axy + Gamzxx * Axz) - chix*Axx/chin1
  gxyx = gxyx - (  Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz &
-                 + Gamxxx * Axy + Gamyxx * Ayy + Gamzxx * Ayz) - chix*Axy*ONE/chin1
+                 + Gamxxx * Axy + Gamyxx * Ayy + Gamzxx * Ayz) - chix*Axy/chin1
  gxzx = gxzx - (  Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz &
-                 + Gamxxx * Axz + Gamyxx * Ayz + Gamzxx * Azz) - chix*Axz*ONE/chin1
+                 + Gamxxx * Axz + Gamyxx * Ayz + Gamzxx * Azz) - chix*Axz/chin1
  gyyx = gyyx - (  Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz &
-                 + Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz) - chix*Ayy*ONE/chin1
+                 + Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz) - chix*Ayy/chin1
  gyzx = gyzx - (  Gamxxz * Axy + Gamyxz * Ayy + Gamzxz * Ayz &
-                 + Gamxxy * Axz + Gamyxy * Ayz + Gamzxy * Azz) - chix*Ayz*ONE/chin1
+                 + Gamxxy * Axz + Gamyxy * Ayz + Gamzxy * Azz) - chix*Ayz/chin1
  gzzx = gzzx - (  Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz &
-                 + Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz) - chix*Azz*ONE/chin1
+                 + Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz) - chix*Azz/chin1
  gxxy = gxxy - (  Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz &
-                 + Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz) - chiy*Axx*ONE/chin1
+                 + Gamxxy * Axx + Gamyxy * Axy + Gamzxy * Axz) - chiy*Axx/chin1
  gxyy = gxyy - (  Gamxyy * Axx + Gamyyy * Axy + Gamzyy * Axz &
-                 + Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz) - chiy*Axy*ONE/chin1
+                 + Gamxxy * Axy + Gamyxy * Ayy + Gamzxy * Ayz) - chiy*Axy/chin1
  gxzy = gxzy - (  Gamxyz * Axx + Gamyyz * Axy + Gamzyz * Axz &
-                 + Gamxxy * Axz + Gamyxy * Ayz + Gamzxy * Azz) - chiy*Axz*ONE/chin1
+                 + Gamxxy * Axz + Gamyxy * Ayz + Gamzxy * Azz) - chiy*Axz/chin1
  gyyy = gyyy - (  Gamxyy * Axy + Gamyyy * Ayy + Gamzyy * Ayz &
-                 + Gamxyy * Axy + Gamyyy * Ayy + Gamzyy * Ayz) - chiy*Ayy*ONE/chin1
+                 + Gamxyy * Axy + Gamyyy * Ayy + Gamzyy * Ayz) - chiy*Ayy/chin1
  gyzy = gyzy - (  Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz &
-                 + Gamxyy * Axz + Gamyyy * Ayz + Gamzyy * Azz) - chiy*Ayz*ONE/chin1
+                 + Gamxyy * Axz + Gamyyy * Ayz + Gamzyy * Azz) - chiy*Ayz/chin1
  gzzy = gzzy - (  Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz &
-                 + Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz) - chiy*Azz*ONE/chin1
+                 + Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz) - chiy*Azz/chin1
  gxxz = gxxz - (  Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz &
-                 + Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz) - chiz*Axx*ONE/chin1
+                 + Gamxxz * Axx + Gamyxz * Axy + Gamzxz * Axz) - chiz*Axx/chin1
  gxyz = gxyz - (  Gamxyz * Axx + Gamyyz * Axy + Gamzyz * Axz &
-                 + Gamxxz * Axy + Gamyxz * Ayy + Gamzxz * Ayz) - chiz*Axy*ONE/chin1
+                 + Gamxxz * Axy + Gamyxz * Ayy + Gamzxz * Ayz) - chiz*Axy/chin1
  gxzz = gxzz - (  Gamxzz * Axx + Gamyzz * Axy + Gamzzz * Axz &
-                 + Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz) - chiz*Axz*ONE/chin1
+                 + Gamxxz * Axz + Gamyxz * Ayz + Gamzxz * Azz) - chiz*Axz/chin1
  gyyz = gyyz - (  Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz &
-                 + Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz) - chiz*Ayy*ONE/chin1
+                 + Gamxyz * Axy + Gamyyz * Ayy + Gamzyz * Ayz) - chiz*Ayy/chin1
  gyzz = gyzz - (  Gamxzz * Axy + Gamyzz * Ayy + Gamzzz * Ayz &
-                 + Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz) - chiz*Ayz*ONE/chin1
+                 + Gamxyz * Axz + Gamyyz * Ayz + Gamzyz * Azz) - chiz*Ayz/chin1
  gzzz = gzzz - (  Gamxzz * Axz + Gamyzz * Ayz + Gamzzz * Azz &
-                 + Gamxzz * Axz + Gamyzz * Ayz + Gamzzz * Azz) - chiz*Azz*ONE/chin1
+                 + Gamxzz * Axz + Gamyzz * Ayz + Gamzzz * Azz) - chiz*Azz/chin1
 movx_Res = gupxx*gxxx + gupyy*gxyy + gupzz*gxzz &
          +gupxy*gxyx + gupxz*gxzx + gupyz*gxzy &
          +gupxy*gxxy + gupxz*gxxz + gupyz*gxyz
--- a/AMSS_NCKU_source/bssn_step_gpu.C
+++ b/AMSS_NCKU_source/bssn_step_gpu.C
@@ -1676,8 +1676,11 @@ void bssn_class::Step_GPU(int lev, int YN)
 #endif // PSTR == ?
 //--------------------------With Shell--------------------------
 // Note: SHStep() implementation is in bssn_gpu_class.C
 #ifdef WithShell
 #if 0
 // This SHStep() implementation has been moved to bssn_gpu_class.C to avoid duplicate definition
 void bssn_class::SHStep()
 {
  int lev = 0;
@@ -1938,5 +1941,5 @@ void bssn_class::SHStep()
    sPp = sPp->next;
  }
 }
-d
+#endif // #if 0
 #endif // withshell
--- a/AMSS_NCKU_source/diff_new.f90
+++ b/AMSS_NCKU_source/diff_new.f90
@@ -1939,309 +1939,6 @@
  return
  end subroutine fddyz
  subroutine fderivs_batch4(ex,f1,f2,f3,f4, &
                            f1x,f1y,f1z,f2x,f2y,f2z,f3x,f3y,f3z,f4x,f4y,f4z, &
                            X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
  implicit none
  integer,                               intent(in ):: ex(1:3),symmetry,onoff
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(in ):: f1,f2,f3,f4
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f1x,f1y,f1z
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f2x,f2y,f2z
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f3x,f3y,f3z
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f4x,f4y,f4z
  real*8,                                intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
  real*8,                                intent(in ):: SYM1,SYM2,SYM3
 !~~~~~~ other variables
  real*8 :: dX,dY,dZ
  real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh1,fh2,fh3,fh4
  real*8, dimension(3) :: SoA
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0
  real*8,  parameter :: TWO=2.d0,EIT=8.d0
  real*8,  parameter :: F12=1.2d1
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
  imin = 1
  jmin = 1
  kmin = 1
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
  SoA(1) = SYM1
  SoA(2) = SYM2
  SoA(3) = SYM3
  call symmetry_bd(2,ex,f1,fh1,SoA)
  call symmetry_bd(2,ex,f2,fh2,SoA)
  call symmetry_bd(2,ex,f3,fh3,SoA)
  call symmetry_bd(2,ex,f4,fh4,SoA)
  d12dx = ONE/F12/dX
  d12dy = ONE/F12/dY
  d12dz = ONE/F12/dZ
  d2dx = ONE/TWO/dX
  d2dy = ONE/TWO/dY
  d2dz = ONE/TWO/dZ
  f1x = ZEO; f1y = ZEO; f1z = ZEO
  f2x = ZEO; f2y = ZEO; f2z = ZEO
  f3x = ZEO; f3y = ZEO; f3z = ZEO
  f4x = ZEO; f4y = ZEO; f4z = ZEO
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
   if(i+2 <= imax .and. i-2 >= imin .and. &
      j+2 <= jmax .and. j-2 >= jmin .and. &
      k+2 <= kmax .and. k-2 >= kmin) then
      f1x(i,j,k)=d12dx*(fh1(i-2,j,k)-EIT*fh1(i-1,j,k)+EIT*fh1(i+1,j,k)-fh1(i+2,j,k))
      f1y(i,j,k)=d12dy*(fh1(i,j-2,k)-EIT*fh1(i,j-1,k)+EIT*fh1(i,j+1,k)-fh1(i,j+2,k))
      f1z(i,j,k)=d12dz*(fh1(i,j,k-2)-EIT*fh1(i,j,k-1)+EIT*fh1(i,j,k+1)-fh1(i,j,k+2))
      f2x(i,j,k)=d12dx*(fh2(i-2,j,k)-EIT*fh2(i-1,j,k)+EIT*fh2(i+1,j,k)-fh2(i+2,j,k))
      f2y(i,j,k)=d12dy*(fh2(i,j-2,k)-EIT*fh2(i,j-1,k)+EIT*fh2(i,j+1,k)-fh2(i,j+2,k))
      f2z(i,j,k)=d12dz*(fh2(i,j,k-2)-EIT*fh2(i,j,k-1)+EIT*fh2(i,j,k+1)-fh2(i,j,k+2))
      f3x(i,j,k)=d12dx*(fh3(i-2,j,k)-EIT*fh3(i-1,j,k)+EIT*fh3(i+1,j,k)-fh3(i+2,j,k))
      f3y(i,j,k)=d12dy*(fh3(i,j-2,k)-EIT*fh3(i,j-1,k)+EIT*fh3(i,j+1,k)-fh3(i,j+2,k))
      f3z(i,j,k)=d12dz*(fh3(i,j,k-2)-EIT*fh3(i,j,k-1)+EIT*fh3(i,j,k+1)-fh3(i,j,k+2))
      f4x(i,j,k)=d12dx*(fh4(i-2,j,k)-EIT*fh4(i-1,j,k)+EIT*fh4(i+1,j,k)-fh4(i+2,j,k))
      f4y(i,j,k)=d12dy*(fh4(i,j-2,k)-EIT*fh4(i,j-1,k)+EIT*fh4(i,j+1,k)-fh4(i,j+2,k))
      f4z(i,j,k)=d12dz*(fh4(i,j,k-2)-EIT*fh4(i,j,k-1)+EIT*fh4(i,j,k+1)-fh4(i,j,k+2))
   elseif(i+1 <= imax .and. i-1 >= imin .and. &
          j+1 <= jmax .and. j-1 >= jmin .and. &
          k+1 <= kmax .and. k-1 >= kmin) then
      f1x(i,j,k)=d2dx*(-fh1(i-1,j,k)+fh1(i+1,j,k))
      f1y(i,j,k)=d2dy*(-fh1(i,j-1,k)+fh1(i,j+1,k))
      f1z(i,j,k)=d2dz*(-fh1(i,j,k-1)+fh1(i,j,k+1))
      f2x(i,j,k)=d2dx*(-fh2(i-1,j,k)+fh2(i+1,j,k))
      f2y(i,j,k)=d2dy*(-fh2(i,j-1,k)+fh2(i,j+1,k))
      f2z(i,j,k)=d2dz*(-fh2(i,j,k-1)+fh2(i,j,k+1))
      f3x(i,j,k)=d2dx*(-fh3(i-1,j,k)+fh3(i+1,j,k))
      f3y(i,j,k)=d2dy*(-fh3(i,j-1,k)+fh3(i,j+1,k))
      f3z(i,j,k)=d2dz*(-fh3(i,j,k-1)+fh3(i,j,k+1))
      f4x(i,j,k)=d2dx*(-fh4(i-1,j,k)+fh4(i+1,j,k))
      f4y(i,j,k)=d2dy*(-fh4(i,j-1,k)+fh4(i,j+1,k))
      f4z(i,j,k)=d2dz*(-fh4(i,j,k-1)+fh4(i,j,k+1))
   endif
  enddo
  enddo
  enddo
  return
  end subroutine fderivs_batch4
 !-----------------------------------------------------------------------------
 ! batch first derivatives (3 fields), same symmetry setup
 !-----------------------------------------------------------------------------
  subroutine fderivs_batch3(ex,f1,f2,f3, &
                            f1x,f1y,f1z,f2x,f2y,f2z,f3x,f3y,f3z, &
                            X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
  implicit none
  integer,                               intent(in ):: ex(1:3),symmetry,onoff
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(in ):: f1,f2,f3
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f1x,f1y,f1z
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f2x,f2y,f2z
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f3x,f3y,f3z
  real*8,                                intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
  real*8,                                intent(in ):: SYM1,SYM2,SYM3
 !~~~~~~ other variables
  real*8 :: dX,dY,dZ
  real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh1,fh2,fh3
  real*8, dimension(3) :: SoA
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0
  real*8,  parameter :: TWO=2.d0,EIT=8.d0
  real*8,  parameter :: F12=1.2d1
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
  imin = 1
  jmin = 1
  kmin = 1
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
  SoA(1) = SYM1
  SoA(2) = SYM2
  SoA(3) = SYM3
  call symmetry_bd(2,ex,f1,fh1,SoA)
  call symmetry_bd(2,ex,f2,fh2,SoA)
  call symmetry_bd(2,ex,f3,fh3,SoA)
  d12dx = ONE/F12/dX
  d12dy = ONE/F12/dY
  d12dz = ONE/F12/dZ
  d2dx = ONE/TWO/dX
  d2dy = ONE/TWO/dY
  d2dz = ONE/TWO/dZ
  f1x = ZEO; f1y = ZEO; f1z = ZEO
  f2x = ZEO; f2y = ZEO; f2z = ZEO
  f3x = ZEO; f3y = ZEO; f3z = ZEO
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
   if(i+2 <= imax .and. i-2 >= imin .and. &
      j+2 <= jmax .and. j-2 >= jmin .and. &
      k+2 <= kmax .and. k-2 >= kmin) then
      f1x(i,j,k)=d12dx*(fh1(i-2,j,k)-EIT*fh1(i-1,j,k)+EIT*fh1(i+1,j,k)-fh1(i+2,j,k))
      f1y(i,j,k)=d12dy*(fh1(i,j-2,k)-EIT*fh1(i,j-1,k)+EIT*fh1(i,j+1,k)-fh1(i,j+2,k))
      f1z(i,j,k)=d12dz*(fh1(i,j,k-2)-EIT*fh1(i,j,k-1)+EIT*fh1(i,j,k+1)-fh1(i,j,k+2))
      f2x(i,j,k)=d12dx*(fh2(i-2,j,k)-EIT*fh2(i-1,j,k)+EIT*fh2(i+1,j,k)-fh2(i+2,j,k))
      f2y(i,j,k)=d12dy*(fh2(i,j-2,k)-EIT*fh2(i,j-1,k)+EIT*fh2(i,j+1,k)-fh2(i,j+2,k))
      f2z(i,j,k)=d12dz*(fh2(i,j,k-2)-EIT*fh2(i,j,k-1)+EIT*fh2(i,j,k+1)-fh2(i,j,k+2))
      f3x(i,j,k)=d12dx*(fh3(i-2,j,k)-EIT*fh3(i-1,j,k)+EIT*fh3(i+1,j,k)-fh3(i+2,j,k))
      f3y(i,j,k)=d12dy*(fh3(i,j-2,k)-EIT*fh3(i,j-1,k)+EIT*fh3(i,j+1,k)-fh3(i,j+2,k))
      f3z(i,j,k)=d12dz*(fh3(i,j,k-2)-EIT*fh3(i,j,k-1)+EIT*fh3(i,j,k+1)-fh3(i,j,k+2))
   elseif(i+1 <= imax .and. i-1 >= imin .and. &
          j+1 <= jmax .and. j-1 >= jmin .and. &
          k+1 <= kmax .and. k-1 >= kmin) then
      f1x(i,j,k)=d2dx*(-fh1(i-1,j,k)+fh1(i+1,j,k))
      f1y(i,j,k)=d2dy*(-fh1(i,j-1,k)+fh1(i,j+1,k))
      f1z(i,j,k)=d2dz*(-fh1(i,j,k-1)+fh1(i,j,k+1))
      f2x(i,j,k)=d2dx*(-fh2(i-1,j,k)+fh2(i+1,j,k))
      f2y(i,j,k)=d2dy*(-fh2(i,j-1,k)+fh2(i,j+1,k))
      f2z(i,j,k)=d2dz*(-fh2(i,j,k-1)+fh2(i,j,k+1))
      f3x(i,j,k)=d2dx*(-fh3(i-1,j,k)+fh3(i+1,j,k))
      f3y(i,j,k)=d2dy*(-fh3(i,j-1,k)+fh3(i,j+1,k))
      f3z(i,j,k)=d2dz*(-fh3(i,j,k-1)+fh3(i,j,k+1))
   endif
  enddo
  enddo
  enddo
  return
  end subroutine fderivs_batch3
 !-----------------------------------------------------------------------------
 ! batch first derivatives (2 fields), same symmetry setup
 !-----------------------------------------------------------------------------
  subroutine fderivs_batch2(ex,f1,f2, &
                            f1x,f1y,f1z,f2x,f2y,f2z, &
                            X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
  implicit none
  integer,                               intent(in ):: ex(1:3),symmetry,onoff
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(in ):: f1,f2
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f1x,f1y,f1z
  real*8,  dimension(ex(1),ex(2),ex(3)), intent(out):: f2x,f2y,f2z
  real*8,                                intent(in) :: X(ex(1)),Y(ex(2)),Z(ex(3))
  real*8,                                intent(in ):: SYM1,SYM2,SYM3
 !~~~~~~ other variables
  real*8 :: dX,dY,dZ
  real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh1,fh2
  real*8, dimension(3) :: SoA
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0
  real*8,  parameter :: TWO=2.d0,EIT=8.d0
  real*8,  parameter :: F12=1.2d1
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
  imin = 1
  jmin = 1
  kmin = 1
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -1
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -1
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -1
  SoA(1) = SYM1
  SoA(2) = SYM2
  SoA(3) = SYM3
  call symmetry_bd(2,ex,f1,fh1,SoA)
  call symmetry_bd(2,ex,f2,fh2,SoA)
  d12dx = ONE/F12/dX
  d12dy = ONE/F12/dY
  d12dz = ONE/F12/dZ
  d2dx = ONE/TWO/dX
  d2dy = ONE/TWO/dY
  d2dz = ONE/TWO/dZ
  f1x = ZEO; f1y = ZEO; f1z = ZEO
  f2x = ZEO; f2y = ZEO; f2z = ZEO
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
   if(i+2 <= imax .and. i-2 >= imin .and. &
      j+2 <= jmax .and. j-2 >= jmin .and. &
      k+2 <= kmax .and. k-2 >= kmin) then
      f1x(i,j,k)=d12dx*(fh1(i-2,j,k)-EIT*fh1(i-1,j,k)+EIT*fh1(i+1,j,k)-fh1(i+2,j,k))
      f1y(i,j,k)=d12dy*(fh1(i,j-2,k)-EIT*fh1(i,j-1,k)+EIT*fh1(i,j+1,k)-fh1(i,j+2,k))
      f1z(i,j,k)=d12dz*(fh1(i,j,k-2)-EIT*fh1(i,j,k-1)+EIT*fh1(i,j,k+1)-fh1(i,j,k+2))
      f2x(i,j,k)=d12dx*(fh2(i-2,j,k)-EIT*fh2(i-1,j,k)+EIT*fh2(i+1,j,k)-fh2(i+2,j,k))
      f2y(i,j,k)=d12dy*(fh2(i,j-2,k)-EIT*fh2(i,j-1,k)+EIT*fh2(i,j+1,k)-fh2(i,j+2,k))
      f2z(i,j,k)=d12dz*(fh2(i,j,k-2)-EIT*fh2(i,j,k-1)+EIT*fh2(i,j,k+1)-fh2(i,j,k+2))
   elseif(i+1 <= imax .and. i-1 >= imin .and. &
          j+1 <= jmax .and. j-1 >= jmin .and. &
          k+1 <= kmax .and. k-1 >= kmin) then
      f1x(i,j,k)=d2dx*(-fh1(i-1,j,k)+fh1(i+1,j,k))
      f1y(i,j,k)=d2dy*(-fh1(i,j-1,k)+fh1(i,j+1,k))
      f1z(i,j,k)=d2dz*(-fh1(i,j,k-1)+fh1(i,j,k+1))
      f2x(i,j,k)=d2dx*(-fh2(i-1,j,k)+fh2(i+1,j,k))
      f2y(i,j,k)=d2dy*(-fh2(i,j-1,k)+fh2(i,j+1,k))
      f2z(i,j,k)=d2dz*(-fh2(i,j,k-1)+fh2(i,j,k+1))
   endif
  enddo
  enddo
  enddo
  return
  end subroutine fderivs_batch2
 #elif (ghost_width == 4)
 ! sixth order code
@@ -2380,9 +2077,6 @@
  end subroutine fderivs
 !-----------------------------------------------------------------------------
 ! batch first derivatives (4 fields), same symmetry setup
 !-----------------------------------------------------------------------------
 !-----------------------------------------------------------------------------
 !
 ! single derivatives dx
 !
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -1276,9 +1276,7 @@ end subroutine d2dump
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k,n_elements
+  integer::i,j,k
  real*8, dimension(:), allocatable :: f_flat
  real*8, external :: DDOT
  dX = X(2) - X(1)
  dY = Y(2) - Y(1)
@@ -1302,12 +1300,7 @@ if(dabs(X(1)-xmin) < dX) imin = 1
 if(dabs(Y(1)-ymin) < dY) jmin = 1
 if(dabs(Z(1)-zmin) < dZ) kmin = 1
-! Optimized with oneMKL BLAS DDOT for dot product
+f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
 n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
 allocate(f_flat(n_elements))
 f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
 f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
 deallocate(f_flat)
 f_out = f_out*dX*dY*dZ
@@ -1332,9 +1325,7 @@ f_out = f_out*dX*dY*dZ
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
-  integer::i,j,k,n_elements
+  integer::i,j,k
  real*8, dimension(:), allocatable :: f_flat
  real*8, external :: DDOT
  real*8 :: PIo4
@@ -1397,12 +1388,7 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
-! Optimized with oneMKL BLAS DDOT for dot product
+f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
 n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
 allocate(f_flat(n_elements))
 f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
 f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
 deallocate(f_flat)
 f_out = f_out*dX*dY*dZ
@@ -1430,8 +1416,6 @@ f_out = f_out*dX*dY*dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
  integer::i,j,k
  real*8, dimension(:), allocatable :: f_flat
  real*8, external :: DDOT
  real*8 :: PIo4
@@ -1494,12 +1478,11 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
-! Optimized with oneMKL BLAS DDOT for dot product
+f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
 f_out = f_out
 Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
 allocate(f_flat(Nout))
 f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
 f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
 deallocate(f_flat)
  return
@@ -1697,7 +1680,6 @@ deallocate(f_flat)
  real*8, dimension(ORDN,ORDN) :: tmp2
  real*8, dimension(ORDN) :: tmp1
  real*8, dimension(3) :: SoAh
  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  cxB = inds+1  
@@ -1733,21 +1715,20 @@ deallocate(f_flat)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
  endif 
  ! Optimized with BLAS operations for better performance
  ! First dimension: z-direction weighted sum
  tmp2=0
  do m=1,ORDN
    tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
  enddo
  ! Second dimension: y-direction weighted sum
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
  enddo
-  ! Third dimension: x-direction weighted sum using BLAS DDOT
+  f_int=0
-  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
+  do m=1,ORDN
    f_int = f_int + coef(m)*tmp1(m)
  enddo
  return
@@ -1777,7 +1758,6 @@ deallocate(f_flat)
  real*8, dimension(ORDN,ORDN) :: ya
  real*8, dimension(ORDN) :: tmp1
  real*8, dimension(2) :: SoAh
  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  cxB = inds(1:2)+1  
@@ -1807,14 +1787,15 @@ deallocate(f_flat)
     ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
  endif 
  ! Optimized with BLAS operations
  tmp1=0
  do m=1,ORDN
    tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
  enddo
-  ! Use BLAS DDOT for final weighted sum
+  f_int=0
-  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
+  do m=1,ORDN
    f_int = f_int + coef(m)*tmp1(m)
  enddo
  return
@@ -1845,7 +1826,6 @@ deallocate(f_flat)
  real*8, dimension(ORDN) :: ya
  real*8 :: SoAh
  integer,dimension(3) :: inds
  real*8, external :: DDOT
 ! +1 because c++ gives 0 for first point
  inds = indsi + 1
@@ -1906,8 +1886,10 @@ deallocate(f_flat)
          write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
  endif
-  ! Optimized with BLAS DDOT for weighted sum
+  f_int=0
-  f_int = DDOT(ORDN, coef, 1, ya, 1)
+  do m=1,ORDN
    f_int = f_int + coef(m)*ya(m)
  enddo
  return
@@ -2139,38 +2121,24 @@ deallocate(f_flat)
  end function fWigner_d_function
 !----------------------------------
 ! Optimized factorial function using lookup table for small N
 ! and log-gamma for large N to avoid overflow
  function ffact(N) result(gont)
  implicit none
  integer,intent(in) :: N
  real*8 :: gont
  integer :: i
-  ! Lookup table for factorials 0! to 20! (precomputed)
+  integer :: i
  real*8, parameter, dimension(0:20) :: fact_table = [ &
    1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
    362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
    87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
    355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
    2432902008176640000.d0 ]
 ! sanity check
  if(N < 0)then
     write(*,*) "ffact: error input for factorial"
     gont = 1.d0
     return
  endif
-  ! Use lookup table for small N (fast path)
+  gont = 1.d0
-  if(N <= 20)then
+  do i=1,N
-     gont = fact_table(N)
+     gont = gont*i
-  else
+  enddo
     ! Use log-gamma function for large N: N! = exp(log_gamma(N+1))
     ! This avoids overflow and is computed efficiently
     gont = exp(log_gamma(dble(N+1)))
  endif
  return
--- a/AMSS_NCKU_source/gaussj.C
+++ b/AMSS_NCKU_source/gaussj.C
@@ -16,66 +16,115 @@ using namespace std;
 #include <string.h>
 #include <math.h>
 #endif
-
+/* Linear equation solution by Gauss-Jordan elimination.
 // Intel oneMKL LAPACK interface
 #include <mkl_lapacke.h>
 /* Linear equation solution using Intel oneMKL LAPACK.
 a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
 containing the right-hand side vectors. On output a is
 replaced by its matrix inverse, and b is replaced by the
-corresponding set of solution vectors.
+corresponding set of solution vectors */
 Mathematical equivalence:
  Solves: A * x = b  =>  x = A^(-1) * b
  Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
  within numerical precision. */
 int gaussj(double *a, double *b, int n)
 {
-  // Allocate pivot array and workspace
+  double swap;
  lapack_int *ipiv = new lapack_int[n];
  lapack_int info;
-  // Make a copy of matrix a for solving (dgesv modifies it to LU form)
+  int *indxc, *indxr, *ipiv;
-  double *a_copy = new double[n * n];
+  indxc = new int[n];
-  for (int i = 0; i < n * n; i++) {
+  indxr = new int[n];
-    a_copy[i] = a[i];
+  ipiv = new int[n];
  int i, icol, irow, j, k, l, ll;
  double big, dum, pivinv, temp;
  for (j = 0; j < n; j++)
    ipiv[j] = 0;
  for (i = 0; i < n; i++)
  {
    big = 0.0;
    for (j = 0; j < n; j++)
      if (ipiv[j] != 1)
        for (k = 0; k < n; k++)
        {
          if (ipiv[k] == 0)
          {
            if (fabs(a[j * n + k]) >= big)
            {
              big = fabs(a[j * n + k]);
              irow = j;
              icol = k;
            }
          }
          else if (ipiv[k] > 1)
          {
            cout << "gaussj: Singular Matrix-1" << endl;
            for (int ii = 0; ii < n; ii++)
            {
              for (int jj = 0; jj < n; jj++)
                cout << a[ii * n + jj] << " ";
              cout << endl;
            }
            return 1; // error return
          }
        }
-  // Step 1: Solve linear system A*x = b using LU decomposition
+    ipiv[icol] = ipiv[icol] + 1;
-  // LAPACKE_dgesv uses column-major by default, but we use row-major
+    if (irow != icol)
-  info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1);
+    {
-
+      for (l = 0; l < n; l++)
-  if (info != 0) {
+      {
-    cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl;
+        swap = a[irow * n + l];
-    delete[] ipiv;
+        a[irow * n + l] = a[icol * n + l];
-    delete[] a_copy;
+        a[icol * n + l] = swap;
    return 1;
      }
-  // Step 2: Compute matrix inverse A^(-1) using LU factorization
+      swap = b[irow];
-  // First do LU factorization of original matrix a
+      b[irow] = b[icol];
-  info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
+      b[icol] = swap;
  if (info != 0) {
    cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
    delete[] ipiv;
    delete[] a_copy;
    return 1;
    }
-  // Then compute inverse from LU factorization
+    indxr[i] = irow;
-  info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
+    indxc[i] = icol;
-  if (info != 0) {
+    if (a[icol * n + icol] == 0.0)
-    cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
+    {
-    delete[] ipiv;
+      cout << "gaussj: Singular Matrix-2" << endl;
-    delete[] a_copy;
+      for (int ii = 0; ii < n; ii++)
-    return 1;
+      {
        for (int jj = 0; jj < n; jj++)
          cout << a[ii * n + jj] << " ";
        cout << endl;
      }
      return 1; // error return
    }
    pivinv = 1.0 / a[icol * n + icol];
    a[icol * n + icol] = 1.0;
    for (l = 0; l < n; l++)
      a[icol * n + l] *= pivinv;
    b[icol] *= pivinv;
    for (ll = 0; ll < n; ll++)
      if (ll != icol)
      {
        dum = a[ll * n + icol];
        a[ll * n + icol] = 0.0;
        for (l = 0; l < n; l++)
          a[ll * n + l] -= a[icol * n + l] * dum;
        b[ll] -= b[icol] * dum;
      }
  }
  for (l = n - 1; l >= 0; l--)
  {
    if (indxr[l] != indxc[l])
      for (k = 0; k < n; k++)
      {
        swap = a[k * n + indxr[l]];
        a[k * n + indxr[l]] = a[k * n + indxc[l]];
        a[k * n + indxc[l]] = swap;
      }
  }
  delete[] indxc;
  delete[] indxr;
  delete[] ipiv;
  delete[] a_copy;
  return 0;
 }
--- a/AMSS_NCKU_source/ilucg.f90
+++ b/AMSS_NCKU_source/ilucg.f90
@@ -512,10 +512,11 @@
      IMPLICIT DOUBLE PRECISION (A-H,O-Z)
      DIMENSION V(N),W(N)
 !     SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT.
 !     Optimized using Intel oneMKL BLAS ddot
 !     Mathematical equivalence: DGVV = sum_{i=1}^{N} V(i)*W(i)
-      DOUBLE PRECISION, EXTERNAL :: DDOT
+      SUM = 0.0D0
-      DGVV = DDOT(N, V, 1, W, 1)
+            DO 10 I = 1,N
            SUM = SUM + V(I)*W(I)
 10          CONTINUE
      DGVV = SUM
      RETURN
      END
--- a/AMSS_NCKU_source/macrodef.h
+++ b/AMSS_NCKU_source/macrodef.h
@@ -2,7 +2,7 @@
 #ifndef MICRODEF_H
 #define MICRODEF_H
-#include "macrodef.fh"
+#include "microdef.fh"
 // application parameters
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,32 +1,22 @@
 ## GCC version (commented out)
 ## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
-## Intel oneAPI version with oneMKL (Optimized for performance)
+filein  = -I/usr/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/14/ -I/usr/include/c++/14/
 filein  = -I/usr/include/ -I${MKLROOT}/include
-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
+##filein  = -I/usr/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ -I/usr/lib/cuda/include
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
 LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
-## Aggressive optimization flags:
+LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/14 -lgfortran -lmpi -lgfortran -lcudart -lcuda
-## -O3: Maximum optimization
+##LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -L/usr/lib/cuda/lib64 -lcudart -lmpi -lgfortran
-## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
+
-## -fp-model fast=2: Aggressive floating-point optimizations
+CXXAPPFLAGS  = -O3 -Wno-deprecated -Dfortran3 -Dnewc
-## -fma: Enable fused multiply-add instructions
+#f90appflags = -O3 -fpp
-## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
+f90appflags  = -O3 -x f95-cpp-input
-CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma \
+f90          = gfortran 
-               -Dfortran3 -Dnewc -I${MKLROOT}/include
+f77          = gfortran 
-f90appflags  = -O3 -xHost -fp-model fast=2 -fma \
+CXX          = g++
-               -fpp -I${MKLROOT}/include
+CC           = gcc
-f90          = ifx
+CLINKER      = mpic++ 
 f77          = ifx
 CXX          = icpx
 CC           = icx
 CLINKER      = mpiicpx 
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
 #CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
-CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
+# RTX 4050 uses Ada Lovelace architecture (compute capability 8.9)
 CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch=sm_89 -Dfortran3 -Dnewc
--- a/generate_macrodef.py
+++ b/generate_macrodef.py
@@ -253,19 +253,7 @@ def generate_macrodef_h():
    # Define macro buffer_width
    # number of buffer points for mesh-refinement interfaces
-    # Calculate ghost_width based on Finite_Diffenence_Method to optimize buffer_width
+    print( "#define buffer_width 6",                     file=file1 )
    if ( input_data.Finite_Diffenence_Method == "2nd-order" ):
        gw = 2
    elif ( input_data.Finite_Diffenence_Method == "4th-order" ):
        gw = 3
    elif ( input_data.Finite_Diffenence_Method == "6th-order" ):
        gw = 4
    elif ( input_data.Finite_Diffenence_Method == "8th-order" ):
        gw = 5
    else:
        gw = 5 # Default conservative value
    print( f"#define buffer_width {gw + 1}",      file=file1 )
    print(                                               file=file1 )
    # Define macro SC_width as buffer_width
@@ -404,17 +392,6 @@ def generate_macrodef_fh():
        print( "# Finite_Difference_Method #define ghost_width setting error!!!",   file=file1 )
        print(                                                   file=file1 )
    # Define macro DEBUG_NAN_CHECK
    # 0: off (default), 1: on
    debug_nan_check = getattr(input_data, "Debug_NaN_Check", 0)
    if debug_nan_check:
        print( "#define DEBUG_NAN_CHECK 1", file=file1 )
        print(                             file=file1 )
    else:
        print( "#define DEBUG_NAN_CHECK 0", file=file1 )
        print(                             file=file1 )
    # Whether to use a shell-patch grid
    # use shell or not
@@ -537,9 +514,6 @@ def generate_macrodef_fh():
    print( "    6th order: 4",                                                                      file=file1 )
    print( "    8th order: 5",                                                                      file=file1 )
    print(                                                                                          file=file1 )
    print( "define DEBUG_NAN_CHECK",                                                                file=file1 )
    print( "    0: off (default), 1: on",                                                           file=file1 )
    print(                                                                                          file=file1 )
    print( "define WithShell",                                                                      file=file1 )
    print( "    use shell or not",                                                                  file=file1 )
    print(                                                                                          file=file1 )
--- a/inputfile_example/AMSS_NCKU_Input.py
+++ b/inputfile_example/AMSS_NCKU_Input.py
@@ -36,7 +36,6 @@ Equation_Class           = "BSSN"                  ## Evolution Equation: choose
 Initial_Data_Method      = "Ansorg-TwoPuncture"    ## initial data method: choose "Ansorg-TwoPuncture", "Lousto-Analytical", "Cao-Analytical", "KerrSchild-Analytical"
 Time_Evolution_Method    = "runge-kutta-45"        ## time evolution method: choose "runge-kutta-45"
 Finite_Diffenence_Method = "4th-order"             ## finite-difference method: choose "2nd-order", "4th-order", "6th-order", "8th-order"
 Debug_NaN_Check          = 0                       ## enable NaN checks in compute_rhs_bssn: 0 (off) or 1 (on)
 #################################################
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -11,18 +11,6 @@
 import AMSS_NCKU_Input as input_data
 import subprocess
 ## CPU core binding configuration using taskset
 ## taskset ensures all child processes inherit the CPU affinity mask
 ## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
 ## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
 #NUMACTL_CPU_BIND = "taskset -c 4-55,60-111"
 NUMACTL_CPU_BIND = ""
 ## Build parallelism configuration
 ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
 ## Set make -j to utilize available cores for faster builds
 BUILD_JOBS = 14
 ##################################################################
@@ -38,11 +26,11 @@ def makefile_ABE():
    print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) 
    print(                                                        )
-    ## Build command with CPU binding to nohz_full cores
+    ## Build command
    if (input_data.GPU_Calculation == "no"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
+        makefile_command  = "make -j4" + " ABE"
    elif (input_data.GPU_Calculation == "yes"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
+        makefile_command  = "make -j4" + " ABEGPU"
    else:
        print( " CPU/GPU numerical calculation setting is wrong " )
        print(                                                    )
@@ -79,8 +67,8 @@ def makefile_TwoPunctureABE():
    print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
    print(                                                            )
-    ## Build command with CPU binding to nohz_full cores
+    ## Build command
-    makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
+    makefile_command = "make" + " TwoPunctureABE"
    ## Execute the command with subprocess.Popen and stream output
    makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) 
@@ -117,10 +105,10 @@ def run_ABE():
    ## Define the command to run; cast other values to strings as needed
    if (input_data.GPU_Calculation == "no"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        mpi_command         = "mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command         = "mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
        mpi_command_outfile = "ABEGPU_out.log"
    ## Execute the MPI command and stream output
@@ -159,7 +147,7 @@ def run_TwoPunctureABE():
    print(                                                          )
    ## Define the command to run
-    TwoPuncture_command         = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
+    TwoPuncture_command         = "./TwoPunctureABE"
    TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
    ## Execute the command with subprocess.Popen and stream output
Author	SHA1	Message	Date
ianchb	95575d9450	fix: try to fix segfault at 240 steps by adding WithShell guard for writecheck_sh call	2026-01-22 14:26:41 +08:00
ianchb	54600327da	fix(build): update makefile.inc for debian 13	2026-01-21 09:29:35 +08:00
CGH0S7	75be0968fc	feat: port GPU code to CUDA 13 and enable GPU computation Major changes: - Update makefile.inc for CUDA 13.1 with sm_89 architecture (RTX 4050) - Replace deprecated cudaThreadSynchronize() with cudaDeviceSynchronize() - Add CUDA_SAFE_CALL macro for CUDA 13 compatibility - Fix duplicate function definitions (compare_result_gpu, SHStep) - Fix syntax error in bssn_step_gpu.C - Enable GPU calculation in AMSS_NCKU_Input.py - Successfully build ABEGPU executable	2026-01-13 18:15:49 +00:00
CGH0S7	b27e071cde	Makefile updated for rocky10	2026-01-14 01:41:31 +08:00
CGH0S7	a1125d4c79	try to build gpu version	2026-01-13 23:52:44 +08:00
CGH0S7	dcc66588fc	gitignore updated	2026-01-13 23:45:49 +08:00
CGH0S7	950d448edf	fix(build): update LDLIBS to use -lmpi and remove hardcoded paths	2026-01-13 23:40:51 +08:00