黄老板逆天重写

2026-03-01 05:48:40 +08:00
74 changed files with 86000 additions and 72071 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
-__pycache__
-GW150914
-GW150914-origin
-docs
-*.tmp
-
+__pycache__
+GW150914
+GW150914-origin
+docs
+*.tmp
+
--- a/2.txt
+++ b/2.txt
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,7 +16,7 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 64                             ## number of mpi processes used in the simulation
+MPI_processes    = 2                          ## number of mpi processes used in the simulation

 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
@@ -50,7 +50,7 @@ Check_Time               = 100.0
 Dump_Time                = 100.0                  ## time inteval dT for dumping binary data
 D2_Dump_Time             = 100.0                  ## dump the ascii data for 2d surface after dT'
 Analysis_Time            = 0.1                    ## dump the puncture position and GW psi4 after dT"
-Evolution_Step_Number    = 10000000               ## stop the calculation after the maximal step number
+Evolution_Step_Number    = 6               ## stop the calculation after the maximal step number
 Courant_Factor           = 0.5                    ## Courant Factor
 Dissipation              = 0.15                   ## Kreiss-Oliger Dissipation Strength

--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -8,14 +8,6 @@
 ##
 ##################################################################

-## Guard against re-execution by multiprocessing child processes.
-## Without this, using 'spawn' or 'forkserver' context would cause every
-## worker to re-run the entire script, spawning exponentially more
-## workers (fork bomb).
-if __name__ != '__main__':
-    import sys as _sys
-    _sys.exit(0)
-

 ##################################################################

@@ -57,32 +49,32 @@ import time
 File_directory = os.path.join(input_data.File_directory)   

 ## If the specified output directory exists, ask the user whether to continue
-if os.path.exists(File_directory):
-    print( " Output dictionary has been existed !!!  "                                                              )
-    print( " If you want to overwrite the existing file directory, please input 'continue' in the terminal !! "     ) 
-    print( " If you want to retain the existing file directory, please input 'stop' in the terminal to stop the "   ) 
-    print( " simulation. Then you can reset the output dictionary in the input script file AMSS_NCKU_Input.py !!! " )
-    print(                                                                                                          )
-    ## Prompt whether to overwrite the existing directory
-    while True:
-        try:
-            inputvalue = input()
-            ## If the user agrees to overwrite, proceed and remove the existing directory
-            if ( inputvalue == "continue" ):
-                print( " Continue the calculation !!! " )
-                print(                                  )
-                break  
-            ## If the user chooses not to overwrite, exit and keep the existing directory
-            elif ( inputvalue == "stop" ):
-                print( " Stop the calculation !!! "    )
-                sys.exit() 
-            ## If the user input is invalid, prompt again
-            else:
-                print( " Please input your choice !!! "                   )
-                print( " Input 'continue' or 'stop' in the terminal !!! " )
-        except ValueError:
-            print( " Please input your choice !!! "                   )
-            print( " Input 'continue' or 'stop' in the terminal !!! " )
+# if os.path.exists(File_directory):
+#     print( " Output dictionary has been existed !!!  "                                                              )
+#     print( " If you want to overwrite the existing file directory, please input 'continue' in the terminal !! "     ) 
+#     print( " If you want to retain the existing file directory, please input 'stop' in the terminal to stop the "   ) 
+#     print( " simulation. Then you can reset the output dictionary in the input script file AMSS_NCKU_Input.py !!! " )
+#     print(                                                                                                          )
+#     ## Prompt whether to overwrite the existing directory
+#     while True:
+#         try:
+#             inputvalue = input()
+#             ## If the user agrees to overwrite, proceed and remove the existing directory
+#             if ( inputvalue == "continue" ):
+#                 print( " Continue the calculation !!! " )
+#                 print(                                  )
+#                 break  
+#             ## If the user chooses not to overwrite, exit and keep the existing directory
+#             elif ( inputvalue == "stop" ):
+#                 print( " Stop the calculation !!! "    )
+#                 sys.exit() 
+#             ## If the user input is invalid, prompt again
+#             else:
+#                 print( " Please input your choice !!! "                   )
+#                 print( " Input 'continue' or 'stop' in the terminal !!! " )
+#         except ValueError:
+#             print( " Please input your choice !!! "                   )
+#             print( " Input 'continue' or 'stop' in the terminal !!! " )
        
 ## Remove the existing output directory if present
 shutil.rmtree(File_directory, ignore_errors=True)
@@ -270,12 +262,6 @@ if not os.path.exists( ABE_file ):
 ## Copy the executable ABE (or ABEGPU) into the run directory
 shutil.copy2(ABE_file, output_directory)

-## Copy interp load balance profile if present (for optimize pass)
-interp_lb_profile = os.path.join(AMSS_NCKU_source_copy, "interp_lb_profile.bin")
-if os.path.exists(interp_lb_profile):
-    shutil.copy2(interp_lb_profile, output_directory)
-    print( " Copied interp_lb_profile.bin to run directory " )
-
 ###########################

 ## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory
@@ -438,31 +424,26 @@ print(

 import plot_xiaoqu
 import plot_GW_strain_amplitude_xiaoqu
-from parallel_plot_helper import run_plot_tasks_parallel
-
-plot_tasks = []

 ## Plot black hole trajectory
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot,   (binary_results_directory, figure_directory) ) )
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_orbit_plot(   binary_results_directory, figure_directory )
+plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )

 ## Plot black hole separation vs. time
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )

 ## Plot gravitational waveforms (psi4 and strain amplitude)
 for i in range(input_data.Detector_Number):
-    plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) )
-    plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
+    plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )

 ## Plot ADM mass evolution
 for i in range(input_data.Detector_Number):
-    plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )

 ## Plot Hamiltonian constraint violation over time
 for i in range(input_data.grid_level):
-    plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) )
-
-run_plot_tasks_parallel(plot_tasks)
+    plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )

 ## Plot stored binary data
 plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
--- a/AMSS_NCKU_Verify_ASC26.py
+++ b/AMSS_NCKU_Verify_ASC26.py
@@ -1,13 +1,9 @@
 #!/usr/bin/env python3
 """
-AMSS-NCKU GW150914 Simulation Regression Test Script (Comprehensive Version)
+AMSS-NCKU GW150914 Simulation Regression Test Script

 Verification Requirements:
-1. RMS errors < 1% for:
-   - 3D Vector Total RMS
-   - X Component RMS
-   - Y Component RMS
-   - Z Component RMS
+1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
 2. ADM constraint violation < 2 (Grid Level 0)

 RMS Calculation Method:
@@ -61,62 +57,79 @@ def load_constraint_data(filepath):
                data.append([float(x) for x in parts[:8]])
    return np.array(data)

-def calculate_all_rms_errors(bh_data_ref, bh_data_target):
+
+def calculate_rms_error(bh_data_ref, bh_data_target):
    """
-    Calculate 3D Vector RMS and component-wise RMS (X, Y, Z) independently.
-    Uses r = sqrt(x^2 + y^2) as the denominator for all error normalizations.
-    Returns the maximum error between BH1 and BH2 for each category.
+    Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
+
+    This function computes the RMS error independently for BH1 and BH2 trajectories,
+    then returns the maximum of the two as the final RMS error metric.
+
+    For each black hole, the RMS over the M samples with r_i^max > 1e-15 is calculated as:
+        RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
+
+    where:
+        Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
+        r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
+
+    Args:
+        bh_data_ref: Reference (baseline) trajectory data
+        bh_data_target: Target (optimized) trajectory data
+
+    Returns:
+        rms_value: Final RMS error as a percentage (max of BH1 and BH2), or None on failure
+        error: None on success, otherwise a message describing why no RMS was computed
    """
+    # Align data: truncate to the length of the shorter dataset
    M = min(len(bh_data_ref['time']), len(bh_data_target['time']))

    if M < 10:
        return None, "Insufficient data points for comparison"

-    results = {}
+    # Extract XY coordinates for both black holes
+    x1_ref = bh_data_ref['x1'][:M]
+    y1_ref = bh_data_ref['y1'][:M]
+    x2_ref = bh_data_ref['x2'][:M]
+    y2_ref = bh_data_ref['y2'][:M]

-    for bh in ['1', '2']:
-        x_r, y_r, z_r = bh_data_ref[f'x{bh}'][:M], bh_data_ref[f'y{bh}'][:M], bh_data_ref[f'z{bh}'][:M]
-        x_n, y_n, z_n = bh_data_target[f'x{bh}'][:M], bh_data_target[f'y{bh}'][:M], bh_data_target[f'z{bh}'][:M]
+    x1_new = bh_data_target['x1'][:M]
+    y1_new = bh_data_target['y1'][:M]
+    x2_new = bh_data_target['x2'][:M]
+    y2_new = bh_data_target['y2'][:M]

-        # 核心修改：根据组委会的邮件指示，分母统一使用 r = sqrt(x^2 + y^2)
-        r_ref = np.sqrt(x_r**2 + y_r**2)
-        r_new = np.sqrt(x_n**2 + y_n**2)
-        denom_max = np.maximum(r_ref, r_new)
+    # Calculate RMS for BH1
+    delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
+    r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
+    r1_new = np.sqrt(x1_new**2 + y1_new**2)
+    r1_max = np.maximum(r1_ref, r1_new)

-        valid = denom_max > 1e-15
-        if np.sum(valid) < 10:
-            results[f'BH{bh}'] = { '3D_Vector': 0.0, 'X_Component': 0.0, 'Y_Component': 0.0, 'Z_Component': 0.0 }
-            continue
+    # Calculate RMS for BH2
+    delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
+    r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
+    r2_new = np.sqrt(x2_new**2 + y2_new**2)
+    r2_max = np.maximum(r2_ref, r2_new)

-        def calc_rms(delta):
-            # 将对应分量的偏差除以统一的轨道半径分母 denom_max
-            return np.sqrt(np.mean((delta[valid] / denom_max[valid])**2)) * 100
+    # Avoid division by zero for BH1
+    valid_mask1 = r1_max > 1e-15
+    if np.sum(valid_mask1) < 10:
+        return None, "Insufficient valid data points for BH1"

-        # 1. Total 3D Vector RMS
-        delta_vec = np.sqrt((x_r - x_n)**2 + (y_r - y_n)**2 + (z_r - z_n)**2)
-        rms_3d = calc_rms(delta_vec)
+    terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
+    rms_bh1 = np.sqrt(np.mean(terms1)) * 100

-        # 2. Component-wise RMS (分离计算各轴，但共用半径分母)
-        rms_x = calc_rms(np.abs(x_r - x_n))
-        rms_y = calc_rms(np.abs(y_r - y_n))
-        rms_z = calc_rms(np.abs(z_r - z_n))
+    # Avoid division by zero for BH2
+    valid_mask2 = r2_max > 1e-15
+    if np.sum(valid_mask2) < 10:
+        return None, "Insufficient valid data points for BH2"

-        results[f'BH{bh}'] = {
-            '3D_Vector': rms_3d,
-            'X_Component': rms_x,
-            'Y_Component': rms_y,
-            'Z_Component': rms_z
-        }
+    terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
+    rms_bh2 = np.sqrt(np.mean(terms2)) * 100

-    # 获取 BH1 和 BH2 中的最大误差
-    max_rms = {
-        '3D_Vector': max(results['BH1']['3D_Vector'], results['BH2']['3D_Vector']),
-        'X_Component': max(results['BH1']['X_Component'], results['BH2']['X_Component']),
-        'Y_Component': max(results['BH1']['Y_Component'], results['BH2']['Y_Component']),
-        'Z_Component': max(results['BH1']['Z_Component'], results['BH2']['Z_Component'])
-    }
+    # Final RMS is the maximum of BH1 and BH2
+    rms_final = max(rms_bh1, rms_bh2)
+
+    return rms_final, None

-    return max_rms, None

 def analyze_constraint_violation(constraint_data, n_levels=9):
    """
@@ -142,32 +155,34 @@ def analyze_constraint_violation(constraint_data, n_levels=9):


 def print_header():
+    """Print report header"""
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
-    print(Color.BOLD + "   AMSS-NCKU GW150914 Comprehensive Regression Test" + Color.RESET)
+    print(Color.BOLD + "   AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)

-def print_rms_results(rms_dict, error, threshold=1.0):
-    print(f"\n{Color.BOLD}1. RMS Error Analysis (Maximums of BH1 & BH2){Color.RESET}")
-    print("-" * 65)
+
+def print_rms_results(rms_rel, error, threshold=1.0):
+    """Print RMS error results"""
+    print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
+    print("-" * 45)

    if error:
        print(f"   {Color.RED}Error: {error}{Color.RESET}")
        return False

-    all_passed = True
-    print(f"   Requirement: < {threshold}%\n")
+    passed = rms_rel < threshold

-    for key, val in rms_dict.items():
-        passed = val < threshold
-        all_passed = all_passed and passed
-        status = get_status_text(passed)
-        print(f"   {key:15}: {val:8.4f}%   |   Status: {status}")
+    print(f"   RMS relative error: {rms_rel:.4f}%")
+    print(f"   Requirement:        < {threshold}%")
+    print(f"   Status:             {get_status_text(passed)}")
+
+    return passed

-    return all_passed

 def print_constraint_results(results, threshold=2.0):
+    """Print constraint violation results"""
    print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
-    print("-" * 65)
+    print("-" * 45)

    names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
    for i, name in enumerate(names):
@@ -185,6 +200,7 @@ def print_constraint_results(results, threshold=2.0):


 def print_summary(rms_passed, constraint_passed):
+    """Print summary"""
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
    print(Color.BOLD + "Verification Summary" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
@@ -194,7 +210,7 @@ def print_summary(rms_passed, constraint_passed):
    res_rms = get_status_text(rms_passed)
    res_con = get_status_text(constraint_passed)

-    print(f"   [1] Comprehensive RMS check:      {res_rms}")
+    print(f"   [1] RMS trajectory check:         {res_rms}")
    print(f"   [2] ADM constraint check:         {res_con}")
    
    final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
@@ -203,48 +219,61 @@ def print_summary(rms_passed, constraint_passed):

    return all_passed

+
 def main():
+    # Determine target (optimized) output directory
    if len(sys.argv) > 1:
        target_dir = sys.argv[1]
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")

+    # Determine reference (baseline) directory
    script_dir = os.path.dirname(os.path.abspath(__file__))
    reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")

+    # Data file paths
    bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
    bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
    constraint_file = os.path.join(target_dir, "bssn_constraint.dat")

+    # Check if files exist
    if not os.path.exists(bh_file_ref):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
        sys.exit(1)
+
    if not os.path.exists(bh_file_target):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
        sys.exit(1)
+
    if not os.path.exists(constraint_file):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
        sys.exit(1)

+    # Print header
    print_header()
    print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
    print(f"{Color.BOLD}Target (Optimized):  {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")

+    # Load data
    bh_data_ref = load_bh_trajectory(bh_file_ref)
    bh_data_target = load_bh_trajectory(bh_file_target)
    constraint_data = load_constraint_data(constraint_file)

-    # Output modified RMS results
-    rms_dict, error = calculate_all_rms_errors(bh_data_ref, bh_data_target)
-    rms_passed = print_rms_results(rms_dict, error)
+    # Calculate RMS error
+    rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
+    rms_passed = print_rms_results(rms_rel, error)

-    # Output constraint results
+    # Analyze constraint violation
    constraint_results = analyze_constraint_violation(constraint_data)
    constraint_passed = print_constraint_results(constraint_results)

+    # Print summary
    all_passed = print_summary(rms_passed, constraint_passed)
+
+    # Return exit code
    sys.exit(0 if all_passed else 1)

+
 if __name__ == "__main__":
    main()
--- a/AMSS_NCKU_source/ABE.C
+++ b/AMSS_NCKU_source/ABE.C
@@ -24,7 +24,7 @@ using namespace std;

 #include "misc.h"
 #include "macrodef.h"
-
+#include <omp.h>
 #ifndef ABEtype
 #error "not define ABEtype"
 #endif
@@ -69,8 +69,9 @@ int main(int argc, char *argv[])

      double Begin_clock, End_clock;
      if (myrank == 0)
-      {
+      {     
            Begin_clock = MPI_Wtime();
+
      }

      if (argc > 1)
--- a/AMSS_NCKU_source/Ansorg.psid
+++ b/AMSS_NCKU_source/Ansorg.psid
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -13,10 +13,7 @@ using namespace std;
 #include "MPatch.h"
 #include "Parallel.h"
 #include "fmisc.h"
-#ifdef INTERP_LB_PROFILE
-#include "interp_lb_profile.h"
-#endif
-
+#include "xh_global_interp.h"
 Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
 {

@@ -397,7 +394,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
    while (notfind && Bp) // run along Blocks
    {
      Block *BP = Bp->data;
-
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
@@ -433,8 +429,10 @@ void Patch::Interp_Points(MyList<var> *VarList,
          int k = 0;
          while (varl) // run along variables
          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+
+            xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+
            varl = varl->next;
            k++;
          }
@@ -444,6 +442,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
        break;
      Bp = Bp->next;
    }
+
  }

  // Replace MPI_Allreduce with per-owner MPI_Bcast:
@@ -510,13 +509,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
  // Targeted point-to-point overload: each owner sends each point only to
  // the one rank that needs it for integration (consumer), reducing
  // communication volume by ~nprocs times compared to the Bcast version.
-#ifdef INTERP_LB_PROFILE
-  double t_interp_start = MPI_Wtime();
-#endif
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-
+// printf("here----\n");
+  // int zzz = 0;
  int ordn = 2 * ghost_width;
  MyList<var> *varl;
  int num_var = 0;
@@ -535,30 +532,35 @@ void Patch::Interp_Points(MyList<var> *VarList,
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;

-  double DH[dim], llb[dim], uub[dim];
+  double DH[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);

  // --- Interpolation phase (identical to original) ---
+  // printf("NN: %d, num_var = %d\n", NN, num_var);
+  #pragma omp parallel
+  {
+  #pragma omp for
  for (int j = 0; j < NN; j++)
  {
-    double pox[dim];
+    double pox[dim], llb[dim], uub[dim];
+    MyList<var> *varl1;
    for (int i = 0; i < dim; i++)
    {
      pox[i] = XX[i][j];
-      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
-      {
-        cout << "Patch::Interp_Points: point (";
-        for (int k = 0; k < dim; k++)
-        {
-          cout << XX[k][j];
-          if (k < dim - 1)
-            cout << ",";
-          else
-            cout << ") is out of current Patch." << endl;
-        }
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
+      // if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
+      // {
+      //   cout << "Patch::Interp_Points: point (";
+      //   for (int k = 0; k < dim; k++)
+      //   {
+      //     cout << XX[k][j];
+      //     if (k < dim - 1)
+      //       cout << ",";
+      //     else
+      //       cout << ") is out of current Patch." << endl;
+      //   }
+      //   MPI_Abort(MPI_COMM_WORLD, 1);
+      // }
    }

    MyList<Block> *Bp = blb;
@@ -590,21 +592,23 @@ void Patch::Interp_Points(MyList<var> *VarList,
          break;
        }
      }
-
+      // printf("flag = %d\n", flag);
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
-          varl = VarList;
+          varl1 = VarList;
          int k = 0;
-          while (varl)
+          while (varl1)
          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
-            varl = varl->next;
+            
+            xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl1->data->sgfn], Shellf[j * num_var + k],
+                            pox[0], pox[1], pox[2], ordn, varl1->data->SoA, Symmetry);
+            varl1 = varl1->next;
            k++;
+            // zzz += 1;
          }
        }
      }
@@ -613,12 +617,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
      Bp = Bp->next;
    }
  }
-
-#ifdef INTERP_LB_PROFILE
-  double t_interp_end = MPI_Wtime();
-  double t_interp_local = t_interp_end - t_interp_start;
-#endif
-
+  }
+  // printf("Interpolation done, zzz = %d\n", zzz);
  // --- Error check for unfound points ---
  for (int j = 0; j < NN; j++)
  {
@@ -775,31 +775,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
  delete[] recv_count;
  delete[] consumer_rank;
  delete[] owner_rank;
-
-#ifdef INTERP_LB_PROFILE
-  {
-    static bool profile_written = false;
-    if (!profile_written) {
-      double *all_times = nullptr;
-      if (myrank == 0) all_times = new double[nprocs];
-      MPI_Gather(&t_interp_local, 1, MPI_DOUBLE,
-                 all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
-      if (myrank == 0) {
-        int heavy[64];
-        int nh = InterpLBProfile::identify_heavy_ranks(
-            all_times, nprocs, 2.5, heavy, 64);
-        InterpLBProfile::write_profile(
-            "interp_lb_profile.bin", nprocs,
-            all_times, heavy, nh, 2.5);
-        printf("[InterpLB] Profile written: %d heavy ranks\n", nh);
-        for (int i = 0; i < nh; i++)
-          printf("  Heavy rank %d: %.6f s\n", heavy[i], all_times[heavy[i]]);
-        delete[] all_times;
-      }
-      profile_written = true;
-    }
-  }
-#endif
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
@@ -809,7 +784,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
  int myrank, lmyrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_rank(Comm_here, &lmyrank);
-
  int ordn = 2 * ghost_width;
  MyList<var> *varl;
  int num_var = 0;
@@ -899,7 +873,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
          int k = 0;
          while (varl) // run along variables
          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+            xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
@@ -1131,7 +1105,7 @@ bool Patch::Interp_ONE_Point(MyList<var> *VarList, double *XX,
        {
          //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
          //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
-          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
+          xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
          varl = varl->next;
          k++;
@@ -1233,7 +1207,7 @@ bool Patch::Interp_ONE_Point(MyList<var> *VarList, double *XX,
  // NOTE: we do not Synchnize variables here, make sure of that before calling this routine
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
+  
  int ordn = 2 * ghost_width;
  MyList<var> *varl;
  int num_var = 0;
@@ -1373,7 +1347,7 @@ bool Patch::Interp_ONE_Point(MyList<var> *VarList, double *XX,
        {
          //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
          //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
-          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
+          xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
          varl = varl->next;
          k++;
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -4,7 +4,7 @@
 #include "prolongrestrict.h"
 #include "misc.h"
 #include "parameters.h"
-
+#include <omp.h>
 int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion
 {
  nx = Mymax(1, shape / min_width);
@@ -462,7 +462,7 @@ MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int i
            }
          }
 #else
-          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev);
+          ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks
          //	    ng->checkBlock();
          if (BlL)
            BlL->insert(ng);
@@ -500,384 +500,6 @@ MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int i

  return BlL;
 }
-
-#ifdef INTERP_LB_OPTIMIZE
-#include "interp_lb_profile_data.h"
-
-MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
-                                    bool periodic, int nodes)
-{
-#ifdef USE_GPU_DIVIDE
-  double cpu_part, gpu_part;
-  map<string, double>::iterator iter;
-  iter = parameters::dou_par.find("cpu part");
-  if (iter != parameters::dou_par.end())
-  {
-    cpu_part = iter->second;
-  }
-  else
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    char pname[50];
-    {
-      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-      if (iter != parameters::str_par.end())
-        strcpy(pname, (iter->second).c_str());
-      else { cout << "Error inputpar" << endl; exit(0); }
-    }
-    ifstream inf(pname, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN); str = pline;
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
-      else if (status == 0) continue;
-      if (sgrp == "ABE") { if (skey == "cpu part") cpu_part = atof(sval.c_str()); }
-    }
-    inf.close();
-    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
-  }
-  iter = parameters::dou_par.find("gpu part");
-  if (iter != parameters::dou_par.end())
-  {
-    gpu_part = iter->second;
-  }
-  else
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    const int LEN = 256;
-    char pline[LEN];
-    string str, sgrp, skey, sval;
-    int sind;
-    char pname[50];
-    {
-      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
-      if (iter != parameters::str_par.end())
-        strcpy(pname, (iter->second).c_str());
-      else { cout << "Error inputpar" << endl; exit(0); }
-    }
-    ifstream inf(pname, ifstream::in);
-    if (!inf.good() && myrank == 0)
-    { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
-    for (int i = 1; inf.good(); i++)
-    {
-      inf.getline(pline, LEN); str = pline;
-      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
-      if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
-      else if (status == 0) continue;
-      if (sgrp == "ABE") { if (skey == "gpu part") gpu_part = atof(sval.c_str()); }
-    }
-    inf.close();
-    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
-  }
-  if (nodes == 0) nodes = cpusize / 2;
-#else
-  if (nodes == 0) nodes = cpusize;
-#endif
-
-  if (dim != 3)
-  {
-    cout << "distrivute: now we only support 3-dimension" << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  MyList<Block> *BlL = 0;
-  int split_size, min_size, block_size = 0;
-  int min_width = 2 * Mymax(ghost_width, buffer_width);
-  int nxyz[dim], mmin_width[dim], min_shape[dim];
-
-  MyList<Patch> *PLi = PatchLIST;
-  for (int i = 0; i < dim; i++)
-    min_shape[i] = PLi->data->shape[i];
-  int lev = PLi->data->lev;
-  PLi = PLi->next;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-    for (int i = 0; i < dim; i++)
-      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
-    if (lev != PLi->data->lev)
-      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
-    PLi = PLi->next;
-  }
-
-  for (int i = 0; i < dim; i++)
-    mmin_width[i] = Mymin(min_width, min_shape[i]);
-  min_size = mmin_width[0];
-  for (int i = 1; i < dim; i++)
-    min_size = min_size * mmin_width[i];
-
-  PLi = PatchLIST;
-  while (PLi)
-  {
-    Patch *PP = PLi->data;
-    int bs = PP->shape[0];
-    for (int i = 1; i < dim; i++)
-      bs = bs * PP->shape[i];
-    block_size = block_size + bs;
-    PLi = PLi->next;
-  }
-  split_size = Mymax(min_size, block_size / nodes);
-  split_size = Mymax(1, split_size);
-
-  int n_rank = 0;
-  PLi = PatchLIST;
-  int reacpu = 0;
-  int current_block_id = 0;
-  while (PLi) {
-    Block *ng0, *ng;
-    bool first_block_in_patch = true;
-    Patch *PP = PLi->data;
-    reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape);
-
-    for (int i = 0; i < nxyz[0]; i++)
-    for (int j = 0; j < nxyz[1]; j++)
-    for (int k = 0; k < nxyz[2]; k++)
-    {
-        int ibbox_here[6], shape_here[3];
-        double bbox_here[6], dd;
-        Block *current_ng_start = nullptr;
-
-        bool is_heavy = false;
-        int r_l = -1, r_r = -1;
-        if (cpusize == INTERP_LB_NPROCS) {
-          for (int si = 0; si < INTERP_LB_NUM_HEAVY; si++) {
-            if (current_block_id == interp_lb_splits[si][0]) {
-              is_heavy = true;
-              r_l = interp_lb_splits[si][1];
-              r_r = interp_lb_splits[si][2];
-              break;
-            }
-          }
-        }
-
-        if (is_heavy)
-        {
-            int ib0 = (PP->shape[0] * i) / nxyz[0];
-            int ib3 = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
-            int jb1 = (PP->shape[1] * j) / nxyz[1];
-            int jb4 = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
-            int kb2 = (PP->shape[2] * k) / nxyz[2];
-            int kb5 = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
-
-            Block *split_first_block = nullptr;
-            Block *split_last_block = nullptr;
-            splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5,
-                              PP, r_l, r_r, ingfsi, fngfsi, periodic,
-                              split_first_block, split_last_block);
-
-            current_ng_start = split_first_block;
-            ng = split_last_block;
-        }
-        else
-        {
-            ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
-            ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
-            ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
-            ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
-            ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
-            ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
-
-            if (periodic) {
-                for(int d=0; d<3; d++) {
-                    ibbox_here[d] -= ghost_width;
-                    ibbox_here[d+3] += ghost_width;
-                }
-            } else {
-                ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
-                ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
-                ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
-                ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
-                ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
-                ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
-            }
-
-            for(int d=0; d<3; d++) shape_here[d] = ibbox_here[d+3] - ibbox_here[d] + 1;
-
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
-          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
-          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
-          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
-          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
-          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
-          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
-          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
-          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
-#else
-#ifdef Cell
-          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
-          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
-          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
-          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
-          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
-          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
-          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
-          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
-          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-            ng = createMappedBlock(BlL, dim, shape_here, bbox_here,
-                                   current_block_id, ingfsi, fngfsi, PP->lev);
-            current_ng_start = ng;
-        }
-
-        if (first_block_in_patch) {
-            ng0 = current_ng_start;
-            MyList<Block> *Bp_start = BlL;
-            while (Bp_start && Bp_start->data != ng0) Bp_start = Bp_start->next;
-            PP->blb = Bp_start;
-            first_block_in_patch = false;
-        }
-
-        current_block_id++;
-    }
-
-    {
-      MyList<Block> *Bp_end = BlL;
-      while (Bp_end && Bp_end->data != ng) Bp_end = Bp_end->next;
-      PP->ble = Bp_end;
-    }
-
-    PLi = PLi->next;
-  }
-  if (reacpu < nodes * 2 / 3)
-  {
-    int myrank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-    if (myrank == 0)
-      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
-  }
-
-  return BlL;
-}
-
-Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
-                                 int ib0_orig, int ib3_orig,
-                                 int jb1_orig, int jb4_orig,
-                                 int kb2_orig, int kb5_orig,
-                                 Patch* PP, int r_left, int r_right,
-                                 int ingfsi, int fngfsi, bool periodic,
-                                 Block* &split_first_block, Block* &split_last_block)
-{
-    int mid = (ib0_orig + ib3_orig) / 2;
-
-    int indices_L[6] = {ib0_orig, jb1_orig, kb2_orig, mid, jb4_orig, kb5_orig};
-    int indices_R[6] = {mid + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig};
-
-    auto createSubBlock = [&](int* ib_raw, int target_rank) {
-        int ib_final[6];
-        int sh_here[3];
-        double bb_here[6], dd;
-
-        if (periodic) {
-            ib_final[0] = ib_raw[0] - ghost_width;
-            ib_final[3] = ib_raw[3] + ghost_width;
-            ib_final[1] = ib_raw[1] - ghost_width;
-            ib_final[4] = ib_raw[4] + ghost_width;
-            ib_final[2] = ib_raw[2] - ghost_width;
-            ib_final[5] = ib_raw[5] + ghost_width;
-        } else {
-            ib_final[0] = Mymax(0, ib_raw[0] - ghost_width);
-            ib_final[3] = Mymin(PP->shape[0] - 1, ib_raw[3] + ghost_width);
-            ib_final[1] = Mymax(0, ib_raw[1] - ghost_width);
-            ib_final[4] = Mymin(PP->shape[1] - 1, ib_raw[4] + ghost_width);
-            ib_final[2] = Mymax(0, ib_raw[2] - ghost_width);
-            ib_final[5] = Mymin(PP->shape[2] - 1, ib_raw[5] + ghost_width);
-        }
-
-        sh_here[0] = ib_final[3] - ib_final[0] + 1;
-        sh_here[1] = ib_final[4] - ib_final[1] + 1;
-        sh_here[2] = ib_final[5] - ib_final[2] + 1;
-
-#ifdef Vertex
-        dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
-        bb_here[0] = PP->bbox[0] + ib_final[0] * dd;
-        bb_here[3] = PP->bbox[0] + ib_final[3] * dd;
-        dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
-        bb_here[1] = PP->bbox[1] + ib_final[1] * dd;
-        bb_here[4] = PP->bbox[1] + ib_final[4] * dd;
-        dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
-        bb_here[2] = PP->bbox[2] + ib_final[2] * dd;
-        bb_here[5] = PP->bbox[2] + ib_final[5] * dd;
-#else
-#ifdef Cell
-        dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
-        bb_here[0] = PP->bbox[0] + ib_final[0] * dd;
-        bb_here[3] = PP->bbox[0] + (ib_final[3] + 1) * dd;
-        dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
-        bb_here[1] = PP->bbox[1] + ib_final[1] * dd;
-        bb_here[4] = PP->bbox[1] + (ib_final[4] + 1) * dd;
-        dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
-        bb_here[2] = PP->bbox[2] + ib_final[2] * dd;
-        bb_here[5] = PP->bbox[2] + (ib_final[5] + 1) * dd;
-#endif
-#endif
-
-        Block* Bg = new Block(dim, sh_here, bb_here, target_rank, ingfsi, fngfsi, PP->lev);
-        if (BlL) BlL->insert(Bg);
-        else     BlL = new MyList<Block>(Bg);
-
-        return Bg;
-    };
-
-    split_first_block = createSubBlock(indices_L, r_left);
-    split_last_block  = createSubBlock(indices_R, r_right);
-    return split_last_block;
-}
-
-Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
-                                   int block_id, int ingfsi, int fngfsi, int lev)
-{
-    int target_rank = block_id;
-    if (INTERP_LB_NPROCS > 0) {
-      for (int ri = 0; ri < interp_lb_num_remaps; ri++) {
-        if (block_id == interp_lb_remaps[ri][0]) {
-          target_rank = interp_lb_remaps[ri][1];
-          break;
-        }
-      }
-    }
-
-    Block* ng = new Block(dim, shape, bbox, target_rank, ingfsi, fngfsi, lev);
-    if (BlL) BlL->insert(ng);
-    else     BlL = new MyList<Block>(ng);
-
-    return ng;
-}
-#else
-// When INTERP_LB_OPTIMIZE is not defined, distribute_optimize falls back to distribute
-MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
-                                    bool periodic, int nodes)
-{
-  return distribute(PatchLIST, cpusize, ingfsi, fngfsi, periodic, nodes);
-}
-Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
-                                 int ib0_orig, int ib3_orig,
-                                 int jb1_orig, int jb4_orig,
-                                 int kb2_orig, int kb5_orig,
-                                 Patch* PP, int r_left, int r_right,
-                                 int ingfsi, int fngfsi, bool periodic,
-                                 Block* &split_first_block, Block* &split_last_block)
-{ return nullptr; }
-Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
-                                   int block_id, int ingfsi, int fngfsi, int lev)
-{ return nullptr; }
-#endif
-
 #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
 MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
                                    bool periodic, int start_rank, int end_rank, int nodes)
@@ -3716,7 +3338,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
 {
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
+  // double time1 = omp_get_wtime();
  int DIM = dim;

  if (dir != PACK && dir != UNPACK)
@@ -3739,7 +3361,6 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
    varls = varls->next;
    varld = varld->next;
  }
-
  if (varls || varld)
  {
    cout << "error in short data packer, var lists does not match." << endl;
@@ -3753,7 +3374,6 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
    type = 2;
  else
    type = 3;
-
  while (src && dst)
  {
    if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
@@ -3763,6 +3383,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
      varld = VarListd;
      while (varls && varld)
      {
+
        if (data)
        {
          if (dir == PACK)
@@ -3783,6 +3404,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
              f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
                         dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
                         dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
+
            }
          if (dir == UNPACK) // from target data to corresponding grid
            f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
@@ -3796,8 +3418,14 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
    }
    dst = dst->next;
    src = src->next;
-  }

+  }
+  // double time2 = omp_get_wtime();
+  // xxx += time2 - time1;
+  // if(myrank == 0){
+  // printf("prolong3 time = %lf\n", time2 - time1);
+
+  // }
  return size_out;
 }
 int Parallel::data_packermix(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir,
@@ -3892,7 +3520,7 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  int node;
-
+  // double time1 = omp_get_wtime();
  MPI_Request *reqs;
  MPI_Status *stats;
  reqs = new MPI_Request[2 * cpusize];
@@ -3961,7 +3589,9 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
    if (rec_data[node])
      delete[] rec_data[node];
  }
-
+  // double time2 = omp_get_wtime();
+  // if (myrank == 0)
+  //   printf("transfer time = %lf\n", time2 - time1);
  delete[] reqs;
  delete[] stats;
  delete[] send_data;
@@ -5664,203 +5294,6 @@ void Parallel::OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
  delete[] transfer_src;
  delete[] transfer_dst;
 }
-
-// Restrict_cached: cache grid segment lists, reuse buffers via transfer_cached
-void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                               MyList<var> *VarList1, MyList<var> *VarList2,
-                               int Symmetry, SyncCache &cache)
-{
-  if (!cache.valid)
-  {
-    int cpusize;
-    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-    cache.cpusize = cpusize;
-
-    if (!cache.combined_src)
-    {
-      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
-      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
-      cache.send_lengths = new int[cpusize];
-      cache.recv_lengths = new int[cpusize];
-      cache.send_bufs = new double *[cpusize];
-      cache.recv_bufs = new double *[cpusize];
-      cache.send_buf_caps = new int[cpusize];
-      cache.recv_buf_caps = new int[cpusize];
-      for (int i = 0; i < cpusize; i++)
-      {
-        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
-        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
-      }
-      cache.max_reqs = 2 * cpusize;
-      cache.reqs = new MPI_Request[cache.max_reqs];
-      cache.stats = new MPI_Status[cache.max_reqs];
-    }
-
-    MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
-    for (int node = 0; node < cpusize; node++)
-    {
-      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatfL, node, 2, Symmetry);
-      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
-      if (src_owned) src_owned->destroyList();
-    }
-    if (dst) dst->destroyList();
-
-    cache.valid = true;
-  }
-
-  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
-}
-
-// OutBdLow2Hi_cached: cache grid segment lists, reuse buffers via transfer_cached
-void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                                  MyList<var> *VarList1, MyList<var> *VarList2,
-                                  int Symmetry, SyncCache &cache)
-{
-  if (!cache.valid)
-  {
-    int cpusize;
-    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-    cache.cpusize = cpusize;
-
-    if (!cache.combined_src)
-    {
-      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
-      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
-      cache.send_lengths = new int[cpusize];
-      cache.recv_lengths = new int[cpusize];
-      cache.send_bufs = new double *[cpusize];
-      cache.recv_bufs = new double *[cpusize];
-      cache.send_buf_caps = new int[cpusize];
-      cache.recv_buf_caps = new int[cpusize];
-      for (int i = 0; i < cpusize; i++)
-      {
-        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
-        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
-      }
-      cache.max_reqs = 2 * cpusize;
-      cache.reqs = new MPI_Request[cache.max_reqs];
-      cache.stats = new MPI_Status[cache.max_reqs];
-    }
-
-    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
-    for (int node = 0; node < cpusize; node++)
-    {
-      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
-      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
-      if (src_owned) src_owned->destroyList();
-    }
-    if (dst) dst->destroyList();
-
-    cache.valid = true;
-  }
-
-  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
-}
-
-// OutBdLow2Himix_cached: same as OutBdLow2Hi_cached but uses transfermix for unpacking
-void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                                     MyList<var> *VarList1, MyList<var> *VarList2,
-                                     int Symmetry, SyncCache &cache)
-{
-  if (!cache.valid)
-  {
-    int cpusize;
-    MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
-    cache.cpusize = cpusize;
-
-    if (!cache.combined_src)
-    {
-      cache.combined_src = new MyList<Parallel::gridseg> *[cpusize];
-      cache.combined_dst = new MyList<Parallel::gridseg> *[cpusize];
-      cache.send_lengths = new int[cpusize];
-      cache.recv_lengths = new int[cpusize];
-      cache.send_bufs = new double *[cpusize];
-      cache.recv_bufs = new double *[cpusize];
-      cache.send_buf_caps = new int[cpusize];
-      cache.recv_buf_caps = new int[cpusize];
-      for (int i = 0; i < cpusize; i++)
-      {
-        cache.send_bufs[i] = cache.recv_bufs[i] = 0;
-        cache.send_buf_caps[i] = cache.recv_buf_caps[i] = 0;
-      }
-      cache.max_reqs = 2 * cpusize;
-      cache.reqs = new MPI_Request[cache.max_reqs];
-      cache.stats = new MPI_Status[cache.max_reqs];
-    }
-
-    MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
-    for (int node = 0; node < cpusize; node++)
-    {
-      MyList<Parallel::gridseg> *src_owned = build_owned_gsl(PatcL, node, 4, Symmetry);
-      build_gstl(src_owned, dst, &cache.combined_src[node], &cache.combined_dst[node]);
-      if (src_owned) src_owned->destroyList();
-    }
-    if (dst) dst->destroyList();
-
-    cache.valid = true;
-  }
-
-  // Use transfermix instead of transfer for mix-mode interpolation
-  int myrank;
-  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  int cpusize = cache.cpusize;
-
-  int req_no = 0;
-  for (int node = 0; node < cpusize; node++)
-  {
-    if (node == myrank)
-    {
-      int length = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      cache.recv_lengths[node] = length;
-      if (length > 0)
-      {
-        if (length > cache.recv_buf_caps[node])
-        {
-          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
-          cache.recv_bufs[node] = new double[length];
-          cache.recv_buf_caps[node] = length;
-        }
-        data_packermix(cache.recv_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      }
-    }
-    else
-    {
-      int slength = data_packermix(0, cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      cache.send_lengths[node] = slength;
-      if (slength > 0)
-      {
-        if (slength > cache.send_buf_caps[node])
-        {
-          if (cache.send_bufs[node]) delete[] cache.send_bufs[node];
-          cache.send_bufs[node] = new double[slength];
-          cache.send_buf_caps[node] = slength;
-        }
-        data_packermix(cache.send_bufs[node], cache.combined_src[myrank], cache.combined_dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
-      }
-      int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
-      cache.recv_lengths[node] = rlength;
-      if (rlength > 0)
-      {
-        if (rlength > cache.recv_buf_caps[node])
-        {
-          if (cache.recv_bufs[node]) delete[] cache.recv_bufs[node];
-          cache.recv_bufs[node] = new double[rlength];
-          cache.recv_buf_caps[node] = rlength;
-        }
-        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + req_no++);
-      }
-    }
-  }
-
-  MPI_Waitall(req_no, cache.reqs, cache.stats);
-
-  for (int node = 0; node < cpusize; node++)
-    if (cache.recv_bufs[node] && cache.recv_lengths[node] > 0)
-      data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
-}
-
 // collect all buffer grid segments or blocks for given patch
 MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(Patch *Pat)
 {
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -32,16 +32,6 @@ namespace Parallel
  int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
  int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
  MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
-  MyList<Block> *distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0);
-  Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
-                           int ib0_orig, int ib3_orig,
-                           int jb1_orig, int jb4_orig,
-                           int kb2_orig, int kb5_orig,
-                           Patch* PP, int r_left, int r_right,
-                           int ingfsi, int fngfsi, bool periodic,
-                           Block* &split_first_block, Block* &split_last_block);
-  Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
-                           int block_id, int ingfsi, int fngfsi, int lev);
  void KillBlocks(MyList<Patch> *PatchLIST);

  void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
@@ -140,15 +130,6 @@ namespace Parallel
  void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                      int Symmetry);
-  void Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                       MyList<var> *VarList1, MyList<var> *VarList2,
-                       int Symmetry, SyncCache &cache);
-  void OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                          MyList<var> *VarList1, MyList<var> *VarList2,
-                          int Symmetry, SyncCache &cache);
-  void OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
-                             MyList<var> *VarList1, MyList<var> *VarList2,
-                             int Symmetry, SyncCache &cache);
  void Prolong(Patch *Patc, Patch *Patf,
               MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
               int Symmetry);
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -40,7 +40,7 @@ using namespace std;

 #include "derivatives.h"
 #include "ricci_gamma.h"
-
+#include "xh_bssn_rhs_compute.h"
 //================================================================================================

 // define bssn_class
@@ -2029,6 +2029,7 @@ void bssn_class::Read_Ansorg()
 void bssn_class::Evolve(int Steps)
 {
  clock_t prev_clock, curr_clock;
+  double prev_time, curr_time;
  double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
  LastAnas = 0;
 #if 0
@@ -2141,8 +2142,10 @@ void bssn_class::Evolve(int Steps)
    //     if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6) 
    //     { GH->levels=GH->movls; }

-    if (myrank == 0)
+    if (myrank == 0){
      curr_clock = clock();
+      curr_time = omp_get_wtime();
+    }
 #if (PSTR == 0)
    RecursiveStep(0);
 #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
@@ -2198,12 +2201,17 @@ void bssn_class::Evolve(int Steps)
    if (myrank == 0)
    {
      prev_clock = curr_clock;
+      prev_time = curr_time;
      curr_clock = clock();
+      curr_time = omp_get_wtime();
      cout << endl;
+      // cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime << "   "
+      //      << " Computer used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
+      //      << " seconds! " << endl;
+      // // cout << endl;
      cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime << "   "
-           << " Computer used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
-           << " seconds! " << endl;
-      // cout << endl;
+            << " Computer used " << (curr_time - prev_time) 
+            << " seconds! " << endl;
    }

    if (PhysTime >= TotalTime)
@@ -2426,9 +2434,9 @@ void bssn_class::RecursiveStep(int lev)
 #endif

 #if (REGLEV == 0)
-  if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
+  GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
-                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
+                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 }
@@ -2605,9 +2613,9 @@ void bssn_class::ParallelStep()
  delete[] tporg;
  delete[] tporgo;
 #if (REGLEV == 0)
-  if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
+  GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
-                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
+                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 }
@@ -2772,9 +2780,9 @@ void bssn_class::ParallelStep()
      if (lev + 1 >= GH->movls)
      {
        //	       GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0,
-        if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
+        GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
                            SynchList_cor, OldStateList, StateList, SynchList_pre,
-                            fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
+                            fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

        //               a_stream.clear();
@@ -2787,9 +2795,9 @@ void bssn_class::ParallelStep()
    // for this level
    if (YN == 1)
    {
-      if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
+      GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                          SynchList_cor, OldStateList, StateList, SynchList_pre,
-                          fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
+                          fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

      //               a_stream.clear();
@@ -2806,9 +2814,9 @@ void bssn_class::ParallelStep()
        if (YN == 1)
        {
          //	   GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
-          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
+          GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
-                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
+                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

          //               a_stream.clear();
@@ -2822,9 +2830,9 @@ void bssn_class::ParallelStep()
        if (i % 4 == 3)
        {
          //	   GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
-          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
+          GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
-                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
+                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

          //               a_stream.clear();
@@ -3092,7 +3100,7 @@ void bssn_class::Step(int lev, int YN)
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif

-        if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+        if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                               cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                               cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                               cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -3292,7 +3300,7 @@ void bssn_class::Step(int lev, int YN)
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
-
+          // cout<<"....................................."<<endl;
          // rk4 substep and boundary
          {
            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; 
@@ -3457,7 +3465,7 @@ void bssn_class::Step(int lev, int YN)
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif

-          if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+          if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                 cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                 cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -3970,7 +3978,7 @@ void bssn_class::Step(int lev, int YN)
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif

-        if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+        if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                               cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                               cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                               cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -4312,7 +4320,7 @@ void bssn_class::Step(int lev, int YN)
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif

-          if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+          if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                 cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                 cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -4848,7 +4856,7 @@ void bssn_class::Step(int lev, int YN)
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif

-        if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+        if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                               cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                               cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                               cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -5048,7 +5056,7 @@ void bssn_class::Step(int lev, int YN)
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif

-          if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+          if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                 cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                 cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -5819,11 +5827,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif

 #if (RPB == 0)
+      Ppc = GH->PatL[lev - 1];
+      while (Ppc)
+      {
+        Pp = GH->PatL[lev];
+        while (Pp)
+        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #endif
+          Pp = Pp->next;
+        }
+        Ppc = Ppc->next;
+      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5870,11 +5888,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif

 #if (RPB == 0)
+      Ppc = GH->PatL[lev - 1];
+      while (Ppc)
+      {
+        Pp = GH->PatL[lev];
+        while (Pp)
+        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
 #endif
+          Pp = Pp->next;
+        }
+        Ppc = Ppc->next;
+      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -5949,11 +5977,21 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);

 #if (RPB == 0)
+      Ppc = GH->PatL[lev - 1];
+      while (Ppc)
+      {
+        Pp = GH->PatL[lev];
+        while (Pp)
+        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #endif
+          Pp = Pp->next;
+        }
+        Ppc = Ppc->next;
+      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5971,11 +6009,21 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);

 #if (RPB == 0)
+      Ppc = GH->PatL[lev - 1];
+      while (Ppc)
+      {
+        Pp = GH->PatL[lev];
+        while (Pp)
+        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
 #endif
+          Pp = Pp->next;
+        }
+        Ppc = Ppc->next;
+      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -6036,11 +6084,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);

 #if (RPB == 0)
+      Ppc = GH->PatL[lev - 1];
+      while (Ppc)
+      {
+        Pp = GH->PatL[lev];
+        while (Pp)
+        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #endif
+          Pp = Pp->next;
+        }
+        Ppc = Ppc->next;
+      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6060,11 +6118,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);

 #if (RPB == 0)
+      Ppc = GH->PatL[lev - 1];
+      while (Ppc)
+      {
+        Pp = GH->PatL[lev];
+        while (Pp)
+        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #endif
+          Pp = Pp->next;
+        }
+        Ppc = Ppc->next;
+      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6101,11 +6169,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
      }

 #if (RPB == 0)
+      Ppc = GH->PatL[lev - 1];
+      while (Ppc)
+      {
+        Pp = GH->PatL[lev];
+        while (Pp)
+        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #endif
+          Pp = Pp->next;
+        }
+        Ppc = Ppc->next;
+      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6114,11 +6192,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
    else // no time refinement levels and for all same time levels
    {
 #if (RPB == 0)
+      Ppc = GH->PatL[lev - 1];
+      while (Ppc)
+      {
+        Pp = GH->PatL[lev];
+        while (Pp)
+        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #endif
+          Pp = Pp->next;
+        }
+        Ppc = Ppc->next;
+      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -7263,7 +7351,7 @@ void bssn_class::Constraint_Out()
            Block *cg = BP->data;
            if (myrank == cg->rank)
            {
-              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+              f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                 cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -7766,7 +7854,7 @@ void bssn_class::Interp_Constraint(bool infg)
            Block *cg = BP->data;
            if (myrank == cg->rank)
            {
-              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+              f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                 cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -8024,7 +8112,7 @@ void bssn_class::Compute_Constraint()
          Block *cg = BP->data;
          if (myrank == cg->rank)
          {
-            f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+            f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                               cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                               cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                               cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
@@ -106,38 +106,6 @@
  call getpbh(BHN,Porg,Mass)
 #endif

-!!! sanity check (disabled in production builds for performance)
-#ifdef DEBUG
-  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
-      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
-      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
-      +sum(Lap)+sum(betax)+sum(betay)+sum(betaz)
-  if(dX.ne.dX) then
-     if(sum(chi).ne.sum(chi))write(*,*)"bssn.f90: find NaN in chi"
-     if(sum(trK).ne.sum(trK))write(*,*)"bssn.f90: find NaN in trk"
-     if(sum(dxx).ne.sum(dxx))write(*,*)"bssn.f90: find NaN in dxx"
-     if(sum(gxy).ne.sum(gxy))write(*,*)"bssn.f90: find NaN in gxy"
-     if(sum(gxz).ne.sum(gxz))write(*,*)"bssn.f90: find NaN in gxz"
-     if(sum(dyy).ne.sum(dyy))write(*,*)"bssn.f90: find NaN in dyy"
-     if(sum(gyz).ne.sum(gyz))write(*,*)"bssn.f90: find NaN in gyz"
-     if(sum(dzz).ne.sum(dzz))write(*,*)"bssn.f90: find NaN in dzz"
-     if(sum(Axx).ne.sum(Axx))write(*,*)"bssn.f90: find NaN in Axx"
-     if(sum(Axy).ne.sum(Axy))write(*,*)"bssn.f90: find NaN in Axy"
-     if(sum(Axz).ne.sum(Axz))write(*,*)"bssn.f90: find NaN in Axz"
-     if(sum(Ayy).ne.sum(Ayy))write(*,*)"bssn.f90: find NaN in Ayy"
-     if(sum(Ayz).ne.sum(Ayz))write(*,*)"bssn.f90: find NaN in Ayz"
-     if(sum(Azz).ne.sum(Azz))write(*,*)"bssn.f90: find NaN in Azz"
-     if(sum(Gamx).ne.sum(Gamx))write(*,*)"bssn.f90: find NaN in Gamx"
-     if(sum(Gamy).ne.sum(Gamy))write(*,*)"bssn.f90: find NaN in Gamy"
-     if(sum(Gamz).ne.sum(Gamz))write(*,*)"bssn.f90: find NaN in Gamz"
-     if(sum(Lap).ne.sum(Lap))write(*,*)"bssn.f90: find NaN in Lap"
-     if(sum(betax).ne.sum(betax))write(*,*)"bssn.f90: find NaN in betax"
-     if(sum(betay).ne.sum(betay))write(*,*)"bssn.f90: find NaN in betay"
-     if(sum(betaz).ne.sum(betaz))write(*,*)"bssn.f90: find NaN in betaz"
-     gont = 1
-     return
-  endif
-#endif

  PI = dacos(-ONE)

@@ -634,7 +602,7 @@
  gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
  gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
  gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1
-! now get physical second kind of connection
+
  Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF
  Gamyxx = Gamyxx - (                     - gxx * gxxy )*HALF
  Gamzxx = Gamzxx - (                     - gxx * gxxz )*HALF
@@ -945,60 +913,103 @@
  SSA(2)=SYM
  SSA(3)=ANTI

-!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
-! lopsided_kodis shares the symmetry_bd buffer between advection and
-! dissipation, eliminating redundant full-grid copies. For metric variables
-! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
-! so the constant offset has no effect on dissipation.
+!!!!!!!!!advection term part

-  call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
-  call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
-  call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
-  call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
-  call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
-  call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS)
+  call lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS)
+  call lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA)
+  call lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS)
+  call lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA)
+  call lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS)

-  call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
-  call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
-  call lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
-  call lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
-  call lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
-  call lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS)
+  call lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS)
+  call lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA)
+  call lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS)
+  call lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA)
+  call lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS)

-  call lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps)
-  call lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS)
+  call lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS)

-  call lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
-  call lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
-  call lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
-
-#if 1 
-!! bam does not apply dissipation on gauge variables
-  call lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps)
-#if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
-  call lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps)
-  call lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps)
-  call lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
-#endif
-#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
-  call lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
-  call lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
-  call lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
-#endif
-#else
-! No dissipation on gauge variables (advection only)
+  call lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS)
+  call lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS)
+  call lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA)
+!!
  call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS)
+
 #if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
  call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS)
  call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS)
  call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA)
 #endif
+
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
  call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS)
  call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS)
  call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA)
 #endif
+
+  if(eps>0)then 
+! usual Kreiss-Oliger dissipation      
+  call kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps)
+  call kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps)
+  call kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps)
+#if 0
+#define i 42
+#define j 40
+#define k 40
+if(Lev == 1)then
+write(*,*) X(i),Y(j),Z(k)
+write(*,*) "before",Axx_rhs(i,j,k)
+endif
+#undef i
+#undef j
+#undef k
+!!stop
 #endif
+  call kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps)
+#if 0
+#define i 42
+#define j 40
+#define k 40
+if(Lev == 1)then
+write(*,*) X(i),Y(j),Z(k)
+write(*,*) "after",Axx_rhs(i,j,k)
+endif
+#undef i
+#undef j
+#undef k
+!!stop
+#endif
+  call kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps)
+  call kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps)
+  call kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps)
+
+#if 1 
+!! bam does not apply dissipation on gauge variables
+  call kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps)
+#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
+  call kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps)
+  call kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps)
+#endif
+#endif
+
+  endif

  if(co == 0)then
 ! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho
--- a/AMSS_NCKU_source/bssn_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_rhs_c.C
--- a/AMSS_NCKU_source/cgh.C
+++ b/AMSS_NCKU_source/cgh.C
@@ -130,11 +130,7 @@ void cgh::compose_cgh(int nprocs)
  for (int lev = 0; lev < levels; lev++)
  {
    checkPatchList(PatL[lev], false);
-#ifdef INTERP_LB_OPTIMIZE
-    Parallel::distribute_optimize(PatL[lev], nprocs, ingfs, fngfs, false);
-#else
    Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false);
-#endif
 #if (RPB == 1)
    // we need distributed box of PatL[lev] and PatL[lev-1]
    if (lev > 0)
@@ -1305,13 +1301,13 @@ bool cgh::Interp_One_Point(MyList<var> *VarList,
 }


-bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
+void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
                          MyList<var> *OldList, MyList<var> *StateList,
                          MyList<var> *FutureList, MyList<var> *tmList, bool BB,
                          monitor *ErrorMonitor)
 {
  if (lev < movls)
-    return false;
+    return;

 #if (0)
  // #if (PSTR == 1 || PSTR == 2)
@@ -1400,7 +1396,7 @@ bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
      for (bhi = 0; bhi < BH_num; bhi++)
        delete[] tmpPorg[bhi];
      delete[] tmpPorg;
-      return false;
+      return;
    }
    // x direction
    rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
@@ -1504,7 +1500,6 @@ bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
  for (int bhi = 0; bhi < BH_num; bhi++)
    delete[] tmpPorg[bhi];
  delete[] tmpPorg;
-  return tot_flag;
 }


--- a/AMSS_NCKU_source/cgh.h
+++ b/AMSS_NCKU_source/cgh.h
@@ -74,7 +74,7 @@ public:
                               MyList<var> *OldList, MyList<var> *StateList,
                               MyList<var> *FutureList, MyList<var> *tmList,
                               int Symmetry, bool BB);
-   bool Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
+   void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
                        MyList<var> *OldList, MyList<var> *StateList,
                        MyList<var> *FutureList, MyList<var> *tmList, bool BB,
                        monitor *ErrorMonitor);
--- a/AMSS_NCKU_source/diff_new.f90
+++ b/AMSS_NCKU_source/diff_new.f90
@@ -69,12 +69,10 @@
  fy = ZEO
  fz = ZEO

-!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
-!DIR$ UNROLL PARTIAL(4)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
-! x direction
+! x direction   
        if(i+1 <= imax .and. i-1 >= imin)then
 !
 !              - f(i-1) + f(i+1)
@@ -373,8 +371,6 @@
  fxz = ZEO
  fyz = ZEO

-!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
-!DIR$ UNROLL PARTIAL(4)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
--- a/AMSS_NCKU_source/extention/include/xh_bssn_rhs_compute.h
+++ b/AMSS_NCKU_source/extention/include/xh_bssn_rhs_compute.h
@@ -0,0 +1,26 @@
+#include "xh_macrodef.h"
+#include "xh_tool.h"
+int f_compute_rhs_bssn(int *ex, double &T,  // NOTE(review): call sites changed in this commit invoke f_compute_rhs_bssn_xh; confirm how this declared name maps to that symbol
+                       double *X, double *Y, double *Z,
+                       double *chi, double *trK,
+                       double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
+                       double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
+                       double *Gamx, double *Gamy, double *Gamz,
+                       double *Lap, double *betax, double *betay, double *betaz,
+                       double *dtSfx, double *dtSfy, double *dtSfz,
+                       double *chi_rhs, double *trK_rhs,  // *_rhs arguments: presumably the computed right-hand sides (outputs) — TODO confirm
+                       double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
+                       double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
+                       double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
+                       double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
+                       double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
+                       double *rho, double *Sx, double *Sy, double *Sz,
+                       double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
+                       double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
+                       double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
+                       double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
+                       double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
+                       double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,  // *_Res arguments: presumably constraint residuals — TODO confirm
+                       double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
+                       int &Symmetry, int &Lev, double &eps, int &co  // scalars are taken by C++ reference, so this header is C++-only
+                       ); 
--- a/AMSS_NCKU_source/extention/include/xh_macrodef.h
+++ b/AMSS_NCKU_source/extention/include/xh_macrodef.h
@@ -0,0 +1,66 @@
+/* tetrad notes
+   v:r; u: phi; w: theta
+
+   tetradtype 0
+   v^a = (x,y,z)
+   orthonormal order: v,u,w
+   m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of  PRD 75, 124018(2007)
+
+   tetradtype 1
+   orthonormal order: w,u,v
+   m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of  PRD 85, 124062(2012)
+
+   tetradtype 2
+   v_a = (x,y,z)
+   orthonormal order: v,u,w
+   m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of  PRD 75, 124018(2007)
+*/
+#define tetradtype 2
+
+/* Cell center or Vertex center */
+#define Cell
+
+/* ghost_width meaning:
+   2nd order: 2
+   4th order: 3
+   6th order: 4
+   8th order: 5
+*/
+#define ghost_width 3
+
+/* use shell or not */
+#define WithShell
+
+/* Use the constraint-preserving boundary condition (CPBC) or not.
+   This only affects Z4c.
+*/
+#define CPBC
+
+/* Gauge condition type
+   0: B^i gauge
+   1: David's puncture gauge
+   2: MB B^i gauge
+   3: RIT B^i gauge
+   4: MB beta gauge (here "beta gauge" does not mean Eq.(3) of PRD 84, 124006)
+   5: RIT beta gauge (here "beta gauge" does not mean Eq.(3) of PRD 84, 124006)
+   6: MGB1 B^i gauge
+   7: MGB2 B^i gauge
+*/
+#define GAUGE 2
+
+/* buffer points for CPBC boundary */
+#define CPBC_ghost_width (ghost_width)
+
+/* using BSSN variable for constraint violation and psi4 calculation: 0
+   using ADM variable for constraint violation and psi4 calculation: 1
+*/
+#define ABV 0
+
+/* Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
+   1: Case C of 1112.3928, V=0
+   2: shell with a2^2*phi0/(1+a2^2), f(R) = R+a2*R^2 induced V
+   3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
+   4: a2 = infinity and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
+   5: shell with phi(r) = phi0*Exp(-(r-r0)**2/sigma), V = 0
+*/
+#define EScalar_CC 2
--- a/AMSS_NCKU_source/extention/include/xh_share_func.h
+++ b/AMSS_NCKU_source/extention/include/xh_share_func.h
@@ -0,0 +1,338 @@
+#ifndef SHARE_FUNC_H
+#define SHARE_FUNC_H
+
+#include <stdlib.h>
+#include <stddef.h>
+#include <math.h>
+#include <stdio.h>
+#include <omp.h>
+/* Main grid: flatten 0-based (i0,j0,k0) into a 1D offset with i0 varying fastest. */
+static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
+    const int ex1 = ex[0], ex2 = ex[1];  /* ex[2] is not needed to form the offset */
+    return (size_t)i0 + (size_t)j0 * (size_t)ex1 + (size_t)k0 * (size_t)ex1 * (size_t)ex2;
+}
+
+/*
+ * fh mirrors the Fortran array fh(-1:ex1, -1:ex2, -1:ex3)
+ * ord=2 => shift=1
+ * iF/jF/kF are Fortran indices (may be -1, 0, 1..ex); returns the 1D offset into fh
+ */
+static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
+    const int shift = 1;
+    const int nx = ex[0] + 2;      // ex1 + ord
+    const int ny = ex[1] + 2;
+
+    const int ii = iF + shift;     // 0..ex1+1
+    const int jj = jF + shift;     // 0..ex2+1
+    const int kk = kF + shift;     // 0..ex3+1
+
+    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
+}
+
+/*
+ * fh mirrors the Fortran array fh(-2:ex1, -2:ex2, -2:ex3)
+ * ord=3 => shift=2
+ * iF/jF/kF are Fortran indices (may be negative); returns the 1D offset into fh
+ */
+static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
+    const int shift = 2;                 // ord=3 -> -2..ex
+    const int nx = ex[0] + 3;            // ex1 + ord
+    const int ny = ex[1] + 3;
+
+    const int ii = iF + shift;           // 0..ex1+2
+    const int jj = jF + shift;           // 0..ex2+2
+    const int kk = kF + shift;           // 0..ex3+2
+
+    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
+}
+
+/*
+ * func:  (1..extc1, 1..extc2, 1..extc3)   1-based in Fortran
+ * funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran
+ *
+ * On the C side we treat:
+ *   func  as 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1
+ *   funcc as a 1D array addressed through shifted indices:
+ *     iF in [-ord+1..extc1]  -> ii = iF + (ord-1)  in [0..extc1+ord-1]
+ *     total length nx = extc1 + ord
+ *     likewise ny = extc2 + ord, nz = extc3 + ord
+ */
+
+static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
+    const int nx = extc[0], ny = extc[1];  /* row lengths of the unpadded func array */
+    return (size_t)i0 + (size_t)j0 * (size_t)nx + (size_t)k0 * (size_t)nx * (size_t)ny;
+}
+
+static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
+    const int shift = ord - 1;          // iF = -shift .. extc1
+    const int nx = extc[0] + ord;       // [-shift..extc1] holds extc1+ord entries
+    const int ny = extc[1] + ord;
+
+    const int ii = iF + shift;          // 0..extc1+shift
+    const int jj = jF + shift;          // 0..extc2+shift
+    const int kk = kF + shift;          // 0..extc3+shift
+
+    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
+}
+
+/*
+ * Copy func into the padded array funcc and fill the ord low-side layers by mirroring; equivalent to the Fortran:
+ * funcc(1:extc1,1:extc2,1:extc3)=func
+ * do i=0,ord-1
+ *   funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
+ * enddo
+ * do i=0,ord-1
+ *   funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
+ * enddo
+ * do i=0,ord-1
+ *   funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
+ * enddo
+ */
+static inline void symmetry_bd(int ord,
+                 const int extc[3],
+                 const double *func,
+                 double *funcc,
+                 const double SoA[3])
+{
+    const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
+
+    // 1) funcc(1:extc1,1:extc2,1:extc3) = func
+    // Fortran iF=1..extc1 corresponds to the C index i0=0..extc1-1 of func
+    for (int k0 = 0; k0 < extc3; ++k0) {
+        for (int j0 = 0; j0 < extc2; ++j0) {
+            for (int i0 = 0; i0 < extc1; ++i0) {
+                const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
+                funcc[idx_funcc_F(iF, jF, kF, ord, extc)] = func[idx_func0(i0, j0, k0, extc)];
+            }
+        }
+    }
+
+    // 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
+    for (int ii = 0; ii <= ord - 1; ++ii) {
+        const int iF_dst = -ii;       // 0, -1, -2, ...
+        const int iF_src = ii + 1;    // 1, 2, 3, ...
+        for (int kF = 1; kF <= extc3; ++kF) {
+            for (int jF = 1; jF <= extc2; ++jF) {
+                funcc[idx_funcc_F(iF_dst, jF, kF, ord, extc)] =
+                    funcc[idx_funcc_F(iF_src, jF, kF, ord, extc)] * SoA[0];
+            }
+        }
+    }
+
+    // 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
+    // Note: the Fortran ":" here spans the full range iF = -ord+1..extc1 (layers filled in step 2 included)
+    for (int jj = 0; jj <= ord - 1; ++jj) {
+        const int jF_dst = -jj;
+        const int jF_src = jj + 1;
+        for (int kF = 1; kF <= extc3; ++kF) {
+            for (int iF = -ord + 1; iF <= extc1; ++iF) {
+                funcc[idx_funcc_F(iF, jF_dst, kF, ord, extc)] =
+                    funcc[idx_funcc_F(iF, jF_src, kF, ord, extc)] * SoA[1];
+            }
+        }
+    }
+
+    // 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
+    for (int kk = 0; kk <= ord - 1; ++kk) {
+        const int kF_dst = -kk;
+        const int kF_src = kk + 1;
+        for (int jF = -ord + 1; jF <= extc2; ++jF) {
+            for (int iF = -ord + 1; iF <= extc1; ++iF) {
+                funcc[idx_funcc_F(iF, jF, kF_dst, ord, extc)] =
+                    funcc[idx_funcc_F(iF, jF, kF_src, ord, extc)] * SoA[2];
+            }
+        }
+    }
+}
+#endif /* SHARE_FUNC_H */
+
+/*
+ * fdderivs_xh: second derivatives of fh at the single grid point (i0,j0,k0), 0-based.
+ * fh is read through idx_fh_F_ord2 with Fortran-style indices iF=i0+1, jF=j0+1, kF=k0+1;
+ * the six results are written to fxx,fxy,fxz,fyy,fyz,fzz at offset idx_ex(i0,j0,k0,ex).
+ *   - i/j/k +/-2 all inside [iminF..imaxF] etc.: 5-point stencils scaled by Fdxdx..Fdydz
+ *   - otherwise i/j/k +/-1 all inside:           3-point stencils scaled by Sdxdx..Sdydz
+ *   - otherwise:                                 all six outputs are set to 0.0
+ * Relies on idx_ex / idx_fh_F_ord2 being declared earlier in the translation unit.
+ * Has its own include guard so that including this header twice cannot redefine it.
+ */
+#ifndef XH_FDDERIVS_XH_DEFINED
+#define XH_FDDERIVS_XH_DEFINED
+static inline void fdderivs_xh(
+    int i0, int j0, int k0,
+    const int ex[3],
+    const double *fh,
+    int iminF, int jminF, int kminF,
+    int imaxF, int jmaxF, int kmaxF,
+    double Fdxdx, double Fdydy, double Fdzdz,
+    double Fdxdy, double Fdxdz, double Fdydz,
+    double Sdxdx, double Sdydy, double Sdzdz,
+    double Sdxdy, double Sdxdz, double Sdydz,
+    double *fxx, double *fxy, double *fxz,
+    double *fyy, double *fyz, double *fzz
+){
+    const double F8  = 8.0;
+    const double F16 = 16.0;
+    const double F30 = 30.0;
+    const double TWO = 2.0;
+
+    const int iF = i0 + 1;
+    const int jF = j0 + 1;
+    const int kF = k0 + 1;
+
+    const size_t p = idx_ex(i0, j0, k0, ex);
+
+    /* High-order branch: i+/-2, j+/-2, k+/-2 are all in range */
+    if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
+        (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
+        (kF + 2) <= kmaxF && (kF - 2) >= kminF)
+    {
+        fxx[p] = Fdxdx * (
+            -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
+             F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
+             F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+             fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
+             F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
+        );
+
+        fyy[p] = Fdydy * (
+            -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
+             F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
+             F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+             fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
+             F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
+        );
+
+        fzz[p] = Fdzdz * (
+            -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
+             F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
+             F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+             fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
+             F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
+        );
+
+        /* fxy, high-order: x-stencil evaluated on rows j-2, j-1, j+1, j+2, then combined in y */
+        {
+            const double t_jm2 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
+            const double t_jm1 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
+            const double t_jp1 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
+            const double t_jp2 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
+            fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
+        }
+
+        /* fxz, high-order: x-stencil evaluated on planes k-2, k-1, k+1, k+2, then combined in z */
+        {
+            const double t_km2 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
+            const double t_km1 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
+            const double t_kp1 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
+            const double t_kp2 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
+            fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
+        }
+
+        /* fyz, high-order: y-stencil evaluated on planes k-2, k-1, k+1, k+2, then combined in z */
+        {
+            const double t_km2 =
+                ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
+                 -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
+            const double t_km1 =
+                ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
+                 -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
+            const double t_kp1 =
+                ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
+                 -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
+            const double t_kp2 =
+                ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
+                 -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
+            fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
+        }
+    }
+    /* Second-order branch: i+/-1, j+/-1, k+/-1 are all in range */
+    else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
+             (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
+             (kF + 1) <= kmaxF && (kF - 1) >= kminF)
+    {
+        fxx[p] = Sdxdx * (
+            fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
+            TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
+            fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
+        );
+
+        fyy[p] = Sdydy * (
+            fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
+            TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
+            fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
+        );
+
+        fzz[p] = Sdzdz * (
+            fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
+            TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
+            fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
+        );
+
+        fxy[p] = Sdxdy * (
+            fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
+            fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
+            fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
+            fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
+        );
+
+        fxz[p] = Sdxdz * (
+            fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
+            fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
+            fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
+            fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
+        );
+
+        fyz[p] = Sdydz * (
+            fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
+            fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
+            fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
+            fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
+        );
+    }
+    else {
+        fxx[p] = 0.0; fyy[p] = 0.0; fzz[p] = 0.0;
+        fxy[p] = 0.0; fxz[p] = 0.0; fyz[p] = 0.0;
+    }
+}
+#endif /* XH_FDDERIVS_XH_DEFINED */
--- a/AMSS_NCKU_source/extention/include/xh_tool.h
+++ b/AMSS_NCKU_source/extention/include/xh_tool.h
@@ -1,4 +1,4 @@
-#include "share_func.h"
+#include "xh_share_func.h"
 void fdderivs(const int ex[3],
              const double *f,
              double *fxx, double *fxy, double *fxz,
@@ -24,10 +24,4 @@ void lopsided(const int ex[3],
              const double *X, const double *Y, const double *Z,
              const double *f, double *f_rhs,
              const double *Sfx, const double *Sfy, const double *Sfz,
-              int Symmetry, const double SoA[3]);
-
-void lopsided_kodis(const int ex[3],
-                    const double *X, const double *Y, const double *Z,
-                    const double *f, double *f_rhs,
-                    const double *Sfx, const double *Sfy, const double *Sfz,
-                    int Symmetry, const double SoA[3], double eps);
+              int Symmetry, const double SoA[3]);
--- a/AMSS_NCKU_source/extention/src/bssn_rhs
+++ b/AMSS_NCKU_source/extention/src/bssn_rhs
--- a/AMSS_NCKU_source/extention/src/bssn_rhs-fast.c
+++ b/AMSS_NCKU_source/extention/src/bssn_rhs-fast.c
--- a/AMSS_NCKU_source/extention/src/bssn_rhs-try.c
+++ b/AMSS_NCKU_source/extention/src/bssn_rhs-try.c
--- a/AMSS_NCKU_source/extention/src/fdderivs-fast.c
+++ b/AMSS_NCKU_source/extention/src/fdderivs-fast.c
@@ -1,4 +1,4 @@
-#include "tool.h"
+#include "../include/tool.h"
 void fdderivs(const int ex[3],
              const double *f,
              double *fxx, double *fxy, double *fxz,
@@ -8,7 +8,6 @@ void fdderivs(const int ex[3],
              int Symmetry, int onoff)
 {
    (void)onoff;
-
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
    const double F1o4   = 2.5e-1;          // 1/4
@@ -33,7 +32,6 @@ void fdderivs(const int ex[3],
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;

-    const double SoA[3] = { SYM1, SYM2, SYM3 };

    /* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
    const size_t nx = (size_t)ex1 + 2;
@@ -41,19 +39,6 @@ void fdderivs(const int ex[3],
    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;

-    static double *fh = NULL;
-    static size_t cap = 0;
-
-    if (fh_size > cap) {
-        free(fh);
-        fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
-        cap = fh_size;
-    }
-    // double *fh = (double*)malloc(fh_size * sizeof(double));
-    if (!fh) return;
-
-    symmetry_bd(2, ex, f, fh, SoA);
-
    /* 系数：按 Fortran 原式 */
    const double Sdxdx = ONE / (dX * dX);
    const double Sdydy = ONE / (dY * dY);
@@ -71,85 +56,208 @@ void fdderivs(const int ex[3],
    const double Fdxdz = F1o144 / (dX * dZ);
    const double Fdydz = F1o144 / (dY * dZ);

-    /* 只清零不被主循环覆盖的边界面 */
-    {
-        /* 高边界：k0=ex3-1 */
-        for (int j0 = 0; j0 < ex2; ++j0)
-            for (int i0 = 0; i0 < ex1; ++i0) {
-                const size_t p = idx_ex(i0, j0, ex3 - 1, ex);
-                fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
-                fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
-            }
-        /* 高边界：j0=ex2-1 */
-        for (int k0 = 0; k0 < ex3 - 1; ++k0)
-            for (int i0 = 0; i0 < ex1; ++i0) {
-                const size_t p = idx_ex(i0, ex2 - 1, k0, ex);
-                fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
-                fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
-            }
-        /* 高边界：i0=ex1-1 */
-        for (int k0 = 0; k0 < ex3 - 1; ++k0)
-            for (int j0 = 0; j0 < ex2 - 1; ++j0) {
-                const size_t p = idx_ex(ex1 - 1, j0, k0, ex);
-                fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
-                fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
-            }
+    static thread_local double *fh = NULL;
+    static thread_local size_t cap = 0;

-        /* 低边界：当二阶模板也不可用时，对应 i0/j0/k0=0 面 */
-        if (kminF == 1) {
-            for (int j0 = 0; j0 < ex2; ++j0)
-                for (int i0 = 0; i0 < ex1; ++i0) {
-                    const size_t p = idx_ex(i0, j0, 0, ex);
-                    fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
-                    fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
-                }
-        }
-        if (jminF == 1) {
-            for (int k0 = 0; k0 < ex3; ++k0)
-                for (int i0 = 0; i0 < ex1; ++i0) {
-                    const size_t p = idx_ex(i0, 0, k0, ex);
-                    fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
-                    fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
-                }
-        }
-        if (iminF == 1) {
-            for (int k0 = 0; k0 < ex3; ++k0)
-                for (int j0 = 0; j0 < ex2; ++j0) {
-                    const size_t p = idx_ex(0, j0, k0, ex);
-                    fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
-                    fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
-                }
+    if (fh_size > cap) {
+        free(fh);
+        fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
+        cap = fh_size;
+    }
+    // double *fh = (double*)malloc(fh_size * sizeof(double));
+    if (!fh) return;
+
+    // symmetry_bd(2, ex, f, fh, SoA);
+    const double SoA[3] = { SYM1, SYM2, SYM3 };
+
+    for (int k0 = 0; k0 < ex[2]; ++k0) {
+        for (int j0 = 0; j0 < ex[1]; ++j0) {
+            for (int i0 = 0; i0 < ex[0]; ++i0) {
+                const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; 
+                fh[idx_funcc_F(iF, jF, kF, 2, ex)] = f[idx_func0(i0, j0, k0, ex)];
+            }
        }
    }

+    // 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
+    for (int ii = 0; ii <= 2 - 1; ++ii) {
+        const int iF_dst = -ii;       // 0, -1, -2, ...
+        const int iF_src = ii + 1;    // 1, 2, 3, ...
+        for (int kF = 1; kF <= ex[2]; ++kF) {
+            for (int jF = 1; jF <= ex[1]; ++jF) {
+                fh[idx_funcc_F(iF_dst, jF, kF, 2, ex)] = 
+                    fh[idx_funcc_F(iF_src, jF, kF, 2, ex)] * SoA[0];
+            }
+        }
+    }
+
+    // 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
+    // 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖
+    for (int jj = 0; jj <= 2 - 1; ++jj) {
+        const int jF_dst = -jj;
+        const int jF_src = jj + 1;
+        for (int kF = 1; kF <= ex[2]; ++kF) {
+            for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
+                fh[idx_funcc_F(iF, jF_dst, kF, 2, ex)] =
+                    fh[idx_funcc_F(iF, jF_src, kF, 2, ex)] * SoA[1];
+            }
+        }
+    }
+
+    // 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
+    for (int kk = 0; kk <= 2 - 1; ++kk) {
+        const int kF_dst = -kk;
+        const int kF_src = kk + 1;
+        for (int jF = -2 + 1; jF <= ex[1]; ++jF) {
+            for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
+                fh[idx_funcc_F(iF, jF, kF_dst, 2, ex)] =
+                    fh[idx_funcc_F(iF, jF, kF_src, 2, ex)] * SoA[2];
+            }
+        }
+    }
+    /* 输出清零：fxx,fyy,fzz,fxy,fxz,fyz = 0 */
+    // const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
+    // for (size_t p = 0; p < all; ++p) {
+    //     fxx[p] = ZEO; fyy[p] = ZEO; fzz[p] = ZEO;
+    //     fxy[p] = ZEO; fxz[p] = ZEO; fyz[p] = ZEO;
+    // }
+
    /*
-     * 两段式：
-     * 1) 二阶可用区域先计算二阶模板
-     * 2) 高阶可用区域再覆盖四阶模板
+     * Fortran:
+     * do k=1,ex3-1
+     * do j=1,ex2-1
+     * do i=1,ex1-1
     */
-    const int i2_lo = (iminF > 0) ? iminF : 0;
-    const int j2_lo = (jminF > 0) ? jminF : 0;
-    const int k2_lo = (kminF > 0) ? kminF : 0;
-    const int i2_hi = ex1 - 2;
-    const int j2_hi = ex2 - 2;
-    const int k2_hi = ex3 - 2;
+    
+    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
+        const int kF = k0 + 1;
+        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
+            const int jF = j0 + 1;
+            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
+                const int iF = i0 + 1;
+                const size_t p = idx_ex(i0, j0, k0, ex);

-    const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
-    const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
-    const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
-    const int i4_hi = ex1 - 3;
-    const int j4_hi = ex2 - 3;
-    const int k4_hi = ex3 - 3;
+                /* 高阶分支：i±2,j±2,k±2 都在范围内 */
+                if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
+                    (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
+                    (kF + 2) <= kmaxF && (kF - 2) >= kminF)
+                {
+                    fxx[p] = Fdxdx * (
+                        -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
+                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
+                    );

-    if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
-        for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
-            const int kF = k0 + 1;
-            for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
-                const int jF = j0 + 1;
-                for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
-                    const int iF = i0 + 1;
-                    const size_t p = idx_ex(i0, j0, k0, ex);
+                    fyy[p] = Fdydy * (
+                        -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
+                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
+                    );

+                    fzz[p] = Fdzdz * (
+                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
+                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
+                    );
+
+                    /* fxy 高阶：完全照搬 Fortran 的括号结构 */
+                    {
+                        const double t_jm2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
+
+                        const double t_jm1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
+
+                        const double t_jp1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
+
+                        const double t_jp2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
+
+                        fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
+                    }
+
+                    /* fxz 高阶 */
+                    {
+                        const double t_km2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
+
+                        const double t_km1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
+
+                        const double t_kp1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
+
+                        const double t_kp2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
+
+                        fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
+                    }
+
+                    /* fyz 高阶 */
+                    {
+                        const double t_km2 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
+
+                        const double t_km1 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
+
+                        const double t_kp1 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
+
+                        const double t_kp2 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
+
+                        fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
+                    }
+                }
+                /* 二阶分支：i±1,j±1,k±1 在范围内 */
+                else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
+                         (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
+                         (kF + 1) <= kmaxF && (kF - 1) >= kminF)
+                {
                    fxx[p] = Sdxdx * (
                        fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
@@ -188,131 +296,16 @@ void fdderivs(const int ex[3],
                        fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
                        fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
                    );
+                }else{
+                    fxx[p] = 0.0;
+                    fyy[p] = 0.0;
+                    fzz[p] = 0.0;
+                    fxy[p] = 0.0;
+                    fxz[p] = 0.0;
+                    fyz[p] = 0.0;
                }
            }
        }
    }
-
-    if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
-        for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
-            const int kF = k0 + 1;
-            for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
-                const int jF = j0 + 1;
-                for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
-                    const int iF = i0 + 1;
-                    const size_t p = idx_ex(i0, j0, k0, ex);
-
-                    fxx[p] = Fdxdx * (
-                        -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
-                        F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
-                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
-                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
-                        F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
-                    );
-
-                    fyy[p] = Fdydy * (
-                        -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
-                        F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
-                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
-                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
-                        F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
-                    );
-
-                    fzz[p] = Fdzdz * (
-                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
-                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
-                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
-                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
-                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
-                    );
-
-                    {
-                        const double t_jm2 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
-
-                        const double t_jm1 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
-
-                        const double t_jp1 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
-
-                        const double t_jp2 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
-
-                        fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
-                    }
-
-                    {
-                        const double t_km2 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
-
-                        const double t_km1 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
-
-                        const double t_kp1 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
-
-                        const double t_kp2 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
-
-                        fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
-                    }
-
-                    {
-                        const double t_km2 =
-                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
-                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
-
-                        const double t_km1 =
-                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
-                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
-
-                        const double t_kp1 =
-                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
-                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
-
-                        const double t_kp2 =
-                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
-                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
-
-                        fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
-                    }
-                }
-            }
-        }
-    }
-
    // free(fh);
-}
+}
--- a/AMSS_NCKU_source/extention/src/main.c
+++ b/AMSS_NCKU_source/extention/src/main.c
@@ -0,0 +1,7 @@
+#include "include/bssn_rhs_compute.h"
+
+int main(void) {
+    /* Placeholder for test code that calls f_compute_rhs_bssn to check its correctness: */
+    /* e.g. set up a small grid with initial data, call it, and check the output is sane. */
+    return 0;
+}
--- a/AMSS_NCKU_source/extention/src/new.c
+++ b/AMSS_NCKU_source/extention/src/new.c
@@ -0,0 +1,65 @@
+        SoA[0] = SYM, SoA[1] = SYM, SoA[2] = SYM;  // NOTE(review): bare fragment; SoA, SYM, fh, Lap, ex come from a scope not shown here
+        #pragma omp for collapse(3)
+        for (int k0 = 0; k0 < ex[2]; ++k0) {
+            for (int j0 = 0; j0 < ex[1]; ++j0) {
+                for (int i0 = 0; i0 < ex[0]; ++i0) {
+                    const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; // Fortran-style (1-based) indices
+                    fh[idx_funcc_F(iF, jF, kF, 2, ex)] = Lap[idx_func0(i0, j0, k0, ex)]; // 1) copy Lap into the interior of fh
+                }
+            }
+        }
+
+        // 2) Fortran: do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)   (ord = 2 here)
+        #pragma omp for collapse(3)
+        for (int ii = 0; ii <= 2 - 1; ++ii) { // NOTE(review): statements sit between the collapsed loop headers; confirm the OpenMP version in use accepts this
+            const int iF_dst = -ii;       // ghost index written: 0, -1
+            const int iF_src = ii + 1;    // interior index read: 1, 2
+            for (int kF = 1; kF <= ex[2]; ++kF) {
+                for (int jF = 1; jF <= ex[1]; ++jF) {
+                    fh[idx_funcc_F(iF_dst, jF, kF, 2, ex)] = 
+                        fh[idx_funcc_F(iF_src, jF, kF, 2, ex)] * SoA[0];
+                }
+            }
+        }
+
+        // 3) Fortran: do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
+        // Note: the Fortran ":" here covers the full range iF = -ord+1..extc1, ghost layers included
+        #pragma omp for collapse(3)
+        for (int jj = 0; jj <= 2 - 1; ++jj) {
+            const int jF_dst = -jj; // ghost index written: 0, -1
+            const int jF_src = jj + 1; // interior index read: 1, 2
+            for (int kF = 1; kF <= ex[2]; ++kF) {
+                for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
+                    fh[idx_funcc_F(iF, jF_dst, kF, 2, ex)] =
+                        fh[idx_funcc_F(iF, jF_src, kF, 2, ex)] * SoA[1];
+                }
+            }
+        }
+
+        // 4) Fortran: do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)   (full i and j ranges, ghost layers included)
+        #pragma omp for collapse(3)
+        for (int kk = 0; kk <= 2 - 1; ++kk) {
+            const int kF_dst = -kk; // ghost index written: 0, -1
+            const int kF_src = kk + 1; // interior index read: 1, 2
+            for (int jF = -2 + 1; jF <= ex[1]; ++jF) {
+                for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
+                    fh[idx_funcc_F(iF, jF, kF_dst, 2, ex)] =
+                        fh[idx_funcc_F(iF, jF, kF_src, 2, ex)] * SoA[2];
+                }
+            }
+        }
+
+        #pragma omp for collapse(3)
+        for (int k0 = 0; k0 <= ex3 - 2; ++k0) { // 5) visit every point except the last plane in each direction
+            const int kF = k0 + 1; // not used in the visible loop body
+            for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
+                const int jF = j0 + 1; // not used in the visible loop body
+                for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
+                    fdderivs_xh(i0, j0, k0, ex, fh, iminF, jminF, kminF, ex1, ex2, ex3, // presumably the per-point second-derivative kernel; defined elsewhere
+                        Fdxdx, Fdydy, Fdzdz, Fdxdy, Fdxdz, Fdydz,
+                        Sdxdx, Sdydy, Sdzdz, Sdxdy, Sdxdz, Sdydz,
+                            fxx,fxy,fxz,fyy,fyz,fzz
+                    );
+                }
+            }
+        }
--- a/AMSS_NCKU_source/extention/src/xh_bssn_rhs.c
+++ b/AMSS_NCKU_source/extention/src/xh_bssn_rhs.c
--- a/AMSS_NCKU_source/extention/src/xh_fdderivs.c
+++ b/AMSS_NCKU_source/extention/src/xh_fdderivs.c
@@ -0,0 +1,311 @@
+#include "xh_tool.h"
+/*
+ * fdderivs: second derivatives (fxx, fxy, fxz, fyy, fyz, fzz) of the grid function f;
+ * a C port of the Fortran routine it mirrors. ex[] holds the three grid extents.
+ * X, Y, Z are coordinate arrays; their first two entries give the grid spacing.
+ * SYM1..SYM3 are the factors applied when f is mirrored into the ghost layers,
+ * Symmetry decides which lower faces may use those layers, and onoff is unused.
+ * A five-point stencil is used where +-2 neighbours are in range, a three-point one where
+ * only +-1 are, and 0 elsewhere. Returns without writing if the scratch buffer is unavailable.
+ */
+void fdderivs(const int ex[3],
+              const double *f,
+              double *fxx, double *fxy, double *fxz,
+              double *fyy, double *fyz, double *fzz,
+              const double *X, const double *Y, const double *Z,
+              double SYM1, double SYM2, double SYM3,
+              int Symmetry, int onoff)
+{
+    (void)onoff;
+    const int NO_SYMM = 0, EQ_SYMM = 1;
+    const double ONE = 1.0, TWO = 2.0;
+    const double F1o4   = 2.5e-1;          // 1/4
+    const double F8     = 8.0;
+    const double F16    = 16.0;
+    const double F30    = 30.0;
+    const double F1o12  = ONE / 12.0;
+    const double F1o144 = ONE / 144.0;
+
+    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
+
+    const double dX = X[1] - X[0];
+    const double dY = Y[1] - Y[0];
+    const double dZ = Z[1] - Z[0];
+
+    const int imaxF = ex1;
+    const int jmaxF = ex2;
+    const int kmaxF = ex3;
+
+    int iminF = 1, jminF = 1, kminF = 1;
+    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
+    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
+    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
+
+    /* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
+    const size_t nx = (size_t)ex1 + 2;
+    const size_t ny = (size_t)ex2 + 2;
+    const size_t nz = (size_t)ex3 + 2;
+    const size_t fh_size = nx * ny * nz;
+
+    /* Coefficients, written as in the Fortran original */
+    const double Sdxdx = ONE / (dX * dX);
+    const double Sdydy = ONE / (dY * dY);
+    const double Sdzdz = ONE / (dZ * dZ);
+
+    const double Fdxdx = F1o12 / (dX * dX);
+    const double Fdydy = F1o12 / (dY * dY);
+    const double Fdzdz = F1o12 / (dZ * dZ);
+
+    const double Sdxdy = F1o4 / (dX * dY);
+    const double Sdxdz = F1o4 / (dX * dZ);
+    const double Sdydz = F1o4 / (dY * dZ);
+
+    const double Fdxdy = F1o144 / (dX * dY);
+    const double Fdxdz = F1o144 / (dX * dZ);
+    const double Fdydz = F1o144 / (dY * dZ);
+
+    static thread_local double *fh = NULL;
+    static thread_local size_t cap = 0;
+
+    /* Per-thread scratch buffer, kept between calls and regrown only when too small.
+     * The byte count is rounded up to a multiple of 64 as C11 aligned_alloc requires;
+     * cap is reset to 0 on failure so that a later call retries the allocation. */
+    if (fh_size > cap) {
+        free(fh);
+        fh = (double*)aligned_alloc(64, (fh_size * sizeof(double) + 63) & ~(size_t)63);
+        cap = fh ? fh_size : 0;
+    }
+    if (!fh) return;
+
+    // Inlined equivalent of the Fortran call symmetry_bd(2, ex, f, fh, SoA); step 1 copies f into the interior
+    const double SoA[3] = { SYM1, SYM2, SYM3 };
+
+    for (int k0 = 0; k0 < ex[2]; ++k0) {
+        for (int j0 = 0; j0 < ex[1]; ++j0) {
+            for (int i0 = 0; i0 < ex[0]; ++i0) {
+                const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
+                fh[idx_funcc_F(iF, jF, kF, 2, ex)] = f[idx_func0(i0, j0, k0, ex)];
+            }
+        }
+    }
+
+    // 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
+    for (int ii = 0; ii <= 2 - 1; ++ii) {
+        const int iF_dst = -ii;       // 0, -1
+        const int iF_src = ii + 1;    // 1, 2
+        for (int kF = 1; kF <= ex[2]; ++kF) {
+            for (int jF = 1; jF <= ex[1]; ++jF) {
+                fh[idx_funcc_F(iF_dst, jF, kF, 2, ex)] =
+                    fh[idx_funcc_F(iF_src, jF, kF, 2, ex)] * SoA[0];
+            }
+        }
+    }
+
+    // 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
+    // Note: the Fortran ":" here covers the full range iF = -ord+1..extc1
+    for (int jj = 0; jj <= 2 - 1; ++jj) {
+        const int jF_dst = -jj;
+        const int jF_src = jj + 1;
+        for (int kF = 1; kF <= ex[2]; ++kF) {
+            for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
+                fh[idx_funcc_F(iF, jF_dst, kF, 2, ex)] =
+                    fh[idx_funcc_F(iF, jF_src, kF, 2, ex)] * SoA[1];
+            }
+        }
+    }
+
+    // 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
+    for (int kk = 0; kk <= 2 - 1; ++kk) {
+        const int kF_dst = -kk;
+        const int kF_src = kk + 1;
+        for (int jF = -2 + 1; jF <= ex[1]; ++jF) {
+            for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
+                fh[idx_funcc_F(iF, jF, kF_dst, 2, ex)] =
+                    fh[idx_funcc_F(iF, jF, kF_src, 2, ex)] * SoA[2];
+            }
+        }
+    }
+
+    /* The Fortran routine zeroes all six outputs and then loops over 1..ex-1. Here every
+     * point is visited instead: the last plane in each direction fails both stencil
+     * tests and is set to 0 by the final else, so no output entry is left unwritten. */
+    for (int k0 = 0; k0 < ex3; ++k0) {
+        const int kF = k0 + 1;
+        for (int j0 = 0; j0 < ex2; ++j0) {
+            const int jF = j0 + 1;
+            for (int i0 = 0; i0 < ex1; ++i0) {
+                const int iF = i0 + 1;
+                const size_t p = idx_ex(i0, j0, k0, ex);
+
+                /* High-order branch: i+-2, j+-2, k+-2 are all in range */
+                if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
+                    (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
+                    (kF + 2) <= kmaxF && (kF - 2) >= kminF)
+                {
+                    fxx[p] = Fdxdx * (
+                        -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
+                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
+                    );
+
+                    fyy[p] = Fdydy * (
+                        -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
+                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
+                    );
+
+                    fzz[p] = Fdzdz * (
+                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
+                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
+                    );
+
+                    /* fxy, high order: keeps the Fortran bracket structure exactly */
+                    {
+                        const double t_jm2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
+
+                        const double t_jm1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
+
+                        const double t_jp1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
+
+                        const double t_jp2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
+
+                        fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
+                    }
+
+                    /* fxz, high order */
+                    {
+                        const double t_km2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
+
+                        const double t_km1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
+
+                        const double t_kp1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
+
+                        const double t_kp2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
+
+                        fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
+                    }
+
+                    /* fyz, high order */
+                    {
+                        const double t_km2 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
+
+                        const double t_km1 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
+
+                        const double t_kp1 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
+
+                        const double t_kp2 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
+
+                        fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
+                    }
+                }
+                /* Second-order branch: i+-1, j+-1, k+-1 are in range */
+                else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
+                         (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
+                         (kF + 1) <= kmaxF && (kF - 1) >= kminF)
+                {
+                    fxx[p] = Sdxdx * (
+                        fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
+                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
+                        fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
+                    );
+
+                    fyy[p] = Sdydy * (
+                        fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
+                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
+                        fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
+                    );
+
+                    fzz[p] = Sdzdz * (
+                        fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
+                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
+                        fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
+                    );
+
+                    fxy[p] = Sdxdy * (
+                        fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
+                        fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
+                        fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
+                        fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
+                    );
+
+                    fxz[p] = Sdxdz * (
+                        fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
+                        fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
+                        fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
+                        fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
+                    );
+
+                    fyz[p] = Sdydz * (
+                        fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
+                        fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
+                        fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
+                        fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
+                    );
+                }else{
+                    fxx[p] = 0.0;
+                    fyy[p] = 0.0;
+                    fzz[p] = 0.0;
+                    fxy[p] = 0.0;
+                    fxz[p] = 0.0;
+                    fyz[p] = 0.0;
+                }
+            }
+        }
+    }
+    // fh is deliberately not freed here: it stays cached for the next call on this thread.
+}
--- a/AMSS_NCKU_source/extention/src/xh_fderivs.c
+++ b/AMSS_NCKU_source/extention/src/xh_fderivs.c
@@ -1,4 +1,4 @@
-#include "tool.h"
+#include "xh_tool.h"

 /*
 * C 版 fderivs
@@ -32,11 +32,6 @@ void fderivs(const int ex[3],
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];

-    // Fortran 1-based bounds
-    const int imaxF = ex1;
-    const int jmaxF = ex2;
-    const int kmaxF = ex3;
-
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
@@ -50,8 +45,8 @@ void fderivs(const int ex[3],
    const size_t ny = (size_t)ex2 + 2;
    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;
-    static double *fh = NULL;
-    static size_t cap = 0;
+    static thread_local double *fh = NULL;
+    static thread_local size_t cap = 0;

    if (fh_size > cap) {
        free(fh);
@@ -81,63 +76,26 @@ void fderivs(const int ex[3],
    }

    /*
-     * 两段式：
-     * 1) 先在二阶可用区域计算二阶模板
-     * 2) 再在高阶可用区域覆盖为四阶模板
+     * Fortran loops:
+     * do k=1,ex3-1
+     * do j=1,ex2-1
+     * do i=1,ex1-1
     *
-     * 与原 if/elseif 逻辑等价，但减少逐点分支判断。
+     * C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
     */
-    const int i2_lo = (iminF > 0) ? iminF : 0;
-    const int j2_lo = (jminF > 0) ? jminF : 0;
-    const int k2_lo = (kminF > 0) ? kminF : 0;
-    const int i2_hi = ex1 - 2;
-    const int j2_hi = ex2 - 2;
-    const int k2_hi = ex3 - 2;
-
-    const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
-    const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
-    const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
-    const int i4_hi = ex1 - 3;
-    const int j4_hi = ex2 - 3;
-    const int k4_hi = ex3 - 3;
-
-    if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
-        for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
-            const int kF = k0 + 1;
-            for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
-                const int jF = j0 + 1;
-                for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
-                    const int iF = i0 + 1;
-                    const size_t p = idx_ex(i0, j0, k0, ex);
-
-                    fx[p] = d2dx * (
-                        -fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
-                         fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
-                    );
-
-                    fy[p] = d2dy * (
-                        -fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
-                         fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
-                    );
-
-                    fz[p] = d2dz * (
-                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
-                         fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
-                    );
-                }
-            }
-        }
-    }
-
-    if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
-        for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
-            const int kF = k0 + 1;
-            for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
-                const int jF = j0 + 1;
-                for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
-                    const int iF = i0 + 1;
-                    const size_t p = idx_ex(i0, j0, k0, ex);
+    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
+        const int kF = k0 + 1;
+        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
+            const int jF = j0 + 1;
+            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
+                const int iF = i0 + 1;
+                const size_t p = idx_ex(i0, j0, k0, ex);

+                // if(i+2 <= imax .and. i-2 >= imin ... )  (全是 Fortran 索引)
+                if ((iF + 2) <= ex1 && (iF - 2) >= iminF &&
+                    (jF + 2) <= ex2 && (jF - 2) >= jminF &&
+                    (kF + 2) <= ex3 && (kF - 2) >= kminF)
+                {
                    fx[p] = d12dx * (
                        fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] -
                        EIT * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
@@ -159,9 +117,29 @@ void fderivs(const int ex[3],
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)]
                    );
                }
+                // elseif(i+1 <= imax .and. i-1 >= imin ...)
+                else if ((iF + 1) <= ex1 && (iF - 1) >= iminF &&
+                         (jF + 1) <= ex2 && (jF - 1) >= jminF &&
+                         (kF + 1) <= ex3 && (kF - 1) >= kminF)
+                {
+                    fx[p] = d2dx * (
+                        -fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
+                         fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
+                    );
+
+                    fy[p] = d2dy * (
+                        -fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
+                         fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
+                    );
+
+                    fz[p] = d2dz * (
+                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
+                         fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
+                    );
+                }
            }
        }
    }

    // free(fh);
-}
+}
--- a/AMSS_NCKU_source/extention/src/xh_kodiss.c
+++ b/AMSS_NCKU_source/extention/src/xh_kodiss.c
@@ -1,4 +1,4 @@
-#include "tool.h"
+#include "xh_tool.h"

 /*
 * C 版 kodis
@@ -48,7 +48,14 @@ void kodis(const int ex[3],
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;

-    double *fh = (double*)malloc(fh_size * sizeof(double));
+    static thread_local double *fh = NULL;
+    static thread_local size_t cap = 0;
+
+    if (fh_size > cap) {
+        free(fh);
+        fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
+        cap = fh_size;
+    }
    if (!fh) return;

    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
@@ -63,28 +70,19 @@ void kodis(const int ex[3],
     * C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
     * 并定义 Fortran index: iF=i0+1, ...
     */
-    // 收紧循环范围：只遍历满足 iF±3/jF±3/kF±3 条件的内部点
-    // iF-3 >= iminF => iF >= iminF+3 => i0 >= iminF+2 (因为 iF=i0+1)
-    // iF+3 <= imaxF => iF <= imaxF-3 => i0 <= imaxF-4
-    const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
-    const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
-    const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
-    const int i0_hi = imaxF - 4;  // inclusive
-    const int j0_hi = jmaxF - 4;
-    const int k0_hi = kmaxF - 4;
-
-    if (i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi) {
-        free(fh);
-        return;
-    }
-
-    for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
+    for (int k0 = 0; k0 < ex3; ++k0) {
        const int kF = k0 + 1;
-        for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
+        for (int j0 = 0; j0 < ex2; ++j0) {
            const int jF = j0 + 1;
-            for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
+            for (int i0 = 0; i0 < ex1; ++i0) {
                const int iF = i0 + 1;

+                // Fortran if 条件：
+                // i-3 >= imin .and. i+3 <= imax  等（都是 Fortran 索引）
+                if ((iF - 3) >= iminF && (iF + 3) <= imaxF &&
+                    (jF - 3) >= jminF && (jF + 3) <= jmaxF &&
+                    (kF - 3) >= kminF && (kF + 3) <= kmaxF)
+                {
                    const size_t p = idx_ex(i0, j0, k0, ex);

                    // 三个方向各一份同型的 7 点组合（实际上是对称的 6th-order dissipation/filter 核）
@@ -109,9 +107,10 @@ void kodis(const int ex[3],
                    // Fortran:
                    // f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
                    f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
+                }
            }
        }
    }

-    free(fh);
+    // free(fh);
 }
--- a/AMSS_NCKU_source/extention/src/xh_lopsided.c
+++ b/AMSS_NCKU_source/extention/src/xh_lopsided.c
@@ -1,4 +1,4 @@
-#include "tool.h"
+#include "xh_tool.h"
 /*
 * 你需要提供 symmetry_bd 的 C 版本（或 Fortran 绑到 C 的接口）。
 * Fortran: call symmetry_bd(3,ex,f,fh,SoA)
@@ -60,7 +60,14 @@ void lopsided(const int ex[3],
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;

-    double *fh = (double*)malloc(fh_size * sizeof(double));
+    static thread_local double *fh = NULL;
+    static thread_local size_t cap = 0;
+
+    if (fh_size > cap) {
+        free(fh);
+        fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
+        cap = fh_size;
+    }
    if (!fh) return; // 内存不足：直接返回（你也可以改成 abort/报错）

    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
@@ -246,7 +253,7 @@ void lopsided(const int ex[3],
            }
        }
    }
-    free(fh);
+    // free(fh);
 }


--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -883,17 +883,13 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)

  integer::i

-!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
-!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
   enddo
-!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
   do i=0,ord-1
      funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
   enddo
-!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
   do i=0,ord-1
      funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
   enddo
@@ -1111,177 +1107,26 @@ end subroutine d2dump
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-! common code for cell and vertex
-!------------------------------------------------------------------------------
-! Lagrangian polynomial interpolation
-!------------------------------------------------------------------------------
-#ifndef POLINT6_USE_BARYCENTRIC
-#define POLINT6_USE_BARYCENTRIC 1
-#endif
-
-!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
-  subroutine polint6_neville(xa, ya, x, y, dy)
-  implicit none
-
-  real*8, dimension(6), intent(in) :: xa, ya
-  real*8, intent(in) :: x
-  real*8, intent(out) :: y, dy
-
-  integer :: i, m, ns, n_m
-  real*8, dimension(6) :: c, d, ho
-  real*8 :: dif, dift, hp, h, den_val
-
-  c = ya
-  d = ya
-  ho = xa - x
-
-  ns = 1
-  dif = abs(x - xa(1))
-
-  do i = 2, 6
-    dift = abs(x - xa(i))
-    if (dift < dif) then
-      ns = i
-      dif = dift
-    end if
-  end do
-
-  y = ya(ns)
-  ns = ns - 1
-
-  do m = 1, 5
-    n_m = 6 - m
-    do i = 1, n_m
-      hp = ho(i)
-      h  = ho(i+m)
-      den_val = hp - h
-
-      if (den_val == 0.0d0) then
-        write(*,*) 'failure in polint for point',x
-        write(*,*) 'with input points: ',xa
-        stop
-      end if
-
-      den_val = (c(i+1) - d(i)) / den_val
-
-      d(i) = h * den_val
-      c(i) = hp * den_val
-    end do
-
-    if (2 * ns < n_m) then
-      dy = c(ns + 1)
-    else
-      dy = d(ns)
-      ns = ns - 1
-    end if
-    y = y + dy
-  end do
-
-  return
-  end subroutine polint6_neville
-
-!DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric
-  subroutine polint6_barycentric(xa, ya, x, y, dy)
-  implicit none
-
-  real*8, dimension(6), intent(in) :: xa, ya
-  real*8, intent(in) :: x
-  real*8, intent(out) :: y, dy
-
-  integer :: i, j
-  logical :: is_uniform
-  real*8, dimension(6) :: lambda
-  real*8 :: dx, den_i, term, num, den, step, tol
-  real*8, parameter :: c_uniform(6) = (/ -1.d0, 5.d0, -10.d0, 10.d0, -5.d0, 1.d0 /)
-
-  do i = 1, 6
-    if (x == xa(i)) then
-      y = ya(i)
-      dy = 0.d0
-      return
-    end if
-  end do
-
-  step = xa(2) - xa(1)
-  is_uniform = (step /= 0.d0)
-  if (is_uniform) then
-    tol = 64.d0 * epsilon(1.d0) * max(1.d0, abs(step))
-    do i = 3, 6
-      if (abs((xa(i) - xa(i-1)) - step) > tol) then
-        is_uniform = .false.
-        exit
-      end if
-    end do
-  end if
-
-  if (is_uniform) then
-    num = 0.d0
-    den = 0.d0
-    do i = 1, 6
-      term = c_uniform(i) / (x - xa(i))
-      num = num + term * ya(i)
-      den = den + term
-    end do
-    y = num / den
-    dy = 0.d0
-    return
-  end if
-
-  do i = 1, 6
-    den_i = 1.d0
-    do j = 1, 6
-      if (j /= i) then
-        dx = xa(i) - xa(j)
-        if (dx == 0.0d0) then
-          write(*,*) 'failure in polint for point',x
-          write(*,*) 'with input points: ',xa
-          stop
-        end if
-        den_i = den_i * dx
-      end if
-    end do
-    lambda(i) = 1.d0 / den_i
-  end do
-
-  num = 0.d0
-  den = 0.d0
-  do i = 1, 6
-    term = lambda(i) / (x - xa(i))
-    num = num + term * ya(i)
-    den = den + term
-  end do
-
-  y = num / den
-  dy = 0.d0
-
-  return
-  end subroutine polint6_barycentric
-
-!DIR$ ATTRIBUTES FORCEINLINE :: polint
-  subroutine polint(xa, ya, x, y, dy, ordn)
-  implicit none
-
-  integer, intent(in) :: ordn
+! common code for cell and vertex
+!------------------------------------------------------------------------------
+! Lagrangian polynomial interpolation
+!------------------------------------------------------------------------------
+
+  subroutine polint(xa, ya, x, y, dy, ordn)
+  implicit none
+
+  integer, intent(in) :: ordn
  real*8, dimension(ordn), intent(in) :: xa, ya
  real*8, intent(in) :: x
  real*8, intent(out) :: y, dy

-  integer :: i, m, ns, n_m
-  real*8, dimension(ordn) :: c, d, ho
-  real*8 :: dif, dift, hp, h, den_val
-
-  if (ordn == 6) then
-#if POLINT6_USE_BARYCENTRIC
-    call polint6_barycentric(xa, ya, x, y, dy)
-#else
-    call polint6_neville(xa, ya, x, y, dy)
-#endif
-    return
-  end if
-
-  c = ya
-  d = ya
-  ho = xa - x
+  integer :: i, m, ns, n_m
+  real*8, dimension(ordn) :: c, d, ho
+  real*8 :: dif, dift, hp, h, den_val
+
+  c = ya
+  d = ya
+  ho = xa - x

  ns = 1
  dif = abs(x - xa(1))
@@ -1325,48 +1170,13 @@ end subroutine d2dump
    y = y + dy
  end do

-  return
-  end subroutine polint
-!------------------------------------------------------------------------------
-! Compute Lagrange interpolation basis weights for one target point.
-!------------------------------------------------------------------------------
-!DIR$ ATTRIBUTES FORCEINLINE :: polint_lagrange_weights
-  subroutine polint_lagrange_weights(xa, x, w, ordn)
-  implicit none
-
-  integer, intent(in) :: ordn
-  real*8, dimension(1:ordn), intent(in) :: xa
-  real*8, intent(in) :: x
-  real*8, dimension(1:ordn), intent(out) :: w
-
-  integer :: i, j
-  real*8 :: num, den, dx
-
-  do i = 1, ordn
-    num = 1.d0
-    den = 1.d0
-    do j = 1, ordn
-      if (j /= i) then
-        dx = xa(i) - xa(j)
-        if (dx == 0.0d0) then
-          write(*,*) 'failure in polint for point',x
-          write(*,*) 'with input points: ',xa
-          stop
-        end if
-        num = num * (x - xa(j))
-        den = den * dx
-      end if
-    end do
-    w(i) = num / den
-  end do
-
-  return
-  end subroutine polint_lagrange_weights
-!------------------------------------------------------------------------------
-!
-! interpolation in 2 dimensions, follow yx order
-!
-!------------------------------------------------------------------------------
+  return
+  end subroutine polint
+!------------------------------------------------------------------------------
+!
+! interpolation in 2 dimensions, follow yx order
+!
+!------------------------------------------------------------------------------
  subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
  implicit none

@@ -1414,11 +1224,11 @@ end subroutine d2dump
  real*8, intent(in) :: x1,x2,x3
  real*8, intent(out) :: y,dy

-#ifdef POLINT_LEGACY_ORDER
-  integer  :: i,j,m,n
-  real*8, dimension(ordn,ordn) :: yatmp
-  real*8, dimension(ordn) :: ymtmp
-  real*8, dimension(ordn) :: yntmp
+#ifdef POLINT_LEGACY_ORDER
+  integer  :: i,j,m,n
+  real*8, dimension(ordn,ordn) :: yatmp
+  real*8, dimension(ordn) :: ymtmp
+  real*8, dimension(ordn) :: yntmp
  real*8, dimension(ordn) :: yqtmp

  m=size(x1a)
@@ -1428,36 +1238,29 @@ end subroutine d2dump
    yqtmp=ya(i,j,:)
    call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
   end do
-    yntmp=yatmp(i,:)
-    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
-  end do
-  call polint(x1a,ymtmp,x1,y,dy,ordn)
-#else
-  integer  :: i, j, k
-  real*8, dimension(ordn) :: w1, w2
-  real*8, dimension(ordn) :: ymtmp
-  real*8 :: yx_sum, x_sum
-
-  call polint_lagrange_weights(x1a, x1, w1, ordn)
-  call polint_lagrange_weights(x2a, x2, w2, ordn)
-
-  do k = 1, ordn
-    yx_sum = 0.d0
-    do j = 1, ordn
-      x_sum = 0.d0
-      do i = 1, ordn
-        x_sum = x_sum + w1(i) * ya(i,j,k)
-      end do
-      yx_sum = yx_sum + w2(j) * x_sum
-    end do
-    ymtmp(k) = yx_sum
-  end do
-
-  call polint(x3a, ymtmp, x3, y, dy, ordn)
-#endif
-
-  return
-  end subroutine polin3
+    yntmp=yatmp(i,:)
+    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
+  end do
+  call polint(x1a,ymtmp,x1,y,dy,ordn)
+#else
+  integer  :: j, k
+  real*8, dimension(ordn,ordn) :: yatmp
+  real*8, dimension(ordn) :: ymtmp
+  real*8 :: dy_temp
+
+  do k=1,ordn
+    do j=1,ordn
+      call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
+    end do
+  end do
+  do k=1,ordn
+    call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
+  end do
+  call polint(x3a, ymtmp, x3, y, dy, ordn)
+#endif
+
+  return
+  end subroutine polin3
 !--------------------------------------------------------------------------------------
 ! calculate L2norm
  subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
@@ -1800,14 +1603,11 @@ deallocate(f_flat)
 !       ^
 ! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3

-  real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
-  integer :: i,j,k
-
-  do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1))
-    fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
-  end do
-
-  return
+  real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
+
+  fout = C1*f1+C2*f2+C3*f3
+
+  return

  end subroutine average2
 !-----------------------------------------------------------------------------  
--- a/AMSS_NCKU_source/interp_lb_profile.C
+++ b/AMSS_NCKU_source/interp_lb_profile.C
@@ -1,107 +0,0 @@
-#include "interp_lb_profile.h"
-#include <cstdio>
-#include <cstring>
-#include <algorithm>
-
-namespace InterpLBProfile {
-
-bool write_profile(const char *filepath, int nprocs,
-                   const double *rank_times,
-                   const int *heavy_ranks, int num_heavy,
-                   double threshold_ratio)
-{
-    FILE *fp = fopen(filepath, "wb");
-    if (!fp) return false;
-
-    ProfileHeader hdr;
-    hdr.magic = MAGIC;
-    hdr.version = VERSION;
-    hdr.nprocs = nprocs;
-    hdr.num_heavy = num_heavy;
-    hdr.threshold_ratio = threshold_ratio;
-
-    fwrite(&hdr, sizeof(hdr), 1, fp);
-    fwrite(rank_times, sizeof(double), nprocs, fp);
-    fwrite(heavy_ranks, sizeof(int), num_heavy, fp);
-    fclose(fp);
-    return true;
-}
-
-bool read_profile(const char *filepath, int current_nprocs,
-                  int *heavy_ranks, int &num_heavy,
-                  double *rank_times, MPI_Comm comm)
-{
-    int myrank;
-    MPI_Comm_rank(comm, &myrank);
-
-    int valid = 0;
-    ProfileHeader hdr;
-    memset(&hdr, 0, sizeof(hdr));
-
-    if (myrank == 0) {
-        FILE *fp = fopen(filepath, "rb");
-        if (fp) {
-            if (fread(&hdr, sizeof(hdr), 1, fp) == 1 &&
-                hdr.magic == MAGIC && hdr.version == VERSION &&
-                hdr.nprocs == current_nprocs)
-            {
-                if (fread(rank_times, sizeof(double), current_nprocs, fp)
-                    == (size_t)current_nprocs &&
-                    fread(heavy_ranks, sizeof(int), hdr.num_heavy, fp)
-                    == (size_t)hdr.num_heavy)
-                {
-                    num_heavy = hdr.num_heavy;
-                    valid = 1;
-                }
-            } else if (fp) {
-                printf("[InterpLB] Profile rejected: magic=0x%X version=%u "
-                       "nprocs=%d (current=%d)\n",
-                       hdr.magic, hdr.version, hdr.nprocs, current_nprocs);
-            }
-            fclose(fp);
-        }
-    }
-
-    MPI_Bcast(&valid, 1, MPI_INT, 0, comm);
-    if (!valid) return false;
-
-    MPI_Bcast(&num_heavy, 1, MPI_INT, 0, comm);
-    MPI_Bcast(heavy_ranks, num_heavy, MPI_INT, 0, comm);
-    MPI_Bcast(rank_times, current_nprocs, MPI_DOUBLE, 0, comm);
-    return true;
-}
-
-int identify_heavy_ranks(const double *rank_times, int nprocs,
-                         double threshold_ratio,
-                         int *heavy_ranks, int max_heavy)
-{
-    double sum = 0;
-    for (int i = 0; i < nprocs; i++) sum += rank_times[i];
-    double mean = sum / nprocs;
-    double threshold = threshold_ratio * mean;
-
-    // Collect candidates
-    struct RankTime { int rank; double time; };
-    RankTime *candidates = new RankTime[nprocs];
-    int ncand = 0;
-
-    for (int i = 0; i < nprocs; i++) {
-        if (rank_times[i] > threshold)
-            candidates[ncand++] = {i, rank_times[i]};
-    }
-
-    // Sort descending by time
-    std::sort(candidates, candidates + ncand,
-              [](const RankTime &a, const RankTime &b) {
-                  return a.time > b.time;
-              });
-
-    int count = (ncand < max_heavy) ? ncand : max_heavy;
-    for (int i = 0; i < count; i++)
-        heavy_ranks[i] = candidates[i].rank;
-
-    delete[] candidates;
-    return count;
-}
-
-} // namespace InterpLBProfile
--- a/AMSS_NCKU_source/interp_lb_profile.bin
+++ b/AMSS_NCKU_source/interp_lb_profile.bin
--- a/AMSS_NCKU_source/interp_lb_profile.h
+++ b/AMSS_NCKU_source/interp_lb_profile.h
@@ -1,38 +0,0 @@
-#ifndef INTERP_LB_PROFILE_H
-#define INTERP_LB_PROFILE_H
-
-#include <mpi.h>
-
-namespace InterpLBProfile {
-
-static const unsigned int MAGIC   = 0x494C4250; // "ILBP"
-static const unsigned int VERSION = 1;
-
-struct ProfileHeader {
-    unsigned int magic;
-    unsigned int version;
-    int nprocs;
-    int num_heavy;
-    double threshold_ratio;
-};
-
-// Write profile file (rank 0 only)
-bool write_profile(const char *filepath, int nprocs,
-                   const double *rank_times,
-                   const int *heavy_ranks, int num_heavy,
-                   double threshold_ratio);
-
-// Read profile file (rank 0 reads, then broadcasts to all)
-// Returns true if file found and valid for current nprocs
-bool read_profile(const char *filepath, int current_nprocs,
-                  int *heavy_ranks, int &num_heavy,
-                  double *rank_times, MPI_Comm comm);
-
-// Identify heavy ranks: those with time > threshold_ratio * mean
-int identify_heavy_ranks(const double *rank_times, int nprocs,
-                         double threshold_ratio,
-                         int *heavy_ranks, int max_heavy);
-
-} // namespace InterpLBProfile
-
-#endif /* INTERP_LB_PROFILE_H */
--- a/AMSS_NCKU_source/interp_lb_profile_data.h
+++ b/AMSS_NCKU_source/interp_lb_profile_data.h
@@ -1,29 +0,0 @@
-/* 本头文件由自订profile框架自动生成并非人工硬编码针对Case优化 */
-/* 更新：负载均衡问题已经通过优化插值函数解决，此profile静态均衡方案已弃用，本头文件现在未参与编译 */
-/* Auto-generated from interp_lb_profile.bin — do not edit */
-#ifndef INTERP_LB_PROFILE_DATA_H
-#define INTERP_LB_PROFILE_DATA_H
-
-#define INTERP_LB_NPROCS 64
-#define INTERP_LB_NUM_HEAVY 4
-
-static const int interp_lb_heavy_blocks[4] = {27, 35, 28, 36};
-
-/* Split table: {block_id, r_left, r_right} */
-static const int interp_lb_splits[4][3] = {
-    {27, 26, 27},
-    {35, 34, 35},
-    {28, 28, 29},
-    {36, 36, 37},
-};
-
-/* Rank remap for displaced neighbor blocks */
-static const int interp_lb_num_remaps = 4;
-static const int interp_lb_remaps[][2] = {
-    {26, 25},
-    {29, 30},
-    {34, 33},
-    {37, 38},
-};
-
-#endif /* INTERP_LB_PROFILE_DATA_H */
--- a/AMSS_NCKU_source/kodiss.f90
+++ b/AMSS_NCKU_source/kodiss.f90
@@ -65,8 +65,6 @@ real*8,intent(in) :: eps
 !                       dx^4

 !  note the sign (-1)^r-1, now r=2
-!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
-!DIR$ UNROLL PARTIAL(4)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
--- a/AMSS_NCKU_source/lopsidediff.f90
+++ b/AMSS_NCKU_source/lopsidediff.f90
@@ -487,201 +487,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)

  end subroutine lopsided

-!-----------------------------------------------------------------------------
-! Combined advection (lopsided) + Kreiss-Oliger dissipation (kodis)
-! Shares the symmetry_bd buffer fh, eliminating one full-grid copy per call.
-! Mathematically identical to calling lopsided then kodis separately.
-!-----------------------------------------------------------------------------
-subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
-  implicit none
-
-!~~~~~~> Input parameters:
-
-  integer, intent(in)  :: ex(1:3),Symmetry
-  real*8,  intent(in)  :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
-  real*8,dimension(ex(1),ex(2),ex(3)),intent(in)   :: f,Sfx,Sfy,Sfz
-
-  real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
-  real*8,dimension(3),intent(in) ::SoA
-  real*8,intent(in) :: eps
-
-!~~~~~~> local variables:
-! note index -2,-1,0, so we have 3 extra points
-  real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3))   :: fh
-  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
-  real*8 :: dX,dY,dZ
-  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
-  real*8,  parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
-  real*8,  parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
-  real*8,  parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
-  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
-! kodis parameters
-  real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
-  real*8, parameter :: cof=6.4d1   ! 2^6
-
-  dX = X(2)-X(1)
-  dY = Y(2)-Y(1)
-  dZ = Z(2)-Z(1)
-
-  d12dx = ONE/F12/dX
-  d12dy = ONE/F12/dY
-  d12dz = ONE/F12/dZ
-
-  d2dx = ONE/TWO/dX
-  d2dy = ONE/TWO/dY
-  d2dz = ONE/TWO/dZ
-
-  imax = ex(1)
-  jmax = ex(2)
-  kmax = ex(3)
-
-  imin = 1
-  jmin = 1
-  kmin = 1
-  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
-  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
-  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
-
-! Single symmetry_bd call shared by both advection and dissipation
-  call symmetry_bd(3,ex,f,fh,SoA)
-
-! ---- Advection (lopsided) loop ----
-! upper bound set ex-1 only for efficiency, 
-! the loop body will set ex 0 also
-  do k=1,ex(3)-1
-  do j=1,ex(2)-1
-  do i=1,ex(1)-1
-! x direction   
-    if(Sfx(i,j,k) > ZEO)then
-      if(i+3 <= imax)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
-                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
-                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
-     elseif(i+2 <= imax)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
-                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
-
-     elseif(i+1 <= imax)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
-                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
-                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
-     endif
-   elseif(Sfx(i,j,k) < ZEO)then
-      if(i-3 >= imin)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
-                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
-                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
-     elseif(i-2 >= imin)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
-                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
-
-     elseif(i-1 >= imin)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
-                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
-                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
-     endif
-   endif
-
-! y direction   
-    if(Sfy(i,j,k) > ZEO)then
-      if(j+3 <= jmax)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
-                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
-                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
-     elseif(j+2 <= jmax)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
-                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
-
-     elseif(j+1 <= jmax)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
-                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
-                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
-     endif
-   elseif(Sfy(i,j,k) < ZEO)then
-      if(j-3 >= jmin)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
-                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
-                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
-     elseif(j-2 >= jmin)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
-                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
-
-     elseif(j-1 >= jmin)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
-                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
-                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
-     endif
-   endif
-
-! z direction   
-    if(Sfz(i,j,k) > ZEO)then
-      if(k+3 <= kmax)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
-                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
-                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
-     elseif(k+2 <= kmax)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
-                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
-
-     elseif(k+1 <= kmax)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
-                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
-                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
-     endif
-   elseif(Sfz(i,j,k) < ZEO)then
-      if(k-3 >= kmin)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
-                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
-                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
-     elseif(k-2 >= kmin)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
-                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
-
-     elseif(k-1 >= kmin)then
-     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
-                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
-                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
-     endif
-   endif
-  enddo
-  enddo
-  enddo
-
-! ---- Dissipation (kodis) loop ----
-  if(eps > ZEO) then
-  do k=1,ex(3)
-  do j=1,ex(2)
-  do i=1,ex(1)
-
-  if(i-3 >= imin .and. i+3 <= imax .and. &
-     j-3 >= jmin .and. j+3 <= jmax .and. &
-     k-3 >= kmin .and. k+3 <= kmax) then
-   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/cof *( (     &
-                              (fh(i-3,j,k)+fh(i+3,j,k)) - &
-                          SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
-                          FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
-                          TWT* fh(i,j,k)            )/dX + &
-                                                  (     &
-                              (fh(i,j-3,k)+fh(i,j+3,k)) - &
-                          SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
-                          FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
-                          TWT* fh(i,j,k)            )/dY + &
-                                                  (     &
-                              (fh(i,j,k-3)+fh(i,j,k+3)) - &
-                          SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
-                          FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
-                          TWT* fh(i,j,k)            )/dZ )
-  endif
-
-  enddo
-  enddo
-  enddo
-  endif
-
-  return
-
-  end subroutine lopsided_kodis
-
 #elif (ghost_width == 4)
 ! sixth order code
 ! Compute advection terms in right hand sides of field equations
--- a/AMSS_NCKU_source/macrodef.fh
+++ b/AMSS_NCKU_source/macrodef.fh
@@ -1,77 +1,83 @@
-
-#define tetradtype 2
-
-#define Cell
-
-#define ghost_width 3
-
-
-
-#define GAUGE 0
-
-#define CPBC_ghost_width  (ghost_width)
-
-#define ABV 0
-
-#define EScalar_CC 2
-
-#if 0
-
-define tetradtype
-    v:r; u: phi; w: theta
-    tetradtype 0
-    v^a = (x,y,z)
-    orthonormal order: v,u,w
-    m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of  PRD 75, 124018(2007)
-    tetradtype 1
-    orthonormal order: w,u,v
-    m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of  PRD 85, 124062(2012)
-    tetradtype 2
-    v_a = (x,y,z)
-    orthonormal order: v,u,w
-    m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of  PRD 75, 124018(2007)
-
-define Cell or Vertex
-    Cell center or Vertex center
-
-define ghost_width
-    2nd order: 2
-    4th order: 3
-    6th order: 4
-    8th order: 5
-
-define WithShell
-    use shell or not
-
-define CPBC
-    use constraint preserving boundary condition or not
-    only affect Z4c
-    CPBC only supports WithShell
-
-define GAUGE
-    0: B^i gauge
-    1: David puncture gauge
-    2: MB B^i gauge
-    3: RIT B^i gauge
-    4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
-    5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
-    6: MGB1 B^i gauge
-    7: MGB2 B^i gauge
-
-define CPBC_ghost_width  (ghost_width)
-    buffer points for CPBC boundary
-
-define ABV
-    0: using BSSN variable for constraint violation and psi4 calculation
-    1: using ADM variable for constraint violation and psi4 calculation
-
-define EScalar_CC
-Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
-    1: Case C of 1112.3928, V=0
-    2: shell with   phi(r) = phi0 * a2^2/(1+a2^2), f(R) = R+a2*R^2 induced V
-    3: ground state of Schrodinger-Newton system,  f(R) = R+a2*R^2 induced V
-    4: a2 = +oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
-    5: shell with   phi(r) = phi0 * Exp(-(r-r0)**2/sigma), V = 0
-
-#endif
-
+
+
+#if 0
+note here
+v:r; u: phi; w: theta
+tetradtype 0
+v^a = (x,y,z)
+orthonormal order: v,u,w
+m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of  PRD 75, 124018(2007)
+tetradtype 1
+orthonormal order: w,u,v
+m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of  PRD 85, 124062(2012)
+tetradtype 2
+v_a = (x,y,z)
+orthonormal order: v,u,w
+m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of  PRD 75, 124018(2007)
+#endif
+#define tetradtype 2
+
+#if 0
+note here
+Cell center or Vertex center
+#endif
+#define Cell
+
+#if 0
+note here
+2nd order: 2
+4th order: 3
+6th order: 4
+8th order: 5
+#endif
+#define ghost_width 3
+
+#if 0
+note here
+use shell or not
+#endif
+#define WithShell
+
+#if 0
+note here
+use constraint preserving boundary condition or not
+only affect Z4c
+#endif
+#define CPBC
+
+#if 0
+note here
+Gauge condition type
+0: B^i gauge
+1: David's puncture gauge
+2: MB B^i gauge
+3: RIT B^i gauge
+4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
+5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
+6: MGB1 B^i gauge
+7: MGB2 B^i gauge
+#endif
+#define GAUGE 2
+
+#if 0
+buffer points for CPBC boundary
+#endif
+#define CPBC_ghost_width  (ghost_width)
+
+#if 0
+using BSSN variable for constraint violation and psi4 calculation: 0
+using ADM variable for constraint violation and psi4 calculation: 1
+#endif
+#define ABV 0
+
+#if 0
+Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
+1: Case C of 1112.3928, V=0
+2: shell with a2^2*phi0/(1+a2^2), f(R) = R+a2*R^2 induced V
+3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
+4: a2 = oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
+5: shell with phi(r) = phi0*Exp(-(r-r0)**2/sigma), V = 0
+#endif
+#define EScalar_CC 2
+
+
--- a/AMSS_NCKU_source/macrodef.h
+++ b/AMSS_NCKU_source/macrodef.h
@@ -1,145 +1,112 @@
-
-#ifndef MICRODEF_H
-#define MICRODEF_H
-
-#include "macrodef.fh"  
-
-// application parameters
-
-#define SommerType 0
-
-#define GaussInt
-
-#define ABEtype 0
-
-//#define With_AHF
-#define Psi4type 0
-
-//#define Point_Psi4
-
-#define RPS 1
-
-#define AGM 0
-
-#define RPB 0
-
-#define MAPBH 1
-
-#define PSTR 0
-
-#define REGLEV 0
-
-//#define USE_GPU
-
-//#define CHECKDETAIL
-
-//#define FAKECHECK
-
-//
-// define SommerType
-//     sommerfeld boundary type
-//     0: bam
-//     1: shibata
-//
-// define GaussInt
-//     for Using Gauss-Legendre quadrature in theta direction
-//
-// define ABEtype
-//     0: BSSN vacuum
-//     1: coupled to scalar field
-//     2: Z4c vacuum
-//     3: coupled to Maxwell field
-//
-// define With_AHF
-//     using Apparent Horizon Finder
-//
-// define Psi4type
-//     Psi4 calculation method
-//     0: EB method
-//     1: 4-D method
-//
-// define Point_Psi4
-//     for Using point psi4 or not
-//
-// define RPS
-//     RestrictProlong in Step (0) or after Step (1)
-//
-// define AGM
-//     Enforce algebra constraint
-//     for every RK4 sub step: 0
-//     only when iter_count == 3: 1
-//     after routine Step: 2
-//
-// define RPB
-//     Restrict Prolong using BAM style 1 or old style 0
-//
-// define MAPBH
-//     1: move Analysis out ot 4 sub steps and treat PBH with Euler method
-//
-// define PSTR
-//     parallel structure
-//     0: level by level
-//     1: considering all levels
-//     2: as 1 but reverse the CPU order
-//     3: Frank's scheme
-//
-// define REGLEV
-//     regrid for every level or for all levels at a time
-//     0: for every level;
-//     1: for all
-//
-// define USE_GPU
-//     use gpu or not
-//
-// define CHECKDETAIL
-//     use checkpoint for every process
-//
-// define FAKECHECK
-//     use FakeCheckPrepare to write CheckPoint
-//
-
-////================================================================
-//  some basic parameters for numerical calculation
-////================================================================
-
-#define dim 3
-
-//#define Cell or Vertex in "macrodef.fh" 
-
-#define buffer_width 6
-
-#define SC_width buffer_width
-
-#define CS_width (2*buffer_width)
-
-//
-// define Cell or Vertex in "macrodef.fh" 
-//
-// define buffer_width
-//     buffer point number for mesh refinement interface
-//
-// define SC_width buffer_width
-//     buffer point number shell-box interface, on shell
-//
-// define CS_width
-//     buffer point number shell-box interface, on box
-//
-
-#if(buffer_width < ghost_width)
-#   error we always assume buffer_width>ghost_width
-#endif
-
-#define PACK 1
-#define UNPACK 2
-
-#define Mymax(a,b) (((a) > (b)) ? (a) : (b))
-#define Mymin(a,b) (((a) < (b)) ? (a) : (b))
-
-#define feq(a,b,d) (fabs(a-b)<d)
-#define flt(a,b,d) ((a-b)<d)
-#define fgt(a,b,d) ((a-b)>d)
-
-#define TINY 1e-10
-
-#endif   /* MICRODEF_H */
-
+
+#ifndef MICRODEF_H
+#define MICRODEF_H
+
+#include "macrodef.fh"
+
+// application parameters
+
+/// ****
+// sommerfeld boundary type
+// 0: bam, 1: shibata
+#define SommerType 0
+
+/// ****
+// for Using Gauss-Legendre quadrature in theta direction
+#define GaussInt
+
+/// ****
+// 0: BSSN vacuum
+// 1: coupled to scalar field
+// 2: Z4c vacuum
+// 3: coupled to Maxwell field
+//
+#define ABEtype 2
+
+/// ****
+// using Apparent Horizon Finder
+//#define With_AHF
+
+/// ****
+// Psi4 calculation method
+// 0: EB method
+// 1: 4-D method
+//
+#define Psi4type 0
+
+/// ****
+// for Using point psi4 or not
+//#define Point_Psi4
+
+/// ****
+// RestrictProlong in Step (0) or after Step (1)
+#define RPS 1
+
+/// ****
+// Enforce algebra constraint
+// for every RK4 sub step: 0
+// only when iter_count == 3: 1
+// after routine Step: 2
+#define AGM 0
+
+/// ****
+// Restrict Prolong using BAM style 1 or old style 0
+#define RPB 0
+
+/// ****
+// 1: move Analysis out of 4 sub steps and treat PBH with Euler method
+#define MAPBH 1
+
+/// ****
+// parallel structure, 0: level by level, 1: considering all levels, 2: as 1 but reverse the CPU order, 3: Frank's scheme
+#define PSTR 0
+
+/// ****
+// regrid for every level or for all levels at a time
+// 0: for every level; 1: for all
+#define REGLEV 0
+
+/// ****
+// use gpu or not
+//#define USE_GPU
+
+/// ****
+// use checkpoint for every process
+//#define CHECKDETAIL
+
+/// ****
+// use FakeCheckPrepare to write CheckPoint
+//#define FAKECHECK
+////================================================================
+//  some basic parameters for numerical calculation
+#define dim 3
+
+//#define Cell or Vertex in "macrodef.fh"
+
+// ******
+// buffer point number for mesh refinement interface
+#define buffer_width 6
+
+// ******
+// buffer point number shell-box interface, on shell
+#define SC_width buffer_width
+// buffer point number shell-box interface, on box
+#define CS_width (2*buffer_width)
+
+#if(buffer_width < ghost_width)
+#error we always assume buffer_width>ghost_width
+#endif
+
+#define PACK 1
+#define UNPACK 2
+
+#define Mymax(a,b) (((a) > (b)) ? (a) : (b))
+#define Mymin(a,b) (((a) < (b)) ? (a) : (b))
+
+#define feq(a,b,d) (fabs(a-b)<d)
+#define flt(a,b,d) ((a-b)<d)
+#define fgt(a,b,d) ((a-b)>d)
+
+#define TINY 1e-10
+
+#endif   /* MICRODEF_H */
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -1,35 +1,6 @@


-include makefile.inc
-
-## polint(ordn=6) kernel selector:
-##   1 (default): barycentric fast path
-##   0          : fallback to Neville path
-POLINT6_USE_BARY ?= 1
-POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
-
-## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
-##   make                        -> opt  (PGO-guided, maximum performance)
-##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)
-PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
-
-ifeq ($(PGO_MODE),instrument)
-## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
-CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
-f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
-else
-## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
-## PGO has been turned off, now tested and found to be negative optimization
-## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
-
-
-CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
-f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
-endif
+include makefile.inc

 .SUFFIXES: .o .f90 .C .for .cu

@@ -37,7 +8,7 @@ endif
 	$(f90) $(f90appflags) -c $< -o $@

 .C.o:
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@

 .for.o:
 	$(f77) -c $< -o $@
@@ -45,65 +16,20 @@ endif
 .cu.o:
 	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)

-# C rewrite of BSSN RHS kernel and helpers
-bssn_rhs_c.o: bssn_rhs_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-
-fderivs_c.o: fderivs_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-
-fdderivs_c.o: fdderivs_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-
-kodiss_c.o: kodiss_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-
-lopsided_c.o: lopsided_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-
-lopsided_kodis_c.o: lopsided_kodis_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-
-interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-
-## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
-TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
-TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -fprofile-instr-use=$(TP_PROFDATA) \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include
-
 TwoPunctures.o: TwoPunctures.C
-	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@

 TwoPunctureABE.o: TwoPunctureABE.C
-	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@

 # Input files
-
-## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
-ifeq ($(USE_CXX_KERNELS),0)
-# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
-CFILES =
-else
-# C++ mode (default): C rewrite of bssn_rhs and helper kernels
-CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
-endif
-
-## RK4 kernel switch (independent from USE_CXX_KERNELS)
-ifeq ($(USE_CXX_RK4),1)
-CFILES += rungekutta4_rout_c.o
-RK4_F90_OBJ =
-else
-RK4_F90_OBJ = rungekutta4_rout.o
-endif
-
 C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
 	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
 	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
-	   NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
+	   NullShellPatch2_Evo.o writefile_f.o xh_bssn_rhs.o xh_fdderivs.o xh_fderivs.o xh_kodiss.o xh_lopsided.o \
+	   xh_global_interp.o xh_polint3.o
 	   
 C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o surface_integral.o ShellPatch.o\
@@ -113,12 +39,12 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
 	   NullShellPatch2_Evo.o \
 	   bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o

-F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
-	   prolongrestrict_cell.o prolongrestrict_vertex.o\
-	   $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
-	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
-	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
-           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
+F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
+	   prolongrestrict_cell.o prolongrestrict_vertex.o\
+	   rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
+	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
+	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
+           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
           fadmquantites_bssn.o Z4c_rhs.o Z4c_rhs_ss.o point_diff_new_sh.o\
 	   cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\
 	   getnpem2.o empart.o NullNews.o fourdcurvature.o\
@@ -126,14 +52,6 @@ F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\
 	   NullNews2.o tool_f.o

-ifeq ($(USE_CXX_KERNELS),0)
-# Fortran mode: include original bssn_rhs.o
-F90FILES = $(F90FILES_BASE) bssn_rhs.o
-else
-# C++ mode (default): bssn_rhs.o replaced by C++ kernel
-F90FILES = $(F90FILES_BASE)
-endif
-
 F77FILES = zbesh.o

 AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \
@@ -146,7 +64,7 @@ TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
 CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o

 # file dependences
-$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
+$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh

 $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
 	     misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
@@ -155,7 +73,7 @@ $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
             fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\
 	     NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\
 	     empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\
-             initial_null2.h NullShellPatch2.h 
+             initial_null2.h NullShellPatch2.h xh_bssn_rhs_compute.h xh_global_interp.h
             
 $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
 	     misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
@@ -169,7 +87,7 @@ $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h
             
 $(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h

-$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
+$(C++FILES) $(C++FILES_GPU) $(AHFDOBJS) $(CUDAFILES): macrodef.h

 TwoPunctureFILES: TwoPunctures.h

@@ -178,14 +96,14 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
 misc.o : zbesh.o

 # projects
-ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
+ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) 
+	$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
 	
-ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
+ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
+	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)

 TwoPunctureABE: $(TwoPunctureFILES)
-	$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
+	$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)

 clean:
 	rm *.o ABE ABEGPU TwoPunctureABE make.log -f
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,65 +1,32 @@
-## GCC version (commented out)
-## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
-## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
-## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
-
-## Intel oneAPI version with oneMKL (Optimized for performance)
-filein  = -I/usr/include/ -I${MKLROOT}/include
-
-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
-## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
-
-## Memory allocator switch
-##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
-##   0           : use system default allocator (ptmalloc)
-USE_TBBMALLOC ?= 1
-TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
-ifneq ($(wildcard $(TBBMALLOC_SO)),)
-TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
-else
-TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
-endif
-ifeq ($(USE_TBBMALLOC),1)
-LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
-endif
-
-## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
-##   opt        : (default) maximum performance with PGO profile-guided optimization
-##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
-PGO_MODE ?= opt
-
-## Interp_Points load balance profiling mode
-##   off        : (default) no load balance instrumentation
-##   profile    : Pass 1 — instrument Interp_Points to collect timing profile
-##   optimize   : Pass 2 — read profile and apply block rebalancing
-INTERP_LB_MODE ?= off
-
-ifeq ($(INTERP_LB_MODE),profile)
-INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
-else ifeq ($(INTERP_LB_MODE),optimize)
-INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
-else
-INTERP_LB_FLAGS =
-endif
-
-## Kernel implementation switch
-##   1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
-##   0           : fall back to original Fortran kernels
-USE_CXX_KERNELS ?= 1
-
-## RK4 kernel implementation switch
-##   1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
-##   0           : use original Fortran rungekutta4_rout.o
-USE_CXX_RK4 ?= 1
-
-f90          = ifx
-f77          = ifx
-CXX          = icpx
-CC           = icx
-CLINKER      = mpiicpx
-
-Cu = nvcc
-CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
-#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
-CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
+## GCC version (commented out)
+## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
+## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
+## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
+
+## Intel oneAPI version with oneMKL (Optimized for performance)
+filein  = -I/usr/include/ -I${MKLROOT}/include
+
+## Using sequential MKL (OpenMP disabled for better single-threaded performance)
+## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
+LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
+
+## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization)
+## -fprofile-instr-use: use collected profile data to guide optimization decisions
+##   (branch prediction, block layout, inlining, unrolling); override the path with: make PROFDATA=/path/to/default.profdata
+PROFDATA    ?= /home/hxh/AMSS-NCKU/pgo_profile/default.profdata
+CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -ipo \
+               -fprofile-instr-use=$(PROFDATA) \
+               -Dfortran3 -Dnewc -I${MKLROOT}/include
+f90appflags  = -O3 -xHost -fp-model fast=2 -fma -ipo \
+               -fprofile-instr-use=$(PROFDATA) \
+               -align array64byte -fpp -I${MKLROOT}/include
+f90          = ifx
+f77          = ifx
+CXX          = icpx
+CC           = icx
+CLINKER      = mpiicpx 
+
+Cu = nvcc
+CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
+#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
+CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
--- a/AMSS_NCKU_source/prolongrestrict_cell.f90
+++ b/AMSS_NCKU_source/prolongrestrict_cell.f90
@@ -1934,33 +1934,18 @@
 ! when if=1 -> ic=0, this is different to vertex center grid 
  real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3))   :: funcc
  integer,dimension(3) :: cxI
-  integer :: i,j,k,ii,jj,kk,px,py,pz
+  integer :: i,j,k,ii,jj,kk
  real*8, dimension(6,6) :: tmp2
  real*8, dimension(6) :: tmp1
-  integer, dimension(extf(1)) :: cix
-  integer, dimension(extf(2)) :: ciy
-  integer, dimension(extf(3)) :: ciz
-  integer, dimension(extf(1)) :: pix
-  integer, dimension(extf(2)) :: piy
-  integer, dimension(extf(3)) :: piz

  real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3
  real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3
-  real*8, dimension(6,2), parameter :: WC = reshape((/&
-      C1,C2,C3,C4,C5,C6,&
-      C6,C5,C4,C3,C2,C1/), (/6,2/))

  integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi
  integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo
-  integer::maxcx,maxcy,maxcz

  real*8,dimension(3) :: CD,FD
-  real*8 :: tmp_yz(extc(1), 6)      ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
-  real*8 :: tmp_xyz_line(extc(1))   ! 存储整条 X 线上完成 Y 向融合后的结果
-  real*8 :: v1, v2, v3, v4, v5, v6
-  integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max
-  real*8 :: res_line
-  real*8 :: tmp_z_slab(extc(1), extc(2))  ! 分配在 k 循环外
+  
  if(wei.ne.3)then
     write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
     write(*,*)"dim = ",wei
@@ -2035,123 +2020,145 @@
          return
  endif

-  do i = imino,imaxo
-     ii = i + lbf(1) - 1
-     cix(i) = ii/2 - lbc(1) + 1
-     if(ii/2*2 == ii)then
-        pix(i) = 1
-     else
-        pix(i) = 2
-     endif
-  enddo
-  do j = jmino,jmaxo
-     jj = j + lbf(2) - 1
-     ciy(j) = jj/2 - lbc(2) + 1
-     if(jj/2*2 == jj)then
-        piy(j) = 1
-     else
-        piy(j) = 2
-     endif
-  enddo
-  do k = kmino,kmaxo
-     kk = k + lbf(3) - 1
-     ciz(k) = kk/2 - lbc(3) + 1
-     if(kk/2*2 == kk)then
-        piz(k) = 1
-     else
-        piz(k) = 2
-     endif
-  enddo
-
-  maxcx = maxval(cix(imino:imaxo))
-  maxcy = maxval(ciy(jmino:jmaxo))
-  maxcz = maxval(ciz(kmino:kmaxo))
-  if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
-     write(*,*)"error in prolong"
-     return
-  endif
-
  call symmetry_bd(3,extc,func,funcc,SoA)
-     ! 对每个 k（pz, kc 固定）预计算 Z 向插值的 2D 切片
-jc_min = minval(ciy(jmino:jmaxo))
-jc_max = maxval(ciy(jmino:jmaxo))
-
-do k = kmino, kmaxo
-    pz = piz(k); kc = ciz(k)
-    ! --- Pass 1: Z 方向，只算一次 ---
-    do iy = jc_min-3, jc_max+3   ! 仅需的 iy 范围
-        do ii = imini-3, imaxi+3  ! 仅需的 ii 范围
-            tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
-        end do
-    end do
-
-    do j = jmino, jmaxo
-        py = piy(j); jc = ciy(j)
-        ! --- Pass 2: Y 方向 ---
-        do ii = imini-3, imaxi+3
-            tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
-        end do
-        ! --- Pass 3: X 方向 ---
-        do i = imino, imaxo
-            funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3))
-        end do
-    end do
-end do
-
+     
 !~~~~~~> prolongation start...
+  do k = kmino,kmaxo
+   do j = jmino,jmaxo
+    do i = imino,imaxo
+       cxI(1) = i
+       cxI(2) = j
+       cxI(3) = k
+! change to coarse level reference
+!|---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*---| 
+!|=======x===============x===============x===============x=======|
+       cxI = (cxI+lbf-1)/2
+! change to array index      
+       cxI = cxI - lbc + 1
+
+       if(any(cxI+3 > extc)) write(*,*)"error in prolong"
+       ii=i+lbf(1)-1
+       jj=j+lbf(2)-1
+       kk=k+lbf(3)-1
 #if 0
- do k = kmino, kmaxo
-     pz = piz(k)
-     kc = ciz(k)
+       if(ii/2*2==ii)then
+         if(jj/2*2==jj)then
+           if(kk/2*2==kk)then
+             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
+             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
+           else
+             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
+             funf(i,j,k)=  C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
+           endif
+         else
+           if(kk/2*2==kk)then
+             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
+             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
+           else
+             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
+             funf(i,j,k)=  C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
+           endif
+         endif
+       else
+         if(jj/2*2==jj)then
+           if(kk/2*2==kk)then               
+             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
+             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
+           else
+             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
+             funf(i,j,k)=  C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
+           endif
+         else
+           if(kk/2*2==kk)then
+             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
+             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
+           else
+             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
+             funf(i,j,k)=  C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
+           endif
+         endif
+       endif
+#else 
+       if(kk/2*2==kk)then
+             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+       else
+             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+       endif

-     do j = jmino, jmaxo
-        py = piy(j)
-        jc = ciy(j)
+       if(jj/2*2==jj)then
+             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
+       else
+             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
+       endif

-! --- 步骤 1 & 2 融合：分段处理 X 轴，提升 Cache 命中率 ---
-        ! 我们将 ii 循环逻辑重组，减少对 funcc 的跨行重复访问
-        do ii = 1, extc(1)
-           ! 1. 先做 Z 方向的 6 条线插值（针对当前的 ii 和当前的 6 个 iy）
-           ! 我们直接在这里把 Y 方向的加权也做了，省去 tmp_yz 数组
-           ! 这样 funcc 的数据读进来后立即完成所有维度的贡献，不再写回内存
-           
-           res_line = 0.0d0
-           do jj = 1, 6
-              iy = jc - 3 + jj
-              ! 这一行代码是核心：一次性完成 Z 插值并加上 Y 的权重
-              ! 编译器会把 WC(jj, py) 存在寄存器里
-              res_line = res_line + WC(jj, py) * ( &
-                         WC(1, pz) * funcc(ii, iy, kc-2) + &
-                         WC(2, pz) * funcc(ii, iy, kc-1) + &
-                         WC(3, pz) * funcc(ii, iy, kc  ) + &
-                         WC(4, pz) * funcc(ii, iy, kc+1) + &
-                         WC(5, pz) * funcc(ii, iy, kc+2) + &
-                         WC(6, pz) * funcc(ii, iy, kc+3) )
-           end do
-           tmp_xyz_line(ii) = res_line
-        end do
-
-
-
-
-        ! 3. 【降维：X 向】最后在最内层只处理 X 方向的 6 点加权
-        ! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次
-        do i = imino, imaxo
-           px = pix(i)
-           ic = cix(i)
-           
-           ! 直接从预计算好的 line 中读取连续的 6 个点
-           ! ic-2 到 ic+3 对应原始 6 点算子
-           funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + &
-                         WC(2,px)*tmp_xyz_line(ic-1) + &
-                         WC(3,px)*tmp_xyz_line(ic  ) + &
-                         WC(4,px)*tmp_xyz_line(ic+1) + &
-                         WC(5,px)*tmp_xyz_line(ic+2) + &
-                         WC(6,px)*tmp_xyz_line(ic+3)
-        end do
-     end do
-  end do
+       if(ii/2*2==ii)then
+             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
+       else
+             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
+       endif
 #endif
+    enddo
+   enddo
+  enddo
+
  return

  end subroutine prolong3
@@ -2350,11 +2357,7 @@ end do
  integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo

  real*8,dimension(3) :: CD,FD
-
-  real*8 :: tmp_xz_plane(extf(1), 6) 
-  real*8 :: tmp_x_line(extf(1))
-  integer :: fi, fj, fk, ii, jj, kk
-
+  
  if(wei.ne.3)then
     write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
     write(*,*)"dim = ",wei
@@ -2436,56 +2439,6 @@ end do
  call symmetry_bd(2,extf,funf,funff,SoA)

 !~~~~~~> restriction start...
-do k = kmino, kmaxo
-    fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1
-
-    do j = jmino, jmaxo
-        fj = 2*(j + lbc(2) - 1) - 1 - lbf(2) + 1
-        
-        ! 优化点 1: 显式展开 Z 方向计算，减少循环开销
-        ! 确保 ii 循环是最内层且连续访问
-        !DIR$ VECTOR ALWAYS
-        do ii = 1, extf(1)
-            ! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
-            ! 这里直接硬编码 jj 的偏移，彻底消除一层循环
-            tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
-                                  C2*(funff(ii,fj-2,fk-1)+funff(ii,fj-2,fk+2)) + &
-                                  C3*(funff(ii,fj-2,fk  )+funff(ii,fj-2,fk+1))
-            tmp_xz_plane(ii, 2) = C1*(funff(ii,fj-1,fk-2)+funff(ii,fj-1,fk+3)) + &
-                                  C2*(funff(ii,fj-1,fk-1)+funff(ii,fj-1,fk+2)) + &
-                                  C3*(funff(ii,fj-1,fk  )+funff(ii,fj-1,fk+1))
-            tmp_xz_plane(ii, 3) = C1*(funff(ii,fj  ,fk-2)+funff(ii,fj  ,fk+3)) + &
-                                  C2*(funff(ii,fj  ,fk-1)+funff(ii,fj  ,fk+2)) + &
-                                  C3*(funff(ii,fj  ,fk  )+funff(ii,fj  ,fk+1))
-            tmp_xz_plane(ii, 4) = C1*(funff(ii,fj+1,fk-2)+funff(ii,fj+1,fk+3)) + &
-                                  C2*(funff(ii,fj+1,fk-1)+funff(ii,fj+1,fk+2)) + &
-                                  C3*(funff(ii,fj+1,fk  )+funff(ii,fj+1,fk+1))
-            tmp_xz_plane(ii, 5) = C1*(funff(ii,fj+2,fk-2)+funff(ii,fj+2,fk+3)) + &
-                                  C2*(funff(ii,fj+2,fk-1)+funff(ii,fj+2,fk+2)) + &
-                                  C3*(funff(ii,fj+2,fk  )+funff(ii,fj+2,fk+1))
-            tmp_xz_plane(ii, 6) = C1*(funff(ii,fj+3,fk-2)+funff(ii,fj+3,fk+3)) + &
-                                  C2*(funff(ii,fj+3,fk-1)+funff(ii,fj+3,fk+2)) + &
-                                  C3*(funff(ii,fj+3,fk  )+funff(ii,fj+3,fk+1))
-        end do
-
-        ! 优化点 2: 同样向量化 Y 方向压缩
-        !DIR$ VECTOR ALWAYS
-        do ii = 1, extf(1)
-            tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
-                            C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
-                            C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
-        end do
-
-        ! 优化点 3: 最终写入，利用已经缓存在 tmp_x_line 的数据
-        do i = imino, imaxo
-            fi = 2*(i + lbc(1) - 1) - 1 - lbf(1) + 1
-            func(i, j, k) = C1*(tmp_x_line(fi-2) + tmp_x_line(fi+3)) + &
-                            C2*(tmp_x_line(fi-1) + tmp_x_line(fi+2)) + &
-                            C3*(tmp_x_line(fi  ) + tmp_x_line(fi+1))
-        end do
-    end do
-end do
-#if 0
  do k = kmino,kmaxo
   do j = jmino,jmaxo
    do i = imino,imaxo
@@ -2509,7 +2462,7 @@ end do
    enddo
   enddo
  enddo
-#endif
+  
  return

  end subroutine restrict3
--- a/AMSS_NCKU_source/rungekutta4_rout_c.C
+++ b/AMSS_NCKU_source/rungekutta4_rout_c.C
@@ -1,212 +0,0 @@
-#include "rungekutta4_rout.h"
-#include <cstdio>
-#include <cstdlib>
-#include <cstddef>
-#include <complex>
-#include <immintrin.h>
-
-namespace {
-
-inline void rk4_stage0(std::size_t n,
-                       const double *__restrict f0,
-                       const double *__restrict frhs,
-                       double *__restrict f1,
-                       double c) {
-    std::size_t i = 0;
-#if defined(__AVX512F__)
-    const __m512d vc = _mm512_set1_pd(c);
-    for (; i + 7 < n; i += 8) {
-        const __m512d v0 = _mm512_loadu_pd(f0 + i);
-        const __m512d vr = _mm512_loadu_pd(frhs + i);
-        _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, vr, v0));
-    }
-#elif defined(__AVX2__)
-    const __m256d vc = _mm256_set1_pd(c);
-    for (; i + 3 < n; i += 4) {
-        const __m256d v0 = _mm256_loadu_pd(f0 + i);
-        const __m256d vr = _mm256_loadu_pd(frhs + i);
-        _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, vr, v0));
-    }
-#endif
-#pragma ivdep
-    for (; i < n; ++i) {
-        f1[i] = f0[i] + c * frhs[i];
-    }
-}
-
-inline void rk4_rhs_accum(std::size_t n,
-                          const double *__restrict f1,
-                          double *__restrict frhs) {
-    std::size_t i = 0;
-#if defined(__AVX512F__)
-    const __m512d v2 = _mm512_set1_pd(2.0);
-    for (; i + 7 < n; i += 8) {
-        const __m512d v1 = _mm512_loadu_pd(f1 + i);
-        const __m512d vrhs = _mm512_loadu_pd(frhs + i);
-        _mm512_storeu_pd(frhs + i, _mm512_fmadd_pd(v2, v1, vrhs));
-    }
-#elif defined(__AVX2__)
-    const __m256d v2 = _mm256_set1_pd(2.0);
-    for (; i + 3 < n; i += 4) {
-        const __m256d v1 = _mm256_loadu_pd(f1 + i);
-        const __m256d vrhs = _mm256_loadu_pd(frhs + i);
-        _mm256_storeu_pd(frhs + i, _mm256_fmadd_pd(v2, v1, vrhs));
-    }
-#endif
-#pragma ivdep
-    for (; i < n; ++i) {
-        frhs[i] = frhs[i] + 2.0 * f1[i];
-    }
-}
-
-inline void rk4_f1_from_f0_f1(std::size_t n,
-                              const double *__restrict f0,
-                              double *__restrict f1,
-                              double c) {
-    std::size_t i = 0;
-#if defined(__AVX512F__)
-    const __m512d vc = _mm512_set1_pd(c);
-    for (; i + 7 < n; i += 8) {
-        const __m512d v0 = _mm512_loadu_pd(f0 + i);
-        const __m512d v1 = _mm512_loadu_pd(f1 + i);
-        _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, v1, v0));
-    }
-#elif defined(__AVX2__)
-    const __m256d vc = _mm256_set1_pd(c);
-    for (; i + 3 < n; i += 4) {
-        const __m256d v0 = _mm256_loadu_pd(f0 + i);
-        const __m256d v1 = _mm256_loadu_pd(f1 + i);
-        _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, v1, v0));
-    }
-#endif
-#pragma ivdep
-    for (; i < n; ++i) {
-        f1[i] = f0[i] + c * f1[i];
-    }
-}
-
-inline void rk4_stage3(std::size_t n,
-                       const double *__restrict f0,
-                       double *__restrict f1,
-                       const double *__restrict frhs,
-                       double c) {
-    std::size_t i = 0;
-#if defined(__AVX512F__)
-    const __m512d vc = _mm512_set1_pd(c);
-    for (; i + 7 < n; i += 8) {
-        const __m512d v0 = _mm512_loadu_pd(f0 + i);
-        const __m512d v1 = _mm512_loadu_pd(f1 + i);
-        const __m512d vr = _mm512_loadu_pd(frhs + i);
-        _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, _mm512_add_pd(v1, vr), v0));
-    }
-#elif defined(__AVX2__)
-    const __m256d vc = _mm256_set1_pd(c);
-    for (; i + 3 < n; i += 4) {
-        const __m256d v0 = _mm256_loadu_pd(f0 + i);
-        const __m256d v1 = _mm256_loadu_pd(f1 + i);
-        const __m256d vr = _mm256_loadu_pd(frhs + i);
-        _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, _mm256_add_pd(v1, vr), v0));
-    }
-#endif
-#pragma ivdep
-    for (; i < n; ++i) {
-        f1[i] = f0[i] + c * (f1[i] + frhs[i]);
-    }
-}
-
-} // namespace
-
-extern "C" {
-
-void f_rungekutta4_scalar(double &dT, double &f0, double &f1, double &f_rhs, int &RK4) {
-    constexpr double F1o6 = 1.0 / 6.0;
-    constexpr double HLF = 0.5;
-    constexpr double TWO = 2.0;
-
-    switch (RK4) {
-    case 0:
-        f1 = f0 + HLF * dT * f_rhs;
-        break;
-    case 1:
-        f_rhs = f_rhs + TWO * f1;
-        f1 = f0 + HLF * dT * f1;
-        break;
-    case 2:
-        f_rhs = f_rhs + TWO * f1;
-        f1 = f0 + dT * f1;
-        break;
-    case 3:
-        f1 = f0 + F1o6 * dT * (f1 + f_rhs);
-        break;
-    default:
-        std::fprintf(stderr, "rungekutta4_scalar_c: invalid RK4 stage %d\n", RK4);
-        std::abort();
-    }
-}
-
-void rungekutta4_cplxscalar_(double &dT,
-                             std::complex<double> &f0,
-                             std::complex<double> &f1,
-                             std::complex<double> &f_rhs,
-                             int &RK4) {
-    constexpr double F1o6 = 1.0 / 6.0;
-    constexpr double HLF = 0.5;
-    constexpr double TWO = 2.0;
-
-    switch (RK4) {
-    case 0:
-        f1 = f0 + HLF * dT * f_rhs;
-        break;
-    case 1:
-        f_rhs = f_rhs + TWO * f1;
-        f1 = f0 + HLF * dT * f1;
-        break;
-    case 2:
-        f_rhs = f_rhs + TWO * f1;
-        f1 = f0 + dT * f1;
-        break;
-    case 3:
-        f1 = f0 + F1o6 * dT * (f1 + f_rhs);
-        break;
-    default:
-        std::fprintf(stderr, "rungekutta4_cplxscalar_c: invalid RK4 stage %d\n", RK4);
-        std::abort();
-    }
-}
-
-int f_rungekutta4_rout(int *ex, double &dT,
-                       double *f0, double *f1, double *f_rhs,
-                       int &RK4) {
-    const std::size_t n = static_cast<std::size_t>(ex[0]) *
-                          static_cast<std::size_t>(ex[1]) *
-                          static_cast<std::size_t>(ex[2]);
-    const double *const __restrict f0r = f0;
-    double *const __restrict f1r = f1;
-    double *const __restrict frhs = f_rhs;
-
-    if (__builtin_expect(static_cast<unsigned>(RK4) > 3u, 0)) {
-        std::fprintf(stderr, "rungekutta4_rout_c: invalid RK4 stage %d\n", RK4);
-        std::abort();
-    }
-
-    switch (RK4) {
-    case 0:
-        rk4_stage0(n, f0r, frhs, f1r, 0.5 * dT);
-        break;
-    case 1:
-        rk4_rhs_accum(n, f1r, frhs);
-        rk4_f1_from_f0_f1(n, f0r, f1r, 0.5 * dT);
-        break;
-    case 2:
-        rk4_rhs_accum(n, f1r, frhs);
-        rk4_f1_from_f0_f1(n, f0r, f1r, dT);
-        break;
-    default:
-        rk4_stage3(n, f0r, f1r, frhs, (1.0 / 6.0) * dT);
-        break;
-    }
-
-    return 0;
-}
-
-} // extern "C"
--- a/AMSS_NCKU_source/share_func.h
+++ b/AMSS_NCKU_source/share_func.h
@@ -1,246 +0,0 @@
-#ifndef SHARE_FUNC_H
-#define SHARE_FUNC_H
-
-#include <stdlib.h>
-#include <stddef.h>
-#include <math.h>
-#include <stdio.h>
-#include <string.h>
-/* 主网格：0-based -> 1D */
-static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
-    const int ex1 = ex[0], ex2 = ex[1];
-    return (size_t)i0 + (size_t)j0 * (size_t)ex1 + (size_t)k0 * (size_t)ex1 * (size_t)ex2;
-}
-
-/*
- * fh 对应 Fortran: fh(-1:ex1, -1:ex2, -1:ex3)
- * ord=2 => shift=1
- * iF/jF/kF 为 Fortran 索引（可为 -1,0,1..ex）
- */
-static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
-    const int shift = 1;
-    const int nx = ex[0] + 2;      // ex1 + ord
-    const int ny = ex[1] + 2;
-
-    const int ii = iF + shift;     // 0..ex1+1
-    const int jj = jF + shift;     // 0..ex2+1
-    const int kk = kF + shift;     // 0..ex3+1
-
-    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
-}
-
-/*
- * fh 对应 Fortran: fh(-2:ex1, -2:ex2, -2:ex3)
- * ord=3 => shift=2
- * iF/jF/kF 是 Fortran 索引（可为负）
- */
-static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
-    const int shift = 2;                 // ord=3 -> -2..ex
-    const int nx = ex[0] + 3;            // ex1 + ord
-    const int ny = ex[1] + 3;
-
-    const int ii = iF + shift;           // 0..ex1+2
-    const int jj = jF + shift;           // 0..ex2+2
-    const int kk = kF + shift;           // 0..ex3+2
-
-    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
-}
-
-/*
- * func:  (1..extc1, 1..extc2, 1..extc3)   1-based in Fortran
- * funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran
- *
- * C 里我们把：
- *   func  视为 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1
- *   funcc 用“平移下标”存为一维数组：
- *     iF in [-ord+1..extc1]  -> ii = iF + (ord-1)  in [0..extc1+ord-1]
- *     总长度 nx = extc1 + ord
- *     同理 ny = extc2 + ord, nz = extc3 + ord
- */
-
-static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
-    const int nx = extc[0], ny = extc[1];
-    return (size_t)i0 + (size_t)j0 * (size_t)nx + (size_t)k0 * (size_t)nx * (size_t)ny;
-}
-
-static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
-    const int shift = ord - 1;          // iF = -shift .. extc1
-    const int nx = extc[0] + ord;       // [-shift..extc1] 共 extc1+ord 个
-    const int ny = extc[1] + ord;
-
-    const int ii = iF + shift;          // 0..extc1+shift
-    const int jj = jF + shift;          // 0..extc2+shift
-    const int kk = kF + shift;          // 0..extc3+shift
-
-    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
-}
-
-/*
- * 等价于 Fortran:
- * funcc(1:extc1,1:extc2,1:extc3)=func
- * do i=0,ord-1
- *   funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
- * enddo
- * do i=0,ord-1
- *   funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
- * enddo
- * do i=0,ord-1
- *   funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
- * enddo
- */
-static inline void symmetry_bd_impl(int ord,
-                 int shift,
-                 const int extc[3],
-                 const double *__restrict func,
-                 double *__restrict funcc,
-                 const double SoA[3])
-{
-    const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
-    const int nx = extc1 + ord;
-    const int ny = extc2 + ord;
-
-    const size_t snx = (size_t)nx;
-    const size_t splane = (size_t)nx * (size_t)ny;
-    const size_t interior_i = (size_t)shift + 1u;          /* iF = 1 */
-    const size_t interior_j = ((size_t)shift + 1u) * snx;  /* jF = 1 */
-    const size_t interior_k = ((size_t)shift + 1u) * splane; /* kF = 1 */
-    const size_t interior0 = interior_k + interior_j + interior_i;
-
-    /* 1) funcc(1:extc1,1:extc2,1:extc3) = func */
-    for (int k0 = 0; k0 < extc3; ++k0) {
-        const double *src_k = func + (size_t)k0 * (size_t)extc2 * (size_t)extc1;
-        const size_t dst_k0 = interior0 + (size_t)k0 * splane;
-        for (int j0 = 0; j0 < extc2; ++j0) {
-            const double *src = src_k + (size_t)j0 * (size_t)extc1;
-            double *dst = funcc + dst_k0 + (size_t)j0 * snx;
-            memcpy(dst, src, (size_t)extc1 * sizeof(double));
-        }
-    }
-
-    /* 2) funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1) */
-    const double s1 = SoA[0];
-    if (s1 == 1.0) {
-        for (int ii = 0; ii < ord; ++ii) {
-            const size_t dst_i = (size_t)(shift - ii);
-            const size_t src_i = (size_t)(shift + ii + 1);
-            for (int k0 = 0; k0 < extc3; ++k0) {
-                const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
-                for (int j0 = 0; j0 < extc2; ++j0) {
-                    const size_t off = kbase + (size_t)j0 * snx;
-                    funcc[off + dst_i] = funcc[off + src_i];
-                }
-            }
-        }
-    } else if (s1 == -1.0) {
-        for (int ii = 0; ii < ord; ++ii) {
-            const size_t dst_i = (size_t)(shift - ii);
-            const size_t src_i = (size_t)(shift + ii + 1);
-            for (int k0 = 0; k0 < extc3; ++k0) {
-                const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
-                for (int j0 = 0; j0 < extc2; ++j0) {
-                    const size_t off = kbase + (size_t)j0 * snx;
-                    funcc[off + dst_i] = -funcc[off + src_i];
-                }
-            }
-        }
-    } else {
-        for (int ii = 0; ii < ord; ++ii) {
-            const size_t dst_i = (size_t)(shift - ii);
-            const size_t src_i = (size_t)(shift + ii + 1);
-            for (int k0 = 0; k0 < extc3; ++k0) {
-                const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
-                for (int j0 = 0; j0 < extc2; ++j0) {
-                    const size_t off = kbase + (size_t)j0 * snx;
-                    funcc[off + dst_i] = funcc[off + src_i] * s1;
-                }
-            }
-        }
-    }
-
-    /* 3) funcc(:,-j,1:extc3) = funcc(:,j+1,1:extc3)*SoA(2) */
-    const double s2 = SoA[1];
-    if (s2 == 1.0) {
-        for (int jj = 0; jj < ord; ++jj) {
-            const size_t dst_j = (size_t)(shift - jj) * snx;
-            const size_t src_j = (size_t)(shift + jj + 1) * snx;
-            for (int k0 = 0; k0 < extc3; ++k0) {
-                const size_t kbase = interior_k + (size_t)k0 * splane;
-                double *dst = funcc + kbase + dst_j;
-                const double *src = funcc + kbase + src_j;
-                for (int i = 0; i < nx; ++i) dst[i] = src[i];
-            }
-        }
-    } else if (s2 == -1.0) {
-        for (int jj = 0; jj < ord; ++jj) {
-            const size_t dst_j = (size_t)(shift - jj) * snx;
-            const size_t src_j = (size_t)(shift + jj + 1) * snx;
-            for (int k0 = 0; k0 < extc3; ++k0) {
-                const size_t kbase = interior_k + (size_t)k0 * splane;
-                double *dst = funcc + kbase + dst_j;
-                const double *src = funcc + kbase + src_j;
-                for (int i = 0; i < nx; ++i) dst[i] = -src[i];
-            }
-        }
-    } else {
-        for (int jj = 0; jj < ord; ++jj) {
-            const size_t dst_j = (size_t)(shift - jj) * snx;
-            const size_t src_j = (size_t)(shift + jj + 1) * snx;
-            for (int k0 = 0; k0 < extc3; ++k0) {
-                const size_t kbase = interior_k + (size_t)k0 * splane;
-                double *dst = funcc + kbase + dst_j;
-                const double *src = funcc + kbase + src_j;
-                for (int i = 0; i < nx; ++i) dst[i] = src[i] * s2;
-            }
-        }
-    }
-
-    /* 4) funcc(:,:,-k) = funcc(:,:,k+1)*SoA(3) */
-    const double s3 = SoA[2];
-    if (s3 == 1.0) {
-        for (int kk = 0; kk < ord; ++kk) {
-            const size_t dst_k = (size_t)(shift - kk) * splane;
-            const size_t src_k = (size_t)(shift + kk + 1) * splane;
-            double *dst = funcc + dst_k;
-            const double *src = funcc + src_k;
-            for (size_t p = 0; p < splane; ++p) dst[p] = src[p];
-        }
-    } else if (s3 == -1.0) {
-        for (int kk = 0; kk < ord; ++kk) {
-            const size_t dst_k = (size_t)(shift - kk) * splane;
-            const size_t src_k = (size_t)(shift + kk + 1) * splane;
-            double *dst = funcc + dst_k;
-            const double *src = funcc + src_k;
-            for (size_t p = 0; p < splane; ++p) dst[p] = -src[p];
-        }
-    } else {
-        for (int kk = 0; kk < ord; ++kk) {
-            const size_t dst_k = (size_t)(shift - kk) * splane;
-            const size_t src_k = (size_t)(shift + kk + 1) * splane;
-            double *dst = funcc + dst_k;
-            const double *src = funcc + src_k;
-            for (size_t p = 0; p < splane; ++p) dst[p] = src[p] * s3;
-        }
-    }
-}
-
-static inline void symmetry_bd(int ord,
-                 const int extc[3],
-                 const double *func,
-                 double *funcc,
-                 const double SoA[3])
-{
-    if (ord <= 0) return;
-
-    /* Fast paths used by current C kernels: ord=2 (derivs), ord=3 (lopsided/KO). */
-    if (ord == 2) {
-        symmetry_bd_impl(2, 1, extc, func, funcc, SoA);
-        return;
-    }
-    if (ord == 3) {
-        symmetry_bd_impl(3, 2, extc, func, funcc, SoA);
-        return;
-    }
-
-    symmetry_bd_impl(ord, ord - 1, extc, func, funcc, SoA);
-}
-#endif
--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
@@ -2653,6 +2653,7 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var

  // we have assumed there is only one box on this level,
  // so we do not need loop boxes
+
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Comm_here);

  double Mass_out = 0;
--- a/AMSS_NCKU_source/xh_bssn_rhs.C
+++ b/AMSS_NCKU_source/xh_bssn_rhs.C
--- a/AMSS_NCKU_source/xh_bssn_rhs_compute.h
+++ b/AMSS_NCKU_source/xh_bssn_rhs_compute.h
@@ -0,0 +1,30 @@
+#include "xh_tool.h"
+
+
+extern "C"
+{
+int f_compute_rhs_bssn_xh(int *ex, double &T, 
+                       double *X, double *Y, double *Z,
+                       double *chi, double *trK,
+                       double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
+                       double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
+                       double *Gamx, double *Gamy, double *Gamz,
+                       double *Lap, double *betax, double *betay, double *betaz,
+                       double *dtSfx, double *dtSfy, double *dtSfz,
+                       double *chi_rhs, double *trK_rhs,
+                       double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
+                       double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
+                       double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
+                       double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
+                       double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
+                       double *rho, double *Sx, double *Sy, double *Sz,
+                       double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
+                       double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
+                       double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
+                       double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
+                       double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
+                       double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
+                       double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
+                       int &Symmetry, int &Lev, double &eps, int &co
+                       ); 
+}
--- a/AMSS_NCKU_source/xh_fdderivs.C
+++ b/AMSS_NCKU_source/xh_fdderivs.C
@@ -0,0 +1,311 @@
+#include "xh_tool.h"
+void fdderivs(const int ex[3],                        // grid extents {ex1, ex2, ex3}
+              const double *f,                         // input field, read through idx_func0
+              double *fxx, double *fxy, double *fxz,   // outputs: second derivatives, written through idx_ex
+              double *fyy, double *fyz, double *fzz,
+              const double *X, const double *Y, const double *Z,  // coordinates; spacing is element[1] - element[0]
+              double SYM1, double SYM2, double SYM3,   // factors applied when mirroring f into the x/y/z ghost layers
+              int Symmetry, int onoff)                 // Symmetry decides which lower faces may use ghosts; onoff is unused
+{
+    (void)onoff;
+    const int NO_SYMM = 0, EQ_SYMM = 1;
+    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
+    const double F1o4   = 2.5e-1;          // 1/4
+    const double F8     = 8.0;
+    const double F16    = 16.0;
+    const double F30    = 30.0;
+    const double F1o12  = ONE / 12.0;
+    const double F1o144 = ONE / 144.0;
+
+    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
+
+    const double dX = X[1] - X[0];
+    const double dY = Y[1] - Y[0];
+    const double dZ = Z[1] - Z[0];
+
+    const int imaxF = ex1;
+    const int jmaxF = ex2;
+    const int kmaxF = ex3;
+
+    int iminF = 1, jminF = 1, kminF = 1;
+    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
+    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
+    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
+    // A lower bound of -1 lets the stencils read the mirrored ghost layers (Fortran indices 0 and -1).
+
+    /* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
+    const size_t nx = (size_t)ex1 + 2;
+    const size_t ny = (size_t)ex2 + 2;
+    const size_t nz = (size_t)ex3 + 2;
+    const size_t fh_size = nx * ny * nz;
+
+    /* Coefficients, following the original Fortran expressions */
+    const double Sdxdx = ONE / (dX * dX);
+    const double Sdydy = ONE / (dY * dY);
+    const double Sdzdz = ONE / (dZ * dZ);
+
+    const double Fdxdx = F1o12 / (dX * dX);
+    const double Fdydy = F1o12 / (dY * dY);
+    const double Fdzdz = F1o12 / (dZ * dZ);
+
+    const double Sdxdy = F1o4 / (dX * dY);
+    const double Sdxdz = F1o4 / (dX * dZ);
+    const double Sdydz = F1o4 / (dY * dZ);
+
+    const double Fdxdy = F1o144 / (dX * dY);
+    const double Fdxdz = F1o144 / (dX * dZ);
+    const double Fdydz = F1o144 / (dY * dZ);
+
+    static thread_local double *fh = NULL;   // scratch copy of f plus ghost layers, reused across calls on this thread
+    static thread_local size_t cap = 0;      // number of doubles fh can currently hold
+
+    if (fh_size > cap) {
+        free(fh);
+        fh = (double*)aligned_alloc(64, (fh_size * sizeof(double) + 63) & ~(size_t)63);  // size must be a multiple of the alignment
+        cap = fh ? fh_size : 0;  // after a failed allocation forget the capacity so the next call retries
+    }
+    // Allocation failed: return without touching the outputs.
+    if (!fh) return;
+
+    // Open-coded in place of the call symmetry_bd(2, ex, f, fh, SoA).
+    const double SoA[3] = { SYM1, SYM2, SYM3 };
+    // 1) interior: funcc(1:extc1, 1:extc2, 1:extc3) = func
+    for (int k0 = 0; k0 < ex[2]; ++k0) {
+        for (int j0 = 0; j0 < ex[1]; ++j0) {
+            for (int i0 = 0; i0 < ex[0]; ++i0) {
+                const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; 
+                fh[idx_funcc_F(iF, jF, kF, 2, ex)] = f[idx_func0(i0, j0, k0, ex)];
+            }
+        }
+    }
+
+    // 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
+    for (int ii = 0; ii <= 2 - 1; ++ii) {
+        const int iF_dst = -ii;       // 0, -1
+        const int iF_src = ii + 1;    // 1, 2
+        for (int kF = 1; kF <= ex[2]; ++kF) {
+            for (int jF = 1; jF <= ex[1]; ++jF) {
+                fh[idx_funcc_F(iF_dst, jF, kF, 2, ex)] = 
+                    fh[idx_funcc_F(iF_src, jF, kF, 2, ex)] * SoA[0];
+            }
+        }
+    }
+
+    // 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
+    // Note: the Fortran ":" here spans the whole i range (-ord+1..extc1)
+    for (int jj = 0; jj <= 2 - 1; ++jj) {
+        const int jF_dst = -jj;
+        const int jF_src = jj + 1;
+        for (int kF = 1; kF <= ex[2]; ++kF) {
+            for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
+                fh[idx_funcc_F(iF, jF_dst, kF, 2, ex)] =
+                    fh[idx_funcc_F(iF, jF_src, kF, 2, ex)] * SoA[1];
+            }
+        }
+    }
+
+    // 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
+    for (int kk = 0; kk <= 2 - 1; ++kk) {
+        const int kF_dst = -kk;
+        const int kF_src = kk + 1;
+        for (int jF = -2 + 1; jF <= ex[1]; ++jF) {
+            for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
+                fh[idx_funcc_F(iF, jF, kF_dst, 2, ex)] =
+                    fh[idx_funcc_F(iF, jF, kF_src, 2, ex)] * SoA[2];
+            }
+        }
+    }
+    /* Zero every output up front: the stencil loops below never visit the last plane in each direction. */
+    const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
+    for (size_t p = 0; p < all; ++p) {
+        fxx[p] = ZEO; fyy[p] = ZEO; fzz[p] = ZEO;
+        fxy[p] = ZEO; fxz[p] = ZEO; fyz[p] = ZEO;
+    }
+
+    /*
+     * Fortran:
+     * do k=1,ex3-1
+     * do j=1,ex2-1
+     * do i=1,ex1-1
+     */
+
+    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
+        const int kF = k0 + 1;
+        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
+            const int jF = j0 + 1;
+            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
+                const int iF = i0 + 1;
+                const size_t p = idx_ex(i0, j0, k0, ex);
+
+                /* High-order branch: i±2, j±2, k±2 are all in range */
+                if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
+                    (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
+                    (kF + 2) <= kmaxF && (kF - 2) >= kminF)
+                {
+                    fxx[p] = Fdxdx * (
+                        -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
+                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
+                    );
+
+                    fyy[p] = Fdydy * (
+                        -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
+                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
+                    );
+
+                    fzz[p] = Fdzdz * (
+                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
+                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
+                    );
+
+                    /* fxy, high order: keeps the Fortran parenthesisation exactly */
+                    {
+                        const double t_jm2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
+
+                        const double t_jm1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
+
+                        const double t_jp1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
+
+                        const double t_jp2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
+
+                        fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
+                    }
+
+                    /* fxz, high order */
+                    {
+                        const double t_km2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
+
+                        const double t_km1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
+
+                        const double t_kp1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
+
+                        const double t_kp2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
+
+                        fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
+                    }
+
+                    /* fyz, high order */
+                    {
+                        const double t_km2 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
+
+                        const double t_km1 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
+
+                        const double t_kp1 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
+
+                        const double t_kp2 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
+
+                        fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
+                    }
+                }
+                /* Second-order branch: i±1, j±1, k±1 are in range */
+                else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
+                         (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
+                         (kF + 1) <= kmaxF && (kF - 1) >= kminF)
+                {
+                    fxx[p] = Sdxdx * (
+                        fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
+                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
+                        fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
+                    );
+
+                    fyy[p] = Sdydy * (
+                        fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
+                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
+                        fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
+                    );
+
+                    fzz[p] = Sdzdz * (
+                        fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
+                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
+                        fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
+                    );
+
+                    fxy[p] = Sdxdy * (
+                        fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
+                        fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
+                        fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
+                        fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
+                    );
+
+                    fxz[p] = Sdxdz * (
+                        fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
+                        fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
+                        fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
+                        fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
+                    );
+
+                    fyz[p] = Sdydz * (
+                        fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
+                        fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
+                        fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
+                        fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
+                    );
+                }else{  // neither stencil fits; already zero from the initial clear, kept explicit
+                    fxx[p] = 0.0;
+                    fyy[p] = 0.0;
+                    fzz[p] = 0.0;
+                    fxy[p] = 0.0;
+                    fxz[p] = 0.0;
+                    fyz[p] = 0.0;
+                }
+            }
+        }
+    }
+    // fh is not freed here; it stays cached for later calls on this thread.
+}
--- a/AMSS_NCKU_source/xh_fderivs.C
+++ b/AMSS_NCKU_source/xh_fderivs.C
@@ -0,0 +1,145 @@
+#include "xh_tool.h"
+
+/*
+ * C port of fderivs: first derivatives fx, fy, fz of f by centered differences.
+ *
+ * Fortran:
+ * subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
+ *
+ * Conventions:
+ *   f, fx, fy, fz: ex1*ex2*ex3 values each, indexed through idx_ex
+ *   X: ex1, Y: ex2, Z: ex3 entries
+ */
+void fderivs(const int ex[3],
+             const double *f,
+             double *fx, double *fy, double *fz,
+             const double *X, const double *Y, const double *Z,
+             double SYM1, double SYM2, double SYM3,
+             int Symmetry, int onoff)
+{
+    (void)onoff; // unused in the Fortran original as well
+
+    const double ZEO = 0.0, ONE = 1.0;
+    const double TWO = 2.0, EIT = 8.0;
+    const double F12 = 12.0;
+
+    const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 is not referenced directly in this routine
+
+    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
+
+    // dX = X(2)-X(1) -> C: X[1]-X[0]
+    const double dX = X[1] - X[0];
+    const double dY = Y[1] - Y[0];
+    const double dZ = Z[1] - Z[0];
+
+    int iminF = 1, jminF = 1, kminF = 1;
+    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
+    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
+    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
+
+    // SoA(1:3) = SYM1,SYM2,SYM3
+    const double SoA[3] = { SYM1, SYM2, SYM3 };
+
+    // fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2
+    const size_t nx = (size_t)ex1 + 2;
+    const size_t ny = (size_t)ex2 + 2;
+    const size_t nz = (size_t)ex3 + 2;
+    const size_t fh_size = nx * ny * nz;
+    static thread_local double *fh = NULL;   // scratch copy of f plus ghost layers, reused across calls on this thread
+    static thread_local size_t cap = 0;      // number of doubles fh can currently hold
+
+    if (fh_size > cap) {
+        free(fh);
+        fh = (double*)aligned_alloc(64, (fh_size * sizeof(double) + 63) & ~(size_t)63);  // size must be a multiple of the alignment
+        cap = fh ? fh_size : 0;  // after a failed allocation forget the capacity so the next call retries
+    }
+    // Allocation failed: return without touching the outputs.
+    if (!fh) return;
+
+    // call symmetry_bd(2,ex,f,fh,SoA)
+    symmetry_bd(2, ex, f, fh, SoA);
+
+    const double d12dx = ONE / F12 / dX;
+    const double d12dy = ONE / F12 / dY;
+    const double d12dz = ONE / F12 / dZ;
+
+    const double d2dx  = ONE / TWO / dX;
+    const double d2dy  = ONE / TWO / dY;
+    const double d2dz  = ONE / TWO / dZ;
+
+    // fx = fy = fz = 0 (points that no stencil below reaches keep this value)
+    const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
+    for (size_t p = 0; p < all; ++p) {
+        fx[p] = ZEO;
+        fy[p] = ZEO;
+        fz[p] = ZEO;
+    }
+
+    /*
+     * Fortran loops:
+     * do k=1,ex3-1
+     * do j=1,ex2-1
+     * do i=1,ex1-1
+     *
+     * C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
+     */
+    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
+        const int kF = k0 + 1;
+        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
+            const int jF = j0 + 1;
+            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
+                const int iF = i0 + 1;
+                const size_t p = idx_ex(i0, j0, k0, ex);
+
+                // if(i+2 <= imax .and. i-2 >= imin ... )  (all Fortran indices): five-point stencil
+                if ((iF + 2) <= ex1 && (iF - 2) >= iminF &&
+                    (jF + 2) <= ex2 && (jF - 2) >= jminF &&
+                    (kF + 2) <= ex3 && (kF - 2) >= kminF)
+                {
+                    fx[p] = d12dx * (
+                        fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] -
+                        EIT * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
+                        EIT * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)]
+                    );
+
+                    fy[p] = d12dy * (
+                        fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] -
+                        EIT * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
+                        EIT * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)]
+                    );
+
+                    fz[p] = d12dz * (
+                        fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] -
+                        EIT * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
+                        EIT * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)] -
+                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)]
+                    );
+                }
+                // elseif(i+1 <= imax .and. i-1 >= imin ...): three-point stencil
+                else if ((iF + 1) <= ex1 && (iF - 1) >= iminF &&
+                         (jF + 1) <= ex2 && (jF - 1) >= jminF &&
+                         (kF + 1) <= ex3 && (kF - 1) >= kminF)
+                {
+                    fx[p] = d2dx * (
+                        -fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
+                         fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
+                    );
+
+                    fy[p] = d2dy * (
+                        -fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
+                         fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
+                    );
+
+                    fz[p] = d2dz * (
+                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
+                         fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
+                    );
+                }
+            }
+        }
+    }
+
+    // fh is not freed here; it stays cached for later calls on this thread.
+}
--- a/AMSS_NCKU_source/xh_global_interp.C
+++ b/AMSS_NCKU_source/xh_global_interp.C
@@ -0,0 +1,143 @@
+#include "xh_global_interp.h"
+
+/* The existing polin3 (obtained from the earlier Fortran->C translation) */
+// void polin3(const double *x1a, const double *x2a, const double *x3a,
+//             const double *ya, double x1, double x2, double x3,
+//             double *y, double *dy, int ordn);
+
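+/*
+  An implementation of decide3d must be provided (it is only declared here).
+  Fortran: decide3d(ex,f,f,cxB,cxT,SoA,ya,ORDN,Symmetry)
+  - ex: [3]
+  - f: 3-D field (column-major)
+  - cxB/cxT: start/end of the 3-D window (Fortran 1-based, may be <= 0)
+  - SoA: [3]
+  - ya: output sample block of ORDN^3 values (column-major)
+  - return: 0 on success; non-zero on error (corresponds to Fortran logical = .true.)
+*/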
+// int xh_decide3d(const int ex[3],
+//              const double *f_in,
+//              const double *f_in2,   /* the Fortran call passes f,f; kept as-is */
+//              const int cxB[3],
+//              const int cxT[3],
+//              const double SoA[3],
+//              double *ya,
+//              int ordn,
+//              int symmetry);
+
+/* Read X at a Fortran 1-based index idxF (used for expressions such as X(2-cxB)); idxF < 1 would index before X[0] */
+static inline double X_at_FortranIndex(const double *X, int idxF) {
+    /* Fortran X(1) corresponds to C X[0] */
+    return X[idxF - 1];
+}
+
+/* Fortran-style integer truncation: idint is implemented here as an (int) cast (equals floor for positive values) */
+static inline int idint_like(double a) {
+    return (int)a;  /* trunc toward zero */
+}
+
+/* C version of global_interp: builds an ORDN^3 sample window of f around (x1,y1,z1) via xh_decide3d and lets xh_polin3 write the result to f_int */
+void xh_global_interp(const int ex[3],
+                   const double *X, const double *Y, const double *Z,
+                   const double *f,                 /* f(ex1,ex2,ex3) column-major */
+                   double &f_int,
+                   double x1, double y1, double z1,
+                   int ORDN,
+                   const double SoA[3],
+                   int symmetry)
+{
+    // double time1, time2;
+    // time1 = omp_get_wtime();
+    enum { NO_SYMM = 0, EQUATORIAL = 1, OCTANT = 2 };
+
+    int j, m;
+    int imin, jmin, kmin;
+    int cxB[3], cxT[3], cxI[3], cmin[3], cmax[3];
+    double cx[3];
+    double dX, dY, dZ, ddy;
+
+    /* Fortran: imin=lbound(f,1) ...; normally 1, so it is taken as 1 here */
+    imin = 1; jmin = 1; kmin = 1;
+
+    dX = X_at_FortranIndex(X, imin + 1) - X_at_FortranIndex(X, imin);
+    dY = X_at_FortranIndex(Y, jmin + 1) - X_at_FortranIndex(Y, jmin);
+    dZ = X_at_FortranIndex(Z, kmin + 1) - X_at_FortranIndex(Z, kmin);
+
+    /* x1a(j) = (j-1)*1.0  (j=1..ORDN) */
+    double *x1a = (double*)malloc((size_t)ORDN * sizeof(double));
+    double *ya  = (double*)malloc((size_t)ORDN * (size_t)ORDN * (size_t)ORDN * sizeof(double));
+    if (!x1a || !ya) {
+        fprintf(stderr, "global_interp: malloc failed\n");
+        exit(1);
+    }
+    for (j = 0; j < ORDN; j++) x1a[j] = (double)j;
+
+    /* cxI(m) = idint((p - P(1))/dP + 0.4) + 1  (Fortran 1-based) */
+    cxI[0] = idint_like((x1 - X_at_FortranIndex(X, 1)) / dX + 0.4) + 1;
+    cxI[1] = idint_like((y1 - X_at_FortranIndex(Y, 1)) / dY + 0.4) + 1;
+    cxI[2] = idint_like((z1 - X_at_FortranIndex(Z, 1)) / dZ + 0.4) + 1;
+
+    /* cxB = cxI - ORDN/2 + 1 ; cxT = cxB + ORDN - 1 */
+    int half = ORDN / 2;  /* integer division, as in Fortran */
+    for (m = 0; m < 3; m++) {
+        cxB[m] = cxI[m] - half + 1;
+        cxT[m] = cxB[m] + ORDN - 1;
+    }
+
+    /* cmin=1; cmax=ex */
+    cmin[0] = cmin[1] = cmin[2] = 1;
+    cmax[0] = ex[0];
+    cmax[1] = ex[1];
+    cmax[2] = ex[2];
+
+    /* On a symmetry boundary cxB is allowed to be negative or 0 (same as the Fortran code) */
+    if (symmetry == OCTANT && fabs(X_at_FortranIndex(X, 1)) < dX) cmin[0] = -half + 2;
+    if (symmetry == OCTANT && fabs(X_at_FortranIndex(Y, 1)) < dY) cmin[1] = -half + 2;
+    if (symmetry != NO_SYMM && fabs(X_at_FortranIndex(Z, 1)) < dZ) cmin[2] = -half + 2;
+
+    /* Clamp the window [cxB,cxT] to [cmin,cmax] */
+    for (m = 0; m < 3; m++) {
+        if (cxB[m] < cmin[m]) {
+            cxB[m] = cmin[m];
+            cxT[m] = cxB[m] + ORDN - 1;
+        }
+        if (cxT[m] > cmax[m]) {
+            cxT[m] = cmax[m];
+            cxB[m] = cxT[m] + 1 - ORDN;
+        }
+    }
+
+    /*
+      Computing cx(m): if cxB>0:
+        cx = (p - P(cxB))/dP
+      else:
+        cx = (p + P(2 - cxB))/dP
+      Note: cxB is an integer with Fortran 1-based meaning and may be <= 0.
+    */
+    if (cxB[0] > 0) cx[0] = (x1 - X_at_FortranIndex(X, cxB[0])) / dX;
+    else           cx[0] = (x1 + X_at_FortranIndex(X, 2 - cxB[0])) / dX;
+
+    if (cxB[1] > 0) cx[1] = (y1 - X_at_FortranIndex(Y, cxB[1])) / dY;
+    else           cx[1] = (y1 + X_at_FortranIndex(Y, 2 - cxB[1])) / dY;
+
+    if (cxB[2] > 0) cx[2] = (z1 - X_at_FortranIndex(Z, cxB[2])) / dZ;
+    else           cx[2] = (z1 + X_at_FortranIndex(Z, 2 - cxB[2])) / dZ;
+
+    /* decide3d: fills ya(1:ORDN,1:ORDN,1:ORDN); a non-zero return aborts the program below */
+    if (xh_decide3d(ex, f, f, cxB, cxT, SoA, ya, ORDN, symmetry)) {
+        printf("global_interp position: %g %g %g\n", x1, y1, z1);
+        printf("data range: %g %g   %g %g   %g %g\n",
+               X_at_FortranIndex(X, 1), X_at_FortranIndex(X, ex[0]),
+               X_at_FortranIndex(Y, 1), X_at_FortranIndex(Y, ex[1]),
+               X_at_FortranIndex(Z, 1), X_at_FortranIndex(Z, ex[2]));
+        exit(1);
+    }
+
+    /* polin3(x1a,x1a,x1a,ya,cx(1),cx(2),cx(3),f_int,ddy,ORDN) */
+    xh_polin3(x1a, x1a, x1a, ya, cx[0], cx[1], cx[2], f_int, &ddy, ORDN);
+ 
+    free(x1a);
+    free(ya);
+    // time2 = omp_get_wtime();
+    // printf("Time for global_interp: %lf seconds\n", time2 - time1);
+}
--- a/AMSS_NCKU_source/xh_global_interp.h
+++ b/AMSS_NCKU_source/xh_global_interp.h
@@ -0,0 +1,12 @@
+#include "xh_po.h"
+
+extern "C"{
+    void xh_global_interp(const int ex[3],
+                    const double *X, const double *Y, const double *Z,
+                    const double *f,                 /* f(ex1,ex2,ex3) column-major */
+                    double &f_int,
+                    double x1, double y1, double z1,
+                    int ORDN,
+                    const double SoA[3],
+                    int symmetry);
+}
--- a/AMSS_NCKU_source/xh_kodiss.C
+++ b/AMSS_NCKU_source/xh_kodiss.C
@@ -0,0 +1,116 @@
+#include "xh_tool.h"
+
+/*
+ * C version of kodis: adds a 7-point dissipation term, scaled by eps/64, to f_rhs.
+ *
+ * Fortran signature:
+ * subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps)
+ *
+ * Conventions:
+ *   X: ex1, Y: ex2, Z: ex3
+ *   f, f_rhs: ex1*ex2*ex3 values in idx_ex layout
+ *   SoA[3]
+ *   eps: double
+ */
+void kodis(const int ex[3],
+           const double *X, const double *Y, const double *Z,
+           const double *f, double *f_rhs,
+           const double SoA[3],
+           int Symmetry, double eps)
+{
+    const double ONE = 1.0, SIX = 6.0, FIT = 15.0, TWT = 20.0;
+    const double cof = 64.0;             // 2^6
+    const int NO_SYMM = 0, OCTANT = 2;
+
+    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
+
+    // Fortran: dX = X(2)-X(1) -> C: X[1]-X[0]
+    const double dX = X[1] - X[0];
+    const double dY = Y[1] - Y[0];
+    const double dZ = Z[1] - Z[0];
+    (void)ONE; // ONE is only a parameter in the original Fortran; it is not needed here
+
+    // Fortran: imax=ex(1) etc. are 1-based upper bounds
+    const int imaxF = ex1;
+    const int jmaxF = ex2;
+    const int kmaxF = ex3;
+
+    // Fortran: imin=jmin=kmin=1; set to -2 in some symmetric cases
+    int iminF = 1, jminF = 1, kminF = 1;
+
+    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
+    if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2;
+    if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2;
+
+    // fh scratch buffer: (ex1+3)*(ex2+3)*(ex3+3) doubles (ord=3), kept per thread and only ever grown
+    const size_t nx = (size_t)ex1 + 3;
+    const size_t ny = (size_t)ex2 + 3;
+    const size_t nz = (size_t)ex3 + 3;
+    const size_t fh_size = nx * ny * nz;
+
+    static thread_local double *fh = NULL;
+    static thread_local size_t cap = 0;
+
+    if (fh_size > cap) {
+        free(fh);
+        fh = (double*)aligned_alloc(64, (fh_size * sizeof(double) + 63) / 64 * 64); // size rounded up to a multiple of the alignment
+        cap = fh ? fh_size : 0;   // a failed allocation must not be recorded as capacity, so the next call retries
+    }
+    if (!fh) return;              // out of memory: f_rhs is left untouched for this call
+
+    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
+    symmetry_bd(3, ex, f, fh, SoA);
+
+    /*
+     * Fortran loops:
+     * do k=1,ex3
+     * do j=1,ex2
+     * do i=1,ex1
+     *
+     * C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
+     * with the Fortran indices defined as iF=i0+1, ...
+     */
+    for (int k0 = 0; k0 < ex3; ++k0) {
+        const int kF = k0 + 1;
+        for (int j0 = 0; j0 < ex2; ++j0) {
+            const int jF = j0 + 1;
+            for (int i0 = 0; i0 < ex1; ++i0) {
+                const int iF = i0 + 1;
+
+                // Fortran if-condition:
+                // i-3 >= imin .and. i+3 <= imax  etc. (all Fortran indices)
+                if ((iF - 3) >= iminF && (iF + 3) <= imaxF &&
+                    (jF - 3) >= jminF && (jF + 3) <= jmaxF &&
+                    (kF - 3) >= kminF && (kF + 3) <= kmaxF)
+                {
+                    const size_t p = idx_ex(i0, j0, k0, ex);
+
+                    // Same symmetric 7-point combination (1,-6,15,-20,15,-6,1) in each of the three directions
+                    const double Dx_term =
+                        ( (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
+                          SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
+                          FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
+                          TWT *  fh[idx_fh_F(iF    , jF, kF, ex)] ) / dX;
+
+                    const double Dy_term =
+                        ( (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
+                          SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
+                          FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
+                          TWT *  fh[idx_fh_F(iF, jF    , kF, ex)] ) / dY;
+
+                    const double Dz_term =
+                        ( (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
+                          SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
+                          FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
+                          TWT *  fh[idx_fh_F(iF, jF, kF    , ex)] ) / dZ;
+
+                    // Fortran:
+                    // f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
+                    f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
+                }
+            }
+        }
+    }
+
+    // free(fh);
+}
--- a/AMSS_NCKU_source/lopsided_kodis_c.C
+++ b/AMSS_NCKU_source/lopsided_kodis_c.C
@@ -1,25 +1,32 @@
-#include "tool.h"
-
+#include "xh_tool.h"
 /*
- * Combined advection (lopsided) + KO dissipation (kodis).
- * Uses one shared symmetry_bd buffer per call.
+ * A C version of symmetry_bd (or a Fortran routine bound to C) must be provided.
+ * Fortran: call symmetry_bd(3,ex,f,fh,SoA)
+ *
+ * Conventions:
+ *   nghost = 3
+ *   ex[3]  = {ex1,ex2,ex3}
+ *   f      = original grid (ex1*ex2*ex3)
+ *   fh     = extended grid ((ex1+3)*(ex2+3)*(ex3+3)), matching Fortran (-2:ex1, ...)
+ *   SoA[3] = input parameter
 */
-void lopsided_kodis(const int ex[3],
-                    const double *X, const double *Y, const double *Z,
-                    const double *f, double *f_rhs,
-                    const double *Sfx, const double *Sfy, const double *Sfz,
-                    int Symmetry, const double SoA[3], double eps)
+void lopsided(const int ex[3],
+              const double *X, const double *Y, const double *Z,
+              const double *f, double *f_rhs,
+              const double *Sfx, const double *Sfy, const double *Sfz,
+              int Symmetry, const double SoA[3])
 {
    const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
-    const double F6 = 6.0, F18 = 18.0;
+    const double TWO = 2.0, F6 = 6.0, F18 = 18.0;
    const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
-    const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
-    const double cof = 64.0; // 2^6

-    const int NO_SYMM = 0, EQ_SYMM = 1;
+    const int NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2;
+    (void)OCTANT; // 这里和 Fortran 一样只是定义了不用也没关系

    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];

+    // 对应 Fortran: dX = X(2)-X(1)  （Fortran 1-based）
+    // C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
@@ -28,37 +35,70 @@ void lopsided_kodis(const int ex[3],
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;

+    // Fortran 里算了 d2dx/d2dy/d2dz 但本 subroutine 里没用到（保持一致也算出来）
+    const double d2dx  = ONE / TWO / dX;
+    const double d2dy  = ONE / TWO / dY;
+    const double d2dz  = ONE / TWO / dZ;
+    (void)d2dx; (void)d2dy; (void)d2dz;
+
+    // Fortran:
+    // imax = ex(1); jmax = ex(2); kmax = ex(3)
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;

+    // Fortran:
+    // imin=jmin=kmin=1; 若满足对称条件则设为 -2
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;

-    // fh for Fortran-style domain (-2:ex1,-2:ex2,-2:ex3)
+    // 分配 fh：大小 (ex1+3)*(ex2+3)*(ex3+3)
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;

-    double *fh = (double*)malloc(fh_size * sizeof(double));
-    if (!fh) return;
+    static thread_local double *fh = NULL;
+    static thread_local size_t cap = 0;

+    if (fh_size > cap) {
+        free(fh);
+        fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
+        cap = fh_size;
+    }
+    if (!fh) return; // 内存不足：直接返回（你也可以改成 abort/报错）
+
+    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
    symmetry_bd(3, ex, f, fh, SoA);

-    // Advection (same stencil logic as lopsided_c.C)
+    /*
+     * Fortran 主循环：
+     * do k=1,ex(3)-1
+     * do j=1,ex(2)-1
+     * do i=1,ex(1)-1
+     *
+     * 转成 C 0-based：
+     * k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2
+     *
+     * 并且 Fortran 里的 i/j/k 在 fh 访问时，仍然是 Fortran 索引值：
+     * iF=i0+1, jF=j0+1, kF=k0+1
+     */
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
+
                const size_t p = idx_ex(i0, j0, k0, ex);

+                // ---------------- x direction ----------------
                const double sfx = Sfx[p];
                if (sfx > ZEO) {
+                    // Fortran: if(i+3 <= imax)
+                    // iF+3 <= ex1  <=> i0+4 <= ex1 <=> i0 <= ex1-4
                    if (i0 <= ex1 - 4) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
@@ -66,13 +106,17 @@ void lopsided_kodis(const int ex[3],
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
-                    } else if (i0 <= ex1 - 3) {
+                    }
+                    // elseif(i+2 <= imax)  <=> i0 <= ex1-3
+                    else if (i0 <= ex1 - 3) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
-                    } else if (i0 <= ex1 - 2) {
+                    }
+                    // elseif(i+1 <= imax)  <=> i0 <= ex1-2（循环里总成立）
+                    else if (i0 <= ex1 - 2) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
@@ -81,6 +125,8 @@ void lopsided_kodis(const int ex[3],
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                } else if (sfx < ZEO) {
+                    // Fortran: if(i-3 >= imin)
+                    // (iF-3) >= iminF  <=> (i0-2) >= iminF
                    if ((i0 - 2) >= iminF) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
@@ -88,13 +134,17 @@ void lopsided_kodis(const int ex[3],
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
-                    } else if ((i0 - 1) >= iminF) {
+                    }
+                    // elseif(i-2 >= imin) <=> (i0-1) >= iminF
+                    else if ((i0 - 1) >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
-                    } else if (i0 >= iminF) {
+                    }
+                    // elseif(i-1 >= imin) <=> i0 >= iminF
+                    else if (i0 >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
@@ -104,8 +154,10 @@ void lopsided_kodis(const int ex[3],
                    }
                }

+                // ---------------- y direction ----------------
                const double sfy = Sfy[p];
                if (sfy > ZEO) {
+                    // jF+3 <= ex2 <=> j0+4 <= ex2 <=> j0 <= ex2-4
                    if (j0 <= ex2 - 4) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
@@ -151,6 +203,7 @@ void lopsided_kodis(const int ex[3],
                    }
                }

+                // ---------------- z direction ----------------
                const double sfz = Sfz[p];
                if (sfz > ZEO) {
                    if (k0 <= ex3 - 4) {
@@ -200,49 +253,10 @@ void lopsided_kodis(const int ex[3],
            }
        }
    }
-
-    // KO dissipation (same domain restriction as kodiss_c.C)
-    if (eps > ZEO) {
-        const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
-        const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
-        const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
-        const int i0_hi = imaxF - 4; // inclusive
-        const int j0_hi = jmaxF - 4;
-        const int k0_hi = kmaxF - 4;
-
-        if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
-            for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
-                const int kF = k0 + 1;
-                for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
-                    const int jF = j0 + 1;
-                    for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
-                        const int iF = i0 + 1;
-                        const size_t p = idx_ex(i0, j0, k0, ex);
-
-                        const double Dx_term =
-                            ((fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
-                             SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
-                             FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
-                             TWT *  fh[idx_fh_F(iF,     jF, kF, ex)]) / dX;
-
-                        const double Dy_term =
-                            ((fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
-                             SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
-                             FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
-                             TWT *  fh[idx_fh_F(iF, jF,     kF, ex)]) / dY;
-
-                        const double Dz_term =
-                            ((fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
-                             SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
-                             FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
-                             TWT *  fh[idx_fh_F(iF, jF, kF,     ex)]) / dZ;
-
-                        f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
-                    }
-                }
-            }
-        }
-    }
-
-    free(fh);
+    // free(fh);
 }
+
+
+
+
+
--- a/AMSS_NCKU_source/xh_po.h
+++ b/AMSS_NCKU_source/xh_po.h
@@ -0,0 +1,19 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <omp.h>
+int xh_decide3d(const int ex[3],
+             const double *f,
+             const double *fpi,   /* unused here; the Fortran version does not use it either */
+             const int cxB[3],
+             const int cxT[3],
+             const double SoA[3],
+             double *ya,
+             int ordn,
+             int Symmetry);
+void xh_polint(const double *xa, const double *ya, double x,
+                   double *y, double *dy, int ordn);
+
+void xh_polin3(const double *x1a, const double *x2a, const double *x3a,
+                   const double *ya, double x1, double x2, double x3,
+                   double &y, double *dy, int ordn);
--- a/AMSS_NCKU_source/xh_polint3.C
+++ b/AMSS_NCKU_source/xh_polint3.C
@@ -0,0 +1,258 @@
+#include "xh_po.h"
+/*
+  ex[0..2]  == Fortran ex(1:3)
+  cxB/cxT   == Fortran cxB(1:3), cxT(1:3)  (may be <= 0)
+  SoA[0..2] == Fortran SoA(1:3)
+  f, fpi    == Fortran f(ex1,ex2,ex3) column-major (1-based in formulas)
+  ya        == contiguous memory of ORDN^3 values, matching Fortran ya(cxB1:cxT1, cxB2:cxT2, cxB3:cxT3)
+              Note: an offset mapping is used so that the Fortran i/j/k coordinates can be written directly.
+*/
+
+static inline int imax(int a, int b) { return a > b ? a : b; }  /* larger of a and b */
+static inline int imin(int a, int b) { return a < b ? a : b; }  /* smaller of a and b */
+
+/* f(i,j,k): Fortran column-major, i/j/k are Fortran 1-based in [1..ex] */
+#define F(i,j,k) f[((i)-1) + ex1 * (((j)-1) + ex2 * ((k)-1))]
+
+/*
+  ya(i,j,k): i in [cxB1..cxT1], j in [cxB2..cxT2], k in [cxB3..cxT3]
+  It is mapped onto a C cube with indices 0..ORDN-1:
+    ii = i - cxB1
+    jj = j - cxB2
+    kk = k - cxB3
+  and stored column-major (same as Fortran, so it can be fed directly to polin3)
+*/
+#define YA(i,j,k) ya[((i)-cxB1) + ordn * (((j)-cxB2) + ordn * ((k)-cxB3))]
+
+int xh_decide3d(const int ex[3],   /* copies the cxB..cxT window of f into ya; indices <= 0 read f(2-i) scaled by SoA; returns 1 on a range error, else 0 */
+             const double *f,
+             const double *fpi,   /* unused here; the Fortran version does not use it either */
+             const int cxB[3],
+             const int cxT[3],
+             const double SoA[3],
+             double *ya,
+             int ordn,
+             int Symmetry)         /* Symmetry is not used directly in decide3d either */
+{
+    (void)fpi;
+    (void)Symmetry;
+
+    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
+
+    int fmin1[3], fmin2[3], fmax1[3], fmax2[3];
+    int i, j, k, m;
+
+    int gont = 0;
+
+    /* for use by the YA macro */
+    const int cxB1 = cxB[0], cxB2 = cxB[1], cxB3 = cxB[2];
+
+    for (m = 0; m < 3; m++) {
+        /* The Fortran "NaN check" is essentially meaningless for integers; it is not reproduced here */
+
+        fmin1[m] = imax(1, cxB[m]);
+        fmax1[m] = cxT[m];
+
+        fmin2[m] = cxB[m];
+        fmax2[m] = imin(0, cxT[m]);
+
+        /* if((fmin1<=fmax1) and (fmin1<1 or fmax1>ex)) gont=true */
+        if ((fmin1[m] <= fmax1[m]) && (fmin1[m] < 1 || fmax1[m] > ex[m])) gont = 1;
+
+        /* if((fmin2<=fmax2) and (2-fmax2<1 or 2-fmin2>ex)) gont=true */
+        if ((fmin2[m] <= fmax2[m]) && (2 - fmax2[m] < 1 || 2 - fmin2[m] > ex[m])) gont = 1;
+    }
+
+    if (gont) {
+        printf("error in decide3d\n");
+        printf("cxB: %d %d %d   cxT: %d %d %d   ex: %d %d %d\n",
+               cxB[0], cxB[1], cxB[2], cxT[0], cxT[1], cxT[2], ex[0], ex[1], ex[2]);
+        printf("fmin1: %d %d %d  fmax1: %d %d %d\n",
+               fmin1[0], fmin1[1], fmin1[2], fmax1[0], fmax1[1], fmax1[2]);
+        printf("fmin2: %d %d %d  fmax2: %d %d %d\n",
+               fmin2[0], fmin2[1], fmin2[2], fmax2[0], fmax2[1], fmax2[2]);
+        return 1;
+    }
+
+    /* ---- fill ya: written exactly as the two big loop blocks of the Fortran code ---- */
+
+    /* k in [fmin1(3)..fmax1(3)] */
+    for (k = fmin1[2]; k <= fmax1[2]; k++) {
+
+        /* j in [fmin1(2)..fmax1(2)] */
+        for (j = fmin1[1]; j <= fmax1[1]; j++) {
+
+            /* i in [fmin1(1)..fmax1(1)] : ya(i,j,k)=f(i,j,k) */
+            for (i = fmin1[0]; i <= fmax1[0]; i++) {
+                YA(i, j, k) = F(i, j, k);
+            }
+
+            /* i in [fmin2(1)..fmax2(1)] : ya(i,j,k)=f(2-i,j,k)*SoA(1) */
+            for (i = fmin2[0]; i <= fmax2[0]; i++) {
+                YA(i, j, k) = F(2 - i, j, k) * SoA[0];
+            }
+        }
+
+        /* j in [fmin2(2)..fmax2(2)] */
+        for (j = fmin2[1]; j <= fmax2[1]; j++) {
+
+            /* i in [fmin1(1)..fmax1(1)] : ya(i,j,k)=f(i,2-j,k)*SoA(2) */
+            for (i = fmin1[0]; i <= fmax1[0]; i++) {
+                YA(i, j, k) = F(i, 2 - j, k) * SoA[1];
+            }
+
+            /* i in [fmin2(1)..fmax2(1)] : ya=f(2-i,2-j,k)*SoA(1)*SoA(2) */
+            for (i = fmin2[0]; i <= fmax2[0]; i++) {
+                YA(i, j, k) = F(2 - i, 2 - j, k) * SoA[0] * SoA[1];
+            }
+        }
+    }
+
+    /* k in [fmin2(3)..fmax2(3)] */
+    for (k = fmin2[2]; k <= fmax2[2]; k++) {
+
+        /* j in [fmin1(2)..fmax1(2)] */
+        for (j = fmin1[1]; j <= fmax1[1]; j++) {
+
+            /* i in [fmin1(1)..fmax1(1)] : ya=f(i,j,2-k)*SoA(3) */
+            for (i = fmin1[0]; i <= fmax1[0]; i++) {
+                YA(i, j, k) = F(i, j, 2 - k) * SoA[2];
+            }
+
+            /* i in [fmin2(1)..fmax2(1)] : ya=f(2-i,j,2-k)*SoA(1)*SoA(3) */
+            for (i = fmin2[0]; i <= fmax2[0]; i++) {
+                YA(i, j, k) = F(2 - i, j, 2 - k) * SoA[0] * SoA[2];
+            }
+        }
+
+        /* j in [fmin2(2)..fmax2(2)] */
+        for (j = fmin2[1]; j <= fmax2[1]; j++) {
+
+            /* i in [fmin1(1)..fmax1(1)] : ya=f(i,2-j,2-k)*SoA(2)*SoA(3) */
+            for (i = fmin1[0]; i <= fmax1[0]; i++) {
+                YA(i, j, k) = F(i, 2 - j, 2 - k) * SoA[1] * SoA[2];
+            }
+
+            /* i in [fmin2(1)..fmax2(1)] : ya=f(2-i,2-j,2-k)*SoA1*SoA2*SoA3 */
+            for (i = fmin2[0]; i <= fmax2[0]; i++) {
+                YA(i, j, k) = F(2 - i, 2 - j, 2 - k) * SoA[0] * SoA[1] * SoA[2];
+            }
+        }
+    }
+
+    return 0;
+}
+
+#undef F
+#undef YA
+
+void xh_polint(const double *xa, const double *ya, double x,
+                   double *y, double *dy, int ordn)
+{
+    int i, m, ns, n_m;
+    double dif, dift, hp, h, den_val;
+    /* Neville interpolation through (xa[i], ya[i]), i = 0..ordn-1: *y gets the value at x, *dy the last correction (not written when ordn == 1) */
+    double *c  = (double*)malloc((size_t)ordn * sizeof(double));
+    double *d  = (double*)malloc((size_t)ordn * sizeof(double));
+    double *ho = (double*)malloc((size_t)ordn * sizeof(double));
+    if (!c || !d || !ho) {
+        fprintf(stderr, "polint: malloc failed\n");
+        exit(1);
+    }
+
+    for (i = 0; i < ordn; i++) {
+        c[i]  = ya[i];
+        d[i]  = ya[i];
+        ho[i] = xa[i] - x;
+    }
+
+    ns  = 0;                      // Fortran ns=1 -> C ns=0
+    dif = fabs(x - xa[0]);
+
+    for (i = 1; i < ordn; i++) {  // find the tabulated point closest to x
+        dift = fabs(x - xa[i]);
+        if (dift < dif) {
+            ns  = i;
+            dif = dift;
+        }
+    }
+
+    *y  = ya[ns];
+    ns -= 1;                      // from here on, ns + 1 equals the Fortran ns after its ns=ns-1
+
+    for (m = 1; m <= ordn - 1; m++) {
+        n_m = ordn - m;           // number of active points this round
+        for (i = 0; i < n_m; i++) {
+            hp      = ho[i];
+            h       = ho[i + m];
+            den_val = hp - h;
+
+            if (den_val == 0.0) {  // two identical abscissas: the interpolant is undefined
+                fprintf(stderr, "failure in polint for point %g\n", x);
+                fprintf(stderr, "with input points xa: ");
+                for (int t = 0; t < ordn; t++) fprintf(stderr, "%g ", xa[t]);
+                fprintf(stderr, "\n");
+                exit(1);
+            }
+
+            den_val = (c[i + 1] - d[i]) / den_val;
+            d[i]    = h  * den_val;
+            c[i]    = hp * den_val;
+        }
+
+        // Fortran: if (2*ns < n-m) then dy=c(ns+1) else dy=d(ns); ns=ns-1
+        // ns is one less than the Fortran ns (may be -1), so the test uses ns + 1; with plain ns the last round read stale c[1].
+        if (2 * (ns + 1) < n_m) {
+            *dy = c[ns + 1];
+        } else {
+            *dy = d[ns];
+            ns -= 1;
+        }
+        *y += *dy;
+    }
+
+    free(c);
+    free(d);
+    free(ho);
+}
+
+void xh_polin3(const double *x1a, const double *x2a, const double *x3a,
+                   const double *ya, double x1, double x2, double x3,
+                   double &y, double *dy, int ordn)
+{
+    // ya is ordn x ordn x ordn in Fortran layout (column-major); it is reduced with xh_polint along x1, then x2, then x3
+    #define YA3(i,j,k) ya[(i) + ordn*((j) + ordn*(k))]  // i,j,k: 0..ordn-1
+
+    int j, k;
+    double dy_temp;
+
+    // yatmp(j,k) in Fortran code is ordn x ordn, treat column-major:
+    // yatmp(j,k) -> yatmp[j + ordn*k]
+    double *yatmp = (double*)malloc((size_t)ordn * (size_t)ordn * sizeof(double));
+    double *ymtmp = (double*)malloc((size_t)ordn * sizeof(double));
+    if (!yatmp || !ymtmp) {
+        fprintf(stderr, "polin3: malloc failed\n");
+        exit(1);
+    }
+    #define YAT(j,k) yatmp[(j) + ordn*(k)]
+
+    for (k = 0; k < ordn; k++) {
+        for (j = 0; j < ordn; j++) {
+            // call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp)
+            // ya(:,j,k) is contiguous, so its base address &YA3(0,j,k) is passed directly
+            xh_polint(x1a, &YA3(0, j, k), x1, &YAT(j, k), &dy_temp, ordn);
+        }
+    }
+
+    for (k = 0; k < ordn; k++) {
+        // call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp)
+        xh_polint(x2a, &YAT(0, k), x2, &ymtmp[k], &dy_temp, ordn);
+    }
+
+    xh_polint(x3a, ymtmp, x3, &y, dy, ordn);
+
+    #undef YAT
+    free(yatmp);
+    free(ymtmp);
+    #undef YA3
+}
--- a/AMSS_NCKU_source/xh_share_func.h
+++ b/AMSS_NCKU_source/xh_share_func.h
@@ -0,0 +1,338 @@
+#ifndef SHARE_FUNC_H
+#define SHARE_FUNC_H
+
+#include <stdlib.h>
+#include <stddef.h>
+#include <math.h>
+#include <stdio.h>
+#include <omp.h>
+/* Main grid: 0-based (i0,j0,k0) -> flat 1-D offset; i0 varies fastest, then j0, then k0 */
+static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
+    const int ex1 = ex[0], ex2 = ex[1];  /* ex[2] is not needed to form the offset */
+    return (size_t)i0 + (size_t)j0 * (size_t)ex1 + (size_t)k0 * (size_t)ex1 * (size_t)ex2;
+}
+
+/*
+ * fh corresponds to Fortran: fh(-1:ex1, -1:ex2, -1:ex3)
+ * ord=2 => shift=1
+ * iF/jF/kF are Fortran indices (may be -1, 0, 1..ex); returns the flat offset into fh
+ */
+static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
+    const int shift = 1;
+    const int nx = ex[0] + 2;      // ex1 + ord
+    const int ny = ex[1] + 2;
+
+    const int ii = iF + shift;     // 0..ex1+1
+    const int jj = jF + shift;     // 0..ex2+1
+    const int kk = kF + shift;     // 0..ex3+1
+
+    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
+}
+
+/*
+ * fh corresponds to Fortran: fh(-2:ex1, -2:ex2, -2:ex3)
+ * ord=3 => shift=2
+ * iF/jF/kF are Fortran indices (may be negative); returns the flat offset into fh
+ */
+static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
+    const int shift = 2;                 // ord=3 -> -2..ex
+    const int nx = ex[0] + 3;            // ex1 + ord
+    const int ny = ex[1] + 3;
+
+    const int ii = iF + shift;           // 0..ex1+2
+    const int jj = jF + shift;           // 0..ex2+2
+    const int kk = kF + shift;           // 0..ex3+2
+
+    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
+}
+
+/*
+ * func:  (1..extc1, 1..extc2, 1..extc3)   1-based in Fortran
+ * funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran
+ *
+ * In C we treat them as follows:
+ *   func  is 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1
+ *   funcc is stored as a flat 1-D array addressed with shifted indices:
+ *     iF in [-ord+1..extc1]  -> ii = iF + (ord-1)  in [0..extc1+ord-1]
+ *     total length nx = extc1 + ord
+ *     likewise ny = extc2 + ord, nz = extc3 + ord
+ */
+
+static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
+    const int nx = extc[0], ny = extc[1];  // extc[2] is not needed to form the offset
+    return (size_t)i0 + (size_t)j0 * (size_t)nx + (size_t)k0 * (size_t)nx * (size_t)ny;
+}
+
+static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
+    const int shift = ord - 1;          // iF = -shift .. extc1
+    const int nx = extc[0] + ord;       // [-shift..extc1] holds extc1+ord entries in total
+    const int ny = extc[1] + ord;
+
+    const int ii = iF + shift;          // 0..extc1+shift
+    const int jj = jF + shift;          // 0..extc2+shift
+    const int kk = kF + shift;          // 0..extc3+shift
+
+    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
+}
+
+/*
+ * Equivalent to the Fortran:
+ * funcc(1:extc1,1:extc2,1:extc3)=func
+ * do i=0,ord-1
+ *   funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
+ * enddo
+ * do i=0,ord-1
+ *   funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
+ * enddo
+ * do i=0,ord-1
+ *   funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
+ * enddo
+ */
+static inline void symmetry_bd(int ord,
+                 const int extc[3],
+                 const double *func,
+                 double *funcc,
+                 const double SoA[3])
+{
+    const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
+
+    // 1) funcc(1:extc1,1:extc2,1:extc3) = func
+    // Fortran iF=1..extc1 corresponds to C func index i0=0..extc1-1
+    for (int k0 = 0; k0 < extc3; ++k0) {
+        for (int j0 = 0; j0 < extc2; ++j0) {
+            for (int i0 = 0; i0 < extc1; ++i0) {
+                const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
+                funcc[idx_funcc_F(iF, jF, kF, ord, extc)] = func[idx_func0(i0, j0, k0, extc)];
+            }
+        }
+    }
+
+    // 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
+    for (int ii = 0; ii <= ord - 1; ++ii) {
+        const int iF_dst = -ii;       // 0, -1, -2, ...
+        const int iF_src = ii + 1;    // 1, 2, 3, ...
+        for (int kF = 1; kF <= extc3; ++kF) {
+            for (int jF = 1; jF <= extc2; ++jF) {
+                funcc[idx_funcc_F(iF_dst, jF, kF, ord, extc)] =
+                    funcc[idx_funcc_F(iF_src, jF, kF, ord, extc)] * SoA[0];
+            }
+        }
+    }
+
+    // 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
+    // Note: the Fortran ":" here means iF spans the full range -ord+1..extc1 (includes cells filled in step 2)
+    for (int jj = 0; jj <= ord - 1; ++jj) {
+        const int jF_dst = -jj;
+        const int jF_src = jj + 1;
+        for (int kF = 1; kF <= extc3; ++kF) {
+            for (int iF = -ord + 1; iF <= extc1; ++iF) {
+                funcc[idx_funcc_F(iF, jF_dst, kF, ord, extc)] =
+                    funcc[idx_funcc_F(iF, jF_src, kF, ord, extc)] * SoA[1];
+            }
+        }
+    }
+
+    // 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
+    for (int kk = 0; kk <= ord - 1; ++kk) {
+        const int kF_dst = -kk;
+        const int kF_src = kk + 1;
+        for (int jF = -ord + 1; jF <= extc2; ++jF) {  // full j range, including cells filled in step 3
+            for (int iF = -ord + 1; iF <= extc1; ++iF) {
+                funcc[idx_funcc_F(iF, jF, kF_dst, ord, extc)] =
+                    funcc[idx_funcc_F(iF, jF, kF_src, ord, extc)] * SoA[2];
+            }
+        }
+    }
+}
+
+/* Second derivatives of fh at grid point (i0,j0,k0), written to f**[idx_ex(...)]; fh must use the idx_fh_F_ord2 layout. Kept inside the SHARE_FUNC_H guard so a repeated #include does not redefine it. */
+static inline void fdderivs_xh(
+    int i0, int j0, int k0,
+    const int ex[3],
+    const double *fh,
+    int iminF, int jminF, int kminF,
+    int imaxF, int jmaxF, int kmaxF,
+    double Fdxdx, double Fdydy, double Fdzdz,
+    double Fdxdy, double Fdxdz, double Fdydz,
+    double Sdxdx, double Sdydy, double Sdzdz,
+    double Sdxdy, double Sdxdz, double Sdydz,
+    double *fxx, double *fxy, double *fxz,
+    double *fyy, double *fyz, double *fzz
+){
+    const double F8  = 8.0;
+    const double F16 = 16.0;
+    const double F30 = 30.0;
+    const double TWO = 2.0;
+
+    const int iF = i0 + 1;  /* Fortran (1-based) index of the point; the min/max bounds are in the same indexing */
+    const int jF = j0 + 1;
+    const int kF = k0 + 1;
+
+    const size_t p = idx_ex(i0, j0, k0, ex);  /* output offset in the unpadded arrays */
+
+    /* High-order branch: i±2, j±2, k±2 are all within range (F* prefactors, five-point stencils) */
+    if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
+        (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
+        (kF + 2) <= kmaxF && (kF - 2) >= kminF)
+    {
+        fxx[p] = Fdxdx * (
+            -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
+             F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
+             F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+             fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
+             F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
+        );
+
+        fyy[p] = Fdydy * (
+            -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
+             F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
+             F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+             fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
+             F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
+        );
+
+        fzz[p] = Fdzdz * (
+            -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
+             F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
+             F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+             fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
+             F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
+        );
+
+        /* fxy, high order: x-difference rows t_* combined with the same weights along y */
+        {
+            const double t_jm2 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
+
+            const double t_jm1 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
+
+            const double t_jp1 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
+
+            const double t_jp2 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
+
+            fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
+        }
+
+        /* fxz, high order */
+        {
+            const double t_km2 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
+
+            const double t_km1 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
+
+            const double t_kp1 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
+
+            const double t_kp2 =
+                ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
+                 -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
+
+            fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
+        }
+
+        /* fyz, high order */
+        {
+            const double t_km2 =
+                ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
+                 -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
+
+            const double t_km1 =
+                ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
+                 -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
+
+            const double t_kp1 =
+                ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
+                 -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
+
+            const double t_kp2 =
+                ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
+                 -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
+                 +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
+                 -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
+
+            fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
+        }
+    }
+    /* Second-order branch: i±1, j±1, k±1 are within range (S* prefactors, three-point stencils) */
+    else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
+             (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
+             (kF + 1) <= kmaxF && (kF - 1) >= kminF)
+    {
+        fxx[p] = Sdxdx * (
+            fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
+            TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
+            fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
+        );
+
+        fyy[p] = Sdydy * (
+            fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
+            TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
+            fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
+        );
+
+        fzz[p] = Sdzdz * (
+            fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
+            TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
+            fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
+        );
+
+        fxy[p] = Sdxdy * (
+            fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
+            fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
+            fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
+            fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
+        );
+
+        fxz[p] = Sdxdz * (
+            fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
+            fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
+            fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
+            fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
+        );
+
+        fyz[p] = Sdydz * (
+            fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
+            fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
+            fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
+            fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
+        );
+    }
+    else {  /* not even the ±1 neighbours are in range: all six outputs are set to zero */
+        fxx[p] = 0.0; fyy[p] = 0.0; fzz[p] = 0.0;
+        fxy[p] = 0.0; fxz[p] = 0.0; fyz[p] = 0.0;
+    }
+}
+#endif /* SHARE_FUNC_H */
--- a/AMSS_NCKU_source/xh_tool.h
+++ b/AMSS_NCKU_source/xh_tool.h
@@ -0,0 +1,27 @@
+#include "xh_share_func.h"  /* NOTE(review): this header has no include guard of its own; it only declares prototypes */
+void fdderivs(const int ex[3],  /* presumably second derivatives of f into fxx..fzz (name-based; definition not visible here) */
+              const double *f,
+              double *fxx, double *fxy, double *fxz,
+              double *fyy, double *fyz, double *fzz,
+              const double *X, const double *Y, const double *Z,
+              double SYM1, double SYM2, double SYM3,
+              int Symmetry, int onoff);
+/* presumably first derivatives of f into fx, fy, fz -- TODO confirm against the definition */
+void fderivs(const int ex[3],
+             const double *f,
+             double *fx, double *fy, double *fz,
+             const double *X, const double *Y, const double *Z,
+             double SYM1, double SYM2, double SYM3,
+             int Symmetry, int onoff);
+/* presumably a Kreiss-Oliger dissipation term of strength eps applied to f_rhs -- TODO confirm */
+void kodis(const int ex[3],
+           const double *X, const double *Y, const double *Z,
+           const double *f, double *f_rhs,
+           const double SoA[3],
+           int Symmetry, double eps);
+/* presumably a lopsided advection stencil driven by Sfx/Sfy/Sfz, updating f_rhs -- TODO confirm */
+void lopsided(const int ex[3],
+              const double *X, const double *Y, const double *Z,
+              const double *f, double *f_rhs,
+              const double *Sfx, const double *Sfy, const double *Sfz,
+              int Symmetry, const double SoA[3]);
--- a/BBH_orbit_parameter.py
+++ b/BBH_orbit_parameter.py
--- a/generate_TwoPuncture_input.py
+++ b/generate_TwoPuncture_input.py
@@ -1,195 +1,195 @@
-
-##################################################################
-##
-## Generate input file for the AMSS-NCKU TwoPuncture routine
-## Author: Xiaoqu
-## 2024/11/27
-## Modified: 2025/01/21
-##
-##################################################################
-
-
-import numpy
-import os 
-import AMSS_NCKU_Input as input_data          ## import program input file
-import math
-
-##################################################################
-
-## Import binary black hole coordinates
-
-## If puncture data are set to "Automatically-BBH", compute initial orbital
-## positions and momenta according to the settings and rescale the total
-## binary mass to M = 1 for TwoPuncture input.
-
-if (input_data.puncture_data_set == "Automatically-BBH" ):
-
-    mass_ratio_Q = input_data.parameter_BH[0,0] / input_data.parameter_BH[1,0]
-    
-    if ( mass_ratio_Q < 1.0 ):
-        print( " mass_ratio setting is wrong, please reset!!!" ) 
-        print( " set the first black hole to be the larger mass!!!" ) 
-        
-    BBH_M1 = mass_ratio_Q / ( 1.0 + mass_ratio_Q )
-    BBH_M2 = 1.0          / ( 1.0 + mass_ratio_Q )
-
-    ## Load binary separation and eccentricity
-    distance = input_data.Distance
-    e0       = input_data.e0
-    
-    ## Set binary component coordinates
-    ## Note: place the larger-mass black hole at positive y and the
-    ## smaller-mass black hole at negative y to follow Brugmann's convention
-    ## Coordinate convention for TwoPuncture input (Brugmann):
-    ##  -----0-----> y
-    ##   -      +     
-
-
-    BBH_X1 = 0.0
-    BBH_Y1 = distance * 1.0 / ( 1 + mass_ratio_Q )
-    BBH_Z1 = 0.0
-
-    BBH_X2 = 0.0
-    BBH_Y2 = - distance * mass_ratio_Q / ( 1 + mass_ratio_Q )
-    BBH_Z2 = 0.0
-    
-    position_BH    = numpy.zeros( (2,3) )
-    position_BH[0] = [BBH_X1, BBH_Y1, BBH_Z1]
-    position_BH[1] = [BBH_X2, BBH_Y2, BBH_Z2]
-    
-    ## Optionally load momentum from parameter file
-    ## momentum_BH  = input_data.momentum_BH
-
-    ## Compute orbital momenta using the BBH_orbit_parameter module
-    import BBH_orbit_parameter 
-
-    ## Use the dimensionless spins defined in BBH_orbit_parameter
-    BBH_S1 = BBH_orbit_parameter.S1
-    BBH_S2 = BBH_orbit_parameter.S2
-
-    momentum_BH = numpy.zeros( (2,3) )
-
-    ## Compute initial orbital momenta from post-Newtonian-based routine
-    momentum_BH[0], momentum_BH[1] = BBH_orbit_parameter.generate_BBH_orbit_parameters( BBH_M1, BBH_M2, BBH_S1, BBH_S2, distance, e0 ) 
-
-    ## Set spin angular momentum input for TwoPuncture
-    ## Note: these are dimensional angular momenta (not dimensionless); multiply
-    ## by the square of the mass scale. Here masses are scaled so total M=1.
-    ## angular_momentum_BH = input_data.angular_momentum_BH
-
-    angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) )  
-    
-    for i in range(input_data.puncture_number):
-    
-        if ( input_data.Symmetry == "equatorial-symmetry" ):
-            if i==0:
-                angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M1**2) * input_data.parameter_BH[i,2] ]
-            elif i==1:
-                angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M2**2) * input_data.parameter_BH[i,2] ]
-            else:
-                angular_momentum_BH[i] = [ 0.0, 0.0, (input_data.parameter_BH[i,0]**2) * input_data.parameter_BH[i,2] ]
-                
-        elif ( input_data.Symmetry == "no-symmetry" ):
-        
-            if i==0:
-                angular_momentum_BH[i] = (BBH_M1**2) * input_data.dimensionless_spin_BH[i]
-            elif i==1:
-                angular_momentum_BH[i] = (BBH_M1**2) * input_data.dimensionless_spin_BH[i]
-            else:
-                angular_momentum_BH[i] = (input_data.parameter_BH[i,0]**2) * input_data.dimensionless_spin_BH[i]
-            
-    #######################################################
-
-## If puncture data are set to "Manually", read initial positions and momenta
-## directly from the parameter file. Rescale the total binary mass to M=1
-## for TwoPuncture input.
-
-elif (input_data.puncture_data_set == "Manually" ):
-
-    mass_ratio_Q = input_data.parameter_BH[0,0] / input_data.parameter_BH[1,0]
-    
-    if ( mass_ratio_Q < 1.0 ):
-        print( " mass_ratio setting is wrong, please reset!!!" ) 
-        print( " set the first black hole to be the larger mass!!!" ) 
-        
-    BBH_M1 = mass_ratio_Q / ( 1.0 + mass_ratio_Q )
-    BBH_M2 = 1.0          / ( 1.0 + mass_ratio_Q )
-    
-    parameter_BH = input_data.parameter_BH
-    position_BH  = input_data.position_BH
-    momentum_BH  = input_data.momentum_BH
-    
-    ## Compute binary separation and load eccentricity
-    distance = math.sqrt( (position_BH[0,0]-position_BH[1,0])**2 + (position_BH[0,1]-position_BH[1,1])**2 + (position_BH[0,2]-position_BH[1,2])**2 )
-    e0       = input_data.e0
-
-    ## Set spin angular momentum input for TwoPuncture
-    ## Note: these are dimensional angular momenta (not dimensionless); multiply
-    ## by the square of the mass scale. Here masses are scaled so total M=1.
-
-    ## angular_momentum_BH = input_data.angular_momentum_BH
-
-    angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) )   
-
-        
-    for i in range(input_data.puncture_number):
-    
-        if ( input_data.Symmetry == "equatorial-symmetry" ):
-            if i==0:
-                angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M1**2) * parameter_BH[i,2] ]
-            elif i==1:
-                angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M2**2) * parameter_BH[i,2] ]
-            else:
-                angular_momentum_BH[i] = [ 0.0, 0.0, (parameter_BH[i,0]**2) * parameter_BH[i,2] ]
-                
-        elif ( input_data.Symmetry == "no-symmetry" ):
-            if i==0:
-                angular_momentum_BH[i] = (BBH_M1**2) * input_data.dimensionless_spin_BH[i]
-            elif i==1:
-                angular_momentum_BH[i] = (BBH_M2**2) * input_data.dimensionless_spin_BH[i]
-            else:
-                angular_momentum_BH[i] = (parameter_BH[i,0]**2) * input_data.dimensionless_spin_BH[i]
-
-
-##################################################################
-
-## Write the above binary data into the AMSS-NCKU TwoPuncture input file
-    
-def generate_AMSSNCKU_TwoPuncture_input(): 
-
-    file1 = open( os.path.join(input_data.File_directory, "AMSS-NCKU-TwoPuncture.input"), "w") 
-
-    print( "#  -----0-----> y",                           file=file1 )
-    print( "#   -      +      use Brugmann's convention", file=file1 )
-    print( "ABE::mp        = -1.0",                       file=file1 )   ## use negative values so the code solves for bare masses automatically
-    print( "ABE::mm        = -1.0",                       file=file1 )
-    print( "# b            =  D/2",                       file=file1 )
-    print( "ABE::b         = ", ( distance / 2.0 ),       file=file1 )
-    print( "ABE::P_plusx   = ", momentum_BH[0,0],         file=file1 )
-    print( "ABE::P_plusy   = ", momentum_BH[0,1],         file=file1 )
-    print( "ABE::P_plusz   = ", momentum_BH[0,2],         file=file1 )
-    print( "ABE::P_minusx  = ", momentum_BH[1,0],         file=file1 )
-    print( "ABE::P_minusy  = ", momentum_BH[1,1],         file=file1 )
-    print( "ABE::P_minusz  = ", momentum_BH[1,2],         file=file1 )
-    print( "ABE::S_plusx   = ", angular_momentum_BH[0,0], file=file1 )
-    print( "ABE::S_plusy   = ", angular_momentum_BH[0,1], file=file1 )
-    print( "ABE::S_plusz   = ", angular_momentum_BH[0,2], file=file1 )
-    print( "ABE::S_minusx  = ", angular_momentum_BH[1,0], file=file1 )
-    print( "ABE::S_minusy  = ", angular_momentum_BH[1,1], file=file1 )
-    print( "ABE::S_minusz  = ", angular_momentum_BH[1,2], file=file1 )
-    print( "ABE::Mp        = ", BBH_M1,                   file=file1 )
-    print( "ABE::Mm        = ", BBH_M2,                   file=file1 )
-    print( "ABE::admtol    =  1.e-8",                     file=file1 )
-    print( "ABE::Newtontol =  5.e-12",                    file=file1 )
-    print( "ABE::nA        =  50",                        file=file1 )
-    print( "ABE::nB        =  50",                        file=file1 )
-    print( "ABE::nphi      =  26",                        file=file1 )
-    print( "ABE::Newtonmaxit =  50",                      file=file1 )
-    
-    file1.close()
-
-    return file1
-    
-##################################################################
-    
-    
+
+##################################################################
+##
+## Generate input file for the AMSS-NCKU TwoPuncture routine
+## Author: Xiaoqu
+## 2024/11/27
+## Modified: 2025/01/21
+##
+##################################################################
+
+
+import numpy
+import os 
+import AMSS_NCKU_Input as input_data          ## import program input file
+import math
+
+##################################################################
+
+## Import binary black hole coordinates
+
+## If puncture data are set to "Automatically-BBH", compute initial orbital
+## positions and momenta according to the settings and rescale the total
+## binary mass to M = 1 for TwoPuncture input.
+
+if (input_data.puncture_data_set == "Automatically-BBH" ):
+
+    mass_ratio_Q = input_data.parameter_BH[0,0] / input_data.parameter_BH[1,0]
+    
+    if ( mass_ratio_Q < 1.0 ):
+        print( " mass_ratio setting is wrong, please reset!!!" ) 
+        print( " set the first black hole to be the larger mass!!!" ) 
+        
+    BBH_M1 = mass_ratio_Q / ( 1.0 + mass_ratio_Q )
+    BBH_M2 = 1.0          / ( 1.0 + mass_ratio_Q )
+
+    ## Load binary separation and eccentricity
+    distance = input_data.Distance
+    e0       = input_data.e0
+    
+    ## Set binary component coordinates
+    ## Note: place the larger-mass black hole at positive y and the
+    ## smaller-mass black hole at negative y to follow Brugmann's convention
+    ## Coordinate convention for TwoPuncture input (Brugmann):
+    ##  -----0-----> y
+    ##   -      +     
+
+
+    BBH_X1 = 0.0
+    BBH_Y1 = distance * 1.0 / ( 1 + mass_ratio_Q )
+    BBH_Z1 = 0.0
+
+    BBH_X2 = 0.0
+    BBH_Y2 = - distance * mass_ratio_Q / ( 1 + mass_ratio_Q )
+    BBH_Z2 = 0.0
+    
+    position_BH    = numpy.zeros( (2,3) )
+    position_BH[0] = [BBH_X1, BBH_Y1, BBH_Z1]
+    position_BH[1] = [BBH_X2, BBH_Y2, BBH_Z2]
+    
+    ## Optionally load momentum from parameter file
+    ## momentum_BH  = input_data.momentum_BH
+
+    ## Compute orbital momenta using the BBH_orbit_parameter module
+    import BBH_orbit_parameter 
+
+    ## Use the dimensionless spins defined in BBH_orbit_parameter
+    BBH_S1 = BBH_orbit_parameter.S1
+    BBH_S2 = BBH_orbit_parameter.S2
+
+    momentum_BH = numpy.zeros( (2,3) )
+
+    ## Compute initial orbital momenta from post-Newtonian-based routine
+    momentum_BH[0], momentum_BH[1] = BBH_orbit_parameter.generate_BBH_orbit_parameters( BBH_M1, BBH_M2, BBH_S1, BBH_S2, distance, e0 ) 
+
+    ## Set spin angular momentum input for TwoPuncture
+    ## Note: these are dimensional angular momenta (not dimensionless); multiply
+    ## by the square of the mass scale. Here masses are scaled so total M=1.
+    ## angular_momentum_BH = input_data.angular_momentum_BH
+
+    angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) )  
+    
+    for i in range(input_data.puncture_number):
+    
+        if ( input_data.Symmetry == "equatorial-symmetry" ):
+            if i==0:
+                angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M1**2) * input_data.parameter_BH[i,2] ]
+            elif i==1:
+                angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M2**2) * input_data.parameter_BH[i,2] ]
+            else:
+                angular_momentum_BH[i] = [ 0.0, 0.0, (input_data.parameter_BH[i,0]**2) * input_data.parameter_BH[i,2] ]
+                
+        elif ( input_data.Symmetry == "no-symmetry" ):
+        
+            if i==0:
+                angular_momentum_BH[i] = (BBH_M1**2) * input_data.dimensionless_spin_BH[i]
+            elif i==1:
+                angular_momentum_BH[i] = (BBH_M1**2) * input_data.dimensionless_spin_BH[i]
+            else:
+                angular_momentum_BH[i] = (input_data.parameter_BH[i,0]**2) * input_data.dimensionless_spin_BH[i]
+            
+    #######################################################
+
+## If puncture data are set to "Manually", read initial positions and momenta
+## directly from the parameter file. Rescale the total binary mass to M=1
+## for TwoPuncture input.
+
+elif (input_data.puncture_data_set == "Manually" ):
+
+    mass_ratio_Q = input_data.parameter_BH[0,0] / input_data.parameter_BH[1,0]
+    
+    if ( mass_ratio_Q < 1.0 ):
+        print( " mass_ratio setting is wrong, please reset!!!" ) 
+        print( " set the first black hole to be the larger mass!!!" ) 
+        
+    BBH_M1 = mass_ratio_Q / ( 1.0 + mass_ratio_Q )
+    BBH_M2 = 1.0          / ( 1.0 + mass_ratio_Q )
+    
+    parameter_BH = input_data.parameter_BH
+    position_BH  = input_data.position_BH
+    momentum_BH  = input_data.momentum_BH
+    
+    ## Compute binary separation and load eccentricity
+    distance = math.sqrt( (position_BH[0,0]-position_BH[1,0])**2 + (position_BH[0,1]-position_BH[1,1])**2 + (position_BH[0,2]-position_BH[1,2])**2 )
+    e0       = input_data.e0
+
+    ## Set spin angular momentum input for TwoPuncture
+    ## Note: these are dimensional angular momenta (not dimensionless); multiply
+    ## by the square of the mass scale. Here masses are scaled so total M=1.
+
+    ## angular_momentum_BH = input_data.angular_momentum_BH
+
+    angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) )   
+
+        
+    for i in range(input_data.puncture_number):
+    
+        if ( input_data.Symmetry == "equatorial-symmetry" ):
+            if i==0:
+                angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M1**2) * parameter_BH[i,2] ]
+            elif i==1:
+                angular_momentum_BH[i] = [ 0.0, 0.0, (BBH_M2**2) * parameter_BH[i,2] ]
+            else:
+                angular_momentum_BH[i] = [ 0.0, 0.0, (parameter_BH[i,0]**2) * parameter_BH[i,2] ]
+                
+        elif ( input_data.Symmetry == "no-symmetry" ):
+            if i==0:
+                angular_momentum_BH[i] = (BBH_M1**2) * input_data.dimensionless_spin_BH[i]
+            elif i==1:
+                angular_momentum_BH[i] = (BBH_M2**2) * input_data.dimensionless_spin_BH[i]
+            else:
+                angular_momentum_BH[i] = (parameter_BH[i,0]**2) * input_data.dimensionless_spin_BH[i]
+
+
+##################################################################
+
+## Write the above binary data into the AMSS-NCKU TwoPuncture input file
+    
+def generate_AMSSNCKU_TwoPuncture_input(): 
+
+    file1 = open( os.path.join(input_data.File_directory, "AMSS-NCKU-TwoPuncture.input"), "w") 
+
+    print( "#  -----0-----> y",                           file=file1 )
+    print( "#   -      +      use Brugmann's convention", file=file1 )
+    print( "ABE::mp        = -1.0",                       file=file1 )   ## use negative values so the code solves for bare masses automatically
+    print( "ABE::mm        = -1.0",                       file=file1 )
+    print( "# b            =  D/2",                       file=file1 )
+    print( "ABE::b         = ", ( distance / 2.0 ),       file=file1 )
+    print( "ABE::P_plusx   = ", momentum_BH[0,0],         file=file1 )
+    print( "ABE::P_plusy   = ", momentum_BH[0,1],         file=file1 )
+    print( "ABE::P_plusz   = ", momentum_BH[0,2],         file=file1 )
+    print( "ABE::P_minusx  = ", momentum_BH[1,0],         file=file1 )
+    print( "ABE::P_minusy  = ", momentum_BH[1,1],         file=file1 )
+    print( "ABE::P_minusz  = ", momentum_BH[1,2],         file=file1 )
+    print( "ABE::S_plusx   = ", angular_momentum_BH[0,0], file=file1 )
+    print( "ABE::S_plusy   = ", angular_momentum_BH[0,1], file=file1 )
+    print( "ABE::S_plusz   = ", angular_momentum_BH[0,2], file=file1 )
+    print( "ABE::S_minusx  = ", angular_momentum_BH[1,0], file=file1 )
+    print( "ABE::S_minusy  = ", angular_momentum_BH[1,1], file=file1 )
+    print( "ABE::S_minusz  = ", angular_momentum_BH[1,2], file=file1 )
+    print( "ABE::Mp        = ", BBH_M1,                   file=file1 )
+    print( "ABE::Mm        = ", BBH_M2,                   file=file1 )
+    print( "ABE::admtol    =  1.e-8",                     file=file1 )
+    print( "ABE::Newtontol =  5.e-12",                    file=file1 )
+    print( "ABE::nA        =  50",                        file=file1 )
+    print( "ABE::nB        =  50",                        file=file1 )
+    print( "ABE::nphi      =  26",                        file=file1 )
+    print( "ABE::Newtonmaxit =  50",                      file=file1 )
+    
+    file1.close()
+
+    return file1
+    
+##################################################################
+    
+    
--- a/generate_interp_lb_header.py
+++ b/generate_interp_lb_header.py
@@ -1,72 +0,0 @@
-#!/usr/bin/env python3
-"""Convert interp_lb_profile.bin to a C header for compile-time embedding."""
-import struct, sys
-
-if len(sys.argv) < 3:
-    print(f"Usage: {sys.argv[0]} <profile.bin> <output.h>")
-    sys.exit(1)
-
-with open(sys.argv[1], 'rb') as f:
-    magic, version, nprocs, num_heavy = struct.unpack('IIii', f.read(16))
-    threshold = struct.unpack('d', f.read(8))[0]
-    times = list(struct.unpack(f'{nprocs}d', f.read(nprocs * 8)))
-    heavy = list(struct.unpack(f'{num_heavy}i', f.read(num_heavy * 4)))
-
-# For each heavy rank, compute split: left half -> lighter neighbor, right half -> heavy rank
-# (or vice versa depending on which neighbor is lighter)
-splits = []
-for hr in heavy:
-    prev_t = times[hr - 1] if hr > 0 else 1e30
-    next_t = times[hr + 1] if hr < nprocs - 1 else 1e30
-    if prev_t <= next_t:
-        splits.append((hr, hr - 1, hr))  # (block_id, r_left, r_right)
-    else:
-        splits.append((hr, hr, hr + 1))
-
-# Also remap the displaced neighbor blocks
-remaps = {}
-for hr, r_l, r_r in splits:
-    if r_l != hr:
-        # We took r_l's slot, so remap block r_l to its other neighbor
-        displaced = r_l
-        if displaced > 0 and displaced - 1 not in [s[0] for s in splits]:
-            remaps[displaced] = displaced - 1
-        elif displaced < nprocs - 1:
-            remaps[displaced] = displaced + 1
-    else:
-        displaced = r_r
-        if displaced < nprocs - 1 and displaced + 1 not in [s[0] for s in splits]:
-            remaps[displaced] = displaced + 1
-        elif displaced > 0:
-            remaps[displaced] = displaced - 1
-
-with open(sys.argv[2], 'w') as out:
-    out.write("/* Auto-generated from interp_lb_profile.bin — do not edit */\n")
-    out.write("#ifndef INTERP_LB_PROFILE_DATA_H\n")
-    out.write("#define INTERP_LB_PROFILE_DATA_H\n\n")
-    out.write(f"#define INTERP_LB_NPROCS {nprocs}\n")
-    out.write(f"#define INTERP_LB_NUM_HEAVY {num_heavy}\n\n")
-    out.write(f"static const int interp_lb_heavy_blocks[{num_heavy}] = {{")
-    out.write(", ".join(str(h) for h in heavy))
-    out.write("};\n\n")
-    out.write("/* Split table: {block_id, r_left, r_right} */\n")
-    out.write(f"static const int interp_lb_splits[{num_heavy}][3] = {{\n")
-    for bid, rl, rr in splits:
-        out.write(f"    {{{bid}, {rl}, {rr}}},\n")
-    out.write("};\n\n")
-    out.write("/* Rank remap for displaced neighbor blocks */\n")
-    out.write(f"static const int interp_lb_num_remaps = {len(remaps)};\n")
-    out.write(f"static const int interp_lb_remaps[][2] = {{\n")
-    for src, dst in sorted(remaps.items()):
-        out.write(f"    {{{src}, {dst}}},\n")
-    if not remaps:
-        out.write("    {-1, -1},\n")
-    out.write("};\n\n")
-    out.write("#endif /* INTERP_LB_PROFILE_DATA_H */\n")
-
-print(f"Generated {sys.argv[2]}:")
-print(f"  {num_heavy} heavy blocks to split: {heavy}")
-for bid, rl, rr in splits:
-    print(f"    block {bid}: split -> rank {rl} (left), rank {rr} (right)")
-for src, dst in sorted(remaps.items()):
-    print(f"    block {src}: remap -> rank {dst}")
--- a/generate_macrodef.py
+++ b/generate_macrodef.py
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -1,221 +1,192 @@
-
-##################################################################
-##
-## This file defines the commands used to build and run AMSS-NCKU
-## Author: Xiaoqu
-## 2025/01/24
-##
-##################################################################
-
-
-import AMSS_NCKU_Input as input_data
-import subprocess
-import time
-
-
-def get_last_n_cores_per_socket(n=32):
-    """
-    Read CPU topology via lscpu and return a taskset -c string
-    selecting the last `n` cores of each NUMA node (socket).
-
-    Example: 2 sockets x 56 cores each, n=32 -> node0: 24-55, node1: 80-111
-    -> "taskset -c 24-55,80-111"
-    """
-    result = subprocess.run(["lscpu", "--parse=NODE,CPU"], capture_output=True, text=True)
-
-    # Build a dict: node_id -> sorted list of CPU ids
-    node_cpus = {}
-    for line in result.stdout.splitlines():
-        if line.startswith("#") or not line.strip():
-            continue
-        parts = line.split(",")
-        if len(parts) < 2:
-            continue
-        node_id, cpu_id = int(parts[0]), int(parts[1])
-        node_cpus.setdefault(node_id, []).append(cpu_id)
-
-    segments = []
-    for node_id in sorted(node_cpus):
-        cpus = sorted(node_cpus[node_id])
-        selected = cpus[-n:]          # last n cores of this socket
-        segments.append(f"{selected[0]}-{selected[-1]}")
-
-    cpu_str = ",".join(segments)
-    total = len(segments) * n
-    print(f" CPU binding: taskset -c {cpu_str}  ({total} cores, last {n} per socket)")
-    #return f"taskset -c {cpu_str}"
-    return f""
-
-
-## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total)
-NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
-
-## Build parallelism: match the number of bound cores
-BUILD_JOBS = 64
-
-
-##################################################################
-
-
-
-##################################################################
-
-## Compile the AMSS-NCKU main program ABE
-
-def makefile_ABE():
-
-    print(                                                        )
-    print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) 
-    print(                                                        )
-
-    ## Build command with CPU binding to nohz_full cores
-    if (input_data.GPU_Calculation == "no"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off ABE"
-    elif (input_data.GPU_Calculation == "yes"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
-    else:
-        print( " CPU/GPU numerical calculation setting is wrong " )
-        print(                                                    )
- 
-    ## Execute the command with subprocess.Popen and stream output
-    makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
-
-    ## Read and print output lines as they arrive
-    for line in makefile_process.stdout:
-        print(line, end='')  # stream output in real time
-
-    ## Wait for the process to finish
-    makefile_return_code = makefile_process.wait()
-    if makefile_return_code != 0:
-        raise subprocess.CalledProcessError(makefile_return_code, makefile_command)
-        
-    print(                                                                  )
-    print( " Compilation of the AMSS-NCKU executable file ABE is finished " ) 
-    print(                                                                  )
-    
-    return
-        
-##################################################################
-
-
-
-##################################################################
-
-## Compile the AMSS-NCKU TwoPuncture program TwoPunctureABE
-
-def makefile_TwoPunctureABE():
-
-    print(                                                            )
-    print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
-    print(                                                            )
-    
-    ## Build command with CPU binding to nohz_full cores
-    makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
-
-    ## Execute the command with subprocess.Popen and stream output
-    makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) 
-    
-    ## Read and print output lines as they arrive
-    for line in makefile_process.stdout:
-        print(line, end='')  # stream output in real time
-        
-    ## Wait for the process to finish
-    makefile_return_code = makefile_process.wait()
-    if makefile_return_code != 0:
-        raise subprocess.CalledProcessError(makefile_return_code, makefile_command)
-        
-    print(                                                                             )
-    print( " Compilation of the AMSS-NCKU executable file TwoPunctureABE is finished " )
-    print(                                                                             )
-    
-    return
-    
-##################################################################
-
-
-
-##################################################################
-
-## Run the AMSS-NCKU main program ABE
-
-def run_ABE():
-
-    print(                                                      )
-    print( " Running the AMSS-NCKU executable file ABE/ABEGPU " ) 
-    print(                                                      )
-
-    ## Define the command to run; cast other values to strings as needed
-    
-    if (input_data.GPU_Calculation == "no"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
-        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
-        mpi_command_outfile = "ABE_out.log"
-    elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
-        mpi_command_outfile = "ABEGPU_out.log"
- 
-    ## Execute the MPI command and stream output
-    mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
-
-    ## Write ABE run output to file while printing to stdout
-    with open(mpi_command_outfile, 'w') as file0:  
-        ## Read and print output lines; also write each line to file
-        for line in mpi_process.stdout:
-            print(line, end='')  # stream output in real time
-            file0.write(line)    # write the line to file
-            file0.flush()        # flush to ensure each line is written immediately (optional)            
-    file0.close()
-
-    ## Wait for the process to finish
-    mpi_return_code = mpi_process.wait()
-    
-    print(                                           )
-    print( " The ABE/ABEGPU simulation is finished " ) 
-    print(                                           )
-    
-    return
-
-##################################################################
-
-
-
-##################################################################
-
-## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE
-
-def run_TwoPunctureABE():
-    tp_time1=time.time()
-    print(                                                          )
-    print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) 
-    print(                                                          )
-    
-    ## Define the command to run
-    #TwoPuncture_command         = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
-    TwoPuncture_command         = " ./TwoPunctureABE"
-    TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
-
-    ## Execute the command with subprocess.Popen and stream output
-    TwoPuncture_process = subprocess.Popen(TwoPuncture_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
-
-    ## Write TwoPunctureABE run output to file while printing to stdout
-    with open(TwoPuncture_command_outfile, 'w') as file0:  
-        ## Read and print output lines; also write each line to file
-        for line in TwoPuncture_process.stdout:
-            print(line, end='')  # stream output in real time
-            file0.write(line)    # write the line to file
-            file0.flush()        # flush to ensure each line is written immediately (optional)                 
-    file0.close()
-
-    ## Wait for the process to finish
-    TwoPuncture_command_return_code = TwoPuncture_process.wait()
-    
-    print(                                               )
-    print( " The TwoPunctureABE simulation is finished " ) 
-    print(                                               )
-    tp_time2=time.time()
-    et=tp_time2-tp_time1
-    print(f"Used time: {et}")
-    return
-
-##################################################################
-    
+
+##################################################################
+##
+## This file defines the commands used to build and run AMSS-NCKU
+## Author: Xiaoqu
+## 2025/01/24
+##
+##################################################################
+
+
+import AMSS_NCKU_Input as input_data
+import subprocess
+import time
+## CPU core binding configuration using taskset
+## taskset ensures all child processes inherit the CPU affinity mask
+## This forces make and all compiler processes to use only the cores in the mask below (currently 0-47)
+## Format: taskset -c <cpu-list>; e.g. taskset -c 0-47 ensures processes only run on cores 0-47
+#NUMACTL_CPU_BIND = "taskset -c 0-111"
+NUMACTL_CPU_BIND = "taskset -c 0-47"
+NUMACTL_CPU_BIND2 = "OMP_NUM_THREADS=48 OMP_PROC_BIND=close OMP_PLACES={0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47} taskset -c 0-47"
+#NUMACTL_CPU_BIND2 = "taskset -c 0-1"
+## Build parallelism configuration
+## Compilation is pinned by NUMACTL_CPU_BIND (taskset -c 0-47, i.e. 48 cores)
+## BUILD_JOBS is passed to make -j to run that many build jobs in parallel
+BUILD_JOBS = 32
+
+
+##################################################################
+
+
+##################################################################
+
+## Compile the AMSS-NCKU main program ABE
+
+def makefile_ABE():
+
+    print(                                                        )
+    print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " ) 
+    print(                                                        )
+
+    ## Build command with CPU binding to nohz_full cores
+    if (input_data.GPU_Calculation == "no"):
+        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
+    elif (input_data.GPU_Calculation == "yes"):
+        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
+    else:
+        print( " CPU/GPU numerical calculation setting is wrong " )
+        print(                                                    )
+ 
+    ## Execute the command with subprocess.Popen and stream output
+    makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+
+    ## Read and print output lines as they arrive
+    for line in makefile_process.stdout:
+        print(line, end='')  # stream output in real time
+
+    ## Wait for the process to finish
+    makefile_return_code = makefile_process.wait()
+    if makefile_return_code != 0:
+        raise subprocess.CalledProcessError(makefile_return_code, makefile_command)
+        
+    print(                                                                  )
+    print( " Compilation of the AMSS-NCKU executable file ABE is finished " ) 
+    print(                                                                  )
+    
+    return
+        
+##################################################################
+
+
+
+##################################################################
+
+## Compile the AMSS-NCKU TwoPuncture program TwoPunctureABE
+
+def makefile_TwoPunctureABE():
+
+    print(                                                            )
+    print( " Compiling the AMSS-NCKU executable file TwoPunctureABE " )
+    print(                                                            )
+    
+    ## Build command with CPU binding to nohz_full cores
+    makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
+
+    ## Execute the command with subprocess.Popen and stream output
+    makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) 
+    
+    ## Read and print output lines as they arrive
+    for line in makefile_process.stdout:
+        print(line, end='')  # stream output in real time
+        
+    ## Wait for the process to finish
+    makefile_return_code = makefile_process.wait()
+    if makefile_return_code != 0:
+        raise subprocess.CalledProcessError(makefile_return_code, makefile_command)
+        
+    print(                                                                             )
+    print( " Compilation of the AMSS-NCKU executable file TwoPunctureABE is finished " )
+    print(                                                                             )
+    
+    return
+    
+##################################################################
+
+
+
+##################################################################
+
+## Run the AMSS-NCKU main program ABE
+
+def run_ABE():
+
+    print(                                                      )
+    print( " Running the AMSS-NCKU executable file ABE/ABEGPU " ) 
+    print(                                                      )
+
+    ## Define the command to run; cast other values to strings as needed
+    
+    if (input_data.GPU_Calculation == "no"):
+        #mpi_command         = NUMACTL_CPU_BIND2 + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        mpi_command = """ OMP_NUM_THREADS=48 OMP_PROC_BIND=close OMP_PLACES=cores mpirun -np 1 --cpu-bind=sockets  ./ABE """
+        mpi_command_outfile = "ABE_out.log"
+    elif (input_data.GPU_Calculation == "yes"):
+        mpi_command         = NUMACTL_CPU_BIND2 + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command_outfile = "ABEGPU_out.log"
+ 
+    ## Execute the MPI command and stream output
+    mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+
+    ## Write ABE run output to file while printing to stdout
+    with open(mpi_command_outfile, 'w') as file0:  
+        ## Read and print output lines; also write each line to file
+        for line in mpi_process.stdout:
+            print(line, end='')  # stream output in real time
+            file0.write(line)    # write the line to file
+            file0.flush()        # flush to ensure each line is written immediately (optional)            
+    file0.close()
+
+    ## Wait for the process to finish
+    mpi_return_code = mpi_process.wait()
+    
+    print(                                           )
+    print( " The ABE/ABEGPU simulation is finished " ) 
+    print(                                           )
+    
+    return
+
+##################################################################
+
+
+
+##################################################################
+
+## Run the AMSS-NCKU TwoPuncture program TwoPunctureABE
+
+def run_TwoPunctureABE():
+    tp_time1=time.time()
+    print(                                                          )
+    print( " Running the AMSS-NCKU executable file TwoPunctureABE " ) 
+    print(                                                          )
+    
+    ## Define the command to run
+    #TwoPuncture_command         = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
+    TwoPuncture_command         = " ./TwoPunctureABE"
+    TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
+
+    ## Execute the command with subprocess.Popen and stream output
+    TwoPuncture_process = subprocess.Popen(TwoPuncture_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+
+    ## Write TwoPunctureABE run output to file while printing to stdout
+    with open(TwoPuncture_command_outfile, 'w') as file0:  
+        ## Read and print output lines; also write each line to file
+        for line in TwoPuncture_process.stdout:
+            print(line, end='')  # stream output in real time
+            file0.write(line)    # write the line to file
+            file0.flush()        # flush to ensure each line is written immediately (optional)                 
+    file0.close()
+
+    ## Wait for the process to finish
+    TwoPuncture_command_return_code = TwoPuncture_process.wait()
+    
+    print(                                               )
+    print( " The TwoPunctureABE simulation is finished " ) 
+    print(                                               )
+    tp_time2=time.time()
+    et=tp_time2-tp_time1
+    print(f"Used time: {et}")
+    return
+
+##################################################################
+    
--- a/numerical_grid.py
+++ b/numerical_grid.py
--- a/parallel_plot_helper.py
+++ b/parallel_plot_helper.py
@@ -1,29 +0,0 @@
-import multiprocessing
-
-def run_plot_task(task):
-    """Execute a single plotting task.
-    
-    Parameters
-    ----------
-    task : tuple
-        A tuple of (function, args_tuple) where function is a callable
-        plotting function and args_tuple contains its arguments.
-    """
-    func, args = task
-    return func(*args)
-
-
-def run_plot_tasks_parallel(plot_tasks):
-    """Execute a list of independent plotting tasks in parallel.
-
-    Uses the 'fork' context to create worker processes so that the main
-    script is NOT re-imported/re-executed in child processes.
-
-    Parameters
-    ----------
-    plot_tasks : list of tuples
-        Each element is (function, args_tuple).
-    """
-    ctx = multiprocessing.get_context('fork')
-    with ctx.Pool() as pool:
-        pool.map(run_plot_task, plot_tasks)
--- a/pgo_profile/PGO_Profile_Analysis.md
+++ b/pgo_profile/PGO_Profile_Analysis.md
@@ -0,0 +1,97 @@
+# AMSS-NCKU PGO Profile Analysis Report
+
+## 1. Profiling Environment
+
+| Item | Value |
+|------|-------|
+| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
+| Instrumentation Flag | `-fprofile-instr-generate` |
+| Optimization Level (instrumented) | `-O2 -xHost -fma` |
+| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
+| Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
+| Merged Profile | `default.profdata` (394 KB) |
+| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
+
+## 2. Reduced Simulation Parameters (for profiling run)
+
+| Parameter | Production Value | Profiling Value |
+|-----------|-----------------|-----------------|
+| MPI_processes | 64 | 1 |
+| grid_level | 9 | 4 |
+| static_grid_level | 5 | 3 |
+| static_grid_number | 96 | 24 |
+| moving_grid_number | 48 | 16 |
+| largest_box_xyz_max | 320^3 | 160^3 |
+| Final_Evolution_Time | 1000.0 | 10.0 |
+| Evolution_Step_Number | 10,000,000 | 1,000 |
+| Detector_Number | 12 | 2 |
+
+## 3. Profile Summary
+
+| Metric | Value |
+|--------|-------|
+| Total instrumented functions | 1,392 |
+| Functions with non-zero counts | 117 (8.4%) |
+| Functions with zero counts | 1,275 (91.6%) |
+| Maximum function entry count | 386,459,248 |
+| Maximum internal block count | 370,477,680 |
+| Total block count | 4,198,023,118 |
+
+## 4. Top 20 Hotspot Functions
+
+| Rank | Total Count | Max Block Count | Function | Category |
+|------|------------|-----------------|----------|----------|
+| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
+| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
+| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
+| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
+| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
+| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
+| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
+| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
+| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
+| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
+| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
+| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
+| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
+| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
+| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
+| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
+| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
+| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
+| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
+| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
+
+## 5. Hotspot Category Breakdown
+
+Top 20 functions account for ~98% of total execution counts:
+
+| Category | Functions | Combined Count | Share |
+|----------|-----------|---------------|-------|
+| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,107M | ~50% |
+| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
+| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
+| Time integration | rungekutta4_rout_ | ~119M | ~3% |
+| Dissipation | kodis_ | ~92M | ~2% |
+| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~240M | ~6% |
+
+## 6. Conclusions
+
+1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
+2. **Hotspot concentration is high**: Top 5 functions alone account for ~83% of all counts, which is ideal for PGO — the compiler has strong branch/layout optimization targets.
+3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
+4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
+5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
+
+## 7. PGO Phase 2 Usage
+
+To apply the profile, use the following flags in `makefile.inc`:
+
+```makefile
+CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
+              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
+              -Dfortran3 -Dnewc -I${MKLROOT}/include
+f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
+              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
+              -align array64byte -fpp -I${MKLROOT}/include
+```
--- a/pgo_profile/TwoPunctureABE.profdata
+++ b/pgo_profile/TwoPunctureABE.profdata
--- a/pgo_profile/default.profdata
+++ b/pgo_profile/default.profdata
--- a/pgo_profile/default_9725750769337483397_0.profraw
+++ b/pgo_profile/default_9725750769337483397_0.profraw
--- a/pgo_profile/default_9726853898452064389_0.profdata
+++ b/pgo_profile/default_9726853898452064389_0.profdata
--- a/plot_GW_strain_amplitude_xiaoqu.py
+++ b/plot_GW_strain_amplitude_xiaoqu.py
@@ -11,8 +11,6 @@
 import numpy                               ## numpy for array operations
 import scipy                               ## scipy for interpolation and signal processing
 import math
-import matplotlib
-matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 import os                                  ## os for system/file operations

--- a/plot_binary_data.py
+++ b/plot_binary_data.py
@@ -8,23 +8,16 @@
 ##
 #################################################

-## Restrict OpenMP to one thread per process so that running
-## many workers in parallel does not create an O(workers * BLAS_threads)
-## thread explosion.  The variable MUST be set before numpy/scipy
-## are imported, because the BLAS library reads them only at load time.
-import os
-os.environ.setdefault("OMP_NUM_THREADS",        "1")
-
 import numpy
 import scipy
-import matplotlib
-matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt
 from   matplotlib.colors    import LogNorm
 from   mpl_toolkits.mplot3d import Axes3D
 ## import torch
 import AMSS_NCKU_Input      as input_data

+import os
+

 #########################################################################################

@@ -199,19 +192,3 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):

 ####################################################################################

-
-####################################################################################
-## Allow this module to be run as a standalone script so that each
-## binary-data plot can be executed in a fresh subprocess whose BLAS
-## environment variables (set above) take effect before numpy loads.
-##
-## Usage:  python3 plot_binary_data.py <filename> <binary_outdir> <figure_outdir>
-####################################################################################
-
-if __name__ == '__main__':
-    import sys
-    if len(sys.argv) != 4:
-        print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
-        sys.exit(1)
-    plot_binary_data(sys.argv[1], sys.argv[2], sys.argv[3])
-
--- a/plot_xiaoqu.py
+++ b/plot_xiaoqu.py
@@ -8,8 +8,6 @@
 #################################################

 import numpy                               ## numpy for array operations
-import matplotlib
-matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 from   mpl_toolkits.mplot3d import Axes3D  ## needed for 3D plots
 import glob
@@ -17,9 +15,6 @@ import os                                  ## operating system utilities

 import plot_binary_data
 import AMSS_NCKU_Input as input_data
-import subprocess
-import sys
-import multiprocessing

 # plt.rcParams['text.usetex'] = True  ## enable LaTeX fonts in plots

@@ -55,40 +50,10 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
        file_list.append(x)
        print(x)

-    ## Plot each file in parallel using subprocesses.
-    ## Each subprocess is a fresh Python process where the BLAS thread-count
-    ## environment variables (set at the top of plot_binary_data.py) take
-    ## effect before numpy is imported.  This avoids the thread explosion
-    ## that occurs when multiprocessing.Pool with 'fork' context inherits
-    ## already-initialized multi-threaded BLAS from the parent.
-    script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
-    max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
-
-    running = []
-    failed  = []
+    ## Plot each file in the list
    for filename in file_list:
        print(filename)
-        proc = subprocess.Popen(
-            [sys.executable, script, filename, binary_outdir, figure_outdir],
-        )
-        running.append( (proc, filename) )
-        ## Keep at most max_workers subprocesses active at a time
-        if len(running) >= max_workers:
-            p, fn = running.pop(0)
-            p.wait()
-            if p.returncode != 0:
-                failed.append(fn)
-
-    ## Wait for all remaining subprocesses to finish
-    for p, fn in running:
-        p.wait()
-        if p.returncode != 0:
-            failed.append(fn)
-
-    if failed:
-        print( " WARNING: the following binary data plots failed:" )
-        for fn in failed:
-            print( "   ", fn )
+        plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)

    print(                        )
    print( " Binary Data Plot Has been Finished " )
--- a/renew_puncture_parameter.py
+++ b/renew_puncture_parameter.py
@@ -1,133 +1,133 @@
-
-##################################################################
-##
-## Update puncture parameters from TwoPuncture output
-## Author: Xiaoqu
-## 2024/12/04
-##
-##################################################################
-
-import AMSS_NCKU_Input as input_data
-import numpy
-import os
-
-##################################################################
-
-
-
-##################################################################
-
-def read_TwoPuncture_Output(Output_File_directory):
-
-    dimensionless_mass_BH = numpy.zeros( input_data.puncture_number )
-    bare_mass_BH          = numpy.zeros( input_data.puncture_number )        ## initialize bare mass for each black hole
-    position_BH           = numpy.zeros( (input_data.puncture_number, 3) )   ## initialize initial position for each black hole
-    momentum_BH           = numpy.zeros( (input_data.puncture_number, 3) )   ## initialize momentum for each black hole
-    angular_momentum_BH   = numpy.zeros( (input_data.puncture_number, 3) )   ## initialize spin angular momentum for each black hole
-    
-    # Read TwoPuncture output file
-    data = numpy.loadtxt( os.path.join(Output_File_directory, "puncture_parameters_new.txt") )
-    # Ensure data is parsed as a 1-D array
-    data = data.reshape(-1)
-    
-    for i in range(input_data.puncture_number):
-        
-        ## Read parameters for the first two punctures from TwoPuncture output
-        ## For additional punctures, read parameters from the input file
-        if i<2:
-            bare_mass_BH[i]          = data[12*i]
-            dimensionless_mass_BH[i] = data[12*i+1]
-            position_BH[i]           = [ data[12*i+3], data[12*i+4],  data[12*i+5]  ]
-            momentum_BH[i]           = [ data[12*i+6], data[12*i+7],  data[12*i+8]  ]
-            angular_momentum_BH[i]   = [ data[12*i+9], data[12*i+10], data[12*i+11] ]
-        else:
-            dimensionless_mass_BH[i] = input_data.parameter_BH[i,0]
-            bare_mass_BH[i]          = input_data.parameter_BH[i,0]
-            position_BH[i]           = input_data.position_BH[i]
-            momentum_BH[i]           = input_data.momentum_BH[i]
-            ## Read angular momentum according to symmetry
-            if ( input_data.Symmetry == "equatorial-symmetry" ):
-                angular_momentum_BH[i] = [ 0.0, 0.0, (input_data.parameter_BH[i,0]**2) * input_data.parameter_BH[i,2] ]
-            elif ( input_data.Symmetry == "no-symmetry" ):
-                angular_momentum_BH[i] = (dimensionless_mass_BH[i]**2) * input_data.dimensionless_spin_BH[i]
-    
-    return bare_mass_BH, dimensionless_mass_BH, position_BH, momentum_BH, angular_momentum_BH
-    
-##################################################################
-
-
-##################################################################
-
-## Append the computed puncture information into the AMSS-NCKU input file
-
-def append_AMSSNCKU_BSSN_input(File_directory, TwoPuncture_File_directory): 
-
-    charge_Q_BH = numpy.zeros( input_data.puncture_number )   ## initialize charge for each black hole
-
-    ## If using Ansorg-TwoPuncture to solve the initial-data problem, read
-    ## bare masses, positions and angular momenta from TwoPuncture output
-    if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
-        bare_mass_BH, dimensionless_mass_BH, position_BH, momentum_BH, angular_momentum_BH = read_TwoPuncture_Output(TwoPuncture_File_directory)
-        # set charge for each black hole
-        for i in range(input_data.puncture_number):
-            charge_Q_BH[i] = dimensionless_mass_BH[i] * input_data.parameter_BH[i,1]
-    
-    ## If using another method for initial data, read parameters directly from input
-    else:
-        position_BH = input_data.position_BH
-        momentum_BH = input_data.momentum_BH
-        ## angular_momentum_BH = input_data.angular_momentum_BH
-        angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) )   ## initialize spin angular momentum array
-        mass_BH             = numpy.zeros( input_data.puncture_number      )   ## initialize mass array
-
-        ## Set charge and spin angular momentum for each puncture
-        for i in range(input_data.puncture_number):
-
-            if ( input_data.Symmetry == "octant-symmetry" ):
-                mass_BH[i]             = input_data.parameter_BH[i,0]
-                charge_Q_BH[i]         = mass_BH[i]* input_data.parameter_BH[i,1]
-                angular_momentum_BH[i] = [ 0.0, 0.0, (mass_BH[i]**2) * input_data.parameter_BH[i,2] ]
-            elif ( input_data.Symmetry == "equatorial-symmetry" ):
-                mass_BH[i]             = input_data.parameter_BH[i,0]
-                charge_Q_BH[i]         = mass_BH[i]* input_data.parameter_BH[i,1]
-                angular_momentum_BH[i] = [ 0.0, 0.0, (mass_BH[i]**2) * input_data.parameter_BH[i,2] ]
-            elif ( input_data.Symmetry == "no-symmetry" ):
-                mass_BH[i]             = input_data.parameter_BH[i,0]
-                angular_momentum_BH[i] = (mass_BH[i]**2) * input_data.dimensionless_spin_BH[i]
-                charge_Q_BH[i]         = mass_BH[i]      * input_data.parameter_BH[i,1]
-
-    file1 = open( os.path.join(input_data.File_directory, "AMSS-NCKU.input"), "a")   ## open file in append mode
-
-    ## Output BSSN related settings
-    
-    print(                                                                           file=file1 )
-    print( "BSSN::chitiny  = 1e-5",                                                  file=file1 ) 
-    print( "BSSN::time refinement start from level = ", input_data.refinement_level, file=file1 )
-    print( "BSSN::BH_num   =  ",                        input_data.puncture_number,  file=file1 )
-    
-    for i in range(input_data.puncture_number):
-    
-        if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
-            print( f"BSSN::Mass[{i}]  = { bare_mass_BH[i] } ",      file=file1 )
-        else:
-            print( f"BSSN::Mass[{i}]  = { mass_BH[i] } ",           file=file1 )
-            
-        print( f"BSSN::Qchar[{i}] = { charge_Q_BH[i] } ",           file=file1 )
-        print( f"BSSN::Porgx[{i}] = { position_BH[i,0] } ",         file=file1 )
-        print( f"BSSN::Porgy[{i}] = { position_BH[i,1] } ",         file=file1 )
-        print( f"BSSN::Porgz[{i}] = { position_BH[i,2] } ",         file=file1 )
-        print( f"BSSN::Pmomx[{i}] = { momentum_BH[i,0] } ",         file=file1 )
-        print( f"BSSN::Pmomy[{i}] = { momentum_BH[i,1] } ",         file=file1 )
-        print( f"BSSN::Pmomz[{i}] = { momentum_BH[i,2] } ",         file=file1 )
-        print( f"BSSN::Spinx[{i}] = { angular_momentum_BH[i,0] } ", file=file1 )
-        print( f"BSSN::Spiny[{i}] = { angular_momentum_BH[i,1] } ", file=file1 )
-        print( f"BSSN::Spinz[{i}] = { angular_momentum_BH[i,2] } ", file=file1 )
-            
-    print(                                                          file=file1 )
-    
-    file1.close()
-
-    return
-    
-#################################################
-
+
+##################################################################
+##
+## Update puncture parameters from TwoPuncture output
+## Author: Xiaoqu
+## 2024/12/04
+##
+##################################################################
+
+import AMSS_NCKU_Input as input_data
+import numpy
+import os
+
+##################################################################
+
+
+
+##################################################################
+
+def read_TwoPuncture_Output(Output_File_directory):
+    """Return (bare_mass, dimensionless_mass, position, momentum, angular_momentum) arrays, one entry per puncture."""
+    dimensionless_mass_BH = numpy.zeros( input_data.puncture_number )
+    bare_mass_BH          = numpy.zeros( input_data.puncture_number )        ## initialize bare mass for each black hole
+    position_BH           = numpy.zeros( (input_data.puncture_number, 3) )   ## initialize initial position for each black hole
+    momentum_BH           = numpy.zeros( (input_data.puncture_number, 3) )   ## initialize momentum for each black hole
+    angular_momentum_BH   = numpy.zeros( (input_data.puncture_number, 3) )   ## initialize spin angular momentum for each black hole
+    ## Offsets used below, 12 values per puncture: [0] bare mass, [1] value stored as dimensionless_mass_BH, [2] not read here, [3:6] position, [6:9] momentum, [9:12] angular momentum
+    # Read TwoPuncture output file ("puncture_parameters_new.txt" inside Output_File_directory)
+    data = numpy.loadtxt( os.path.join(Output_File_directory, "puncture_parameters_new.txt") )
+    # Flatten to 1-D so the offsets below work whatever shape loadtxt returned
+    data = data.reshape(-1)
+    ## NOTE(review): there is no length check; a file holding fewer values than the i<2 branch indexes raises IndexError
+    for i in range(input_data.puncture_number):
+        
+        ## Read parameters for the first two punctures from TwoPuncture output
+        ## For additional punctures, read parameters from the input file (bare mass is set equal to parameter_BH[i,0])
+        if i<2:
+            bare_mass_BH[i]          = data[12*i]
+            dimensionless_mass_BH[i] = data[12*i+1]
+            position_BH[i]           = [ data[12*i+3], data[12*i+4],  data[12*i+5]  ]
+            momentum_BH[i]           = [ data[12*i+6], data[12*i+7],  data[12*i+8]  ]
+            angular_momentum_BH[i]   = [ data[12*i+9], data[12*i+10], data[12*i+11] ]
+        else:
+            dimensionless_mass_BH[i] = input_data.parameter_BH[i,0]
+            bare_mass_BH[i]          = input_data.parameter_BH[i,0]
+            position_BH[i]           = input_data.position_BH[i]
+            momentum_BH[i]           = input_data.momentum_BH[i]
+            ## Read angular momentum according to symmetry; any other Symmetry value (e.g. "octant-symmetry") leaves it at zero -- NOTE(review): confirm this is intended
+            if ( input_data.Symmetry == "equatorial-symmetry" ):
+                angular_momentum_BH[i] = [ 0.0, 0.0, (input_data.parameter_BH[i,0]**2) * input_data.parameter_BH[i,2] ]
+            elif ( input_data.Symmetry == "no-symmetry" ):
+                angular_momentum_BH[i] = (dimensionless_mass_BH[i]**2) * input_data.dimensionless_spin_BH[i]
+    
+    return bare_mass_BH, dimensionless_mass_BH, position_BH, momentum_BH, angular_momentum_BH
+    
+##################################################################
+
+
+##################################################################
+
+## Append the computed puncture information into the AMSS-NCKU input file
+
+def append_AMSSNCKU_BSSN_input(File_directory, TwoPuncture_File_directory): 
+    """Append the BSSN puncture settings to the "AMSS-NCKU.input" file.
+
+    Args:
+        File_directory: currently unused -- the file is opened under
+            input_data.File_directory.  NOTE(review): confirm callers always
+            pass that same directory before switching to this argument.
+        TwoPuncture_File_directory: directory handed to
+            read_TwoPuncture_Output(); only used when
+            input_data.Initial_Data_Method is "Ansorg-TwoPuncture".
+    """
+    charge_Q_BH = numpy.zeros( input_data.puncture_number )   ## initialize charge for each black hole
+    use_two_puncture = ( input_data.Initial_Data_Method == "Ansorg-TwoPuncture" )
+
+    ## If using Ansorg-TwoPuncture to solve the initial-data problem, read
+    ## bare masses, positions and angular momenta from TwoPuncture output
+    if use_two_puncture:
+        output_mass_BH, dimensionless_mass_BH, position_BH, momentum_BH, angular_momentum_BH = read_TwoPuncture_Output(TwoPuncture_File_directory)
+        # set charge for each black hole
+        for i in range(input_data.puncture_number):
+            charge_Q_BH[i] = dimensionless_mass_BH[i] * input_data.parameter_BH[i,1]
+
+    ## If using another method for initial data, read parameters directly from input
+    else:
+        position_BH = input_data.position_BH
+        momentum_BH = input_data.momentum_BH
+        angular_momentum_BH = numpy.zeros( (input_data.puncture_number, 3) )   ## initialize spin angular momentum array
+        output_mass_BH      = numpy.zeros( input_data.puncture_number      )   ## initialize mass array
+
+        ## Set mass, charge and spin angular momentum for each puncture.
+        ## An unrecognised Symmetry value leaves all three at zero.
+        for i in range(input_data.puncture_number):
+            if input_data.Symmetry in ( "octant-symmetry", "equatorial-symmetry" ):
+                ## both symmetries use the same formulas; the spin has a z component only
+                output_mass_BH[i]      = input_data.parameter_BH[i,0]
+                charge_Q_BH[i]         = output_mass_BH[i]* input_data.parameter_BH[i,1]
+                angular_momentum_BH[i] = [ 0.0, 0.0, (output_mass_BH[i]**2) * input_data.parameter_BH[i,2] ]
+            elif ( input_data.Symmetry == "no-symmetry" ):
+                output_mass_BH[i]      = input_data.parameter_BH[i,0]
+                angular_momentum_BH[i] = (output_mass_BH[i]**2) * input_data.dimensionless_spin_BH[i]
+                charge_Q_BH[i]         = output_mass_BH[i]      * input_data.parameter_BH[i,1]
+
+    ## "with" guarantees the file is closed even if one of the prints below raises
+    with open( os.path.join(input_data.File_directory, "AMSS-NCKU.input"), "a") as file1:   ## open file in append mode
+
+        ## Output BSSN related settings
+        print(                                                                           file=file1 )
+        print( "BSSN::chitiny  = 1e-5",                                                  file=file1 )
+        print( "BSSN::time refinement start from level = ", input_data.refinement_level, file=file1 )
+        print( "BSSN::BH_num   =  ",                        input_data.puncture_number,  file=file1 )
+
+        for i in range(input_data.puncture_number):
+            ## Mass is the bare mass for Ansorg-TwoPuncture, otherwise parameter_BH[i,0]
+            print( f"BSSN::Mass[{i}]  = { output_mass_BH[i] } ",        file=file1 )
+            print( f"BSSN::Qchar[{i}] = { charge_Q_BH[i] } ",           file=file1 )
+            print( f"BSSN::Porgx[{i}] = { position_BH[i,0] } ",         file=file1 )
+            print( f"BSSN::Porgy[{i}] = { position_BH[i,1] } ",         file=file1 )
+            print( f"BSSN::Porgz[{i}] = { position_BH[i,2] } ",         file=file1 )
+            print( f"BSSN::Pmomx[{i}] = { momentum_BH[i,0] } ",         file=file1 )
+            print( f"BSSN::Pmomy[{i}] = { momentum_BH[i,1] } ",         file=file1 )
+            print( f"BSSN::Pmomz[{i}] = { momentum_BH[i,2] } ",         file=file1 )
+            print( f"BSSN::Spinx[{i}] = { angular_momentum_BH[i,0] } ", file=file1 )
+            print( f"BSSN::Spiny[{i}] = { angular_momentum_BH[i,1] } ", file=file1 )
+            print( f"BSSN::Spinz[{i}] = { angular_momentum_BH[i,2] } ", file=file1 )
+
+        print(                                                          file=file1 )
+
+    return
+    
+#################################################
+