skip redundant MPI ghost cell syncs for stages 0, 1 & 2

BSSN 每个 RK4 时间步执行 4 次 MPI ghost zone 同步： Stage 0（预测）结束后：Parallel::Sync(SynchList_pre) Stage 1（校正 1）结束后：Parallel::Sync(SynchList_cor) Stage 2（校正 2）结束后：Parallel::Sync(SynchList_cor) Stage 3（校正 3）结束后：Parallel::Sync(SynchList_cor) ← 必要（为下一步提供 ghost） bssnEM_class.C、Z4c_class.C 结构相同，一起修改了
Initialize output arrays to zero in fdderivs_c.C and fderivs_c.C
2026-02-26 16:16:33 +08:00 · 2026-02-26 11:48:28 +08:00
38 changed files with 1500 additions and 3525 deletions
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -270,12 +270,6 @@ if not os.path.exists( ABE_file ):
 ## Copy the executable ABE (or ABEGPU) into the run directory
 shutil.copy2(ABE_file, output_directory)

-## Copy interp load balance profile if present (for optimize pass)
-interp_lb_profile = os.path.join(AMSS_NCKU_source_copy, "interp_lb_profile.bin")
-if os.path.exists(interp_lb_profile):
-    shutil.copy2(interp_lb_profile, output_directory)
-    print( " Copied interp_lb_profile.bin to run directory " )
-
 ###########################

 ## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory
--- a/AMSS_NCKU_Verify_ASC26.py
+++ b/AMSS_NCKU_Verify_ASC26.py
@@ -1,13 +1,9 @@
 #!/usr/bin/env python3
 """
-AMSS-NCKU GW150914 Simulation Regression Test Script (Comprehensive Version)
+AMSS-NCKU GW150914 Simulation Regression Test Script

 Verification Requirements:
-1. RMS errors < 1% for:
-   - 3D Vector Total RMS
-   - X Component RMS
-   - Y Component RMS
-   - Z Component RMS
+1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
 2. ADM constraint violation < 2 (Grid Level 0)

 RMS Calculation Method:
@@ -61,62 +57,79 @@ def load_constraint_data(filepath):
                data.append([float(x) for x in parts[:8]])
    return np.array(data)

-def calculate_all_rms_errors(bh_data_ref, bh_data_target):
+
+def calculate_rms_error(bh_data_ref, bh_data_target):
    """
-    Calculate 3D Vector RMS and component-wise RMS (X, Y, Z) independently.
-    Uses r = sqrt(x^2 + y^2) as the denominator for all error normalizations.
-    Returns the maximum error between BH1 and BH2 for each category.
+    Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
+
+    This function computes the RMS error independently for BH1 and BH2 trajectories,
+    then returns the maximum of the two as the final RMS error metric.
+
+    For each black hole, the RMS is calculated as:
+        RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
+
+    where:
+        Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
+        r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
+
+    Args:
+        bh_data_ref: Reference (baseline) trajectory data
+        bh_data_target: Target (optimized) trajectory data
+
+    Returns:
+        rms_value: Final RMS error as a percentage (max of BH1 and BH2)
+        error: Error message if any
    """
+    # Align data: truncate to the length of the shorter dataset
    M = min(len(bh_data_ref['time']), len(bh_data_target['time']))

    if M < 10:
        return None, "Insufficient data points for comparison"

-    results = {}
+    # Extract XY coordinates for both black holes
+    x1_ref = bh_data_ref['x1'][:M]
+    y1_ref = bh_data_ref['y1'][:M]
+    x2_ref = bh_data_ref['x2'][:M]
+    y2_ref = bh_data_ref['y2'][:M]

-    for bh in ['1', '2']:
-        x_r, y_r, z_r = bh_data_ref[f'x{bh}'][:M], bh_data_ref[f'y{bh}'][:M], bh_data_ref[f'z{bh}'][:M]
-        x_n, y_n, z_n = bh_data_target[f'x{bh}'][:M], bh_data_target[f'y{bh}'][:M], bh_data_target[f'z{bh}'][:M]
+    x1_new = bh_data_target['x1'][:M]
+    y1_new = bh_data_target['y1'][:M]
+    x2_new = bh_data_target['x2'][:M]
+    y2_new = bh_data_target['y2'][:M]

-        # 核心修改：根据组委会的邮件指示，分母统一使用 r = sqrt(x^2 + y^2)
-        r_ref = np.sqrt(x_r**2 + y_r**2)
-        r_new = np.sqrt(x_n**2 + y_n**2)
-        denom_max = np.maximum(r_ref, r_new)
+    # Calculate RMS for BH1
+    delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
+    r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
+    r1_new = np.sqrt(x1_new**2 + y1_new**2)
+    r1_max = np.maximum(r1_ref, r1_new)

-        valid = denom_max > 1e-15
-        if np.sum(valid) < 10:
-            results[f'BH{bh}'] = { '3D_Vector': 0.0, 'X_Component': 0.0, 'Y_Component': 0.0, 'Z_Component': 0.0 }
-            continue
+    # Calculate RMS for BH2
+    delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
+    r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
+    r2_new = np.sqrt(x2_new**2 + y2_new**2)
+    r2_max = np.maximum(r2_ref, r2_new)

-        def calc_rms(delta):
-            # 将对应分量的偏差除以统一的轨道半径分母 denom_max
-            return np.sqrt(np.mean((delta[valid] / denom_max[valid])**2)) * 100
+    # Avoid division by zero for BH1
+    valid_mask1 = r1_max > 1e-15
+    if np.sum(valid_mask1) < 10:
+        return None, "Insufficient valid data points for BH1"

-        # 1. Total 3D Vector RMS
-        delta_vec = np.sqrt((x_r - x_n)**2 + (y_r - y_n)**2 + (z_r - z_n)**2)
-        rms_3d = calc_rms(delta_vec)
+    terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
+    rms_bh1 = np.sqrt(np.mean(terms1)) * 100

-        # 2. Component-wise RMS (分离计算各轴，但共用半径分母)
-        rms_x = calc_rms(np.abs(x_r - x_n))
-        rms_y = calc_rms(np.abs(y_r - y_n))
-        rms_z = calc_rms(np.abs(z_r - z_n))
+    # Avoid division by zero for BH2
+    valid_mask2 = r2_max > 1e-15
+    if np.sum(valid_mask2) < 10:
+        return None, "Insufficient valid data points for BH2"

-        results[f'BH{bh}'] = {
-            '3D_Vector': rms_3d,
-            'X_Component': rms_x,
-            'Y_Component': rms_y,
-            'Z_Component': rms_z
-        }
+    terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
+    rms_bh2 = np.sqrt(np.mean(terms2)) * 100

-    # 获取 BH1 和 BH2 中的最大误差
-    max_rms = {
-        '3D_Vector': max(results['BH1']['3D_Vector'], results['BH2']['3D_Vector']),
-        'X_Component': max(results['BH1']['X_Component'], results['BH2']['X_Component']),
-        'Y_Component': max(results['BH1']['Y_Component'], results['BH2']['Y_Component']),
-        'Z_Component': max(results['BH1']['Z_Component'], results['BH2']['Z_Component'])
-    }
+    # Final RMS is the maximum of BH1 and BH2
+    rms_final = max(rms_bh1, rms_bh2)
+
+    return rms_final, None

-    return max_rms, None

 def analyze_constraint_violation(constraint_data, n_levels=9):
    """
@@ -142,32 +155,34 @@ def analyze_constraint_violation(constraint_data, n_levels=9):


 def print_header():
+    """Print report header"""
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
-    print(Color.BOLD + "   AMSS-NCKU GW150914 Comprehensive Regression Test" + Color.RESET)
+    print(Color.BOLD + "   AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)

-def print_rms_results(rms_dict, error, threshold=1.0):
-    print(f"\n{Color.BOLD}1. RMS Error Analysis (Maximums of BH1 & BH2){Color.RESET}")
-    print("-" * 65)
+
+def print_rms_results(rms_rel, error, threshold=1.0):
+    """Print RMS error results"""
+    print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
+    print("-" * 45)

    if error:
        print(f"   {Color.RED}Error: {error}{Color.RESET}")
        return False

-    all_passed = True
-    print(f"   Requirement: < {threshold}%\n")
+    passed = rms_rel < threshold

-    for key, val in rms_dict.items():
-        passed = val < threshold
-        all_passed = all_passed and passed
-        status = get_status_text(passed)
-        print(f"   {key:15}: {val:8.4f}%   |   Status: {status}")
+    print(f"   RMS relative error: {rms_rel:.4f}%")
+    print(f"   Requirement:        < {threshold}%")
+    print(f"   Status:             {get_status_text(passed)}")
+
+    return passed

-    return all_passed

 def print_constraint_results(results, threshold=2.0):
+    """Print constraint violation results"""
    print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
-    print("-" * 65)
+    print("-" * 45)

    names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
    for i, name in enumerate(names):
@@ -185,6 +200,7 @@ def print_constraint_results(results, threshold=2.0):


 def print_summary(rms_passed, constraint_passed):
+    """Print summary"""
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
    print(Color.BOLD + "Verification Summary" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
@@ -194,7 +210,7 @@ def print_summary(rms_passed, constraint_passed):
    res_rms = get_status_text(rms_passed)
    res_con = get_status_text(constraint_passed)

-    print(f"   [1] Comprehensive RMS check:      {res_rms}")
+    print(f"   [1] RMS trajectory check:         {res_rms}")
    print(f"   [2] ADM constraint check:         {res_con}")
    
    final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
@@ -203,48 +219,61 @@ def print_summary(rms_passed, constraint_passed):

    return all_passed

+
 def main():
+    # Determine target (optimized) output directory
    if len(sys.argv) > 1:
        target_dir = sys.argv[1]
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")

+    # Determine reference (baseline) directory
    script_dir = os.path.dirname(os.path.abspath(__file__))
    reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")

+    # Data file paths
    bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
    bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
    constraint_file = os.path.join(target_dir, "bssn_constraint.dat")

+    # Check if files exist
    if not os.path.exists(bh_file_ref):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
        sys.exit(1)
+
    if not os.path.exists(bh_file_target):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
        sys.exit(1)
+
    if not os.path.exists(constraint_file):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
        sys.exit(1)

+    # Print header
    print_header()
    print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
    print(f"{Color.BOLD}Target (Optimized):  {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")

+    # Load data
    bh_data_ref = load_bh_trajectory(bh_file_ref)
    bh_data_target = load_bh_trajectory(bh_file_target)
    constraint_data = load_constraint_data(constraint_file)

-    # Output modified RMS results
-    rms_dict, error = calculate_all_rms_errors(bh_data_ref, bh_data_target)
-    rms_passed = print_rms_results(rms_dict, error)
+    # Calculate RMS error
+    rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
+    rms_passed = print_rms_results(rms_rel, error)

-    # Output constraint results
+    # Analyze constraint violation
    constraint_results = analyze_constraint_violation(constraint_data)
    constraint_passed = print_constraint_results(constraint_results)

+    # Print summary
    all_passed = print_summary(rms_passed, constraint_passed)
+
+    # Return exit code
    sys.exit(0 if all_passed else 1)

+
 if __name__ == "__main__":
    main()
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -7,178 +7,12 @@
 #include <string>
 #include <cmath>
 #include <new>
-#include <vector>
 using namespace std;

 #include "misc.h"
 #include "MPatch.h"
 #include "Parallel.h"
 #include "fmisc.h"
-#ifdef INTERP_LB_PROFILE
-#include "interp_lb_profile.h"
-#endif
-
-namespace
-{
-struct InterpBlockView
-{
-  Block *bp;
-  double llb[dim];
-  double uub[dim];
-};
-
-struct BlockBinIndex
-{
-  int bins[dim];
-  double lo[dim];
-  double inv[dim];
-  vector<InterpBlockView> views;
-  vector<vector<int>> bin_to_blocks;
-  bool valid;
-
-  BlockBinIndex() : valid(false)
-  {
-    for (int i = 0; i < dim; i++)
-    {
-      bins[i] = 1;
-      lo[i] = 0.0;
-      inv[i] = 0.0;
-    }
-  }
-};
-
-inline int clamp_int(int v, int lo, int hi)
-{
-  return (v < lo) ? lo : ((v > hi) ? hi : v);
-}
-
-inline int coord_to_bin(double x, double lo, double inv, int nb)
-{
-  if (nb <= 1 || inv <= 0.0)
-    return 0;
-  int b = int(floor((x - lo) * inv));
-  return clamp_int(b, 0, nb - 1);
-}
-
-inline int bin_loc(const BlockBinIndex &index, int b0, int b1, int b2)
-{
-  return b0 + index.bins[0] * (b1 + index.bins[1] * b2);
-}
-
-inline bool point_in_block_view(const InterpBlockView &view, const double *pox, const double *DH)
-{
-  for (int i = 0; i < dim; i++)
-  {
-    if (pox[i] - view.llb[i] < -DH[i] / 2 || pox[i] - view.uub[i] > DH[i] / 2)
-      return false;
-  }
-  return true;
-}
-
-void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
-{
-  index = BlockBinIndex();
-
-  MyList<Block> *Bp = patch->blb;
-  while (Bp)
-  {
-    Block *BP = Bp->data;
-    InterpBlockView view;
-    view.bp = BP;
-    for (int i = 0; i < dim; i++)
-    {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
-#endif
-      view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
-      view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
-#else
-#ifdef Cell
-      view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
-      view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    }
-    index.views.push_back(view);
-    if (Bp == patch->ble)
-      break;
-    Bp = Bp->next;
-  }
-
-  const int nblocks = int(index.views.size());
-  if (nblocks <= 0)
-    return;
-
-  int bins_1d = int(ceil(pow(double(nblocks), 1.0 / 3.0)));
-  bins_1d = clamp_int(bins_1d, 1, 32);
-  for (int i = 0; i < dim; i++)
-  {
-    index.bins[i] = bins_1d;
-    index.lo[i] = patch->bbox[i] + patch->lli[i] * DH[i];
-    const double hi = patch->bbox[dim + i] - patch->uui[i] * DH[i];
-    if (hi > index.lo[i] && bins_1d > 1)
-      index.inv[i] = bins_1d / (hi - index.lo[i]);
-    else
-      index.inv[i] = 0.0;
-  }
-
-  index.bin_to_blocks.resize(index.bins[0] * index.bins[1] * index.bins[2]);
-
-  for (int bi = 0; bi < nblocks; bi++)
-  {
-    const InterpBlockView &view = index.views[bi];
-    int bmin[dim], bmax[dim];
-    for (int d = 0; d < dim; d++)
-    {
-      const double low = view.llb[d] - DH[d] / 2;
-      const double up = view.uub[d] + DH[d] / 2;
-      bmin[d] = coord_to_bin(low, index.lo[d], index.inv[d], index.bins[d]);
-      bmax[d] = coord_to_bin(up, index.lo[d], index.inv[d], index.bins[d]);
-      if (bmax[d] < bmin[d])
-      {
-        int t = bmin[d];
-        bmin[d] = bmax[d];
-        bmax[d] = t;
-      }
-    }
-
-    for (int bz = bmin[2]; bz <= bmax[2]; bz++)
-      for (int by = bmin[1]; by <= bmax[1]; by++)
-        for (int bx = bmin[0]; bx <= bmax[0]; bx++)
-          index.bin_to_blocks[bin_loc(index, bx, by, bz)].push_back(bi);
-  }
-
-  index.valid = true;
-}
-
-int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
-{
-  if (!index.valid)
-    return -1;
-
-  const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
-  const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
-  const int bz = coord_to_bin(pox[2], index.lo[2], index.inv[2], index.bins[2]);
-  const vector<int> &cand = index.bin_to_blocks[bin_loc(index, bx, by, bz)];
-
-  for (size_t ci = 0; ci < cand.size(); ci++)
-  {
-    const int bi = cand[ci];
-    if (point_in_block_view(index.views[bi], pox, DH))
-      return bi;
-  }
-
-  // Fallback to full scan for numerical edge cases around bin boundaries.
-  for (size_t bi = 0; bi < index.views.size(); bi++)
-    if (point_in_block_view(index.views[bi], pox, DH))
-      return int(bi);
-
-  return -1;
-}
-} // namespace

 Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
 {
@@ -530,11 +364,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;

-  double DH[dim];
+  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
-  BlockBinIndex block_index;
-  build_block_bin_index(this, DH, block_index);

  for (int j = 0; j < NN; j++) // run along points
  {
@@ -557,10 +389,39 @@ void Patch::Interp_Points(MyList<var> *VarList,
      }
    }

-    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    if (block_i >= 0)
+    MyList<Block> *Bp = blb;
+    bool notfind = true;
+    while (notfind && Bp) // run along Blocks
    {
-      Block *BP = block_index.views[block_i].bp;
+      Block *BP = Bp->data;
+
+      bool flag = true;
+      for (int i = 0; i < dim; i++)
+      {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
+        {
+          flag = false;
+          break;
+        }
+      }
+
+      if (flag)
+      {
+        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
@@ -576,6 +437,10 @@ void Patch::Interp_Points(MyList<var> *VarList,
          }
        }
      }
+      if (Bp == ble)
+        break;
+      Bp = Bp->next;
+    }
  }

  // Replace MPI_Allreduce with per-owner MPI_Bcast:
@@ -642,9 +507,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
  // Targeted point-to-point overload: each owner sends each point only to
  // the one rank that needs it for integration (consumer), reducing
  // communication volume by ~nprocs times compared to the Bcast version.
-#ifdef INTERP_LB_PROFILE
-  double t_interp_start = MPI_Wtime();
-#endif
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
@@ -667,11 +529,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;

-  double DH[dim];
+  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
-  BlockBinIndex block_index;
-  build_block_bin_index(this, DH, block_index);

  // --- Interpolation phase (identical to original) ---
  for (int j = 0; j < NN; j++)
@@ -695,10 +555,39 @@ void Patch::Interp_Points(MyList<var> *VarList,
      }
    }

-    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    if (block_i >= 0)
+    MyList<Block> *Bp = blb;
+    bool notfind = true;
+    while (notfind && Bp)
    {
-      Block *BP = block_index.views[block_i].bp;
+      Block *BP = Bp->data;
+
+      bool flag = true;
+      for (int i = 0; i < dim; i++)
+      {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
+        {
+          flag = false;
+          break;
+        }
+      }
+
+      if (flag)
+      {
+        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
@@ -713,12 +602,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
          }
        }
      }
+      if (Bp == ble)
+        break;
+      Bp = Bp->next;
+    }
  }
-
-#ifdef INTERP_LB_PROFILE
-  double t_interp_end = MPI_Wtime();
-  double t_interp_local = t_interp_end - t_interp_start;
-#endif

  // --- Error check for unfound points ---
  for (int j = 0; j < NN; j++)
@@ -876,31 +764,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
  delete[] recv_count;
  delete[] consumer_rank;
  delete[] owner_rank;
-
-#ifdef INTERP_LB_PROFILE
-  {
-    static bool profile_written = false;
-    if (!profile_written) {
-      double *all_times = nullptr;
-      if (myrank == 0) all_times = new double[nprocs];
-      MPI_Gather(&t_interp_local, 1, MPI_DOUBLE,
-                 all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
-      if (myrank == 0) {
-        int heavy[64];
-        int nh = InterpLBProfile::identify_heavy_ranks(
-            all_times, nprocs, 2.5, heavy, 64);
-        InterpLBProfile::write_profile(
-            "interp_lb_profile.bin", nprocs,
-            all_times, heavy, nh, 2.5);
-        printf("[InterpLB] Profile written: %d heavy ranks\n", nh);
-        for (int i = 0; i < nh; i++)
-          printf("  Heavy rank %d: %.6f s\n", heavy[i], all_times[heavy[i]]);
-        delete[] all_times;
-      }
-      profile_written = true;
-    }
-  }
-#endif
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
@@ -934,11 +797,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
  MPI_Comm_group(Comm_here, &local_group);

-  double DH[dim];
+  double DH[dim], llb[dim], uub[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
-  BlockBinIndex block_index;
-  build_block_bin_index(this, DH, block_index);

  for (int j = 0; j < NN; j++) // run along points
  {
@@ -961,10 +822,39 @@ void Patch::Interp_Points(MyList<var> *VarList,
      }
    }

-    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    if (block_i >= 0)
+    MyList<Block> *Bp = blb;
+    bool notfind = true;
+    while (notfind && Bp) // run along Blocks
    {
-      Block *BP = block_index.views[block_i].bp;
+      Block *BP = Bp->data;
+
+      bool flag = true;
+      for (int i = 0; i < dim; i++)
+      {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
+        {
+          flag = false;
+          break;
+        }
+      }
+
+      if (flag)
+      {
+        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
@@ -980,6 +870,10 @@ void Patch::Interp_Points(MyList<var> *VarList,
          }
        }
      }
+      if (Bp == ble)
+        break;
+      Bp = Bp->next;
+    }
  }

  // Collect unique global owner ranks and translate to local ranks in Comm_here
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -32,16 +32,6 @@ namespace Parallel
  int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
  int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
  MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
-  MyList<Block> *distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0);
-  Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
-                           int ib0_orig, int ib3_orig,
-                           int jb1_orig, int jb4_orig,
-                           int kb2_orig, int kb5_orig,
-                           Patch* PP, int r_left, int r_right,
-                           int ingfsi, int fngfsi, bool periodic,
-                           Block* &split_first_block, Block* &split_last_block);
-  Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
-                           int block_id, int ingfsi, int fngfsi, int lev);
  void KillBlocks(MyList<Patch> *PatchLIST);

  void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
@@ -108,9 +98,6 @@ namespace Parallel
    MPI_Status *stats;
    int max_reqs;
    bool lengths_valid;
-    int *tc_req_node;
-    int *tc_req_is_recv;
-    int *tc_completed;
    SyncCache();
    void invalidate();
    void destroy();
@@ -124,10 +111,7 @@ namespace Parallel
  struct AsyncSyncState {
    int req_no;
    bool active;
-    int *req_node;
-    int *req_is_recv;
-    int pending_recv;
-    AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {}
+    AsyncSyncState() : req_no(0), active(false) {}
  };

  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
--- a/AMSS_NCKU_source/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4c_class.C
@@ -485,25 +485,7 @@ void Z4c_class::Step(int lev, int YN)
  }
 #endif

-  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
-
-#ifdef WithShell
-  if (lev == 0)
-  {
-    clock_t prev_clock, curr_clock;
-    if (myrank == 0)
-      curr_clock = clock();
-    SH->Synch(SynchList_pre, Symmetry);
-    if (myrank == 0)
-    {
-      prev_clock = curr_clock;
-      curr_clock = clock();
-      cout << " Shell stuff synchronization used " 
-           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
-           << " seconds! " << endl;
-    }
-  }
-#endif
+  // CA-RK4: skip post-prediction sync (redundant; ghost cells computable locally)

  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -868,6 +850,8 @@ void Z4c_class::Step(int lev, int YN)
    }
 #endif

+    // CA-RK4: only sync after last corrector (iter_count == 3); stages 1 & 2 are redundant
+    if (iter_count == 3) {
      Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);

 #ifdef WithShell
@@ -887,6 +871,7 @@ void Z4c_class::Step(int lev, int YN)
        }
      }
 #endif
+    } // end CA-RK4 guard
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
    {
@@ -1558,7 +1543,7 @@ void Z4c_class::Step(int lev, int YN)
    }
  }

-  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
+  // CA-RK4: skip post-prediction MPI ghost sync (redundant; ghost cells computable locally)

  if (lev == 0)
  {
@@ -2120,6 +2105,8 @@ void Z4c_class::Step(int lev, int YN)
      }
    }

+    // CA-RK4: only MPI sync after last corrector (iter_count == 3); stages 1 & 2 are redundant
+    if (iter_count == 3)
      Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);

    if (lev == 0)
--- a/AMSS_NCKU_source/bssnEM_class.C
+++ b/AMSS_NCKU_source/bssnEM_class.C
@@ -1221,25 +1221,7 @@ void bssnEM_class::Step(int lev, int YN)
  }
 #endif

-  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
-
-#ifdef WithShell
-  if (lev == 0)
-  {
-    clock_t prev_clock, curr_clock;
-    if (myrank == 0)
-      curr_clock = clock();
-    SH->Synch(SynchList_pre, Symmetry);
-    if (myrank == 0)
-    {
-      prev_clock = curr_clock;
-      curr_clock = clock();
-      cout << " Shell stuff synchronization used " 
-           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
-           << " seconds! " << endl;
-    }
-  }
-#endif
+  // CA-RK4: skip post-prediction sync (redundant; ghost cells computable locally)

  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -1683,6 +1665,8 @@ void bssnEM_class::Step(int lev, int YN)
    }
 #endif

+    // CA-RK4: only sync after last corrector (iter_count == 3); stages 1 & 2 are redundant
+    if (iter_count == 3) {
      Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);

 #ifdef WithShell
@@ -1702,6 +1686,7 @@ void bssnEM_class::Step(int lev, int YN)
        }
      }
 #endif
+    } // end CA-RK4 guard
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
    {
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -736,8 +736,6 @@ void bssn_class::Initialize()
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
-  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
-  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
 }

 //================================================================================================
@@ -2215,7 +2213,7 @@ void bssn_class::Evolve(int Steps)
    GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
               SynchList_cor, OldStateList, StateList, SynchList_pre,
               fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
-    for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+    for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif

 #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2431,7 +2429,7 @@ void bssn_class::RecursiveStep(int lev)
  if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 }

@@ -2610,7 +2608,7 @@ void bssn_class::ParallelStep()
  if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 }

@@ -2777,7 +2775,7 @@ void bssn_class::ParallelStep()
        if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
                            SynchList_cor, OldStateList, StateList, SynchList_pre,
                            fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
-        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

        //               a_stream.clear();
        //               a_stream.str("");
@@ -2792,7 +2790,7 @@ void bssn_class::ParallelStep()
      if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                          SynchList_cor, OldStateList, StateList, SynchList_pre,
                          fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

      //               a_stream.clear();
      //               a_stream.str("");
@@ -2811,7 +2809,7 @@ void bssn_class::ParallelStep()
          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

          //               a_stream.clear();
          //               a_stream.str("");
@@ -2827,7 +2825,7 @@ void bssn_class::ParallelStep()
          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
+          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

          //               a_stream.clear();
          //               a_stream.str("");
@@ -3351,27 +3349,7 @@ void bssn_class::Step(int lev, int YN)
  }
 #endif

-  Parallel::AsyncSyncState async_pre;
-  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
-
-#ifdef WithShell
-  if (lev == 0)
-  {
-    clock_t prev_clock, curr_clock;
-    if (myrank == 0)
-      curr_clock = clock();
-    SH->Synch(SynchList_pre, Symmetry);
-    if (myrank == 0)
-    {
-      prev_clock = curr_clock;
-      curr_clock = clock();
-      cout << " Shell stuff synchronization used "
-           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
-           << " seconds! " << endl;
-    }
-  }
-#endif
-  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
+  // CA-RK4: skip post-prediction sync (redundant; ghost cells computable locally)

 #ifdef WithShell
  // Complete non-blocking error reduction and check
@@ -3711,6 +3689,8 @@ void bssn_class::Step(int lev, int YN)
    }
 #endif

+    // CA-RK4: only sync after last corrector (iter_count == 3); stages 1 & 2 are redundant
+    if (iter_count == 3) {
      Parallel::AsyncSyncState async_cor;
      Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);

@@ -3732,6 +3712,7 @@ void bssn_class::Step(int lev, int YN)
      }
 #endif
      Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
+    } // end CA-RK4 guard

 #ifdef WithShell
    // Complete non-blocking error reduction and check
@@ -5748,12 +5729,6 @@ void bssn_class::SHStep()
 // 0: do not use mixing two levels data for OutBD; 1: do use

 #define MIXOUTB 0
-// In the cached Restrict->OutBdLow2Hi path, coarse Sync is usually redundant:
-// OutBdLow2Hi_cached reads coarse owned cells (build_owned_gsl type-4), not coarse ghost/buffer cells.
-// Keep a switch to restore the old behavior if needed for debugging.
-#ifndef RP_SYNC_COARSE_AFTER_RESTRICT
-#define RP_SYNC_COARSE_AFTER_RESTRICT 0
-#endif
 void bssn_class::RestrictProlong(int lev, int YN, bool BB,
                                 MyList<var> *SL, MyList<var> *OL, MyList<var> *corL)
 // we assume
@@ -5804,7 +5779,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif

 #if (RPB == 0)
-      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
+      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -5817,9 +5792,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //       misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
 #endif

-#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
-#endif

 #if (PSTR == 1 || PSTR == 2)
 //       a_stream.clear();
@@ -5830,7 +5803,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #endif
@@ -5857,7 +5830,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif

 #if (RPB == 0)
-      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
+      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
@@ -5870,9 +5843,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //       misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
 #endif

-#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
      Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
-#endif

 #if (PSTR == 1 || PSTR == 2)
 //       a_stream.clear();
@@ -5883,7 +5854,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #endif
@@ -5952,19 +5923,17 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      }

 #if (RPB == 0)
-      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
+      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif

-#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
-#endif

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #endif
@@ -5976,19 +5945,17 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
    else // no time refinement levels and for all same time levels
    {
 #if (RPB == 0)
-      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
+      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
 #endif

-#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
      Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
-#endif

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #endif
@@ -6043,19 +6010,17 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      }

 #if (RPB == 0)
-      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
+      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif

-#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
-#endif

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #endif
@@ -6069,19 +6034,17 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      if (myrank == 0)
        cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
 #if (RPB == 0)
-      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
+      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,StateList,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
 #endif

-#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
      Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
-#endif

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #endif
@@ -6122,7 +6085,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #endif
@@ -6135,7 +6098,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
    {
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #endif
@@ -6154,16 +6117,13 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
 #else
      Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
 #endif
-#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
      Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
-#endif
    }

    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
  }
 }
 #undef MIXOUTB
-#undef RP_SYNC_COARSE_AFTER_RESTRICT

 //================================================================================================

--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -130,8 +130,6 @@ public:
       Parallel::SyncCache *sync_cache_cor;  // per-level cache for corrector sync
       Parallel::SyncCache *sync_cache_rp_coarse;  // RestrictProlong sync on PatL[lev-1]
       Parallel::SyncCache *sync_cache_rp_fine;    // RestrictProlong sync on PatL[lev]
-       Parallel::SyncCache *sync_cache_restrict;   // cached Restrict in RestrictProlong
-       Parallel::SyncCache *sync_cache_outbd;      // cached OutBdLow2Hi in RestrictProlong

       monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
       monitor *ConVMonitor;
--- a/AMSS_NCKU_source/bssn_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_rhs_c.C
@@ -39,6 +39,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
    // printf("nx=%d ny=%d nz=%d all=%d\n", nx, ny, nz, all);

    // temp variable
+    double gxx[all],gyy[all],gzz[all];
    double chix[all],chiy[all],chiz[all];
    double gxxx[all],gxyx[all],gxzx[all],gyyx[all],gyzx[all],gzzx[all];
    double gxxy[all],gxyy[all],gxzy[all],gyyy[all],gyzy[all],gzzy[all];
@@ -50,9 +51,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
    double Gamxx[all],Gamxy[all],Gamxz[all];
    double Gamyx[all],Gamyy[all],Gamyz[all];
    double Gamzx[all],Gamzy[all],Gamzz[all];
-    double Kx[all], Ky[all], Kz[all], S[all];
+    double Kx[all], Ky[all], Kz[all], div_beta[all], S[all];
    double f[all], fxx[all], fxy[all], fxz[all], fyy[all], fyz[all], fzz[all];
-    double alpn1[all], chin1[all];
+    double Gamxa[all], Gamya[all], Gamza[all], alpn1[all], chin1[all];
    double gupxx[all], gupxy[all], gupxz[all];
    double gupyy[all], gupyz[all], gupzz[all];
    double SSS[3] = { 1.0,  1.0,  1.0};
@@ -69,34 +70,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
    const double FF = 0.75, eta = 2.0;
    const double F1o3 = 1.0/3.0, F2o3 = 2.0/3.0, F3o2 = 1.5, F1o6 = 1.0/6.0;
    const double F16 = 16.0, F8 = 8.0;
-    #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
+    #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
    double reta[all];
    /* 使用时：reta[idx]，其中 idx = i + nx*(j + ny*k)  (Fortran列主序) */
    #endif
-
-    #if (GAUGE == 6 || GAUGE == 7)
-    int BHN = 0;
-    double Porg[9] = {0.0};
-    double Mass[3] = {0.0};
-
-    extern "C" {
-        #ifdef fortran1
-        void getpbh(int &, double *, double *);
-        #elif defined(fortran2)
-        void GETPBH(int &, double *, double *);
-        #else
-        void getpbh_(int &, double *, double *);
-        #endif
-    }
-
-    #ifdef fortran1
-    getpbh(BHN, Porg, Mass);
-    #elif defined(fortran2)
-    GETPBH(BHN, Porg, Mass);
-    #else
-    getpbh_(BHN, Porg, Mass);
-    #endif
-    #endif
    PI = acos(-1.0); 
    dX = X[1] - X[0];
    dY = Y[1] - Y[0];
@@ -106,6 +83,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
        for(int i=0;i<all;i+=1){
            alpn1[i] = Lap[i] + 1.0;
            chin1[i] = chi[i] + 1.0;
+            gxx[i] = dxx[i] + 1.0;
+            gyy[i] = dyy[i] + 1.0;
+            gzz[i] = dzz[i] + 1.0;
        }
        // 9ms //
        fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev);
@@ -123,196 +103,231 @@ int f_compute_rhs_bssn(int *ex, double &T,

        // 3ms //
        for(int i=0;i<all;i+=1){
-            const double divb = betaxx[i] + betayy[i] + betazz[i];
-            chi_rhs[i] = F2o3 * chin1[i] * (alpn1[i] * trK[i] - divb); 
-            gxx_rhs[i] = -TWO * alpn1[i] * Axx[i] - F2o3 * (dxx[i] + ONE) * divb + 
-                    TWO * ((dxx[i] + ONE) * betaxx[i] + gxy[i] * betayx[i] + gxz[i] * betazx[i]);
-            gyy_rhs[i] = -TWO * alpn1[i] * Ayy[i] - F2o3 * (dyy[i] + ONE) * divb + 
-                    TWO * (gxy[i] * betaxy[i] + (dyy[i] + ONE) * betayy[i] + gyz[i] * betazy[i]);
-            gzz_rhs[i] = -TWO * alpn1[i] * Azz[i] - F2o3 * (dzz[i] + ONE) * divb + 
-                        TWO * (gxz[i] * betaxz[i] + gyz[i] * betayz[i] + (dzz[i] + ONE) * betazz[i]);
-            gxy_rhs[i] = -TWO * alpn1[i] * Axy[i] + F1o3 * gxy[i] * divb + 
-                        (dxx[i] + ONE) * betaxy[i] + gxz[i] * betazy[i] + (dyy[i] + ONE) * betayx[i] 
+            div_beta[i] = betaxx[i] + betayy[i] + betazz[i];
+            chi_rhs[i] = F2o3 * chin1[i] * (alpn1[i] * trK[i] - div_beta[i]); 
+            gxx_rhs[i] = -TWO * alpn1[i] * Axx[i] - F2o3 * gxx[i] * div_beta[i] + 
+                    TWO * (gxx[i] * betaxx[i] + gxy[i] * betayx[i] + gxz[i] * betazx[i]);
+            gyy_rhs[i] = -TWO * alpn1[i] * Ayy[i] - F2o3 * gyy[i] * div_beta[i] + 
+                    TWO * (gxy[i] * betaxy[i] + gyy[i] * betayy[i] + gyz[i] * betazy[i]);
+            gzz_rhs[i] = -TWO * alpn1[i] * Azz[i] - F2o3 * gzz[i] * div_beta[i] + 
+                        TWO * (gxz[i] * betaxz[i] + gyz[i] * betayz[i] + gzz[i] * betazz[i]);
+            gxy_rhs[i] = -TWO * alpn1[i] * Axy[i] + F1o3 * gxy[i] * div_beta[i] + 
+                        gxx[i] * betaxy[i] + gxz[i] * betazy[i] + gyy[i] * betayx[i] 
                        + gyz[i] * betazx[i] - gxy[i] * betazz[i];        
-            gyz_rhs[i] = -TWO * alpn1[i] * Ayz[i] + F1o3 * gyz[i] * divb + 
-                        gxy[i] * betaxz[i] + (dyy[i] + ONE) * betayz[i] + gxz[i] * betaxy[i] 
-                        + (dzz[i] + ONE) * betazy[i] - gyz[i] * betaxx[i];
-            gxz_rhs[i] = -TWO * alpn1[i] * Axz[i] + F1o3 * gxz[i] * divb + 
-                        (dxx[i] + ONE) * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i] 
-                        + (dzz[i] + ONE) * betazx[i] - gxz[i] * betayy[i];                  
+            gyz_rhs[i] = -TWO * alpn1[i] * Ayz[i] + F1o3 * gyz[i] * div_beta[i] + 
+                        gxy[i] * betaxz[i] + gyy[i] * betayz[i] + gxz[i] * betaxy[i] 
+                        + gzz[i] * betazy[i] - gyz[i] * betaxx[i];
+            gxz_rhs[i] = -TWO * alpn1[i] * Axz[i] + F1o3 * gxz[i] * div_beta[i] + 
+                        gxx[i] * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i] 
+                        + gzz[i] * betazx[i] - gxz[i] * betayy[i];                  
        }
-        // Fused: inverse metric + Gamma constraint + Christoffel (3 loops -> 1)
+        // 1ms //
        for(int i=0;i<all;i+=1){
-            double det = (dxx[i] + ONE) * (dyy[i] + ONE) * (dzz[i] + ONE) + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] -
-                        gxz[i] * (dyy[i] + ONE) * gxz[i] - gxy[i] * gxy[i] * (dzz[i] + ONE) - (dxx[i] + ONE) * gyz[i] * gyz[i];
-            double lg_xx = ((dyy[i] + ONE) * (dzz[i] + ONE) - gyz[i] * gyz[i]) / det;
-            double lg_xy = -(gxy[i] * (dzz[i] + ONE) - gyz[i] * gxz[i]) / det;
-            double lg_xz = (gxy[i] * gyz[i] - (dyy[i] + ONE) * gxz[i]) / det;
-            double lg_yy = ((dxx[i] + ONE) * (dzz[i] + ONE) - gxz[i] * gxz[i]) / det;
-            double lg_yz = -((dxx[i] + ONE) * gyz[i] - gxy[i] * gxz[i]) / det;
-            double lg_zz = ((dxx[i] + ONE) * (dyy[i] + ONE) - gxy[i] * gxy[i]) / det;
-            gupxx[i] = lg_xx; gupxy[i] = lg_xy; gupxz[i] = lg_xz;
-            gupyy[i] = lg_yy; gupyz[i] = lg_yz; gupzz[i] = lg_zz;
-
+            double det = gxx[i] * gyy[i] * gzz[i] + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] - 
+                        gxz[i] * gyy[i] * gxz[i] - gxy[i] * gxy[i] * gzz[i] - gxx[i] * gyz[i] * gyz[i];  
+            gupxx[i] = (gyy[i] * gzz[i] - gyz[i] * gyz[i]) / det;
+            gupxy[i] = -(gxy[i] * gzz[i] - gyz[i] * gxz[i]) / det;
+            gupxz[i] = (gxy[i] * gyz[i] - gyy[i] * gxz[i]) / det;
+            gupyy[i] = (gxx[i] * gzz[i] - gxz[i] * gxz[i]) / det;
+            gupyz[i] = -(gxx[i] * gyz[i] - gxy[i] * gxz[i]) / det;
+            gupzz[i] = (gxx[i] * gyy[i] - gxy[i] * gxy[i]) / det;   
+        }
+        // 2.2ms //
        if(co==0){
-                Gmx_Res[i] = Gamx[i] - (
-                    lg_xx * (lg_xx*gxxx[i] + lg_xy*gxyx[i] + lg_xz*gxzx[i]) +
-                    lg_xy * (lg_xx*gxyx[i] + lg_xy*gyyx[i] + lg_xz*gyzx[i]) +
-                    lg_xz * (lg_xx*gxzx[i] + lg_xy*gyzx[i] + lg_xz*gzzx[i]) +
-                    lg_xx * (lg_xy*gxxy[i] + lg_yy*gxyy[i] + lg_yz*gxzy[i]) +
-                    lg_xy * (lg_xy*gxyy[i] + lg_yy*gyyy[i] + lg_yz*gyzy[i]) +
-                    lg_xz * (lg_xy*gxzy[i] + lg_yy*gyzy[i] + lg_yz*gzzy[i]) +
-                    lg_xx * (lg_xz*gxxz[i] + lg_yz*gxyz[i] + lg_zz*gxzz[i]) +
-                    lg_xy * (lg_xz*gxyz[i] + lg_yz*gyyz[i] + lg_zz*gyzz[i]) +
-                    lg_xz * (lg_xz*gxzz[i] + lg_yz*gyzz[i] + lg_zz*gzzz[i])
-                );
-                Gmy_Res[i] = Gamy[i] - (
-                    lg_xx * (lg_xy*gxxx[i] + lg_yy*gxyx[i] + lg_yz*gxzx[i]) +
-                    lg_xy * (lg_xy*gxyx[i] + lg_yy*gyyx[i] + lg_yz*gyzx[i]) +
-                    lg_xz * (lg_xy*gxzx[i] + lg_yy*gyzx[i] + lg_yz*gzzx[i]) +
-                    lg_xy * (lg_xy*gxxy[i] + lg_yy*gxyy[i] + lg_yz*gxzy[i]) +
-                    lg_yy * (lg_xy*gxyy[i] + lg_yy*gyyy[i] + lg_yz*gyzy[i]) +
-                    lg_yz * (lg_xy*gxzy[i] + lg_yy*gyzy[i] + lg_yz*gzzy[i]) +
-                    lg_xy * (lg_xz*gxxz[i] + lg_yz*gxyz[i] + lg_zz*gxzz[i]) +
-                    lg_yy * (lg_xz*gxyz[i] + lg_yz*gyyz[i] + lg_zz*gyzz[i]) +
-                    lg_yz * (lg_xz*gxzz[i] + lg_yz*gyzz[i] + lg_zz*gzzz[i])
-                );
-                Gmz_Res[i] = Gamz[i] - (
-                    lg_xx * (lg_xz*gxxx[i] + lg_yz*gxyx[i] + lg_zz*gxzx[i]) +
-                    lg_xy * (lg_xz*gxyx[i] + lg_yz*gyyx[i] + lg_zz*gyzx[i]) +
-                    lg_xz * (lg_xz*gxzx[i] + lg_yz*gyzx[i] + lg_zz*gzzx[i]) +
-                    lg_xy * (lg_xz*gxxy[i] + lg_yz*gxyy[i] + lg_zz*gxzy[i]) +
-                    lg_yy * (lg_xz*gxyy[i] + lg_yz*gyyy[i] + lg_zz*gyzy[i]) +
-                    lg_yz * (lg_xz*gxzy[i] + lg_yz*gyzy[i] + lg_zz*gzzy[i]) +
-                    lg_xz * (lg_xz*gxxz[i] + lg_yz*gxyz[i] + lg_zz*gxzz[i]) +
-                    lg_yz * (lg_xz*gxyz[i] + lg_yz*gyyz[i] + lg_zz*gyzz[i]) +
-                    lg_zz * (lg_xz*gxzz[i] + lg_yz*gyzz[i] + lg_zz*gzzz[i])
-                );
-            }
-
-            Gamxxx[i] = HALF * ( lg_xx*gxxx[i]
-                            + lg_xy*(TWO*gxyx[i] - gxxy[i])
-                            + lg_xz*(TWO*gxzx[i] - gxxz[i]) );
-            Gamyxx[i] = HALF * ( lg_xy*gxxx[i]
-                            + lg_yy*(TWO*gxyx[i] - gxxy[i])
-                            + lg_yz*(TWO*gxzx[i] - gxxz[i]) );
-            Gamzxx[i] = HALF * ( lg_xz*gxxx[i]
-                            + lg_yz*(TWO*gxyx[i] - gxxy[i])
-                            + lg_zz*(TWO*gxzx[i] - gxxz[i]) );
-            Gamxyy[i] = HALF * ( lg_xx*(TWO*gxyy[i] - gyyx[i])
-                            + lg_xy*gyyy[i]
-                            + lg_xz*(TWO*gyzy[i] - gyyz[i]) );
-            Gamyyy[i] = HALF * ( lg_xy*(TWO*gxyy[i] - gyyx[i])
-                            + lg_yy*gyyy[i]
-                            + lg_yz*(TWO*gyzy[i] - gyyz[i]) );
-            Gamzyy[i] = HALF * ( lg_xz*(TWO*gxyy[i] - gyyx[i])
-                            + lg_yz*gyyy[i]
-                            + lg_zz*(TWO*gyzy[i] - gyyz[i]) );
-            Gamxzz[i] = HALF * ( lg_xx*(TWO*gxzz[i] - gzzx[i])
-                            + lg_xy*(TWO*gyzz[i] - gzzy[i])
-                            + lg_xz*gzzz[i] );
-            Gamyzz[i] = HALF * ( lg_xy*(TWO*gxzz[i] - gzzx[i])
-                            + lg_yy*(TWO*gyzz[i] - gzzy[i])
-                            + lg_yz*gzzz[i] );
-            Gamzzz[i] = HALF * ( lg_xz*(TWO*gxzz[i] - gzzx[i])
-                            + lg_yz*(TWO*gyzz[i] - gzzy[i])
-                            + lg_zz*gzzz[i] );
-            Gamxxy[i] = HALF * ( lg_xx*gxxy[i]
-                            + lg_xy*gyyx[i]
-                            + lg_xz*(gxzy[i] + gyzx[i] - gxyz[i]) );
-            Gamyxy[i] = HALF * ( lg_xy*gxxy[i]
-                            + lg_yy*gyyx[i]
-                            + lg_yz*(gxzy[i] + gyzx[i] - gxyz[i]) );
-            Gamzxy[i] = HALF * ( lg_xz*gxxy[i]
-                            + lg_yz*gyyx[i]
-                            + lg_zz*(gxzy[i] + gyzx[i] - gxyz[i]) );
-            Gamxxz[i] = HALF * ( lg_xx*gxxz[i]
-                            + lg_xy*(gxyz[i] + gyzx[i] - gxzy[i])
-                            + lg_xz*gzzx[i] );
-            Gamyxz[i] = HALF * ( lg_xy*gxxz[i]
-                            + lg_yy*(gxyz[i] + gyzx[i] - gxzy[i])
-                            + lg_yz*gzzx[i] );
-            Gamzxz[i] = HALF * ( lg_xz*gxxz[i]
-                            + lg_yz*(gxyz[i] + gyzx[i] - gxzy[i])
-                            + lg_zz*gzzx[i] );
-            Gamxyz[i] = HALF * ( lg_xx*(gxyz[i] + gxzy[i] - gyzx[i])
-                            + lg_xy*gyyz[i]
-                            + lg_xz*gzzy[i] );
-            Gamyyz[i] = HALF * ( lg_xy*(gxyz[i] + gxzy[i] - gyzx[i])
-                            + lg_yy*gyyz[i]
-                            + lg_yz*gzzy[i] );
-            Gamzyz[i] = HALF * ( lg_xz*(gxyz[i] + gxzy[i] - gyzx[i])
-                            + lg_yz*gyyz[i]
-                            + lg_zz*gzzy[i] );
-        }
-        // Fused: A^{ij} raise-index + Gamma_rhs part 1 (2 loops -> 1)
            for (int i=0;i<all;i+=1) {
-            double axx = gupxx[i]*gupxx[i]*Axx[i]
+                Gmx_Res[i] = Gamx[i] - (
+                    gupxx[i] * (gupxx[i]*gxxx[i] + gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i]) +
+                    gupxy[i] * (gupxx[i]*gxyx[i] + gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i]) +
+                    gupxz[i] * (gupxx[i]*gxzx[i] + gupxy[i]*gyzx[i] + gupxz[i]*gzzx[i]) +
+
+                    gupxx[i] * (gupxy[i]*gxxy[i] + gupyy[i]*gxyy[i] + gupyz[i]*gxzy[i]) +
+                    gupxy[i] * (gupxy[i]*gxyy[i] + gupyy[i]*gyyy[i] + gupyz[i]*gyzy[i]) +
+                    gupxz[i] * (gupxy[i]*gxzy[i] + gupyy[i]*gyzy[i] + gupyz[i]*gzzy[i]) +
+
+                    gupxx[i] * (gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i] + gupzz[i]*gxzz[i]) +
+                    gupxy[i] * (gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i] + gupzz[i]*gyzz[i]) +
+                    gupxz[i] * (gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i] + gupzz[i]*gzzz[i])
+                );
+
+                Gmy_Res[i] = Gamy[i] - (
+                    gupxx[i] * (gupxy[i]*gxxx[i] + gupyy[i]*gxyx[i] + gupyz[i]*gxzx[i]) +
+                    gupxy[i] * (gupxy[i]*gxyx[i] + gupyy[i]*gyyx[i] + gupyz[i]*gyzx[i]) +
+                    gupxz[i] * (gupxy[i]*gxzx[i] + gupyy[i]*gyzx[i] + gupyz[i]*gzzx[i]) +
+
+                    gupxy[i] * (gupxy[i]*gxxy[i] + gupyy[i]*gxyy[i] + gupyz[i]*gxzy[i]) +
+                    gupyy[i] * (gupxy[i]*gxyy[i] + gupyy[i]*gyyy[i] + gupyz[i]*gyzy[i]) +
+                    gupyz[i] * (gupxy[i]*gxzy[i] + gupyy[i]*gyzy[i] + gupyz[i]*gzzy[i]) +
+
+                    gupxy[i] * (gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i] + gupzz[i]*gxzz[i]) +
+                    gupyy[i] * (gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i] + gupzz[i]*gyzz[i]) +
+                    gupyz[i] * (gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i] + gupzz[i]*gzzz[i])
+                );
+
+                Gmz_Res[i] = Gamz[i] - (
+                    gupxx[i] * (gupxz[i]*gxxx[i] + gupyz[i]*gxyx[i] + gupzz[i]*gxzx[i]) +
+                    gupxy[i] * (gupxz[i]*gxyx[i] + gupyz[i]*gyyx[i] + gupzz[i]*gyzx[i]) +
+                    gupxz[i] * (gupxz[i]*gxzx[i] + gupyz[i]*gyzx[i] + gupzz[i]*gzzx[i]) +
+
+                    gupxy[i] * (gupxz[i]*gxxy[i] + gupyz[i]*gxyy[i] + gupzz[i]*gxzy[i]) +
+                    gupyy[i] * (gupxz[i]*gxyy[i] + gupyz[i]*gyyy[i] + gupzz[i]*gyzy[i]) +
+                    gupyz[i] * (gupxz[i]*gxzy[i] + gupyz[i]*gyzy[i] + gupzz[i]*gzzy[i]) +
+
+                    gupxz[i] * (gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i] + gupzz[i]*gxzz[i]) +
+                    gupyz[i] * (gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i] + gupzz[i]*gyzz[i]) +
+                    gupzz[i] * (gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i] + gupzz[i]*gzzz[i])
+                );
+            }
+        }
+        // 5ms //
+        for (int i=0;i<all;i+=1) {
+
+            Gamxxx[i] = HALF * ( gupxx[i]*gxxx[i]
+                            + gupxy[i]*(TWO*gxyx[i] - gxxy[i])
+                            + gupxz[i]*(TWO*gxzx[i] - gxxz[i]) );
+
+            Gamyxx[i] = HALF * ( gupxy[i]*gxxx[i]
+                            + gupyy[i]*(TWO*gxyx[i] - gxxy[i])
+                            + gupyz[i]*(TWO*gxzx[i] - gxxz[i]) );
+
+            Gamzxx[i] = HALF * ( gupxz[i]*gxxx[i]
+                            + gupyz[i]*(TWO*gxyx[i] - gxxy[i])
+                            + gupzz[i]*(TWO*gxzx[i] - gxxz[i]) );
+
+            Gamxyy[i] = HALF * ( gupxx[i]*(TWO*gxyy[i] - gyyx[i])
+                            + gupxy[i]*gyyy[i]
+                            + gupxz[i]*(TWO*gyzy[i] - gyyz[i]) );
+
+            Gamyyy[i] = HALF * ( gupxy[i]*(TWO*gxyy[i] - gyyx[i])
+                            + gupyy[i]*gyyy[i]
+                            + gupyz[i]*(TWO*gyzy[i] - gyyz[i]) );
+
+            Gamzyy[i] = HALF * ( gupxz[i]*(TWO*gxyy[i] - gyyx[i])
+                            + gupyz[i]*gyyy[i]
+                            + gupzz[i]*(TWO*gyzy[i] - gyyz[i]) );
+
+            Gamxzz[i] = HALF * ( gupxx[i]*(TWO*gxzz[i] - gzzx[i])
+                            + gupxy[i]*(TWO*gyzz[i] - gzzy[i])
+                            + gupxz[i]*gzzz[i] );
+
+            Gamyzz[i] = HALF * ( gupxy[i]*(TWO*gxzz[i] - gzzx[i])
+                            + gupyy[i]*(TWO*gyzz[i] - gzzy[i])
+                            + gupyz[i]*gzzz[i] );
+
+            Gamzzz[i] = HALF * ( gupxz[i]*(TWO*gxzz[i] - gzzx[i])
+                            + gupyz[i]*(TWO*gyzz[i] - gzzy[i])
+                            + gupzz[i]*gzzz[i] );
+
+            Gamxxy[i] = HALF * ( gupxx[i]*gxxy[i]
+                            + gupxy[i]*gyyx[i]
+                            + gupxz[i]*(gxzy[i] + gyzx[i] - gxyz[i]) );
+
+            Gamyxy[i] = HALF * ( gupxy[i]*gxxy[i]
+                            + gupyy[i]*gyyx[i]
+                            + gupyz[i]*(gxzy[i] + gyzx[i] - gxyz[i]) );
+
+            Gamzxy[i] = HALF * ( gupxz[i]*gxxy[i]
+                            + gupyz[i]*gyyx[i]
+                            + gupzz[i]*(gxzy[i] + gyzx[i] - gxyz[i]) );
+
+            Gamxxz[i] = HALF * ( gupxx[i]*gxxz[i]
+                            + gupxy[i]*(gxyz[i] + gyzx[i] - gxzy[i])
+                            + gupxz[i]*gzzx[i] );
+
+            Gamyxz[i] = HALF * ( gupxy[i]*gxxz[i]
+                            + gupyy[i]*(gxyz[i] + gyzx[i] - gxzy[i])
+                            + gupyz[i]*gzzx[i] );
+
+            Gamzxz[i] = HALF * ( gupxz[i]*gxxz[i]
+                            + gupyz[i]*(gxyz[i] + gyzx[i] - gxzy[i])
+                            + gupzz[i]*gzzx[i] );
+
+            Gamxyz[i] = HALF * ( gupxx[i]*(gxyz[i] + gxzy[i] - gyzx[i])
+                            + gupxy[i]*gyyz[i]
+                            + gupxz[i]*gzzy[i] );
+
+            Gamyyz[i] = HALF * ( gupxy[i]*(gxyz[i] + gxzy[i] - gyzx[i])
+                            + gupyy[i]*gyyz[i]
+                            + gupyz[i]*gzzy[i] );
+
+            Gamzyz[i] = HALF * ( gupxz[i]*(gxyz[i] + gxzy[i] - gyzx[i])
+                            + gupyz[i]*gyyz[i]
+                            + gupzz[i]*gzzy[i] );
+
+        }
+        // 1.8ms //
+        for (int i=0;i<all;i+=1) {
+
+            Rxx[i] = gupxx[i]*gupxx[i]*Axx[i]
                + gupxy[i]*gupxy[i]*Ayy[i]
                + gupxz[i]*gupxz[i]*Azz[i]
                + TWO * ( gupxx[i]*gupxy[i]*Axy[i]
                        + gupxx[i]*gupxz[i]*Axz[i]
                        + gupxy[i]*gupxz[i]*Ayz[i] );
-            double ayy = gupxy[i]*gupxy[i]*Axx[i]
+
+            Ryy[i] = gupxy[i]*gupxy[i]*Axx[i]
                + gupyy[i]*gupyy[i]*Ayy[i]
                + gupyz[i]*gupyz[i]*Azz[i]
                + TWO * ( gupxy[i]*gupyy[i]*Axy[i]
                        + gupxy[i]*gupyz[i]*Axz[i]
                        + gupyy[i]*gupyz[i]*Ayz[i] );
-            double azz = gupxz[i]*gupxz[i]*Axx[i]
+
+            Rzz[i] = gupxz[i]*gupxz[i]*Axx[i]
                + gupyz[i]*gupyz[i]*Ayy[i]
                + gupzz[i]*gupzz[i]*Azz[i]
                + TWO * ( gupxz[i]*gupyz[i]*Axy[i]
                        + gupxz[i]*gupzz[i]*Axz[i]
                        + gupyz[i]*gupzz[i]*Ayz[i] );
-            double axy = gupxx[i]*gupxy[i]*Axx[i]
+
+            Rxy[i] = gupxx[i]*gupxy[i]*Axx[i]
                + gupxy[i]*gupyy[i]*Ayy[i]
                + gupxz[i]*gupyz[i]*Azz[i]
                + ( gupxx[i]*gupyy[i] + gupxy[i]*gupxy[i] ) * Axy[i]
                + ( gupxx[i]*gupyz[i] + gupxz[i]*gupxy[i] ) * Axz[i]
                + ( gupxy[i]*gupyz[i] + gupxz[i]*gupyy[i] ) * Ayz[i];
-            double axz = gupxx[i]*gupxz[i]*Axx[i]
+
+            Rxz[i] = gupxx[i]*gupxz[i]*Axx[i]
                + gupxy[i]*gupyz[i]*Ayy[i]
                + gupxz[i]*gupzz[i]*Azz[i]
                + ( gupxx[i]*gupyz[i] + gupxy[i]*gupxz[i] ) * Axy[i]
                + ( gupxx[i]*gupzz[i] + gupxz[i]*gupxz[i] ) * Axz[i]
                + ( gupxy[i]*gupzz[i] + gupxz[i]*gupyz[i] ) * Ayz[i];
-            double ayz = gupxy[i]*gupxz[i]*Axx[i]
+
+            Ryz[i] = gupxy[i]*gupxz[i]*Axx[i]
                + gupyy[i]*gupyz[i]*Ayy[i]
                + gupyz[i]*gupzz[i]*Azz[i]
                + ( gupxy[i]*gupyz[i] + gupyy[i]*gupxz[i] ) * Axy[i]
                + ( gupxy[i]*gupzz[i] + gupyz[i]*gupxz[i] ) * Axz[i]
                + ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i];
-            Rxx[i] = axx; Ryy[i] = ayy; Rzz[i] = azz;
-            Rxy[i] = axy; Rxz[i] = axz; Ryz[i] = ayz;
-
-            Gamx_rhs[i] = - TWO * ( Lapx[i]*axx + Lapy[i]*axy + Lapz[i]*axz ) +
+        }
+        // 4ms //
+        for(int i=0;i<all;i+=1){
+            Gamx_rhs[i] = - TWO * (   Lapx[i] * Rxx[i] +   Lapy[i] * Rxy[i] +   Lapz[i] * Rxz[i] ) + 
                TWO * alpn1[i] * (                                                
-                -F3o2/chin1[i] * ( chix[i]*axx + chiy[i]*axy + chiz[i]*axz ) -
-                    gupxx[i] * ( F2o3*Kx[i] + EIGHT*PI*Sx[i] ) -
-                    gupxy[i] * ( F2o3*Ky[i] + EIGHT*PI*Sy[i] ) -
-                    gupxz[i] * ( F2o3*Kz[i] + EIGHT*PI*Sz[i] ) +
-                                Gamxxx[i]*axx + Gamxyy[i]*ayy + Gamxzz[i]*azz +
-                        TWO * ( Gamxxy[i]*axy + Gamxxz[i]*axz + Gamxyz[i]*ayz ) );
+                -F3o2/chin1[i] * (   chix[i] * Rxx[i] +   chiy[i] * Rxy[i] +   chiz[i] * Rxz[i] ) - 
+                    gupxx[i] * (   F2o3 * Kx[i]  +  EIGHT * PI * Sx[i]            ) - 
+                    gupxy[i] * (   F2o3 * Ky[i]  +  EIGHT * PI * Sy[i]            ) - 
+                    gupxz[i] * (   F2o3 * Kz[i]  +  EIGHT * PI * Sz[i]            ) + 
+                                Gamxxx[i] * Rxx[i] + Gamxyy[i] * Ryy[i] + Gamxzz[i] * Rzz[i]   + 
+                        TWO * ( Gamxxy[i] * Rxy[i] + Gamxxz[i] * Rxz[i] + Gamxyz[i] * Ryz[i] ) );

-            Gamy_rhs[i] = -TWO * ( Lapx[i]*axy + Lapy[i]*ayy + Lapz[i]*ayz )
+            Gamy_rhs[i] = -TWO * ( Lapx[i]*Rxy[i] + Lapy[i]*Ryy[i] + Lapz[i]*Ryz[i] )
                        +  TWO * alpn1[i] * (
-                            -F3o2/chin1[i] * ( chix[i]*axy + chiy[i]*ayy + chiz[i]*ayz )
+                            -F3o2/chin1[i] * ( chix[i]*Rxy[i] + chiy[i]*Ryy[i] + chiz[i]*Ryz[i] )
                            - gupxy[i] * ( F2o3*Kx[i] + EIGHT*PI*Sx[i] )
                            - gupyy[i] * ( F2o3*Ky[i] + EIGHT*PI*Sy[i] )
                            - gupyz[i] * ( F2o3*Kz[i] + EIGHT*PI*Sz[i] )
-                            + Gamyxx[i]*axx + Gamyyy[i]*ayy + Gamyzz[i]*azz
-                            + TWO * ( Gamyxy[i]*axy + Gamyxz[i]*axz + Gamyyz[i]*ayz )
+                            + Gamyxx[i]*Rxx[i] + Gamyyy[i]*Ryy[i] + Gamyzz[i]*Rzz[i]
+                            + TWO * ( Gamyxy[i]*Rxy[i] + Gamyxz[i]*Rxz[i] + Gamyyz[i]*Ryz[i] )
                        );

-            Gamz_rhs[i] = -TWO * ( Lapx[i]*axz + Lapy[i]*ayz + Lapz[i]*azz )
+            Gamz_rhs[i] = -TWO * ( Lapx[i]*Rxz[i] + Lapy[i]*Ryz[i] + Lapz[i]*Rzz[i] )
                        +  TWO * alpn1[i] * (
-                            -F3o2/chin1[i] * ( chix[i]*axz + chiy[i]*ayz + chiz[i]*azz )
+                            -F3o2/chin1[i] * ( chix[i]*Rxz[i] + chiy[i]*Ryz[i] + chiz[i]*Rzz[i] )
                            - gupxz[i] * ( F2o3*Kx[i] + EIGHT*PI*Sx[i] )
                            - gupyz[i] * ( F2o3*Ky[i] + EIGHT*PI*Sy[i] )
                            - gupzz[i] * ( F2o3*Kz[i] + EIGHT*PI*Sz[i] )
-                            + Gamzxx[i]*axx + Gamzyy[i]*ayy + Gamzzz[i]*azz
-                            + TWO * ( Gamzxy[i]*axy + Gamzxz[i]*axz + Gamzyz[i]*ayz )
+                            + Gamzxx[i]*Rxx[i] + Gamzyy[i]*Ryy[i] + Gamzzz[i]*Rzz[i]
+                            + TWO * ( Gamzxy[i]*Rxy[i] + Gamzxz[i]*Rxz[i] + Gamzyz[i]*Ryz[i] )
                        );
        }
        // 22.3ms //
@@ -326,63 +341,65 @@ int f_compute_rhs_bssn(int *ex, double &T,
        fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev);
        fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev);

-        // Fused: fxx/Gamxa + Gamma_rhs part 2 (2 loops -> 1)
+        // 3.5ms //
        for(int i=0;i<all;i+=1){
-            const double divb = betaxx[i] + betayy[i] + betazz[i];
-            double lfxx = gxxx[i] + gxyy[i] + gxzz[i];
-            double lfxy = gxyx[i] + gyyy[i] + gyzz[i];
-            double lfxz = gxzx[i] + gyzy[i] + gzzz[i];
-            fxx[i] = lfxx; fxy[i] = lfxy; fxz[i] = lfxz;
-
-            double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
+            fxx[i] = gxxx[i] + gxyy[i] + gxzz[i];
+            fxy[i] = gxyx[i] + gyyy[i] + gyzz[i];
+            fxz[i] = gxzx[i] + gyzy[i] + gzzz[i];
+            Gamxa[i] = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
                    + TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] );
-            double gya = gupxx[i]*Gamyxx[i] + gupyy[i]*Gamyyy[i] + gupzz[i]*Gamyzz[i]
+
+            Gamya[i] = gupxx[i]*Gamyxx[i] + gupyy[i]*Gamyyy[i] + gupzz[i]*Gamyzz[i]
                    + TWO * ( gupxy[i]*Gamyxy[i] + gupxz[i]*Gamyxz[i] + gupyz[i]*Gamyyz[i] );
-            double gza = gupxx[i]*Gamzxx[i] + gupyy[i]*Gamzyy[i] + gupzz[i]*Gamzzz[i]
+
+            Gamza[i] = gupxx[i]*Gamzxx[i] + gupyy[i]*Gamzyy[i] + gupzz[i]*Gamzzz[i]
                    + TWO * ( gupxy[i]*Gamzxy[i] + gupxz[i]*Gamzxz[i] + gupyz[i]*Gamzyz[i] );
+        }
+        // 3.9ms //
+        for(int i=0;i<all;i+=1){
            Gamx_rhs[i] = Gamx_rhs[i]
-                        + F2o3 * gxa * divb
-                        - gxa * betaxx[i] - gya * betaxy[i] - gza * betaxz[i]
-                        + F1o3 * ( gupxx[i] * lfxx + gupxy[i] * lfxy + gupxz[i] * lfxz )
+                        + F2o3 * Gamxa[i] * div_beta[i]
+                        - Gamxa[i] * betaxx[i] - Gamya[i] * betaxy[i] - Gamza[i] * betaxz[i]
+                        + F1o3 * ( gupxx[i] * fxx[i] + gupxy[i] * fxy[i] + gupxz[i] * fxz[i] )
                        + gupxx[i] * gxxx[i] + gupyy[i] * gyyx[i] + gupzz[i] * gzzx[i]
                        + TWO * ( gupxy[i] * gxyx[i] + gupxz[i] * gxzx[i] + gupyz[i] * gyzx[i] );

            Gamy_rhs[i] = Gamy_rhs[i]
-                        + F2o3 * gya * divb
-                        - gxa * betayx[i] - gya * betayy[i] - gza * betayz[i]
-                        + F1o3 * ( gupxy[i] * lfxx + gupyy[i] * lfxy + gupyz[i] * lfxz )
+                        + F2o3 * Gamya[i] * div_beta[i]
+                        - Gamxa[i] * betayx[i] - Gamya[i] * betayy[i] - Gamza[i] * betayz[i]
+                        + F1o3 * ( gupxy[i] * fxx[i] + gupyy[i] * fxy[i] + gupyz[i] * fxz[i] )
                        + gupxx[i] * gxxy[i] + gupyy[i] * gyyy[i] + gupzz[i] * gzzy[i]
                        + TWO * ( gupxy[i] * gxyy[i] + gupxz[i] * gxzy[i] + gupyz[i] * gyzy[i] );

            Gamz_rhs[i] = Gamz_rhs[i]
-                        + F2o3 * gza * divb
-                        - gxa * betazx[i] - gya * betazy[i] - gza * betazz[i]
-                        + F1o3 * ( gupxz[i] * lfxx + gupyz[i] * lfxy + gupzz[i] * lfxz )
+                        + F2o3 * Gamza[i] * div_beta[i]
+                        - Gamxa[i] * betazx[i] - Gamya[i] * betazy[i] - Gamza[i] * betazz[i]
+                        + F1o3 * ( gupxz[i] * fxx[i] + gupyz[i] * fxy[i] + gupzz[i] * fxz[i] )
                        + gupxx[i] * gxxz[i] + gupyy[i] * gyyz[i] + gupzz[i] * gzzz[i]
                        + TWO * ( gupxy[i] * gxyz[i] + gupxz[i] * gxzz[i] + gupyz[i] * gyzz[i] );
        }
        // 4.4ms //
        for (int i=0;i<all;i+=1) {
-            gxxx[i] = (dxx[i] + ONE)*Gamxxx[i] + gxy[i]*Gamyxx[i] + gxz[i]*Gamzxx[i];
-            gxyx[i] = (dxx[i] + ONE)*Gamxxy[i] + gxy[i]*Gamyxy[i] + gxz[i]*Gamzxy[i];
-            gxzx[i] = (dxx[i] + ONE)*Gamxxz[i] + gxy[i]*Gamyxz[i] + gxz[i]*Gamzxz[i];
-            gyyx[i] = (dxx[i] + ONE)*Gamxyy[i] + gxy[i]*Gamyyy[i] + gxz[i]*Gamzyy[i];
-            gyzx[i] = (dxx[i] + ONE)*Gamxyz[i] + gxy[i]*Gamyyz[i] + gxz[i]*Gamzyz[i];
-            gzzx[i] = (dxx[i] + ONE)*Gamxzz[i] + gxy[i]*Gamyzz[i] + gxz[i]*Gamzzz[i];
+            gxxx[i] = gxx[i]*Gamxxx[i] + gxy[i]*Gamyxx[i] + gxz[i]*Gamzxx[i];
+            gxyx[i] = gxx[i]*Gamxxy[i] + gxy[i]*Gamyxy[i] + gxz[i]*Gamzxy[i];
+            gxzx[i] = gxx[i]*Gamxxz[i] + gxy[i]*Gamyxz[i] + gxz[i]*Gamzxz[i];
+            gyyx[i] = gxx[i]*Gamxyy[i] + gxy[i]*Gamyyy[i] + gxz[i]*Gamzyy[i];
+            gyzx[i] = gxx[i]*Gamxyz[i] + gxy[i]*Gamyyz[i] + gxz[i]*Gamzyz[i];
+            gzzx[i] = gxx[i]*Gamxzz[i] + gxy[i]*Gamyzz[i] + gxz[i]*Gamzzz[i];

-            gxxy[i] = gxy[i]*Gamxxx[i] + (dyy[i] + ONE)*Gamyxx[i] + gyz[i]*Gamzxx[i];
-            gxyy[i] = gxy[i]*Gamxxy[i] + (dyy[i] + ONE)*Gamyxy[i] + gyz[i]*Gamzxy[i];
-            gxzy[i] = gxy[i]*Gamxxz[i] + (dyy[i] + ONE)*Gamyxz[i] + gyz[i]*Gamzxz[i];
-            gyyy[i] = gxy[i]*Gamxyy[i] + (dyy[i] + ONE)*Gamyyy[i] + gyz[i]*Gamzyy[i];
-            gyzy[i] = gxy[i]*Gamxyz[i] + (dyy[i] + ONE)*Gamyyz[i] + gyz[i]*Gamzyz[i];
-            gzzy[i] = gxy[i]*Gamxzz[i] + (dyy[i] + ONE)*Gamyzz[i] + gyz[i]*Gamzzz[i];
+            gxxy[i] = gxy[i]*Gamxxx[i] + gyy[i]*Gamyxx[i] + gyz[i]*Gamzxx[i];
+            gxyy[i] = gxy[i]*Gamxxy[i] + gyy[i]*Gamyxy[i] + gyz[i]*Gamzxy[i];
+            gxzy[i] = gxy[i]*Gamxxz[i] + gyy[i]*Gamyxz[i] + gyz[i]*Gamzxz[i];
+            gyyy[i] = gxy[i]*Gamxyy[i] + gyy[i]*Gamyyy[i] + gyz[i]*Gamzyy[i];
+            gyzy[i] = gxy[i]*Gamxyz[i] + gyy[i]*Gamyyz[i] + gyz[i]*Gamzyz[i];
+            gzzy[i] = gxy[i]*Gamxzz[i] + gyy[i]*Gamyzz[i] + gyz[i]*Gamzzz[i];

-            gxxz[i] = gxz[i]*Gamxxx[i] + gyz[i]*Gamyxx[i] + (dzz[i] + ONE)*Gamzxx[i];
-            gxyz[i] = gxz[i]*Gamxxy[i] + gyz[i]*Gamyxy[i] + (dzz[i] + ONE)*Gamzxy[i];
-            gxzz[i] = gxz[i]*Gamxxz[i] + gyz[i]*Gamyxz[i] + (dzz[i] + ONE)*Gamzxz[i];
-            gyyz[i] = gxz[i]*Gamxyy[i] + gyz[i]*Gamyyy[i] + (dzz[i] + ONE)*Gamzyy[i];
-            gyzz[i] = gxz[i]*Gamxyz[i] + gyz[i]*Gamyyz[i] + (dzz[i] + ONE)*Gamzyz[i];
-            gzzz[i] = gxz[i]*Gamxzz[i] + gyz[i]*Gamyzz[i] + (dzz[i] + ONE)*Gamzzz[i];
+            gxxz[i] = gxz[i]*Gamxxx[i] + gyz[i]*Gamyxx[i] + gzz[i]*Gamzxx[i];
+            gxyz[i] = gxz[i]*Gamxxy[i] + gyz[i]*Gamyxy[i] + gzz[i]*Gamzxy[i];
+            gxzz[i] = gxz[i]*Gamxxz[i] + gyz[i]*Gamyxz[i] + gzz[i]*Gamzxz[i];
+            gyyz[i] = gxz[i]*Gamxyy[i] + gyz[i]*Gamyyy[i] + gzz[i]*Gamzyy[i];
+            gyzz[i] = gxz[i]*Gamxyz[i] + gyz[i]*Gamyyz[i] + gzz[i]*Gamzyz[i];
+            gzzz[i] = gxz[i]*Gamxzz[i] + gyz[i]*Gamyzz[i] + gzz[i]*Gamzzz[i];
        }
        // 22.2ms //
            fdderivs(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
@@ -430,17 +447,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
        // 14ms //
        /* 假设 all = ex1*ex2*ex3，所有量都是 length=all 的 double 数组（已按同一扁平化规则排布） */
        for (int i = 0; i < all; i += 1) {
-            const double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
-                            + TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] );
-            const double gya = gupxx[i]*Gamyxx[i] + gupyy[i]*Gamyyy[i] + gupzz[i]*Gamyzz[i]
-                            + TWO * ( gupxy[i]*Gamyxy[i] + gupxz[i]*Gamyxz[i] + gupyz[i]*Gamyyz[i] );
-            const double gza = gupxx[i]*Gamzxx[i] + gupyy[i]*Gamzyy[i] + gupzz[i]*Gamzzz[i]
-                            + TWO * ( gupxy[i]*Gamzxy[i] + gupxz[i]*Gamzxz[i] + gupyz[i]*Gamzyz[i] );
-
            Rxx[i] =
                -HALF * Rxx[i]
-                + (dxx[i] + ONE) * Gamxx[i] + gxy[i] * Gamyx[i] + gxz[i] * Gamzx[i]
-                + gxa * gxxx[i] + gya * gxyx[i] + gza * gxzx[i]
+                + gxx[i] * Gamxx[i] + gxy[i] * Gamyx[i] + gxz[i] * Gamzx[i]
+                + Gamxa[i] * gxxx[i] + Gamya[i] * gxyx[i] + Gamza[i] * gxzx[i]
                + gupxx[i] * (
                    TWO * (Gamxxx[i] * gxxx[i] + Gamyxx[i] * gxyx[i] + Gamzxx[i] * gxzx[i]) +
                        (Gamxxx[i] * gxxx[i] + Gamyxx[i] * gxxy[i] + Gamzxx[i] * gxxz[i])
@@ -474,8 +484,8 @@ int f_compute_rhs_bssn(int *ex, double &T,

            Ryy[i] =
                -HALF * Ryy[i]
-                + gxy[i] * Gamxy[i] + (dyy[i] + ONE) * Gamyy[i] + gyz[i] * Gamzy[i]
-                + gxa * gxyy[i] + gya * gyyy[i] + gza * gyzy[i]
+                + gxy[i] * Gamxy[i] + gyy[i] * Gamyy[i] + gyz[i] * Gamzy[i]
+                + Gamxa[i] * gxyy[i] + Gamya[i] * gyyy[i] + Gamza[i] * gyzy[i]
                + gupxx[i] * (
                    TWO * (Gamxxy[i] * gxxy[i] + Gamyxy[i] * gxyy[i] + Gamzxy[i] * gxzy[i]) +
                        (Gamxxy[i] * gxyx[i] + Gamyxy[i] * gxyy[i] + Gamzxy[i] * gxyz[i])
@@ -509,8 +519,8 @@ int f_compute_rhs_bssn(int *ex, double &T,

            Rzz[i] =
                -HALF * Rzz[i]
-                + gxz[i] * Gamxz[i] + gyz[i] * Gamyz[i] + (dzz[i] + ONE) * Gamzz[i]
-                + gxa * gxzz[i] + gya * gyzz[i] + gza * gzzz[i]
+                + gxz[i] * Gamxz[i] + gyz[i] * Gamyz[i] + gzz[i] * Gamzz[i]
+                + Gamxa[i] * gxzz[i] + Gamya[i] * gyzz[i] + Gamza[i] * gzzz[i]
                + gupxx[i] * (
                    TWO * (Gamxxz[i] * gxxz[i] + Gamyxz[i] * gxyz[i] + Gamzxz[i] * gxzz[i]) +
                        (Gamxxz[i] * gxzx[i] + Gamyxz[i] * gxzy[i] + Gamzxz[i] * gxzz[i])
@@ -545,10 +555,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
            Rxy[i] =
                HALF * (
                    -Rxy[i]
-                    + (dxx[i] + ONE) * Gamxy[i] + gxy[i] * Gamyy[i] + gxz[i] * Gamzy[i]
-                    + gxy[i] * Gamxx[i] + (dyy[i] + ONE) * Gamyx[i] + gyz[i] * Gamzx[i]
-                    + gxa * gxyx[i] + gya * gyyx[i] + gza * gyzx[i]
-                    + gxa * gxxy[i] + gya * gxyy[i] + gza * gxzy[i]
+                    + gxx[i] * Gamxy[i] + gxy[i] * Gamyy[i] + gxz[i] * Gamzy[i]
+                    + gxy[i] * Gamxx[i] + gyy[i] * Gamyx[i] + gyz[i] * Gamzx[i]
+                    + Gamxa[i] * gxyx[i] + Gamya[i] * gyyx[i] + Gamza[i] * gyzx[i]
+                    + Gamxa[i] * gxxy[i] + Gamya[i] * gxyy[i] + Gamza[i] * gxzy[i]
                )
                + gupxx[i] * (
                    Gamxxx[i] * gxxy[i] + Gamyxx[i] * gxyy[i] + Gamzxx[i] * gxzy[i]
@@ -593,10 +603,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
            Rxz[i] =
                HALF * (
                    -Rxz[i]
-                    + (dxx[i] + ONE) * Gamxz[i] + gxy[i] * Gamyz[i] + gxz[i] * Gamzz[i]
-                    + gxz[i] * Gamxx[i] + gyz[i] * Gamyx[i] + (dzz[i] + ONE) * Gamzx[i]
-                    + gxa * gxzx[i] + gya * gyzx[i] + gza * gzzx[i]
-                    + gxa * gxxz[i] + gya * gxyz[i] + gza * gxzz[i]
+                    + gxx[i] * Gamxz[i] + gxy[i] * Gamyz[i] + gxz[i] * Gamzz[i]
+                    + gxz[i] * Gamxx[i] + gyz[i] * Gamyx[i] + gzz[i] * Gamzx[i]
+                    + Gamxa[i] * gxzx[i] + Gamya[i] * gyzx[i] + Gamza[i] * gzzx[i]
+                    + Gamxa[i] * gxxz[i] + Gamya[i] * gxyz[i] + Gamza[i] * gxzz[i]
                )
                + gupxx[i] * (
                    Gamxxx[i] * gxxz[i] + Gamyxx[i] * gxyz[i] + Gamzxx[i] * gxzz[i]
@@ -641,10 +651,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
            Ryz[i] =
                HALF * (
                    -Ryz[i]
-                    + gxy[i] * Gamxz[i] + (dyy[i] + ONE) * Gamyz[i] + gyz[i] * Gamzz[i]
-                    + gxz[i] * Gamxy[i] + gyz[i] * Gamyy[i] + (dzz[i] + ONE) * Gamzy[i]
-                    + gxa * gxzy[i] + gya * gyzy[i] + gza * gzzy[i]
-                    + gxa * gxyz[i] + gya * gyyz[i] + gza * gyzz[i]
+                    + gxy[i] * Gamxz[i] + gyy[i] * Gamyz[i] + gyz[i] * Gamzz[i]
+                    + gxz[i] * Gamxy[i] + gyz[i] * Gamyy[i] + gzz[i] * Gamzy[i]
+                    + Gamxa[i] * gxzy[i] + Gamya[i] * gyzy[i] + Gamza[i] * gzzy[i]
+                    + Gamxa[i] * gxyz[i] + Gamya[i] * gyyz[i] + Gamza[i] * gyzz[i]
                )
                + gupxx[i] * (
                    Gamxxy[i] * gxxz[i] + Gamyxy[i] * gxyz[i] + Gamzxy[i] * gxzz[i]
@@ -705,9 +715,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
                + TWO * gupxy[i] * (fxy[i] - (F3o2 / chin1[i]) * chix[i] * chiy[i])
                + TWO * gupxz[i] * (fxz[i] - (F3o2 / chin1[i]) * chix[i] * chiz[i])
                + TWO * gupyz[i] * (fyz[i] - (F3o2 / chin1[i]) * chiy[i] * chiz[i]);
-            Rxx[i] = Rxx[i] + ( fxx[i] - (chix[i] * chix[i]) / (chin1[i] * TWO) + (dxx[i] + ONE) * f[i] ) / (chin1[i] * TWO);
-            Ryy[i] = Ryy[i] + ( fyy[i] - (chiy[i] * chiy[i]) / (chin1[i] * TWO) + (dyy[i] + ONE) * f[i] ) / (chin1[i] * TWO);
-            Rzz[i] = Rzz[i] + ( fzz[i] - (chiz[i] * chiz[i]) / (chin1[i] * TWO) + (dzz[i] + ONE) * f[i] ) / (chin1[i] * TWO);
+            Rxx[i] = Rxx[i] + ( fxx[i] - (chix[i] * chix[i]) / (chin1[i] * TWO) + gxx[i] * f[i] ) / (chin1[i] * TWO);
+            Ryy[i] = Ryy[i] + ( fyy[i] - (chiy[i] * chiy[i]) / (chin1[i] * TWO) + gyy[i] * f[i] ) / (chin1[i] * TWO);
+            Rzz[i] = Rzz[i] + ( fzz[i] - (chiz[i] * chiz[i]) / (chin1[i] * TWO) + gzz[i] * f[i] ) / (chin1[i] * TWO);

            Rxy[i] = Rxy[i] + ( fxy[i] - (chix[i] * chiy[i]) / (chin1[i] * TWO) + gxy[i] * f[i] ) / (chin1[i] * TWO);
            Rxz[i] = Rxz[i] + ( fxz[i] - (chix[i] * chiz[i]) / (chin1[i] * TWO) + gxz[i] * f[i] ) / (chin1[i] * TWO);
@@ -716,6 +726,7 @@ int f_compute_rhs_bssn(int *ex, double &T,

        // 24ms //
        fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
+        fderivs(ex,chi,dtSfx_rhs,dtSfy_rhs,dtSfz_rhs,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);

        // 6ms //
        for (int i=0;i<all;i+=1) {
@@ -725,17 +736,17 @@ int f_compute_rhs_bssn(int *ex, double &T,
            gxxz[i] = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) / chin1[i];

            /* Christoffel 修正项 */
-            Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) / chin1[i]) - (dxx[i] + ONE) * gxxx[i] ) * HALF;
-            Gamyxx[i] = Gamyxx[i] - ( 0.0           - (dxx[i] + ONE) * gxxy[i] ) * HALF;  /* 原式只有 -gxx*gxxy */
-            Gamzxx[i] = Gamzxx[i] - ( 0.0           - (dxx[i] + ONE) * gxxz[i] ) * HALF;
+            Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) / chin1[i]) - gxx[i] * gxxx[i] ) * HALF;
+            Gamyxx[i] = Gamyxx[i] - ( 0.0           - gxx[i] * gxxy[i] ) * HALF;  /* 原式只有 -gxx*gxxy */
+            Gamzxx[i] = Gamzxx[i] - ( 0.0           - gxx[i] * gxxz[i] ) * HALF;

-            Gamxyy[i] = Gamxyy[i] - ( 0.0           - (dyy[i] + ONE) * gxxx[i] ) * HALF;
-            Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) / chin1[i]) - (dyy[i] + ONE) * gxxy[i] ) * HALF;
-            Gamzyy[i] = Gamzyy[i] - ( 0.0           - (dyy[i] + ONE) * gxxz[i] ) * HALF;
+            Gamxyy[i] = Gamxyy[i] - ( 0.0           - gyy[i] * gxxx[i] ) * HALF;
+            Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) / chin1[i]) - gyy[i] * gxxy[i] ) * HALF;
+            Gamzyy[i] = Gamzyy[i] - ( 0.0           - gyy[i] * gxxz[i] ) * HALF;

-            Gamxzz[i] = Gamxzz[i] - ( 0.0           - (dzz[i] + ONE) * gxxx[i] ) * HALF;
-            Gamyzz[i] = Gamyzz[i] - ( 0.0           - (dzz[i] + ONE) * gxxy[i] ) * HALF;
-            Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) / chin1[i]) - (dzz[i] + ONE) * gxxz[i] ) * HALF;
+            Gamxzz[i] = Gamxzz[i] - ( 0.0           - gzz[i] * gxxx[i] ) * HALF;
+            Gamyzz[i] = Gamyzz[i] - ( 0.0           - gzz[i] * gxxy[i] ) * HALF;
+            Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) / chin1[i]) - gzz[i] * gxxz[i] ) * HALF;

            Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] / chin1[i])      - gxy[i] * gxxx[i] ) * HALF;
            Gamyxy[i] = Gamyxy[i] - ( ( chix[i] / chin1[i])      - gxy[i] * gxxy[i] ) * HALF;
@@ -757,13 +768,14 @@ int f_compute_rhs_bssn(int *ex, double &T,
            fxy[i] = fxy[i] - Gamxxy[i] * Lapx[i] - Gamyxy[i] * Lapy[i] - Gamzxy[i] * Lapz[i];
            fxz[i] = fxz[i] - Gamxxz[i] * Lapx[i] - Gamyxz[i] * Lapy[i] - Gamzxz[i] * Lapz[i];
            fyz[i] = fyz[i] - Gamxyz[i] * Lapx[i] - Gamyyz[i] * Lapy[i] - Gamzyz[i] * Lapz[i];
-
+        }
+        // 1ms //
+        for (int i=0;i<all;i+=1) {
            trK_rhs[i] =  gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
                    +  TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] );
        }
        // 2.5ms //
        for (int i=0;i<all;i+=1) {
-            const double divb = betaxx[i] + betayy[i] + betazz[i];

            S[i] = chin1[i] * (
                    gupxx[i] * Sxx[i] + gupyy[i] * Syy[i] + gupzz[i] * Szz[i]
@@ -814,20 +826,23 @@ int f_compute_rhs_bssn(int *ex, double &T,
                + (alpn1[i] / chin1[i]) * f[i]
            );

-            double l_fxx = alpn1[i] * (Rxx[i] - EIGHT * PI * Sxx[i]) - fxx[i];
-            double l_fxy = alpn1[i] * (Rxy[i] - EIGHT * PI * Sxy[i]) - fxy[i];
-            double l_fxz = alpn1[i] * (Rxz[i] - EIGHT * PI * Sxz[i]) - fxz[i];
-            double l_fyy = alpn1[i] * (Ryy[i] - EIGHT * PI * Syy[i]) - fyy[i];
-            double l_fyz = alpn1[i] * (Ryz[i] - EIGHT * PI * Syz[i]) - fyz[i];
-            double l_fzz = alpn1[i] * (Rzz[i] - EIGHT * PI * Szz[i]) - fzz[i];
+            fxx[i] = alpn1[i] * (Rxx[i] - EIGHT * PI * Sxx[i]) - fxx[i];
+            fxy[i] = alpn1[i] * (Rxy[i] - EIGHT * PI * Sxy[i]) - fxy[i];
+            fxz[i] = alpn1[i] * (Rxz[i] - EIGHT * PI * Sxz[i]) - fxz[i];
+            fyy[i] = alpn1[i] * (Ryy[i] - EIGHT * PI * Syy[i]) - fyy[i];
+            fyz[i] = alpn1[i] * (Ryz[i] - EIGHT * PI * Syz[i]) - fyz[i];
+            fzz[i] = alpn1[i] * (Rzz[i] - EIGHT * PI * Szz[i]) - fzz[i];
+        }
+        // 8ms //
+        for (int i=0;i<all;i+=1) {

            /* Aij_rhs = fij - gij * f */
-            Axx_rhs[i] = l_fxx - (dxx[i] + ONE) * f[i];
-            Ayy_rhs[i] = l_fyy - (dyy[i] + ONE) * f[i];
-            Azz_rhs[i] = l_fzz - (dzz[i] + ONE) * f[i];
-            Axy_rhs[i] = l_fxy - gxy[i] * f[i];
-            Axz_rhs[i] = l_fxz - gxz[i] * f[i];
-            Ayz_rhs[i] = l_fyz - gyz[i] * f[i];
+            Axx_rhs[i] = fxx[i] - gxx[i] * f[i];
+            Ayy_rhs[i] = fyy[i] - gyy[i] * f[i];
+            Azz_rhs[i] = fzz[i] - gzz[i] * f[i];
+            Axy_rhs[i] = fxy[i] - gxy[i] * f[i];
+            Axz_rhs[i] = fxz[i] - gxz[i] * f[i];
+            Ayz_rhs[i] = fyz[i] - gyz[i] * f[i];

            /* Now: store A_il A^l_j into fij: */
            fxx[i] =
@@ -889,19 +904,19 @@ int f_compute_rhs_bssn(int *ex, double &T,
                f[i] * Axx_rhs[i]
                + alpn1[i] * ( trK[i] * Axx[i] - TWO * fxx[i] )
                + TWO * ( Axx[i] * betaxx[i] + Axy[i] * betayx[i] + Axz[i] * betazx[i] )
-                - F2o3 * Axx[i] * divb;
+                - F2o3 * Axx[i] * div_beta[i];

            Ayy_rhs[i] =
                f[i] * Ayy_rhs[i]
                + alpn1[i] * ( trK[i] * Ayy[i] - TWO * fyy[i] )
                + TWO * ( Axy[i] * betaxy[i] + Ayy[i] * betayy[i] + Ayz[i] * betazy[i] )
-                - F2o3 * Ayy[i] * divb;
+                - F2o3 * Ayy[i] * div_beta[i];

            Azz_rhs[i] =
                f[i] * Azz_rhs[i]
                + alpn1[i] * ( trK[i] * Azz[i] - TWO * fzz[i] )
                + TWO * ( Axz[i] * betaxz[i] + Ayz[i] * betayz[i] + Azz[i] * betazz[i] )
-                - F2o3 * Azz[i] * divb;
+                - F2o3 * Azz[i] * div_beta[i];

            Axy_rhs[i] =
                f[i] * Axy_rhs[i]
@@ -910,7 +925,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
                + Axz[i] * betazy[i]
                + Ayy[i] * betayx[i]
                + Ayz[i] * betazx[i]
-                + F1o3 * Axy[i] * divb
+                + F1o3 * Axy[i] * div_beta[i]
                - Axy[i] * betazz[i];

            Ayz_rhs[i] =
@@ -920,7 +935,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
                + Ayy[i] * betayz[i]
                + Axz[i] * betaxy[i]
                + Azz[i] * betazy[i]
-                + F1o3 * Ayz[i] * divb
+                + F1o3 * Ayz[i] * div_beta[i]
                - Ayz[i] * betaxx[i];

            Axz_rhs[i] =
@@ -930,7 +945,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
                + Axy[i] * betayz[i]
                + Ayz[i] * betayx[i]
                + Azz[i] * betazx[i]
-                + F1o3 * Axz[i] * divb
+                + F1o3 * Axz[i] * div_beta[i]
                - Axz[i] * betayy[i];

            /* Compute trace of S_ij */
@@ -951,141 +966,82 @@ int f_compute_rhs_bssn(int *ex, double &T,
        }

        // 1ms //
-        #if (GAUGE == 6 || GAUGE == 7)
-        if (BHN == 2) {
-            const double M = Mass[0] + Mass[1];
-            const double A = TWO / M;
-            const double w1 = 1.2e1;
-            const double w2 = w1;
-            const double C1 = ONE / Mass[0] - A;
-            const double C2 = ONE / Mass[1] - A;
-            const double denom =
-                (Porg[0] - Porg[3]) * (Porg[0] - Porg[3]) +
-                (Porg[1] - Porg[4]) * (Porg[1] - Porg[4]) +
-                (Porg[2] - Porg[5]) * (Porg[2] - Porg[5]);
-
-            for (int k0 = 0; k0 < nz; ++k0) {
-                for (int j0 = 0; j0 < ny; ++j0) {
-                    for (int i0 = 0; i0 < nx; ++i0) {
-                        const size_t p = idx_ex(i0, j0, k0, ex);
-                        const double dx1 = Porg[0] - X[i0];
-                        const double dy1 = Porg[1] - Y[j0];
-                        const double dz1 = Porg[2] - Z[k0];
-                        const double dx2 = Porg[3] - X[i0];
-                        const double dy2 = Porg[4] - Y[j0];
-                        const double dz2 = Porg[5] - Z[k0];
-                        const double r1 = (dx1 * dx1 + dy1 * dy1 + dz1 * dz1) / denom;
-                        const double r2 = (dx2 * dx2 + dy2 * dy2 + dz2 * dz2) / denom;
-                        #if (GAUGE == 6)
-                        reta[p] = A + C1 / (ONE + w1 * r1) + C2 / (ONE + w2 * r2);
-                        #else
-                        reta[p] = A + C1 * exp(-w1 * r1) + C2 * exp(-w2 * r2);
-                        #endif
-                    }
-                }
-            }
-        } else {
-            printf("not support BH_num in Jason's form %d %d\n", (GAUGE == 6) ? 1 : 2, BHN);
-            for (int i = 0; i < all; ++i) reta[i] = ZEO;
-        }
-        #endif
-
-        for (int i = 0; i < all; i += 1) {
-            #if (GAUGE == 0)
+        for(int i=0;i<all;i+=1){
            betax_rhs[i] = FF * dtSfx[i];
            betay_rhs[i] = FF * dtSfy[i];
            betaz_rhs[i] = FF * dtSfz[i];
+#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
+            reta[i] =
+                gupxx[i] * dtSfx_rhs[i] * dtSfx_rhs[i]
+                + gupyy[i] * dtSfy_rhs[i] * dtSfy_rhs[i]
+                + gupzz[i] * dtSfz_rhs[i] * dtSfz_rhs[i]
+                + TWO * ( gupxy[i] * dtSfx_rhs[i] * dtSfy_rhs[i]
+                        + gupxz[i] * dtSfx_rhs[i] * dtSfz_rhs[i]
+                        + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );

+            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (1.0 - sqrt(chin1[i])), 2.0 );
+
+            dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i];
+            dtSfy_rhs[i] = Gamy_rhs[i] - reta[i] * dtSfy[i];
+            dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
+#else
            dtSfx_rhs[i] = Gamx_rhs[i] - eta * dtSfx[i];
            dtSfy_rhs[i] = Gamy_rhs[i] - eta * dtSfy[i];
            dtSfz_rhs[i] = Gamz_rhs[i] - eta * dtSfz[i];
-            #elif (GAUGE == 1)
-            betax_rhs[i] = Gamx[i] - eta * betax[i];
-            betay_rhs[i] = Gamy[i] - eta * betay[i];
-            betaz_rhs[i] = Gamz[i] - eta * betaz[i];
-
-            dtSfx_rhs[i] = ZEO;
-            dtSfy_rhs[i] = ZEO;
-            dtSfz_rhs[i] = ZEO;
-            #elif (GAUGE == 2 || GAUGE == 3)
-            betax_rhs[i] = FF * dtSfx[i];
-            betay_rhs[i] = FF * dtSfy[i];
-            betaz_rhs[i] = FF * dtSfz[i];
-
-            reta[i] =
-                gupxx[i] * chix[i] * chix[i]
-                + gupyy[i] * chiy[i] * chiy[i]
-                + gupzz[i] * chiz[i] * chiz[i]
-                + TWO * ( gupxy[i] * chix[i] * chiy[i]
-                        + gupxz[i] * chix[i] * chiz[i]
-                        + gupyz[i] * chiy[i] * chiz[i] );
-
-            #if (GAUGE == 2)
-            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
-            #else
-            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
-            #endif
-
-            dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i];
-            dtSfy_rhs[i] = Gamy_rhs[i] - reta[i] * dtSfy[i];
-            dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
-            #elif (GAUGE == 4 || GAUGE == 5)
-            reta[i] =
-                gupxx[i] * chix[i] * chix[i]
-                + gupyy[i] * chiy[i] * chiy[i]
-                + gupzz[i] * chiz[i] * chiz[i]
-                + TWO * ( gupxy[i] * chix[i] * chiy[i]
-                        + gupxz[i] * chix[i] * chiz[i]
-                        + gupyz[i] * chiy[i] * chiz[i] );
-
-            #if (GAUGE == 4)
-            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
-            #else
-            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
-            #endif
-
-            betax_rhs[i] = FF * Gamx[i] - reta[i] * betax[i];
-            betay_rhs[i] = FF * Gamy[i] - reta[i] * betay[i];
-            betaz_rhs[i] = FF * Gamz[i] - reta[i] * betaz[i];
-
-            dtSfx_rhs[i] = ZEO;
-            dtSfy_rhs[i] = ZEO;
-            dtSfz_rhs[i] = ZEO;
-            #elif (GAUGE == 6 || GAUGE == 7)
-            betax_rhs[i] = FF * dtSfx[i];
-            betay_rhs[i] = FF * dtSfy[i];
-            betaz_rhs[i] = FF * dtSfz[i];
-
-            dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i];
-            dtSfy_rhs[i] = Gamy_rhs[i] - reta[i] * dtSfy[i];
-            dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
-            #endif
+#endif
+        }
+        // 26ms //
+        lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS);
+        lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA);
+        lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS);
+        lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS);
+        lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA);
+        lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS);
+        lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS);
+        lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS);
+        lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA);
+        lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA);
+        lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS);
+        lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS);
+        lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS);
+        lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS);
+        lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS);
+        lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA);
+        lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA);
+        lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS);
+        lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA);
+        lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS);
+        lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS);
+        lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS);
+        lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS);
+        lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS);
+        // 20ms //
+        if(eps>0){
+            kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps);
+            kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps);
+            kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps);
+            kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps);
+            kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps);
+            kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps);
+            kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps);
+            kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps);
+            kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps);
+            kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps);
+            kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps);
+            kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps);
+            kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps);
+            kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps);
+            kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps);
+            kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps);
+            kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps);
+            kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps);
+            kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps);
+            kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps);
+            kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps);
+            kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps);
+            kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps);
+            kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps);
        }
-        // advection + KO dissipation with shared symmetry buffer
-        lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
-        lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps);
-        lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps);
-        lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps);
-        lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps);
-        lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps);
-        lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
-        lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps);
-        lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps);
-        lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps);
-        lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
-        lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps);
-        lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps);
-        lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps);
-        lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps);
        // 2ms //
        if(co==0){
            for (int i=0;i<all;i+=1) {
@@ -1138,6 +1094,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
            fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
            fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0);
            fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
+        }
        // 7ms //
        for (int i=0;i<all;i+=1) {
            gxxx[i] = gxxx[i] - (  Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]
@@ -1191,7 +1148,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
            movy_Res[i] = movy_Res[i] - F2o3*Ky[i] - F8*PI*Sy[i];
            movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
        }
-        }


            
--- a/AMSS_NCKU_source/cgh.C
+++ b/AMSS_NCKU_source/cgh.C
@@ -130,11 +130,7 @@ void cgh::compose_cgh(int nprocs)
  for (int lev = 0; lev < levels; lev++)
  {
    checkPatchList(PatL[lev], false);
-#ifdef INTERP_LB_OPTIMIZE
-    Parallel::distribute_optimize(PatL[lev], nprocs, ingfs, fngfs, false);
-#else
    Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false);
-#endif
 #if (RPB == 1)
    // we need distributed box of PatL[lev] and PatL[lev-1]
    if (lev > 0)
--- a/AMSS_NCKU_source/diff_newwb.f90
+++ b/AMSS_NCKU_source/diff_newwb.f90
@@ -1513,7 +1513,6 @@
  real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3))   :: fh
  real*8, dimension(3) :: SoA
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
-  integer :: i_core_min,i_core_max,j_core_min,j_core_max,k_core_min,k_core_max
  real*8  :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
  real*8  :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
@@ -1566,47 +1565,9 @@
  fxz = ZEO
  fyz = ZEO

-  i_core_min = max(1, imin+2)
-  i_core_max = min(ex(1), imax-2)
-  j_core_min = max(1, jmin+2)
-  j_core_max = min(ex(2), jmax-2)
-  k_core_min = max(1, kmin+2)
-  k_core_max = min(ex(3), kmax-2)
-
-  if(i_core_min <= i_core_max .and. j_core_min <= j_core_max .and. k_core_min <= k_core_max)then
-   do k=k_core_min,k_core_max
-   do j=j_core_min,j_core_max
-   do i=i_core_min,i_core_max
-! interior points always use 4th-order stencils without branch checks
-      fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
-                          -fh(i+2,j,k)+F16*fh(i+1,j,k)              )
-      fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
-                          -fh(i,j+2,k)+F16*fh(i,j+1,k)              )
-      fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
-                          -fh(i,j,k+2)+F16*fh(i,j,k+1)              )
-      fxy(i,j,k) = Fdxdy*(     (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k))  &
-                          -F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k))  &
-                          +F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k))  &
-                          -    (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
-      fxz(i,j,k) = Fdxdz*(     (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2))  &
-                          -F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1))  &
-                          +F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1))  &
-                          -    (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
-      fyz(i,j,k) = Fdydz*(     (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2))  &
-                          -F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1))  &
-                          +F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1))  &
-                          -    (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
-   enddo
-   enddo
-   enddo
-  endif
-
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
-      if(i>=i_core_min .and. i<=i_core_max .and. &
-         j>=j_core_min .and. j<=j_core_max .and. &
-         k>=k_core_min .and. k<=k_core_max) cycle
 !~~~~~~ fxx
        if(i+2 <= imax .and. i-2 >= imin)then
 !
--- a/AMSS_NCKU_source/fdderivs_c.C
+++ b/AMSS_NCKU_source/fdderivs_c.C
@@ -71,99 +71,149 @@ void fdderivs(const int ex[3],
    const double Fdxdz = F1o144 / (dX * dZ);
    const double Fdydz = F1o144 / (dY * dZ);

-    /* 只清零不被主循环覆盖的边界面 */
-    {
-        /* 高边界：k0=ex3-1 */
-        for (int j0 = 0; j0 < ex2; ++j0)
-            for (int i0 = 0; i0 < ex1; ++i0) {
-                const size_t p = idx_ex(i0, j0, ex3 - 1, ex);
-                fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
-                fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
-            }
-        /* 高边界：j0=ex2-1 */
-        for (int k0 = 0; k0 < ex3 - 1; ++k0)
-            for (int i0 = 0; i0 < ex1; ++i0) {
-                const size_t p = idx_ex(i0, ex2 - 1, k0, ex);
-                fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
-                fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
-            }
-        /* 高边界：i0=ex1-1 */
-        for (int k0 = 0; k0 < ex3 - 1; ++k0)
-            for (int j0 = 0; j0 < ex2 - 1; ++j0) {
-                const size_t p = idx_ex(ex1 - 1, j0, k0, ex);
-                fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
-                fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
-            }
-
-        /* 低边界：当二阶模板也不可用时，对应 i0/j0/k0=0 面 */
-        if (kminF == 1) {
-            for (int j0 = 0; j0 < ex2; ++j0)
-                for (int i0 = 0; i0 < ex1; ++i0) {
-                    const size_t p = idx_ex(i0, j0, 0, ex);
-                    fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
-                    fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
-                }
-        }
-        if (jminF == 1) {
-            for (int k0 = 0; k0 < ex3; ++k0)
-                for (int i0 = 0; i0 < ex1; ++i0) {
-                    const size_t p = idx_ex(i0, 0, k0, ex);
-                    fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
-                    fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
-                }
-        }
-        if (iminF == 1) {
-            for (int k0 = 0; k0 < ex3; ++k0)
-                for (int j0 = 0; j0 < ex2; ++j0) {
-                    const size_t p = idx_ex(0, j0, k0, ex);
-                    fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
-                    fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
-                }
-        }
+    /* 输出清零：fxx,fyy,fzz,fxy,fxz,fyz = 0 */
+    const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
+    for (size_t p = 0; p < all; ++p) {
+        fxx[p] = ZEO; fyy[p] = ZEO; fzz[p] = ZEO;
+        fxy[p] = ZEO; fxz[p] = ZEO; fyz[p] = ZEO;
    }

    /*
-     * 两段式：
-     * 1) 二阶可用区域先计算二阶模板
-     * 2) 高阶可用区域再覆盖四阶模板
+     * Fortran:
+     * do k=1,ex3-1
+     * do j=1,ex2-1
+     * do i=1,ex1-1
     */
-    const int i2_lo = (iminF > 0) ? iminF : 0;
-    const int j2_lo = (jminF > 0) ? jminF : 0;
-    const int k2_lo = (kminF > 0) ? kminF : 0;
-    const int i2_hi = ex1 - 2;
-    const int j2_hi = ex2 - 2;
-    const int k2_hi = ex3 - 2;
    
-    const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
-    const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
-    const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
-    const int i4_hi = ex1 - 3;
-    const int j4_hi = ex2 - 3;
-    const int k4_hi = ex3 - 3;
-
-    /*
-     * Strategy A:
-     * Avoid redundant work in overlap of 2nd/4th-order regions.
-     * Only compute 2nd-order on shell points that are NOT overwritten by
-     * the 4th-order pass.
-     */
-    const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi);
-
-    if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
-        for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
+    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
-            for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
+        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
-                for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
-                    if (has4 &&
-                        i0 >= i4_lo && i0 <= i4_hi &&
-                        j0 >= j4_lo && j0 <= j4_hi &&
-                        k0 >= k4_lo && k0 <= k4_hi) {
-                        continue;
-                    }
+            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);

+                /* 高阶分支：i±2,j±2,k±2 都在范围内 */
+                if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
+                    (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
+                    (kF + 2) <= kmaxF && (kF - 2) >= kminF)
+                {
+                    fxx[p] = Fdxdx * (
+                        -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
+                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
+                    );
+
+                    fyy[p] = Fdydy * (
+                        -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
+                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
+                    );
+
+                    fzz[p] = Fdzdz * (
+                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
+                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
+                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
+                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
+                    );
+
+                    /* fxy 高阶：完全照搬 Fortran 的括号结构 */
+                    {
+                        const double t_jm2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
+
+                        const double t_jm1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
+
+                        const double t_jp1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
+
+                        const double t_jp2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
+
+                        fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
+                    }
+
+                    /* fxz 高阶 */
+                    {
+                        const double t_km2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
+
+                        const double t_km1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
+
+                        const double t_kp1 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
+
+                        const double t_kp2 =
+                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
+
+                        fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
+                    }
+
+                    /* fyz 高阶 */
+                    {
+                        const double t_km2 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
+
+                        const double t_km1 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
+
+                        const double t_kp1 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
+
+                        const double t_kp2 =
+                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
+                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
+                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
+                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
+
+                        fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
+                    }
+                }
+                /* 二阶分支：i±1,j±1,k±1 在范围内 */
+                else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
+                         (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
+                         (kF + 1) <= kmaxF && (kF - 1) >= kminF)
+                {
                    fxx[p] = Sdxdx * (
                        fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
@@ -202,127 +252,13 @@ void fdderivs(const int ex[3],
                        fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
                        fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
                    );
-                }
-            }
-        }
-    }
-
-    if (has4) {
-        for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
-            const int kF = k0 + 1;
-            for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
-                const int jF = j0 + 1;
-                for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
-                    const int iF = i0 + 1;
-                    const size_t p = idx_ex(i0, j0, k0, ex);
-
-                    fxx[p] = Fdxdx * (
-                        -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
-                        F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
-                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
-                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
-                        F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
-                    );
-
-                    fyy[p] = Fdydy * (
-                        -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
-                        F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
-                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
-                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
-                        F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
-                    );
-
-                    fzz[p] = Fdzdz * (
-                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
-                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
-                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
-                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
-                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
-                    );
-
-                    {
-                        const double t_jm2 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
-
-                        const double t_jm1 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
-
-                        const double t_jp1 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
-
-                        const double t_jp2 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
-
-                        fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
-                    }
-
-                    {
-                        const double t_km2 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
-
-                        const double t_km1 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
-
-                        const double t_kp1 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
-
-                        const double t_kp2 =
-                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
-                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
-
-                        fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
-                    }
-
-                    {
-                        const double t_km2 =
-                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
-                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
-
-                        const double t_km1 =
-                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
-                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
-
-                        const double t_kp1 =
-                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
-                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
-
-                        const double t_kp2 =
-                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
-                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
-                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
-                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
-
-                        fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
-                    }
+                }else{
+                    fxx[p] = 0.0;
+                    fyy[p] = 0.0;
+                    fzz[p] = 0.0;
+                    fxy[p] = 0.0;
+                    fxz[p] = 0.0;
+                    fyz[p] = 0.0;
                }
            }
        }
--- a/AMSS_NCKU_source/fderivs_c.C
+++ b/AMSS_NCKU_source/fderivs_c.C
@@ -81,63 +81,26 @@ void fderivs(const int ex[3],
    }

    /*
-     * 两段式：
-     * 1) 先在二阶可用区域计算二阶模板
-     * 2) 再在高阶可用区域覆盖为四阶模板
+     * Fortran loops:
+     * do k=1,ex3-1
+     * do j=1,ex2-1
+     * do i=1,ex1-1
     *
-     * 与原 if/elseif 逻辑等价，但减少逐点分支判断。
+     * C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
     */
-    const int i2_lo = (iminF > 0) ? iminF : 0;
-    const int j2_lo = (jminF > 0) ? jminF : 0;
-    const int k2_lo = (kminF > 0) ? kminF : 0;
-    const int i2_hi = ex1 - 2;
-    const int j2_hi = ex2 - 2;
-    const int k2_hi = ex3 - 2;
-
-    const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
-    const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
-    const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
-    const int i4_hi = ex1 - 3;
-    const int j4_hi = ex2 - 3;
-    const int k4_hi = ex3 - 3;
-
-    if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
-        for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
+    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
-            for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
+        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
-                for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
-                    const int iF = i0 + 1;
-                    const size_t p = idx_ex(i0, j0, k0, ex);
-
-                    fx[p] = d2dx * (
-                        -fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
-                         fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
-                    );
-
-                    fy[p] = d2dy * (
-                        -fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
-                         fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
-                    );
-
-                    fz[p] = d2dz * (
-                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
-                         fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
-                    );
-                }
-            }
-        }
-    }
-
-    if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
-        for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
-            const int kF = k0 + 1;
-            for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
-                const int jF = j0 + 1;
-                for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
+            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);

+                // if(i+2 <= imax .and. i-2 >= imin ... )  (全是 Fortran 索引)
+                if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
+                    (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
+                    (kF + 2) <= kmaxF && (kF - 2) >= kminF)
+                {
                    fx[p] = d12dx * (
                        fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] -
                        EIT * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
@@ -159,6 +122,26 @@ void fderivs(const int ex[3],
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)]
                    );
                }
+                // elseif(i+1 <= imax .and. i-1 >= imin ...)
+                else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
+                         (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
+                         (kF + 1) <= kmaxF && (kF - 1) >= kminF)
+                {
+                    fx[p] = d2dx * (
+                        -fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
+                         fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
+                    );
+
+                    fy[p] = d2dy * (
+                        -fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
+                         fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
+                    );
+
+                    fz[p] = d2dz * (
+                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
+                         fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
+                    );
+                }
            }
        }
    }
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -1115,147 +1115,6 @@ end subroutine d2dump
 !------------------------------------------------------------------------------
 ! Lagrangian polynomial interpolation
 !------------------------------------------------------------------------------
-#ifndef POLINT6_USE_BARYCENTRIC
-#define POLINT6_USE_BARYCENTRIC 1
-#endif
-
-!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
-  subroutine polint6_neville(xa, ya, x, y, dy)
-  implicit none
-
-  real*8, dimension(6), intent(in) :: xa, ya
-  real*8, intent(in) :: x
-  real*8, intent(out) :: y, dy
-
-  integer :: i, m, ns, n_m
-  real*8, dimension(6) :: c, d, ho
-  real*8 :: dif, dift, hp, h, den_val
-
-  c = ya
-  d = ya
-  ho = xa - x
-
-  ns = 1
-  dif = abs(x - xa(1))
-
-  do i = 2, 6
-    dift = abs(x - xa(i))
-    if (dift < dif) then
-      ns = i
-      dif = dift
-    end if
-  end do
-
-  y = ya(ns)
-  ns = ns - 1
-
-  do m = 1, 5
-    n_m = 6 - m
-    do i = 1, n_m
-      hp = ho(i)
-      h  = ho(i+m)
-      den_val = hp - h
-
-      if (den_val == 0.0d0) then
-        write(*,*) 'failure in polint for point',x
-        write(*,*) 'with input points: ',xa
-        stop
-      end if
-
-      den_val = (c(i+1) - d(i)) / den_val
-
-      d(i) = h * den_val
-      c(i) = hp * den_val
-    end do
-
-    if (2 * ns < n_m) then
-      dy = c(ns + 1)
-    else
-      dy = d(ns)
-      ns = ns - 1
-    end if
-    y = y + dy
-  end do
-
-  return
-  end subroutine polint6_neville
-
-!DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric
-  subroutine polint6_barycentric(xa, ya, x, y, dy)
-  implicit none
-
-  real*8, dimension(6), intent(in) :: xa, ya
-  real*8, intent(in) :: x
-  real*8, intent(out) :: y, dy
-
-  integer :: i, j
-  logical :: is_uniform
-  real*8, dimension(6) :: lambda
-  real*8 :: dx, den_i, term, num, den, step, tol
-  real*8, parameter :: c_uniform(6) = (/ -1.d0, 5.d0, -10.d0, 10.d0, -5.d0, 1.d0 /)
-
-  do i = 1, 6
-    if (x == xa(i)) then
-      y = ya(i)
-      dy = 0.d0
-      return
-    end if
-  end do
-
-  step = xa(2) - xa(1)
-  is_uniform = (step /= 0.d0)
-  if (is_uniform) then
-    tol = 64.d0 * epsilon(1.d0) * max(1.d0, abs(step))
-    do i = 3, 6
-      if (abs((xa(i) - xa(i-1)) - step) > tol) then
-        is_uniform = .false.
-        exit
-      end if
-    end do
-  end if
-
-  if (is_uniform) then
-    num = 0.d0
-    den = 0.d0
-    do i = 1, 6
-      term = c_uniform(i) / (x - xa(i))
-      num = num + term * ya(i)
-      den = den + term
-    end do
-    y = num / den
-    dy = 0.d0
-    return
-  end if
-
-  do i = 1, 6
-    den_i = 1.d0
-    do j = 1, 6
-      if (j /= i) then
-        dx = xa(i) - xa(j)
-        if (dx == 0.0d0) then
-          write(*,*) 'failure in polint for point',x
-          write(*,*) 'with input points: ',xa
-          stop
-        end if
-        den_i = den_i * dx
-      end if
-    end do
-    lambda(i) = 1.d0 / den_i
-  end do
-
-  num = 0.d0
-  den = 0.d0
-  do i = 1, 6
-    term = lambda(i) / (x - xa(i))
-    num = num + term * ya(i)
-    den = den + term
-  end do
-
-  y = num / den
-  dy = 0.d0
-
-  return
-  end subroutine polint6_barycentric

 !DIR$ ATTRIBUTES FORCEINLINE :: polint
  subroutine polint(xa, ya, x, y, dy, ordn)
@@ -1270,15 +1129,6 @@ end subroutine d2dump
  real*8, dimension(ordn) :: c, d, ho
  real*8 :: dif, dift, hp, h, den_val

-  if (ordn == 6) then
-#if POLINT6_USE_BARYCENTRIC
-    call polint6_barycentric(xa, ya, x, y, dy)
-#else
-    call polint6_neville(xa, ya, x, y, dy)
-#endif
-    return
-  end if
-
  c = ya
  d = ya
  ho = xa - x
@@ -1328,41 +1178,6 @@ end subroutine d2dump
  return
  end subroutine polint
 !------------------------------------------------------------------------------
-! Compute Lagrange interpolation basis weights for one target point.
-!------------------------------------------------------------------------------
-!DIR$ ATTRIBUTES FORCEINLINE :: polint_lagrange_weights
-  subroutine polint_lagrange_weights(xa, x, w, ordn)
-  implicit none
-
-  integer, intent(in) :: ordn
-  real*8, dimension(1:ordn), intent(in) :: xa
-  real*8, intent(in) :: x
-  real*8, dimension(1:ordn), intent(out) :: w
-
-  integer :: i, j
-  real*8 :: num, den, dx
-
-  do i = 1, ordn
-    num = 1.d0
-    den = 1.d0
-    do j = 1, ordn
-      if (j /= i) then
-        dx = xa(i) - xa(j)
-        if (dx == 0.0d0) then
-          write(*,*) 'failure in polint for point',x
-          write(*,*) 'with input points: ',xa
-          stop
-        end if
-        num = num * (x - xa(j))
-        den = den * dx
-      end if
-    end do
-    w(i) = num / den
-  end do
-
-  return
-  end subroutine polint_lagrange_weights
-!------------------------------------------------------------------------------
 !
 ! interpolation in 2 dimensions, follow yx order
 !
@@ -1433,26 +1248,19 @@ end subroutine d2dump
  end do
  call polint(x1a,ymtmp,x1,y,dy,ordn)
 #else
-  integer  :: i, j, k
-  real*8, dimension(ordn) :: w1, w2
+  integer  :: j, k
+  real*8, dimension(ordn,ordn) :: yatmp
  real*8, dimension(ordn) :: ymtmp
-  real*8 :: yx_sum, x_sum
+  real*8 :: dy_temp

-  call polint_lagrange_weights(x1a, x1, w1, ordn)
-  call polint_lagrange_weights(x2a, x2, w2, ordn)
-
-  do k = 1, ordn
-    yx_sum = 0.d0
-    do j = 1, ordn
-      x_sum = 0.d0
-      do i = 1, ordn
-        x_sum = x_sum + w1(i) * ya(i,j,k)
+  do k=1,ordn
+    do j=1,ordn
+      call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
    end do
-      yx_sum = yx_sum + w2(j) * x_sum
  end do
-    ymtmp(k) = yx_sum
+  do k=1,ordn
+    call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
  end do
-
  call polint(x3a, ymtmp, x3, y, dy, ordn)
 #endif

@@ -1801,11 +1609,8 @@ deallocate(f_flat)
 ! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3

  real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
-  integer :: i,j,k

-  do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1))
-    fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
-  end do
+  fout = C1*f1+C2*f2+C3*f3

  return

--- a/AMSS_NCKU_source/interp_lb_profile.C
+++ b/AMSS_NCKU_source/interp_lb_profile.C
@@ -1,107 +0,0 @@
-#include "interp_lb_profile.h"
-#include <cstdio>
-#include <cstring>
-#include <algorithm>
-
-namespace InterpLBProfile {
-
-bool write_profile(const char *filepath, int nprocs,
-                   const double *rank_times,
-                   const int *heavy_ranks, int num_heavy,
-                   double threshold_ratio)
-{
-    FILE *fp = fopen(filepath, "wb");
-    if (!fp) return false;
-
-    ProfileHeader hdr;
-    hdr.magic = MAGIC;
-    hdr.version = VERSION;
-    hdr.nprocs = nprocs;
-    hdr.num_heavy = num_heavy;
-    hdr.threshold_ratio = threshold_ratio;
-
-    fwrite(&hdr, sizeof(hdr), 1, fp);
-    fwrite(rank_times, sizeof(double), nprocs, fp);
-    fwrite(heavy_ranks, sizeof(int), num_heavy, fp);
-    fclose(fp);
-    return true;
-}
-
-bool read_profile(const char *filepath, int current_nprocs,
-                  int *heavy_ranks, int &num_heavy,
-                  double *rank_times, MPI_Comm comm)
-{
-    int myrank;
-    MPI_Comm_rank(comm, &myrank);
-
-    int valid = 0;
-    ProfileHeader hdr;
-    memset(&hdr, 0, sizeof(hdr));
-
-    if (myrank == 0) {
-        FILE *fp = fopen(filepath, "rb");
-        if (fp) {
-            if (fread(&hdr, sizeof(hdr), 1, fp) == 1 &&
-                hdr.magic == MAGIC && hdr.version == VERSION &&
-                hdr.nprocs == current_nprocs)
-            {
-                if (fread(rank_times, sizeof(double), current_nprocs, fp)
-                    == (size_t)current_nprocs &&
-                    fread(heavy_ranks, sizeof(int), hdr.num_heavy, fp)
-                    == (size_t)hdr.num_heavy)
-                {
-                    num_heavy = hdr.num_heavy;
-                    valid = 1;
-                }
-            } else if (fp) {
-                printf("[InterpLB] Profile rejected: magic=0x%X version=%u "
-                       "nprocs=%d (current=%d)\n",
-                       hdr.magic, hdr.version, hdr.nprocs, current_nprocs);
-            }
-            fclose(fp);
-        }
-    }
-
-    MPI_Bcast(&valid, 1, MPI_INT, 0, comm);
-    if (!valid) return false;
-
-    MPI_Bcast(&num_heavy, 1, MPI_INT, 0, comm);
-    MPI_Bcast(heavy_ranks, num_heavy, MPI_INT, 0, comm);
-    MPI_Bcast(rank_times, current_nprocs, MPI_DOUBLE, 0, comm);
-    return true;
-}
-
-int identify_heavy_ranks(const double *rank_times, int nprocs,
-                         double threshold_ratio,
-                         int *heavy_ranks, int max_heavy)
-{
-    double sum = 0;
-    for (int i = 0; i < nprocs; i++) sum += rank_times[i];
-    double mean = sum / nprocs;
-    double threshold = threshold_ratio * mean;
-
-    // Collect candidates
-    struct RankTime { int rank; double time; };
-    RankTime *candidates = new RankTime[nprocs];
-    int ncand = 0;
-
-    for (int i = 0; i < nprocs; i++) {
-        if (rank_times[i] > threshold)
-            candidates[ncand++] = {i, rank_times[i]};
-    }
-
-    // Sort descending by time
-    std::sort(candidates, candidates + ncand,
-              [](const RankTime &a, const RankTime &b) {
-                  return a.time > b.time;
-              });
-
-    int count = (ncand < max_heavy) ? ncand : max_heavy;
-    for (int i = 0; i < count; i++)
-        heavy_ranks[i] = candidates[i].rank;
-
-    delete[] candidates;
-    return count;
-}
-
-} // namespace InterpLBProfile
--- a/AMSS_NCKU_source/interp_lb_profile.bin
+++ b/AMSS_NCKU_source/interp_lb_profile.bin
--- a/AMSS_NCKU_source/interp_lb_profile.h
+++ b/AMSS_NCKU_source/interp_lb_profile.h
@@ -1,38 +0,0 @@
-#ifndef INTERP_LB_PROFILE_H
-#define INTERP_LB_PROFILE_H
-
-#include <mpi.h>
-
-namespace InterpLBProfile {
-
-static const unsigned int MAGIC   = 0x494C4250; // "ILBP"
-static const unsigned int VERSION = 1;
-
-struct ProfileHeader {
-    unsigned int magic;
-    unsigned int version;
-    int nprocs;
-    int num_heavy;
-    double threshold_ratio;
-};
-
-// Write profile file (rank 0 only)
-bool write_profile(const char *filepath, int nprocs,
-                   const double *rank_times,
-                   const int *heavy_ranks, int num_heavy,
-                   double threshold_ratio);
-
-// Read profile file (rank 0 reads, then broadcasts to all)
-// Returns true if file found and valid for current nprocs
-bool read_profile(const char *filepath, int current_nprocs,
-                  int *heavy_ranks, int &num_heavy,
-                  double *rank_times, MPI_Comm comm);
-
-// Identify heavy ranks: those with time > threshold_ratio * mean
-int identify_heavy_ranks(const double *rank_times, int nprocs,
-                         double threshold_ratio,
-                         int *heavy_ranks, int max_heavy);
-
-} // namespace InterpLBProfile
-
-#endif /* INTERP_LB_PROFILE_H */
--- a/AMSS_NCKU_source/interp_lb_profile_data.h
+++ b/AMSS_NCKU_source/interp_lb_profile_data.h
@@ -1,29 +0,0 @@
-/* 本头文件由自订profile框架自动生成并非人工硬编码针对Case优化 */
-/* 更新：负载均衡问题已经通过优化插值函数解决，此profile静态均衡方案已弃用，本头文件现在未参与编译 */
-/* Auto-generated from interp_lb_profile.bin — do not edit */
-#ifndef INTERP_LB_PROFILE_DATA_H
-#define INTERP_LB_PROFILE_DATA_H
-
-#define INTERP_LB_NPROCS 64
-#define INTERP_LB_NUM_HEAVY 4
-
-static const int interp_lb_heavy_blocks[4] = {27, 35, 28, 36};
-
-/* Split table: {block_id, r_left, r_right} */
-static const int interp_lb_splits[4][3] = {
-    {27, 26, 27},
-    {35, 34, 35},
-    {28, 28, 29},
-    {36, 36, 37},
-};
-
-/* Rank remap for displaced neighbor blocks */
-static const int interp_lb_num_remaps = 4;
-static const int interp_lb_remaps[][2] = {
-    {26, 25},
-    {29, 30},
-    {34, 33},
-    {37, 38},
-};
-
-#endif /* INTERP_LB_PROFILE_DATA_H */
--- a/AMSS_NCKU_source/kodiss_c.C
+++ b/AMSS_NCKU_source/kodiss_c.C
@@ -63,28 +63,19 @@ void kodis(const int ex[3],
     * C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
     * 并定义 Fortran index: iF=i0+1, ...
     */
-    // 收紧循环范围：只遍历满足 iF±3/jF±3/kF±3 条件的内部点
-    // iF-3 >= iminF => iF >= iminF+3 => i0 >= iminF+2 (因为 iF=i0+1)
-    // iF+3 <= imaxF => iF <= imaxF-3 => i0 <= imaxF-4
-    const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
-    const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
-    const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
-    const int i0_hi = imaxF - 4;  // inclusive
-    const int j0_hi = jmaxF - 4;
-    const int k0_hi = kmaxF - 4;
-
-    if (i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi) {
-        free(fh);
-        return;
-    }
-
-    for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
+    for (int k0 = 0; k0 < ex3; ++k0) {
        const int kF = k0 + 1;
-        for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
+        for (int j0 = 0; j0 < ex2; ++j0) {
            const int jF = j0 + 1;
-            for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
+            for (int i0 = 0; i0 < ex1; ++i0) {
                const int iF = i0 + 1;

+                // Fortran if 条件：
+                // i-3 >= imin .and. i+3 <= imax  等（都是 Fortran 索引）
+                if ((iF - 3) >= iminF && (iF + 3) <= imaxF &&
+                    (jF - 3) >= jminF && (jF + 3) <= jmaxF &&
+                    (kF - 3) >= kminF && (kF + 3) <= kmaxF)
+                {
                    const size_t p = idx_ex(i0, j0, k0, ex);

                    // 三个方向各一份同型的 7 点组合（实际上是对称的 6th-order dissipation/filter 核）
@@ -112,6 +103,7 @@ void kodis(const int ex[3],
                }
            }
        }
+    }

    free(fh);
 }
--- a/AMSS_NCKU_source/lopsided_kodis_c.C
+++ b/AMSS_NCKU_source/lopsided_kodis_c.C
@@ -1,248 +0,0 @@
-#include "tool.h"
-
-/*
- * Combined advection (lopsided) + KO dissipation (kodis).
- * Uses one shared symmetry_bd buffer per call.
- */
-void lopsided_kodis(const int ex[3],
-                    const double *X, const double *Y, const double *Z,
-                    const double *f, double *f_rhs,
-                    const double *Sfx, const double *Sfy, const double *Sfz,
-                    int Symmetry, const double SoA[3], double eps)
-{
-    const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
-    const double F6 = 6.0, F18 = 18.0;
-    const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
-    const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
-    const double cof = 64.0; // 2^6
-
-    const int NO_SYMM = 0, EQ_SYMM = 1;
-
-    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
-
-    const double dX = X[1] - X[0];
-    const double dY = Y[1] - Y[0];
-    const double dZ = Z[1] - Z[0];
-
-    const double d12dx = ONE / F12 / dX;
-    const double d12dy = ONE / F12 / dY;
-    const double d12dz = ONE / F12 / dZ;
-
-    const int imaxF = ex1;
-    const int jmaxF = ex2;
-    const int kmaxF = ex3;
-
-    int iminF = 1, jminF = 1, kminF = 1;
-    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
-    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
-    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
-
-    // fh for Fortran-style domain (-2:ex1,-2:ex2,-2:ex3)
-    const size_t nx = (size_t)ex1 + 3;
-    const size_t ny = (size_t)ex2 + 3;
-    const size_t nz = (size_t)ex3 + 3;
-    const size_t fh_size = nx * ny * nz;
-
-    double *fh = (double*)malloc(fh_size * sizeof(double));
-    if (!fh) return;
-
-    symmetry_bd(3, ex, f, fh, SoA);
-
-    // Advection (same stencil logic as lopsided_c.C)
-    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
-        const int kF = k0 + 1;
-        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
-            const int jF = j0 + 1;
-            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
-                const int iF = i0 + 1;
-                const size_t p = idx_ex(i0, j0, k0, ex);
-
-                const double sfx = Sfx[p];
-                if (sfx > ZEO) {
-                    if (i0 <= ex1 - 4) {
-                        f_rhs[p] += sfx * d12dx *
-                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
-                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
-                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
-                    } else if (i0 <= ex1 - 3) {
-                        f_rhs[p] += sfx * d12dx *
-                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
-                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
-                    } else if (i0 <= ex1 - 2) {
-                        f_rhs[p] -= sfx * d12dx *
-                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
-                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
-                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
-                    }
-                } else if (sfx < ZEO) {
-                    if ((i0 - 2) >= iminF) {
-                        f_rhs[p] -= sfx * d12dx *
-                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
-                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
-                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
-                    } else if ((i0 - 1) >= iminF) {
-                        f_rhs[p] += sfx * d12dx *
-                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
-                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
-                    } else if (i0 >= iminF) {
-                        f_rhs[p] += sfx * d12dx *
-                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
-                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
-                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
-                    }
-                }
-
-                const double sfy = Sfy[p];
-                if (sfy > ZEO) {
-                    if (j0 <= ex2 - 4) {
-                        f_rhs[p] += sfy * d12dy *
-                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
-                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
-                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
-                    } else if (j0 <= ex2 - 3) {
-                        f_rhs[p] += sfy * d12dy *
-                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
-                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
-                    } else if (j0 <= ex2 - 2) {
-                        f_rhs[p] -= sfy * d12dy *
-                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
-                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
-                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
-                    }
-                } else if (sfy < ZEO) {
-                    if ((j0 - 2) >= jminF) {
-                        f_rhs[p] -= sfy * d12dy *
-                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
-                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
-                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
-                    } else if ((j0 - 1) >= jminF) {
-                        f_rhs[p] += sfy * d12dy *
-                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
-                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
-                    } else if (j0 >= jminF) {
-                        f_rhs[p] += sfy * d12dy *
-                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
-                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
-                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
-                    }
-                }
-
-                const double sfz = Sfz[p];
-                if (sfz > ZEO) {
-                    if (k0 <= ex3 - 4) {
-                        f_rhs[p] += sfz * d12dz *
-                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
-                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
-                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
-                    } else if (k0 <= ex3 - 3) {
-                        f_rhs[p] += sfz * d12dz *
-                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
-                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
-                    } else if (k0 <= ex3 - 2) {
-                        f_rhs[p] -= sfz * d12dz *
-                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
-                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
-                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
-                    }
-                } else if (sfz < ZEO) {
-                    if ((k0 - 2) >= kminF) {
-                        f_rhs[p] -= sfz * d12dz *
-                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
-                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
-                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
-                    } else if ((k0 - 1) >= kminF) {
-                        f_rhs[p] += sfz * d12dz *
-                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
-                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
-                    } else if (k0 >= kminF) {
-                        f_rhs[p] += sfz * d12dz *
-                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
-                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
-                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
-                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
-                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
-                    }
-                }
-            }
-        }
-    }
-
-    // KO dissipation (same domain restriction as kodiss_c.C)
-    if (eps > ZEO) {
-        const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
-        const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
-        const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
-        const int i0_hi = imaxF - 4; // inclusive
-        const int j0_hi = jmaxF - 4;
-        const int k0_hi = kmaxF - 4;
-
-        if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
-            for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
-                const int kF = k0 + 1;
-                for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
-                    const int jF = j0 + 1;
-                    for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
-                        const int iF = i0 + 1;
-                        const size_t p = idx_ex(i0, j0, k0, ex);
-
-                        const double Dx_term =
-                            ((fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
-                             SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
-                             FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
-                             TWT *  fh[idx_fh_F(iF,     jF, kF, ex)]) / dX;
-
-                        const double Dy_term =
-                            ((fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
-                             SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
-                             FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
-                             TWT *  fh[idx_fh_F(iF, jF,     kF, ex)]) / dY;
-
-                        const double Dz_term =
-                            ((fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
-                             SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
-                             FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
-                             TWT *  fh[idx_fh_F(iF, jF, kF,     ex)]) / dZ;
-
-                        f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
-                    }
-                }
-            }
-        }
-    }
-
-    free(fh);
-}
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -2,12 +2,6 @@

 include makefile.inc

-## polint(ordn=6) kernel selector:
-##   1 (default): barycentric fast path
-##   0          : fallback to Neville path
-POLINT6_USE_BARY ?= 1
-POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
-
 ## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
 ##   make                        -> opt  (PGO-guided, maximum performance)
 ##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)
@@ -16,19 +10,17 @@ PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
 ifeq ($(PGO_MODE),instrument)
 ## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
 CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
+              -Dfortran3 -Dnewc -I${MKLROOT}/include
 f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
+              -align array64byte -fpp -I${MKLROOT}/include
 else
-## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
-## PGO has been turned off, now tested and found to be negative optimization
-## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
-
-
+## opt (default): maximum performance with PGO profile data
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
+              -fprofile-instr-use=$(PROFDATA) \
+              -Dfortran3 -Dnewc -I${MKLROOT}/include
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
+              -fprofile-instr-use=$(PROFDATA) \
+              -align array64byte -fpp -I${MKLROOT}/include
 endif

 .SUFFIXES: .o .f90 .C .for .cu
@@ -61,17 +53,8 @@ kodiss_c.o: kodiss_c.C
 lopsided_c.o: lopsided_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

-lopsided_kodis_c.o: lopsided_kodis_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-
-interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-
-## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
-TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
-TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -fprofile-instr-use=$(TP_PROFDATA) \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include
+## TwoPunctureABE uses fixed optimal flags, independent of CXXAPPFLAGS (which may be PGO-instrumented)
+TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo -Dfortran3 -Dnewc -I${MKLROOT}/include

 TwoPunctures.o: TwoPunctures.C
 	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
@@ -81,29 +64,15 @@ TwoPunctureABE.o: TwoPunctureABE.C

 # Input files

-## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
-ifeq ($(USE_CXX_KERNELS),0)
-# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
-CFILES =
-else
-# C++ mode (default): C rewrite of bssn_rhs and helper kernels
-CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
-endif
-
-## RK4 kernel switch (independent from USE_CXX_KERNELS)
-ifeq ($(USE_CXX_RK4),1)
-CFILES += rungekutta4_rout_c.o
-RK4_F90_OBJ =
-else
-RK4_F90_OBJ = rungekutta4_rout.o
-endif
+# C rewrite files
+CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o

 C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
 	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
 	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
-	   NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
+	   NullShellPatch2_Evo.o writefile_f.o
 	   
 C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o surface_integral.o ShellPatch.o\
@@ -113,9 +82,9 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
 	   NullShellPatch2_Evo.o \
 	   bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o

-F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
+F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   prolongrestrict_cell.o prolongrestrict_vertex.o\
-	   $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
+	   rungekutta4_rout.o diff_new.o kodiss.o kodiss_sh.o\
 	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
 	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
@@ -126,14 +95,6 @@ F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\
 	   NullNews2.o tool_f.o

-ifeq ($(USE_CXX_KERNELS),0)
-# Fortran mode: include original bssn_rhs.o
-F90FILES = $(F90FILES_BASE) bssn_rhs.o
-else
-# C++ mode (default): bssn_rhs.o replaced by C++ kernel
-F90FILES = $(F90FILES_BASE)
-endif
-
 F77FILES = zbesh.o

 AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -10,49 +10,10 @@ filein  = -I/usr/include/ -I${MKLROOT}/include
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
 LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5

-## Memory allocator switch
-##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
-##   0           : use system default allocator (ptmalloc)
-USE_TBBMALLOC ?= 1
-TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
-ifneq ($(wildcard $(TBBMALLOC_SO)),)
-TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
-else
-TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
-endif
-ifeq ($(USE_TBBMALLOC),1)
-LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
-endif
-
 ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
 ##   opt        : (default) maximum performance with PGO profile-guided optimization
 ##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
 PGO_MODE ?= opt
-
-## Interp_Points load balance profiling mode
-##   off        : (default) no load balance instrumentation
-##   profile    : Pass 1 — instrument Interp_Points to collect timing profile
-##   optimize   : Pass 2 — read profile and apply block rebalancing
-INTERP_LB_MODE ?= off
-
-ifeq ($(INTERP_LB_MODE),profile)
-INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
-else ifeq ($(INTERP_LB_MODE),optimize)
-INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
-else
-INTERP_LB_FLAGS =
-endif
-
-## Kernel implementation switch
-##   1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
-##   0           : fall back to original Fortran kernels
-USE_CXX_KERNELS ?= 1
-
-## RK4 kernel implementation switch
-##   1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
-##   0           : use original Fortran rungekutta4_rout.o
-USE_CXX_RK4 ?= 1
-
 f90          = ifx
 f77          = ifx
 CXX          = icpx
--- a/AMSS_NCKU_source/prolongrestrict_cell.f90
+++ b/AMSS_NCKU_source/prolongrestrict_cell.f90
@@ -1934,35 +1934,18 @@
 ! when if=1 -> ic=0, this is different to vertex center grid 
  real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3))   :: funcc
  integer,dimension(3) :: cxI
-  integer :: i,j,k,ii,jj,kk,px,py,pz
+  integer :: i,j,k,ii,jj,kk
  real*8, dimension(6,6) :: tmp2
  real*8, dimension(6) :: tmp1
-  integer, dimension(extf(1)) :: cix
-  integer, dimension(extf(2)) :: ciy
-  integer, dimension(extf(3)) :: ciz
-  integer, dimension(extf(1)) :: pix
-  integer, dimension(extf(2)) :: piy
-  integer, dimension(extf(3)) :: piz

  real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3
  real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3
-  real*8, dimension(6,2), parameter :: WC = reshape((/&
-      C1,C2,C3,C4,C5,C6,&
-      C6,C5,C4,C3,C2,C1/), (/6,2/))

  integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi
  integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo
-  integer::maxcx,maxcy,maxcz

  real*8,dimension(3) :: CD,FD
-  real*8 :: tmp_yz(extc(1), 6)      ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
-  real*8 :: tmp_xyz_line(-2:extc(1))   ! 包含 X 向 6 点模板访问所需下界
-  real*8 :: v1, v2, v3, v4, v5, v6
-  integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max,kc_min,kc_max
-  integer :: i_lo, i_hi, j_lo, j_hi, k_lo, k_hi
-  logical :: need_full_symmetry
-  real*8 :: res_line
-  real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2))  ! 包含 Y/X 向模板访问所需下界
+  
  if(wei.ne.3)then
     write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
     write(*,*)"dim = ",wei
@@ -2037,140 +2020,145 @@
          return
  endif

-  do i = imino,imaxo
-     ii = i + lbf(1) - 1
-     cix(i) = ii/2 - lbc(1) + 1
-     if(ii/2*2 == ii)then
-        pix(i) = 1
-     else
-        pix(i) = 2
-     endif
-  enddo
-  do j = jmino,jmaxo
-     jj = j + lbf(2) - 1
-     ciy(j) = jj/2 - lbc(2) + 1
-     if(jj/2*2 == jj)then
-        piy(j) = 1
-     else
-        piy(j) = 2
-     endif
-  enddo
-  do k = kmino,kmaxo
-     kk = k + lbf(3) - 1
-     ciz(k) = kk/2 - lbc(3) + 1
-     if(kk/2*2 == kk)then
-        piz(k) = 1
-     else
-        piz(k) = 2
-     endif
-  enddo
-
-  ic_min = minval(cix(imino:imaxo))
-  ic_max = maxval(cix(imino:imaxo))
-  jc_min = minval(ciy(jmino:jmaxo))
-  jc_max = maxval(ciy(jmino:jmaxo))
-  kc_min = minval(ciz(kmino:kmaxo))
-  kc_max = maxval(ciz(kmino:kmaxo))
-
-  maxcx = ic_max
-  maxcy = jc_max
-  maxcz = kc_max
-  if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
-     write(*,*)"error in prolong"
-     return
-  endif
-
-  i_lo = ic_min - 2
-  i_hi = ic_max + 3
-  j_lo = jc_min - 2
-  j_hi = jc_max + 3
-  k_lo = kc_min - 2
-  k_hi = kc_max + 3
-  need_full_symmetry = (i_lo < 1) .or. (j_lo < 1) .or. (k_lo < 1)
-  if(need_full_symmetry)then
  call symmetry_bd(3,extc,func,funcc,SoA)
-  else
-     funcc(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi) = func(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi)
-  endif
-
-     ! 对每个 k（pz, kc 固定）预计算 Z 向插值的 2D 切片
-
-do k = kmino, kmaxo
-    pz = piz(k); kc = ciz(k)
-    ! --- Pass 1: Z 方向，只算一次 ---
-    do iy = jc_min-2, jc_max+3   ! 仅需的 iy 范围（对应 jc-2:jc+3）
-        do ii = ic_min-2, ic_max+3  ! 仅需的 ii 范围（对应 cix-2:cix+3）
-            tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
-        end do
-    end do
-
-    do j = jmino, jmaxo
-        py = piy(j); jc = ciy(j)
-        ! --- Pass 2: Y 方向 ---
-        do ii = ic_min-2, ic_max+3
-            tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
-        end do
-        ! --- Pass 3: X 方向 ---
-        do i = imino, imaxo
-            funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3))
-        end do
-    end do
-end do
     
 !~~~~~~> prolongation start...
+  do k = kmino,kmaxo
+   do j = jmino,jmaxo
+    do i = imino,imaxo
+       cxI(1) = i
+       cxI(2) = j
+       cxI(3) = k
+! change to coarse level reference
+!|---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*---| 
+!|=======x===============x===============x===============x=======|
+       cxI = (cxI+lbf-1)/2
+! change to array index      
+       cxI = cxI - lbc + 1
+
+       if(any(cxI+3 > extc)) write(*,*)"error in prolong"
+       ii=i+lbf(1)-1
+       jj=j+lbf(2)-1
+       kk=k+lbf(3)-1
 #if 0
- do k = kmino, kmaxo
-     pz = piz(k)
-     kc = ciz(k)
+       if(ii/2*2==ii)then
+         if(jj/2*2==jj)then
+           if(kk/2*2==kk)then
+             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
+             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
+           else
+             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
+             funf(i,j,k)=  C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
+           endif
+         else
+           if(kk/2*2==kk)then
+             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
+             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
+           else
+             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
+             funf(i,j,k)=  C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
+           endif
+         endif
+       else
+         if(jj/2*2==jj)then
+           if(kk/2*2==kk)then               
+             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
+             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
+           else
+             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
+             funf(i,j,k)=  C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
+           endif
+         else
+           if(kk/2*2==kk)then
+             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
+             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
+           else
+             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
+             funf(i,j,k)=  C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
+           endif
+         endif
+       endif
+#else 
+       if(kk/2*2==kk)then
+             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+       else
+             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
+                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
+                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
+                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
+                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
+                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
+       endif

-     do j = jmino, jmaxo
-        py = piy(j)
-        jc = ciy(j)
+       if(jj/2*2==jj)then
+             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
+       else
+             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
+       endif

-! --- 步骤 1 & 2 融合：分段处理 X 轴，提升 Cache 命中率 ---
-        ! 我们将 ii 循环逻辑重组，减少对 funcc 的跨行重复访问
-        do ii = 1, extc(1)
-           ! 1. 先做 Z 方向的 6 条线插值（针对当前的 ii 和当前的 6 个 iy）
-           ! 我们直接在这里把 Y 方向的加权也做了，省去 tmp_yz 数组
-           ! 这样 funcc 的数据读进来后立即完成所有维度的贡献，不再写回内存
-           
-           res_line = 0.0d0
-           do jj = 1, 6
-              iy = jc - 3 + jj
-              ! 这一行代码是核心：一次性完成 Z 插值并加上 Y 的权重
-              ! 编译器会把 WC(jj, py) 存在寄存器里
-              res_line = res_line + WC(jj, py) * ( &
-                         WC(1, pz) * funcc(ii, iy, kc-2) + &
-                         WC(2, pz) * funcc(ii, iy, kc-1) + &
-                         WC(3, pz) * funcc(ii, iy, kc  ) + &
-                         WC(4, pz) * funcc(ii, iy, kc+1) + &
-                         WC(5, pz) * funcc(ii, iy, kc+2) + &
-                         WC(6, pz) * funcc(ii, iy, kc+3) )
-           end do
-           tmp_xyz_line(ii) = res_line
-        end do
-
-
-
-
-        ! 3. 【降维：X 向】最后在最内层只处理 X 方向的 6 点加权
-        ! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次
-        do i = imino, imaxo
-           px = pix(i)
-           ic = cix(i)
-           
-           ! 直接从预计算好的 line 中读取连续的 6 个点
-           ! ic-2 到 ic+3 对应原始 6 点算子
-           funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + &
-                         WC(2,px)*tmp_xyz_line(ic-1) + &
-                         WC(3,px)*tmp_xyz_line(ic  ) + &
-                         WC(4,px)*tmp_xyz_line(ic+1) + &
-                         WC(5,px)*tmp_xyz_line(ic+2) + &
-                         WC(6,px)*tmp_xyz_line(ic+3)
-        end do
-     end do
-  end do
+       if(ii/2*2==ii)then
+             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
+       else
+             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
+       endif
 #endif
+    enddo
+   enddo
+  enddo
+
  return

  end subroutine prolong3
@@ -2370,13 +2358,6 @@ end do

  real*8,dimension(3) :: CD,FD
  
-  real*8 :: tmp_xz_plane(-1:extf(1), 6)
-  real*8 :: tmp_x_line(-1:extf(1))
-  integer :: fi, fj, fk, ii, jj, kk
-  integer :: fi_min, fi_max, ii_lo, ii_hi
-  integer :: fj_min, fj_max, fk_min, fk_max, jj_lo, jj_hi, kk_lo, kk_hi
-  logical :: need_full_symmetry
-
  if(wei.ne.3)then
     write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
     write(*,*)"dim = ",wei
@@ -2455,86 +2436,9 @@ end do
          stop
  endif

-  ! 仅计算 X 向最终写回所需的窗口：
-  ! func(i,j,k) 只访问 tmp_x_line(fi-2:fi+3)
-  fi_min = 2*(imino + lbc(1) - 1) - 1 - lbf(1) + 1
-  fi_max = 2*(imaxo + lbc(1) - 1) - 1 - lbf(1) + 1
-  fj_min = 2*(jmino + lbc(2) - 1) - 1 - lbf(2) + 1
-  fj_max = 2*(jmaxo + lbc(2) - 1) - 1 - lbf(2) + 1
-  fk_min = 2*(kmino + lbc(3) - 1) - 1 - lbf(3) + 1
-  fk_max = 2*(kmaxo + lbc(3) - 1) - 1 - lbf(3) + 1
-  ii_lo = fi_min - 2
-  ii_hi = fi_max + 3
-  jj_lo = fj_min - 2
-  jj_hi = fj_max + 3
-  kk_lo = fk_min - 2
-  kk_hi = fk_max + 3
-  if(ii_lo < -1 .or. ii_hi > extf(1) .or. &
-     jj_lo < -1 .or. jj_hi > extf(2) .or. &
-     kk_lo < -1 .or. kk_hi > extf(3))then
-      write(*,*)"restrict3: invalid stencil window"
-      write(*,*)"ii=",ii_lo,ii_hi," jj=",jj_lo,jj_hi," kk=",kk_lo,kk_hi
-      write(*,*)"extf=",extf
-      stop
-  endif
-  need_full_symmetry = (ii_lo < 1) .or. (jj_lo < 1) .or. (kk_lo < 1)
-  if(need_full_symmetry)then
  call symmetry_bd(2,extf,funf,funff,SoA)
-  else
-      funff(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi) = funf(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi)
-  endif

 !~~~~~~> restriction start...
-do k = kmino, kmaxo
-    fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1
-
-    do j = jmino, jmaxo
-        fj = 2*(j + lbc(2) - 1) - 1 - lbf(2) + 1
-        
-        ! 优化点 1: 显式展开 Z 方向计算，减少循环开销
-        ! 确保 ii 循环是最内层且连续访问
-        !DIR$ VECTOR ALWAYS
-        do ii = ii_lo, ii_hi
-            ! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
-            ! 这里直接硬编码 jj 的偏移，彻底消除一层循环
-            tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
-                                  C2*(funff(ii,fj-2,fk-1)+funff(ii,fj-2,fk+2)) + &
-                                  C3*(funff(ii,fj-2,fk  )+funff(ii,fj-2,fk+1))
-            tmp_xz_plane(ii, 2) = C1*(funff(ii,fj-1,fk-2)+funff(ii,fj-1,fk+3)) + &
-                                  C2*(funff(ii,fj-1,fk-1)+funff(ii,fj-1,fk+2)) + &
-                                  C3*(funff(ii,fj-1,fk  )+funff(ii,fj-1,fk+1))
-            tmp_xz_plane(ii, 3) = C1*(funff(ii,fj  ,fk-2)+funff(ii,fj  ,fk+3)) + &
-                                  C2*(funff(ii,fj  ,fk-1)+funff(ii,fj  ,fk+2)) + &
-                                  C3*(funff(ii,fj  ,fk  )+funff(ii,fj  ,fk+1))
-            tmp_xz_plane(ii, 4) = C1*(funff(ii,fj+1,fk-2)+funff(ii,fj+1,fk+3)) + &
-                                  C2*(funff(ii,fj+1,fk-1)+funff(ii,fj+1,fk+2)) + &
-                                  C3*(funff(ii,fj+1,fk  )+funff(ii,fj+1,fk+1))
-            tmp_xz_plane(ii, 5) = C1*(funff(ii,fj+2,fk-2)+funff(ii,fj+2,fk+3)) + &
-                                  C2*(funff(ii,fj+2,fk-1)+funff(ii,fj+2,fk+2)) + &
-                                  C3*(funff(ii,fj+2,fk  )+funff(ii,fj+2,fk+1))
-            tmp_xz_plane(ii, 6) = C1*(funff(ii,fj+3,fk-2)+funff(ii,fj+3,fk+3)) + &
-                                  C2*(funff(ii,fj+3,fk-1)+funff(ii,fj+3,fk+2)) + &
-                                  C3*(funff(ii,fj+3,fk  )+funff(ii,fj+3,fk+1))
-        end do
-
-        ! 优化点 2: 同样向量化 Y 方向压缩
-        !DIR$ VECTOR ALWAYS
-        do ii = ii_lo, ii_hi
-            tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
-                            C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
-                            C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
-        end do
-
-        ! 优化点 3: 最终写入，利用已经缓存在 tmp_x_line 的数据
-        do i = imino, imaxo
-            fi = 2*(i + lbc(1) - 1) - 1 - lbf(1) + 1
-            func(i, j, k) = C1*(tmp_x_line(fi-2) + tmp_x_line(fi+3)) + &
-                            C2*(tmp_x_line(fi-1) + tmp_x_line(fi+2)) + &
-                            C3*(tmp_x_line(fi  ) + tmp_x_line(fi+1))
-        end do
-    end do
-end do
-#if 0
  do k = kmino,kmaxo
   do j = jmino,jmaxo
    do i = imino,imaxo
@@ -2558,7 +2462,7 @@ end do
    enddo
   enddo
  enddo
-#endif
+  
  return

  end subroutine restrict3
--- a/AMSS_NCKU_source/rungekutta4_rout_c.C
+++ b/AMSS_NCKU_source/rungekutta4_rout_c.C
@@ -1,212 +0,0 @@
-#include "rungekutta4_rout.h"
-#include <cstdio>
-#include <cstdlib>
-#include <cstddef>
-#include <complex>
-#include <immintrin.h>
-
-namespace {
-
-inline void rk4_stage0(std::size_t n,
-                       const double *__restrict f0,
-                       const double *__restrict frhs,
-                       double *__restrict f1,
-                       double c) {
-    std::size_t i = 0;
-#if defined(__AVX512F__)
-    const __m512d vc = _mm512_set1_pd(c);
-    for (; i + 7 < n; i += 8) {
-        const __m512d v0 = _mm512_loadu_pd(f0 + i);
-        const __m512d vr = _mm512_loadu_pd(frhs + i);
-        _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, vr, v0));
-    }
-#elif defined(__AVX2__)
-    const __m256d vc = _mm256_set1_pd(c);
-    for (; i + 3 < n; i += 4) {
-        const __m256d v0 = _mm256_loadu_pd(f0 + i);
-        const __m256d vr = _mm256_loadu_pd(frhs + i);
-        _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, vr, v0));
-    }
-#endif
-#pragma ivdep
-    for (; i < n; ++i) {
-        f1[i] = f0[i] + c * frhs[i];
-    }
-}
-
-inline void rk4_rhs_accum(std::size_t n,
-                          const double *__restrict f1,
-                          double *__restrict frhs) {
-    std::size_t i = 0;
-#if defined(__AVX512F__)
-    const __m512d v2 = _mm512_set1_pd(2.0);
-    for (; i + 7 < n; i += 8) {
-        const __m512d v1 = _mm512_loadu_pd(f1 + i);
-        const __m512d vrhs = _mm512_loadu_pd(frhs + i);
-        _mm512_storeu_pd(frhs + i, _mm512_fmadd_pd(v2, v1, vrhs));
-    }
-#elif defined(__AVX2__)
-    const __m256d v2 = _mm256_set1_pd(2.0);
-    for (; i + 3 < n; i += 4) {
-        const __m256d v1 = _mm256_loadu_pd(f1 + i);
-        const __m256d vrhs = _mm256_loadu_pd(frhs + i);
-        _mm256_storeu_pd(frhs + i, _mm256_fmadd_pd(v2, v1, vrhs));
-    }
-#endif
-#pragma ivdep
-    for (; i < n; ++i) {
-        frhs[i] = frhs[i] + 2.0 * f1[i];
-    }
-}
-
-inline void rk4_f1_from_f0_f1(std::size_t n,
-                              const double *__restrict f0,
-                              double *__restrict f1,
-                              double c) {
-    std::size_t i = 0;
-#if defined(__AVX512F__)
-    const __m512d vc = _mm512_set1_pd(c);
-    for (; i + 7 < n; i += 8) {
-        const __m512d v0 = _mm512_loadu_pd(f0 + i);
-        const __m512d v1 = _mm512_loadu_pd(f1 + i);
-        _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, v1, v0));
-    }
-#elif defined(__AVX2__)
-    const __m256d vc = _mm256_set1_pd(c);
-    for (; i + 3 < n; i += 4) {
-        const __m256d v0 = _mm256_loadu_pd(f0 + i);
-        const __m256d v1 = _mm256_loadu_pd(f1 + i);
-        _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, v1, v0));
-    }
-#endif
-#pragma ivdep
-    for (; i < n; ++i) {
-        f1[i] = f0[i] + c * f1[i];
-    }
-}
-
-inline void rk4_stage3(std::size_t n,
-                       const double *__restrict f0,
-                       double *__restrict f1,
-                       const double *__restrict frhs,
-                       double c) {
-    std::size_t i = 0;
-#if defined(__AVX512F__)
-    const __m512d vc = _mm512_set1_pd(c);
-    for (; i + 7 < n; i += 8) {
-        const __m512d v0 = _mm512_loadu_pd(f0 + i);
-        const __m512d v1 = _mm512_loadu_pd(f1 + i);
-        const __m512d vr = _mm512_loadu_pd(frhs + i);
-        _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, _mm512_add_pd(v1, vr), v0));
-    }
-#elif defined(__AVX2__)
-    const __m256d vc = _mm256_set1_pd(c);
-    for (; i + 3 < n; i += 4) {
-        const __m256d v0 = _mm256_loadu_pd(f0 + i);
-        const __m256d v1 = _mm256_loadu_pd(f1 + i);
-        const __m256d vr = _mm256_loadu_pd(frhs + i);
-        _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, _mm256_add_pd(v1, vr), v0));
-    }
-#endif
-#pragma ivdep
-    for (; i < n; ++i) {
-        f1[i] = f0[i] + c * (f1[i] + frhs[i]);
-    }
-}
-
-} // namespace
-
-extern "C" {
-
-void f_rungekutta4_scalar(double &dT, double &f0, double &f1, double &f_rhs, int &RK4) {
-    constexpr double F1o6 = 1.0 / 6.0;
-    constexpr double HLF = 0.5;
-    constexpr double TWO = 2.0;
-
-    switch (RK4) {
-    case 0:
-        f1 = f0 + HLF * dT * f_rhs;
-        break;
-    case 1:
-        f_rhs = f_rhs + TWO * f1;
-        f1 = f0 + HLF * dT * f1;
-        break;
-    case 2:
-        f_rhs = f_rhs + TWO * f1;
-        f1 = f0 + dT * f1;
-        break;
-    case 3:
-        f1 = f0 + F1o6 * dT * (f1 + f_rhs);
-        break;
-    default:
-        std::fprintf(stderr, "rungekutta4_scalar_c: invalid RK4 stage %d\n", RK4);
-        std::abort();
-    }
-}
-
-void rungekutta4_cplxscalar_(double &dT,
-                             std::complex<double> &f0,
-                             std::complex<double> &f1,
-                             std::complex<double> &f_rhs,
-                             int &RK4) {
-    constexpr double F1o6 = 1.0 / 6.0;
-    constexpr double HLF = 0.5;
-    constexpr double TWO = 2.0;
-
-    switch (RK4) {
-    case 0:
-        f1 = f0 + HLF * dT * f_rhs;
-        break;
-    case 1:
-        f_rhs = f_rhs + TWO * f1;
-        f1 = f0 + HLF * dT * f1;
-        break;
-    case 2:
-        f_rhs = f_rhs + TWO * f1;
-        f1 = f0 + dT * f1;
-        break;
-    case 3:
-        f1 = f0 + F1o6 * dT * (f1 + f_rhs);
-        break;
-    default:
-        std::fprintf(stderr, "rungekutta4_cplxscalar_c: invalid RK4 stage %d\n", RK4);
-        std::abort();
-    }
-}
-
-int f_rungekutta4_rout(int *ex, double &dT,
-                       double *f0, double *f1, double *f_rhs,
-                       int &RK4) {
-    const std::size_t n = static_cast<std::size_t>(ex[0]) *
-                          static_cast<std::size_t>(ex[1]) *
-                          static_cast<std::size_t>(ex[2]);
-    const double *const __restrict f0r = f0;
-    double *const __restrict f1r = f1;
-    double *const __restrict frhs = f_rhs;
-
-    if (__builtin_expect(static_cast<unsigned>(RK4) > 3u, 0)) {
-        std::fprintf(stderr, "rungekutta4_rout_c: invalid RK4 stage %d\n", RK4);
-        std::abort();
-    }
-
-    switch (RK4) {
-    case 0:
-        rk4_stage0(n, f0r, frhs, f1r, 0.5 * dT);
-        break;
-    case 1:
-        rk4_rhs_accum(n, f1r, frhs);
-        rk4_f1_from_f0_f1(n, f0r, f1r, 0.5 * dT);
-        break;
-    case 2:
-        rk4_rhs_accum(n, f1r, frhs);
-        rk4_f1_from_f0_f1(n, f0r, f1r, dT);
-        break;
-    default:
-        rk4_stage3(n, f0r, f1r, frhs, (1.0 / 6.0) * dT);
-        break;
-    }
-
-    return 0;
-}
-
-} // extern "C"
--- a/AMSS_NCKU_source/share_func.h
+++ b/AMSS_NCKU_source/share_func.h
@@ -5,7 +5,6 @@
 #include <stddef.h>
 #include <math.h>
 #include <stdio.h>
-#include <string.h>
 /* 主网格：0-based -> 1D */
 static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
    const int ex1 = ex[0], ex2 = ex[1];
@@ -88,159 +87,60 @@ static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc
 *   funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
 * enddo
 */
-static inline void symmetry_bd_impl(int ord,
-                 int shift,
-                 const int extc[3],
-                 const double *__restrict func,
-                 double *__restrict funcc,
-                 const double SoA[3])
-{
-    const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
-    const int nx = extc1 + ord;
-    const int ny = extc2 + ord;
-
-    const size_t snx = (size_t)nx;
-    const size_t splane = (size_t)nx * (size_t)ny;
-    const size_t interior_i = (size_t)shift + 1u;          /* iF = 1 */
-    const size_t interior_j = ((size_t)shift + 1u) * snx;  /* jF = 1 */
-    const size_t interior_k = ((size_t)shift + 1u) * splane; /* kF = 1 */
-    const size_t interior0 = interior_k + interior_j + interior_i;
-
-    /* 1) funcc(1:extc1,1:extc2,1:extc3) = func */
-    for (int k0 = 0; k0 < extc3; ++k0) {
-        const double *src_k = func + (size_t)k0 * (size_t)extc2 * (size_t)extc1;
-        const size_t dst_k0 = interior0 + (size_t)k0 * splane;
-        for (int j0 = 0; j0 < extc2; ++j0) {
-            const double *src = src_k + (size_t)j0 * (size_t)extc1;
-            double *dst = funcc + dst_k0 + (size_t)j0 * snx;
-            memcpy(dst, src, (size_t)extc1 * sizeof(double));
-        }
-    }
-
-    /* 2) funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1) */
-    const double s1 = SoA[0];
-    if (s1 == 1.0) {
-        for (int ii = 0; ii < ord; ++ii) {
-            const size_t dst_i = (size_t)(shift - ii);
-            const size_t src_i = (size_t)(shift + ii + 1);
-            for (int k0 = 0; k0 < extc3; ++k0) {
-                const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
-                for (int j0 = 0; j0 < extc2; ++j0) {
-                    const size_t off = kbase + (size_t)j0 * snx;
-                    funcc[off + dst_i] = funcc[off + src_i];
-                }
-            }
-        }
-    } else if (s1 == -1.0) {
-        for (int ii = 0; ii < ord; ++ii) {
-            const size_t dst_i = (size_t)(shift - ii);
-            const size_t src_i = (size_t)(shift + ii + 1);
-            for (int k0 = 0; k0 < extc3; ++k0) {
-                const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
-                for (int j0 = 0; j0 < extc2; ++j0) {
-                    const size_t off = kbase + (size_t)j0 * snx;
-                    funcc[off + dst_i] = -funcc[off + src_i];
-                }
-            }
-        }
-    } else {
-        for (int ii = 0; ii < ord; ++ii) {
-            const size_t dst_i = (size_t)(shift - ii);
-            const size_t src_i = (size_t)(shift + ii + 1);
-            for (int k0 = 0; k0 < extc3; ++k0) {
-                const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
-                for (int j0 = 0; j0 < extc2; ++j0) {
-                    const size_t off = kbase + (size_t)j0 * snx;
-                    funcc[off + dst_i] = funcc[off + src_i] * s1;
-                }
-            }
-        }
-    }
-
-    /* 3) funcc(:,-j,1:extc3) = funcc(:,j+1,1:extc3)*SoA(2) */
-    const double s2 = SoA[1];
-    if (s2 == 1.0) {
-        for (int jj = 0; jj < ord; ++jj) {
-            const size_t dst_j = (size_t)(shift - jj) * snx;
-            const size_t src_j = (size_t)(shift + jj + 1) * snx;
-            for (int k0 = 0; k0 < extc3; ++k0) {
-                const size_t kbase = interior_k + (size_t)k0 * splane;
-                double *dst = funcc + kbase + dst_j;
-                const double *src = funcc + kbase + src_j;
-                for (int i = 0; i < nx; ++i) dst[i] = src[i];
-            }
-        }
-    } else if (s2 == -1.0) {
-        for (int jj = 0; jj < ord; ++jj) {
-            const size_t dst_j = (size_t)(shift - jj) * snx;
-            const size_t src_j = (size_t)(shift + jj + 1) * snx;
-            for (int k0 = 0; k0 < extc3; ++k0) {
-                const size_t kbase = interior_k + (size_t)k0 * splane;
-                double *dst = funcc + kbase + dst_j;
-                const double *src = funcc + kbase + src_j;
-                for (int i = 0; i < nx; ++i) dst[i] = -src[i];
-            }
-        }
-    } else {
-        for (int jj = 0; jj < ord; ++jj) {
-            const size_t dst_j = (size_t)(shift - jj) * snx;
-            const size_t src_j = (size_t)(shift + jj + 1) * snx;
-            for (int k0 = 0; k0 < extc3; ++k0) {
-                const size_t kbase = interior_k + (size_t)k0 * splane;
-                double *dst = funcc + kbase + dst_j;
-                const double *src = funcc + kbase + src_j;
-                for (int i = 0; i < nx; ++i) dst[i] = src[i] * s2;
-            }
-        }
-    }
-
-    /* 4) funcc(:,:,-k) = funcc(:,:,k+1)*SoA(3) */
-    const double s3 = SoA[2];
-    if (s3 == 1.0) {
-        for (int kk = 0; kk < ord; ++kk) {
-            const size_t dst_k = (size_t)(shift - kk) * splane;
-            const size_t src_k = (size_t)(shift + kk + 1) * splane;
-            double *dst = funcc + dst_k;
-            const double *src = funcc + src_k;
-            for (size_t p = 0; p < splane; ++p) dst[p] = src[p];
-        }
-    } else if (s3 == -1.0) {
-        for (int kk = 0; kk < ord; ++kk) {
-            const size_t dst_k = (size_t)(shift - kk) * splane;
-            const size_t src_k = (size_t)(shift + kk + 1) * splane;
-            double *dst = funcc + dst_k;
-            const double *src = funcc + src_k;
-            for (size_t p = 0; p < splane; ++p) dst[p] = -src[p];
-        }
-    } else {
-        for (int kk = 0; kk < ord; ++kk) {
-            const size_t dst_k = (size_t)(shift - kk) * splane;
-            const size_t src_k = (size_t)(shift + kk + 1) * splane;
-            double *dst = funcc + dst_k;
-            const double *src = funcc + src_k;
-            for (size_t p = 0; p < splane; ++p) dst[p] = src[p] * s3;
-        }
-    }
-}
-
 static inline void symmetry_bd(int ord,
                 const int extc[3],
                 const double *func,
                 double *funcc,
                 const double SoA[3])
 {
-    if (ord <= 0) return;
+    const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];

-    /* Fast paths used by current C kernels: ord=2 (derivs), ord=3 (lopsided/KO). */
-    if (ord == 2) {
-        symmetry_bd_impl(2, 1, extc, func, funcc, SoA);
-        return;
+    // 1) funcc(1:extc1,1:extc2,1:extc3) = func
+    // Fortran 的 (iF=1..extc1) 对应 C 的 func(i0=0..extc1-1)
+    for (int k0 = 0; k0 < extc3; ++k0) {
+        for (int j0 = 0; j0 < extc2; ++j0) {
+            for (int i0 = 0; i0 < extc1; ++i0) {
+                const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
+                funcc[idx_funcc_F(iF, jF, kF, ord, extc)] = func[idx_func0(i0, j0, k0, extc)];
+            }
        }
-    if (ord == 3) {
-        symmetry_bd_impl(3, 2, extc, func, funcc, SoA);
-        return;
    }

-    symmetry_bd_impl(ord, ord - 1, extc, func, funcc, SoA);
+    // 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
+    for (int ii = 0; ii <= ord - 1; ++ii) {
+        const int iF_dst = -ii;       // 0, -1, -2, ...
+        const int iF_src = ii + 1;    // 1, 2, 3, ...
+        for (int kF = 1; kF <= extc3; ++kF) {
+            for (int jF = 1; jF <= extc2; ++jF) {
+                funcc[idx_funcc_F(iF_dst, jF, kF, ord, extc)] =
+                    funcc[idx_funcc_F(iF_src, jF, kF, ord, extc)] * SoA[0];
+            }
+        }
+    }
+
+    // 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
+    // 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖
+    for (int jj = 0; jj <= ord - 1; ++jj) {
+        const int jF_dst = -jj;
+        const int jF_src = jj + 1;
+        for (int kF = 1; kF <= extc3; ++kF) {
+            for (int iF = -ord + 1; iF <= extc1; ++iF) {
+                funcc[idx_funcc_F(iF, jF_dst, kF, ord, extc)] =
+                    funcc[idx_funcc_F(iF, jF_src, kF, ord, extc)] * SoA[1];
+            }
+        }
+    }
+
+    // 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
+    for (int kk = 0; kk <= ord - 1; ++kk) {
+        const int kF_dst = -kk;
+        const int kF_src = kk + 1;
+        for (int jF = -ord + 1; jF <= extc2; ++jF) {
+            for (int iF = -ord + 1; iF <= extc1; ++iF) {
+                funcc[idx_funcc_F(iF, jF, kF_dst, ord, extc)] =
+                    funcc[idx_funcc_F(iF, jF, kF_src, ord, extc)] * SoA[2];
+            }
+        }
+    }
 }
 #endif
--- a/AMSS_NCKU_source/tool.h
+++ b/AMSS_NCKU_source/tool.h
@@ -25,9 +25,3 @@ void lopsided(const int ex[3],
              const double *f, double *f_rhs,
              const double *Sfx, const double *Sfy, const double *Sfz,
              int Symmetry, const double SoA[3]);
-
-void lopsided_kodis(const int ex[3],
-                    const double *X, const double *Y, const double *Z,
-                    const double *f, double *f_rhs,
-                    const double *Sfx, const double *Sfy, const double *Sfz,
-                    int Symmetry, const double SoA[3], double eps);
--- a/generate_interp_lb_header.py
+++ b/generate_interp_lb_header.py
@@ -1,72 +0,0 @@
-#!/usr/bin/env python3
-"""Convert interp_lb_profile.bin to a C header for compile-time embedding."""
-import struct, sys
-
-if len(sys.argv) < 3:
-    print(f"Usage: {sys.argv[0]} <profile.bin> <output.h>")
-    sys.exit(1)
-
-with open(sys.argv[1], 'rb') as f:
-    magic, version, nprocs, num_heavy = struct.unpack('IIii', f.read(16))
-    threshold = struct.unpack('d', f.read(8))[0]
-    times = list(struct.unpack(f'{nprocs}d', f.read(nprocs * 8)))
-    heavy = list(struct.unpack(f'{num_heavy}i', f.read(num_heavy * 4)))
-
-# For each heavy rank, compute split: left half -> lighter neighbor, right half -> heavy rank
-# (or vice versa depending on which neighbor is lighter)
-splits = []
-for hr in heavy:
-    prev_t = times[hr - 1] if hr > 0 else 1e30
-    next_t = times[hr + 1] if hr < nprocs - 1 else 1e30
-    if prev_t <= next_t:
-        splits.append((hr, hr - 1, hr))  # (block_id, r_left, r_right)
-    else:
-        splits.append((hr, hr, hr + 1))
-
-# Also remap the displaced neighbor blocks
-remaps = {}
-for hr, r_l, r_r in splits:
-    if r_l != hr:
-        # We took r_l's slot, so remap block r_l to its other neighbor
-        displaced = r_l
-        if displaced > 0 and displaced - 1 not in [s[0] for s in splits]:
-            remaps[displaced] = displaced - 1
-        elif displaced < nprocs - 1:
-            remaps[displaced] = displaced + 1
-    else:
-        displaced = r_r
-        if displaced < nprocs - 1 and displaced + 1 not in [s[0] for s in splits]:
-            remaps[displaced] = displaced + 1
-        elif displaced > 0:
-            remaps[displaced] = displaced - 1
-
-with open(sys.argv[2], 'w') as out:
-    out.write("/* Auto-generated from interp_lb_profile.bin — do not edit */\n")
-    out.write("#ifndef INTERP_LB_PROFILE_DATA_H\n")
-    out.write("#define INTERP_LB_PROFILE_DATA_H\n\n")
-    out.write(f"#define INTERP_LB_NPROCS {nprocs}\n")
-    out.write(f"#define INTERP_LB_NUM_HEAVY {num_heavy}\n\n")
-    out.write(f"static const int interp_lb_heavy_blocks[{num_heavy}] = {{")
-    out.write(", ".join(str(h) for h in heavy))
-    out.write("};\n\n")
-    out.write("/* Split table: {block_id, r_left, r_right} */\n")
-    out.write(f"static const int interp_lb_splits[{num_heavy}][3] = {{\n")
-    for bid, rl, rr in splits:
-        out.write(f"    {{{bid}, {rl}, {rr}}},\n")
-    out.write("};\n\n")
-    out.write("/* Rank remap for displaced neighbor blocks */\n")
-    out.write(f"static const int interp_lb_num_remaps = {len(remaps)};\n")
-    out.write(f"static const int interp_lb_remaps[][2] = {{\n")
-    for src, dst in sorted(remaps.items()):
-        out.write(f"    {{{src}, {dst}}},\n")
-    if not remaps:
-        out.write("    {-1, -1},\n")
-    out.write("};\n\n")
-    out.write("#endif /* INTERP_LB_PROFILE_DATA_H */\n")
-
-print(f"Generated {sys.argv[2]}:")
-print(f"  {num_heavy} heavy blocks to split: {heavy}")
-for bid, rl, rr in splits:
-    print(f"    block {bid}: split -> rank {rl} (left), rank {rr} (right)")
-for src, dst in sorted(remaps.items()):
-    print(f"    block {src}: remap -> rank {dst}")
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -43,8 +43,7 @@ def get_last_n_cores_per_socket(n=32):
    cpu_str = ",".join(segments)
    total = len(segments) * n
    print(f" CPU binding: taskset -c {cpu_str}  ({total} cores, last {n} per socket)")
-    #return f"taskset -c {cpu_str}"
-    return f""
+    return f"taskset -c {cpu_str}"


 ## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total)
@@ -70,7 +69,7 @@ def makefile_ABE():

    ## Build command with CPU binding to nohz_full cores
    if (input_data.GPU_Calculation == "no"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off ABE"
+        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
    elif (input_data.GPU_Calculation == "yes"):
        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
    else:
--- a/pgo_profile/PGO_Profile_Analysis.md
+++ b/pgo_profile/PGO_Profile_Analysis.md
@@ -0,0 +1,97 @@
+# AMSS-NCKU PGO Profile Analysis Report
+
+## 1. Profiling Environment
+
+| Item | Value |
+|------|-------|
+| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
+| Instrumentation Flag | `-fprofile-instr-generate` |
+| Optimization Level (instrumented) | `-O2 -xHost -fma` |
+| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
+| Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
+| Merged Profile | `default.profdata` (394 KB) |
+| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
+
+## 2. Reduced Simulation Parameters (for profiling run)
+
+| Parameter | Production Value | Profiling Value |
+|-----------|-----------------|-----------------|
+| MPI_processes | 64 | 1 |
+| grid_level | 9 | 4 |
+| static_grid_level | 5 | 3 |
+| static_grid_number | 96 | 24 |
+| moving_grid_number | 48 | 16 |
+| largest_box_xyz_max | 320^3 | 160^3 |
+| Final_Evolution_Time | 1000.0 | 10.0 |
+| Evolution_Step_Number | 10,000,000 | 1,000 |
+| Detector_Number | 12 | 2 |
+
+## 3. Profile Summary
+
+| Metric | Value |
+|--------|-------|
+| Total instrumented functions | 1,392 |
+| Functions with non-zero counts | 117 (8.4%) |
+| Functions with zero counts | 1,275 (91.6%) |
+| Maximum function entry count | 386,459,248 |
+| Maximum internal block count | 370,477,680 |
+| Total block count | 4,198,023,118 |
+
+## 4. Top 20 Hotspot Functions
+
+| Rank | Total Count | Max Block Count | Function | Category |
+|------|------------|-----------------|----------|----------|
+| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
+| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
+| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
+| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
+| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
+| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
+| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
+| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
+| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
+| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
+| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
+| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
+| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
+| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
+| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
+| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
+| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
+| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
+| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
+| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
+
+## 5. Hotspot Category Breakdown
+
+Top 20 functions account for ~98% of total execution counts:
+
+| Category | Functions | Combined Count | Share |
+|----------|-----------|---------------|-------|
+| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
+| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
+| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
+| Time integration | rungekutta4_rout_ | ~119M | ~3% |
+| Dissipation | kodis_ | ~92M | ~2% |
+| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% |
+
+## 6. Conclusions
+
+1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
+2. **Hotspot concentration is high**: Top 5 functions alone account for ~76% of all counts, which is ideal for PGO — the compiler has strong branch/layout optimization targets.
+3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
+4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
+5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
+
+## 7. PGO Phase 2 Usage
+
+To apply the profile, use the following flags in `makefile.inc`:
+
+```makefile
+CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
+              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
+              -Dfortran3 -Dnewc -I${MKLROOT}/include
+f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
+              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
+              -align array64byte -fpp -I${MKLROOT}/include
+```
--- a/pgo_profile/TwoPunctureABE.profdata
+++ b/pgo_profile/TwoPunctureABE.profdata
--- a/pgo_profile/default.profdata
+++ b/pgo_profile/default.profdata
--- a/pgo_profile/default.profdata.backup
+++ b/pgo_profile/default.profdata.backup
--- a/pgo_profile/default_9726853898452064389_0.profdata
+++ b/pgo_profile/default_9726853898452064389_0.profdata
--- a/pgo_profile/default.profdatabackup3
+++ b/pgo_profile/default.profdatabackup3
--- a/pgo_profile/default_15874826282416242821_0_58277.profraw
+++ b/pgo_profile/default_15874826282416242821_0_58277.profraw
--- a/pgo_profile/default_9725750769337483397_0.profraw
+++ b/pgo_profile/default_9725750769337483397_0.profraw
--- a/pgo_profile/default_9725923726611433605_0.profraw
+++ b/pgo_profile/default_9725923726611433605_0.profraw