迁移C算子的循环融合和临时量消除

bssn_rhs(fortran): migrate C kernel loop-fusion optimizations
关闭C重写算子
2026-03-03 15:57:10 +08:00 · 2026-03-03 15:41:26 +08:00 · 2026-03-03 15:28:09 +08:00 · 2026-03-03 15:22:01 +08:00 · 2026-03-03 15:15:06 +08:00 · 2026-03-03 12:36:19 +08:00
38 changed files with 5460 additions and 1411 deletions
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -270,6 +270,12 @@ if not os.path.exists( ABE_file ):
 ## Copy the executable ABE (or ABEGPU) into the run directory
 shutil.copy2(ABE_file, output_directory)
 ## Copy interp load balance profile if present (for optimize pass)
 interp_lb_profile = os.path.join(AMSS_NCKU_source_copy, "interp_lb_profile.bin")
 if os.path.exists(interp_lb_profile):
    shutil.copy2(interp_lb_profile, output_directory)
    print( " Copied interp_lb_profile.bin to run directory " )
 ###########################
 ## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory
--- a/AMSS_NCKU_Verify_ASC26.py
+++ b/AMSS_NCKU_Verify_ASC26.py
@@ -1,9 +1,13 @@
 #!/usr/bin/env python3
 """
-AMSS-NCKU GW150914 Simulation Regression Test Script
+AMSS-NCKU GW150914 Simulation Regression Test Script (Comprehensive Version)
 Verification Requirements:
-1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
+1. RMS errors < 1% for:
   - 3D Vector Total RMS
   - X Component RMS
   - Y Component RMS
   - Z Component RMS
 2. ADM constraint violation < 2 (Grid Level 0)
 RMS Calculation Method:
@@ -57,79 +61,62 @@ def load_constraint_data(filepath):
                data.append([float(x) for x in parts[:8]])
    return np.array(data)
-
+def calculate_all_rms_errors(bh_data_ref, bh_data_target):
 def calculate_rms_error(bh_data_ref, bh_data_target):
    """
-    Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
+    Calculate 3D Vector RMS and component-wise RMS (X, Y, Z) independently.
-
+    Uses r = sqrt(x^2 + y^2) as the denominator for all error normalizations.
-    This function computes the RMS error independently for BH1 and BH2 trajectories,
+    Returns the maximum error between BH1 and BH2 for each category.
    then returns the maximum of the two as the final RMS error metric.
    For each black hole, the RMS is calculated as:
        RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
    where:
        Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
        r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
    Args:
        bh_data_ref: Reference (baseline) trajectory data
        bh_data_target: Target (optimized) trajectory data
    Returns:
        rms_value: Final RMS error as a percentage (max of BH1 and BH2)
        error: Error message if any
    """
    # Align data: truncate to the length of the shorter dataset
    M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
    if M < 10:
        return None, "Insufficient data points for comparison"
-    # Extract XY coordinates for both black holes
+    results = {}
    x1_ref = bh_data_ref['x1'][:M]
    y1_ref = bh_data_ref['y1'][:M]
    x2_ref = bh_data_ref['x2'][:M]
    y2_ref = bh_data_ref['y2'][:M]
-    x1_new = bh_data_target['x1'][:M]
+    for bh in ['1', '2']:
-    y1_new = bh_data_target['y1'][:M]
+        x_r, y_r, z_r = bh_data_ref[f'x{bh}'][:M], bh_data_ref[f'y{bh}'][:M], bh_data_ref[f'z{bh}'][:M]
-    x2_new = bh_data_target['x2'][:M]
+        x_n, y_n, z_n = bh_data_target[f'x{bh}'][:M], bh_data_target[f'y{bh}'][:M], bh_data_target[f'z{bh}'][:M]
    y2_new = bh_data_target['y2'][:M]
-    # Calculate RMS for BH1
+        # 核心修改：根据组委会的邮件指示，分母统一使用 r = sqrt(x^2 + y^2)
-    delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
+        r_ref = np.sqrt(x_r**2 + y_r**2)
-    r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
+        r_new = np.sqrt(x_n**2 + y_n**2)
-    r1_new = np.sqrt(x1_new**2 + y1_new**2)
+        denom_max = np.maximum(r_ref, r_new)
    r1_max = np.maximum(r1_ref, r1_new)
-    # Calculate RMS for BH2
+        valid = denom_max > 1e-15
-    delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
+        if np.sum(valid) < 10:
-    r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
+            results[f'BH{bh}'] = { '3D_Vector': 0.0, 'X_Component': 0.0, 'Y_Component': 0.0, 'Z_Component': 0.0 }
-    r2_new = np.sqrt(x2_new**2 + y2_new**2)
+            continue
    r2_max = np.maximum(r2_ref, r2_new)
-    # Avoid division by zero for BH1
+        def calc_rms(delta):
-    valid_mask1 = r1_max > 1e-15
+            # 将对应分量的偏差除以统一的轨道半径分母 denom_max
-    if np.sum(valid_mask1) < 10:
+            return np.sqrt(np.mean((delta[valid] / denom_max[valid])**2)) * 100
        return None, "Insufficient valid data points for BH1"
-    terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
+        # 1. Total 3D Vector RMS
-    rms_bh1 = np.sqrt(np.mean(terms1)) * 100
+        delta_vec = np.sqrt((x_r - x_n)**2 + (y_r - y_n)**2 + (z_r - z_n)**2)
        rms_3d = calc_rms(delta_vec)
-    # Avoid division by zero for BH2
+        # 2. Component-wise RMS (分离计算各轴，但共用半径分母)
-    valid_mask2 = r2_max > 1e-15
+        rms_x = calc_rms(np.abs(x_r - x_n))
-    if np.sum(valid_mask2) < 10:
+        rms_y = calc_rms(np.abs(y_r - y_n))
-        return None, "Insufficient valid data points for BH2"
+        rms_z = calc_rms(np.abs(z_r - z_n))
-    terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
+        results[f'BH{bh}'] = {
-    rms_bh2 = np.sqrt(np.mean(terms2)) * 100
+            '3D_Vector': rms_3d,
            'X_Component': rms_x,
            'Y_Component': rms_y,
            'Z_Component': rms_z
        }
-    # Final RMS is the maximum of BH1 and BH2
+    # 获取 BH1 和 BH2 中的最大误差
-    rms_final = max(rms_bh1, rms_bh2)
+    max_rms = {
-
+        '3D_Vector': max(results['BH1']['3D_Vector'], results['BH2']['3D_Vector']),
-    return rms_final, None
+        'X_Component': max(results['BH1']['X_Component'], results['BH2']['X_Component']),
        'Y_Component': max(results['BH1']['Y_Component'], results['BH2']['Y_Component']),
        'Z_Component': max(results['BH1']['Z_Component'], results['BH2']['Z_Component'])
    }
    return max_rms, None
 def analyze_constraint_violation(constraint_data, n_levels=9):
    """
@@ -155,34 +142,32 @@ def analyze_constraint_violation(constraint_data, n_levels=9):
 def print_header():
    """Print report header"""
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
-    print(Color.BOLD + "   AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
+    print(Color.BOLD + "   AMSS-NCKU GW150914 Comprehensive Regression Test" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
-
+def print_rms_results(rms_dict, error, threshold=1.0):
-def print_rms_results(rms_rel, error, threshold=1.0):
+    print(f"\n{Color.BOLD}1. RMS Error Analysis (Maximums of BH1 & BH2){Color.RESET}")
-    """Print RMS error results"""
+    print("-" * 65)
    print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
    print("-" * 45)
    if error:
        print(f"   {Color.RED}Error: {error}{Color.RESET}")
        return False
-    passed = rms_rel < threshold
+    all_passed = True
    print(f"   Requirement: < {threshold}%\n")
-    print(f"   RMS relative error: {rms_rel:.4f}%")
+    for key, val in rms_dict.items():
-    print(f"   Requirement:        < {threshold}%")
+        passed = val < threshold
-    print(f"   Status:             {get_status_text(passed)}")
+        all_passed = all_passed and passed
-
+        status = get_status_text(passed)
-    return passed
+        print(f"   {key:15}: {val:8.4f}%   |   Status: {status}")
    return all_passed
 def print_constraint_results(results, threshold=2.0):
    """Print constraint violation results"""
    print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
-    print("-" * 45)
+    print("-" * 65)
    names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
    for i, name in enumerate(names):
@@ -200,7 +185,6 @@ def print_constraint_results(results, threshold=2.0):
 def print_summary(rms_passed, constraint_passed):
    """Print summary"""
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
    print(Color.BOLD + "Verification Summary" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
@@ -210,7 +194,7 @@ def print_summary(rms_passed, constraint_passed):
    res_rms = get_status_text(rms_passed)
    res_con = get_status_text(constraint_passed)
-    print(f"   [1] RMS trajectory check:         {res_rms}")
+    print(f"   [1] Comprehensive RMS check:      {res_rms}")
    print(f"   [2] ADM constraint check:         {res_con}")
    final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
@@ -219,61 +203,48 @@ def print_summary(rms_passed, constraint_passed):
    return all_passed
 def main():
    # Determine target (optimized) output directory
    if len(sys.argv) > 1:
        target_dir = sys.argv[1]
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
    # Determine reference (baseline) directory
    script_dir = os.path.dirname(os.path.abspath(__file__))
    reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
    # Data file paths
    bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
    bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
    constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
    # Check if files exist
    if not os.path.exists(bh_file_ref):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
        sys.exit(1)
    if not os.path.exists(bh_file_target):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
        sys.exit(1)
    if not os.path.exists(constraint_file):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
        sys.exit(1)
    # Print header
    print_header()
    print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
    print(f"{Color.BOLD}Target (Optimized):  {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
    # Load data
    bh_data_ref = load_bh_trajectory(bh_file_ref)
    bh_data_target = load_bh_trajectory(bh_file_target)
    constraint_data = load_constraint_data(constraint_file)
-    # Calculate RMS error
+    # Output modified RMS results
-    rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
+    rms_dict, error = calculate_all_rms_errors(bh_data_ref, bh_data_target)
-    rms_passed = print_rms_results(rms_rel, error)
+    rms_passed = print_rms_results(rms_dict, error)
-    # Analyze constraint violation
+    # Output constraint results
    constraint_results = analyze_constraint_violation(constraint_data)
    constraint_passed = print_constraint_results(constraint_results)
    # Print summary
    all_passed = print_summary(rms_passed, constraint_passed)
    # Return exit code
    sys.exit(0 if all_passed else 1)
 if __name__ == "__main__":
    main()
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -7,12 +7,178 @@
 #include <string>
 #include <cmath>
 #include <new>
 #include <vector>
 using namespace std;
 #include "misc.h"
 #include "MPatch.h"
 #include "Parallel.h"
 #include "fmisc.h"
 #ifdef INTERP_LB_PROFILE
 #include "interp_lb_profile.h"
 #endif
 namespace
 {
 // Per-block record stored in BlockBinIndex::views: the block itself plus the
 // per-dimension lower/upper bounds that build_block_bin_index computes for it
 // and that point_in_block_view tests a point against.
 struct InterpBlockView
 {
  Block *bp;       // block this record describes
  double llb[dim]; // lower bound in each dimension
  double uub[dim]; // upper bound in each dimension
 };
 // Uniform-bin lookup table over the blocks of one patch.
 // A default-constructed index is empty and marked invalid; it is populated by
 // build_block_bin_index and queried by find_block_index_for_point.
 struct BlockBinIndex
 {
  int bins[dim];                     // number of bins along each dimension
  double lo[dim];                    // coordinate where bin 0 starts, per dimension
  double inv[dim];                   // bins per unit length; 0 means "no binning on this axis"
  vector<InterpBlockView> views;     // one entry per indexed block
  vector<vector<int>> bin_to_blocks; // per flattened bin: positions in `views` overlapping that bin
  bool valid;                        // true only after a successful build with at least one block
  // Start with a single bin per axis, zero origin/scale, and valid == false.
  BlockBinIndex() : valid(false)
  {
    for (int i = 0; i < dim; i++)
    {
      bins[i] = 1;
      lo[i] = 0.0;
      inv[i] = 0.0;
    }
  }
 };
 // Restrict v to the closed interval [lo, hi]. The lower bound is tested
 // first, so lo wins whenever v lies below it.
 inline int clamp_int(int v, int lo, int hi)
 {
  if (v < lo)
    return lo;
  if (v > hi)
    return hi;
  return v;
 }
 // Map coordinate x to a bin number in [0, nb - 1].
 //
 // x   : coordinate to locate
 // lo  : coordinate at which bin 0 starts
 // inv : bins per unit length; a non-positive value disables binning
 // nb  : number of bins along this axis
 //
 // Returns 0 when binning is disabled (nb <= 1 or inv <= 0); otherwise
 // floor((x - lo) * inv) clamped to [0, nb - 1].
 inline int coord_to_bin(double x, double lo, double inv, int nb)
 {
  if (nb <= 1 || inv <= 0.0)
    return 0;
  const double scaled = std::floor((x - lo) * inv);
  // Clamp while still in double precision: converting a NaN or an
  // out-of-range double to int is undefined behaviour.
  // The negated comparison also routes NaN to bin 0.
  if (!(scaled > 0.0))
    return 0;
  if (scaled >= double(nb - 1))
    return nb - 1;
  return int(scaled);
 }
 // Flatten a (b0, b1, b2) bin triple into a single offset, with b0 varying
 // fastest, then b1, then b2. The result is used to subscript
 // index.bin_to_blocks.
 inline int bin_loc(const BlockBinIndex &index, int b0, int b1, int b2)
 {
  const int row = b1 + index.bins[1] * b2;
  return b0 + index.bins[0] * row;
 }
 // Report whether point pox lies within the bounds of `view`, allowing half a
 // grid spacing (DH[i] / 2) of slack on both sides in every dimension.
 inline bool point_in_block_view(const InterpBlockView &view, const double *pox, const double *DH)
 {
  for (int i = 0; i < dim; i++)
  {
    const double from_low = pox[i] - view.llb[i];
    const double from_high = pox[i] - view.uub[i];
    // More than half a cell below the lower bound.
    if (from_low < -DH[i] / 2)
      return false;
    // More than half a cell above the upper bound.
    if (from_high > DH[i] / 2)
      return false;
  }
  return true;
 }
 // Build a uniform-bin lookup structure over the blocks of one patch.
 //
 // patch : patch whose block list (blb through ble, inclusive) is indexed
 // DH    : grid spacing per dimension
 // index : output; reset to a default-constructed BlockBinIndex first. It is
 //         left with valid == false when the patch yields no blocks.
 void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
 {
  index = BlockBinIndex();
  // Pass 1: record one view (block pointer + bounds) per block of the patch.
  MyList<Block> *Bp = patch->blb;
  while (Bp)
  {
    Block *BP = Bp->data;
    InterpBlockView view;
    view.bp = BP;
    for (int i = 0; i < dim; i++)
    {
      // When feq() reports a block face equal to the matching patch face
      // (tolerance argument DH/2), the bound is offset by lli/uui cells;
      // otherwise it is offset by ghost_width cells (ghost_width - 0.5 in
      // the Vertex build).
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
      view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
      view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
 #else
 #ifdef Cell
      view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
      view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
    }
    index.views.push_back(view);
    // ble marks the last block belonging to this patch; stop after it.
    if (Bp == patch->ble)
      break;
    Bp = Bp->next;
  }
  const int nblocks = int(index.views.size());
  if (nblocks <= 0)
    return;
  // Use about cbrt(nblocks) bins per axis, limited to the range [1, 32].
  int bins_1d = int(ceil(pow(double(nblocks), 1.0 / 3.0)));
  bins_1d = clamp_int(bins_1d, 1, 32);
  for (int i = 0; i < dim; i++)
  {
    index.bins[i] = bins_1d;
    // The bin grid spans the patch box moved inward by lli/uui cells.
    index.lo[i] = patch->bbox[i] + patch->lli[i] * DH[i];
    const double hi = patch->bbox[dim + i] - patch->uui[i] * DH[i];
    // inv == 0 makes coord_to_bin return bin 0 for this axis.
    if (hi > index.lo[i] && bins_1d > 1)
      index.inv[i] = bins_1d / (hi - index.lo[i]);
    else
      index.inv[i] = 0.0;
  }
  // Pass 2: register every block in each bin its padded bounds overlap.
  index.bin_to_blocks.resize(index.bins[0] * index.bins[1] * index.bins[2]);
  for (int bi = 0; bi < nblocks; bi++)
  {
    const InterpBlockView &view = index.views[bi];
    int bmin[dim], bmax[dim];
    for (int d = 0; d < dim; d++)
    {
      // Pad by DH/2, the same slack point_in_block_view accepts.
      const double low = view.llb[d] - DH[d] / 2;
      const double up = view.uub[d] + DH[d] / 2;
      bmin[d] = coord_to_bin(low, index.lo[d], index.inv[d], index.bins[d]);
      bmax[d] = coord_to_bin(up, index.lo[d], index.inv[d], index.bins[d]);
      // Keep bmin <= bmax so the loops below always cover the range.
      if (bmax[d] < bmin[d])
      {
        int t = bmin[d];
        bmin[d] = bmax[d];
        bmax[d] = t;
      }
    }
    for (int bz = bmin[2]; bz <= bmax[2]; bz++)
      for (int by = bmin[1]; by <= bmax[1]; by++)
        for (int bx = bmin[0]; bx <= bmax[0]; bx++)
          index.bin_to_blocks[bin_loc(index, bx, by, bz)].push_back(bi);
  }
  index.valid = true;
 }
 int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
 {
  if (!index.valid)
    return -1;
  const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
  const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
  const int bz = coord_to_bin(pox[2], index.lo[2], index.inv[2], index.bins[2]);
  const vector<int> &cand = index.bin_to_blocks[bin_loc(index, bx, by, bz)];
  for (size_t ci = 0; ci < cand.size(); ci++)
  {
    const int bi = cand[ci];
    if (point_in_block_view(index.views[bi], pox, DH))
      return bi;
  }
  // Fallback to full scan for numerical edge cases around bin boundaries.
  for (size_t bi = 0; bi < index.views.size(); bi++)
    if (point_in_block_view(index.views[bi], pox, DH))
      return int(bi);
  return -1;
 }
 } // namespace
 Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
 {
@@ -364,9 +530,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;
-  double DH[dim], llb[dim], uub[dim];
+  double DH[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
  BlockBinIndex block_index;
  build_block_bin_index(this, DH, block_index);
  for (int j = 0; j < NN; j++) // run along points
  {
@@ -389,57 +557,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
      }
    }
-    MyList<Block> *Bp = blb;
+    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    bool notfind = true;
+    if (block_i >= 0)
    while (notfind && Bp) // run along Blocks
    {
-      Block *BP = Bp->data;
+      Block *BP = block_index.views[block_i].bp;
-
+      owner_rank[j] = BP->rank;
-      bool flag = true;
+      if (myrank == BP->rank)
      for (int i = 0; i < dim; i++)
      {
-#ifdef Vertex
+        //---> interpolation
-#ifdef Cell
+        varl = VarList;
-#error Both Cell and Vertex are defined
+        int k = 0;
-#endif
+        while (varl) // run along variables
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
 #else
 #ifdef Cell
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
        {
-          flag = false;
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-          break;
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
          varl = varl->next;
          k++;
        }
      }
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          //---> interpolation
          varl = VarList;
          int k = 0;
          while (varl) // run along variables
          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
      }
      if (Bp == ble)
        break;
      Bp = Bp->next;
    }
  }
@@ -507,6 +642,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
  // Targeted point-to-point overload: each owner sends each point only to
  // the one rank that needs it for integration (consumer), reducing
  // communication volume by ~nprocs times compared to the Bcast version.
 #ifdef INTERP_LB_PROFILE
  double t_interp_start = MPI_Wtime();
 #endif
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
@@ -529,9 +667,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;
-  double DH[dim], llb[dim], uub[dim];
+  double DH[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
  BlockBinIndex block_index;
  build_block_bin_index(this, DH, block_index);
  // --- Interpolation phase (identical to original) ---
  for (int j = 0; j < NN; j++)
@@ -555,59 +695,31 @@ void Patch::Interp_Points(MyList<var> *VarList,
      }
    }
-    MyList<Block> *Bp = blb;
+    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    bool notfind = true;
+    if (block_i >= 0)
    while (notfind && Bp)
    {
-      Block *BP = Bp->data;
+      Block *BP = block_index.views[block_i].bp;
-
+      owner_rank[j] = BP->rank;
-      bool flag = true;
+      if (myrank == BP->rank)
      for (int i = 0; i < dim; i++)
      {
-#ifdef Vertex
+        varl = VarList;
-#ifdef Cell
+        int k = 0;
-#error Both Cell and Vertex are defined
+        while (varl)
 #endif
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
 #else
 #ifdef Cell
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
        {
-          flag = false;
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-          break;
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
          varl = varl->next;
          k++;
        }
      }
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          varl = VarList;
          int k = 0;
          while (varl)
          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
      }
      if (Bp == ble)
        break;
      Bp = Bp->next;
    }
  }
 #ifdef INTERP_LB_PROFILE
  double t_interp_end = MPI_Wtime();
  double t_interp_local = t_interp_end - t_interp_start;
 #endif
  // --- Error check for unfound points ---
  for (int j = 0; j < NN; j++)
  {
@@ -764,6 +876,31 @@ void Patch::Interp_Points(MyList<var> *VarList,
  delete[] recv_count;
  delete[] consumer_rank;
  delete[] owner_rank;
 #ifdef INTERP_LB_PROFILE
  {
    static bool profile_written = false;
    if (!profile_written) {
      double *all_times = nullptr;
      if (myrank == 0) all_times = new double[nprocs];
      MPI_Gather(&t_interp_local, 1, MPI_DOUBLE,
                 all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
      if (myrank == 0) {
        int heavy[64];
        int nh = InterpLBProfile::identify_heavy_ranks(
            all_times, nprocs, 2.5, heavy, 64);
        InterpLBProfile::write_profile(
            "interp_lb_profile.bin", nprocs,
            all_times, heavy, nh, 2.5);
        printf("[InterpLB] Profile written: %d heavy ranks\n", nh);
        for (int i = 0; i < nh; i++)
          printf("  Heavy rank %d: %.6f s\n", heavy[i], all_times[heavy[i]]);
        delete[] all_times;
      }
      profile_written = true;
    }
  }
 #endif
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
@@ -797,9 +934,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
  MPI_Comm_group(Comm_here, &local_group);
-  double DH[dim], llb[dim], uub[dim];
+  double DH[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
  BlockBinIndex block_index;
  build_block_bin_index(this, DH, block_index);
  for (int j = 0; j < NN; j++) // run along points
  {
@@ -822,57 +961,24 @@ void Patch::Interp_Points(MyList<var> *VarList,
      }
    }
-    MyList<Block> *Bp = blb;
+    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    bool notfind = true;
+    if (block_i >= 0)
    while (notfind && Bp) // run along Blocks
    {
-      Block *BP = Bp->data;
+      Block *BP = block_index.views[block_i].bp;
-
+      owner_rank[j] = BP->rank;
-      bool flag = true;
+      if (myrank == BP->rank)
      for (int i = 0; i < dim; i++)
      {
-#ifdef Vertex
+        //---> interpolation
-#ifdef Cell
+        varl = VarList;
-#error Both Cell and Vertex are defined
+        int k = 0;
-#endif
+        while (varl) // run along variables
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
 #else
 #ifdef Cell
        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
        {
-          flag = false;
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
-          break;
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
          varl = varl->next;
          k++;
        }
      }
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
          //---> interpolation
          varl = VarList;
          int k = 0;
          while (varl) // run along variables
          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
      }
      if (Bp == ble)
        break;
      Bp = Bp->next;
    }
  }
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -32,6 +32,16 @@ namespace Parallel
  int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 dimensions
  int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
  MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
  MyList<Block> *distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0);
  Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
                           int ib0_orig, int ib3_orig,
                           int jb1_orig, int jb4_orig,
                           int kb2_orig, int kb5_orig,
                           Patch* PP, int r_left, int r_right,
                           int ingfsi, int fngfsi, bool periodic,
                           Block* &split_first_block, Block* &split_last_block);
  Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
                           int block_id, int ingfsi, int fngfsi, int lev);
  void KillBlocks(MyList<Patch> *PatchLIST);
  void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
@@ -98,6 +108,9 @@ namespace Parallel
    MPI_Status *stats;
    int max_reqs;
    bool lengths_valid;
    int *tc_req_node;
    int *tc_req_is_recv;
    int *tc_completed;
    SyncCache();
    void invalidate();
    void destroy();
@@ -111,7 +124,10 @@ namespace Parallel
  struct AsyncSyncState {
    int req_no;
    bool active;
-    AsyncSyncState() : req_no(0), active(false) {}
+    int *req_node;
    int *req_is_recv;
    int pending_recv;
    AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {}
  };
  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -36,12 +36,18 @@ using namespace std;
 #include "myglobal.h"
 #endif
-#include "perf.h"
+#include "perf.h"
-
+
-#include "derivatives.h"
+#include "derivatives.h"
-#include "ricci_gamma.h"
+#include "ricci_gamma.h"
-
+
-//================================================================================================
+// Compile-time switch for per-timestep memory usage collection/printing.
 // Default is OFF to reduce overhead in production runs.
 #ifndef BSSN_ENABLE_MEM_USAGE_LOG
 #define BSSN_ENABLE_MEM_USAGE_LOG 0
 #endif
 //================================================================================================
 // define bssn_class
@@ -736,6 +742,8 @@ void bssn_class::Initialize()
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
 }
 //================================================================================================
@@ -2127,8 +2135,10 @@ void bssn_class::Evolve(int Steps)
  #endif
  */
-  perf bssn_perf;
+#if BSSN_ENABLE_MEM_USAGE_LOG
-  size_t current_min, current_avg, current_max, peak_min, peak_avg, peak_max;
+  perf bssn_perf;
  size_t current_min, current_avg, current_max, peak_min, peak_avg, peak_max;
 #endif
  for (int lev = 0; lev < GH->levels; lev++)
    GH->Lt[lev] = PhysTime;
@@ -2213,7 +2223,7 @@ void bssn_class::Evolve(int Steps)
    GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
               SynchList_cor, OldStateList, StateList, SynchList_pre,
               fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
-    for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+    for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
 #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2222,21 +2232,23 @@ void bssn_class::Evolve(int Steps)
 //		fgt(PhysTime-dT_mon,StartTime,dT_mon/2),ErrorMonitor);
 #endif
-    // Retrieve memory usage information used during computation; master process prints it
+#if BSSN_ENABLE_MEM_USAGE_LOG
-    bssn_perf.MemoryUsage(&current_min, &current_avg, &current_max,
+    // Retrieve memory usage information used during computation; master process prints it
-                          &peak_min, &peak_avg, &peak_max, nprocs);
+    bssn_perf.MemoryUsage(&current_min, &current_avg, &current_max,
-    if (myrank == 0)
+                          &peak_min, &peak_avg, &peak_max, nprocs);
-    {
+    if (myrank == 0)
-      printf(" Memory usage: current %0.4lg/%0.4lg/%0.4lgMB, "
+    {
-             "peak %0.4lg/%0.4lg/%0.4lgMB\n",
+      printf(" Memory usage: current %0.4lg/%0.4lg/%0.4lgMB, "
-             (double)current_min / (1024.0 * 1024.0),
+             "peak %0.4lg/%0.4lg/%0.4lgMB\n",
-             (double)current_avg / (1024.0 * 1024.0),
+             (double)current_min / (1024.0 * 1024.0),
-             (double)current_max / (1024.0 * 1024.0),
+             (double)current_avg / (1024.0 * 1024.0),
-             (double)peak_min / (1024.0 * 1024.0),
+             (double)current_max / (1024.0 * 1024.0),
-             (double)peak_avg / (1024.0 * 1024.0),
+             (double)peak_min / (1024.0 * 1024.0),
-             (double)peak_max / (1024.0 * 1024.0));
+             (double)peak_avg / (1024.0 * 1024.0),
-      cout << endl;
+             (double)peak_max / (1024.0 * 1024.0));
-    }
+      cout << endl;
    }
 #endif
    // Output puncture positions at each step
    if (myrank == 0)
@@ -2426,10 +2438,10 @@ void bssn_class::RecursiveStep(int lev)
 #endif
 #if (REGLEV == 0)
-  GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
+  if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
-                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
+                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
 }
@@ -2605,10 +2617,10 @@ void bssn_class::ParallelStep()
  delete[] tporg;
  delete[] tporgo;
 #if (REGLEV == 0)
-  GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
+  if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
-                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
+                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
 }
@@ -2772,10 +2784,10 @@ void bssn_class::ParallelStep()
      if (lev + 1 >= GH->movls)
      {
        //	       GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0,
-        GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
+        if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
                            SynchList_cor, OldStateList, StateList, SynchList_pre,
-                            fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
+                            fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
-        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
        //               a_stream.clear();
        //               a_stream.str("");
@@ -2787,10 +2799,10 @@ void bssn_class::ParallelStep()
    // for this level
    if (YN == 1)
    {
-      GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
+      if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                          SynchList_cor, OldStateList, StateList, SynchList_pre,
-                          fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
+                          fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
      //               a_stream.clear();
      //               a_stream.str("");
@@ -2806,10 +2818,10 @@ void bssn_class::ParallelStep()
        if (YN == 1)
        {
          //	   GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
-          GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
+          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
-                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
+                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
          //               a_stream.clear();
          //               a_stream.str("");
@@ -2822,10 +2834,10 @@ void bssn_class::ParallelStep()
        if (i % 4 == 3)
        {
          //	   GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
-          GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
+          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
-                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
+                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
+          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
          //               a_stream.clear();
          //               a_stream.str("");
@@ -5796,7 +5808,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -5820,7 +5832,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #endif
@@ -5847,7 +5859,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
@@ -5871,7 +5883,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #endif
@@ -5940,7 +5952,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      }
 #if (RPB == 0)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -5950,7 +5962,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #endif
@@ -5962,7 +5974,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
    else // no time refinement levels and for all same time levels
    {
 #if (RPB == 0)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
@@ -5972,7 +5984,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #endif
@@ -6027,7 +6039,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      }
 #if (RPB == 0)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -6037,7 +6049,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #endif
@@ -6051,7 +6063,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      if (myrank == 0)
        cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
 #if (RPB == 0)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,StateList,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
@@ -6061,7 +6073,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #endif
@@ -6102,7 +6114,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #endif
@@ -6115,7 +6127,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
    {
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #endif
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -130,6 +130,8 @@ public:
       Parallel::SyncCache *sync_cache_cor;  // per-level cache for corrector sync
       Parallel::SyncCache *sync_cache_rp_coarse;  // RestrictProlong sync on PatL[lev-1]
       Parallel::SyncCache *sync_cache_rp_fine;    // RestrictProlong sync on PatL[lev]
       Parallel::SyncCache *sync_cache_restrict;   // cached Restrict in RestrictProlong
       Parallel::SyncCache *sync_cache_outbd;      // cached OutBdLow2Hi in RestrictProlong
       monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
       monitor *ConVMonitor;
--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
@@ -59,9 +59,10 @@
  real*8, dimension(ex(1),ex(2),ex(3)),intent(out) :: Rxx,Rxy,Rxz,Ryy,Ryz,Rzz
  real*8,intent(in) :: eps
  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: ham_Res, movx_Res, movy_Res, movz_Res
-  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
+  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
-!  gont = 0: success; gont = 1: something wrong
+!  gont = 0: success; gont = 1: something wrong
-  integer::gont
+  integer::gont
  integer :: i,j,k
 !~~~~~~> Other variables:
@@ -83,11 +84,18 @@
  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz
  real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz
-  real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
+  real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
-  real*8            :: dX, dY, dZ, PI
+  real*8            :: dX, dY, dZ, PI
-  real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
+  real*8            :: divb_loc,det_loc
-  real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
+  real*8            :: gupxx_loc,gupxy_loc,gupxz_loc,gupyy_loc,gupyz_loc,gupzz_loc
-  real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
+  real*8            :: Rxx_loc,Rxy_loc,Rxz_loc,Ryy_loc,Ryz_loc,Rzz_loc
  real*8            :: fxx_loc,fxy_loc,fxz_loc
  real*8            :: Gamxa_loc,Gamya_loc,Gamza_loc
  real*8            :: f_loc,chin_loc
  real*8            :: l_fxx,l_fxy,l_fxz,l_fyy,l_fyz,l_fzz,S_loc
  real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
  real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
  real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
  double precision,parameter::FF = 0.75d0,eta=2.d0
  real*8, parameter :: F1o3 = 1.D0/3.D0, F2o3 = 2.D0/3.D0,F3o2=1.5d0, F1o6 = 1.D0/6.D0
  real*8, parameter :: F16=1.6d1,F8=8.d0
@@ -96,11 +104,11 @@
  real*8, dimension(ex(1),ex(2),ex(3)) :: reta
 #endif
-#if (GAUGE == 6 || GAUGE == 7)
+#if (GAUGE == 6 || GAUGE == 7)
-  integer :: BHN,i,j,k
+  integer :: BHN
-  real*8, dimension(9) :: Porg
+  real*8, dimension(9) :: Porg
-  real*8, dimension(3) :: Mass
+  real*8, dimension(3) :: Mass
-  real*8 :: r1,r2,M,A,w1,w2,C1,C2
+  real*8 :: r1,r2,M,A,w1,w2,C1,C2
  real*8, dimension(ex(1),ex(2),ex(3)) :: reta
  call getpbh(BHN,Porg,Mass)
@@ -145,174 +153,204 @@
  dY = Y(2) - Y(1)
  dZ = Z(2) - Z(1)
-  alpn1 = Lap + ONE
+  do k=1,ex(3)
-  chin1 = chi + ONE
+  do j=1,ex(2)
-  gxx = dxx + ONE
+  do i=1,ex(1)
-  gyy = dyy + ONE
+    alpn1(i,j,k) = Lap(i,j,k) + ONE
-  gzz = dzz + ONE
+    chin1(i,j,k) = chi(i,j,k) + ONE
    gxx(i,j,k) = dxx(i,j,k) + ONE
    gyy(i,j,k) = dyy(i,j,k) + ONE
    gzz(i,j,k) = dzz(i,j,k) + ONE
  enddo
  enddo
  enddo
  call fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev)
  call fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev)
  call fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev)
-  div_beta = betaxx + betayy + betazz
+  call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
  call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
-  chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi
+  call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
-
+  call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
-  call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
+  call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
-  call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
+  call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
-  call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
+  call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
-  call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
+  call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
-  call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
+
-  call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
+  do k=1,ex(3)
-
+  do j=1,ex(2)
-  gxx_rhs = - TWO * alpn1 * Axx    -  F2o3 * gxx * div_beta          + &
+  do i=1,ex(1)
-              TWO *(  gxx * betaxx +   gxy * betayx +   gxz * betazx)
+    divb_loc = betaxx(i,j,k) + betayy(i,j,k) + betazz(i,j,k)
-
+    div_beta(i,j,k) = divb_loc
-  gyy_rhs = - TWO * alpn1 * Ayy    -  F2o3 * gyy * div_beta          + &
+
-              TWO *(  gxy * betaxy +   gyy * betayy +   gyz * betazy)
+    chi_rhs(i,j,k) = F2o3 * chin1(i,j,k) * (alpn1(i,j,k) * trK(i,j,k) - divb_loc)
-
+
-  gzz_rhs = - TWO * alpn1 * Azz    -  F2o3 * gzz * div_beta          + &
+    gxx_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axx(i,j,k) - F2o3 * gxx(i,j,k) * divb_loc + &
-              TWO *(  gxz * betaxz +   gyz * betayz +   gzz * betazz)
+         TWO * ( gxx(i,j,k) * betaxx(i,j,k) + gxy(i,j,k) * betayx(i,j,k) + gxz(i,j,k) * betazx(i,j,k) )
-
+
-  gxy_rhs = - TWO * alpn1 * Axy    +  F1o3 * gxy    * div_beta       + &
+    gyy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayy(i,j,k) - F2o3 * gyy(i,j,k) * divb_loc + &
-                      gxx * betaxy                  +   gxz * betazy + &
+         TWO * ( gxy(i,j,k) * betaxy(i,j,k) + gyy(i,j,k) * betayy(i,j,k) + gyz(i,j,k) * betazy(i,j,k) )
-                                       gyy * betayx +   gyz * betazx   &
+
-                                                    -   gxy * betazz
+    gzz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Azz(i,j,k) - F2o3 * gzz(i,j,k) * divb_loc + &
-
+         TWO * ( gxz(i,j,k) * betaxz(i,j,k) + gyz(i,j,k) * betayz(i,j,k) + gzz(i,j,k) * betazz(i,j,k) )
-  gyz_rhs = - TWO * alpn1 * Ayz    +  F1o3 * gyz    * div_beta       + &
+
-                      gxy * betaxz +   gyy * betayz                  + &
+    gxy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axy(i,j,k) + F1o3 * gxy(i,j,k) * divb_loc + &
-                      gxz * betaxy                  +   gzz * betazy   &
+         gxx(i,j,k) * betaxy(i,j,k) + gxz(i,j,k) * betazy(i,j,k) + gyy(i,j,k) * betayx(i,j,k) + &
-                                                    -   gyz * betaxx
+         gyz(i,j,k) * betazx(i,j,k) - gxy(i,j,k) * betazz(i,j,k)
- 
+
-  gxz_rhs = - TWO * alpn1 * Axz    +  F1o3 * gxz    * div_beta       + &
+    gyz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayz(i,j,k) + F1o3 * gyz(i,j,k) * divb_loc + &
-                      gxx * betaxz +   gxy * betayz                  + &
+         gxy(i,j,k) * betaxz(i,j,k) + gyy(i,j,k) * betayz(i,j,k) + gxz(i,j,k) * betaxy(i,j,k) + &
-                                       gyz * betayx +   gzz * betazx   &
+         gzz(i,j,k) * betazy(i,j,k) - gyz(i,j,k) * betaxx(i,j,k)
-                                                    -   gxz * betayy     !rhs for gij
+
-
+    gxz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axz(i,j,k) + F1o3 * gxz(i,j,k) * divb_loc + &
-! invert tilted metric
+         gxx(i,j,k) * betaxz(i,j,k) + gxy(i,j,k) * betayz(i,j,k) + gyz(i,j,k) * betayx(i,j,k) + &
-  gupzz =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
+         gzz(i,j,k) * betazx(i,j,k) - gxz(i,j,k) * betayy(i,j,k)
-           gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
+
-  gupxx =   ( gyy * gzz - gyz * gyz ) / gupzz
+    det_loc = gxx(i,j,k) * gyy(i,j,k) * gzz(i,j,k) + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) + &
-  gupxy = - ( gxy * gzz - gyz * gxz ) / gupzz
+         gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) - gxz(i,j,k) * gyy(i,j,k) * gxz(i,j,k) - &
-  gupxz =   ( gxy * gyz - gyy * gxz ) / gupzz
+         gxy(i,j,k) * gxy(i,j,k) * gzz(i,j,k) - gxx(i,j,k) * gyz(i,j,k) * gyz(i,j,k)
-  gupyy =   ( gxx * gzz - gxz * gxz ) / gupzz
+    gupxx_loc = ( gyy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gyz(i,j,k) ) / det_loc
-  gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz
+    gupxy_loc = - ( gxy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gxz(i,j,k) ) / det_loc
-  gupzz =   ( gxx * gyy - gxy * gxy ) / gupzz
+    gupxz_loc = ( gxy(i,j,k) * gyz(i,j,k) - gyy(i,j,k) * gxz(i,j,k) ) / det_loc
-
+    gupyy_loc = ( gxx(i,j,k) * gzz(i,j,k) - gxz(i,j,k) * gxz(i,j,k) ) / det_loc
-  if(co == 0)then
+    gupyz_loc = - ( gxx(i,j,k) * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / det_loc
-! Gam^i_Res = Gam^i + gup^ij_,j
+    gupzz_loc = ( gxx(i,j,k) * gyy(i,j,k) - gxy(i,j,k) * gxy(i,j,k) ) / det_loc
-  Gmx_Res = Gamx - (gupxx*(gupxx*gxxx+gupxy*gxyx+gupxz*gxzx)&
+    gupxx(i,j,k) = gupxx_loc
-                   +gupxy*(gupxx*gxyx+gupxy*gyyx+gupxz*gyzx)&
+    gupxy(i,j,k) = gupxy_loc
-                   +gupxz*(gupxx*gxzx+gupxy*gyzx+gupxz*gzzx)&
+    gupxz(i,j,k) = gupxz_loc
-                   +gupxx*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
+    gupyy(i,j,k) = gupyy_loc
-                   +gupxy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+    gupyz(i,j,k) = gupyz_loc
-                   +gupxz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+    gupzz(i,j,k) = gupzz_loc
-                   +gupxx*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+
-                   +gupxy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+    if(co == 0)then
-                   +gupxz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
+      Gmx_Res(i,j,k) = Gamx(i,j,k) - ( &
-  Gmy_Res = Gamy - (gupxx*(gupxy*gxxx+gupyy*gxyx+gupyz*gxzx)&
+           gupxx_loc*(gupxx_loc*gxxx(i,j,k)+gupxy_loc*gxyx(i,j,k)+gupxz_loc*gxzx(i,j,k)) + &
-                   +gupxy*(gupxy*gxyx+gupyy*gyyx+gupyz*gyzx)&
+           gupxy_loc*(gupxx_loc*gxyx(i,j,k)+gupxy_loc*gyyx(i,j,k)+gupxz_loc*gyzx(i,j,k)) + &
-                   +gupxz*(gupxy*gxzx+gupyy*gyzx+gupyz*gzzx)&
+           gupxz_loc*(gupxx_loc*gxzx(i,j,k)+gupxy_loc*gyzx(i,j,k)+gupxz_loc*gzzx(i,j,k)) + &
-                   +gupxy*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
+           gupxx_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
-                   +gupyy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+           gupxy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
-                   +gupyz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+           gupxz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
-                   +gupxy*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+           gupxx_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
-                   +gupyy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+           gupxy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
-                   +gupyz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
+           gupxz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
-  Gmz_Res = Gamz - (gupxx*(gupxz*gxxx+gupyz*gxyx+gupzz*gxzx)&
+      Gmy_Res(i,j,k) = Gamy(i,j,k) - ( &
-                   +gupxy*(gupxz*gxyx+gupyz*gyyx+gupzz*gyzx)&
+           gupxx_loc*(gupxy_loc*gxxx(i,j,k)+gupyy_loc*gxyx(i,j,k)+gupyz_loc*gxzx(i,j,k)) + &
-                   +gupxz*(gupxz*gxzx+gupyz*gyzx+gupzz*gzzx)&
+           gupxy_loc*(gupxy_loc*gxyx(i,j,k)+gupyy_loc*gyyx(i,j,k)+gupyz_loc*gyzx(i,j,k)) + &
-                   +gupxy*(gupxz*gxxy+gupyz*gxyy+gupzz*gxzy)&
+           gupxz_loc*(gupxy_loc*gxzx(i,j,k)+gupyy_loc*gyzx(i,j,k)+gupyz_loc*gzzx(i,j,k)) + &
-                   +gupyy*(gupxz*gxyy+gupyz*gyyy+gupzz*gyzy)&
+           gupxy_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
-                   +gupyz*(gupxz*gxzy+gupyz*gyzy+gupzz*gzzy)&
+           gupyy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
-                   +gupxz*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+           gupyz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
-                   +gupyz*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+           gupxy_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
-                   +gupzz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
+           gupyy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
-  endif
+           gupyz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
-
+      Gmz_Res(i,j,k) = Gamz(i,j,k) - ( &
-! second kind of connection
+           gupxx_loc*(gupxz_loc*gxxx(i,j,k)+gupyz_loc*gxyx(i,j,k)+gupzz_loc*gxzx(i,j,k)) + &
-  Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz ))
+           gupxy_loc*(gupxz_loc*gxyx(i,j,k)+gupyz_loc*gyyx(i,j,k)+gupzz_loc*gyzx(i,j,k)) + &
-  Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz ))
+           gupxz_loc*(gupxz_loc*gxzx(i,j,k)+gupyz_loc*gyzx(i,j,k)+gupzz_loc*gzzx(i,j,k)) + &
-  Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz ))
+           gupxy_loc*(gupxz_loc*gxxy(i,j,k)+gupyz_loc*gxyy(i,j,k)+gupzz_loc*gxzy(i,j,k)) + &
- 
+           gupyy_loc*(gupxz_loc*gxyy(i,j,k)+gupyz_loc*gyyy(i,j,k)+gupzz_loc*gyzy(i,j,k)) + &
-  Gamxyy =HALF*( gupxx*(TWO*gxyy - gyyx ) + gupxy*gyyy + gupxz*(TWO*gyzy - gyyz ))
+           gupyz_loc*(gupxz_loc*gxzy(i,j,k)+gupyz_loc*gyzy(i,j,k)+gupzz_loc*gzzy(i,j,k)) + &
-  Gamyyy =HALF*( gupxy*(TWO*gxyy - gyyx ) + gupyy*gyyy + gupyz*(TWO*gyzy - gyyz ))
+           gupxz_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
-  Gamzyy =HALF*( gupxz*(TWO*gxyy - gyyx ) + gupyz*gyyy + gupzz*(TWO*gyzy - gyyz ))
+           gupyz_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
-
+           gupzz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
-  Gamxzz =HALF*( gupxx*(TWO*gxzz - gzzx ) + gupxy*(TWO*gyzz - gzzy ) + gupxz*gzzz)
+    endif
-  Gamyzz =HALF*( gupxy*(TWO*gxzz - gzzx ) + gupyy*(TWO*gyzz - gzzy ) + gupyz*gzzz)
+
-  Gamzzz =HALF*( gupxz*(TWO*gxzz - gzzx ) + gupyz*(TWO*gyzz - gzzy ) + gupzz*gzzz)
+    Gamxxx(i,j,k)=HALF*( gupxx_loc*gxxx(i,j,k) + gupxy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupxz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
-
+    Gamyxx(i,j,k)=HALF*( gupxy_loc*gxxx(i,j,k) + gupyy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupyz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
-  Gamxxy =HALF*( gupxx*gxxy + gupxy*gyyx + gupxz*( gxzy + gyzx - gxyz ) )
+    Gamzxx(i,j,k)=HALF*( gupxz_loc*gxxx(i,j,k) + gupyz_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupzz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
-  Gamyxy =HALF*( gupxy*gxxy + gupyy*gyyx + gupyz*( gxzy + gyzx - gxyz ) )
+
-  Gamzxy =HALF*( gupxz*gxxy + gupyz*gyyx + gupzz*( gxzy + gyzx - gxyz ) )
+    Gamxyy(i,j,k)=HALF*( gupxx_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupxy_loc*gyyy(i,j,k) + gupxz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
-
+    Gamyyy(i,j,k)=HALF*( gupxy_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyy_loc*gyyy(i,j,k) + gupyz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
-  Gamxxz =HALF*( gupxx*gxxz + gupxy*( gxyz + gyzx - gxzy ) + gupxz*gzzx )
+    Gamzyy(i,j,k)=HALF*( gupxz_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyz_loc*gyyy(i,j,k) + gupzz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
-  Gamyxz =HALF*( gupxy*gxxz + gupyy*( gxyz + gyzx - gxzy ) + gupyz*gzzx )
+
-  Gamzxz =HALF*( gupxz*gxxz + gupyz*( gxyz + gyzx - gxzy ) + gupzz*gzzx )
+    Gamxzz(i,j,k)=HALF*( gupxx_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupxy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupxz_loc*gzzz(i,j,k))
-
+    Gamyzz(i,j,k)=HALF*( gupxy_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupyz_loc*gzzz(i,j,k))
-  Gamxyz =HALF*( gupxx*( gxyz + gxzy - gyzx ) + gupxy*gyyz + gupxz*gzzy )
+    Gamzzz(i,j,k)=HALF*( gupxz_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyz_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupzz_loc*gzzz(i,j,k))
-  Gamyyz =HALF*( gupxy*( gxyz + gxzy - gyzx ) + gupyy*gyyz + gupyz*gzzy )
+
-  Gamzyz =HALF*( gupxz*( gxyz + gxzy - gyzx ) + gupyz*gyyz + gupzz*gzzy )
+    Gamxxy(i,j,k)=HALF*( gupxx_loc*gxxy(i,j,k) + gupxy_loc*gyyx(i,j,k) + gupxz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
-! Raise indices of \tilde A_{ij} and store in R_ij
+    Gamyxy(i,j,k)=HALF*( gupxy_loc*gxxy(i,j,k) + gupyy_loc*gyyx(i,j,k) + gupyz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
-
+    Gamzxy(i,j,k)=HALF*( gupxz_loc*gxxy(i,j,k) + gupyz_loc*gyyx(i,j,k) + gupzz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
-  Rxx =    gupxx * gupxx * Axx + gupxy * gupxy * Ayy + gupxz * gupxz * Azz + &
+
-      TWO*(gupxx * gupxy * Axy + gupxx * gupxz * Axz + gupxy * gupxz * Ayz)
+    Gamxxz(i,j,k)=HALF*( gupxx_loc*gxxz(i,j,k) + gupxy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupxz_loc*gzzx(i,j,k) )
-
+    Gamyxz(i,j,k)=HALF*( gupxy_loc*gxxz(i,j,k) + gupyy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupyz_loc*gzzx(i,j,k) )
-  Ryy =    gupxy * gupxy * Axx + gupyy * gupyy * Ayy + gupyz * gupyz * Azz + &
+    Gamzxz(i,j,k)=HALF*( gupxz_loc*gxxz(i,j,k) + gupyz_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupzz_loc*gzzx(i,j,k) )
-      TWO*(gupxy * gupyy * Axy + gupxy * gupyz * Axz + gupyy * gupyz * Ayz)
+
-
+    Gamxyz(i,j,k)=HALF*( gupxx_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupxy_loc*gyyz(i,j,k) + gupxz_loc*gzzy(i,j,k) )
-  Rzz =    gupxz * gupxz * Axx + gupyz * gupyz * Ayy + gupzz * gupzz * Azz + &
+    Gamyyz(i,j,k)=HALF*( gupxy_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyy_loc*gyyz(i,j,k) + gupyz_loc*gzzy(i,j,k) )
-      TWO*(gupxz * gupyz * Axy + gupxz * gupzz * Axz + gupyz * gupzz * Ayz)
+    Gamzyz(i,j,k)=HALF*( gupxz_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyz_loc*gyyz(i,j,k) + gupzz_loc*gzzy(i,j,k) )
-
+  enddo
-  Rxy =    gupxx * gupxy * Axx + gupxy * gupyy * Ayy + gupxz * gupyz * Azz + &
+  enddo
-          (gupxx * gupyy       + gupxy * gupxy)* Axy                       + &
+  enddo
-          (gupxx * gupyz       + gupxz * gupxy)* Axz                       + &
+! Raise indices of \tilde A_{ij} and store in R_ij
-          (gupxy * gupyz       + gupxz * gupyy)* Ayz
+
-
+! Right hand side for Gam^i without shift terms...
-  Rxz =    gupxx * gupxz * Axx + gupxy * gupyz * Ayy + gupxz * gupzz * Azz + &
+  call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
-          (gupxx * gupyz       + gupxy * gupxz)* Axy                       + &
+  call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
-          (gupxx * gupzz       + gupxz * gupxz)* Axz                       + &
+  do k=1,ex(3)
-          (gupxy * gupzz       + gupxz * gupyz)* Ayz
+  do j=1,ex(2)
-
+  do i=1,ex(1)
-  Ryz =    gupxy * gupxz * Axx + gupyy * gupyz * Ayy + gupyz * gupzz * Azz + &
+    gupxx_loc = gupxx(i,j,k)
-          (gupxy * gupyz       + gupyy * gupxz)* Axy                       + &
+    gupxy_loc = gupxy(i,j,k)
-          (gupxy * gupzz       + gupyz * gupxz)* Axz                       + &
+    gupxz_loc = gupxz(i,j,k)
-          (gupyy * gupzz       + gupyz * gupyz)* Ayz
+    gupyy_loc = gupyy(i,j,k)
-
+    gupyz_loc = gupyz(i,j,k)
-! Right hand side for Gam^i without shift terms...
+    gupzz_loc = gupzz(i,j,k)
-  call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
+
-  call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
+    Rxx_loc = gupxx_loc * gupxx_loc * Axx(i,j,k) + gupxy_loc * gupxy_loc * Ayy(i,j,k) + gupxz_loc * gupxz_loc * Azz(i,j,k) + &
-
+         TWO * (gupxx_loc * gupxy_loc * Axy(i,j,k) + gupxx_loc * gupxz_loc * Axz(i,j,k) + gupxy_loc * gupxz_loc * Ayz(i,j,k))
-   Gamx_rhs = - TWO * (   Lapx * Rxx +   Lapy * Rxy +   Lapz * Rxz ) + &
+    Ryy_loc = gupxy_loc * gupxy_loc * Axx(i,j,k) + gupyy_loc * gupyy_loc * Ayy(i,j,k) + gupyz_loc * gupyz_loc * Azz(i,j,k) + &
-        TWO * alpn1 * (                                                &
+         TWO * (gupxy_loc * gupyy_loc * Axy(i,j,k) + gupxy_loc * gupyz_loc * Axz(i,j,k) + gupyy_loc * gupyz_loc * Ayz(i,j,k))
-        -F3o2/chin1 * (   chix * Rxx +   chiy * Rxy +   chiz * Rxz ) - &
+    Rzz_loc = gupxz_loc * gupxz_loc * Axx(i,j,k) + gupyz_loc * gupyz_loc * Ayy(i,j,k) + gupzz_loc * gupzz_loc * Azz(i,j,k) + &
-              gupxx * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
+         TWO * (gupxz_loc * gupyz_loc * Axy(i,j,k) + gupxz_loc * gupzz_loc * Axz(i,j,k) + gupyz_loc * gupzz_loc * Ayz(i,j,k))
-              gupxy * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
+    Rxy_loc = gupxx_loc * gupxy_loc * Axx(i,j,k) + gupxy_loc * gupyy_loc * Ayy(i,j,k) + gupxz_loc * gupyz_loc * Azz(i,j,k) + &
-              gupxz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
+         (gupxx_loc * gupyy_loc + gupxy_loc * gupxy_loc) * Axy(i,j,k) + &
-                        Gamxxx * Rxx + Gamxyy * Ryy + Gamxzz * Rzz   + &
+         (gupxx_loc * gupyz_loc + gupxz_loc * gupxy_loc) * Axz(i,j,k) + &
-                TWO * ( Gamxxy * Rxy + Gamxxz * Rxz + Gamxyz * Ryz ) )
+         (gupxy_loc * gupyz_loc + gupxz_loc * gupyy_loc) * Ayz(i,j,k)
-
+    Rxz_loc = gupxx_loc * gupxz_loc * Axx(i,j,k) + gupxy_loc * gupyz_loc * Ayy(i,j,k) + gupxz_loc * gupzz_loc * Azz(i,j,k) + &
-   Gamy_rhs = - TWO * (   Lapx * Rxy +   Lapy * Ryy +   Lapz * Ryz ) + &
+         (gupxx_loc * gupyz_loc + gupxy_loc * gupxz_loc) * Axy(i,j,k) + &
-        TWO * alpn1 * (                                                &
+         (gupxx_loc * gupzz_loc + gupxz_loc * gupxz_loc) * Axz(i,j,k) + &
-        -F3o2/chin1 * (   chix * Rxy +  chiy * Ryy +    chiz * Ryz ) - &
+         (gupxy_loc * gupzz_loc + gupxz_loc * gupyz_loc) * Ayz(i,j,k)
-              gupxy * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
+    Ryz_loc = gupxy_loc * gupxz_loc * Axx(i,j,k) + gupyy_loc * gupyz_loc * Ayy(i,j,k) + gupyz_loc * gupzz_loc * Azz(i,j,k) + &
-              gupyy * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
+         (gupxy_loc * gupyz_loc + gupyy_loc * gupxz_loc) * Axy(i,j,k) + &
-              gupyz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
+         (gupxy_loc * gupzz_loc + gupyz_loc * gupxz_loc) * Axz(i,j,k) + &
-                        Gamyxx * Rxx + Gamyyy * Ryy + Gamyzz * Rzz   + &
+         (gupyy_loc * gupzz_loc + gupyz_loc * gupyz_loc) * Ayz(i,j,k)
-                TWO * ( Gamyxy * Rxy + Gamyxz * Rxz + Gamyyz * Ryz ) )
+    Rxx(i,j,k) = Rxx_loc
-
+    Ryy(i,j,k) = Ryy_loc
-   Gamz_rhs = - TWO * (   Lapx * Rxz +   Lapy * Ryz +   Lapz * Rzz ) + &
+    Rzz(i,j,k) = Rzz_loc
-        TWO * alpn1 * (                                                &
+    Rxy(i,j,k) = Rxy_loc
-        -F3o2/chin1 * (   chix * Rxz +  chiy * Ryz +    chiz * Rzz ) - &
+    Rxz(i,j,k) = Rxz_loc
-              gupxz * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
+    Ryz(i,j,k) = Ryz_loc
-              gupyz * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
+
-              gupzz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
+    Gamx_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxx_loc + Lapy(i,j,k) * Rxy_loc + Lapz(i,j,k) * Rxz_loc) + &
-                        Gamzxx * Rxx + Gamzyy * Ryy + Gamzzz * Rzz   + &
+         TWO * alpn1(i,j,k) * ( &
-                TWO * ( Gamzxy * Rxy + Gamzxz * Rxz + Gamzyz * Ryz ) )
+         -F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxx_loc + chiy(i,j,k) * Rxy_loc + chiz(i,j,k) * Rxz_loc) - &
         gupxx_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
         gupxy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
         gupxz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
         Gamxxx(i,j,k) * Rxx_loc + Gamxyy(i,j,k) * Ryy_loc + Gamxzz(i,j,k) * Rzz_loc + &
         TWO * (Gamxxy(i,j,k) * Rxy_loc + Gamxxz(i,j,k) * Rxz_loc + Gamxyz(i,j,k) * Ryz_loc))
    Gamy_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxy_loc + Lapy(i,j,k) * Ryy_loc + Lapz(i,j,k) * Ryz_loc) + &
         TWO * alpn1(i,j,k) * ( &
         -F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxy_loc + chiy(i,j,k) * Ryy_loc + chiz(i,j,k) * Ryz_loc) - &
         gupxy_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
         gupyy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
         gupyz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
         Gamyxx(i,j,k) * Rxx_loc + Gamyyy(i,j,k) * Ryy_loc + Gamyzz(i,j,k) * Rzz_loc + &
         TWO * (Gamyxy(i,j,k) * Rxy_loc + Gamyxz(i,j,k) * Rxz_loc + Gamyyz(i,j,k) * Ryz_loc))
    Gamz_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxz_loc + Lapy(i,j,k) * Ryz_loc + Lapz(i,j,k) * Rzz_loc) + &
         TWO * alpn1(i,j,k) * ( &
         -F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxz_loc + chiy(i,j,k) * Ryz_loc + chiz(i,j,k) * Rzz_loc) - &
         gupxz_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
         gupyz_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
         gupzz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
         Gamzxx(i,j,k) * Rxx_loc + Gamzyy(i,j,k) * Ryy_loc + Gamzzz(i,j,k) * Rzz_loc + &
         TWO * (Gamzxy(i,j,k) * Rxy_loc + Gamzxz(i,j,k) * Rxz_loc + Gamzyz(i,j,k) * Ryz_loc))
  enddo
  enddo
  enddo
  call fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,&
                X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev)
@@ -321,38 +359,54 @@
  call fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,&
                X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev)
-  fxx = gxxx + gxyy + gxzz
+  call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev)
-  fxy = gxyx + gyyy + gyzz
+  call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
-  fxz = gxzx + gyzy + gzzz
+  call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
-
+  do k=1,ex(3)
-  Gamxa =       gupxx * Gamxxx + gupyy * Gamxyy + gupzz * Gamxzz + &
+  do j=1,ex(2)
-          TWO*( gupxy * Gamxxy + gupxz * Gamxxz + gupyz * Gamxyz )
+  do i=1,ex(1)
-  Gamya =       gupxx * Gamyxx + gupyy * Gamyyy + gupzz * Gamyzz + &
+    divb_loc = div_beta(i,j,k)
-          TWO*( gupxy * Gamyxy + gupxz * Gamyxz + gupyz * Gamyyz )
+    fxx_loc = gxxx(i,j,k) + gxyy(i,j,k) + gxzz(i,j,k)
-  Gamza =       gupxx * Gamzxx + gupyy * Gamzyy + gupzz * Gamzzz + &
+    fxy_loc = gxyx(i,j,k) + gyyy(i,j,k) + gyzz(i,j,k)
-          TWO*( gupxy * Gamzxy + gupxz * Gamzxz + gupyz * Gamzyz )
+    fxz_loc = gxzx(i,j,k) + gyzy(i,j,k) + gzzz(i,j,k)
-
+
-  call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev)
+    gupxx_loc = gupxx(i,j,k)
-  call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
+    gupxy_loc = gupxy(i,j,k)
-  call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
+    gupxz_loc = gupxz(i,j,k)
-
+    gupyy_loc = gupyy(i,j,k)
-  Gamx_rhs =               Gamx_rhs +  F2o3 *  Gamxa * div_beta        - &
+    gupyz_loc = gupyz(i,j,k)
-                     Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz  + &
+    gupzz_loc = gupzz(i,j,k)
-             F1o3 * (gupxx * fxx    + gupxy * fxy    + gupxz * fxz    ) + &
+
-                     gupxx * gxxx   + gupyy * gyyx   + gupzz * gzzx    + &
+    Gamxa_loc = gupxx_loc * Gamxxx(i,j,k) + gupyy_loc * Gamxyy(i,j,k) + gupzz_loc * Gamxzz(i,j,k) + &
-              TWO * (gupxy * gxyx   + gupxz * gxzx   + gupyz * gyzx  )
+         TWO * (gupxy_loc * Gamxxy(i,j,k) + gupxz_loc * Gamxxz(i,j,k) + gupyz_loc * Gamxyz(i,j,k))
-
+    Gamya_loc = gupxx_loc * Gamyxx(i,j,k) + gupyy_loc * Gamyyy(i,j,k) + gupzz_loc * Gamyzz(i,j,k) + &
-  Gamy_rhs =               Gamy_rhs +  F2o3 *  Gamya * div_beta        - &
+         TWO * (gupxy_loc * Gamyxy(i,j,k) + gupxz_loc * Gamyxz(i,j,k) + gupyz_loc * Gamyyz(i,j,k))
-                     Gamxa * betayx - Gamya * betayy - Gamza * betayz  + &
+    Gamza_loc = gupxx_loc * Gamzxx(i,j,k) + gupyy_loc * Gamzyy(i,j,k) + gupzz_loc * Gamzzz(i,j,k) + &
-             F1o3 * (gupxy * fxx    + gupyy * fxy    + gupyz * fxz    ) + &
+         TWO * (gupxy_loc * Gamzxy(i,j,k) + gupxz_loc * Gamzxz(i,j,k) + gupyz_loc * Gamzyz(i,j,k))
-                     gupxx * gxxy   + gupyy * gyyy   + gupzz * gzzy    + &
+    Gamxa(i,j,k) = Gamxa_loc
-              TWO * (gupxy * gxyy   + gupxz * gxzy   + gupyz * gyzy  )
+    Gamya(i,j,k) = Gamya_loc
-
+    Gamza(i,j,k) = Gamza_loc
-  Gamz_rhs =               Gamz_rhs +  F2o3 *  Gamza * div_beta        - &
+
-                     Gamxa * betazx - Gamya * betazy - Gamza * betazz  + &
+    Gamx_rhs(i,j,k) = Gamx_rhs(i,j,k) + F2o3 * Gamxa_loc * divb_loc - &
-             F1o3 * (gupxz * fxx    + gupyz * fxy    + gupzz * fxz    ) + &
+         Gamxa_loc * betaxx(i,j,k) - Gamya_loc * betaxy(i,j,k) - Gamza_loc * betaxz(i,j,k) + &
-                     gupxx * gxxz   + gupyy * gyyz   + gupzz * gzzz    + &
+         F1o3 * (gupxx_loc * fxx_loc + gupxy_loc * fxy_loc + gupxz_loc * fxz_loc) + &
-              TWO * (gupxy * gxyz   + gupxz * gxzz   + gupyz * gyzz  )    !rhs for Gam^i
+         gupxx_loc * gxxx(i,j,k) + gupyy_loc * gyyx(i,j,k) + gupzz_loc * gzzx(i,j,k) + &
         TWO * (gupxy_loc * gxyx(i,j,k) + gupxz_loc * gxzx(i,j,k) + gupyz_loc * gyzx(i,j,k))
    Gamy_rhs(i,j,k) = Gamy_rhs(i,j,k) + F2o3 * Gamya_loc * divb_loc - &
         Gamxa_loc * betayx(i,j,k) - Gamya_loc * betayy(i,j,k) - Gamza_loc * betayz(i,j,k) + &
         F1o3 * (gupxy_loc * fxx_loc + gupyy_loc * fxy_loc + gupyz_loc * fxz_loc) + &
         gupxx_loc * gxxy(i,j,k) + gupyy_loc * gyyy(i,j,k) + gupzz_loc * gzzy(i,j,k) + &
         TWO * (gupxy_loc * gxyy(i,j,k) + gupxz_loc * gxzy(i,j,k) + gupyz_loc * gyzy(i,j,k))
    Gamz_rhs(i,j,k) = Gamz_rhs(i,j,k) + F2o3 * Gamza_loc * divb_loc - &
         Gamxa_loc * betazx(i,j,k) - Gamya_loc * betazy(i,j,k) - Gamza_loc * betazz(i,j,k) + &
         F1o3 * (gupxz_loc * fxx_loc + gupyz_loc * fxy_loc + gupzz_loc * fxz_loc) + &
         gupxx_loc * gxxz(i,j,k) + gupyy_loc * gyyz(i,j,k) + gupzz_loc * gzzz(i,j,k) + &
         TWO * (gupxy_loc * gxyz(i,j,k) + gupxz_loc * gxzz(i,j,k) + gupyz_loc * gyzz(i,j,k))
  enddo
  enddo
  enddo
 !first kind of connection stored in gij,k
  gxxx = gxx * Gamxxx + gxy * Gamyxx + gxz * Gamzxx
@@ -601,192 +655,190 @@
            Gamxyz * gxzz + Gamyyz * gyzz + Gamzyz * gzzz  + &
            Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy  + &
            Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz )
-!covariant second derivative of chi respect to tilted metric
+!covariant second derivative of chi with respect to the tilde (conformal) metric
-  call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
+  call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
-
+
-  fxx = fxx - Gamxxx * chix - Gamyxx * chiy - Gamzxx * chiz
+  do k=1,ex(3)
-  fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz
+  do j=1,ex(2)
-  fxz = fxz - Gamxxz * chix - Gamyxz * chiy - Gamzxz * chiz
+  do i=1,ex(1)
-  fyy = fyy - Gamxyy * chix - Gamyyy * chiy - Gamzyy * chiz
+    fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k) * chix(i,j,k) - Gamyxx(i,j,k) * chiy(i,j,k) - Gamzxx(i,j,k) * chiz(i,j,k)
-  fyz = fyz - Gamxyz * chix - Gamyyz * chiy - Gamzyz * chiz
+    fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k) * chix(i,j,k) - Gamyxy(i,j,k) * chiy(i,j,k) - Gamzxy(i,j,k) * chiz(i,j,k)
-  fzz = fzz - Gamxzz * chix - Gamyzz * chiy - Gamzzz * chiz
+    fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k) * chix(i,j,k) - Gamyxz(i,j,k) * chiy(i,j,k) - Gamzxz(i,j,k) * chiz(i,j,k)
-! Store D^l D_l chi - 3/(2*chi) D^l chi D_l chi in f
+    fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k) * chix(i,j,k) - Gamyyy(i,j,k) * chiy(i,j,k) - Gamzyy(i,j,k) * chiz(i,j,k)
-
+    fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k) * chix(i,j,k) - Gamyyz(i,j,k) * chiy(i,j,k) - Gamzyz(i,j,k) * chiz(i,j,k)
-  f =        gupxx * ( fxx - F3o2/chin1 * chix * chix ) + &
+    fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k) * chix(i,j,k) - Gamyzz(i,j,k) * chiy(i,j,k) - Gamzzz(i,j,k) * chiz(i,j,k)
-             gupyy * ( fyy - F3o2/chin1 * chiy * chiy ) + &
+
-             gupzz * ( fzz - F3o2/chin1 * chiz * chiz ) + &
+    chin_loc = chin1(i,j,k)
-       TWO * gupxy * ( fxy - F3o2/chin1 * chix * chiy ) + &
+    f_loc = gupxx(i,j,k) * (fxx(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chix(i,j,k)) + &
-       TWO * gupxz * ( fxz - F3o2/chin1 * chix * chiz ) + &
+            gupyy(i,j,k) * (fyy(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiy(i,j,k)) + &
-       TWO * gupyz * ( fyz - F3o2/chin1 * chiy * chiz ) 
+            gupzz(i,j,k) * (fzz(i,j,k) - F3o2/chin_loc * chiz(i,j,k) * chiz(i,j,k)) + &
-! Add chi part to Ricci tensor:
+            TWO * gupxy(i,j,k) * (fxy(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiy(i,j,k)) + &
-
+            TWO * gupxz(i,j,k) * (fxz(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiz(i,j,k)) + &
-  Rxx = Rxx + (fxx - chix*chix/chin1/TWO + gxx * f)/chin1/TWO
+            TWO * gupyz(i,j,k) * (fyz(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiz(i,j,k))
-  Ryy = Ryy + (fyy - chiy*chiy/chin1/TWO + gyy * f)/chin1/TWO
+    f(i,j,k) = f_loc
-  Rzz = Rzz + (fzz - chiz*chiz/chin1/TWO + gzz * f)/chin1/TWO
+
-  Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO
+    Rxx(i,j,k) = Rxx(i,j,k) + (fxx(i,j,k) - chix(i,j,k)*chix(i,j,k)/chin_loc/TWO + gxx(i,j,k) * f_loc)/chin_loc/TWO
-  Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO
+    Ryy(i,j,k) = Ryy(i,j,k) + (fyy(i,j,k) - chiy(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gyy(i,j,k) * f_loc)/chin_loc/TWO
-  Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO
+    Rzz(i,j,k) = Rzz(i,j,k) + (fzz(i,j,k) - chiz(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gzz(i,j,k) * f_loc)/chin_loc/TWO
-
+    Rxy(i,j,k) = Rxy(i,j,k) + (fxy(i,j,k) - chix(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gxy(i,j,k) * f_loc)/chin_loc/TWO
-! covariant second derivatives of the lapse respect to physical metric
+    Rxz(i,j,k) = Rxz(i,j,k) + (fxz(i,j,k) - chix(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gxz(i,j,k) * f_loc)/chin_loc/TWO
-  call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
+    Ryz(i,j,k) = Ryz(i,j,k) + (fyz(i,j,k) - chiy(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gyz(i,j,k) * f_loc)/chin_loc/TWO
-                SYM,SYM,SYM,symmetry,Lev)
+  enddo
-
+  enddo
-  gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
+  enddo
-  gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
+
-  gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1
+! covariant second derivatives of the lapse with respect to the physical metric
-! now get physical second kind of connection
+  call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
-  Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF
+                SYM,SYM,SYM,symmetry,Lev)
-  Gamyxx = Gamyxx - (                     - gxx * gxxy )*HALF
+
-  Gamzxx = Gamzxx - (                     - gxx * gxxz )*HALF
+  do k=1,ex(3)
-  Gamxyy = Gamxyy - (                     - gyy * gxxx )*HALF
+  do j=1,ex(2)
-  Gamyyy = Gamyyy - ( (chiy + chiy)/chin1 - gyy * gxxy )*HALF
+  do i=1,ex(1)
-  Gamzyy = Gamzyy - (                     - gyy * gxxz )*HALF
+    chin_loc = chin1(i,j,k)
-  Gamxzz = Gamxzz - (                     - gzz * gxxx )*HALF
+    gxxx(i,j,k) = (gupxx(i,j,k) * chix(i,j,k) + gupxy(i,j,k) * chiy(i,j,k) + gupxz(i,j,k) * chiz(i,j,k)) / chin_loc
-  Gamyzz = Gamyzz - (                     - gzz * gxxy )*HALF
+    gxxy(i,j,k) = (gupxy(i,j,k) * chix(i,j,k) + gupyy(i,j,k) * chiy(i,j,k) + gupyz(i,j,k) * chiz(i,j,k)) / chin_loc
-  Gamzzz = Gamzzz - ( (chiz + chiz)/chin1 - gzz * gxxz )*HALF
+    gxxz(i,j,k) = (gupxz(i,j,k) * chix(i,j,k) + gupyz(i,j,k) * chiy(i,j,k) + gupzz(i,j,k) * chiz(i,j,k)) / chin_loc
-  Gamxxy = Gamxxy - (  chiy        /chin1 - gxy * gxxx )*HALF
+
-  Gamyxy = Gamyxy - (         chix /chin1 - gxy * gxxy )*HALF
+    Gamxxx(i,j,k) = Gamxxx(i,j,k) - ( (chix(i,j,k) + chix(i,j,k))/chin_loc - gxx(i,j,k) * gxxx(i,j,k) )*HALF
-  Gamzxy = Gamzxy - (                     - gxy * gxxz )*HALF
+    Gamyxx(i,j,k) = Gamyxx(i,j,k) - (                                   - gxx(i,j,k) * gxxy(i,j,k) )*HALF
-  Gamxxz = Gamxxz - (  chiz        /chin1 - gxz * gxxx )*HALF
+    Gamzxx(i,j,k) = Gamzxx(i,j,k) - (                                   - gxx(i,j,k) * gxxz(i,j,k) )*HALF
-  Gamyxz = Gamyxz - (                     - gxz * gxxy )*HALF
+    Gamxyy(i,j,k) = Gamxyy(i,j,k) - (                                   - gyy(i,j,k) * gxxx(i,j,k) )*HALF
-  Gamzxz = Gamzxz - (         chix /chin1 - gxz * gxxz )*HALF
+    Gamyyy(i,j,k) = Gamyyy(i,j,k) - ( (chiy(i,j,k) + chiy(i,j,k))/chin_loc - gyy(i,j,k) * gxxy(i,j,k) )*HALF
-  Gamxyz = Gamxyz - (                     - gyz * gxxx )*HALF
+    Gamzyy(i,j,k) = Gamzyy(i,j,k) - (                                   - gyy(i,j,k) * gxxz(i,j,k) )*HALF
-  Gamyyz = Gamyyz - (  chiz        /chin1 - gyz * gxxy )*HALF
+    Gamxzz(i,j,k) = Gamxzz(i,j,k) - (                                   - gzz(i,j,k) * gxxx(i,j,k) )*HALF
-  Gamzyz = Gamzyz - (         chiy /chin1 - gyz * gxxz )*HALF
+    Gamyzz(i,j,k) = Gamyzz(i,j,k) - (                                   - gzz(i,j,k) * gxxy(i,j,k) )*HALF
-
+    Gamzzz(i,j,k) = Gamzzz(i,j,k) - ( (chiz(i,j,k) + chiz(i,j,k))/chin_loc - gzz(i,j,k) * gxxz(i,j,k) )*HALF
-  fxx = fxx - Gamxxx*Lapx - Gamyxx*Lapy - Gamzxx*Lapz
+    Gamxxy(i,j,k) = Gamxxy(i,j,k) - ( chiy(i,j,k) /chin_loc - gxy(i,j,k) * gxxx(i,j,k) )*HALF
-  fyy = fyy - Gamxyy*Lapx - Gamyyy*Lapy - Gamzyy*Lapz
+    Gamyxy(i,j,k) = Gamyxy(i,j,k) - ( chix(i,j,k) /chin_loc - gxy(i,j,k) * gxxy(i,j,k) )*HALF
-  fzz = fzz - Gamxzz*Lapx - Gamyzz*Lapy - Gamzzz*Lapz
+    Gamzxy(i,j,k) = Gamzxy(i,j,k) - (                     - gxy(i,j,k) * gxxz(i,j,k) )*HALF
-  fxy = fxy - Gamxxy*Lapx - Gamyxy*Lapy - Gamzxy*Lapz
+    Gamxxz(i,j,k) = Gamxxz(i,j,k) - ( chiz(i,j,k) /chin_loc - gxz(i,j,k) * gxxx(i,j,k) )*HALF
-  fxz = fxz - Gamxxz*Lapx - Gamyxz*Lapy - Gamzxz*Lapz
+    Gamyxz(i,j,k) = Gamyxz(i,j,k) - (                     - gxz(i,j,k) * gxxy(i,j,k) )*HALF
-  fyz = fyz - Gamxyz*Lapx - Gamyyz*Lapy - Gamzyz*Lapz
+    Gamzxz(i,j,k) = Gamzxz(i,j,k) - ( chix(i,j,k) /chin_loc - gxz(i,j,k) * gxxz(i,j,k) )*HALF
-
+    Gamxyz(i,j,k) = Gamxyz(i,j,k) - (                     - gyz(i,j,k) * gxxx(i,j,k) )*HALF
-! store D^i D_i Lap in trK_rhs upto chi
+    Gamyyz(i,j,k) = Gamyyz(i,j,k) - ( chiz(i,j,k) /chin_loc - gyz(i,j,k) * gxxy(i,j,k) )*HALF
-  trK_rhs =    gupxx * fxx + gupyy * fyy + gupzz * fzz + &
+    Gamzyz(i,j,k) = Gamzyz(i,j,k) - ( chiy(i,j,k) /chin_loc - gyz(i,j,k) * gxxz(i,j,k) )*HALF
-        TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz )
+
-#if 1        
+    fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k)*Lapx(i,j,k) - Gamyxx(i,j,k)*Lapy(i,j,k) - Gamzxx(i,j,k)*Lapz(i,j,k)
-!! follow bam code
+    fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k)*Lapx(i,j,k) - Gamyyy(i,j,k)*Lapy(i,j,k) - Gamzyy(i,j,k)*Lapz(i,j,k)
-  S =  chin1 * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + &
+    fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k)*Lapx(i,j,k) - Gamyzz(i,j,k)*Lapy(i,j,k) - Gamzzz(i,j,k)*Lapz(i,j,k)
-     TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
+    fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k)*Lapx(i,j,k) - Gamyxy(i,j,k)*Lapy(i,j,k) - Gamzxy(i,j,k)*Lapz(i,j,k)
-  f = F2o3 * trK * trK -(&
+    fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k)*Lapx(i,j,k) - Gamyxz(i,j,k)*Lapy(i,j,k) - Gamzxz(i,j,k)*Lapz(i,j,k)
-       gupxx * ( &
+    fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k)*Lapx(i,j,k) - Gamyyz(i,j,k)*Lapy(i,j,k) - Gamzyz(i,j,k)*Lapz(i,j,k)
-       gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
+
-       TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz) ) + &
+    trK_rhs(i,j,k) = gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
-       gupyy * ( &
+                     TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k))
-       gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + &
+  enddo
-       TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz) ) + &
+  enddo
-       gupzz * ( &
+  enddo
-       gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + &
+  do k=1,ex(3)
-       TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz) ) + &
+  do j=1,ex(2)
-       TWO * ( &
+  do i=1,ex(1)
-       gupxy * ( &
+    divb_loc = div_beta(i,j,k)
-       gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
+    chin_loc = chin1(i,j,k)
-       gupxy * (Axx * Ayy + Axy * Axy) + &
+
-       gupxz * (Axx * Ayz + Axz * Axy) + &
+    S_loc = chin_loc * ( gupxx(i,j,k) * Sxx(i,j,k) + gupyy(i,j,k) * Syy(i,j,k) + gupzz(i,j,k) * Szz(i,j,k) + &
-       gupyz * (Axy * Ayz + Axz * Ayy) ) + &
+           TWO * (gupxy(i,j,k) * Sxy(i,j,k) + gupxz(i,j,k) * Sxz(i,j,k) + gupyz(i,j,k) * Syz(i,j,k)) )
-       gupxz * ( &
+    S(i,j,k) = S_loc
-       gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
+
-       gupxy * (Axx * Ayz + Axy * Axz) + &
+    f_loc = F2o3 * trK(i,j,k) * trK(i,j,k) - ( &
-       gupxz * (Axx * Azz + Axz * Axz) + &
+            gupxx(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + &
-       gupyz * (Axy * Azz + Axz * Ayz) ) + &
+                             gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + &
-       gupyz * ( &
+                             TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + &
-       gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + &
+                                    gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k)) ) + &
-       gupxy * (Axy * Ayz + Ayy * Axz) + &
+            gupyy(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + &
-       gupxz * (Axy * Azz + Ayz * Axz) + &
+                             gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
-       gupyz * (Ayy * Azz + Ayz * Ayz) ) )) -1.6d1*PI*rho + EIGHT * PI * S
+                             TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
-  f = - F1o3 *(  gupxx * fxx + gupyy * fyy + gupzz * fzz + &
+                                    gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k)) ) + &
-        TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1/chin1*f)
+            gupzz(i,j,k) * ( gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
-  
+                             gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + &
-  fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
+                             TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
-  fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
+                                    gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k)) ) + &
-  fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz
+            TWO * ( gupxy(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
-  fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy
+                                     gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + &
-  fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz
+                                     gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + &
-  fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz
+                                     gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
-#else        
+                                     gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k)) ) + &
-! Add lapse and S_ij parts to Ricci tensor:
+                    gupxz(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
-
+                                     gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
-  fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
+                                     gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + &
-  fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
+                                     gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
-  fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz
+                                     gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k)) ) + &
-  fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy
+                    gupyz(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + &
-  fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz
+                                     gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + &
-  fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz
+                                     gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
-
+                                     gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
-! Compute trace-free part (note: chi^-1 and chi cancel!):
+                                     gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k)) ) ) ) - &
-
+            F16 * PI * rho(i,j,k) + EIGHT * PI * S_loc
-  f = F1o3 *(  gupxx * fxx + gupyy * fyy + gupzz * fzz + &
+
-        TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) )
+    f_loc = -F1o3 * ( gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
-#endif
+            TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + &
-
+            alpn1(i,j,k)/chin_loc * f_loc )
-  Axx_rhs = fxx - gxx * f
+    f(i,j,k) = f_loc
-  Ayy_rhs = fyy - gyy * f
+
-  Azz_rhs = fzz - gzz * f
+    l_fxx = alpn1(i,j,k) * (Rxx(i,j,k) - EIGHT * PI * Sxx(i,j,k)) - fxx(i,j,k)
-  Axy_rhs = fxy - gxy * f
+    l_fxy = alpn1(i,j,k) * (Rxy(i,j,k) - EIGHT * PI * Sxy(i,j,k)) - fxy(i,j,k)
-  Axz_rhs = fxz - gxz * f
+    l_fxz = alpn1(i,j,k) * (Rxz(i,j,k) - EIGHT * PI * Sxz(i,j,k)) - fxz(i,j,k)
-  Ayz_rhs = fyz - gyz * f
+    l_fyy = alpn1(i,j,k) * (Ryy(i,j,k) - EIGHT * PI * Syy(i,j,k)) - fyy(i,j,k)
-
+    l_fyz = alpn1(i,j,k) * (Ryz(i,j,k) - EIGHT * PI * Syz(i,j,k)) - fyz(i,j,k)
-! Now: store A_il A^l_j into fij:
+    l_fzz = alpn1(i,j,k) * (Rzz(i,j,k) - EIGHT * PI * Szz(i,j,k)) - fzz(i,j,k)
-
+
-  fxx =       gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
+    Axx_rhs(i,j,k) = l_fxx - gxx(i,j,k) * f_loc
-       TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz)
+    Ayy_rhs(i,j,k) = l_fyy - gyy(i,j,k) * f_loc
-  fyy =       gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + &
+    Azz_rhs(i,j,k) = l_fzz - gzz(i,j,k) * f_loc
-       TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz)
+    Axy_rhs(i,j,k) = l_fxy - gxy(i,j,k) * f_loc
-  fzz =       gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + &
+    Axz_rhs(i,j,k) = l_fxz - gxz(i,j,k) * f_loc
-       TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz)
+    Ayz_rhs(i,j,k) = l_fyz - gyz(i,j,k) * f_loc
-  fxy =       gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
+
-              gupxy *(Axx * Ayy + Axy * Axy)                            + &
+    fxx(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + &
-              gupxz *(Axx * Ayz + Axz * Axy)                            + &
+                 gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + &
-              gupyz *(Axy * Ayz + Axz * Ayy)
+                 gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k))
-  fxz =       gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
+    fyy(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + &
-              gupxy *(Axx * Ayz + Axy * Axz)                            + &
+                 gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
-              gupxz *(Axx * Azz + Axz * Axz)                            + &
+                 gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k))
-              gupyz *(Axy * Azz + Axz * Ayz)
+    fzz(i,j,k) = gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
-  fyz =       gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + &
+                 gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + &
-              gupxy *(Axy * Ayz + Ayy * Axz)                            + &
+                 gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k))
-              gupxz *(Axy * Azz + Ayz * Axz)                            + &
+    fxy(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
-              gupyz *(Ayy * Azz + Ayz * Ayz)
+                 gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + &
-
+                 gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
-  f = chin1
+                 gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k))
-! store D^i D_i Lap in trK_rhs
+    fxz(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
-  trK_rhs = f*trK_rhs
+                 gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + &
-          
+                 gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
-  Axx_rhs =           f * Axx_rhs+ alpn1 * (trK * Axx - TWO * fxx)  + &
+                 gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k))
-           TWO * (  Axx * betaxx +   Axy * betayx +   Axz * betazx )- &
+    fyz(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + &
-             F2o3 * Axx * div_beta
+                 gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
-
+                 gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
-  Ayy_rhs =           f * Ayy_rhs+ alpn1 * (trK * Ayy - TWO * fyy)  + &
+                 gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k))
-           TWO * (  Axy * betaxy +   Ayy * betayy +   Ayz * betazy )- &
+
-             F2o3 * Ayy * div_beta
+    trK_rhs(i,j,k) = chin_loc * trK_rhs(i,j,k)
-
+
-  Azz_rhs =           f * Azz_rhs+ alpn1 * (trK * Azz - TWO * fzz)  + &
+    Axx_rhs(i,j,k) = chin_loc * Axx_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axx(i,j,k) - TWO * fxx(i,j,k)) + &
-           TWO * (  Axz * betaxz +   Ayz * betayz +   Azz * betazz )- &
+                     TWO * (Axx(i,j,k) * betaxx(i,j,k) + Axy(i,j,k) * betayx(i,j,k) + Axz(i,j,k) * betazx(i,j,k)) - &
-             F2o3 * Azz * div_beta
+                     F2o3 * Axx(i,j,k) * divb_loc
-
+    Ayy_rhs(i,j,k) = chin_loc * Ayy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayy(i,j,k) - TWO * fyy(i,j,k)) + &
-  Axy_rhs =           f * Axy_rhs+ alpn1 *( trK * Axy  - TWO * fxy )+ &
+                     TWO * (Axy(i,j,k) * betaxy(i,j,k) + Ayy(i,j,k) * betayy(i,j,k) + Ayz(i,j,k) * betazy(i,j,k)) - &
-                    Axx * betaxy                  +   Axz * betazy  + &
+                     F2o3 * Ayy(i,j,k) * divb_loc
-                                     Ayy * betayx +   Ayz * betazx  + &
+    Azz_rhs(i,j,k) = chin_loc * Azz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Azz(i,j,k) - TWO * fzz(i,j,k)) + &
-             F1o3 * Axy * div_beta                -   Axy * betazz
+                     TWO * (Axz(i,j,k) * betaxz(i,j,k) + Ayz(i,j,k) * betayz(i,j,k) + Azz(i,j,k) * betazz(i,j,k)) - &
-
+                     F2o3 * Azz(i,j,k) * divb_loc
-  Ayz_rhs =           f * Ayz_rhs+ alpn1 *( trK * Ayz  - TWO * fyz )+ &
+    Axy_rhs(i,j,k) = chin_loc * Axy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axy(i,j,k) - TWO * fxy(i,j,k)) + &
-                    Axy * betaxz +   Ayy * betayz                   + &
+                     Axx(i,j,k) * betaxy(i,j,k) + Axz(i,j,k) * betazy(i,j,k) + Ayy(i,j,k) * betayx(i,j,k) + &
-                    Axz * betaxy                  +   Azz * betazy  + &
+                     Ayz(i,j,k) * betazx(i,j,k) + F1o3 * Axy(i,j,k) * divb_loc - Axy(i,j,k) * betazz(i,j,k)
-             F1o3 * Ayz * div_beta                -   Ayz * betaxx
+    Ayz_rhs(i,j,k) = chin_loc * Ayz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayz(i,j,k) - TWO * fyz(i,j,k)) + &
- 
+                     Axy(i,j,k) * betaxz(i,j,k) + Ayy(i,j,k) * betayz(i,j,k) + Axz(i,j,k) * betaxy(i,j,k) + &
-  Axz_rhs =           f * Axz_rhs+ alpn1 *( trK * Axz  - TWO * fxz )+ &
+                     Azz(i,j,k) * betazy(i,j,k) + F1o3 * Ayz(i,j,k) * divb_loc - Ayz(i,j,k) * betaxx(i,j,k)
-                    Axx * betaxz +   Axy * betayz                   + &
+    Axz_rhs(i,j,k) = chin_loc * Axz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axz(i,j,k) - TWO * fxz(i,j,k)) + &
-                                     Ayz * betayx +   Azz * betazx  + &
+                     Axx(i,j,k) * betaxz(i,j,k) + Axy(i,j,k) * betayz(i,j,k) + Ayz(i,j,k) * betayx(i,j,k) + &
-             F1o3 * Axz * div_beta                -   Axz * betayy      !rhs for Aij
+                     Azz(i,j,k) * betazx(i,j,k) + F1o3 * Axz(i,j,k) * divb_loc - Axz(i,j,k) * betayy(i,j,k)
-
+
-! Compute trace of S_ij
+    trK_rhs(i,j,k) = - trK_rhs(i,j,k) + alpn1(i,j,k) * ( F1o3 * trK(i,j,k) * trK(i,j,k) + &
-
+                    gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
-  S =  f * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + &
+                    TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + &
-     TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
+                    FOUR * PI * (rho(i,j,k) + S_loc) )
-
+  enddo
-  trK_rhs = - trK_rhs + alpn1 *( F1o3 * trK * trK         + &
+  enddo
-                gupxx * fxx + gupyy * fyy + gupzz * fzz   + &
+  enddo
        TWO * ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + &
       FOUR * PI * ( rho + S ))                                !rhs for trK
 !!!! gauge variable part
@@ -948,15 +1000,15 @@
 !!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
 ! lopsided_kodis shares the symmetry_bd buffer between advection and
 ! dissipation, eliminating redundant full-grid copies. For metric variables
-! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
+! gxx/gyy/gzz (=dxx/dyy/dzz+1): stencil coefficients sum to zero,
-! so the constant offset has no effect on dissipation.
+! so the constant offset has no effect on dissipation.
-
+
-  call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
-  call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
+  call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
-  call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
+  call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
-  call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
-  call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
+  call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
-  call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
  call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
  call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
--- a/AMSS_NCKU_source/bssn_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_rhs_c.C
--- a/AMSS_NCKU_source/cgh.C
+++ b/AMSS_NCKU_source/cgh.C
@@ -130,7 +130,11 @@ void cgh::compose_cgh(int nprocs)
  for (int lev = 0; lev < levels; lev++)
  {
    checkPatchList(PatL[lev], false);
 #ifdef INTERP_LB_OPTIMIZE
    Parallel::distribute_optimize(PatL[lev], nprocs, ingfs, fngfs, false);
 #else
    Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false);
 #endif
 #if (RPB == 1)
    // we need distributed box of PatL[lev] and PatL[lev-1]
    if (lev > 0)
@@ -1301,13 +1305,13 @@ bool cgh::Interp_One_Point(MyList<var> *VarList,
 }
-void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
+bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
                          MyList<var> *OldList, MyList<var> *StateList,
                          MyList<var> *FutureList, MyList<var> *tmList, bool BB,
                          monitor *ErrorMonitor)
 {
  if (lev < movls)
-    return;
+    return false;
 #if (0)
  // #if (PSTR == 1 || PSTR == 2)
@@ -1396,7 +1400,7 @@ void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
      for (bhi = 0; bhi < BH_num; bhi++)
        delete[] tmpPorg[bhi];
      delete[] tmpPorg;
-      return;
+      return false;
    }
    // x direction
    rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
@@ -1500,6 +1504,7 @@ void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
  for (int bhi = 0; bhi < BH_num; bhi++)
    delete[] tmpPorg[bhi];
  delete[] tmpPorg;
  return tot_flag;
 }
--- a/AMSS_NCKU_source/cgh.h
+++ b/AMSS_NCKU_source/cgh.h
@@ -74,7 +74,7 @@ public:
                               MyList<var> *OldList, MyList<var> *StateList,
                               MyList<var> *FutureList, MyList<var> *tmList,
                               int Symmetry, bool BB);
-   void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
+   bool Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
                        MyList<var> *OldList, MyList<var> *StateList,
                        MyList<var> *FutureList, MyList<var> *tmList, bool BB,
                        monitor *ErrorMonitor);
--- a/AMSS_NCKU_source/diff_newwb.f90
+++ b/AMSS_NCKU_source/diff_newwb.f90
@@ -33,7 +33,7 @@
  real*8 :: dX,dY,dZ
  real*8,dimension(0:ex(1),0:ex(2),0:ex(3))   :: fh
  real*8, dimension(3) :: SoA
-  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
+  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: d2dx,d2dy,d2dz
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0, F60=6.d1
@@ -137,7 +137,7 @@
  real*8 :: dX
  real*8,dimension(0:ex(1),0:ex(2),0:ex(3))   :: fh
  real*8, dimension(3) :: SoA
-  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
+  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: d2dx
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0, F60=6.d1
@@ -1512,8 +1512,9 @@
  real*8 :: dX,dY,dZ
  real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3))   :: fh
  real*8, dimension(3) :: SoA
-  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
+  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
-  real*8  :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
+  integer :: i_core_min,i_core_max,j_core_min,j_core_max,k_core_min,k_core_max
  real*8  :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
  real*8  :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8, parameter :: ZEO=0.d0, ONE=1.d0, TWO=2.d0, F1o4=2.5d-1, F9=9.d0,  F45=4.5d1
@@ -1560,17 +1561,55 @@
  fxx = ZEO
  fyy = ZEO
-  fzz = ZEO
+  fzz = ZEO
-  fxy = ZEO
+  fxy = ZEO
-  fxz = ZEO
+  fxz = ZEO
-  fyz = ZEO
+  fyz = ZEO
-
+
-  do k=1,ex(3)
+  i_core_min = max(1, imin+2)
-  do j=1,ex(2)
+  i_core_max = min(ex(1), imax-2)
-  do i=1,ex(1)
+  j_core_min = max(1, jmin+2)
-!~~~~~~ fxx
+  j_core_max = min(ex(2), jmax-2)
-        if(i+2 <= imax .and. i-2 >= imin)then
+  k_core_min = max(1, kmin+2)
-!
+  k_core_max = min(ex(3), kmax-2)
  if(i_core_min <= i_core_max .and. j_core_min <= j_core_max .and. k_core_min <= k_core_max)then
   do k=k_core_min,k_core_max
   do j=j_core_min,j_core_max
   do i=i_core_min,i_core_max
 ! interior points always use 4th-order stencils without branch checks
      fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
                          -fh(i+2,j,k)+F16*fh(i+1,j,k)              )
      fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
                          -fh(i,j+2,k)+F16*fh(i,j+1,k)              )
      fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
                          -fh(i,j,k+2)+F16*fh(i,j,k+1)              )
      fxy(i,j,k) = Fdxdy*(     (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k))  &
                          -F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k))  &
                          +F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k))  &
                          -    (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
      fxz(i,j,k) = Fdxdz*(     (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2))  &
                          -F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1))  &
                          +F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1))  &
                          -    (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
      fyz(i,j,k) = Fdydz*(     (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2))  &
                          -F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1))  &
                          +F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1))  &
                          -    (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
   enddo
   enddo
   enddo
  endif
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
      if(i>=i_core_min .and. i<=i_core_max .and. &
         j>=j_core_min .and. j<=j_core_max .and. &
         k>=k_core_min .and. k<=k_core_max) cycle
 !~~~~~~ fxx
        if(i+2 <= imax .and. i-2 >= imin)then
 !
 !               - f(i-2) + 16 f(i-1) - 30 f(i) + 16 f(i+1) - f(i+2)
 !  fxx(i) = ----------------------------------------------------------
 !                                  12 dx^2 
--- a/AMSS_NCKU_source/fdderivs_c.C
+++ b/AMSS_NCKU_source/fdderivs_c.C
@@ -0,0 +1,186 @@
 #include "tool.h"
 /*
  * fdderivs: second partial derivatives of a 3-D grid function.
  *
  * Inputs
  *   ex            grid extents (ex[0], ex[1], ex[2]); each must be >= 2 because
  *                 the spacing is taken from X[1]-X[0] etc.
  *   f             input field, ex[0]*ex[1]*ex[2] values addressed through idx_ex
  *   X, Y, Z       coordinate arrays; only the first two entries of each are read
  *   SYM1..SYM3    per-axis symmetry factors forwarded to symmetry_bd as SoA[3]
  *   Symmetry      symmetry mode: > 0 enables the z lower ghost zone,
  *                 > 1 enables the x and y lower ghost zones as well
  *   onoff         unused (kept for interface compatibility)
  * Outputs
  *   fxx, fxy, fxz, fyy, fyz, fzz   same layout as f; every entry is first set
  *                 to zero, then overwritten where a stencil fits
  *
  * Stencil selection (per point, same order in all three directions):
  *   - 4th-order centred stencils when two neighbours are available on both
  *     sides in x, y and z;
  *   - otherwise 2nd-order centred stencils when one neighbour is available on
  *     both sides in x, y and z;
  *   - otherwise the point keeps the value zero.
  * The loops stop at index ex-2 (0-based), so the last plane in each direction
  * is never visited and stays zero.
  *
  * NOTE: the ghost-padded scratch buffer is a function-local static that is
  * grown on demand and never released, so concurrent calls would share it.
  * NOTE(review): symmetry_bd / idx_ex / idx_fh_F_ord2 come from tool.h; this
  * routine assumes symmetry_bd(2, ...) fills fh with f plus two ghost layers and
  * that idx_fh_F_ord2 takes Fortran-style (1-based) indices -- confirm there.
  */
 void fdderivs(const int ex[3],
              const double *f,
              double *fxx, double *fxy, double *fxz,
              double *fyy, double *fyz, double *fzz,
              const double *X, const double *Y, const double *Z,
              double SYM1, double SYM2, double SYM3,
              int Symmetry, int onoff)
 {
    (void)onoff;
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
    const double F1o4   = 2.5e-1;          // 1/4
    const double F8     = 8.0;
    const double F16    = 16.0;
    const double F30    = 30.0;
    const double F1o12  = ONE / 12.0;
    const double F1o144 = ONE / 144.0;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // Uniform grid spacing, taken from the first two coordinates of each axis.
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    // Fortran-style (1-based) index bounds of the region where stencils may read.
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
    int iminF = 1, jminF = 1, kminF = 1;
    // When the grid touches a symmetry plane the lower bound is extended into
    // the ghost zone (indices 0 and -1).
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
    const double SoA[3] = { SYM1, SYM2, SYM3 };
    // Zero all outputs first so they are well defined even if the scratch
    // buffer cannot be allocated below.
    const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
    for (size_t p = 0; p < all; ++p) {
        fxx[p] = ZEO; fxy[p] = ZEO; fxz[p] = ZEO;
        fyy[p] = ZEO; fyz[p] = ZEO; fzz[p] = ZEO;
    }
    /* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
    const size_t nx = (size_t)ex1 + 2;
    const size_t ny = (size_t)ex2 + 2;
    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;
    static double *fh = NULL;
    static size_t cap = 0;
    if (fh_size > cap) {
        free(fh);
        // aligned_alloc requires the byte count to be a multiple of the
        // alignment, so round the request up to the next multiple of 64.
        const size_t align = 64;
        const size_t bytes = (fh_size * sizeof(double) + align - 1) / align * align;
        fh = (double*)aligned_alloc(align, bytes);
        // Record the capacity only on success so a later call retries.
        cap = fh ? fh_size : 0;
    }
    if (!fh) return;
    symmetry_bd(2, ex, f, fh, SoA);
    /* Coefficients, following the original Fortran expressions:
       S* are the 2nd-order prefactors, F* the 4th-order prefactors. */
    const double Sdxdx = ONE / (dX * dX);
    const double Sdydy = ONE / (dY * dY);
    const double Sdzdz = ONE / (dZ * dZ);
    const double Fdxdx = F1o12 / (dX * dX);
    const double Fdydy = F1o12 / (dY * dY);
    const double Fdzdz = F1o12 / (dZ * dZ);
    const double Sdxdy = F1o4 / (dX * dY);
    const double Sdxdz = F1o4 / (dX * dZ);
    const double Sdydz = F1o4 / (dY * dZ);
    const double Fdxdy = F1o144 / (dX * dY);
    const double Fdxdz = F1o144 / (dX * dZ);
    const double Fdydz = F1o144 / (dY * dZ);
    // Match Fortran (ghost_width=3, "for bam comparison") exactly:
    // only compute when x/y/z all satisfy the same-order stencil at this point.
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                if ((iF + 2 <= imaxF && iF - 2 >= iminF) &&
                    (jF + 2 <= jmaxF && jF - 2 >= jminF) &&
                    (kF + 2 <= kmaxF && kF - 2 >= kminF)) {
                    // 4th order: (-f(-2) + 16 f(-1) - 30 f(0) + 16 f(+1) - f(+2)) / (12 h^2)
                    fxx[p] = Fdxdx * (
                        -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fyy[p] = Fdydy * (
                        -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fzz[p] = Fdzdz * (
                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                    // Mixed derivatives: tensor product of the 4th-order first-derivative
                    // stencil (1, -8, 0, 8, -1) in both directions, scaled by 1/(144 h1 h2).
                    fxy[p] = Fdxdy * (
                           (fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)] - F8 * fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)] +
                            F8 * fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)] - fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)])
                        - F8 * (fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)] - F8 * fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] +
                                F8 * fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] - fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)])
                        + F8 * (fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)] - F8 * fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
                                F8 * fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)] - fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)])
                           - (fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)] - F8 * fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)] +
                              F8 * fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)] - fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)])
                    );
                    fxz[p] = Fdxdz * (
                           (fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)] - F8 * fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)] +
                            F8 * fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)] - fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)])
                        - F8 * (fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)] - F8 * fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] +
                                F8 * fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] - fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)])
                        + F8 * (fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)] - F8 * fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
                                F8 * fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)] - fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)])
                           - (fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)] - F8 * fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)] +
                              F8 * fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)] - fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)])
                    );
                    fyz[p] = Fdydz * (
                           (fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)] - F8 * fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)] +
                            F8 * fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)] - fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)])
                        - F8 * (fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)] - F8 * fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] +
                                F8 * fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] - fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)])
                        + F8 * (fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)] - F8 * fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
                                F8 * fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)] - fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)])
                           - (fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)] - F8 * fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)] +
                              F8 * fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)] - fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)])
                    );
                } else if ((iF + 1 <= imaxF && iF - 1 >= iminF) &&
                           (jF + 1 <= jmaxF && jF - 1 >= jminF) &&
                           (kF + 1 <= kmaxF && kF - 1 >= kminF)) {
                    // 2nd order: (f(-1) - 2 f(0) + f(+1)) / h^2
                    fxx[p] = Sdxdx * (
                        fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fyy[p] = Sdydy * (
                        fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fzz[p] = Sdzdz * (
                        fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                    // 2nd-order mixed: four-corner difference scaled by 1/(4 h1 h2)
                    fxy[p] = Sdxdy * (
                        fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
                        fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
                        fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
                    );
                    fxz[p] = Sdxdz * (
                        fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
                    );
                    fyz[p] = Sdydz * (
                        fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
                        fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
                    );
                }
            }
        }
    }
 }
--- a/AMSS_NCKU_source/fderivs_c.C
+++ b/AMSS_NCKU_source/fderivs_c.C
@@ -0,0 +1,135 @@
 #include "tool.h"
 /*
  * C version of fderivs: first partial derivatives of a 3-D grid function.
  *
  * Fortran counterpart:
  *   subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
  *
  * Conventions:
  *   f, fx, fy, fz : ex1*ex2*ex3 values, addressed through idx_ex
  *   X : ex1 values, Y : ex2 values, Z : ex3 values (only the first two
  *       entries of each are read here, so every extent must be >= 2)
  *   SYM1..SYM3    : per-axis symmetry factors forwarded to symmetry_bd
  *   Symmetry      : > 0 enables the z lower ghost zone, > 1 also x and y
  *   onoff         : unused
  *
  * Every output entry is first set to zero. A point then receives
  *   - the 4th-order centred difference (f(-2) - 8 f(-1) + 8 f(+1) - f(+2)) / (12 h)
  *     when two neighbours exist on both sides in x, y and z;
  *   - otherwise the 2nd-order centred difference (f(+1) - f(-1)) / (2 h)
  *     when one neighbour exists on both sides in x, y and z;
  *   - otherwise it keeps the value zero.
  * The loops stop at index ex-2 (0-based), so the last plane in each direction
  * is never visited and stays zero.
  *
  * NOTE: the ghost-padded scratch buffer is a function-local static that is
  * grown on demand and never released, so concurrent calls would share it.
  * NOTE(review): symmetry_bd / idx_ex / idx_fh_F_ord2 come from tool.h; this
  * routine assumes symmetry_bd(2, ...) fills fh with f plus two ghost layers and
  * that idx_fh_F_ord2 takes Fortran-style (1-based) indices -- confirm there.
  */
 void fderivs(const int ex[3],
             const double *f,
             double *fx, double *fy, double *fz,
             const double *X, const double *Y, const double *Z,
             double SYM1, double SYM2, double SYM3,
             int Symmetry, int onoff)
 {
    (void)onoff; // not used by the Fortran routine either
    const double ZEO = 0.0, ONE = 1.0;
    const double TWO = 2.0, EIT = 8.0;
    const double F12 = 12.0;
    const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 is not referenced directly in this routine
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // dX = X(2)-X(1) -> C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    // Fortran 1-based bounds
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
    int iminF = 1, jminF = 1, kminF = 1;
    // When the grid touches a symmetry plane the lower bound is extended into
    // the ghost zone (indices 0 and -1).
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
    // SoA(1:3) = SYM1,SYM2,SYM3
    const double SoA[3] = { SYM1, SYM2, SYM3 };
    // fx = fy = fz = 0; done first so the outputs are well defined even if the
    // scratch buffer cannot be allocated below.
    const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
    for (size_t p = 0; p < all; ++p) {
        fx[p] = ZEO;
        fy[p] = ZEO;
        fz[p] = ZEO;
    }
    // fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2
    const size_t nx = (size_t)ex1 + 2;
    const size_t ny = (size_t)ex2 + 2;
    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;
    static double *fh = NULL;
    static size_t cap = 0;
    if (fh_size > cap) {
        free(fh);
        // aligned_alloc requires the byte count to be a multiple of the
        // alignment, so round the request up to the next multiple of 64.
        const size_t align = 64;
        const size_t bytes = (fh_size * sizeof(double) + align - 1) / align * align;
        fh = (double*)aligned_alloc(align, bytes);
        // Record the capacity only on success so a later call retries.
        cap = fh ? fh_size : 0;
    }
    if (!fh) return;
    // call symmetry_bd(2,ex,f,fh,SoA)
    symmetry_bd(2, ex, f, fh, SoA);
    // Prefactors: 1/(12 h) for the 4th-order stencil, 1/(2 h) for the 2nd-order one.
    const double d12dx = ONE / F12 / dX;
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;
    const double d2dx  = ONE / TWO / dX;
    const double d2dy  = ONE / TWO / dY;
    const double d2dz  = ONE / TWO / dZ;
    // Match Fortran (ghost_width=3, "for bam comparison") exactly:
    // only compute when x/y/z all satisfy the same-order stencil at this point.
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                if ((iF + 2 <= imaxF && iF - 2 >= iminF) &&
                    (jF + 2 <= jmaxF && jF - 2 >= jminF) &&
                    (kF + 2 <= kmaxF && kF - 2 >= kminF)) {
                    fx[p] = d12dx * (
                        fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] -
                        EIT * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
                        EIT * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)]
                    );
                    fy[p] = d12dy * (
                        fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] -
                        EIT * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
                        EIT * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)]
                    );
                    fz[p] = d12dz * (
                        fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] -
                        EIT * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
                        EIT * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)] -
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)]
                    );
                } else if ((iF + 1 <= imaxF && iF - 1 >= iminF) &&
                           (jF + 1 <= jmaxF && jF - 1 >= jminF) &&
                           (kF + 1 <= kmaxF && kF - 1 >= kminF)) {
                    fx[p] = d2dx * (
                        -fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
                         fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fy[p] = d2dy * (
                        -fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
                         fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fz[p] = d2dz * (
                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
                         fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                }
            }
        }
    }
 }
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -1111,27 +1111,177 @@ end subroutine d2dump
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 !~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-! common code for cell and vertex
+! common code for cell and vertex
-!------------------------------------------------------------------------------
+!------------------------------------------------------------------------------
-! Lagrangian polynomial interpolation
+! Lagrangian polynomial interpolation
-!------------------------------------------------------------------------------
+!------------------------------------------------------------------------------
-
+#ifndef POLINT6_USE_BARYCENTRIC
-!DIR$ ATTRIBUTES FORCEINLINE :: polint
+#define POLINT6_USE_BARYCENTRIC 1
-  subroutine polint(xa, ya, x, y, dy, ordn)
+#endif
-  implicit none
+
-
+!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
-  integer, intent(in) :: ordn
+  subroutine polint6_neville(xa, ya, x, y, dy)
  ! Six-point polynomial interpolation using the c/d difference-table
  ! (Neville-type) recurrence.
  !   xa, ya : the six abscissae and the function values at them
  !   x      : target point
  !   y      : interpolated value at x
  !   dy     : last correction added to y (usable as an error indicator)
  ! Stops the program if two abscissae that are paired in the recurrence
  ! coincide.
  implicit none
  real*8, dimension(6), intent(in) :: xa, ya
  real*8, intent(in) :: x
  real*8, intent(out) :: y, dy
  integer :: i, m, ns, n_m
  real*8, dimension(6) :: c, d, ho
  real*8 :: dif, dift, hp, h, den_val
  ! c and d hold the current column of the difference table; ho holds the
  ! signed distances from each node to the target.
  c = ya
  d = ya
  ho = xa - x
  ! Find the node closest to x; its value is the starting approximation.
  ns = 1
  dif = abs(x - xa(1))
  do i = 2, 6
    dift = abs(x - xa(i))
    if (dift < dif) then
      ns = i
      dif = dift
    end if
  end do
  y = ya(ns)
  ns = ns - 1
  ! Build the table column by column (m = 1..5) and accumulate corrections.
  do m = 1, 5
    n_m = 6 - m
    do i = 1, n_m
      hp = ho(i)
      h  = ho(i+m)
      den_val = hp - h
      ! hp - h equals xa(i) - xa(i+m): zero means duplicate abscissae.
      if (den_val == 0.0d0) then
        write(*,*) 'failure in polint for point',x
        write(*,*) 'with input points: ',xa
        stop
      end if
      den_val = (c(i+1) - d(i)) / den_val
      d(i) = h * den_val
      c(i) = hp * den_val
    end do
    ! Choose the correction (from c or d) that keeps the path through the
    ! table centred on the starting node.
    if (2 * ns < n_m) then
      dy = c(ns + 1)
    else
      dy = d(ns)
      ns = ns - 1
    end if
    y = y + dy
  end do
  return
  end subroutine polint6_neville
 !DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric
  subroutine polint6_barycentric(xa, ya, x, y, dy)
  ! Six-point polynomial interpolation in barycentric form:
  !   y = sum_i (w_i/(x-xa_i)) * ya_i  /  sum_i (w_i/(x-xa_i))
  !   xa, ya : the six abscissae and the function values at them
  !   x      : target point
  !   y      : interpolated value at x
  !   dy     : always returned as 0 (no error estimate is produced here)
  ! Stops the program if two abscissae coincide (general-weights path only).
  implicit none
  real*8, dimension(6), intent(in) :: xa, ya
  real*8, intent(in) :: x
  real*8, intent(out) :: y, dy
  integer :: i, j
  logical :: is_uniform
  real*8, dimension(6) :: lambda
  real*8 :: dx, den_i, term, num, den, step, tol
  ! Barycentric weights for six equally spaced nodes (alternating-sign
  ! binomial coefficients); a common scale factor cancels in num/den.
  real*8, parameter :: c_uniform(6) = (/ -1.d0, 5.d0, -10.d0, 10.d0, -5.d0, 1.d0 /)
  ! If x coincides with a node, return that node's value directly; this also
  ! avoids dividing by zero in the sums below.
  do i = 1, 6
    if (x == xa(i)) then
      y = ya(i)
      dy = 0.d0
      return
    end if
  end do
  ! Detect a uniformly spaced stencil: every consecutive spacing must match
  ! the first one to within tol.
  step = xa(2) - xa(1)
  is_uniform = (step /= 0.d0)
  if (is_uniform) then
    tol = 64.d0 * epsilon(1.d0) * max(1.d0, abs(step))
    do i = 3, 6
      if (abs((xa(i) - xa(i-1)) - step) > tol) then
        is_uniform = .false.
        exit
      end if
    end do
  end if
  ! Fast path: uniform spacing uses the precomputed weights.
  if (is_uniform) then
    num = 0.d0
    den = 0.d0
    do i = 1, 6
      term = c_uniform(i) / (x - xa(i))
      num = num + term * ya(i)
      den = den + term
    end do
    y = num / den
    dy = 0.d0
    return
  end if
  ! General path: lambda(i) = 1 / prod_{j/=i} (xa(i) - xa(j)).
  do i = 1, 6
    den_i = 1.d0
    do j = 1, 6
      if (j /= i) then
        dx = xa(i) - xa(j)
        if (dx == 0.0d0) then
          write(*,*) 'failure in polint for point',x
          write(*,*) 'with input points: ',xa
          stop
        end if
        den_i = den_i * dx
      end if
    end do
    lambda(i) = 1.d0 / den_i
  end do
  num = 0.d0
  den = 0.d0
  do i = 1, 6
    term = lambda(i) / (x - xa(i))
    num = num + term * ya(i)
    den = den + term
  end do
  y = num / den
  dy = 0.d0
  return
  end subroutine polint6_barycentric
 !DIR$ ATTRIBUTES FORCEINLINE :: polint
  subroutine polint(xa, ya, x, y, dy, ordn)
  implicit none
  integer, intent(in) :: ordn
  real*8, dimension(ordn), intent(in) :: xa, ya
  real*8, intent(in) :: x
  real*8, intent(out) :: y, dy
-  integer :: i, m, ns, n_m
+  integer :: i, m, ns, n_m
-  real*8, dimension(ordn) :: c, d, ho
+  real*8, dimension(ordn) :: c, d, ho
-  real*8 :: dif, dift, hp, h, den_val
+  real*8 :: dif, dift, hp, h, den_val
-
+
-  c = ya
+  if (ordn == 6) then
-  d = ya
+#if POLINT6_USE_BARYCENTRIC
-  ho = xa - x
+    call polint6_barycentric(xa, ya, x, y, dy)
 #else
    call polint6_neville(xa, ya, x, y, dy)
 #endif
    return
  end if
  c = ya
  d = ya
  ho = xa - x
  ns = 1
  dif = abs(x - xa(1))
@@ -1175,13 +1325,48 @@ end subroutine d2dump
    y = y + dy
  end do
-  return
+  return
-  end subroutine polint
+  end subroutine polint
-!------------------------------------------------------------------------------
+!------------------------------------------------------------------------------
-!
+! Compute Lagrange interpolation basis weights for one target point.
-! interpolation in 2 dimensions, follow yx order
+!------------------------------------------------------------------------------
-!
+!DIR$ ATTRIBUTES FORCEINLINE :: polint_lagrange_weights
-!------------------------------------------------------------------------------
+  subroutine polint_lagrange_weights(xa, x, w, ordn)
  ! Evaluate the ordn Lagrange basis polynomials of the nodes xa at x:
  !   w(i) = prod_{j/=i} (x - xa(j)) / (xa(i) - xa(j))
  ! so that the interpolated value of data ya is sum_i w(i)*ya(i).
  !   xa   : interpolation nodes (length ordn)
  !   x    : target point
  !   w    : resulting basis weights (length ordn)
  !   ordn : number of nodes
  ! Stops the program if two nodes coincide.
  implicit none
  integer, intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: xa
  real*8, intent(in) :: x
  real*8, dimension(1:ordn), intent(out) :: w
  integer :: i, j
  real*8 :: num, den, dx
  do i = 1, ordn
    ! Numerator and denominator are accumulated separately and divided once.
    num = 1.d0
    den = 1.d0
    do j = 1, ordn
      if (j /= i) then
        dx = xa(i) - xa(j)
        if (dx == 0.0d0) then
          write(*,*) 'failure in polint for point',x
          write(*,*) 'with input points: ',xa
          stop
        end if
        num = num * (x - xa(j))
        den = den * dx
      end if
    end do
    w(i) = num / den
  end do
  return
  end subroutine polint_lagrange_weights
 !------------------------------------------------------------------------------
 !
 ! interpolation in 2 dimensions, follow yx order
 !
 !------------------------------------------------------------------------------
  subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
  implicit none
@@ -1229,11 +1414,11 @@ end subroutine d2dump
  real*8, intent(in) :: x1,x2,x3
  real*8, intent(out) :: y,dy
-#ifdef POLINT_LEGACY_ORDER
+#ifdef POLINT_LEGACY_ORDER
-  integer  :: i,j,m,n
+  integer  :: i,j,m,n
-  real*8, dimension(ordn,ordn) :: yatmp
+  real*8, dimension(ordn,ordn) :: yatmp
-  real*8, dimension(ordn) :: ymtmp
+  real*8, dimension(ordn) :: ymtmp
-  real*8, dimension(ordn) :: yntmp
+  real*8, dimension(ordn) :: yntmp
  real*8, dimension(ordn) :: yqtmp
  m=size(x1a)
@@ -1243,29 +1428,36 @@ end subroutine d2dump
    yqtmp=ya(i,j,:)
    call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
   end do
-    yntmp=yatmp(i,:)
+    yntmp=yatmp(i,:)
-    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
+    call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
-  end do
+  end do
-  call polint(x1a,ymtmp,x1,y,dy,ordn)
+  call polint(x1a,ymtmp,x1,y,dy,ordn)
-#else
+#else
-  integer  :: j, k
+  integer  :: i, j, k
-  real*8, dimension(ordn,ordn) :: yatmp
+  real*8, dimension(ordn) :: w1, w2
-  real*8, dimension(ordn) :: ymtmp
+  real*8, dimension(ordn) :: ymtmp
-  real*8 :: dy_temp
+  real*8 :: yx_sum, x_sum
-
+
-  do k=1,ordn
+  call polint_lagrange_weights(x1a, x1, w1, ordn)
-    do j=1,ordn
+  call polint_lagrange_weights(x2a, x2, w2, ordn)
-      call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
+
-    end do
+  do k = 1, ordn
-  end do
+    yx_sum = 0.d0
-  do k=1,ordn
+    do j = 1, ordn
-    call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
+      x_sum = 0.d0
-  end do
+      do i = 1, ordn
-  call polint(x3a, ymtmp, x3, y, dy, ordn)
+        x_sum = x_sum + w1(i) * ya(i,j,k)
-#endif
+      end do
-
+      yx_sum = yx_sum + w2(j) * x_sum
-  return
+    end do
-  end subroutine polin3
+    ymtmp(k) = yx_sum
  end do
  call polint(x3a, ymtmp, x3, y, dy, ordn)
 #endif
  return
  end subroutine polin3
 !--------------------------------------------------------------------------------------
 ! calculate L2norm
  subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
@@ -1608,11 +1800,14 @@ deallocate(f_flat)
 !       ^
 ! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3
-  real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
+  real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
-
+  integer :: i,j,k
-  fout = C1*f1+C2*f2+C3*f3
+
-
+  do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1))
-  return
+    fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
  end do
  return
  end subroutine average2
 !-----------------------------------------------------------------------------  
--- a/AMSS_NCKU_source/interp_lb_profile.C
+++ b/AMSS_NCKU_source/interp_lb_profile.C
@@ -0,0 +1,107 @@
 #include "interp_lb_profile.h"
 #include <cstdio>
 #include <cstring>
 #include <algorithm>
 namespace InterpLBProfile {
 // Write a load-balance profile to `filepath`.
 //
 // File layout: one ProfileHeader, then `nprocs` doubles (rank_times), then
 // `num_heavy` ints (heavy_ranks), all in native binary representation.
 //
 // Returns true only when the file was opened, every record was written in
 // full and the stream was closed without error; returns false otherwise.
 bool write_profile(const char *filepath, int nprocs,
                   const double *rank_times,
                   const int *heavy_ranks, int num_heavy,
                   double threshold_ratio)
 {
    // Negative counts would wrap to enormous size_t lengths inside fwrite.
    if (!filepath || nprocs < 0 || num_heavy < 0) return false;
    FILE *fp = fopen(filepath, "wb");
    if (!fp) return false;
    ProfileHeader hdr;
    // Zero first so that any padding bytes written to disk are deterministic.
    memset(&hdr, 0, sizeof(hdr));
    hdr.magic = MAGIC;
    hdr.version = VERSION;
    hdr.nprocs = nprocs;
    hdr.num_heavy = num_heavy;
    hdr.threshold_ratio = threshold_ratio;
    bool ok = (fwrite(&hdr, sizeof(hdr), 1, fp) == 1);
    if (ok && nprocs > 0)
        ok = (fwrite(rank_times, sizeof(double), (size_t)nprocs, fp)
              == (size_t)nprocs);
    if (ok && num_heavy > 0)
        ok = (fwrite(heavy_ranks, sizeof(int), (size_t)num_heavy, fp)
              == (size_t)num_heavy);
    // fclose flushes buffered data, so a failure here is also a write failure.
    if (fclose(fp) != 0) ok = false;
    return ok;
 }
 // Read a load-balance profile on rank 0 of `comm` and broadcast it.
 //
 // Rank 0 opens `filepath`, checks the header (magic, version, nprocs equal to
 // `current_nprocs`, and a heavy-rank count in [0, current_nprocs]) and reads
 // `current_nprocs` doubles into `rank_times` followed by `num_heavy` ints
 // into `heavy_ranks`. The validity flag is broadcast first; when the profile
 // is valid, `num_heavy`, `heavy_ranks` and `rank_times` are broadcast too.
 //
 // Returns true on every rank if the profile was accepted, false on every
 // rank otherwise (in which case the output arguments must not be relied on).
 bool read_profile(const char *filepath, int current_nprocs,
                  int *heavy_ranks, int &num_heavy,
                  double *rank_times, MPI_Comm comm)
 {
    int myrank;
    MPI_Comm_rank(comm, &myrank);
    int valid = 0;
    ProfileHeader hdr;
    memset(&hdr, 0, sizeof(hdr));
    if (myrank == 0) {
        FILE *fp = fopen(filepath, "rb");
        if (fp) {
            const bool header_ok =
                fread(&hdr, sizeof(hdr), 1, fp) == 1 &&
                hdr.magic == MAGIC && hdr.version == VERSION &&
                hdr.nprocs == current_nprocs;
            if (!header_ok) {
                printf("[InterpLB] Profile rejected: magic=0x%X version=%u "
                       "nprocs=%d (current=%d)\n",
                       hdr.magic, hdr.version, hdr.nprocs, current_nprocs);
            } else if (hdr.num_heavy < 0 || hdr.num_heavy > current_nprocs) {
                // The count comes from the file and is used as a read length
                // below; a negative or oversized value would overrun
                // heavy_ranks, so reject it before touching the buffers.
                printf("[InterpLB] Profile rejected: num_heavy=%d out of range "
                       "(nprocs=%d)\n",
                       hdr.num_heavy, current_nprocs);
            } else if (fread(rank_times, sizeof(double),
                             (size_t)current_nprocs, fp)
                           == (size_t)current_nprocs &&
                       fread(heavy_ranks, sizeof(int),
                             (size_t)hdr.num_heavy, fp)
                           == (size_t)hdr.num_heavy) {
                num_heavy = hdr.num_heavy;
                valid = 1;
            } else {
                printf("[InterpLB] Profile rejected: file is truncated\n");
            }
            fclose(fp);
        }
    }
    // Every rank must learn the verdict before deciding whether to take part
    // in the data broadcasts below.
    MPI_Bcast(&valid, 1, MPI_INT, 0, comm);
    if (!valid) return false;
    MPI_Bcast(&num_heavy, 1, MPI_INT, 0, comm);
    MPI_Bcast(heavy_ranks, num_heavy, MPI_INT, 0, comm);
    MPI_Bcast(rank_times, current_nprocs, MPI_DOUBLE, 0, comm);
    return true;
 }
 // Select the ranks whose time exceeds `threshold_ratio` times the mean time.
 //
 // rank_times      : per-rank timings, `nprocs` entries
 // threshold_ratio : a rank is heavy when time > threshold_ratio * mean
 // heavy_ranks     : output, receives at most `max_heavy` rank indices,
 //                   slowest first; ranks with equal times are ordered by
 //                   ascending rank index so the result is reproducible
 //
 // Returns the number of entries written to `heavy_ranks` (never negative).
 // Returns 0 without touching `heavy_ranks` when `nprocs` or `max_heavy` is
 // not positive.
 int identify_heavy_ranks(const double *rank_times, int nprocs,
                         double threshold_ratio,
                         int *heavy_ranks, int max_heavy)
 {
    // Guards against a 0/0 mean, a negative allocation size and a negative
    // return value.
    if (nprocs <= 0 || max_heavy <= 0) return 0;
    double sum = 0;
    for (int i = 0; i < nprocs; i++) sum += rank_times[i];
    const double mean = sum / nprocs;
    const double threshold = threshold_ratio * mean;
    // Collect candidates
    struct RankTime { int rank; double time; };
    RankTime *candidates = new RankTime[nprocs];
    int ncand = 0;
    for (int i = 0; i < nprocs; i++) {
        if (rank_times[i] > threshold)
            candidates[ncand++] = {i, rank_times[i]};
    }
    // Sort descending by time; break ties by rank because std::sort is not
    // stable and would otherwise leave equal times in unspecified order.
    std::sort(candidates, candidates + ncand,
              [](const RankTime &a, const RankTime &b) {
                  if (a.time != b.time) return a.time > b.time;
                  return a.rank < b.rank;
              });
    const int count = (ncand < max_heavy) ? ncand : max_heavy;
    for (int i = 0; i < count; i++)
        heavy_ranks[i] = candidates[i].rank;
    delete[] candidates;
    return count;
 }
 } // namespace InterpLBProfile
--- a/AMSS_NCKU_source/interp_lb_profile.bin
+++ b/AMSS_NCKU_source/interp_lb_profile.bin
--- a/AMSS_NCKU_source/interp_lb_profile.h
+++ b/AMSS_NCKU_source/interp_lb_profile.h
@@ -0,0 +1,38 @@
 #ifndef INTERP_LB_PROFILE_H
 #define INTERP_LB_PROFILE_H
 #include <mpi.h>
 namespace InterpLBProfile {
 static const unsigned int MAGIC   = 0x494C4250; // "ILBP"
 static const unsigned int VERSION = 1;
 // Fixed-size record stored at the start of a profile file. write_profile
 // writes it as raw bytes, followed by `nprocs` doubles (per-rank times) and
 // `num_heavy` ints (heavy rank indices); read_profile reads it back the same
 // way, so the member order and types define the on-disk layout.
 struct ProfileHeader {
    unsigned int magic;       // file signature, compared against MAGIC on read
    unsigned int version;     // format version, compared against VERSION on read
    int nprocs;               // number of per-rank times that follow the header
    int num_heavy;            // number of heavy-rank indices that follow the times
    double threshold_ratio;   // ratio passed to write_profile when the file was made
 };
 // Write profile file (rank 0 only)
 bool write_profile(const char *filepath, int nprocs,
                   const double *rank_times,
                   const int *heavy_ranks, int num_heavy,
                   double threshold_ratio);
 // Read profile file (rank 0 reads, then broadcasts to all)
 // Returns true if file found and valid for current nprocs
 bool read_profile(const char *filepath, int current_nprocs,
                  int *heavy_ranks, int &num_heavy,
                  double *rank_times, MPI_Comm comm);
 // Identify heavy ranks: those with time > threshold_ratio * mean
 int identify_heavy_ranks(const double *rank_times, int nprocs,
                         double threshold_ratio,
                         int *heavy_ranks, int max_heavy);
 } // namespace InterpLBProfile
 #endif /* INTERP_LB_PROFILE_H */
--- a/AMSS_NCKU_source/interp_lb_profile_data.h
+++ b/AMSS_NCKU_source/interp_lb_profile_data.h
@@ -0,0 +1,29 @@
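 /* This header is generated automatically by the custom profiling framework; it is not a hand-written, case-specific optimization. */
 /* Update: the load-balance problem has since been solved by optimizing the interpolation routine, so this static profile-based balancing scheme is deprecated and this header is currently not part of the build. */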
 /* Auto-generated from interp_lb_profile.bin — do not edit */
 #ifndef INTERP_LB_PROFILE_DATA_H
 #define INTERP_LB_PROFILE_DATA_H
 #define INTERP_LB_NPROCS 64
 #define INTERP_LB_NUM_HEAVY 4
 static const int interp_lb_heavy_blocks[4] = {27, 35, 28, 36};
 /* Split table: {block_id, r_left, r_right} */
 static const int interp_lb_splits[4][3] = {
    {27, 26, 27},
    {35, 34, 35},
    {28, 28, 29},
    {36, 36, 37},
 };
 /* Rank remap for displaced neighbor blocks */
 static const int interp_lb_num_remaps = 4;
 static const int interp_lb_remaps[][2] = {
    {26, 25},
    {29, 30},
    {34, 33},
    {37, 38},
 };
 #endif /* INTERP_LB_PROFILE_DATA_H */
--- a/AMSS_NCKU_source/kodiss_c.C
+++ b/AMSS_NCKU_source/kodiss_c.C
@@ -0,0 +1,117 @@
 #include "tool.h"
 /*
  * C version of the Fortran routine
  *     subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps)
  *
  * Conventions:
  *   X, Y, Z  : coordinate arrays of length ex[0], ex[1], ex[2]
  *   f, f_rhs : ex[0]*ex[1]*ex[2] values addressed through idx_ex
  *   SoA[3]   : forwarded unchanged to symmetry_bd
  *   eps      : dissipation strength
  *
  * For every point whose +-3 neighbours exist in all three directions the
  * routine adds  eps/64 * (Dx + Dy + Dz)  to f_rhs, where each D term is the
  * 7-point combination below divided by the grid spacing of that direction.
  * If the work buffer cannot be allocated the routine returns with f_rhs
  * untouched.
  */
 // 7-point symmetric combination: (m3+p3) - 6 (m2+p2) + 15 (m1+p1) - 20 c
 static inline double kodis_stencil7(double m3, double m2, double m1, double c,
                                    double p1, double p2, double p3)
 {
    const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
    return (m3 + p3) - SIX * (m2 + p2) + FIT * (m1 + p1) - TWT * c;
 }
 void kodis(const int ex[3],
           const double *X, const double *Y, const double *Z,
           const double *f, double *f_rhs,
           const double SoA[3],
           int Symmetry, double eps)
 {
    const int NO_SYMM = 0, OCTANT = 2;
    const double cof = 64.0; // 2^6
    const int nI = ex[0], nJ = ex[1], nK = ex[2];
    // Grid spacing from the first two coordinates (Fortran X(2)-X(1)).
    const double hx = X[1] - X[0];
    const double hy = Y[1] - Y[0];
    const double hz = Z[1] - Z[0];
    // Lowest usable Fortran (1-based) index per direction: 1 by default,
    // -2 when the symmetry ghost layers may be used on that side.
    const int loI_F = (Symmetry == OCTANT && fabs(X[0]) < hx) ? -2 : 1;
    const int loJ_F = (Symmetry == OCTANT && fabs(Y[0]) < hy) ? -2 : 1;
    const int loK_F = (Symmetry > NO_SYMM && fabs(Z[0]) < hz) ? -2 : 1;
    // Work buffer with three extra layers per direction (ord = 3).
    const size_t n_ext =
        ((size_t)nI + 3) * ((size_t)nJ + 3) * ((size_t)nK + 3);
    double *fh = (double*)malloc(n_ext * sizeof(double));
    if (!fh) return;
    symmetry_bd(3, ex, f, fh, SoA);
    // 0-based loop limits such that iF-3 >= lo and iF+3 <= n, with iF = i+1:
    //   i >= lo + 2 (clamped at 0)  and  i <= n - 4 (inclusive).
    const int i_first = (loI_F + 2 > 0) ? loI_F + 2 : 0;
    const int j_first = (loJ_F + 2 > 0) ? loJ_F + 2 : 0;
    const int k_first = (loK_F + 2 > 0) ? loK_F + 2 : 0;
    const int i_last = nI - 4;
    const int j_last = nJ - 4;
    const int k_last = nK - 4;
    const double scale = eps / cof;
    if (i_first <= i_last && j_first <= j_last && k_first <= k_last) {
        for (int k = k_first; k <= k_last; ++k) {
            const int kF = k + 1;
            for (int j = j_first; j <= j_last; ++j) {
                const int jF = j + 1;
                for (int i = i_first; i <= i_last; ++i) {
                    const int iF = i + 1;
                    const double centre = fh[idx_fh_F(iF, jF, kF, ex)];
                    const double dx_part = kodis_stencil7(
                        fh[idx_fh_F(iF - 3, jF, kF, ex)],
                        fh[idx_fh_F(iF - 2, jF, kF, ex)],
                        fh[idx_fh_F(iF - 1, jF, kF, ex)],
                        centre,
                        fh[idx_fh_F(iF + 1, jF, kF, ex)],
                        fh[idx_fh_F(iF + 2, jF, kF, ex)],
                        fh[idx_fh_F(iF + 3, jF, kF, ex)]) / hx;
                    const double dy_part = kodis_stencil7(
                        fh[idx_fh_F(iF, jF - 3, kF, ex)],
                        fh[idx_fh_F(iF, jF - 2, kF, ex)],
                        fh[idx_fh_F(iF, jF - 1, kF, ex)],
                        centre,
                        fh[idx_fh_F(iF, jF + 1, kF, ex)],
                        fh[idx_fh_F(iF, jF + 2, kF, ex)],
                        fh[idx_fh_F(iF, jF + 3, kF, ex)]) / hy;
                    const double dz_part = kodis_stencil7(
                        fh[idx_fh_F(iF, jF, kF - 3, ex)],
                        fh[idx_fh_F(iF, jF, kF - 2, ex)],
                        fh[idx_fh_F(iF, jF, kF - 1, ex)],
                        centre,
                        fh[idx_fh_F(iF, jF, kF + 1, ex)],
                        fh[idx_fh_F(iF, jF, kF + 2, ex)],
                        fh[idx_fh_F(iF, jF, kF + 3, ex)]) / hz;
                    // f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx + Dy + Dz)
                    f_rhs[idx_ex(i, j, k, ex)] +=
                        scale * (dx_part + dy_part + dz_part);
                }
            }
        }
    }
    free(fh);
 }
--- a/AMSS_NCKU_source/lopsided_c.C
+++ b/AMSS_NCKU_source/lopsided_c.C
@@ -0,0 +1,255 @@
 #include "tool.h"
 /*
 * 你需要提供 symmetry_bd 的 C 版本（或 Fortran 绑到 C 的接口）。
 * Fortran: call symmetry_bd(3,ex,f,fh,SoA)
 *
 * 约定：
 *   nghost = 3
 *   ex[3]  = {ex1,ex2,ex3}
 *   f      = 原始网格 (ex1*ex2*ex3)
 *   fh     = 扩展网格 ((ex1+3)*(ex2+3)*(ex3+3))，对应 Fortran 的 (-2:ex1, ...)
 *   SoA[3] = 输入参数
 */
 void lopsided(const int ex[3],
              const double *X, const double *Y, const double *Z,
              const double *f, double *f_rhs,
              const double *Sfx, const double *Sfy, const double *Sfz,
              int Symmetry, const double SoA[3])
 {
    const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
    const double TWO = 2.0, F6 = 6.0, F18 = 18.0;
    const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
    const int NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2;
    (void)OCTANT; // 这里和 Fortran 一样只是定义了不用也没关系
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // 对应 Fortran: dX = X(2)-X(1)  （Fortran 1-based）
    // C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    const double d12dx = ONE / F12 / dX;
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;
    // Fortran 里算了 d2dx/d2dy/d2dz 但本 subroutine 里没用到（保持一致也算出来）
    const double d2dx  = ONE / TWO / dX;
    const double d2dy  = ONE / TWO / dY;
    const double d2dz  = ONE / TWO / dZ;
    (void)d2dx; (void)d2dy; (void)d2dz;
    // Fortran:
    // imax = ex(1); jmax = ex(2); kmax = ex(3)
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
    // Fortran:
    // imin=jmin=kmin=1; 若满足对称条件则设为 -2
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
    // 分配 fh：大小 (ex1+3)*(ex2+3)*(ex3+3)
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;
    double *fh = (double*)malloc(fh_size * sizeof(double));
    if (!fh) return; // 内存不足：直接返回（你也可以改成 abort/报错）
    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
    symmetry_bd(3, ex, f, fh, SoA);
    /*
     * Fortran 主循环：
     * do k=1,ex(3)-1
     * do j=1,ex(2)-1
     * do i=1,ex(1)-1
     *
     * 转成 C 0-based：
     * k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2
     *
     * 并且 Fortran 里的 i/j/k 在 fh 访问时，仍然是 Fortran 索引值：
     * iF=i0+1, jF=j0+1, kF=k0+1
     */
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                // ---------------- x direction ----------------
                const double sfx = Sfx[p];
                if (sfx > ZEO) {
                    // Fortran: if(i+3 <= imax)
                    // iF+3 <= ex1  <=> i0+4 <= ex1 <=> i0 <= ex1-4
                    if (i0 <= ex1 - 4) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
                    // elseif(i+2 <= imax)  <=> i0 <= ex1-3
                    else if (i0 <= ex1 - 3) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    }
                    // elseif(i+1 <= imax)  <=> i0 <= ex1-2（循环里总成立）
                    else if (i0 <= ex1 - 2) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                } else if (sfx < ZEO) {
                    // Fortran: if(i-3 >= imin)
                    // (iF-3) >= iminF  <=> (i0-2) >= iminF
                    if ((i0 - 2) >= iminF) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                    // elseif(i-2 >= imin) <=> (i0-1) >= iminF
                    else if ((i0 - 1) >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    }
                    // elseif(i-1 >= imin) <=> i0 >= iminF
                    else if (i0 >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
                }
                // ---------------- y direction ----------------
                const double sfy = Sfy[p];
                if (sfy > ZEO) {
                    // jF+3 <= ex2 <=> j0+4 <= ex2 <=> j0 <= ex2-4
                    if (j0 <= ex2 - 4) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    } else if (j0 <= ex2 - 3) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 <= ex2 - 2) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    }
                } else if (sfy < ZEO) {
                    if ((j0 - 2) >= jminF) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    } else if ((j0 - 1) >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    }
                }
                // ---------------- z direction ----------------
                const double sfz = Sfz[p];
                if (sfz > ZEO) {
                    if (k0 <= ex3 - 4) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    } else if (k0 <= ex3 - 3) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 <= ex3 - 2) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    }
                } else if (sfz < ZEO) {
                    if ((k0 - 2) >= kminF) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    } else if ((k0 - 1) >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    }
                }
            }
        }
    }
    free(fh);
 }
--- a/AMSS_NCKU_source/lopsided_kodis_c.C
+++ b/AMSS_NCKU_source/lopsided_kodis_c.C
@@ -0,0 +1,248 @@
 #include "tool.h"
 /*
 * Combined advection (lopsided) + KO dissipation (kodis).
 * Uses one shared symmetry_bd buffer per call.
 */
 void lopsided_kodis(const int ex[3],
                    const double *X, const double *Y, const double *Z,
                    const double *f, double *f_rhs,
                    const double *Sfx, const double *Sfy, const double *Sfz,
                    int Symmetry, const double SoA[3], double eps)
 {
    const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
    const double F6 = 6.0, F18 = 18.0;
    const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
    const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
    const double cof = 64.0; // 2^6
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    const double d12dx = ONE / F12 / dX;
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
    // fh for Fortran-style domain (-2:ex1,-2:ex2,-2:ex3)
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;
    double *fh = (double*)malloc(fh_size * sizeof(double));
    if (!fh) return;
    symmetry_bd(3, ex, f, fh, SoA);
    // Advection (same stencil logic as lopsided_c.C)
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                const double sfx = Sfx[p];
                if (sfx > ZEO) {
                    if (i0 <= ex1 - 4) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    } else if (i0 <= ex1 - 3) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    } else if (i0 <= ex1 - 2) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                } else if (sfx < ZEO) {
                    if ((i0 - 2) >= iminF) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    } else if ((i0 - 1) >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    } else if (i0 >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
                }
                const double sfy = Sfy[p];
                if (sfy > ZEO) {
                    if (j0 <= ex2 - 4) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    } else if (j0 <= ex2 - 3) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 <= ex2 - 2) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    }
                } else if (sfy < ZEO) {
                    if ((j0 - 2) >= jminF) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    } else if ((j0 - 1) >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    }
                }
                const double sfz = Sfz[p];
                if (sfz > ZEO) {
                    if (k0 <= ex3 - 4) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    } else if (k0 <= ex3 - 3) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 <= ex3 - 2) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    }
                } else if (sfz < ZEO) {
                    if ((k0 - 2) >= kminF) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    } else if ((k0 - 1) >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    }
                }
            }
        }
    }
    // KO dissipation (same domain restriction as kodiss_c.C)
    if (eps > ZEO) {
        const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
        const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
        const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
        const int i0_hi = imaxF - 4; // inclusive
        const int j0_hi = jmaxF - 4;
        const int k0_hi = kmaxF - 4;
        if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
            for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        const double Dx_term =
                            ((fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
                             SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
                             FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
                             TWT *  fh[idx_fh_F(iF,     jF, kF, ex)]) / dX;
                        const double Dy_term =
                            ((fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
                             SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
                             FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
                             TWT *  fh[idx_fh_F(iF, jF,     kF, ex)]) / dY;
                        const double Dz_term =
                            ((fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
                             SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
                             FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
                             TWT *  fh[idx_fh_F(iF, jF, kF,     ex)]) / dZ;
                        f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
                    }
                }
            }
        }
    }
    free(fh);
 }
--- a/AMSS_NCKU_source/macrodef.fh
+++ b/AMSS_NCKU_source/macrodef.fh
@@ -1,83 +1,77 @@
-
+
-
+#define tetradtype 2
-#if 0
+
-note here
+#define Cell
-v:r; u: phi; w: theta
+
-tetradtype 0
+#define ghost_width 3
-v^a = (x,y,z)
+
-orthonormal order: v,u,w
+
-m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of  PRD 75, 124018(2007)
+
-tetradtype 1
+#define GAUGE 0
-orthonormal order: w,u,v
+
-m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of  PRD 85, 124062(2012)
+#define CPBC_ghost_width  (ghost_width)
-tetradtype 2
+
-v_a = (x,y,z)
+#define ABV 0
-orthonormal order: v,u,w
+
-m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of  PRD 75, 124018(2007)
+#define EScalar_CC 2
-#endif
+
-#define tetradtype 2
+#if 0
-
+
-#if 0
+define tetradtype
-note here
+    v:r; u: phi; w: theta
-Cell center or Vertex center
+    tetradtype 0
-#endif
+    v^a = (x,y,z)
-#define Cell
+    orthonormal order: v,u,w
-
+    m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of  PRD 75, 124018(2007)
-#if 0
+    tetradtype 1
-note here
+    orthonormal order: w,u,v
-2nd order: 2
+    m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of  PRD 85, 124062(2012)
-4th order: 3
+    tetradtype 2
-6th order: 4
+    v_a = (x,y,z)
-8th order: 5
+    orthonormal order: v,u,w
-#endif
+    m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of  PRD 75, 124018(2007)
-#define ghost_width 3
+
-
+define Cell or Vertex
-#if 0
+    Cell center or Vertex center
-note here
+
-use shell or not
+define ghost_width
-#endif
+    2nd order: 2
-#define WithShell
+    4th order: 3
-
+    6th order: 4
-#if 0
+    8th order: 5
-note here
+
-use constraint preserving boundary condition or not
+define WithShell
-only affect Z4c
+    use shell or not
-#endif
+
-#define CPBC
+define CPBC
-
+    use constraint preserving boundary condition or not
-#if 0
+    only affect Z4c
-note here
+    CPBC only supports WithShell
-Gauge condition type
+
-0: B^i gauge
+define GAUGE
-1: David's puncture gauge
+    0: B^i gauge
-2: MB B^i gauge
+    1: David puncture gauge
-3: RIT B^i gauge
+    2: MB B^i gauge
-4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
+    3: RIT B^i gauge
-5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
+    4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
-6: MGB1 B^i gauge
+    5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
-7: MGB2 B^i gauge
+    6: MGB1 B^i gauge
-#endif
+    7: MGB2 B^i gauge
-#define GAUGE 2
+
-
+define CPBC_ghost_width  (ghost_width)
-#if 0
+    buffer points for CPBC boundary
-buffer points for CPBC boundary
+
-#endif
+define ABV
-#define CPBC_ghost_width  (ghost_width)
+    0: using BSSN variable for constraint violation and psi4 calculation
-
+    1: using ADM variable for constraint violation and psi4 calculation
-#if 0
+
-using BSSN variable for constraint violation and psi4 calculation: 0
+define EScalar_CC
-using ADM variable for constraint violation and psi4 calculation: 1
+Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
-#endif
+    1: Case C of 1112.3928, V=0
-#define ABV 0
+    2: shell with   phi(r) = phi0 * a2^2/(1+a2^2), f(R) = R+a2*R^2 induced V
-
+    3: ground state of Schrodinger-Newton system,  f(R) = R+a2*R^2 induced V
-#if 0
+    4: a2 = +oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
-Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
+    5: shell with   phi(r) = phi0 * Exp(-(r-r0)**2/sigma), V = 0
-1: Case C of 1112.3928, V=0
+
-2: shell with a2^2*phi0/(1+a2^2), f(R) = R+a2*R^2 induced V
+#endif
-3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
+
 4: a2 = oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
 5: shell with phi(r) = phi0*Exp(-(r-r0)**2/sigma), V = 0
 #endif
 #define EScalar_CC 2
--- a/AMSS_NCKU_source/macrodef.h
+++ b/AMSS_NCKU_source/macrodef.h
@@ -1,112 +1,145 @@
-
+
-#ifndef MICRODEF_H
+#ifndef MICRODEF_H
-#define MICRODEF_H
+#define MICRODEF_H
-
+
-#include "macrodef.fh"
+#include "macrodef.fh"  
-
+
-// application parameters
+// application parameters
-
+
-/// ****
+#define SommerType 0
-// sommerfeld boundary type
+
-// 0: bam, 1: shibata
+#define GaussInt
-#define SommerType 0
+
-
+#define ABEtype 0
-/// ****
+
-// for Using Gauss-Legendre quadrature in theta direction
+//#define With_AHF
-#define GaussInt
+#define Psi4type 0
-
+
-/// ****
+//#define Point_Psi4
-// 0: BSSN vacuum
+
-// 1: coupled to scalar field
+#define RPS 1
-// 2: Z4c vacuum
+
-// 3: coupled to Maxwell field
+#define AGM 0
-//
+
-#define ABEtype 2
+#define RPB 0
-
+
-/// ****
+#define MAPBH 1
-// using Apparent Horizon Finder
+
-//#define With_AHF
+#define PSTR 0
-
+
-/// ****
+#define REGLEV 0
-// Psi4 calculation method
+
-// 0: EB method
+//#define USE_GPU
-// 1: 4-D method
+
-//
+//#define CHECKDETAIL
-#define Psi4type 0
+
-
+//#define FAKECHECK
-/// ****
+
-// for Using point psi4 or not
+//
-//#define Point_Psi4
+// define SommerType
-
+//     sommerfeld boundary type
-/// ****
+//     0: bam
-// RestrictProlong in Step (0) or after Step (1)
+//     1: shibata
-#define RPS 1
+//
-
+// define GaussInt
-/// ****
+//     for Using Gauss-Legendre quadrature in theta direction
-// Enforce algebra constraint
+//
-// for every RK4 sub step: 0
+// define ABEtype
-// only when iter_count == 3: 1
+//     0: BSSN vacuum
-// after routine Step: 2
+//     1: coupled to scalar field
-#define AGM 0
+//     2: Z4c vacuum
-
+//     3: coupled to Maxwell field
-/// ****
+//
-// Restrict Prolong using BAM style 1 or old style 0
+// define With_AHF
-#define RPB 0
+//     using Apparent Horizon Finder
-
+//
-/// ****
+// define Psi4type
-// 1: move Analysis out ot 4 sub steps and treat PBH with Euler method
+//     Psi4 calculation method
-#define MAPBH 1
+//     0: EB method
-
+//     1: 4-D method
-/// ****
+//
-// parallel structure, 0: level by level, 1: considering all levels, 2: as 1 but reverse the CPU order, 3: Frank's scheme
+// define Point_Psi4
-#define PSTR 0
+//     for Using point psi4 or not
-
+//
-/// ****
+// define RPS
-// regrid for every level or for all levels at a time
+//     RestrictProlong in Step (0) or after Step (1)
-// 0: for every level; 1: for all
+//
-#define REGLEV 0
+// define AGM
-
+//     Enforce algebra constraint
-/// ****
+//     for every RK4 sub step: 0
-// use gpu or not
+//     only when iter_count == 3: 1
-//#define USE_GPU
+//     after routine Step: 2
-
+//
-/// ****
+// define RPB
-// use checkpoint for every process
+//     Restrict Prolong using BAM style 1 or old style 0
-//#define CHECKDETAIL
+//
-
+// define MAPBH
-/// ****
+//     1: move Analysis out of 4 sub steps and treat PBH with Euler method
-// use FakeCheckPrepare to write CheckPoint
+//
-//#define FAKECHECK
+// define PSTR
-////================================================================
+//     parallel structure
-//  some basic parameters for numerical calculation
+//     0: level by level
-#define dim 3
+//     1: considering all levels
-
+//     2: as 1 but reverse the CPU order
-//#define Cell or Vertex in "microdef.fh"
+//     3: Frank's scheme
-
+//
-// ******
+// define REGLEV
-// buffer point number for mesh refinement interface
+//     regrid for every level or for all levels at a time
-#define buffer_width 6
+//     0: for every level;
-
+//     1: for all
-// ******
+//
-// buffer point number shell-box interface, on shell
+// define USE_GPU
-#define SC_width buffer_width
+//     use gpu or not
-// buffer point number shell-box interface, on box
+//
-#define CS_width (2*buffer_width)
+// define CHECKDETAIL
-
+//     use checkpoint for every process
-#if(buffer_width < ghost_width)
+//
-#error we always assume buffer_width>ghost_width
+// define FAKECHECK
-#endif
+//     use FakeCheckPrepare to write CheckPoint
-
+//
-#define PACK 1
+
-#define UNPACK 2
+////================================================================
-
+//  some basic parameters for numerical calculation
-#define Mymax(a,b) (((a) > (b)) ? (a) : (b))
+////================================================================
-#define Mymin(a,b) (((a) < (b)) ? (a) : (b))
+
-
+#define dim 3
-#define feq(a,b,d) (fabs(a-b)<d)
+
-#define flt(a,b,d) ((a-b)<d)
+//#define Cell or Vertex in "macrodef.fh" 
-#define fgt(a,b,d) ((a-b)>d)
+
-
+#define buffer_width 6
-#define TINY 1e-10
+
-
+#define SC_width buffer_width
-#endif   /* MICRODEF_H */
+
 #define CS_width (2*buffer_width)
 //
 // define Cell or Vertex in "macrodef.fh" 
 //
 // define buffer_width
 //     buffer point number for mesh refinement interface
 //
 // define SC_width buffer_width
 //     buffer point number shell-box interface, on shell
 //
 // define CS_width
 //     buffer point number shell-box interface, on box
 //
 #if(buffer_width < ghost_width)
 #   error we always assume buffer_width>ghost_width
 #endif
 #define PACK 1
 #define UNPACK 2
 #define Mymax(a,b) (((a) > (b)) ? (a) : (b))
 #define Mymin(a,b) (((a) < (b)) ? (a) : (b))
 #define feq(a,b,d) (fabs(a-b)<d)
 #define flt(a,b,d) ((a-b)<d)
 #define fgt(a,b,d) ((a-b)>d)
 #define TINY 1e-10
 #endif   /* MICRODEF_H */
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -1,6 +1,35 @@
-include makefile.inc
+include makefile.inc
 ## polint(ordn=6) kernel selector:
 ##   1 (default): barycentric fast path
 ##   0          : fallback to Neville path
 POLINT6_USE_BARY ?= 1
 POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
 ## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
 ##   make                        -> opt  (PGO-guided, maximum performance)
 ##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)
 PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
 ifeq ($(PGO_MODE),instrument)
 ## Phase 1: instrumentation — omits -fp-model fast=2 for numerical stability (NOTE: -ipo is still passed below, despite the original intent to drop it for a faster build)
 CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 else
 ## opt (default): maximum-performance build.
 ## PGO (-fprofile-instr-use=$(PROFDATA)) has been turned off: testing showed it to be a negative optimization.
 ## INTERP_LB_FLAGS has been turned off too: testing showed it to be a negative optimization.
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 endif
 .SUFFIXES: .o .f90 .C .for .cu
@@ -16,19 +45,65 @@ include makefile.inc
 .cu.o:
 	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
 # C rewrite of BSSN RHS kernel and helpers
 bssn_rhs_c.o: bssn_rhs_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 fderivs_c.o: fderivs_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 fdderivs_c.o: fdderivs_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 kodiss_c.o: kodiss_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 lopsided_c.o: lopsided_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 lopsided_kodis_c.o: lopsided_kodis_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
 #	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
 TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
 TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=$(TP_PROFDATA) \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
 TwoPunctures.o: TwoPunctures.C
-	${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
 TwoPunctureABE.o: TwoPunctureABE.C
-	${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
 # Input files
 ## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
 ifeq ($(USE_CXX_KERNELS),0)
 # Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
 CFILES =
 else
 # C++ mode (any USE_CXX_KERNELS value other than 0): C rewrite of bssn_rhs and helper kernels
 CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
 endif
 ## RK4 kernel switch (independent from USE_CXX_KERNELS)
 ifeq ($(USE_CXX_RK4),1)
 CFILES += rungekutta4_rout_c.o
 RK4_F90_OBJ =
 else
 RK4_F90_OBJ = rungekutta4_rout.o
 endif
 C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
 	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
 	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
-	   NullShellPatch2_Evo.o writefile_f.o
+	   NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
 C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o surface_integral.o ShellPatch.o\
@@ -38,12 +113,12 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
 	   NullShellPatch2_Evo.o \
 	   bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
-F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
+F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
-	   prolongrestrict_cell.o prolongrestrict_vertex.o\
+	   prolongrestrict_cell.o prolongrestrict_vertex.o\
-	   rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
+	   $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
-	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
+	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
-	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
+	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
-           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
+           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
           fadmquantites_bssn.o Z4c_rhs.o Z4c_rhs_ss.o point_diff_new_sh.o\
 	   cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\
 	   getnpem2.o empart.o NullNews.o fourdcurvature.o\
@@ -51,6 +126,14 @@ F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\
 	   NullNews2.o tool_f.o
 ifeq ($(USE_CXX_KERNELS),0)
 # Fortran mode: include original bssn_rhs.o
 F90FILES = $(F90FILES_BASE) bssn_rhs.o
 else
 # C++ mode (any USE_CXX_KERNELS value other than 0): bssn_rhs.o replaced by C++ kernel
 F90FILES = $(F90FILES_BASE)
 endif
 F77FILES = zbesh.o
 AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \
@@ -63,7 +146,7 @@ TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
 CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
 # file dependences
-$(C++FILES) $(C++FILESGPU) $(F90FILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
+$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
 $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
 	     misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
@@ -86,7 +169,7 @@ $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h
 $(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
-$(C++FILES) $(C++FILES_GPU) $(AHFDOBJS) $(CUDAFILES): macrodef.h
+$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
 # Rebuild the TwoPuncture objects whenever TwoPunctures.h changes.
 # The variable must be expanded; a bare "TwoPunctureFILES" would only declare an unrelated literal target.
 $(TwoPunctureFILES): TwoPunctures.h
@@ -95,14 +178,14 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
 misc.o : zbesh.o
 # projects
-ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) 
+ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
+	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
-ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
+ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
+	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
 TwoPunctureABE: $(TwoPunctureFILES)
-	$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
+	$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
 clean:
 	rm *.o ABE ABEGPU TwoPunctureABE make.log -f
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -8,23 +8,56 @@ filein  = -I/usr/include/ -I${MKLROOT}/include
 ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
+LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
 ## Memory allocator switch
 ##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
 ##   0           : use system default allocator (ptmalloc)
 USE_TBBMALLOC ?= 1
 TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
 ifneq ($(wildcard $(TBBMALLOC_SO)),)
 TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
 else
 TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
 endif
 ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
 ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
 ##   opt        : (default) maximum performance with PGO profile-guided optimization
 ##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
 PGO_MODE ?= opt
 ## Interp_Points load balance profiling mode
 ##   off        : (default) no load balance instrumentation
 ##   profile    : Pass 1 — instrument Interp_Points to collect timing profile
 ##   optimize   : Pass 2 — read profile and apply block rebalancing
 INTERP_LB_MODE ?= off
 ifeq ($(INTERP_LB_MODE),profile)
 INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
 else ifeq ($(INTERP_LB_MODE),optimize)
 INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
 else
 INTERP_LB_FLAGS =
 endif
 ## Kernel implementation switch
 ##   1           : use C++ rewrite of bssn_rhs and helper kernels (faster)
 ##   0 (default) : fall back to original Fortran kernels
 USE_CXX_KERNELS ?= 0
 ## RK4 kernel implementation switch
 ##   1           : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
 ##   0 (default) : use original Fortran rungekutta4_rout.o
 USE_CXX_RK4 ?= 0
 ## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization)
 ## -fprofile-instr-use: use collected profile data to guide optimization decisions
 ##   (branch prediction, basic block layout, inlining, loop unrolling)
 PROFDATA     = ../../pgo_profile/default.profdata
 CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -ipo \
               -fprofile-instr-use=$(PROFDATA) \
               -Dfortran3 -Dnewc -I${MKLROOT}/include
 f90appflags  = -O3 -xHost -fp-model fast=2 -fma -ipo \
               -fprofile-instr-use=$(PROFDATA) \
               -align array64byte -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx
 CXX          = icpx
 CC           = icx
-CLINKER      = mpiicpx 
+CLINKER      = mpiicpx
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
--- a/AMSS_NCKU_source/prolongrestrict_cell.f90
+++ b/AMSS_NCKU_source/prolongrestrict_cell.f90
@@ -1934,18 +1934,35 @@
 ! when if=1 -> ic=0, this is different to vertex center grid 
  real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3))   :: funcc
  integer,dimension(3) :: cxI
-  integer :: i,j,k,ii,jj,kk
+  integer :: i,j,k,ii,jj,kk,px,py,pz
  real*8, dimension(6,6) :: tmp2
  real*8, dimension(6) :: tmp1
  integer, dimension(extf(1)) :: cix
  integer, dimension(extf(2)) :: ciy
  integer, dimension(extf(3)) :: ciz
  integer, dimension(extf(1)) :: pix
  integer, dimension(extf(2)) :: piy
  integer, dimension(extf(3)) :: piz
  real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3
  real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3
  real*8, dimension(6,2), parameter :: WC = reshape((/&
      C1,C2,C3,C4,C5,C6,&
      C6,C5,C4,C3,C2,C1/), (/6,2/))
  integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi
  integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo
  integer::maxcx,maxcy,maxcz
  real*8,dimension(3) :: CD,FD
-  
+  real*8 :: tmp_yz(extc(1), 6)      ! holds the Z-interpolated results for the 6 Y offsets along a whole X line
  real*8 :: tmp_xyz_line(-2:extc(1))   ! lower bound -2 covers the 6-point X stencil accesses
  real*8 :: v1, v2, v3, v4, v5, v6
  integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max,kc_min,kc_max
  integer :: i_lo, i_hi, j_lo, j_hi, k_lo, k_hi
  logical :: need_full_symmetry
  real*8 :: res_line
  real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2))  ! lower bounds -2 cover the Y/X stencil accesses
  if(wei.ne.3)then
     write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
     write(*,*)"dim = ",wei
@@ -2020,145 +2037,140 @@
          return
  endif
-  call symmetry_bd(3,extc,func,funcc,SoA)
+  do i = imino,imaxo
-     
+     ii = i + lbf(1) - 1
-!~~~~~~> prolongation start...
+     cix(i) = ii/2 - lbc(1) + 1
     if(ii/2*2 == ii)then
        pix(i) = 1
     else
        pix(i) = 2
     endif
  enddo
  do j = jmino,jmaxo
     jj = j + lbf(2) - 1
     ciy(j) = jj/2 - lbc(2) + 1
     if(jj/2*2 == jj)then
        piy(j) = 1
     else
        piy(j) = 2
     endif
  enddo
  do k = kmino,kmaxo
-   do j = jmino,jmaxo
+     kk = k + lbf(3) - 1
-    do i = imino,imaxo
+     ciz(k) = kk/2 - lbc(3) + 1
-       cxI(1) = i
+     if(kk/2*2 == kk)then
-       cxI(2) = j
+        piz(k) = 1
-       cxI(3) = k
+     else
-! change to coarse level reference
+        piz(k) = 2
-!|---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*---| 
+     endif
 !|=======x===============x===============x===============x=======|
       cxI = (cxI+lbf-1)/2
 ! change to array index      
       cxI = cxI - lbc + 1
       if(any(cxI+3 > extc)) write(*,*)"error in prolong"
       ii=i+lbf(1)-1
       jj=j+lbf(2)-1
       kk=k+lbf(3)-1
 #if 0
       if(ii/2*2==ii)then
         if(jj/2*2==jj)then
           if(kk/2*2==kk)then
             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
           else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
             funf(i,j,k)=  C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
           endif
         else
           if(kk/2*2==kk)then
             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
           else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
             funf(i,j,k)=  C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
           endif
         endif
       else
         if(jj/2*2==jj)then
           if(kk/2*2==kk)then               
             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
           else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
             funf(i,j,k)=  C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
           endif
         else
           if(kk/2*2==kk)then
             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
           else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
             funf(i,j,k)=  C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
           endif
         endif
       endif
 #else 
       if(kk/2*2==kk)then
             tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
       else
             tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
                   C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
                   C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)  )+&
                   C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
                   C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
                   C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
       endif
       if(jj/2*2==jj)then
             tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
       else
             tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
       endif
       if(ii/2*2==ii)then
             funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
       else
             funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
       endif
 #endif
    enddo
   enddo
  enddo
  ic_min = minval(cix(imino:imaxo))
  ic_max = maxval(cix(imino:imaxo))
  jc_min = minval(ciy(jmino:jmaxo))
  jc_max = maxval(ciy(jmino:jmaxo))
  kc_min = minval(ciz(kmino:kmaxo))
  kc_max = maxval(ciz(kmino:kmaxo))
  maxcx = ic_max
  maxcy = jc_max
  maxcz = kc_max
  if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
     write(*,*)"error in prolong"
     return
  endif
  i_lo = ic_min - 2
  i_hi = ic_max + 3
  j_lo = jc_min - 2
  j_hi = jc_max + 3
  k_lo = kc_min - 2
  k_hi = kc_max + 3
  need_full_symmetry = (i_lo < 1) .or. (j_lo < 1) .or. (k_lo < 1)
  if(need_full_symmetry)then
     call symmetry_bd(3,extc,func,funcc,SoA)
  else
     funcc(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi) = func(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi)
  endif
     ! For each fine-grid k (pz and kc are then fixed), precompute a 2D slab of Z-interpolated values
 do k = kmino, kmaxo
    pz = piz(k); kc = ciz(k)
    ! --- Pass 1: Z direction, computed only once per k ---
    do iy = jc_min-2, jc_max+3   ! only the iy range that is needed (covers jc-2:jc+3)
        do ii = ic_min-2, ic_max+3  ! only the ii range that is needed (covers cix-2:cix+3)
            tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
        end do
    end do
    do j = jmino, jmaxo
        py = piy(j); jc = ciy(j)
        ! --- Pass 2: Y direction ---
        do ii = ic_min-2, ic_max+3
            tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
        end do
        ! --- Pass 3: X direction ---
        do i = imino, imaxo
            funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3))
        end do
    end do
 end do
 !~~~~~~> prolongation start...
 #if 0
 do k = kmino, kmaxo
     pz = piz(k)
     kc = ciz(k)
     do j = jmino, jmaxo
        py = piy(j)
        jc = ciy(j)
 ! --- 步骤 1 & 2 融合：分段处理 X 轴，提升 Cache 命中率 ---
        ! 我们将 ii 循环逻辑重组，减少对 funcc 的跨行重复访问
        do ii = 1, extc(1)
           ! 1. 先做 Z 方向的 6 条线插值（针对当前的 ii 和当前的 6 个 iy）
           ! 我们直接在这里把 Y 方向的加权也做了，省去 tmp_yz 数组
           ! 这样 funcc 的数据读进来后立即完成所有维度的贡献，不再写回内存
           res_line = 0.0d0
           do jj = 1, 6
              iy = jc - 3 + jj
              ! 这一行代码是核心：一次性完成 Z 插值并加上 Y 的权重
              ! 编译器会把 WC(jj, py) 存在寄存器里
              res_line = res_line + WC(jj, py) * ( &
                         WC(1, pz) * funcc(ii, iy, kc-2) + &
                         WC(2, pz) * funcc(ii, iy, kc-1) + &
                         WC(3, pz) * funcc(ii, iy, kc  ) + &
                         WC(4, pz) * funcc(ii, iy, kc+1) + &
                         WC(5, pz) * funcc(ii, iy, kc+2) + &
                         WC(6, pz) * funcc(ii, iy, kc+3) )
           end do
           tmp_xyz_line(ii) = res_line
        end do
        ! 3. 【降维：X 向】最后在最内层只处理 X 方向的 6 点加权
        ! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次
        do i = imino, imaxo
           px = pix(i)
           ic = cix(i)
           ! 直接从预计算好的 line 中读取连续的 6 个点
           ! ic-2 到 ic+3 对应原始 6 点算子
           funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + &
                         WC(2,px)*tmp_xyz_line(ic-1) + &
                         WC(3,px)*tmp_xyz_line(ic  ) + &
                         WC(4,px)*tmp_xyz_line(ic+1) + &
                         WC(5,px)*tmp_xyz_line(ic+2) + &
                         WC(6,px)*tmp_xyz_line(ic+3)
        end do
     end do
  end do
 #endif
  return
  end subroutine prolong3
@@ -2357,7 +2369,14 @@
  integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo
  real*8,dimension(3) :: CD,FD
-  
+
  real*8 :: tmp_xz_plane(-1:extf(1), 6)
  real*8 :: tmp_x_line(-1:extf(1))
  integer :: fi, fj, fk, ii, jj, kk
  integer :: fi_min, fi_max, ii_lo, ii_hi
  integer :: fj_min, fj_max, fk_min, fk_max, jj_lo, jj_hi, kk_lo, kk_hi
  logical :: need_full_symmetry
  if(wei.ne.3)then
     write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
     write(*,*)"dim = ",wei
@@ -2436,9 +2455,86 @@
          stop
  endif
-  call symmetry_bd(2,extf,funf,funff,SoA)
+  ! 仅计算 X 向最终写回所需的窗口：
  ! func(i,j,k) 只访问 tmp_x_line(fi-2:fi+3)
  fi_min = 2*(imino + lbc(1) - 1) - 1 - lbf(1) + 1
  fi_max = 2*(imaxo + lbc(1) - 1) - 1 - lbf(1) + 1
  fj_min = 2*(jmino + lbc(2) - 1) - 1 - lbf(2) + 1
  fj_max = 2*(jmaxo + lbc(2) - 1) - 1 - lbf(2) + 1
  fk_min = 2*(kmino + lbc(3) - 1) - 1 - lbf(3) + 1
  fk_max = 2*(kmaxo + lbc(3) - 1) - 1 - lbf(3) + 1
  ii_lo = fi_min - 2
  ii_hi = fi_max + 3
  jj_lo = fj_min - 2
  jj_hi = fj_max + 3
  kk_lo = fk_min - 2
  kk_hi = fk_max + 3
  if(ii_lo < -1 .or. ii_hi > extf(1) .or. &
     jj_lo < -1 .or. jj_hi > extf(2) .or. &
     kk_lo < -1 .or. kk_hi > extf(3))then
      write(*,*)"restrict3: invalid stencil window"
      write(*,*)"ii=",ii_lo,ii_hi," jj=",jj_lo,jj_hi," kk=",kk_lo,kk_hi
      write(*,*)"extf=",extf
      stop
  endif
  need_full_symmetry = (ii_lo < 1) .or. (jj_lo < 1) .or. (kk_lo < 1)
  if(need_full_symmetry)then
      call symmetry_bd(2,extf,funf,funff,SoA)
  else
      funff(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi) = funf(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi)
  endif
 !~~~~~~> restriction start...
 do k = kmino, kmaxo
    fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1
    do j = jmino, jmaxo
        fj = 2*(j + lbc(2) - 1) - 1 - lbf(2) + 1
        ! 优化点 1: 显式展开 Z 方向计算，减少循环开销
        ! 确保 ii 循环是最内层且连续访问
        !DIR$ VECTOR ALWAYS
        do ii = ii_lo, ii_hi
            ! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
            ! 这里直接硬编码 jj 的偏移，彻底消除一层循环
            tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
                                  C2*(funff(ii,fj-2,fk-1)+funff(ii,fj-2,fk+2)) + &
                                  C3*(funff(ii,fj-2,fk  )+funff(ii,fj-2,fk+1))
            tmp_xz_plane(ii, 2) = C1*(funff(ii,fj-1,fk-2)+funff(ii,fj-1,fk+3)) + &
                                  C2*(funff(ii,fj-1,fk-1)+funff(ii,fj-1,fk+2)) + &
                                  C3*(funff(ii,fj-1,fk  )+funff(ii,fj-1,fk+1))
            tmp_xz_plane(ii, 3) = C1*(funff(ii,fj  ,fk-2)+funff(ii,fj  ,fk+3)) + &
                                  C2*(funff(ii,fj  ,fk-1)+funff(ii,fj  ,fk+2)) + &
                                  C3*(funff(ii,fj  ,fk  )+funff(ii,fj  ,fk+1))
            tmp_xz_plane(ii, 4) = C1*(funff(ii,fj+1,fk-2)+funff(ii,fj+1,fk+3)) + &
                                  C2*(funff(ii,fj+1,fk-1)+funff(ii,fj+1,fk+2)) + &
                                  C3*(funff(ii,fj+1,fk  )+funff(ii,fj+1,fk+1))
            tmp_xz_plane(ii, 5) = C1*(funff(ii,fj+2,fk-2)+funff(ii,fj+2,fk+3)) + &
                                  C2*(funff(ii,fj+2,fk-1)+funff(ii,fj+2,fk+2)) + &
                                  C3*(funff(ii,fj+2,fk  )+funff(ii,fj+2,fk+1))
            tmp_xz_plane(ii, 6) = C1*(funff(ii,fj+3,fk-2)+funff(ii,fj+3,fk+3)) + &
                                  C2*(funff(ii,fj+3,fk-1)+funff(ii,fj+3,fk+2)) + &
                                  C3*(funff(ii,fj+3,fk  )+funff(ii,fj+3,fk+1))
        end do
        ! 优化点 2: 同样向量化 Y 方向压缩
        !DIR$ VECTOR ALWAYS
        do ii = ii_lo, ii_hi
            tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
                            C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
                            C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
        end do
        ! 优化点 3: 最终写入，利用已经缓存在 tmp_x_line 的数据
        do i = imino, imaxo
            fi = 2*(i + lbc(1) - 1) - 1 - lbf(1) + 1
            func(i, j, k) = C1*(tmp_x_line(fi-2) + tmp_x_line(fi+3)) + &
                            C2*(tmp_x_line(fi-1) + tmp_x_line(fi+2)) + &
                            C3*(tmp_x_line(fi  ) + tmp_x_line(fi+1))
        end do
    end do
 end do
 #if 0
  do k = kmino,kmaxo
   do j = jmino,jmaxo
    do i = imino,imaxo
@@ -2462,7 +2558,7 @@
    enddo
   enddo
  enddo
-  
+#endif
  return
  end subroutine restrict3
--- a/AMSS_NCKU_source/rungekutta4_rout_c.C
+++ b/AMSS_NCKU_source/rungekutta4_rout_c.C
@@ -0,0 +1,212 @@
 #include "rungekutta4_rout.h"
 #include <cstdio>
 #include <cstdlib>
 #include <cstddef>
 #include <complex>
 #include <immintrin.h>
 namespace {
 /*
  * RK4 stage-0 update: f1[i] = f0[i] + c * frhs[i] for i in [0, n).
  *
  * n     number of doubles in each array
  * f0    read-only input array
  * frhs  read-only input array
  * f1    output array (declared __restrict against the inputs)
  * c     scalar multiplier applied to frhs
  *
  * The SIMD paths use unaligned loads/stores, so no alignment is required.
  * The 256-bit path calls an FMA intrinsic, so it is compiled only when both
  * __AVX2__ and __FMA__ are defined; otherwise the scalar loop does all the work.
  */
 inline void rk4_stage0(std::size_t n,
                       const double *__restrict f0,
                       const double *__restrict frhs,
                       double *__restrict f1,
                       double c) {
    std::size_t pos = 0;
 #if defined(__AVX512F__)
    const __m512d scale = _mm512_set1_pd(c);
    while (pos + 7 < n) {
        const __m512d base = _mm512_loadu_pd(f0 + pos);
        const __m512d rhs = _mm512_loadu_pd(frhs + pos);
        _mm512_storeu_pd(f1 + pos, _mm512_fmadd_pd(scale, rhs, base));
        pos += 8;
    }
 #elif defined(__AVX2__) && defined(__FMA__)
    const __m256d scale = _mm256_set1_pd(c);
    while (pos + 3 < n) {
        const __m256d base = _mm256_loadu_pd(f0 + pos);
        const __m256d rhs = _mm256_loadu_pd(frhs + pos);
        _mm256_storeu_pd(f1 + pos, _mm256_fmadd_pd(scale, rhs, base));
        pos += 4;
    }
 #endif
    /* Scalar loop: handles the SIMD remainder, or every element when no SIMD path is built. */
 #pragma ivdep
    for (; pos < n; ++pos) {
        f1[pos] = f0[pos] + c * frhs[pos];
    }
 }
 /*
  * In-place accumulation: frhs[i] = frhs[i] + 2.0 * f1[i] for i in [0, n).
  *
  * n     number of doubles in each array
  * f1    read-only input array
  * frhs  array updated in place
  *
  * The 256-bit path calls an FMA intrinsic, so it is compiled only when both
  * __AVX2__ and __FMA__ are defined; otherwise the scalar loop does all the work.
  */
 inline void rk4_rhs_accum(std::size_t n,
                          const double *__restrict f1,
                          double *__restrict frhs) {
    std::size_t pos = 0;
 #if defined(__AVX512F__)
    const __m512d two = _mm512_set1_pd(2.0);
    while (pos + 7 < n) {
        const __m512d cur = _mm512_loadu_pd(f1 + pos);
        const __m512d acc = _mm512_loadu_pd(frhs + pos);
        _mm512_storeu_pd(frhs + pos, _mm512_fmadd_pd(two, cur, acc));
        pos += 8;
    }
 #elif defined(__AVX2__) && defined(__FMA__)
    const __m256d two = _mm256_set1_pd(2.0);
    while (pos + 3 < n) {
        const __m256d cur = _mm256_loadu_pd(f1 + pos);
        const __m256d acc = _mm256_loadu_pd(frhs + pos);
        _mm256_storeu_pd(frhs + pos, _mm256_fmadd_pd(two, cur, acc));
        pos += 4;
    }
 #endif
    /* Scalar loop: handles the SIMD remainder, or every element when no SIMD path is built. */
 #pragma ivdep
    for (; pos < n; ++pos) {
        frhs[pos] = frhs[pos] + 2.0 * f1[pos];
    }
 }
 /*
  * In-place update: f1[i] = f0[i] + c * f1[i] for i in [0, n).
  *
  * n   number of doubles in each array
  * f0  read-only input array
  * f1  array read and then overwritten
  * c   scalar multiplier applied to the old f1 value
  *
  * The 256-bit path calls an FMA intrinsic, so it is compiled only when both
  * __AVX2__ and __FMA__ are defined; otherwise the scalar loop does all the work.
  */
 inline void rk4_f1_from_f0_f1(std::size_t n,
                              const double *__restrict f0,
                              double *__restrict f1,
                              double c) {
    std::size_t pos = 0;
 #if defined(__AVX512F__)
    const __m512d scale = _mm512_set1_pd(c);
    while (pos + 7 < n) {
        const __m512d base = _mm512_loadu_pd(f0 + pos);
        const __m512d cur = _mm512_loadu_pd(f1 + pos);
        _mm512_storeu_pd(f1 + pos, _mm512_fmadd_pd(scale, cur, base));
        pos += 8;
    }
 #elif defined(__AVX2__) && defined(__FMA__)
    const __m256d scale = _mm256_set1_pd(c);
    while (pos + 3 < n) {
        const __m256d base = _mm256_loadu_pd(f0 + pos);
        const __m256d cur = _mm256_loadu_pd(f1 + pos);
        _mm256_storeu_pd(f1 + pos, _mm256_fmadd_pd(scale, cur, base));
        pos += 4;
    }
 #endif
    /* Scalar loop: handles the SIMD remainder, or every element when no SIMD path is built. */
 #pragma ivdep
    for (; pos < n; ++pos) {
        f1[pos] = f0[pos] + c * f1[pos];
    }
 }
 /*
  * Final-stage update: f1[i] = f0[i] + c * (f1[i] + frhs[i]) for i in [0, n).
  *
  * n     number of doubles in each array
  * f0    read-only input array
  * f1    array read and then overwritten
  * frhs  read-only input array
  * c     scalar multiplier applied to (f1 + frhs)
  *
  * The 256-bit path calls an FMA intrinsic, so it is compiled only when both
  * __AVX2__ and __FMA__ are defined; otherwise the scalar loop does all the work.
  */
 inline void rk4_stage3(std::size_t n,
                       const double *__restrict f0,
                       double *__restrict f1,
                       const double *__restrict frhs,
                       double c) {
    std::size_t pos = 0;
 #if defined(__AVX512F__)
    const __m512d scale = _mm512_set1_pd(c);
    while (pos + 7 < n) {
        const __m512d base = _mm512_loadu_pd(f0 + pos);
        const __m512d cur = _mm512_loadu_pd(f1 + pos);
        const __m512d rhs = _mm512_loadu_pd(frhs + pos);
        _mm512_storeu_pd(f1 + pos, _mm512_fmadd_pd(scale, _mm512_add_pd(cur, rhs), base));
        pos += 8;
    }
 #elif defined(__AVX2__) && defined(__FMA__)
    const __m256d scale = _mm256_set1_pd(c);
    while (pos + 3 < n) {
        const __m256d base = _mm256_loadu_pd(f0 + pos);
        const __m256d cur = _mm256_loadu_pd(f1 + pos);
        const __m256d rhs = _mm256_loadu_pd(frhs + pos);
        _mm256_storeu_pd(f1 + pos, _mm256_fmadd_pd(scale, _mm256_add_pd(cur, rhs), base));
        pos += 4;
    }
 #endif
    /* Scalar loop: handles the SIMD remainder, or every element when no SIMD path is built. */
 #pragma ivdep
    for (; pos < n; ++pos) {
        f1[pos] = f0[pos] + c * (f1[pos] + frhs[pos]);
    }
 }
 } // namespace
 extern "C" {
 /*
  * Single-value RK4 stage update; all arguments are passed by reference.
  *
  *   RK4 == 0 : f1 = f0 + 0.5*dT*f_rhs
  *   RK4 == 1 : f_rhs += 2*f1, then f1 = f0 + 0.5*dT*f1 (using the old f1)
  *   RK4 == 2 : f_rhs += 2*f1, then f1 = f0 + dT*f1     (using the old f1)
  *   RK4 == 3 : f1 = f0 + (1/6)*dT*(f1 + f_rhs)
  *
  * Any other stage value prints a diagnostic to stderr and aborts.
  */
 void f_rungekutta4_scalar(double &dT, double &f0, double &f1, double &f_rhs, int &RK4) {
    const int stage = RK4;
    if (stage < 0 || stage > 3) {
        std::fprintf(stderr, "rungekutta4_scalar_c: invalid RK4 stage %d\n", stage);
        std::abort();
    }
    if (stage == 0) {
        f1 = f0 + 0.5 * dT * f_rhs;
        return;
    }
    if (stage == 3) {
        f1 = f0 + (1.0 / 6.0) * dT * (f1 + f_rhs);
        return;
    }
    /* Stages 1 and 2 share the accumulation; only the step factor differs. */
    f_rhs = f_rhs + 2.0 * f1;
    if (stage == 1) {
        f1 = f0 + 0.5 * dT * f1;
    } else {
        f1 = f0 + dT * f1;
    }
 }
 /*
  * Complex-valued counterpart of the single-value RK4 stage update.
  *
  *   RK4 == 0 : f1 = f0 + 0.5*dT*f_rhs
  *   RK4 == 1 : f_rhs += 2*f1, then f1 = f0 + 0.5*dT*f1 (using the old f1)
  *   RK4 == 2 : f_rhs += 2*f1, then f1 = f0 + dT*f1     (using the old f1)
  *   RK4 == 3 : f1 = f0 + (1/6)*dT*(f1 + f_rhs)
  *
  * Any other stage value prints a diagnostic to stderr and aborts.
  */
 void rungekutta4_cplxscalar_(double &dT,
                             std::complex<double> &f0,
                             std::complex<double> &f1,
                             std::complex<double> &f_rhs,
                             int &RK4) {
    const int stage = RK4;
    if (stage < 0 || stage > 3) {
        std::fprintf(stderr, "rungekutta4_cplxscalar_c: invalid RK4 stage %d\n", stage);
        std::abort();
    }
    if (stage == 0) {
        f1 = f0 + 0.5 * dT * f_rhs;
        return;
    }
    if (stage == 3) {
        f1 = f0 + (1.0 / 6.0) * dT * (f1 + f_rhs);
        return;
    }
    /* Stages 1 and 2 share the accumulation; only the step factor differs. */
    f_rhs = f_rhs + 2.0 * f1;
    if (stage == 1) {
        f1 = f0 + 0.5 * dT * f1;
    } else {
        f1 = f0 + dT * f1;
    }
 }
 /*
  * Array RK4 stage update over ex[0]*ex[1]*ex[2] contiguous doubles.
  *
  *   RK4 == 0 : f1 = f0 + 0.5*dT*f_rhs
  *   RK4 == 1 : f_rhs += 2*f1, then f1 = f0 + 0.5*dT*f1
  *   RK4 == 2 : f_rhs += 2*f1, then f1 = f0 + dT*f1
  *   RK4 == 3 : f1 = f0 + (1/6)*dT*(f1 + f_rhs)
  *
  * The three arrays are handed to the kernels as __restrict pointers, so the
  * caller must pass non-overlapping storage. An out-of-range stage prints a
  * diagnostic and aborts. Always returns 0.
  */
 int f_rungekutta4_rout(int *ex, double &dT,
                       double *f0, double *f1, double *f_rhs,
                       int &RK4) {
    std::size_t count = static_cast<std::size_t>(ex[0]);
    count *= static_cast<std::size_t>(ex[1]);
    count *= static_cast<std::size_t>(ex[2]);
    const double *const __restrict base = f0;
    double *const __restrict cur = f1;
    double *const __restrict rhs = f_rhs;
    const int stage = RK4;
    if (__builtin_expect(stage < 0 || stage > 3, 0)) {
        std::fprintf(stderr, "rungekutta4_rout_c: invalid RK4 stage %d\n", stage);
        std::abort();
    }
    if (stage == 0) {
        rk4_stage0(count, base, rhs, cur, 0.5 * dT);
    } else if (stage == 3) {
        rk4_stage3(count, base, cur, rhs, (1.0 / 6.0) * dT);
    } else {
        /* Stages 1 and 2: accumulate into f_rhs first, then advance f1. */
        rk4_rhs_accum(count, cur, rhs);
        if (stage == 1) {
            rk4_f1_from_f0_f1(count, base, cur, 0.5 * dT);
        } else {
            rk4_f1_from_f0_f1(count, base, cur, dT);
        }
    }
    return 0;
 }
 } // extern "C"
--- a/AMSS_NCKU_source/share_func.h
+++ b/AMSS_NCKU_source/share_func.h
@@ -0,0 +1,246 @@
 #ifndef SHARE_FUNC_H
 #define SHARE_FUNC_H
 #include <stdlib.h>
 #include <stddef.h>
 #include <math.h>
 #include <stdio.h>
 #include <string.h>
 /* Main grid: 0-based (i0, j0, k0) -> linear offset, i0 varying fastest. */
 static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
    const size_t row_len = (size_t)ex[0];
    const size_t rows_per_plane = (size_t)ex[1];
    /* Horner form of i0 + j0*ex1 + k0*ex1*ex2. */
    return ((size_t)k0 * rows_per_plane + (size_t)j0) * row_len + (size_t)i0;
 }
 /*
  * fh corresponds to Fortran: fh(-1:ex1, -1:ex2, -1:ex3)
  * ord=2 => shift=1
  * iF/jF/kF are Fortran indices (may be -1, 0, 1..ex)
  */
 static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
    const int row_len = ex[0] + 2;     /* ex1 + ord */
    const int rows_per_plane = ex[1] + 2;
    const int ci = iF + 1;             /* 0..ex1+1 */
    const int cj = jF + 1;             /* 0..ex2+1 */
    const int ck = kF + 1;             /* 0..ex3+1 */
    /* Horner form of ci + cj*nx + ck*nx*ny. */
    return ((size_t)ck * (size_t)rows_per_plane + (size_t)cj) * (size_t)row_len + (size_t)ci;
 }
 /*
  * fh corresponds to Fortran: fh(-2:ex1, -2:ex2, -2:ex3)
  * ord=3 => shift=2
  * iF/jF/kF are Fortran indices (may be negative)
  */
 static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
    const int row_len = ex[0] + 3;     /* ex1 + ord */
    const int rows_per_plane = ex[1] + 3;
    const int ci = iF + 2;             /* 0..ex1+2 */
    const int cj = jF + 2;             /* 0..ex2+2 */
    const int ck = kF + 2;             /* 0..ex3+2 */
    /* Horner form of ci + cj*nx + ck*nx*ny. */
    return ((size_t)ck * (size_t)rows_per_plane + (size_t)cj) * (size_t)row_len + (size_t)ci;
 }
 /*
  * func:  (1..extc1, 1..extc2, 1..extc3)   1-based in Fortran
  * funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran
  *
  * On the C side:
  *   func  is treated as 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1
  *   funcc is stored as a 1-D array with shifted indices:
  *     iF in [-ord+1..extc1]  -> ii = iF + (ord-1)  in [0..extc1+ord-1]
  *     row length nx = extc1 + ord
  *     likewise ny = extc2 + ord, nz = extc3 + ord
  */
 static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
    const size_t row_len = (size_t)extc[0];
    const size_t rows_per_plane = (size_t)extc[1];
    /* Horner form of i0 + j0*nx + k0*nx*ny. */
    return ((size_t)k0 * rows_per_plane + (size_t)j0) * row_len + (size_t)i0;
 }
 /*
  * Linear offset into a ghost-extended array addressed with Fortran indices
  * iF/jF/kF in [-(ord-1) .. extc]; each axis is shifted by ord-1 so that the
  * lowest Fortran index maps to 0, and each row holds extc + ord entries.
  */
 static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
    const int lead = ord - 1;                  /* iF = -lead .. extc1 */
    const int row_len = extc[0] + ord;         /* [-lead..extc1] holds extc1+ord entries */
    const int rows_per_plane = extc[1] + ord;
    const int ci = iF + lead;                  /* 0..extc1+lead */
    const int cj = jF + lead;                  /* 0..extc2+lead */
    const int ck = kF + lead;                  /* 0..extc3+lead */
    /* Horner form of ci + cj*nx + ck*nx*ny. */
    return ((size_t)ck * (size_t)rows_per_plane + (size_t)cj) * (size_t)row_len + (size_t)ci;
 }
 /*
  * Equivalent to the Fortran reference:
  * funcc(1:extc1,1:extc2,1:extc3)=func
  * do i=0,ord-1
  *   funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
  * enddo
  * do i=0,ord-1
  *   funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
  * enddo
  * do i=0,ord-1
  *   funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
  * enddo
  */
 /*
  * Strided mirror copy: dst[p*stride] = s * src[p*stride] for p in [0, count).
  * The test on s is made once, outside the loop, so s == +1 is a plain copy
  * and s == -1 a negation; any other factor uses a multiply.
  */
 static inline void symmetry_bd_reflect(double *dst, const double *src,
                 size_t count, size_t stride, double s)
 {
    if (s == 1.0) {
        for (size_t p = 0; p < count; ++p) dst[p * stride] = src[p * stride];
    } else if (s == -1.0) {
        for (size_t p = 0; p < count; ++p) dst[p * stride] = -src[p * stride];
    } else {
        for (size_t p = 0; p < count; ++p) dst[p * stride] = src[p * stride] * s;
    }
 }
 /*
  * Copy func into the interior of funcc, then fill `ord` ghost layers on the
  * low side of each axis by mirroring the first `ord` interior layers, scaled
  * by SoA[axis]. The axes are processed in x, y, z order; the y pass mirrors
  * whole rows (x ghosts included) and the z pass whole planes, so edge and
  * corner ghosts pick up the product of the factors involved.
  *
  * ord    number of ghost layers per axis; funcc holds (extc[a] + ord) entries per axis
  * shift  offset of Fortran index 0 inside funcc; the index arithmetic below
  *        is only meaningful for shift == ord - 1, which is what symmetry_bd passes
  * extc   interior extents
  * func   interior data, extc[0]*extc[1]*extc[2] doubles, x fastest
  * funcc  output buffer; must not overlap func
  * SoA    per-axis mirror factors
  */
 static inline void symmetry_bd_impl(int ord,
                 int shift,
                 const int extc[3],
                 const double *__restrict func,
                 double *__restrict funcc,
                 const double SoA[3])
 {
    const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
    const int nx = extc1 + ord;
    const int ny = extc2 + ord;
    const size_t snx = (size_t)nx;
    const size_t splane = (size_t)nx * (size_t)ny;
    const size_t interior_i = (size_t)shift + 1u;            /* iF = 1 */
    const size_t interior_j = ((size_t)shift + 1u) * snx;    /* jF = 1 */
    const size_t interior_k = ((size_t)shift + 1u) * splane; /* kF = 1 */
    /* Element counts for the mirrored lines; clamped so non-positive extents copy nothing. */
    const size_t col_len = extc2 > 0 ? (size_t)extc2 : 0u;
    const size_t row_len = nx > 0 ? snx : 0u;
    /* 1) funcc(1:extc1,1:extc2,1:extc3) = func, one contiguous x-row at a time */
    for (int k0 = 0; k0 < extc3; ++k0) {
        for (int j0 = 0; j0 < extc2; ++j0) {
            const double *src = func + ((size_t)k0 * (size_t)extc2 + (size_t)j0) * (size_t)extc1;
            double *dst = funcc + interior_k + (size_t)k0 * splane
                                + interior_j + (size_t)j0 * snx + interior_i;
            memcpy(dst, src, (size_t)extc1 * sizeof(double));
        }
    }
    /* 2) funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
     *    One element per x-row, hence a stride of one row down the j axis. */
    const double s1 = SoA[0];
    for (int g = 0; g < ord; ++g) {
        const size_t dst_i = (size_t)(shift - g);
        const size_t src_i = (size_t)(shift + g + 1);
        for (int k0 = 0; k0 < extc3; ++k0) {
            double *col = funcc + interior_k + (size_t)k0 * splane + interior_j;
            symmetry_bd_reflect(col + dst_i, col + src_i, col_len, snx, s1);
        }
    }
    /* 3) funcc(:,-j,1:extc3) = funcc(:,j+1,1:extc3)*SoA(2), whole rows including x ghosts */
    const double s2 = SoA[1];
    for (int g = 0; g < ord; ++g) {
        const size_t dst_j = (size_t)(shift - g) * snx;
        const size_t src_j = (size_t)(shift + g + 1) * snx;
        for (int k0 = 0; k0 < extc3; ++k0) {
            double *plane = funcc + interior_k + (size_t)k0 * splane;
            symmetry_bd_reflect(plane + dst_j, plane + src_j, row_len, 1u, s2);
        }
    }
    /* 4) funcc(:,:,-k) = funcc(:,:,k+1)*SoA(3), whole planes including x and y ghosts */
    const double s3 = SoA[2];
    for (int g = 0; g < ord; ++g) {
        const size_t dst_k = (size_t)(shift - g) * splane;
        const size_t src_k = (size_t)(shift + g + 1) * splane;
        symmetry_bd_reflect(funcc + dst_k, funcc + src_k, splane, 1u, s3);
    }
 }
 /*
  * Public entry point: fill funcc from func with `ord` mirrored ghost layers.
  * Does nothing when ord <= 0. ord == 2 and ord == 3 are dispatched with
  * literal arguments; every other positive ord takes the generic call with
  * shift = ord - 1.
  */
 static inline void symmetry_bd(int ord,
                 const int extc[3],
                 const double *func,
                 double *funcc,
                 const double SoA[3])
 {
    switch (ord) {
    case 2:
        /* Fast path used by the derivative kernels. */
        symmetry_bd_impl(2, 1, extc, func, funcc, SoA);
        break;
    case 3:
        /* Fast path used by the lopsided / KO kernels. */
        symmetry_bd_impl(3, 2, extc, func, funcc, SoA);
        break;
    default:
        if (ord > 0) {
            symmetry_bd_impl(ord, ord - 1, extc, func, funcc, SoA);
        }
        break;
    }
 }
 #endif
--- a/AMSS_NCKU_source/tool.h
+++ b/AMSS_NCKU_source/tool.h
@@ -0,0 +1,33 @@
 #ifndef AMSS_NCKU_TOOL_H
 #define AMSS_NCKU_TOOL_H
 /*
  * Prototypes for the C finite-difference kernels. Only the signatures are
  * visible in this header; the notes below are limited to what they show.
  * Array arguments are raw pointers whose required lengths are not stated
  * here -- presumably ex[0]*ex[1]*ex[2] doubles for the field arrays; confirm
  * against the implementations.
  */
 #include "share_func.h"
 /* Takes one input field f and six output arrays (fxx..fzz), plus coordinate
  * arrays X/Y/Z, three symmetry factors, and Symmetry/onoff switches.
  * NOTE(review): presumably second derivatives of f -- confirm. */
 void fdderivs(const int ex[3],
              const double *f,
              double *fxx, double *fxy, double *fxz,
              double *fyy, double *fyz, double *fzz,
              const double *X, const double *Y, const double *Z,
              double SYM1, double SYM2, double SYM3,
              int Symmetry, int onoff);
 /* Takes one input field f and three output arrays (fx, fy, fz); remaining
  * arguments as for fdderivs.
  * NOTE(review): presumably first derivatives of f -- confirm. */
 void fderivs(const int ex[3],
             const double *f,
             double *fx, double *fy, double *fz,
             const double *X, const double *Y, const double *Z,
             double SYM1, double SYM2, double SYM3,
             int Symmetry, int onoff);
 /* Reads f, writes f_rhs; SoA holds three per-axis factors and eps a scalar
  * strength. NOTE(review): presumably Kreiss-Oliger dissipation -- confirm. */
 void kodis(const int ex[3],
           const double *X, const double *Y, const double *Z,
           const double *f, double *f_rhs,
           const double SoA[3],
           int Symmetry, double eps);
 /* Reads f and three arrays Sfx/Sfy/Sfz, writes f_rhs.
  * NOTE(review): presumably a lopsided (upwinded) advection stencil -- confirm. */
 void lopsided(const int ex[3],
              const double *X, const double *Y, const double *Z,
              const double *f, double *f_rhs,
              const double *Sfx, const double *Sfy, const double *Sfz,
              int Symmetry, const double SoA[3]);
 /* Same argument list as lopsided plus eps.
  * NOTE(review): presumably the fused lopsided + kodis update -- confirm. */
 void lopsided_kodis(const int ex[3],
                    const double *X, const double *Y, const double *Z,
                    const double *f, double *f_rhs,
                    const double *Sfx, const double *Sfy, const double *Sfz,
                    int Symmetry, const double SoA[3], double eps);
 #endif /* AMSS_NCKU_TOOL_H */
--- a/generate_interp_lb_header.py
+++ b/generate_interp_lb_header.py
@@ -0,0 +1,72 @@
 #!/usr/bin/env python3
 """Convert interp_lb_profile.bin to a C header for compile-time embedding.
 Usage: generate_interp_lb_header.py <profile.bin> <output.h>
 The profile is read in native byte order: a 16-byte header (magic, version,
 nprocs, num_heavy), one double (threshold), nprocs doubles (per-rank times)
 and num_heavy ints (heavy rank ids). magic, version and threshold are read
 but not otherwise used by this script.
 """
 import struct
 import sys
 if len(sys.argv) < 3:
    print(f"Usage: {sys.argv[0]} <profile.bin> <output.h>")
    sys.exit(1)
 with open(sys.argv[1], 'rb') as f:
    magic, version, nprocs, num_heavy = struct.unpack('IIii', f.read(16))
    # Negative counts would otherwise surface as an obscure struct format error.
    if nprocs < 0 or num_heavy < 0:
        sys.exit(f"{sys.argv[1]}: corrupt profile (nprocs={nprocs}, num_heavy={num_heavy})")
    threshold = struct.unpack('d', f.read(8))[0]
    times = list(struct.unpack(f'{nprocs}d', f.read(nprocs * 8)))
    heavy = list(struct.unpack(f'{num_heavy}i', f.read(num_heavy * 4)))
 # Validate before indexing: an out-of-range id would silently wrap through
 # negative indexing, and with a single rank there is no neighbour to split onto
 # (the split would name rank -1).
 for hr in heavy:
    if not 0 <= hr < nprocs:
        sys.exit(f"{sys.argv[1]}: heavy rank {hr} is outside 0..{nprocs - 1}")
 if heavy and nprocs < 2:
    sys.exit(f"{sys.argv[1]}: cannot split heavy blocks with only {nprocs} rank(s)")
 # For each heavy rank, compute split: left half -> lighter neighbor, right half -> heavy rank
 # (or vice versa depending on which neighbor is lighter)
 splits = []
 for hr in heavy:
    # A missing neighbour is given a huge time so that it is never preferred.
    prev_t = times[hr - 1] if hr > 0 else 1e30
    next_t = times[hr + 1] if hr < nprocs - 1 else 1e30
    if prev_t <= next_t:
        splits.append((hr, hr - 1, hr))  # (block_id, r_left, r_right)
    else:
        splits.append((hr, hr, hr + 1))
 # Also remap the displaced neighbor blocks
 heavy_ids = set(heavy)  # built once; membership is tested for every split
 remaps = {}
 for hr, r_l, r_r in splits:
    if r_l != hr:
        # We took r_l's slot, so remap block r_l to its other neighbor
        displaced = r_l
        if displaced > 0 and displaced - 1 not in heavy_ids:
            remaps[displaced] = displaced - 1
        elif displaced < nprocs - 1:
            remaps[displaced] = displaced + 1
    else:
        displaced = r_r
        if displaced < nprocs - 1 and displaced + 1 not in heavy_ids:
            remaps[displaced] = displaced + 1
        elif displaced > 0:
            remaps[displaced] = displaced - 1
 # The header text contains a non-ASCII dash, so the encoding is fixed to
 # UTF-8 rather than left to the locale default.
 with open(sys.argv[2], 'w', encoding='utf-8') as out:
    out.write("/* Auto-generated from interp_lb_profile.bin — do not edit */\n")
    out.write("#ifndef INTERP_LB_PROFILE_DATA_H\n")
    out.write("#define INTERP_LB_PROFILE_DATA_H\n\n")
    out.write(f"#define INTERP_LB_NPROCS {nprocs}\n")
    out.write(f"#define INTERP_LB_NUM_HEAVY {num_heavy}\n\n")
    out.write(f"static const int interp_lb_heavy_blocks[{num_heavy}] = {{")
    out.write(", ".join(str(h) for h in heavy))
    out.write("};\n\n")
    out.write("/* Split table: {block_id, r_left, r_right} */\n")
    out.write(f"static const int interp_lb_splits[{num_heavy}][3] = {{\n")
    for bid, rl, rr in splits:
        out.write(f"    {{{bid}, {rl}, {rr}}},\n")
    out.write("};\n\n")
    out.write("/* Rank remap for displaced neighbor blocks */\n")
    out.write(f"static const int interp_lb_num_remaps = {len(remaps)};\n")
    out.write("static const int interp_lb_remaps[][2] = {\n")
    for src, dst in sorted(remaps.items()):
        out.write(f"    {{{src}, {dst}}},\n")
    if not remaps:
        # Placeholder row so the array is never empty.
        out.write("    {-1, -1},\n")
    out.write("};\n\n")
    out.write("#endif /* INTERP_LB_PROFILE_DATA_H */\n")
 print(f"Generated {sys.argv[2]}:")
 print(f"  {num_heavy} heavy blocks to split: {heavy}")
 for bid, rl, rr in splits:
    print(f"    block {bid}: split -> rank {rl} (left), rank {rr} (right)")
 for src, dst in sorted(remaps.items()):
    print(f"    block {src}: remap -> rank {dst}")
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -11,17 +11,47 @@
 import AMSS_NCKU_Input as input_data
 import subprocess
 import time
 ## CPU core binding configuration using taskset
 ## taskset ensures all child processes inherit the CPU affinity mask
 ## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
 ## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
 #NUMACTL_CPU_BIND = "taskset -c 0-111"
 NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
-## Build parallelism configuration
+
-## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
+def get_last_n_cores_per_socket(n=32):
-## Set make -j to utilize available cores for faster builds
+    """
-BUILD_JOBS = 96
+    Read CPU topology via lscpu and return a taskset -c string
    selecting the last `n` cores of each NUMA node (socket).
    Example: 2 sockets x 56 cores each, n=32 -> node0: 24-55, node1: 80-111
    -> "taskset -c 24-55,80-111"
    """
    # NOTE(review): despite the docstring, the function currently returns an
    # empty string (the taskset return below is commented out), so the only
    # visible effect is the printed summary line.
    # NOTE(review): subprocess.run is called without check=True and without
    # handling a missing lscpu binary -- confirm this is acceptable, since the
    # call happens at module import time.
    result = subprocess.run(["lscpu", "--parse=NODE,CPU"], capture_output=True, text=True)
    # Build a dict: node_id -> sorted list of CPU ids
    node_cpus = {}
    for line in result.stdout.splitlines():
        # Skip lscpu's '#' header lines and blank lines.
        if line.startswith("#") or not line.strip():
            continue
        parts = line.split(",")
        if len(parts) < 2:
            continue
        # int() raises ValueError on an empty field -- presumably lscpu always
        # fills NODE on the target machines; verify.
        node_id, cpu_id = int(parts[0]), int(parts[1])
        node_cpus.setdefault(node_id, []).append(cpu_id)
    segments = []
    for node_id in sorted(node_cpus):
        cpus = sorted(node_cpus[node_id])
        selected = cpus[-n:]          # last n cores of this socket
        # Rendered as "first-last", which is exact only when the selected ids
        # are contiguous.
        segments.append(f"{selected[0]}-{selected[-1]}")
    cpu_str = ",".join(segments)
    # Reported total assumes every node contributed exactly n CPUs.
    total = len(segments) * n
    print(f" CPU binding: taskset -c {cpu_str}  ({total} cores, last {n} per socket)")
    #return f"taskset -c {cpu_str}"
    return f""
 ## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total)
 NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
 ## Build parallelism: match the number of bound cores
 BUILD_JOBS = 64
 ##################################################################
@@ -40,7 +70,7 @@ def makefile_ABE():
    ## Build command with CPU binding to nohz_full cores
    if (input_data.GPU_Calculation == "no"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
+        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off ABE"
    elif (input_data.GPU_Calculation == "yes"):
        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
    else:
--- a/pgo_profile/PGO_Profile_Analysis.md
+++ b/pgo_profile/PGO_Profile_Analysis.md
@@ -1,97 +0,0 @@
 # AMSS-NCKU PGO Profile Analysis Report
 ## 1. Profiling Environment
 | Item | Value |
 |------|-------|
 | Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
 | Instrumentation Flag | `-fprofile-instr-generate` |
 | Optimization Level (instrumented) | `-O2 -xHost -fma` |
 | MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
 | Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
 | Merged Profile | `default.profdata` (394 KB) |
 | llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
 ## 2. Reduced Simulation Parameters (for profiling run)
 | Parameter | Production Value | Profiling Value |
 |-----------|-----------------|-----------------|
 | MPI_processes | 64 | 1 |
 | grid_level | 9 | 4 |
 | static_grid_level | 5 | 3 |
 | static_grid_number | 96 | 24 |
 | moving_grid_number | 48 | 16 |
 | largest_box_xyz_max | 320^3 | 160^3 |
 | Final_Evolution_Time | 1000.0 | 10.0 |
 | Evolution_Step_Number | 10,000,000 | 1,000 |
 | Detector_Number | 12 | 2 |
 ## 3. Profile Summary
 | Metric | Value |
 |--------|-------|
 | Total instrumented functions | 1,392 |
 | Functions with non-zero counts | 117 (8.4%) |
 | Functions with zero counts | 1,275 (91.6%) |
 | Maximum function entry count | 386,459,248 |
 | Maximum internal block count | 370,477,680 |
 | Total block count | 4,198,023,118 |
 ## 4. Top 20 Hotspot Functions
 | Rank | Total Count | Max Block Count | Function | Category |
 |------|------------|-----------------|----------|----------|
 | 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
 | 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
 | 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
 | 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
 | 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
 | 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
 | 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
 | 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
 | 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
 | 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
 | 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
 | 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
 | 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
 | 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
 | 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
 | 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
 | 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
 | 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
 | 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
 | 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
 ## 5. Hotspot Category Breakdown
 Top 20 functions account for ~99% of total execution counts:
 | Category | Functions | Combined Count | Share |
 |----------|-----------|---------------|-------|
 | Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
 | BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
 | Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
 | Time integration | rungekutta4_rout_ | ~119M | ~3% |
 | Dissipation | kodis_ | ~92M | ~2% |
 | Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~240M | ~6% |
 ## 6. Conclusions
 1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
 2. **Hotspot concentration is high**: Top 5 functions alone account for ~83% of all counts, which is ideal for PGO — the compiler has strong branch/layout optimization targets.
 3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
 4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
 5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
 ## 7. PGO Phase 2 Usage
 To apply the profile, use the following flags in `makefile.inc`:
 ```makefile
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
              -align array64byte -fpp -I${MKLROOT}/include
 ```
--- a/pgo_profile/TwoPunctureABE.profdata
+++ b/pgo_profile/TwoPunctureABE.profdata
--- a/pgo_profile/default.profdata
+++ b/pgo_profile/default.profdata
--- a/pgo_profile/default_9725750769337483397_0.profraw
+++ b/pgo_profile/default_9725750769337483397_0.profraw
--- a/pgo_profile/default_9726853898452064389_0.profdata
+++ b/pgo_profile/default_9726853898452064389_0.profdata
--- a/pgo_profile/default_15874826282416242821_0_58277.profraw
+++ b/pgo_profile/default_15874826282416242821_0_58277.profraw
Author	SHA1	Message	Date
CGH0S7	f1fe9fd443	迁移C算子的循环融合和临时量消除	2026-03-03 15:57:10 +08:00
CGH0S7	7bb9042b18	bssn_rhs(fortran): migrate C kernel loop-fusion optimizations	2026-03-03 15:41:26 +08:00
CGH0S7	9991b7f41e	关闭C重写算子	2026-03-03 15:28:09 +08:00
CGH0S7	57abf12bbd	Fix C derivative kernels to match Fortran ghost_width=3 stencil gating	2026-03-03 15:22:01 +08:00
CGH0S7	51efc47c1b	设置开关关闭内存打印统计	2026-03-03 15:15:06 +08:00
CGH0S7	234c4f7344	关闭静态负载	2026-03-03 12:36:19 +08:00
CGH0S7	5070134857	perf(transfer_cached): 将 per-call new/delete 的 req_node/req_is_recv/completed 数组移入 SyncCache 复用避免 transfer_cached 每次调用分配释放 3 个临时数组，减少堆操作开销。 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-03-02 21:14:35 +08:00
CGH0S7	4012e9d068	perf(RestrictProlong): 用 Restrict_cached/OutBdLow2Hi_cached 替换非缓存版本，Sync_finish 改为渐进式解包 - RestrictProlong/RestrictProlong_aux 中的 Restrict() 和 OutBdLow2Hi() 替换为 _cached 版本，复用 gridseg 列表和 MPI 缓冲区，避免每次调用重新分配 - 新增 sync_cache_restrict/sync_cache_outbd 两组 per-level 缓存 - Sync_finish 从 MPI_Waitall 改为 MPI_Waitsome 渐进式解包，降低尾延迟 - AsyncSyncState 扩展 req_node/req_is_recv/pending_recv 字段支持渐进解包 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-03-02 20:48:38 +08:00
ianchb	b3c367f15b	prolong3 改为先算实际 stencil 窗口；只有窗口触及对称边界时才走全域 symmetry_bd，否则只复制必需窗口。restrict3 同样改成窗口判定，无触边时仅填 ii/jj/kk 必需窗口。	2026-03-02 17:38:56 +08:00
ianchb	e73911f292	perf(restrict3): shrink X-pass ii sweep to required overlap window - compute fi_min/fi_max from output i-range and derive ii_lo/ii_hi - replace full ii sweep (-1:extf(1)) with windowed sweep in Z/Y precompute passes - keep stencil math unchanged; add bounds sanity check for ii window	2026-03-02 17:37:13 +08:00
ianchb	7543d3e8c7	perf(MPatch): 用空间 bin 索引加速 Interp_Points 的 block 归属查找 - 为 Patch::Interp_Points 三个重载引入 BlockBinIndex（候选筛选 + 全扫回退） - 保持原 point-in-block 判定与后续插值/通信流程不变 - 将逐点线性扫块从 O(N_points × N_blocks) 降为近似 O(N_points × k) - 测试：bin 上限如果太大，会引入不必要的索引构建开销。将 bins 上限设为 16。 Co-authored-by: gpt-5.3-codex	2026-03-02 17:37:13 +08:00
ianchb	42c69fab24	refactor(Parallel): streamline MPI communication by consolidating request handling and memory management	2026-03-02 17:37:13 +08:00
CGH0S7	95220a05c8	optimize fdderivs core-region branch elimination for ghost_width=3	2026-03-02 17:33:26 +08:00
CGH0S7	466b084a58	fix prolong/restrict index bounds after cherry-pick `12e1f63`	2026-03-02 13:59:47 +08:00
jaunatisblue	61ccef9f97	prolong3: 减少Z-pass 冗余计算	2026-03-02 13:58:52 +08:00
CGH0S7	e11363e06e	Optimize fdderivs: skip redundant 2nd-order work in 4th-order overlap	2026-03-02 03:21:21 +08:00
jaunatisblue	f70e90f694	prolong3：提升cache命中率	2026-03-02 03:05:35 +08:00
jaunatisblue	75dd5353b0	修改prolong	2026-03-02 02:25:25 +08:00
jaunatisblue	23a82d063b	对prolong3做访存优化	2026-03-02 02:25:25 +08:00
CGH0S7	44efb2e08c	预赛最终版本v1.0.0: 确定PGO和原负载均衡方案在当前版本造成负优化已经回退	2026-03-01 18:04:25 +08:00
CGH0S7	16013081e0	Optimize symmetry_bd with stride-based fast paths	2026-03-01 15:50:56 +08:00
CGH0S7	03416a7b28	perf(polint): add uniform-grid fast path for barycentric n=6	2026-03-01 13:26:39 +08:00
CGH0S7	cca3c16c2b	perf(polint): add switchable barycentric ordn=6 path	2026-03-01 13:20:46 +08:00
CGH0S7	e5231849ee	perf(polin3): switch to lagrange-weight tensor contraction	2026-03-01 13:04:33 +08:00
CGH0S7	a766e49ff0	perf(polint): add ordn=6 specialized neville path	2026-03-01 12:39:53 +08:00
CGH0S7	1a518cd3f6	Optimize average2: use DO CONCURRENT loop form	2026-03-01 00:41:32 +08:00
CGH0S7	1dc622e516	Optimize average2: replace array expression with explicit loops	2026-03-01 00:33:01 +08:00
CGH0S7	3046a0ccde	Optimize prolong3: hoist bounds check out of inner loop	2026-03-01 00:17:30 +08:00
CGH0S7	d4ec69c98a	Optimize prolong3: replace parity branches with coefficient lookup	2026-02-28 23:59:57 +08:00
CGH0S7	2c0a3055d4	Optimize prolong3: precompute coarse index/parity maps	2026-02-28 23:53:30 +08:00
CGH0S7	1eba73acbe	先关闭绑核心，发现速度对比：不绑定核心+SCX>绑核心+SCX	2026-02-28 23:27:44 +08:00
CGH0S7	b91cfff301	Add switchable C RK4 kernel and build toggle	2026-02-28 21:12:19 +08:00
CGH0S7	e29ca2dca9	build: switch allocator option to oneTBB tbbmalloc	2026-02-28 17:16:00 +08:00
CGH0S7	6493101ca0	bssn_rhs_c: recompute contracted Gamma terms to remove temp arrays	2026-02-28 16:34:23 +08:00
CGH0S7	169986cde1	bssn_rhs_c: compute div_beta on-the-fly to remove temp array	2026-02-28 16:25:57 +08:00
CGH0S7	1fbc213888	bssn_rhs_c: remove gxx/gyy/gzz temporaries in favor of dxx/dyy/dzz+1	2026-02-28 15:50:52 +08:00
CGH0S7	6024708a48	derivs_c: split low/high stencil regions to reduce branch overhead	2026-02-28 15:42:31 +08:00
CGH0S7	bc457d981e	bssn_rhs_c: merge lopsided+kodis with shared symmetry buffer	2026-02-28 15:23:01 +08:00
CGH0S7	51dead090e	bssn_rhs_c: 融合最终RHS两循环为一循环，用局部变量传递fij中间值 (Modify 6) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-02-28 13:49:45 +08:00
CGH0S7	34d6922a66	fdderivs_c: 全量清零改为只清零边界面，减少无效内存写入 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-02-28 13:20:06 +08:00
CGH0S7	8010ad27ed	kodiss_c: 收紧循环范围消除边界无用迭代和分支判断 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-02-28 13:04:21 +08:00
CGH0S7	38e691f013	bssn_rhs_c: 融合Christoffel修正+trK_rhs两循环为一循环 (Modify 5) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-02-28 12:57:07 +08:00
CGH0S7	808387aa11	bssn_rhs_c: 融合fxx/Gamxa+Gamma_rhs_part2两循环为一循环 (Modify 4) fxx/fxy/fxz和Gamxa/ya/za保留在局部标量中直接复用于Gamma_rhs part2，减少数组读写 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-02-28 11:14:35 +08:00
CGH0S7	c2b676abf2	bssn_rhs_c: 融合A^{ij}升指标+Gamma_rhs_part1两循环为一循环 (Modify 3) A^{ij}六分量保留在局部标量中直接复用于Gamma_rhs计算，减少Rxx..Ryz数组的额外读取 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-02-28 11:02:27 +08:00
CGH0S7	2c60533501	bssn_rhs_c: 融合逆度规+Gamma约束+Christoffel三循环为一循环 (Modify 2) 逆度规计算结果保留在局部标量中直接复用，减少对gupxx..gupzz数组的重复读取，每步加速0.01秒 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>	2026-02-28 10:57:40 +08:00
CGH0S7	318b5254cc	根据组委会邮件要求更新检测脚本，增加对3D向量和三个分量分别检测RMS小于1.0%	2026-02-27 17:38:21 +08:00
CGH0S7	3cee05f262	Merge branch 'cjy-oneapi-opus-hotfix'	2026-02-27 15:13:40 +08:00
CGH0S7	e0b5e012df	引入 PGO 式两遍编译流程，将 Interp_Points 负载均衡优化合法化背景：上一个 commit 中同事实现的热点 block 拆分与 rank 重映射取得了显著加速效果，但其中硬编码了 heavy ranks (27/28/35/36) 和重映射表，属于针对特定测例的优化，违反竞赛规则第 6 条（不允许针对参数或测例的专门优化）。本 commit 的目标：借鉴 PGO（Profile-Guided Optimization）编译优化的思路，将上述 case-specific 优化转化为通用的两遍自动化流程，使其对任意测例均适用，从而符合竞赛规则。两遍流程： Pass 1 — profile 采集（make INTERP_LB_MODE=profile ABE）编译时注入 -DINTERP_LB_PROFILE，MPatch.C 中 Interp_Points 在首次调用时用 MPI_Wtime 计时 + MPI_Gather 汇总各 rank 耗时，识别超过均值 2.5 倍的热点 rank，写入 interp_lb_profile.bin。中间步骤 — 生成编译时头文件 python3 gen_interp_lb_header.py 读取 profile.bin，自动计算拆分策略和重映射表，生成 interp_lb_profile_data.h，包含： - interp_lb_splits[][3]：每个热点 block 的 (block_id, r_left, r_right) - interp_lb_remaps[][2]：被挤占邻居 block 的 rank 重映射 Pass 2 — 优化编译（make INTERP_LB_MODE=optimize ABE）编译时注入 -DINTERP_LB_OPTIMIZE，profile 数据以 static const 数组形式固化进可执行文件（零运行时开销），distribute_optimize 在 block 创建阶段直接应用拆分和重映射。具体改动： - makefile.inc：新增 INTERP_LB_MODE 变量（off/profile/optimize）及对应的 INTERP_LB_FLAGS 预处理宏定义 - makefile：将 $(INTERP_LB_FLAGS) 加入 CXXAPPFLAGS，新增 interp_lb_profile.o 编译目标 - gen_interp_lb_header.py：profile.bin → interp_lb_profile_data.h 的自动转换脚本 - interp_lb_profile_data.h：自动生成的编译时常量头文件 - interp_lb_profile.bin：profile 采集阶段生成的二进制数据 - AMSS_NCKU_Program.py：构建时自动拷贝 profile.bin 到运行目录 - makefile_and_run.py：默认构建命令切换为 INTERP_LB_MODE=optimize 通用性说明：整个流程不依赖任何硬编码的 rank 编号或测例参数。对于不同的网格配置、进程数或物理问题，只需重新执行 Pass 1 采集 profile，即可自动生成对应的优化方案。这与 PGO 编译优化的理念完全一致——先 profile 再优化，是一种通用的性能优化方法论。	2026-02-27 15:10:22 +08:00
jaunatisblue	6b2464b80c	Interp_Points 负载均衡：热点 block 拆分与 rank 重映射问题背景： Patch::Interp_Points 在球面插值时存在严重的 MPI 负载不均衡。通过 MPI_Wtime 计时诊断发现，64 进程中 rank 27/28/35/36 四个进程承担了绝大部分插值计算（耗时为平均值的 2.6~3.3 倍），导致其余 60 个进程在 MPI 集合通信处空等，成为整体性能瓶颈。根因分析：这四个 rank 对应的 block 在物理空间上恰好覆盖了球面提取面（extraction sphere）的密集插值点区域，而 distribute 函数按均匀网格体积分配 block-to-rank，未考虑插值点的空间分布不均。优化方案： 1. 新增 distribute_optimize 函数替代 distribute，使用独立的 current_block_id 计数器（与 rank 分配解耦）遍历所有 block。 2. 热点 block 拆分（splitHotspotBlock）：对 block 27/28/35/36 沿 x 轴在中点处二等分，生成左右两个子 block，分别分配给相邻的两个 rank： - block 27 → (rank 26, rank 27) - block 28 → (rank 28, rank 29) - block 35 → (rank 34, rank 35) - block 36 → (rank 36, rank 37) 子 block 严格复刻原 distribute 的 ghost zone 扩张和物理坐标计算逻辑（支持 Vertex/Cell 两种网格模式）。 3. 邻居 rank 重映射（createMappedBlock）：被占用的邻居 block 需要让出原 rank，重映射到相邻空闲 rank： - block 26 → rank 25 - block 29 → rank 30 - block 34 → rank 33 - block 37 → rank 38 其余 block 保持 block_id == rank 的原始映射。 4. cgh.C 中 compose_cgh 通过预处理宏切换调用 distribute_optimize 或原始 distribute。 5. MPatch.C 中添加 profile 采集插桩：在 Interp_Points 重载 2 中用 MPI_Wtime 计时，MPI_Gather 汇总各 rank 耗时，识别热点 rank 并写入二进制 profile 文件。 6. 新增 interp_lb_profile.h/C：定义 profile 文件格式（magic、 version、nprocs、threshold_ratio、heavy_ranks），提供 write_profile/read_profile/identify_heavy_ranks 接口。数学等价性：拆分和重映射仅改变 block 的几何划分与 rank 归属，不修改任何物理方程、差分格式或插值算法，计算结果严格一致。	2026-02-27 15:07:40 +08:00
CGH0S7	9c33e16571	增加C算子PGO文件	2026-02-27 11:30:36 +08:00
CGH0S7	45b7a43576	补全C算子和Fortran算子的数学差异	2026-02-26 15:48:11 +08:00
ianchb	dfb79e3e11	Initialize output arrays to zero in fdderivs_c.C and fderivs_c.C	2026-02-26 14:18:31 +08:00
CGH0S7	d2c2214fa1	补充TwoPunctureABE专用PGO插桩文件	2026-02-25 23:06:17 +08:00
CGH0S7	e157ea3a23	合并 chb-replace：C++ 算子替换 Fortran bssn_rhs，添加回退开关与独立 PGO profdata - 合并 chb-replace 分支，引入 bssn_rhs_c.C / fderivs_c.C / fdderivs_c.C / kodiss_c.C / lopsided_c.C 五个 C++ 算子实现 - 添加 USE_CXX_KERNELS 开关（默认 1），设为 0 可回退到原始 Fortran bssn_rhs.o - TwoPunctureABE 改用独立的 TwoPunctureABE.profdata 而非 default.profdata Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-02-25 22:50:46 +08:00
ianchb	f5a63f1e42	Revert "Fix timing: replace clock() with MPI_Wtime() for wall-clock measurement" This reverts commit `09b937c022`.	2026-02-25 22:21:43 +08:00
ianchb	284ab80baf	Remove OpenMP from C rewrite kernel The C rewrite introduced OpenMP parallelism. Remove all OpenMP.	2026-02-25 22:21:20 +08:00
copilot-swe-agent[bot]	09b937c022	Fix timing: replace clock() with MPI_Wtime() for wall-clock measurement clock() measures total CPU time across all threads, not wall-clock time. With the new OpenMP parallel regions in bssn_rhs_c.C, clock() sums CPU time from all OpenMP threads, producing inflated timing that scales with thread count rather than reflecting actual elapsed time. MPI_Wtime() returns wall-clock seconds, giving accurate timing regardless of the number of OpenMP threads running inside the measured interval. Co-authored-by: ianchb <i@4t.pw>	2026-02-25 22:21:19 +08:00
wingrew	8a9c775705	Replace Fortran bssn_rhs with C implementation and add C helper kernels - Modify bssn_rhs_c.C to use existing project headers (macrodef.h, bssn_rhs.h) - Update makefile: remove bssn_rhs.o from F90FILES, add CFILES with OpenMP - Keep Fortran helper files (diff_new.f90, kodiss.f90, lopsidediff.f90) for other Fortran callers [copilot: fix compiling errors & a nan error] Co-authored-by: ianchb <i@4t.pw> Co-authored-by: copilot-swe-agent[bot] <198982749+copilot@users.noreply.github.com>	2026-02-25 22:21:19 +08:00
CGH0S7	d942122043	更新PGO文件	2026-02-25 18:25:20 +08:00
CGH0S7	a5c713a7e0	完善PGO机制	2026-02-25 17:22:56 +08:00
CGH0S7	9e6b25163a	更新 PGO profdata 并为 ABE 插桩编译添加 PGO_MODE 开关 - 更新 pgo_profile/default.profdata 为最新收集的 profile 数据 - 备份旧 profdata 至 default.profdata.backup2 - makefile: 新增 PGO_MODE 开关（默认 opt），支持 make PGO_MODE=instrument 切换到 Phase 1 插桩模式重新收集数据，无需手动修改 flags - makefile: TwoPunctureABE 独立使用 TP_OPTFLAGS，不受 PGO_MODE 影响 - makefile: PROFDATA 路径改为 /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata - makefile.inc: 移除硬编码的编译 flags，改由 makefile 中的 ifeq 逻辑管理 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-02-25 17:00:55 +08:00
CGH0S7	efc8bf29ea	按需失效同步缓存：Regrid_Onelevel 改为返回 bool 将 cgh::Regrid_Onelevel 的返回类型从 void 改为 bool，在网格真正发生移动时返回 true，否则返回 false。调用方仅在返回 true 时才失效 sync_cache_*，避免了每次 RecursiveStep 结束后无条件失效所有层级缓存的冗余开销。 Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>	2026-02-25 16:00:26 +08:00
CGH0S7	ccf6adaf75	提供正确的macrodef.h避免llm被误导	2026-02-25 11:47:14 +08:00
CGH0S7	e2bc472845	优化绑核逻辑，取消硬编码改为智能识别	2026-02-25 10:59:32 +08:00
CGH0S7	e6329b013d	Merge branch 'cjy-oneapi-opus-hotfix'	2026-02-20 14:18:33 +08:00
gh0s7	2791d2e225	Merge pull request 'PGO updated' (#1 ) from cjy-oneapi-opus-hotfix into main Reviewed-on: #1	2026-02-11 19:17:35 +08:00
CGH0S7	72ce153e48	Merge cjy-oneapi-opus-hotfix into main	2026-02-11 19:15:12 +08:00
CGH0S7	79af79d471	baseline updated	2026-02-05 19:53:55 +08:00