prolong3 改为先算实际 stencil 窗口；只有窗口触及对称边界时才走全域 symmetry_bd，否则只复制必需窗口。restrict3 同样改成窗口判定，无触边时仅填 ii/jj/kk 必需窗口。

perf(restrict3): shrink X-pass ii sweep to required overlap window
- compute fi_min/fi_max from output i-range and derive ii_lo/ii_hi - replace full ii sweep (-1:extf(1)) with windowed sweep in Z/Y precompute passes - keep stencil math unchanged; add bounds sanity check for ii window
2026-03-02 18:10:38 +08:00 · 2026-03-02 18:10:38 +08:00 · 2026-03-02 18:10:35 +08:00 · 2026-03-02 14:11:52 +08:00 · 2026-03-02 14:08:27 +08:00 · 2026-03-02 14:08:13 +08:00
47 changed files with 5821 additions and 6171 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 __pycache__
 GW150914
-GW150914*
+GW150914-origin
 docs
 *.tmp
-.codex
+
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -174,14 +174,11 @@ import generate_macrodef
 generate_macrodef.generate_macrodef_h()
 print( " AMSS-NCKU macro file macrodef.h has been generated. " )
-generate_macrodef.generate_macrodef_fh()
+generate_macrodef.generate_macrodef_fh()
-print( " AMSS-NCKU macro file macrodef.fh has been generated. " )
+print( " AMSS-NCKU macro file macrodef.fh has been generated. " )
-
+
-generate_macrodef.generate_build_config()
+
-print( " AMSS-NCKU build config AMSS_NCKU_build.mk has been generated. " )
+##################################################################
 ##################################################################
 # Compile the AMSS-NCKU program according to user requirements
@@ -220,13 +217,11 @@ shutil.copytree(AMSS_NCKU_source_path, AMSS_NCKU_source_copy)
 # Copy the generated macro files into the AMSS_NCKU source folder
-macrodef_h_path  = os.path.join(File_directory, "macrodef.h")
+macrodef_h_path  = os.path.join(File_directory, "macrodef.h") 
-macrodef_fh_path = os.path.join(File_directory, "macrodef.fh")
+macrodef_fh_path = os.path.join(File_directory, "macrodef.fh") 
-build_config_path = os.path.join(File_directory, "AMSS_NCKU_build.mk")
+
-
+shutil.copy2(macrodef_h_path,  AMSS_NCKU_source_copy)
-shutil.copy2(macrodef_h_path,  AMSS_NCKU_source_copy)
+shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)
 shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)
 shutil.copy2(build_config_path, AMSS_NCKU_source_copy)
 # Notes on copying files:
 # shutil.copy2 preserves file metadata such as modification time.
--- a/AMSS_NCKU_Verify_ASC26.py
+++ b/AMSS_NCKU_Verify_ASC26.py
@@ -2,18 +2,13 @@
 """
 AMSS-NCKU GW150914 Simulation Regression Test Script (Comprehensive Version)
-Verification Requirements:
+Verification Requirements:
-1. RMS errors < 1% for:
+1. RMS errors < 1% for:
-   - 3D Vector Total RMS
+   - 3D Vector Total RMS
-   - X Component RMS
+   - X Component RMS
-   - Y Component RMS
+   - Y Component RMS
-   - Z Component RMS
+   - Z Component RMS
-2. ADM constraint violation < 2 (Grid Level 0)
+2. ADM constraint violation < 2 (Grid Level 0)
 3. The following figure PDFs must match GW150914-origin exactly after rasterization:
   - ADM_Constraint_Grid_Level_0.pdf
   - BH_Trajectory_21_XY.pdf
   - BH_Trajectory_XY.pdf
   The script also reports the percentage of differing pixels for each figure.
 RMS Calculation Method:
 - Computes trajectory deviation on the XY plane independently for BH1 and BH2
@@ -25,13 +20,9 @@ Default: output_dir = GW150914/AMSS_NCKU_output
 Reference: GW150914-origin (baseline simulation)
 """
-import numpy as np
+import numpy as np
-import sys
+import sys
-import os
+import os
 import shutil
 import subprocess
 import tempfile
 from PIL import Image
 # ANSI Color Codes
 class Color:
@@ -58,143 +49,17 @@ def load_bh_trajectory(filepath):
    }
-def load_constraint_data(filepath):
+def load_constraint_data(filepath):
-    """Load constraint violation data"""
+    """Load constraint violation data"""
-    data = []
+    data = []
    with open(filepath, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            parts = line.split()
            if len(parts) >= 8:
-                data.append([float(x) for x in parts[:8]])
+                data.append([float(x) for x in parts[:8]])
-    return np.array(data)
+    return np.array(data)
 def resolve_figure_dir(path):
    """Return the figure directory that corresponds to *path*.

    The path is normalized first. If its last component is already
    ``figure`` the normalized path is returned unchanged; otherwise the
    result is a ``figure`` entry placed next to the given path (same
    parent directory).
    """
    cleaned = os.path.normpath(path)
    parent, leaf = os.path.split(cleaned)
    # Already pointing at a figure directory: nothing to resolve.
    return cleaned if leaf == "figure" else os.path.join(parent, "figure")
 def render_pdf_to_images(pdf_path, dpi=150):
    """Rasterize every page of a PDF with Ghostscript.

    Ghostscript writes one ``.ppm`` file per page into a temporary
    directory; each file is then loaded, converted to RGB and returned
    as a ``uint8`` numpy array.

    Args:
        pdf_path: Path of the PDF file handed to Ghostscript.
        dpi: Resolution passed to Ghostscript through ``-r``.

    Returns:
        A list of RGB arrays, ordered by sorted page file name.

    Raises:
        RuntimeError: If ``gs`` is not on PATH, if Ghostscript exits with
            an error, or if no page files were produced.
    """
    ghostscript = shutil.which("gs")
    if ghostscript is None:
        raise RuntimeError("Ghostscript executable 'gs' was not found in PATH")

    with tempfile.TemporaryDirectory(prefix="amss_verify_pdf_") as workdir:
        page_template = os.path.join(workdir, "page-%03d.ppm")
        gs_args = [ghostscript, "-q", "-dSAFER", "-dBATCH", "-dNOPAUSE"]
        gs_args += ["-sDEVICE=ppmraw", f"-r{dpi}", f"-o{page_template}", pdf_path]

        try:
            subprocess.run(
                gs_args,
                check=True,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
                text=True,
            )
        except subprocess.CalledProcessError as exc:
            # Prefer Ghostscript's own stderr; fall back to the exception text.
            message = exc.stderr.strip() or str(exc)
            raise RuntimeError(f"Failed to render PDF '{pdf_path}': {message}") from exc

        # All files share the same directory prefix, so sorting the bare
        # names yields the same order as sorting the joined paths.
        page_names = sorted(
            entry for entry in os.listdir(workdir) if entry.endswith(".ppm")
        )
        if not page_names:
            raise RuntimeError(f"No rendered pages were produced for '{pdf_path}'")

        pages = []
        for entry in page_names:
            with Image.open(os.path.join(workdir, entry)) as page:
                pages.append(np.array(page.convert("RGB"), dtype=np.uint8))
        return pages
 def compare_rendered_pages(ref_img, target_img):
    """Count differing pixels between two rendered RGB pages.

    The comparison canvas is the larger of the two heights by the larger
    of the two widths. Inside the region both pages cover, a pixel
    differs when any of its channels differs; every canvas pixel outside
    that shared region is counted as different.

    Returns:
        A ``(different_pixels, total_pixels)`` tuple of ints.
    """
    ref_rows, ref_cols = ref_img.shape[:2]
    tgt_rows, tgt_cols = target_img.shape[:2]

    canvas_pixels = max(ref_rows, tgt_rows) * max(ref_cols, tgt_cols)
    shared_rows = min(ref_rows, tgt_rows)
    shared_cols = min(ref_cols, tgt_cols)

    # Per-pixel mismatch inside the region covered by both pages.
    mismatch = np.any(
        ref_img[:shared_rows, :shared_cols] != target_img[:shared_rows, :shared_cols],
        axis=2,
    )
    shared_diff = int(np.count_nonzero(mismatch))

    # Canvas pixels present in only one page always count as different;
    # this term is zero when both pages have the same shape.
    unshared = canvas_pixels - shared_rows * shared_cols
    return shared_diff + unshared, canvas_pixels
 def compare_pdf_images(ref_pdf, target_pdf, dpi=150, threshold_percent=0.001):
    """Rasterize two PDFs and measure how many pixels differ.

    Pages are compared pairwise in order. A page that exists in only one
    of the documents contributes its full pixel area to both the total
    and the differing count.

    Args:
        ref_pdf: Path of the reference PDF.
        target_pdf: Path of the PDF under test.
        dpi: Rendering resolution forwarded to ``render_pdf_to_images``.
        threshold_percent: The comparison passes when the percentage of
            differing pixels is strictly below this value.

    Returns:
        A dict with the keys ``different_pixels``, ``total_pixels``,
        ``diff_percent``, ``pages_ref``, ``pages_target`` and ``passed``.
    """
    ref_pages = render_pdf_to_images(ref_pdf, dpi=dpi)
    target_pages = render_pdf_to_images(target_pdf, dpi=dpi)

    pixel_total = 0
    pixel_diff = 0

    # Pages present in both documents are compared pixel by pixel.
    for ref_page, target_page in zip(ref_pages, target_pages):
        page_diff, page_total = compare_rendered_pages(ref_page, target_page)
        pixel_diff += page_diff
        pixel_total += page_total

    # At most one of these slices is non-empty: the surplus pages of the
    # longer document, each of which counts as entirely different.
    paired = min(len(ref_pages), len(target_pages))
    for unmatched in ref_pages[paired:] + target_pages[paired:]:
        area = unmatched.shape[0] * unmatched.shape[1]
        pixel_diff += area
        pixel_total += area

    if pixel_total:
        diff_percent = pixel_diff / pixel_total * 100.0
    else:
        diff_percent = 0.0

    return {
        "different_pixels": pixel_diff,
        "total_pixels": pixel_total,
        "diff_percent": diff_percent,
        "pages_ref": len(ref_pages),
        "pages_target": len(target_pages),
        "passed": diff_percent < threshold_percent
    }
 def compare_required_figures(reference_figure_dir, target_figure_dir):
    """Compare each required GW150914 figure PDF between two directories.

    For every required file name, the reference copy and then the target
    copy must exist; the pair is then passed to ``compare_pdf_images``
    and the resulting dict is tagged with the figure's ``name``.

    Returns:
        A list with one comparison dict per required figure, in the
        order the figures are listed below.

    Raises:
        FileNotFoundError: If a reference or target figure is missing.
    """
    required_pdfs = (
        "ADM_Constraint_Grid_Level_0.pdf",
        "BH_Trajectory_21_XY.pdf",
        "BH_Trajectory_XY.pdf",
    )

    reports = []
    for pdf_name in required_pdfs:
        ref_pdf = os.path.join(reference_figure_dir, pdf_name)
        target_pdf = os.path.join(target_figure_dir, pdf_name)

        # The reference is checked before the target so a missing
        # baseline is reported first.
        if not os.path.exists(ref_pdf):
            raise FileNotFoundError(f"Reference figure not found: {ref_pdf}")
        if not os.path.exists(target_pdf):
            raise FileNotFoundError(f"Target figure not found: {target_pdf}")

        report = compare_pdf_images(ref_pdf, target_pdf)
        report["name"] = pdf_name
        reports.append(report)

    return reports
 def calculate_all_rms_errors(bh_data_ref, bh_data_target):
    """
@@ -300,7 +165,7 @@ def print_rms_results(rms_dict, error, threshold=1.0):
    return all_passed
-def print_constraint_results(results, threshold=2.0):
+def print_constraint_results(results, threshold=2.0):
    print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
    print("-" * 65)
@@ -315,49 +180,22 @@ def print_constraint_results(results, threshold=2.0):
    print(f"\n   Maximum violation:  {results['max_violation']:.6f}")
    print(f"   Requirement:        < {threshold}")
    print(f"   Status:             {get_status_text(passed)}")
-
+
-    return passed
+    return passed
-
+
-
+
-def print_figure_results(results, threshold_percent=0.001):
+def print_summary(rms_passed, constraint_passed):
-    print(f"\n{Color.BOLD}3. Figure Pixel Comparison (PDF Rasterization){Color.RESET}")
+    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
-    print("-" * 65)
+    print(Color.BOLD + "Verification Summary" + Color.RESET)
-    print(f"   Requirement: < {threshold_percent:.3f}% differing pixels\n")
+    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
-
+
-    all_passed = True
+    all_passed = rms_passed and constraint_passed
-    for result in results:
+    
-        passed = result["passed"]
+    res_rms = get_status_text(rms_passed)
-        all_passed = all_passed and passed
+    res_con = get_status_text(constraint_passed)
-        status = get_status_text(passed)
+
-        print(f"   {result['name']:32}: {result['diff_percent']:10.6f}%   |   Status: {status}")
+    print(f"   [1] Comprehensive RMS check:      {res_rms}")
-
+    print(f"   [2] ADM constraint check:         {res_con}")
        if result["pages_ref"] != result["pages_target"]:
            print(f"   {'':32}  pages(ref/target): {result['pages_ref']}/{result['pages_target']}")
    return all_passed
 def print_figure_error(error_message):
    """Print the figure-comparison section with an error and report failure.

    Emits the section heading, a 65-character rule and the error message
    in red, then returns ``False`` so the caller can record the figure
    check as failed.
    """
    heading = f"\n{Color.BOLD}3. Figure Pixel Comparison (PDF Rasterization){Color.RESET}"
    rule = "-" * 65
    detail = f"   {Color.RED}Error: {error_message}{Color.RESET}"
    print(heading)
    print(rule)
    print(detail)
    return False
 def print_summary(rms_passed, constraint_passed, figure_passed):
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
    print(Color.BOLD + "Verification Summary" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
    all_passed = rms_passed and constraint_passed and figure_passed
    res_rms = get_status_text(rms_passed)
    res_con = get_status_text(constraint_passed)
    res_fig = get_status_text(figure_passed)
    print(f"   [1] Comprehensive RMS check:      {res_rms}")
    print(f"   [2] ADM constraint check:         {res_con}")
    print(f"   [3] Figure pixel comparison:      {res_fig}")
    final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
    print(f"\n   Overall result: {final_status}")
@@ -372,14 +210,12 @@ def main():
        script_dir = os.path.dirname(os.path.abspath(__file__))
        target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
-    script_dir = os.path.dirname(os.path.abspath(__file__))
+    script_dir = os.path.dirname(os.path.abspath(__file__))
-    reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
+    reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
-    target_figure_dir = resolve_figure_dir(target_dir)
+
-    reference_figure_dir = os.path.join(script_dir, "GW150914-origin/figure")
+    bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
-
+    bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
-    bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
+    constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
    bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
    constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
    if not os.path.exists(bh_file_ref):
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
@@ -391,11 +227,9 @@ def main():
        print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
        sys.exit(1)
-    print_header()
+    print_header()
-    print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
+    print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
-    print(f"{Color.BOLD}Target (Optimized):  {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
+    print(f"{Color.BOLD}Target (Optimized):  {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
    print(f"{Color.BOLD}Reference Figures:   {Color.RESET} {Color.BLUE}{reference_figure_dir}{Color.RESET}")
    print(f"{Color.BOLD}Target Figures:      {Color.RESET} {Color.BLUE}{target_figure_dir}{Color.RESET}")
    bh_data_ref = load_bh_trajectory(bh_file_ref)
    bh_data_target = load_bh_trajectory(bh_file_target)
@@ -405,18 +239,12 @@ def main():
    rms_dict, error = calculate_all_rms_errors(bh_data_ref, bh_data_target)
    rms_passed = print_rms_results(rms_dict, error)
-    # Output constraint results
+    # Output constraint results
-    constraint_results = analyze_constraint_violation(constraint_data)
+    constraint_results = analyze_constraint_violation(constraint_data)
-    constraint_passed = print_constraint_results(constraint_results)
+    constraint_passed = print_constraint_results(constraint_results)
-
+
-    try:
+    all_passed = print_summary(rms_passed, constraint_passed)
-        figure_results = compare_required_figures(reference_figure_dir, target_figure_dir)
+    sys.exit(0 if all_passed else 1)
        figure_passed = print_figure_results(figure_results)
    except (FileNotFoundError, RuntimeError) as exc:
        figure_passed = print_figure_error(str(exc))
    all_passed = print_summary(rms_passed, constraint_passed, figure_passed)
    sys.exit(0 if all_passed else 1)
 if __name__ == "__main__":
    main()
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -108,9 +108,6 @@ namespace Parallel
    MPI_Status *stats;
    int max_reqs;
    bool lengths_valid;
    int *tc_req_node;
    int *tc_req_is_recv;
    int *tc_completed;
    SyncCache();
    void invalidate();
    void destroy();
@@ -124,10 +121,7 @@ namespace Parallel
  struct AsyncSyncState {
    int req_no;
    bool active;
-    int *req_node;
+    AsyncSyncState() : req_no(0), active(false) {}
    int *req_is_recv;
    int pending_recv;
    AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {}
  };
  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
@@ -179,13 +173,12 @@ namespace Parallel
  MyList<Parallel::gridseg> *clone_gsl(MyList<Parallel::gridseg> *p, bool first_only);
  MyList<Parallel::gridseg> *build_bulk_gsl(Patch *Pat); // similar to build_owned_gsl0 but does not care rank issue
  MyList<Parallel::gridseg> *build_bulk_gsl(Block *bp, Patch *Pat);
-  void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
+  void build_PhysBD_gstl(Patch *Pat, MyList<Parallel::gridseg> *srci, MyList<Parallel::gridseg> *dsti,
-                         MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
+                         MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
-  void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
+  void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
-  double L2Norm(Patch *Pat, var *vf);
+  double L2Norm(Patch *Pat, var *vf);
-  void L2Norm7(Patch *Pat, var **vf, double *norms);
+  void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
-  void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
+  void checkvarl(MyList<var> *pp, bool first_only);
  void checkvarl(MyList<var> *pp, bool first_only);
  MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
  MyList<Parallel::gridseg> *divide_gs(MyList<Parallel::gridseg> *p, Patch *Pat);
  void prepare_inter_time_level(Patch *Pat,
@@ -217,12 +210,11 @@ namespace Parallel
  void aligncheck(double *bbox0, double *bboxl, int lev, double *DH0, int *shape);
  bool point_locat_gsl(double *pox, MyList<Parallel::gridseg> *gsl);
  void checkpatchlist(MyList<Patch> *PatL, bool buflog);
-
+
-  double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
+  double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
-  void L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here);
+  bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
-  bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
+                             int NN, double **XX,
-                             int NN, double **XX,
+                             double *Shellf, int Symmetry, MPI_Comm Comm_here);
                             double *Shellf, int Symmetry, MPI_Comm Comm_here);
 #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
  MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
                            bool periodic, int start_rank, int end_rank, int nodes = 0);
--- a/AMSS_NCKU_source/ShellPatch.C
+++ b/AMSS_NCKU_source/ShellPatch.C
@@ -3439,10 +3439,10 @@ void ShellPatch::write_Pablo_file_ss(int *ext, double xmin, double xmax, double
  delete[] Z;
 }
-double ShellPatch::L2Norm(var *vf)
+double ShellPatch::L2Norm(var *vf)
-{
+{
-  double tvf, dtvf = 0;
+  double tvf, dtvf = 0;
-  int BDW = overghost;
+  int BDW = overghost;
  MyList<ss_patch> *sPp = PatL;
  while (sPp)
@@ -3469,50 +3469,13 @@ double ShellPatch::L2Norm(var *vf)
  MPI_Allreduce(&dtvf, &tvf, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  tvf = sqrt(tvf);
-
+
-  return tvf;
+  return tvf;
-}
+}
-void ShellPatch::L2Norm7(var **vf, double *norms)
+
-{
+// Find the maximum absolute value: XX stores the position of the maximum, Shellf stores the maximum values themselves
-  double tvf[7], dtvf[7];
+void ShellPatch::Find_Maximum(MyList<var> *VarList, double *XX,
-  int BDW = overghost;
+                              double *Shellf)
  for (int i = 0; i < 7; i++)
    dtvf[i] = 0;
  MyList<ss_patch> *sPp = PatL;
  while (sPp)
  {
    MyList<Block> *Bp = sPp->data->blb;
    while (Bp)
    {
      Block *cg = Bp->data;
      if (myrank == cg->rank)
      {
        f_l2normhelper7(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                        sPp->data->bbox[0], sPp->data->bbox[1], sPp->data->bbox[2],
                        sPp->data->bbox[3], sPp->data->bbox[4], sPp->data->bbox[5],
                        cg->fgfs[vf[0]->sgfn], cg->fgfs[vf[1]->sgfn], cg->fgfs[vf[2]->sgfn],
                        cg->fgfs[vf[3]->sgfn], cg->fgfs[vf[4]->sgfn], cg->fgfs[vf[5]->sgfn],
                        cg->fgfs[vf[6]->sgfn], tvf, BDW);
        for (int i = 0; i < 7; i++)
          dtvf[i] += tvf[i];
      }
      if (Bp == sPp->data->ble)
        break;
      Bp = Bp->next;
    }
    sPp = sPp->next;
  }
  MPI_Allreduce(dtvf, tvf, 7, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  for (int i = 0; i < 7; i++)
    norms[i] = sqrt(tvf[i]);
 }
 // Find the maximum absolute value: XX stores the position of the maximum, Shellf stores the maximum values themselves
 void ShellPatch::Find_Maximum(MyList<var> *VarList, double *XX,
                              double *Shellf)
 {
  MyList<var> *varl;
  int num_var = 0;
--- a/AMSS_NCKU_source/ShellPatch.h
+++ b/AMSS_NCKU_source/ShellPatch.h
@@ -195,11 +195,10 @@ public:
   bool Interp_One_Point(MyList<var> *VarList,
                         double *XX, /*input global Cartesian coordinate*/
                         double *Shellf, int Symmetry);
-   void write_Pablo_file_ss(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax,
+   void write_Pablo_file_ss(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax,
-                            char *filename, int sst);
+                            char *filename, int sst);
-   double L2Norm(var *vf);
+   double L2Norm(var *vf);
-   void L2Norm7(var **vf, double *norms);
+   void Find_Maximum(MyList<var> *VarList, double *XX, double *Shellf);
-   void Find_Maximum(MyList<var> *VarList, double *XX, double *Shellf);
+};
 };
 #endif /* SHELLPATCH_H */
--- a/AMSS_NCKU_source/bssnEM_class.C
+++ b/AMSS_NCKU_source/bssnEM_class.C
@@ -258,8 +258,6 @@ void bssnEM_class::Initialize()
    PhysTime = StartTime;
    Setup_Black_Hole_position();
  }
  setup_transfer_caches();
 }
 //================================================================================================
--- a/AMSS_NCKU_source/bssnEScalar_class.C
+++ b/AMSS_NCKU_source/bssnEScalar_class.C
@@ -23,14 +23,8 @@ using namespace std;
 #include "rungekutta4_rout.h"
 #include "sommerfeld_rout.h"
 #include "getnp4.h"
-#include "shellfunctions.h"
+#include "shellfunctions.h"
-#include "parameters.h"
+#include "parameters.h"
 #if BSSN_USE_ESCALAR_C_KERNEL
 #define BSSN_ESCALAR_RHS f_compute_rhs_bssn_escalar_c
 #else
 #define BSSN_ESCALAR_RHS f_compute_rhs_bssn_escalar
 #endif
 #ifdef With_AHF
 #include "derivatives.h"
@@ -80,8 +74,8 @@ bssnEScalar_class::bssnEScalar_class(double Couranti, double StartTimei, double
 //================================================================================================
-void bssnEScalar_class::Initialize()
+void bssnEScalar_class::Initialize()
-{
+{
  Sphio = new var("Sphio", ngfs++, 1, 1, 1);
  Spio = new var("Spio", ngfs++, 1, 1, 1);
  Sphi0 = new var("Sphi0", ngfs++, 1, 1, 1);
@@ -138,14 +132,11 @@ void bssnEScalar_class::Initialize()
      }
    } 
-  GH = new cgh(0, ngfs, Symmetry, pname, checkrun, ErrorMonitor);
+  GH = new cgh(0, ngfs, Symmetry, pname, checkrun, ErrorMonitor);
-  ConstraintRefreshLevels = new int[GH->levels];
+  if (checkrun)
-  for (int il = 0; il < GH->levels; il++)
+    CheckPoint->readcheck_cgh(PhysTime, GH, myrank, nprocs, Symmetry);
-    ConstraintRefreshLevels[il] = 0;
+  else
-  if (checkrun)
+    GH->compose_cgh(nprocs);
    CheckPoint->readcheck_cgh(PhysTime, GH, myrank, nprocs, Symmetry);
  else
    GH->compose_cgh(nprocs);
 #ifdef WithShell
  SH = new ShellPatch(0, ngfs, pname, Symmetry, myrank, ErrorMonitor);
@@ -169,14 +160,12 @@ void bssnEScalar_class::Initialize()
  {
    CheckPoint->read_Black_Hole_position(BH_num_input, BH_num, Porg0, Pmom, Spin, Mass, Porgbr, Porg, Porg1, Porg_rhs);
  }
-  else
+  else
-  {
+  {
-    PhysTime = StartTime;
+    PhysTime = StartTime;
-    Setup_Black_Hole_position();
+    Setup_Black_Hole_position();
-  }
+  }
-
+}
  setup_transfer_caches();
 }
 //================================================================================================
@@ -218,10 +207,10 @@ bssnEScalar_class::~bssnEScalar_class()
 // Read initial data solved by Ansorg, PRD 70, 064011 (2004)
-void bssnEScalar_class::Read_Ansorg()
+void bssnEScalar_class::Read_Ansorg()
-{
+{
-  if (!checkrun)
+  if (!checkrun)
-  {
+  {
    if (myrank == 0)
      cout << "Read initial data from Ansorg's solver,"
           << " please be sure the input parameters for black holes are puncture parameters!!" 
@@ -238,12 +227,9 @@ void bssnEScalar_class::Read_Ansorg()
        cout << "Error inputpar" << endl;
        exit(0);
      }
-    }
+    }
-    int BH_NM;
+    int BH_NM;
-    double *Porg_here;
+    double *Porg_here;
    double *pmom_local;
    double *spin_local;
    double *mass_local;
    // read parameter from file
    {
      const int LEN = 256;
@@ -283,11 +269,11 @@ void bssnEScalar_class::Read_Ansorg()
      }
      inf.close();
    }
-
+
-    Porg_here = new double[3 * BH_NM];
+    Porg_here = new double[3 * BH_NM];
-    pmom_local = new double[3 * BH_NM];
+    Pmom = new double[3 * BH_NM];
-    spin_local = new double[3 * BH_NM];
+    Spin = new double[3 * BH_NM];
-    mass_local = new double[BH_NM];
+    Mass = new double[BH_NM];
    // read parameter from file
    {
      const int LEN = 256;
@@ -319,37 +305,37 @@ void bssnEScalar_class::Read_Ansorg()
        else if (status == 0)
          continue;
-        if (sgrp == "BSSN" && sind < BH_NM)
+        if (sgrp == "BSSN" && sind < BH_NM)
-        {
+        {
-          if (skey == "Mass")
+          if (skey == "Mass")
-            mass_local[sind] = atof(sval.c_str());
+            Mass[sind] = atof(sval.c_str());
-          else if (skey == "Porgx")
+          else if (skey == "Porgx")
-            Porg_here[sind * 3] = atof(sval.c_str());
+            Porg_here[sind * 3] = atof(sval.c_str());
-          else if (skey == "Porgy")
+          else if (skey == "Porgy")
-            Porg_here[sind * 3 + 1] = atof(sval.c_str());
+            Porg_here[sind * 3 + 1] = atof(sval.c_str());
-          else if (skey == "Porgz")
+          else if (skey == "Porgz")
-            Porg_here[sind * 3 + 2] = atof(sval.c_str());
+            Porg_here[sind * 3 + 2] = atof(sval.c_str());
-          else if (skey == "Spinx")
+          else if (skey == "Spinx")
-            spin_local[sind * 3] = atof(sval.c_str());
+            Spin[sind * 3] = atof(sval.c_str());
-          else if (skey == "Spiny")
+          else if (skey == "Spiny")
-            spin_local[sind * 3 + 1] = atof(sval.c_str());
+            Spin[sind * 3 + 1] = atof(sval.c_str());
-          else if (skey == "Spinz")
+          else if (skey == "Spinz")
-            spin_local[sind * 3 + 2] = atof(sval.c_str());
+            Spin[sind * 3 + 2] = atof(sval.c_str());
-          else if (skey == "Pmomx")
+          else if (skey == "Pmomx")
-            pmom_local[sind * 3] = atof(sval.c_str());
+            Pmom[sind * 3] = atof(sval.c_str());
-          else if (skey == "Pmomy")
+          else if (skey == "Pmomy")
-            pmom_local[sind * 3 + 1] = atof(sval.c_str());
+            Pmom[sind * 3 + 1] = atof(sval.c_str());
-          else if (skey == "Pmomz")
+          else if (skey == "Pmomz")
-            pmom_local[sind * 3 + 2] = atof(sval.c_str());
+            Pmom[sind * 3 + 2] = atof(sval.c_str());
-        }
+        }
-      }
+      }
-      inf.close();
+      inf.close();
    }
-    int order = 6;
+    int order = 6;
-    Ansorg read_ansorg("Ansorg.psid", order);
+    Ansorg read_ansorg("Ansorg.psid", order);
-    // set initial data
+    // set initial data
-    for (int lev = 0; lev < GH->levels; lev++)
+    for (int lev = 0; lev < GH->levels; lev++)
-    {
+    {
      MyList<Patch> *Pp = GH->PatL[lev];
      while (Pp)
      {
@@ -372,21 +358,21 @@ void bssnEScalar_class::Read_Ansorg()
                                      cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                                      cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
                                      cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
-                                      cg->fgfs[Lap0->sgfn], 
+                                      cg->fgfs[Lap0->sgfn], 
-                                      cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
+                                      cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
-                                      cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
+                                      cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
-                                      cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
+                                      cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
-                                      mass_local, Porg_here, pmom_local, spin_local, BH_NM);
+                                      Mass, Porg_here, Pmom, Spin, BH_NM);
          }
          if (BL == Pp->data->ble)
            break;
          BL = BL->next;
-        }
+        }
-        Pp = Pp->next;
+        Pp = Pp->next;
-      }
+      }
-    }
+    }
-#ifdef WithShell
+#ifdef WithShell
-    // ShellPatch part
+    // ShellPatch part
    MyList<ss_patch> *Pp = SH->PatL;
    while (Pp)
    {
@@ -414,28 +400,25 @@ void bssnEScalar_class::Read_Ansorg()
                                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                                       cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
                                       cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
-                                       cg->fgfs[Lap0->sgfn], 
+                                       cg->fgfs[Lap0->sgfn], 
-                                       cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
+                                       cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
-                                       cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
+                                       cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
-                                       cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
+                                       cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
-                                       mass_local, Porg_here, pmom_local, spin_local, BH_NM);
+                                       Mass, Porg_here, Pmom, Spin, BH_NM);
        }
        if (BL == Pp->data->ble)
          break;
        BL = BL->next;
-      }
+      }
-      Pp = Pp->next;
+      Pp = Pp->next;
-    }
+    }
-#endif
+#endif
-
+
-    delete[] Porg_here;
+    delete[] Porg_here;
-    delete[] pmom_local;
+    // dump read_in initial data
-    delete[] spin_local;
+    //   for(int lev=0;lev<GH->levels;lev++) Parallel::Dump_Data(GH->PatL[lev],StateList,0,PhysTime,dT);
-    delete[] mass_local;
+  }
-    // dump read_in initial data
+}
    //   for(int lev=0;lev<GH->levels;lev++) Parallel::Dump_Data(GH->PatL[lev],StateList,0,PhysTime,dT);
  }
 }
 //================================================================================================
@@ -449,10 +432,10 @@ void bssnEScalar_class::Read_Ansorg()
 // Read initial data solved by Pablo's Olliptic Phys.Rev.D 82 024005 (2010)
-void bssnEScalar_class::Read_Pablo()
+void bssnEScalar_class::Read_Pablo()
-{
+{
-  if (!checkrun)
+  if (!checkrun)
-  {
+  {
    if (myrank == 0)
      cout << "Read initial data from Pablo's solver,"
           << " please be sure the input parameters for black holes are puncture parameters!!" 
@@ -469,12 +452,9 @@ void bssnEScalar_class::Read_Pablo()
        cout << "Error inputpar" << endl;
        exit(0);
      }
-    }
+    }
-    int BH_NM;
+    int BH_NM;
-    double *Porg_here;
+    double *Porg_here;
    double *pmom_local;
    double *spin_local;
    double *mass_local;
    // read parameter from file
    {
      const int LEN = 256;
@@ -514,11 +494,11 @@ void bssnEScalar_class::Read_Pablo()
      }
      inf.close();
    }
-
+
-    Porg_here = new double[3 * BH_NM];
+    Porg_here = new double[3 * BH_NM];
-    pmom_local = new double[3 * BH_NM];
+    Pmom = new double[3 * BH_NM];
-    spin_local = new double[3 * BH_NM];
+    Spin = new double[3 * BH_NM];
-    mass_local = new double[BH_NM];
+    Mass = new double[BH_NM];
    // read parameter from file
    {
      const int LEN = 256;
@@ -550,31 +530,31 @@ void bssnEScalar_class::Read_Pablo()
        else if (status == 0)
          continue;
-        if (sgrp == "BSSN" && sind < BH_NM)
+        if (sgrp == "BSSN" && sind < BH_NM)
-        {
+        {
-          if (skey == "Mass")
+          if (skey == "Mass")
-            mass_local[sind] = atof(sval.c_str());
+            Mass[sind] = atof(sval.c_str());
-          else if (skey == "Porgx")
+          else if (skey == "Porgx")
-            Porg_here[sind * 3] = atof(sval.c_str());
+            Porg_here[sind * 3] = atof(sval.c_str());
-          else if (skey == "Porgy")
+          else if (skey == "Porgy")
-            Porg_here[sind * 3 + 1] = atof(sval.c_str());
+            Porg_here[sind * 3 + 1] = atof(sval.c_str());
-          else if (skey == "Porgz")
+          else if (skey == "Porgz")
-            Porg_here[sind * 3 + 2] = atof(sval.c_str());
+            Porg_here[sind * 3 + 2] = atof(sval.c_str());
-          else if (skey == "Spinx")
+          else if (skey == "Spinx")
-            spin_local[sind * 3] = atof(sval.c_str());
+            Spin[sind * 3] = atof(sval.c_str());
-          else if (skey == "Spiny")
+          else if (skey == "Spiny")
-            spin_local[sind * 3 + 1] = atof(sval.c_str());
+            Spin[sind * 3 + 1] = atof(sval.c_str());
-          else if (skey == "Spinz")
+          else if (skey == "Spinz")
-            spin_local[sind * 3 + 2] = atof(sval.c_str());
+            Spin[sind * 3 + 2] = atof(sval.c_str());
-          else if (skey == "Pmomx")
+          else if (skey == "Pmomx")
-            pmom_local[sind * 3] = atof(sval.c_str());
+            Pmom[sind * 3] = atof(sval.c_str());
-          else if (skey == "Pmomy")
+          else if (skey == "Pmomy")
-            pmom_local[sind * 3 + 1] = atof(sval.c_str());
+            Pmom[sind * 3 + 1] = atof(sval.c_str());
-          else if (skey == "Pmomz")
+          else if (skey == "Pmomz")
-            pmom_local[sind * 3 + 2] = atof(sval.c_str());
+            Pmom[sind * 3 + 2] = atof(sval.c_str());
-        }
+        }
-      }
+      }
-      inf.close();
+      inf.close();
    }
    bool flag = false;
    int DIM = dim;
@@ -614,11 +594,11 @@ void bssnEScalar_class::Read_Pablo()
                                        cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                                        cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
                                        cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
-                                        cg->fgfs[Lap0->sgfn], 
+                                        cg->fgfs[Lap0->sgfn], 
-                                        cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
+                                        cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
-                                        cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
+                                        cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
-                                        cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
+                                        cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
-                                        mass_local, Porg_here, pmom_local, spin_local, BH_NM);
+                                        Mass, Porg_here, Pmom, Spin, BH_NM);
            }
            if (BL == Pp->data->ble)
              break;
@@ -678,11 +658,11 @@ void bssnEScalar_class::Read_Pablo()
                                         cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                                         cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
                                         cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
-                                         cg->fgfs[Lap0->sgfn], 
+                                         cg->fgfs[Lap0->sgfn], 
-                                         cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
+                                         cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
-                                         cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
+                                         cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
-                                         cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
+                                         cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
-                                         mass_local, Porg_here, pmom_local, spin_local, BH_NM);
+                                         Mass, Porg_here, Pmom, Spin, BH_NM);
          }
          if (BL == Pp->data->ble)
            break;
@@ -704,13 +684,10 @@ void bssnEScalar_class::Read_Pablo()
      Pp = Pp->next;
    }
 #endif
-
+
-    delete[] Porg_here;
+    delete[] Porg_here;
-    delete[] pmom_local;
+    if (flag && myrank == 0)
-    delete[] spin_local;
+      MPI_Abort(MPI_COMM_WORLD, 1);
    delete[] mass_local;
    if (flag && myrank == 0)
      MPI_Abort(MPI_COMM_WORLD, 1);
    // dump read_in initial data
    for (int lev = 0; lev < GH->levels; lev++)
      Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT);
@@ -762,10 +739,10 @@ void bssnEScalar_class::Step(int lev, int YN)
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
-        if (BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+        if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
-                                       cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
+                                       cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
-                                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+                                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
-                                       cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
+                                       cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                                       cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
                                       cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
@@ -1016,12 +993,11 @@ void bssnEScalar_class::Step(int lev, int YN)
  }
 #endif
-  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  sync_predictor_start(lev, SynchList_pre, async_pre);
-#ifdef WithShell
+#ifdef WithShell
-  if (lev == 0)
+  if (lev == 0)
-  {
+  {
    clock_t prev_clock, curr_clock;
    if (myrank == 0)
      curr_clock = clock();
@@ -1033,10 +1009,9 @@ void bssnEScalar_class::Step(int lev, int YN)
      cout << " Shell stuff synchronization used " 
           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
           << " seconds! " << endl;
-    }
+    }
-  }
+  }
-#endif
+#endif
  sync_predictor_finish(lev, async_pre, SynchList_pre);
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -1106,10 +1081,10 @@ void bssnEScalar_class::Step(int lev, int YN)
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif
-          if (BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+          if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
-                                         cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
+                                         cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
-                                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
+                                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
-                                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+                                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
                                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
                                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
                                         cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
@@ -1374,12 +1349,11 @@ void bssnEScalar_class::Step(int lev, int YN)
    }
 #endif
-    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    sync_corrector_start(lev, SynchList_cor, async_cor);
-#ifdef WithShell
+#ifdef WithShell
-    if (lev == 0)
+    if (lev == 0)
-    {
+    {
      clock_t prev_clock, curr_clock;
      if (myrank == 0)
        curr_clock = clock();
@@ -1391,10 +1365,9 @@ void bssnEScalar_class::Step(int lev, int YN)
        cout << " Shell stuff synchronization used " 
             << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
             << " seconds! " << endl;
-      }
+      }
-    }
+    }
-#endif
+#endif
    sync_corrector_finish(lev, async_cor, SynchList_cor);
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
    {
@@ -1862,14 +1835,11 @@ void bssnEScalar_class::AnalysisStuff_EScalar(int lev, double dT_lev)
 //================================================================================================
-void bssnEScalar_class::Interp_Constraint(bool infg)
+void bssnEScalar_class::Interp_Constraint()
-{
+{
-  if (!infg)
+  // we do not support a_lev != 0 yet.
-    return;
+  if (a_lev > 0)
-
+    return;
  // we do not support a_lev != 0 yet.
  if (a_lev > 0)
    return;
  for (int lev = 0; lev < GH->levels; lev++)
  {
@@ -1888,10 +1858,10 @@ void bssnEScalar_class::Interp_Constraint(bool infg)
          if (myrank == cg->rank)
          {
            if (lev > 0)
-              BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+              f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
-                                         cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
+                                         cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
-                                         cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+                                         cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
-                                         cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
+                                         cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                                         cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                                         cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
                                         cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
@@ -2108,10 +2078,10 @@ void bssnEScalar_class::Constraint_Out()
            if (myrank == cg->rank)
            {
              if (lev > 0)
-                BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+                f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
-                                           cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
+                                           cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
-                                           cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+                                           cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
-                                           cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
+                                           cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                                           cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                                           cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
                                           cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
--- a/AMSS_NCKU_source/bssnEScalar_class.h
+++ b/AMSS_NCKU_source/bssnEScalar_class.h
@@ -51,7 +51,7 @@ public:
     void Compute_Psi4(int lev);
     void Step(int lev, int YN);
     void AnalysisStuff_EScalar(int lev, double dT_lev);
-     void Interp_Constraint(bool infg);
+     void Interp_Constraint();
     void Constraint_Out(); 
 protected:
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -31,19 +31,11 @@ using namespace std;
 #include "surface_integral.h"
 #include "checkpoint.h"
-extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN);
+extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN);
-
+
-#ifndef BSSN_USE_TRANSFER_CACHE
+class bssn_class
-#define BSSN_USE_TRANSFER_CACHE 1
+{
-#endif
+public:
 #ifndef BSSN_USE_ESCALAR_C_KERNEL
 #define BSSN_USE_ESCALAR_C_KERNEL 1
 #endif
 class bssn_class
 {
 public:
       int ngfs;
       int nprocs, myrank;
       cgh *GH;
@@ -53,11 +45,10 @@ public:
       int checkrun;
       char checkfilename[50];
       int Steps;
-       double StartTime, TotalTime;
+       double StartTime, TotalTime;
-       double AnasTime, DumpTime, d2DumpTime, CheckTime;
+       double AnasTime, DumpTime, d2DumpTime, CheckTime;
-       double LastAnas, LastConsOut;
+       double LastAnas, LastConsOut;
-       int *ConstraintRefreshLevels;
+       double Courant;
       double Courant;
       double numepss, numepsb, numepsh;
       int Symmetry;
       int maxl, decn;
@@ -139,12 +130,10 @@ public:
       Parallel::SyncCache *sync_cache_cor;  // per-level cache for corrector sync
       Parallel::SyncCache *sync_cache_rp_coarse;  // RestrictProlong sync on PatL[lev-1]
       Parallel::SyncCache *sync_cache_rp_fine;    // RestrictProlong sync on PatL[lev]
       Parallel::SyncCache *sync_cache_restrict;   // cached Restrict in RestrictProlong
       Parallel::SyncCache *sync_cache_outbd;      // cached OutBdLow2Hi in RestrictProlong
-       monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
+       monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
-       monitor *ConVMonitor, *TimingMonitor;
+       monitor *ConVMonitor;
-       surface_integral *Waveshell;
+       surface_integral *Waveshell;
       checkpoint *CheckPoint;
 public:
@@ -175,25 +164,14 @@ public:
       void Setup_KerrSchild();
       void Enforce_algcon(int lev, int fg);
-       void testRestrict();
+       void testRestrict();
-       void testOutBd();
+       void testOutBd();
-       
+       
-       bool check_Stdin_Abort(); 
+       bool check_Stdin_Abort(); 
-       bool use_transfer_cache() const;
+
-       void setup_transfer_caches();
+       virtual void Setup_Initial_Data_Cao();
-       void invalidate_transfer_caches();
+       virtual void Setup_Initial_Data_Lousto();
-       void destroy_transfer_caches();
+       virtual void Initialize();
       void sync_predictor_start(int lev, MyList<var> *VarList, Parallel::AsyncSyncState &async_state);
       void sync_predictor_finish(int lev, Parallel::AsyncSyncState &async_state, MyList<var> *VarList);
       void sync_corrector_start(int lev, MyList<var> *VarList, Parallel::AsyncSyncState &async_state);
       void sync_corrector_finish(int lev, Parallel::AsyncSyncState &async_state, MyList<var> *VarList);
       void sync_evolution(int lev, MyList<var> *VarList, Parallel::SyncCache *cache_array = 0);
       void restrict_evolution(int lev, MyList<var> *src_var_list, MyList<var> *dst_var_list);
       void outbdlow2hi_evolution(int lev, MyList<var> *src_var_list, MyList<var> *dst_var_list);
       virtual void Setup_Initial_Data_Cao();
       virtual void Setup_Initial_Data_Lousto();
       virtual void Initialize();
       virtual void Read_Ansorg();
       virtual void Read_Pablo() {};
       virtual void Compute_Psi4(int lev);
--- a/AMSS_NCKU_source/bssn_escalar_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_escalar_rhs_c.C
@@ -1,169 +0,0 @@
 #include "macrodef.h"
 #include "bssn_rhs.h"
 #include "share_func.h"
 #include "tool.h"
 #include <vector>
 namespace
 {
    // Reuse the temporary workspace across block calls to avoid repeated heap churn
    // in the EScalar wrapper. MPI ranks execute this path sequentially, so a single
    // process-local buffer is sufficient here.
    //
    // f_compute_rhs_bssn_escalar_c grows this vector on demand to hold 17 arrays of
    // nx*ny*nz doubles and carves its scratch arrays out of it; that function never
    // shrinks it, so the capacity stays at the largest block seen so far.
    std::vector<double> g_escalar_tmp_store;
 }
 #ifdef fortran1
 #define f_frpotential frpotential
 #endif
 #ifdef fortran2
 #define f_frpotential FRPOTENTIAL
 #endif
 #ifdef fortran3
 #define f_frpotential frpotential_
 #endif
 // Prototype of the external routine reached through the f_frpotential macro, which
 // expands to frpotential, FRPOTENTIAL or frpotential_ depending on which of
 // fortran1 / fortran2 / fortran3 is defined.
 // In this file it is called as f_frpotential(ex, Sphi, V, dVdSphi): the first
 // argument is the grid-extent array and the last two are grid-sized scratch arrays
 // that are read afterwards as V[i] and dVdSphi[i].
 // NOTE(review): presumably fills the scalar potential V(Sphi) and its derivative
 // dV/dSphi -- confirm against the Fortran implementation.
 extern "C"
 {
    void f_frpotential(int *, double *, double *, double *);
 }
 // Right-hand side for the BSSN variables coupled to a scalar field (Sphi, Spi).
 //
 // Steps performed here:
 //   1. carve 17 grid-sized scratch arrays out of g_escalar_tmp_store;
 //   2. differentiate chi, Lap and Sphi (fderivs / fdderivs) and evaluate
 //      f_frpotential on Sphi;
 //   3. fill Sphi_rhs, Spi_rhs, rho, Sx..Sz and Sxx..Szz point by point;
 //   4. forward everything to f_compute_rhs_bssn, which receives the rho/S* arrays
 //      filled in step 3 together with all remaining arguments unchanged;
 //   5. apply lopsided_kodis to (Sphi, Sphi_rhs) and (Spi, Spi_rhs);
 //   6. scan Sphi_rhs, Spi_rhs and rho for NaN.
 //
 // Parameters:
 //   ex            grid extents, read as ex[0], ex[1], ex[2].
 //   X, Y, Z       coordinate arrays, only passed on to the derivative/BSSN helpers.
 //   dxx, dyy, dzz diagonal metric components stored with 1 subtracted (ONE is
 //                 added back before use); chi and Lap are treated the same way.
 //   All other arrays are indexed as flat arrays of nx*ny*nz doubles.
 //
 // Returns 1 if f_compute_rhs_bssn returns non-zero or a NaN is found in
 // Sphi_rhs, Spi_rhs or rho; returns 0 otherwise.
 //
 // The scratch space is the single file-level buffer, so every call reuses (and
 // overwrites) the same memory; concurrent calls within one process would clobber
 // each other.
 int f_compute_rhs_bssn_escalar_c(int *ex, double &T,
                                 double *X, double *Y, double *Z,
                                 double *chi, double *trK,
                                 double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                                 double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                                 double *Gamx, double *Gamy, double *Gamz,
                                 double *Lap, double *betax, double *betay, double *betaz,
                                 double *dtSfx, double *dtSfy, double *dtSfz,
                                 double *Sphi, double *Spi,
                                 double *chi_rhs, double *trK_rhs,
                                 double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                                 double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                                 double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                                 double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                                 double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                                 double *Sphi_rhs, double *Spi_rhs,
                                 double *rho, double *Sx, double *Sy, double *Sz,
                                 double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                                 double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                                 double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                                 double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                                 double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                                 double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
                                 double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
                                 int &Symmetry, int &Lev, double &eps, int &co)
 {
    const int nx = ex[0], ny = ex[1], nz = ex[2];
    // NOTE(review): the point count is computed in int arithmetic -- confirm that
    // block sizes stay far below INT_MAX points.
    const int all = nx * ny * nz;
    // 17 = 3 (chi derivs) + 3 (Sphi derivs) + 6 (Sphi second derivs)
    //    + 3 (Lap derivs) + 2 (V, dVdSphi); must match the alloc_tmp() calls below.
    const size_t workspace_size = size_t(all) * 17;
    if (g_escalar_tmp_store.size() < workspace_size)
        g_escalar_tmp_store.resize(workspace_size);
    // Taken after the resize so the pointer refers to the current storage.
    double *tmp_ptr = g_escalar_tmp_store.data();
    // Hands out the next n grid-sized slices of the workspace by advancing tmp_ptr.
    auto alloc_tmp = [&](int n = 1) -> double *
    {
        double *ptr = tmp_ptr;
        tmp_ptr += size_t(all) * n;
        return ptr;
    };
    double *chix = alloc_tmp(), *chiy = alloc_tmp(), *chiz = alloc_tmp();
    // Kx, Ky, Kz receive the fderivs output for Sphi (see the third fderivs call).
    double *Kx = alloc_tmp(), *Ky = alloc_tmp(), *Kz = alloc_tmp();
    // fxx..fzz receive the fdderivs output for Sphi.
    double *fxx = alloc_tmp(), *fxy = alloc_tmp(), *fxz = alloc_tmp();
    double *fyy = alloc_tmp(), *fyz = alloc_tmp(), *fzz = alloc_tmp();
    double *Lapx = alloc_tmp(), *Lapy = alloc_tmp(), *Lapz = alloc_tmp();
    double *V = alloc_tmp(), *dVdSphi = alloc_tmp();
    // ZEO is declared but not referenced anywhere in this function.
    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, HALF = 0.5;
    const double SSS[3] = {1.0, 1.0, 1.0};
    // NOTE(review): the three 1.0 arguments appear to be per-axis parity factors
    // (bssn_rhs.f90 passes SYM = 1.D0 in the same positions) -- confirm against
    // the fderivs / fdderivs declarations.
    fderivs(ex, chi, chix, chiy, chiz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Lap, Lapx, Lapy, Lapz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Sphi, Kx, Ky, Kz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, Sphi, fxx, fxy, fxz, fyy, fyz, fzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    f_frpotential(ex, Sphi, V, dVdSphi);
    // Pointwise scalar-field RHS and matter source terms; no neighbour access, so
    // the flat index i is sufficient.
    for (int i = 0; i < all; ++i)
    {
        // Stored fields are offsets from 1; restore the full values.
        const double alpn1 = Lap[i] + ONE;
        const double chin1 = chi[i] + ONE;
        const double gxx = dxx[i] + ONE;
        const double gyy = dyy[i] + ONE;
        const double gzz = dzz[i] + ONE;
        // Determinant of the symmetric 3x3 matrix (gxx gxy gxz / gxy gyy gyz / gxz gyz gzz).
        const double det = gxx * gyy * gzz + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i]
                         - gxz[i] * gyy * gxz[i] - gxy[i] * gxy[i] * gzz - gxx * gyz[i] * gyz[i];
        // Components of the inverse of that matrix (cofactors divided by det).
        const double gupxx = (gyy * gzz - gyz[i] * gyz[i]) / det;
        const double gupxy = -(gxy[i] * gzz - gyz[i] * gxz[i]) / det;
        const double gupxz = (gxy[i] * gyz[i] - gyy * gxz[i]) / det;
        const double gupyy = (gxx * gzz - gxz[i] * gxz[i]) / det;
        const double gupyz = -(gxx * gyz[i] - gxy[i] * gxz[i]) / det;
        const double gupzz = (gxx * gyy - gxy[i] * gxy[i]) / det;
        Sphi_rhs[i] = alpn1 * Spi[i];
        // Spi_rhs is built in three stages; the order matters because each stage
        // rescales the previous partial result.
        // Stage 1: inverse-metric contraction of the Sphi second derivatives minus
        // the (Gam + gup*dchi/(2*chin1)) . dSphi term.
        Spi_rhs[i] = gupxx * fxx[i] + gupyy * fyy[i] + gupzz * fzz[i]
                   + TWO * (gupxy * fxy[i] + gupxz * fxz[i] + gupyz * fyz[i])
                   - ((Gamx[i] + (gupxx * chix[i] + gupxy * chiy[i] + gupxz * chiz[i]) / TWO / chin1) * Kx[i]
                   +  (Gamy[i] + (gupxy * chix[i] + gupyy * chiy[i] + gupyz * chiz[i]) / TWO / chin1) * Ky[i]
                   +  (Gamz[i] + (gupxz * chix[i] + gupyz * chiy[i] + gupzz * chiz[i]) / TWO / chin1) * Kz[i]);
        // Stage 2: scale by alpn1 and add the inverse-metric contraction of the Lap
        // derivatives with the Sphi derivatives.
        Spi_rhs[i] = Spi_rhs[i] * alpn1
                   + gupxx * Lapx[i] * Kx[i] + gupxy * Lapx[i] * Ky[i] + gupxz * Lapx[i] * Kz[i]
                   + gupxy * Lapy[i] * Kx[i] + gupyy * Lapy[i] * Ky[i] + gupyz * Lapy[i] * Kz[i]
                   + gupxz * Lapz[i] * Kx[i] + gupyz * Lapz[i] * Ky[i] + gupzz * Lapz[i] * Kz[i];
        // Stage 3: scale by chin1 and add the trK*Spi and potential-derivative terms.
        Spi_rhs[i] = Spi_rhs[i] * chin1 + alpn1 * (trK[i] * Spi[i] - dVdSphi[i]);
        // rho = chin1 * (1/2 gup^{ab} dSphi_a dSphi_b) + 1/2 Spi^2 + V
        // (off-diagonal terms appear once because the 1/2 cancels their factor 2).
        rho[i] = chin1 * ((gupxx * Kx[i] * Kx[i] + gupyy * Ky[i] * Ky[i] + gupzz * Kz[i] * Kz[i]) * HALF
               + gupxy * Kx[i] * Ky[i] + gupxz * Kx[i] * Kz[i] + gupyz * Ky[i] * Kz[i])
               + Spi[i] * Spi[i] * HALF + V[i];
        Sx[i] = -Spi[i] * Kx[i];
        Sy[i] = -Spi[i] * Ky[i];
        Sz[i] = -Spi[i] * Kz[i];
        // Local shorthand reused by all six S_ab components below.
        const double pressure = (rho[i] - Spi[i] * Spi[i]) / chin1;
        Sxx[i] = Kx[i] * Kx[i] - pressure * gxx;
        Sxy[i] = Kx[i] * Ky[i] - pressure * gxy[i];
        Sxz[i] = Kx[i] * Kz[i] - pressure * gxz[i];
        Syy[i] = Ky[i] * Ky[i] - pressure * gyy;
        Syz[i] = Ky[i] * Kz[i] - pressure * gyz[i];
        Szz[i] = Kz[i] * Kz[i] - pressure * gzz;
    }
    // Delegate the remaining BSSN right-hand sides; rho and S* computed above are
    // passed in as arguments. A non-zero return is propagated as failure.
    if (f_compute_rhs_bssn(ex, T, X, Y, Z,
                           chi, trK,
                           dxx, gxy, gxz, dyy, gyz, dzz,
                           Axx, Axy, Axz, Ayy, Ayz, Azz,
                           Gamx, Gamy, Gamz,
                           Lap, betax, betay, betaz,
                           dtSfx, dtSfy, dtSfz,
                           chi_rhs, trK_rhs,
                           gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
                           Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
                           Gamx_rhs, Gamy_rhs, Gamz_rhs,
                           Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
                           dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
                           rho, Sx, Sy, Sz,
                           Sxx, Sxy, Sxz, Syy, Syz, Szz,
                           Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                           Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                           Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                           Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                           ham_Res, movx_Res, movy_Res, movz_Res,
                           Gmx_Res, Gmy_Res, Gmz_Res,
                           Symmetry, Lev, eps, co))
        return 1;
    // NOTE(review): presumably adds lopsided advection and Kreiss-Oliger
    // dissipation to the scalar RHS arrays -- confirm against lopsided_kodis.
    lopsided_kodis(ex, X, Y, Z, Sphi, Sphi_rhs, betax, betay, betaz, Symmetry, SSS, eps);
    lopsided_kodis(ex, X, Y, Z, Spi, Spi_rhs, betax, betay, betaz, Symmetry, SSS, eps);
    // x != x is true only for NaN; report failure on the first one found.
    for (int i = 0; i < all; ++i)
    {
        if (Sphi_rhs[i] != Sphi_rhs[i] || Spi_rhs[i] != Spi_rhs[i] || rho[i] != rho[i])
            return 1;
    }
    return 0;
 }
--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
@@ -59,10 +59,9 @@
  real*8, dimension(ex(1),ex(2),ex(3)),intent(out) :: Rxx,Rxy,Rxz,Ryy,Ryz,Rzz
  real*8,intent(in) :: eps
  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: ham_Res, movx_Res, movy_Res, movz_Res
-  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
+  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
-!  gont = 0: success; gont = 1: something wrong
+!  gont = 0: success; gont = 1: something wrong
-  integer::gont
+  integer::gont
  integer :: i,j,k
 !~~~~~~> Other variables:
@@ -84,18 +83,11 @@
  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz
  real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz
-  real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
+  real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
-  real*8            :: dX, dY, dZ, PI
+  real*8            :: dX, dY, dZ, PI
-  real*8            :: divb_loc,det_loc
+  real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
-  real*8            :: gupxx_loc,gupxy_loc,gupxz_loc,gupyy_loc,gupyz_loc,gupzz_loc
+  real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
-  real*8            :: Rxx_loc,Rxy_loc,Rxz_loc,Ryy_loc,Ryz_loc,Rzz_loc
+  real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
  real*8            :: fxx_loc,fxy_loc,fxz_loc
  real*8            :: Gamxa_loc,Gamya_loc,Gamza_loc
  real*8            :: f_loc,chin_loc
  real*8            :: l_fxx,l_fxy,l_fxz,l_fyy,l_fyz,l_fzz,S_loc
  real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
  real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
  real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
  double precision,parameter::FF = 0.75d0,eta=2.d0
  real*8, parameter :: F1o3 = 1.D0/3.D0, F2o3 = 2.D0/3.D0,F3o2=1.5d0, F1o6 = 1.D0/6.D0
  real*8, parameter :: F16=1.6d1,F8=8.d0
@@ -104,11 +96,11 @@
  real*8, dimension(ex(1),ex(2),ex(3)) :: reta
 #endif
-#if (GAUGE == 6 || GAUGE == 7)
+#if (GAUGE == 6 || GAUGE == 7)
-  integer :: BHN
+  integer :: BHN,i,j,k
-  real*8, dimension(9) :: Porg
+  real*8, dimension(9) :: Porg
-  real*8, dimension(3) :: Mass
+  real*8, dimension(3) :: Mass
-  real*8 :: r1,r2,M,A,w1,w2,C1,C2
+  real*8 :: r1,r2,M,A,w1,w2,C1,C2
  real*8, dimension(ex(1),ex(2),ex(3)) :: reta
  call getpbh(BHN,Porg,Mass)
@@ -153,204 +145,174 @@
  dY = Y(2) - Y(1)
  dZ = Z(2) - Z(1)
-  do k=1,ex(3)
+  alpn1 = Lap + ONE
-  do j=1,ex(2)
+  chin1 = chi + ONE
-  do i=1,ex(1)
+  gxx = dxx + ONE
-    alpn1(i,j,k) = Lap(i,j,k) + ONE
+  gyy = dyy + ONE
-    chin1(i,j,k) = chi(i,j,k) + ONE
+  gzz = dzz + ONE
    gxx(i,j,k) = dxx(i,j,k) + ONE
    gyy(i,j,k) = dyy(i,j,k) + ONE
    gzz(i,j,k) = dzz(i,j,k) + ONE
  enddo
  enddo
  enddo
  call fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev)
  call fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev)
  call fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev)
-  call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
+  div_beta = betaxx + betayy + betazz
  call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
-  call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
+  chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi
-  call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
+
-  call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
+  call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
-  call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
+  call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
-  call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
+  call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
-  call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
+  call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
-
+  call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
-  do k=1,ex(3)
+  call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
-  do j=1,ex(2)
+
-  do i=1,ex(1)
+  gxx_rhs = - TWO * alpn1 * Axx    -  F2o3 * gxx * div_beta          + &
-    divb_loc = betaxx(i,j,k) + betayy(i,j,k) + betazz(i,j,k)
+              TWO *(  gxx * betaxx +   gxy * betayx +   gxz * betazx)
-    div_beta(i,j,k) = divb_loc
+
-
+  gyy_rhs = - TWO * alpn1 * Ayy    -  F2o3 * gyy * div_beta          + &
-    chi_rhs(i,j,k) = F2o3 * chin1(i,j,k) * (alpn1(i,j,k) * trK(i,j,k) - divb_loc)
+              TWO *(  gxy * betaxy +   gyy * betayy +   gyz * betazy)
-
+
-    gxx_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axx(i,j,k) - F2o3 * gxx(i,j,k) * divb_loc + &
+  gzz_rhs = - TWO * alpn1 * Azz    -  F2o3 * gzz * div_beta          + &
-         TWO * ( gxx(i,j,k) * betaxx(i,j,k) + gxy(i,j,k) * betayx(i,j,k) + gxz(i,j,k) * betazx(i,j,k) )
+              TWO *(  gxz * betaxz +   gyz * betayz +   gzz * betazz)
-
+
-    gyy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayy(i,j,k) - F2o3 * gyy(i,j,k) * divb_loc + &
+  gxy_rhs = - TWO * alpn1 * Axy    +  F1o3 * gxy    * div_beta       + &
-         TWO * ( gxy(i,j,k) * betaxy(i,j,k) + gyy(i,j,k) * betayy(i,j,k) + gyz(i,j,k) * betazy(i,j,k) )
+                      gxx * betaxy                  +   gxz * betazy + &
-
+                                       gyy * betayx +   gyz * betazx   &
-    gzz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Azz(i,j,k) - F2o3 * gzz(i,j,k) * divb_loc + &
+                                                    -   gxy * betazz
-         TWO * ( gxz(i,j,k) * betaxz(i,j,k) + gyz(i,j,k) * betayz(i,j,k) + gzz(i,j,k) * betazz(i,j,k) )
+
-
+  gyz_rhs = - TWO * alpn1 * Ayz    +  F1o3 * gyz    * div_beta       + &
-    gxy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axy(i,j,k) + F1o3 * gxy(i,j,k) * divb_loc + &
+                      gxy * betaxz +   gyy * betayz                  + &
-         gxx(i,j,k) * betaxy(i,j,k) + gxz(i,j,k) * betazy(i,j,k) + gyy(i,j,k) * betayx(i,j,k) + &
+                      gxz * betaxy                  +   gzz * betazy   &
-         gyz(i,j,k) * betazx(i,j,k) - gxy(i,j,k) * betazz(i,j,k)
+                                                    -   gyz * betaxx
-
+ 
-    gyz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayz(i,j,k) + F1o3 * gyz(i,j,k) * divb_loc + &
+  gxz_rhs = - TWO * alpn1 * Axz    +  F1o3 * gxz    * div_beta       + &
-         gxy(i,j,k) * betaxz(i,j,k) + gyy(i,j,k) * betayz(i,j,k) + gxz(i,j,k) * betaxy(i,j,k) + &
+                      gxx * betaxz +   gxy * betayz                  + &
-         gzz(i,j,k) * betazy(i,j,k) - gyz(i,j,k) * betaxx(i,j,k)
+                                       gyz * betayx +   gzz * betazx   &
-
+                                                    -   gxz * betayy     !rhs for gij
-    gxz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axz(i,j,k) + F1o3 * gxz(i,j,k) * divb_loc + &
+
-         gxx(i,j,k) * betaxz(i,j,k) + gxy(i,j,k) * betayz(i,j,k) + gyz(i,j,k) * betayx(i,j,k) + &
+! invert tilted metric
-         gzz(i,j,k) * betazx(i,j,k) - gxz(i,j,k) * betayy(i,j,k)
+  gupzz =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
-
+           gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
-    det_loc = gxx(i,j,k) * gyy(i,j,k) * gzz(i,j,k) + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) + &
+  gupxx =   ( gyy * gzz - gyz * gyz ) / gupzz
-         gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) - gxz(i,j,k) * gyy(i,j,k) * gxz(i,j,k) - &
+  gupxy = - ( gxy * gzz - gyz * gxz ) / gupzz
-         gxy(i,j,k) * gxy(i,j,k) * gzz(i,j,k) - gxx(i,j,k) * gyz(i,j,k) * gyz(i,j,k)
+  gupxz =   ( gxy * gyz - gyy * gxz ) / gupzz
-    gupxx_loc = ( gyy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gyz(i,j,k) ) / det_loc
+  gupyy =   ( gxx * gzz - gxz * gxz ) / gupzz
-    gupxy_loc = - ( gxy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gxz(i,j,k) ) / det_loc
+  gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz
-    gupxz_loc = ( gxy(i,j,k) * gyz(i,j,k) - gyy(i,j,k) * gxz(i,j,k) ) / det_loc
+  gupzz =   ( gxx * gyy - gxy * gxy ) / gupzz
-    gupyy_loc = ( gxx(i,j,k) * gzz(i,j,k) - gxz(i,j,k) * gxz(i,j,k) ) / det_loc
+
-    gupyz_loc = - ( gxx(i,j,k) * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / det_loc
+  if(co == 0)then
-    gupzz_loc = ( gxx(i,j,k) * gyy(i,j,k) - gxy(i,j,k) * gxy(i,j,k) ) / det_loc
+! Gam^i_Res = Gam^i + gup^ij_,j
-    gupxx(i,j,k) = gupxx_loc
+  Gmx_Res = Gamx - (gupxx*(gupxx*gxxx+gupxy*gxyx+gupxz*gxzx)&
-    gupxy(i,j,k) = gupxy_loc
+                   +gupxy*(gupxx*gxyx+gupxy*gyyx+gupxz*gyzx)&
-    gupxz(i,j,k) = gupxz_loc
+                   +gupxz*(gupxx*gxzx+gupxy*gyzx+gupxz*gzzx)&
-    gupyy(i,j,k) = gupyy_loc
+                   +gupxx*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
-    gupyz(i,j,k) = gupyz_loc
+                   +gupxy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
-    gupzz(i,j,k) = gupzz_loc
+                   +gupxz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
-
+                   +gupxx*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
-    if(co == 0)then
+                   +gupxy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
-      Gmx_Res(i,j,k) = Gamx(i,j,k) - ( &
+                   +gupxz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
-           gupxx_loc*(gupxx_loc*gxxx(i,j,k)+gupxy_loc*gxyx(i,j,k)+gupxz_loc*gxzx(i,j,k)) + &
+  Gmy_Res = Gamy - (gupxx*(gupxy*gxxx+gupyy*gxyx+gupyz*gxzx)&
-           gupxy_loc*(gupxx_loc*gxyx(i,j,k)+gupxy_loc*gyyx(i,j,k)+gupxz_loc*gyzx(i,j,k)) + &
+                   +gupxy*(gupxy*gxyx+gupyy*gyyx+gupyz*gyzx)&
-           gupxz_loc*(gupxx_loc*gxzx(i,j,k)+gupxy_loc*gyzx(i,j,k)+gupxz_loc*gzzx(i,j,k)) + &
+                   +gupxz*(gupxy*gxzx+gupyy*gyzx+gupyz*gzzx)&
-           gupxx_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
+                   +gupxy*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
-           gupxy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
+                   +gupyy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
-           gupxz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
+                   +gupyz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
-           gupxx_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
+                   +gupxy*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
-           gupxy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
+                   +gupyy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
-           gupxz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
+                   +gupyz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
-      Gmy_Res(i,j,k) = Gamy(i,j,k) - ( &
+  Gmz_Res = Gamz - (gupxx*(gupxz*gxxx+gupyz*gxyx+gupzz*gxzx)&
-           gupxx_loc*(gupxy_loc*gxxx(i,j,k)+gupyy_loc*gxyx(i,j,k)+gupyz_loc*gxzx(i,j,k)) + &
+                   +gupxy*(gupxz*gxyx+gupyz*gyyx+gupzz*gyzx)&
-           gupxy_loc*(gupxy_loc*gxyx(i,j,k)+gupyy_loc*gyyx(i,j,k)+gupyz_loc*gyzx(i,j,k)) + &
+                   +gupxz*(gupxz*gxzx+gupyz*gyzx+gupzz*gzzx)&
-           gupxz_loc*(gupxy_loc*gxzx(i,j,k)+gupyy_loc*gyzx(i,j,k)+gupyz_loc*gzzx(i,j,k)) + &
+                   +gupxy*(gupxz*gxxy+gupyz*gxyy+gupzz*gxzy)&
-           gupxy_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
+                   +gupyy*(gupxz*gxyy+gupyz*gyyy+gupzz*gyzy)&
-           gupyy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
+                   +gupyz*(gupxz*gxzy+gupyz*gyzy+gupzz*gzzy)&
-           gupyz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
+                   +gupxz*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
-           gupxy_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
+                   +gupyz*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
-           gupyy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
+                   +gupzz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
-           gupyz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
+  endif
-      Gmz_Res(i,j,k) = Gamz(i,j,k) - ( &
+
-           gupxx_loc*(gupxz_loc*gxxx(i,j,k)+gupyz_loc*gxyx(i,j,k)+gupzz_loc*gxzx(i,j,k)) + &
+! second kind of connection
-           gupxy_loc*(gupxz_loc*gxyx(i,j,k)+gupyz_loc*gyyx(i,j,k)+gupzz_loc*gyzx(i,j,k)) + &
+  Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz ))
-           gupxz_loc*(gupxz_loc*gxzx(i,j,k)+gupyz_loc*gyzx(i,j,k)+gupzz_loc*gzzx(i,j,k)) + &
+  Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz ))
-           gupxy_loc*(gupxz_loc*gxxy(i,j,k)+gupyz_loc*gxyy(i,j,k)+gupzz_loc*gxzy(i,j,k)) + &
+  Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz ))
-           gupyy_loc*(gupxz_loc*gxyy(i,j,k)+gupyz_loc*gyyy(i,j,k)+gupzz_loc*gyzy(i,j,k)) + &
+ 
-           gupyz_loc*(gupxz_loc*gxzy(i,j,k)+gupyz_loc*gyzy(i,j,k)+gupzz_loc*gzzy(i,j,k)) + &
+  Gamxyy =HALF*( gupxx*(TWO*gxyy - gyyx ) + gupxy*gyyy + gupxz*(TWO*gyzy - gyyz ))
-           gupxz_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
+  Gamyyy =HALF*( gupxy*(TWO*gxyy - gyyx ) + gupyy*gyyy + gupyz*(TWO*gyzy - gyyz ))
-           gupyz_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
+  Gamzyy =HALF*( gupxz*(TWO*gxyy - gyyx ) + gupyz*gyyy + gupzz*(TWO*gyzy - gyyz ))
-           gupzz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
+
-    endif
+  Gamxzz =HALF*( gupxx*(TWO*gxzz - gzzx ) + gupxy*(TWO*gyzz - gzzy ) + gupxz*gzzz)
-
+  Gamyzz =HALF*( gupxy*(TWO*gxzz - gzzx ) + gupyy*(TWO*gyzz - gzzy ) + gupyz*gzzz)
-    Gamxxx(i,j,k)=HALF*( gupxx_loc*gxxx(i,j,k) + gupxy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupxz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
+  Gamzzz =HALF*( gupxz*(TWO*gxzz - gzzx ) + gupyz*(TWO*gyzz - gzzy ) + gupzz*gzzz)
-    Gamyxx(i,j,k)=HALF*( gupxy_loc*gxxx(i,j,k) + gupyy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupyz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
+
-    Gamzxx(i,j,k)=HALF*( gupxz_loc*gxxx(i,j,k) + gupyz_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupzz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
+  Gamxxy =HALF*( gupxx*gxxy + gupxy*gyyx + gupxz*( gxzy + gyzx - gxyz ) )
-
+  Gamyxy =HALF*( gupxy*gxxy + gupyy*gyyx + gupyz*( gxzy + gyzx - gxyz ) )
-    Gamxyy(i,j,k)=HALF*( gupxx_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupxy_loc*gyyy(i,j,k) + gupxz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
+  Gamzxy =HALF*( gupxz*gxxy + gupyz*gyyx + gupzz*( gxzy + gyzx - gxyz ) )
-    Gamyyy(i,j,k)=HALF*( gupxy_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyy_loc*gyyy(i,j,k) + gupyz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
+
-    Gamzyy(i,j,k)=HALF*( gupxz_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyz_loc*gyyy(i,j,k) + gupzz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
+  Gamxxz =HALF*( gupxx*gxxz + gupxy*( gxyz + gyzx - gxzy ) + gupxz*gzzx )
-
+  Gamyxz =HALF*( gupxy*gxxz + gupyy*( gxyz + gyzx - gxzy ) + gupyz*gzzx )
-    Gamxzz(i,j,k)=HALF*( gupxx_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupxy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupxz_loc*gzzz(i,j,k))
+  Gamzxz =HALF*( gupxz*gxxz + gupyz*( gxyz + gyzx - gxzy ) + gupzz*gzzx )
-    Gamyzz(i,j,k)=HALF*( gupxy_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupyz_loc*gzzz(i,j,k))
+
-    Gamzzz(i,j,k)=HALF*( gupxz_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyz_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupzz_loc*gzzz(i,j,k))
+  Gamxyz =HALF*( gupxx*( gxyz + gxzy - gyzx ) + gupxy*gyyz + gupxz*gzzy )
-
+  Gamyyz =HALF*( gupxy*( gxyz + gxzy - gyzx ) + gupyy*gyyz + gupyz*gzzy )
-    Gamxxy(i,j,k)=HALF*( gupxx_loc*gxxy(i,j,k) + gupxy_loc*gyyx(i,j,k) + gupxz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
+  Gamzyz =HALF*( gupxz*( gxyz + gxzy - gyzx ) + gupyz*gyyz + gupzz*gzzy )
-    Gamyxy(i,j,k)=HALF*( gupxy_loc*gxxy(i,j,k) + gupyy_loc*gyyx(i,j,k) + gupyz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
+! Raise indices of \tilde A_{ij} and store in R_ij
-    Gamzxy(i,j,k)=HALF*( gupxz_loc*gxxy(i,j,k) + gupyz_loc*gyyx(i,j,k) + gupzz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
+
-
+  Rxx =    gupxx * gupxx * Axx + gupxy * gupxy * Ayy + gupxz * gupxz * Azz + &
-    Gamxxz(i,j,k)=HALF*( gupxx_loc*gxxz(i,j,k) + gupxy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupxz_loc*gzzx(i,j,k) )
+      TWO*(gupxx * gupxy * Axy + gupxx * gupxz * Axz + gupxy * gupxz * Ayz)
-    Gamyxz(i,j,k)=HALF*( gupxy_loc*gxxz(i,j,k) + gupyy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupyz_loc*gzzx(i,j,k) )
+
-    Gamzxz(i,j,k)=HALF*( gupxz_loc*gxxz(i,j,k) + gupyz_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupzz_loc*gzzx(i,j,k) )
+  Ryy =    gupxy * gupxy * Axx + gupyy * gupyy * Ayy + gupyz * gupyz * Azz + &
-
+      TWO*(gupxy * gupyy * Axy + gupxy * gupyz * Axz + gupyy * gupyz * Ayz)
-    Gamxyz(i,j,k)=HALF*( gupxx_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupxy_loc*gyyz(i,j,k) + gupxz_loc*gzzy(i,j,k) )
+
-    Gamyyz(i,j,k)=HALF*( gupxy_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyy_loc*gyyz(i,j,k) + gupyz_loc*gzzy(i,j,k) )
+  Rzz =    gupxz * gupxz * Axx + gupyz * gupyz * Ayy + gupzz * gupzz * Azz + &
-    Gamzyz(i,j,k)=HALF*( gupxz_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyz_loc*gyyz(i,j,k) + gupzz_loc*gzzy(i,j,k) )
+      TWO*(gupxz * gupyz * Axy + gupxz * gupzz * Axz + gupyz * gupzz * Ayz)
-  enddo
+
-  enddo
+  Rxy =    gupxx * gupxy * Axx + gupxy * gupyy * Ayy + gupxz * gupyz * Azz + &
-  enddo
+          (gupxx * gupyy       + gupxy * gupxy)* Axy                       + &
-! Raise indices of \tilde A_{ij} and store in R_ij
+          (gupxx * gupyz       + gupxz * gupxy)* Axz                       + &
-
+          (gupxy * gupyz       + gupxz * gupyy)* Ayz
-! Right hand side for Gam^i without shift terms...
+
-  call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
+  Rxz =    gupxx * gupxz * Axx + gupxy * gupyz * Ayy + gupxz * gupzz * Azz + &
-  call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
+          (gupxx * gupyz       + gupxy * gupxz)* Axy                       + &
-  do k=1,ex(3)
+          (gupxx * gupzz       + gupxz * gupxz)* Axz                       + &
-  do j=1,ex(2)
+          (gupxy * gupzz       + gupxz * gupyz)* Ayz
-  do i=1,ex(1)
+
-    gupxx_loc = gupxx(i,j,k)
+  Ryz =    gupxy * gupxz * Axx + gupyy * gupyz * Ayy + gupyz * gupzz * Azz + &
-    gupxy_loc = gupxy(i,j,k)
+          (gupxy * gupyz       + gupyy * gupxz)* Axy                       + &
-    gupxz_loc = gupxz(i,j,k)
+          (gupxy * gupzz       + gupyz * gupxz)* Axz                       + &
-    gupyy_loc = gupyy(i,j,k)
+          (gupyy * gupzz       + gupyz * gupyz)* Ayz
-    gupyz_loc = gupyz(i,j,k)
+
-    gupzz_loc = gupzz(i,j,k)
+! Right hand side for Gam^i without shift terms...
-
+  call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
-    Rxx_loc = gupxx_loc * gupxx_loc * Axx(i,j,k) + gupxy_loc * gupxy_loc * Ayy(i,j,k) + gupxz_loc * gupxz_loc * Azz(i,j,k) + &
+  call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
-         TWO * (gupxx_loc * gupxy_loc * Axy(i,j,k) + gupxx_loc * gupxz_loc * Axz(i,j,k) + gupxy_loc * gupxz_loc * Ayz(i,j,k))
+
-    Ryy_loc = gupxy_loc * gupxy_loc * Axx(i,j,k) + gupyy_loc * gupyy_loc * Ayy(i,j,k) + gupyz_loc * gupyz_loc * Azz(i,j,k) + &
+   Gamx_rhs = - TWO * (   Lapx * Rxx +   Lapy * Rxy +   Lapz * Rxz ) + &
-         TWO * (gupxy_loc * gupyy_loc * Axy(i,j,k) + gupxy_loc * gupyz_loc * Axz(i,j,k) + gupyy_loc * gupyz_loc * Ayz(i,j,k))
+        TWO * alpn1 * (                                                &
-    Rzz_loc = gupxz_loc * gupxz_loc * Axx(i,j,k) + gupyz_loc * gupyz_loc * Ayy(i,j,k) + gupzz_loc * gupzz_loc * Azz(i,j,k) + &
+        -F3o2/chin1 * (   chix * Rxx +   chiy * Rxy +   chiz * Rxz ) - &
-         TWO * (gupxz_loc * gupyz_loc * Axy(i,j,k) + gupxz_loc * gupzz_loc * Axz(i,j,k) + gupyz_loc * gupzz_loc * Ayz(i,j,k))
+              gupxx * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
-    Rxy_loc = gupxx_loc * gupxy_loc * Axx(i,j,k) + gupxy_loc * gupyy_loc * Ayy(i,j,k) + gupxz_loc * gupyz_loc * Azz(i,j,k) + &
+              gupxy * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
-         (gupxx_loc * gupyy_loc + gupxy_loc * gupxy_loc) * Axy(i,j,k) + &
+              gupxz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
-         (gupxx_loc * gupyz_loc + gupxz_loc * gupxy_loc) * Axz(i,j,k) + &
+                        Gamxxx * Rxx + Gamxyy * Ryy + Gamxzz * Rzz   + &
-         (gupxy_loc * gupyz_loc + gupxz_loc * gupyy_loc) * Ayz(i,j,k)
+                TWO * ( Gamxxy * Rxy + Gamxxz * Rxz + Gamxyz * Ryz ) )
-    Rxz_loc = gupxx_loc * gupxz_loc * Axx(i,j,k) + gupxy_loc * gupyz_loc * Ayy(i,j,k) + gupxz_loc * gupzz_loc * Azz(i,j,k) + &
+
-         (gupxx_loc * gupyz_loc + gupxy_loc * gupxz_loc) * Axy(i,j,k) + &
+   Gamy_rhs = - TWO * (   Lapx * Rxy +   Lapy * Ryy +   Lapz * Ryz ) + &
-         (gupxx_loc * gupzz_loc + gupxz_loc * gupxz_loc) * Axz(i,j,k) + &
+        TWO * alpn1 * (                                                &
-         (gupxy_loc * gupzz_loc + gupxz_loc * gupyz_loc) * Ayz(i,j,k)
+        -F3o2/chin1 * (   chix * Rxy +  chiy * Ryy +    chiz * Ryz ) - &
-    Ryz_loc = gupxy_loc * gupxz_loc * Axx(i,j,k) + gupyy_loc * gupyz_loc * Ayy(i,j,k) + gupyz_loc * gupzz_loc * Azz(i,j,k) + &
+              gupxy * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
-         (gupxy_loc * gupyz_loc + gupyy_loc * gupxz_loc) * Axy(i,j,k) + &
+              gupyy * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
-         (gupxy_loc * gupzz_loc + gupyz_loc * gupxz_loc) * Axz(i,j,k) + &
+              gupyz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
-         (gupyy_loc * gupzz_loc + gupyz_loc * gupyz_loc) * Ayz(i,j,k)
+                        Gamyxx * Rxx + Gamyyy * Ryy + Gamyzz * Rzz   + &
-    Rxx(i,j,k) = Rxx_loc
+                TWO * ( Gamyxy * Rxy + Gamyxz * Rxz + Gamyyz * Ryz ) )
-    Ryy(i,j,k) = Ryy_loc
+
-    Rzz(i,j,k) = Rzz_loc
+   Gamz_rhs = - TWO * (   Lapx * Rxz +   Lapy * Ryz +   Lapz * Rzz ) + &
-    Rxy(i,j,k) = Rxy_loc
+        TWO * alpn1 * (                                                &
-    Rxz(i,j,k) = Rxz_loc
+        -F3o2/chin1 * (   chix * Rxz +  chiy * Ryz +    chiz * Rzz ) - &
-    Ryz(i,j,k) = Ryz_loc
+              gupxz * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
-
+              gupyz * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
-    Gamx_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxx_loc + Lapy(i,j,k) * Rxy_loc + Lapz(i,j,k) * Rxz_loc) + &
+              gupzz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
-         TWO * alpn1(i,j,k) * ( &
+                        Gamzxx * Rxx + Gamzyy * Ryy + Gamzzz * Rzz   + &
-         -F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxx_loc + chiy(i,j,k) * Rxy_loc + chiz(i,j,k) * Rxz_loc) - &
+                TWO * ( Gamzxy * Rxy + Gamzxz * Rxz + Gamzyz * Ryz ) )
         gupxx_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
         gupxy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
         gupxz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
         Gamxxx(i,j,k) * Rxx_loc + Gamxyy(i,j,k) * Ryy_loc + Gamxzz(i,j,k) * Rzz_loc + &
         TWO * (Gamxxy(i,j,k) * Rxy_loc + Gamxxz(i,j,k) * Rxz_loc + Gamxyz(i,j,k) * Ryz_loc))
    Gamy_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxy_loc + Lapy(i,j,k) * Ryy_loc + Lapz(i,j,k) * Ryz_loc) + &
         TWO * alpn1(i,j,k) * ( &
         -F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxy_loc + chiy(i,j,k) * Ryy_loc + chiz(i,j,k) * Ryz_loc) - &
         gupxy_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
         gupyy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
         gupyz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
         Gamyxx(i,j,k) * Rxx_loc + Gamyyy(i,j,k) * Ryy_loc + Gamyzz(i,j,k) * Rzz_loc + &
         TWO * (Gamyxy(i,j,k) * Rxy_loc + Gamyxz(i,j,k) * Rxz_loc + Gamyyz(i,j,k) * Ryz_loc))
    Gamz_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxz_loc + Lapy(i,j,k) * Ryz_loc + Lapz(i,j,k) * Rzz_loc) + &
         TWO * alpn1(i,j,k) * ( &
         -F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxz_loc + chiy(i,j,k) * Ryz_loc + chiz(i,j,k) * Rzz_loc) - &
         gupxz_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
         gupyz_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
         gupzz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
         Gamzxx(i,j,k) * Rxx_loc + Gamzyy(i,j,k) * Ryy_loc + Gamzzz(i,j,k) * Rzz_loc + &
         TWO * (Gamzxy(i,j,k) * Rxy_loc + Gamzxz(i,j,k) * Rxz_loc + Gamzyz(i,j,k) * Ryz_loc))
  enddo
  enddo
  enddo
  call fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,&
                X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev)
@@ -359,54 +321,38 @@
  call fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,&
                X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev)
-  call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev)
+  fxx = gxxx + gxyy + gxzz
-  call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
+  fxy = gxyx + gyyy + gyzz
-  call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
+  fxz = gxzx + gyzy + gzzz
-  do k=1,ex(3)
+
-  do j=1,ex(2)
+  Gamxa =       gupxx * Gamxxx + gupyy * Gamxyy + gupzz * Gamxzz + &
-  do i=1,ex(1)
+          TWO*( gupxy * Gamxxy + gupxz * Gamxxz + gupyz * Gamxyz )
-    divb_loc = div_beta(i,j,k)
+  Gamya =       gupxx * Gamyxx + gupyy * Gamyyy + gupzz * Gamyzz + &
-    fxx_loc = gxxx(i,j,k) + gxyy(i,j,k) + gxzz(i,j,k)
+          TWO*( gupxy * Gamyxy + gupxz * Gamyxz + gupyz * Gamyyz )
-    fxy_loc = gxyx(i,j,k) + gyyy(i,j,k) + gyzz(i,j,k)
+  Gamza =       gupxx * Gamzxx + gupyy * Gamzyy + gupzz * Gamzzz + &
-    fxz_loc = gxzx(i,j,k) + gyzy(i,j,k) + gzzz(i,j,k)
+          TWO*( gupxy * Gamzxy + gupxz * Gamzxz + gupyz * Gamzyz )
-
+
-    gupxx_loc = gupxx(i,j,k)
+  call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev)
-    gupxy_loc = gupxy(i,j,k)
+  call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
-    gupxz_loc = gupxz(i,j,k)
+  call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
-    gupyy_loc = gupyy(i,j,k)
+
-    gupyz_loc = gupyz(i,j,k)
+  Gamx_rhs =               Gamx_rhs +  F2o3 *  Gamxa * div_beta        - &
-    gupzz_loc = gupzz(i,j,k)
+                     Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz  + &
-
+             F1o3 * (gupxx * fxx    + gupxy * fxy    + gupxz * fxz    ) + &
-    Gamxa_loc = gupxx_loc * Gamxxx(i,j,k) + gupyy_loc * Gamxyy(i,j,k) + gupzz_loc * Gamxzz(i,j,k) + &
+                     gupxx * gxxx   + gupyy * gyyx   + gupzz * gzzx    + &
-         TWO * (gupxy_loc * Gamxxy(i,j,k) + gupxz_loc * Gamxxz(i,j,k) + gupyz_loc * Gamxyz(i,j,k))
+              TWO * (gupxy * gxyx   + gupxz * gxzx   + gupyz * gyzx  )
-    Gamya_loc = gupxx_loc * Gamyxx(i,j,k) + gupyy_loc * Gamyyy(i,j,k) + gupzz_loc * Gamyzz(i,j,k) + &
+
-         TWO * (gupxy_loc * Gamyxy(i,j,k) + gupxz_loc * Gamyxz(i,j,k) + gupyz_loc * Gamyyz(i,j,k))
+  Gamy_rhs =               Gamy_rhs +  F2o3 *  Gamya * div_beta        - &
-    Gamza_loc = gupxx_loc * Gamzxx(i,j,k) + gupyy_loc * Gamzyy(i,j,k) + gupzz_loc * Gamzzz(i,j,k) + &
+                     Gamxa * betayx - Gamya * betayy - Gamza * betayz  + &
-         TWO * (gupxy_loc * Gamzxy(i,j,k) + gupxz_loc * Gamzxz(i,j,k) + gupyz_loc * Gamzyz(i,j,k))
+             F1o3 * (gupxy * fxx    + gupyy * fxy    + gupyz * fxz    ) + &
-    Gamxa(i,j,k) = Gamxa_loc
+                     gupxx * gxxy   + gupyy * gyyy   + gupzz * gzzy    + &
-    Gamya(i,j,k) = Gamya_loc
+              TWO * (gupxy * gxyy   + gupxz * gxzy   + gupyz * gyzy  )
-    Gamza(i,j,k) = Gamza_loc
+
-
+  Gamz_rhs =               Gamz_rhs +  F2o3 *  Gamza * div_beta        - &
-    Gamx_rhs(i,j,k) = Gamx_rhs(i,j,k) + F2o3 * Gamxa_loc * divb_loc - &
+                     Gamxa * betazx - Gamya * betazy - Gamza * betazz  + &
-         Gamxa_loc * betaxx(i,j,k) - Gamya_loc * betaxy(i,j,k) - Gamza_loc * betaxz(i,j,k) + &
+             F1o3 * (gupxz * fxx    + gupyz * fxy    + gupzz * fxz    ) + &
-         F1o3 * (gupxx_loc * fxx_loc + gupxy_loc * fxy_loc + gupxz_loc * fxz_loc) + &
+                     gupxx * gxxz   + gupyy * gyyz   + gupzz * gzzz    + &
-         gupxx_loc * gxxx(i,j,k) + gupyy_loc * gyyx(i,j,k) + gupzz_loc * gzzx(i,j,k) + &
+              TWO * (gupxy * gxyz   + gupxz * gxzz   + gupyz * gyzz  )    !rhs for Gam^i
         TWO * (gupxy_loc * gxyx(i,j,k) + gupxz_loc * gxzx(i,j,k) + gupyz_loc * gyzx(i,j,k))
    Gamy_rhs(i,j,k) = Gamy_rhs(i,j,k) + F2o3 * Gamya_loc * divb_loc - &
         Gamxa_loc * betayx(i,j,k) - Gamya_loc * betayy(i,j,k) - Gamza_loc * betayz(i,j,k) + &
         F1o3 * (gupxy_loc * fxx_loc + gupyy_loc * fxy_loc + gupyz_loc * fxz_loc) + &
         gupxx_loc * gxxy(i,j,k) + gupyy_loc * gyyy(i,j,k) + gupzz_loc * gzzy(i,j,k) + &
         TWO * (gupxy_loc * gxyy(i,j,k) + gupxz_loc * gxzy(i,j,k) + gupyz_loc * gyzy(i,j,k))
    Gamz_rhs(i,j,k) = Gamz_rhs(i,j,k) + F2o3 * Gamza_loc * divb_loc - &
         Gamxa_loc * betazx(i,j,k) - Gamya_loc * betazy(i,j,k) - Gamza_loc * betazz(i,j,k) + &
         F1o3 * (gupxz_loc * fxx_loc + gupyz_loc * fxy_loc + gupzz_loc * fxz_loc) + &
         gupxx_loc * gxxz(i,j,k) + gupyy_loc * gyyz(i,j,k) + gupzz_loc * gzzz(i,j,k) + &
         TWO * (gupxy_loc * gxyz(i,j,k) + gupxz_loc * gxzz(i,j,k) + gupyz_loc * gyzz(i,j,k))
  enddo
  enddo
  enddo
 !first kind of connection stored in gij,k
  gxxx = gxx * Gamxxx + gxy * Gamyxx + gxz * Gamzxx
@@ -655,190 +601,192 @@
            Gamxyz * gxzz + Gamyyz * gyzz + Gamzyz * gzzz  + &
            Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy  + &
            Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz )
-!covariant second derivative of chi respect to tilted metric
+!covariant second derivative of chi with respect to the conformal (tilde) metric
-  call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
+  call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
-
+
-  do k=1,ex(3)
+  fxx = fxx - Gamxxx * chix - Gamyxx * chiy - Gamzxx * chiz
-  do j=1,ex(2)
+  fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz
-  do i=1,ex(1)
+  fxz = fxz - Gamxxz * chix - Gamyxz * chiy - Gamzxz * chiz
-    fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k) * chix(i,j,k) - Gamyxx(i,j,k) * chiy(i,j,k) - Gamzxx(i,j,k) * chiz(i,j,k)
+  fyy = fyy - Gamxyy * chix - Gamyyy * chiy - Gamzyy * chiz
-    fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k) * chix(i,j,k) - Gamyxy(i,j,k) * chiy(i,j,k) - Gamzxy(i,j,k) * chiz(i,j,k)
+  fyz = fyz - Gamxyz * chix - Gamyyz * chiy - Gamzyz * chiz
-    fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k) * chix(i,j,k) - Gamyxz(i,j,k) * chiy(i,j,k) - Gamzxz(i,j,k) * chiz(i,j,k)
+  fzz = fzz - Gamxzz * chix - Gamyzz * chiy - Gamzzz * chiz
-    fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k) * chix(i,j,k) - Gamyyy(i,j,k) * chiy(i,j,k) - Gamzyy(i,j,k) * chiz(i,j,k)
+! Store D^l D_l chi - 3/(2*chi) D^l chi D_l chi in f
-    fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k) * chix(i,j,k) - Gamyyz(i,j,k) * chiy(i,j,k) - Gamzyz(i,j,k) * chiz(i,j,k)
+
-    fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k) * chix(i,j,k) - Gamyzz(i,j,k) * chiy(i,j,k) - Gamzzz(i,j,k) * chiz(i,j,k)
+  f =        gupxx * ( fxx - F3o2/chin1 * chix * chix ) + &
-
+             gupyy * ( fyy - F3o2/chin1 * chiy * chiy ) + &
-    chin_loc = chin1(i,j,k)
+             gupzz * ( fzz - F3o2/chin1 * chiz * chiz ) + &
-    f_loc = gupxx(i,j,k) * (fxx(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chix(i,j,k)) + &
+       TWO * gupxy * ( fxy - F3o2/chin1 * chix * chiy ) + &
-            gupyy(i,j,k) * (fyy(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiy(i,j,k)) + &
+       TWO * gupxz * ( fxz - F3o2/chin1 * chix * chiz ) + &
-            gupzz(i,j,k) * (fzz(i,j,k) - F3o2/chin_loc * chiz(i,j,k) * chiz(i,j,k)) + &
+       TWO * gupyz * ( fyz - F3o2/chin1 * chiy * chiz ) 
-            TWO * gupxy(i,j,k) * (fxy(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiy(i,j,k)) + &
+! Add chi part to Ricci tensor:
-            TWO * gupxz(i,j,k) * (fxz(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiz(i,j,k)) + &
+
-            TWO * gupyz(i,j,k) * (fyz(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiz(i,j,k))
+  Rxx = Rxx + (fxx - chix*chix/chin1/TWO + gxx * f)/chin1/TWO
-    f(i,j,k) = f_loc
+  Ryy = Ryy + (fyy - chiy*chiy/chin1/TWO + gyy * f)/chin1/TWO
-
+  Rzz = Rzz + (fzz - chiz*chiz/chin1/TWO + gzz * f)/chin1/TWO
-    Rxx(i,j,k) = Rxx(i,j,k) + (fxx(i,j,k) - chix(i,j,k)*chix(i,j,k)/chin_loc/TWO + gxx(i,j,k) * f_loc)/chin_loc/TWO
+  Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO
-    Ryy(i,j,k) = Ryy(i,j,k) + (fyy(i,j,k) - chiy(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gyy(i,j,k) * f_loc)/chin_loc/TWO
+  Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO
-    Rzz(i,j,k) = Rzz(i,j,k) + (fzz(i,j,k) - chiz(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gzz(i,j,k) * f_loc)/chin_loc/TWO
+  Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO
-    Rxy(i,j,k) = Rxy(i,j,k) + (fxy(i,j,k) - chix(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gxy(i,j,k) * f_loc)/chin_loc/TWO
+
-    Rxz(i,j,k) = Rxz(i,j,k) + (fxz(i,j,k) - chix(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gxz(i,j,k) * f_loc)/chin_loc/TWO
+! covariant second derivatives of the lapse with respect to the physical metric
-    Ryz(i,j,k) = Ryz(i,j,k) + (fyz(i,j,k) - chiy(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gyz(i,j,k) * f_loc)/chin_loc/TWO
+  call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
-  enddo
+                SYM,SYM,SYM,symmetry,Lev)
-  enddo
+
-  enddo
+  gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
-
+  gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
-! covariant second derivatives of the lapse respect to physical metric
+  gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1
-  call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
+! now get physical second kind of connection
-                SYM,SYM,SYM,symmetry,Lev)
+  Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF
-
+  Gamyxx = Gamyxx - (                     - gxx * gxxy )*HALF
-  do k=1,ex(3)
+  Gamzxx = Gamzxx - (                     - gxx * gxxz )*HALF
-  do j=1,ex(2)
+  Gamxyy = Gamxyy - (                     - gyy * gxxx )*HALF
-  do i=1,ex(1)
+  Gamyyy = Gamyyy - ( (chiy + chiy)/chin1 - gyy * gxxy )*HALF
-    chin_loc = chin1(i,j,k)
+  Gamzyy = Gamzyy - (                     - gyy * gxxz )*HALF
-    gxxx(i,j,k) = (gupxx(i,j,k) * chix(i,j,k) + gupxy(i,j,k) * chiy(i,j,k) + gupxz(i,j,k) * chiz(i,j,k)) / chin_loc
+  Gamxzz = Gamxzz - (                     - gzz * gxxx )*HALF
-    gxxy(i,j,k) = (gupxy(i,j,k) * chix(i,j,k) + gupyy(i,j,k) * chiy(i,j,k) + gupyz(i,j,k) * chiz(i,j,k)) / chin_loc
+  Gamyzz = Gamyzz - (                     - gzz * gxxy )*HALF
-    gxxz(i,j,k) = (gupxz(i,j,k) * chix(i,j,k) + gupyz(i,j,k) * chiy(i,j,k) + gupzz(i,j,k) * chiz(i,j,k)) / chin_loc
+  Gamzzz = Gamzzz - ( (chiz + chiz)/chin1 - gzz * gxxz )*HALF
-
+  Gamxxy = Gamxxy - (  chiy        /chin1 - gxy * gxxx )*HALF
-    Gamxxx(i,j,k) = Gamxxx(i,j,k) - ( (chix(i,j,k) + chix(i,j,k))/chin_loc - gxx(i,j,k) * gxxx(i,j,k) )*HALF
+  Gamyxy = Gamyxy - (         chix /chin1 - gxy * gxxy )*HALF
-    Gamyxx(i,j,k) = Gamyxx(i,j,k) - (                                   - gxx(i,j,k) * gxxy(i,j,k) )*HALF
+  Gamzxy = Gamzxy - (                     - gxy * gxxz )*HALF
-    Gamzxx(i,j,k) = Gamzxx(i,j,k) - (                                   - gxx(i,j,k) * gxxz(i,j,k) )*HALF
+  Gamxxz = Gamxxz - (  chiz        /chin1 - gxz * gxxx )*HALF
-    Gamxyy(i,j,k) = Gamxyy(i,j,k) - (                                   - gyy(i,j,k) * gxxx(i,j,k) )*HALF
+  Gamyxz = Gamyxz - (                     - gxz * gxxy )*HALF
-    Gamyyy(i,j,k) = Gamyyy(i,j,k) - ( (chiy(i,j,k) + chiy(i,j,k))/chin_loc - gyy(i,j,k) * gxxy(i,j,k) )*HALF
+  Gamzxz = Gamzxz - (         chix /chin1 - gxz * gxxz )*HALF
-    Gamzyy(i,j,k) = Gamzyy(i,j,k) - (                                   - gyy(i,j,k) * gxxz(i,j,k) )*HALF
+  Gamxyz = Gamxyz - (                     - gyz * gxxx )*HALF
-    Gamxzz(i,j,k) = Gamxzz(i,j,k) - (                                   - gzz(i,j,k) * gxxx(i,j,k) )*HALF
+  Gamyyz = Gamyyz - (  chiz        /chin1 - gyz * gxxy )*HALF
-    Gamyzz(i,j,k) = Gamyzz(i,j,k) - (                                   - gzz(i,j,k) * gxxy(i,j,k) )*HALF
+  Gamzyz = Gamzyz - (         chiy /chin1 - gyz * gxxz )*HALF
-    Gamzzz(i,j,k) = Gamzzz(i,j,k) - ( (chiz(i,j,k) + chiz(i,j,k))/chin_loc - gzz(i,j,k) * gxxz(i,j,k) )*HALF
+
-    Gamxxy(i,j,k) = Gamxxy(i,j,k) - ( chiy(i,j,k) /chin_loc - gxy(i,j,k) * gxxx(i,j,k) )*HALF
+  fxx = fxx - Gamxxx*Lapx - Gamyxx*Lapy - Gamzxx*Lapz
-    Gamyxy(i,j,k) = Gamyxy(i,j,k) - ( chix(i,j,k) /chin_loc - gxy(i,j,k) * gxxy(i,j,k) )*HALF
+  fyy = fyy - Gamxyy*Lapx - Gamyyy*Lapy - Gamzyy*Lapz
-    Gamzxy(i,j,k) = Gamzxy(i,j,k) - (                     - gxy(i,j,k) * gxxz(i,j,k) )*HALF
+  fzz = fzz - Gamxzz*Lapx - Gamyzz*Lapy - Gamzzz*Lapz
-    Gamxxz(i,j,k) = Gamxxz(i,j,k) - ( chiz(i,j,k) /chin_loc - gxz(i,j,k) * gxxx(i,j,k) )*HALF
+  fxy = fxy - Gamxxy*Lapx - Gamyxy*Lapy - Gamzxy*Lapz
-    Gamyxz(i,j,k) = Gamyxz(i,j,k) - (                     - gxz(i,j,k) * gxxy(i,j,k) )*HALF
+  fxz = fxz - Gamxxz*Lapx - Gamyxz*Lapy - Gamzxz*Lapz
-    Gamzxz(i,j,k) = Gamzxz(i,j,k) - ( chix(i,j,k) /chin_loc - gxz(i,j,k) * gxxz(i,j,k) )*HALF
+  fyz = fyz - Gamxyz*Lapx - Gamyyz*Lapy - Gamzyz*Lapz
-    Gamxyz(i,j,k) = Gamxyz(i,j,k) - (                     - gyz(i,j,k) * gxxx(i,j,k) )*HALF
+
-    Gamyyz(i,j,k) = Gamyyz(i,j,k) - ( chiz(i,j,k) /chin_loc - gyz(i,j,k) * gxxy(i,j,k) )*HALF
+! store D^i D_i Lap in trK_rhs, up to a factor of chi (multiplied in later)
-    Gamzyz(i,j,k) = Gamzyz(i,j,k) - ( chiy(i,j,k) /chin_loc - gyz(i,j,k) * gxxz(i,j,k) )*HALF
+  trK_rhs =    gupxx * fxx + gupyy * fyy + gupzz * fzz + &
-
+        TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz )
-    fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k)*Lapx(i,j,k) - Gamyxx(i,j,k)*Lapy(i,j,k) - Gamzxx(i,j,k)*Lapz(i,j,k)
+#if 1        
-    fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k)*Lapx(i,j,k) - Gamyyy(i,j,k)*Lapy(i,j,k) - Gamzyy(i,j,k)*Lapz(i,j,k)
+!! follow bam code
-    fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k)*Lapx(i,j,k) - Gamyzz(i,j,k)*Lapy(i,j,k) - Gamzzz(i,j,k)*Lapz(i,j,k)
+  S =  chin1 * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + &
-    fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k)*Lapx(i,j,k) - Gamyxy(i,j,k)*Lapy(i,j,k) - Gamzxy(i,j,k)*Lapz(i,j,k)
+     TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
-    fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k)*Lapx(i,j,k) - Gamyxz(i,j,k)*Lapy(i,j,k) - Gamzxz(i,j,k)*Lapz(i,j,k)
+  f = F2o3 * trK * trK -(&
-    fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k)*Lapx(i,j,k) - Gamyyz(i,j,k)*Lapy(i,j,k) - Gamzyz(i,j,k)*Lapz(i,j,k)
+       gupxx * ( &
-
+       gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
-    trK_rhs(i,j,k) = gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
+       TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz) ) + &
-                     TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k))
+       gupyy * ( &
-  enddo
+       gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + &
-  enddo
+       TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz) ) + &
-  enddo
+       gupzz * ( &
-  do k=1,ex(3)
+       gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + &
-  do j=1,ex(2)
+       TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz) ) + &
-  do i=1,ex(1)
+       TWO * ( &
-    divb_loc = div_beta(i,j,k)
+       gupxy * ( &
-    chin_loc = chin1(i,j,k)
+       gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
-
+       gupxy * (Axx * Ayy + Axy * Axy) + &
-    S_loc = chin_loc * ( gupxx(i,j,k) * Sxx(i,j,k) + gupyy(i,j,k) * Syy(i,j,k) + gupzz(i,j,k) * Szz(i,j,k) + &
+       gupxz * (Axx * Ayz + Axz * Axy) + &
-           TWO * (gupxy(i,j,k) * Sxy(i,j,k) + gupxz(i,j,k) * Sxz(i,j,k) + gupyz(i,j,k) * Syz(i,j,k)) )
+       gupyz * (Axy * Ayz + Axz * Ayy) ) + &
-    S(i,j,k) = S_loc
+       gupxz * ( &
-
+       gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
-    f_loc = F2o3 * trK(i,j,k) * trK(i,j,k) - ( &
+       gupxy * (Axx * Ayz + Axy * Axz) + &
-            gupxx(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + &
+       gupxz * (Axx * Azz + Axz * Axz) + &
-                             gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + &
+       gupyz * (Axy * Azz + Axz * Ayz) ) + &
-                             TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + &
+       gupyz * ( &
-                                    gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k)) ) + &
+       gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + &
-            gupyy(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + &
+       gupxy * (Axy * Ayz + Ayy * Axz) + &
-                             gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
+       gupxz * (Axy * Azz + Ayz * Axz) + &
-                             TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
+       gupyz * (Ayy * Azz + Ayz * Ayz) ) )) -1.6d1*PI*rho + EIGHT * PI * S
-                                    gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k)) ) + &
+  f = - F1o3 *(  gupxx * fxx + gupyy * fyy + gupzz * fzz + &
-            gupzz(i,j,k) * ( gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
+        TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1/chin1*f)
-                             gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + &
+  
-                             TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
+  fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
-                                    gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k)) ) + &
+  fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
-            TWO * ( gupxy(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
+  fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz
-                                     gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + &
+  fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy
-                                     gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + &
+  fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz
-                                     gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
+  fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz
-                                     gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k)) ) + &
+#else        
-                    gupxz(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
+! Add lapse and S_ij parts to Ricci tensor:
-                                     gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
+
-                                     gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + &
+  fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
-                                     gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
+  fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
-                                     gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k)) ) + &
+  fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz
-                    gupyz(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + &
+  fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy
-                                     gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + &
+  fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz
-                                     gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
+  fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz
-                                     gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
+
-                                     gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k)) ) ) ) - &
+! Compute trace-free part (note: chi^-1 and chi cancel!):
-            F16 * PI * rho(i,j,k) + EIGHT * PI * S_loc
+
-
+  f = F1o3 *(  gupxx * fxx + gupyy * fyy + gupzz * fzz + &
-    f_loc = -F1o3 * ( gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
+        TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) )
-            TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + &
+#endif
-            alpn1(i,j,k)/chin_loc * f_loc )
+
-    f(i,j,k) = f_loc
+  Axx_rhs = fxx - gxx * f
-
+  Ayy_rhs = fyy - gyy * f
-    l_fxx = alpn1(i,j,k) * (Rxx(i,j,k) - EIGHT * PI * Sxx(i,j,k)) - fxx(i,j,k)
+  Azz_rhs = fzz - gzz * f
-    l_fxy = alpn1(i,j,k) * (Rxy(i,j,k) - EIGHT * PI * Sxy(i,j,k)) - fxy(i,j,k)
+  Axy_rhs = fxy - gxy * f
-    l_fxz = alpn1(i,j,k) * (Rxz(i,j,k) - EIGHT * PI * Sxz(i,j,k)) - fxz(i,j,k)
+  Axz_rhs = fxz - gxz * f
-    l_fyy = alpn1(i,j,k) * (Ryy(i,j,k) - EIGHT * PI * Syy(i,j,k)) - fyy(i,j,k)
+  Ayz_rhs = fyz - gyz * f
-    l_fyz = alpn1(i,j,k) * (Ryz(i,j,k) - EIGHT * PI * Syz(i,j,k)) - fyz(i,j,k)
+
-    l_fzz = alpn1(i,j,k) * (Rzz(i,j,k) - EIGHT * PI * Szz(i,j,k)) - fzz(i,j,k)
+! Now: store A_il A^l_j into fij:
-
+
-    Axx_rhs(i,j,k) = l_fxx - gxx(i,j,k) * f_loc
+  fxx =       gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
-    Ayy_rhs(i,j,k) = l_fyy - gyy(i,j,k) * f_loc
+       TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz)
-    Azz_rhs(i,j,k) = l_fzz - gzz(i,j,k) * f_loc
+  fyy =       gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + &
-    Axy_rhs(i,j,k) = l_fxy - gxy(i,j,k) * f_loc
+       TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz)
-    Axz_rhs(i,j,k) = l_fxz - gxz(i,j,k) * f_loc
+  fzz =       gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + &
-    Ayz_rhs(i,j,k) = l_fyz - gyz(i,j,k) * f_loc
+       TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz)
-
+  fxy =       gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
-    fxx(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + &
+              gupxy *(Axx * Ayy + Axy * Axy)                            + &
-                 gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + &
+              gupxz *(Axx * Ayz + Axz * Axy)                            + &
-                 gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k))
+              gupyz *(Axy * Ayz + Axz * Ayy)
-    fyy(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + &
+  fxz =       gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
-                 gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
+              gupxy *(Axx * Ayz + Axy * Axz)                            + &
-                 gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k))
+              gupxz *(Axx * Azz + Axz * Axz)                            + &
-    fzz(i,j,k) = gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
+              gupyz *(Axy * Azz + Axz * Ayz)
-                 gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + &
+  fyz =       gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + &
-                 gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k))
+              gupxy *(Axy * Ayz + Ayy * Axz)                            + &
-    fxy(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
+              gupxz *(Axy * Azz + Ayz * Axz)                            + &
-                 gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + &
+              gupyz *(Ayy * Azz + Ayz * Ayz)
-                 gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
+
-                 gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k))
+  f = chin1
-    fxz(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
+! store D^i D_i Lap in trK_rhs
-                 gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + &
+  trK_rhs = f*trK_rhs
-                 gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
+          
-                 gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k))
+  Axx_rhs =           f * Axx_rhs+ alpn1 * (trK * Axx - TWO * fxx)  + &
-    fyz(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + &
+           TWO * (  Axx * betaxx +   Axy * betayx +   Axz * betazx )- &
-                 gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
+             F2o3 * Axx * div_beta
-                 gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
+
-                 gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k))
+  Ayy_rhs =           f * Ayy_rhs+ alpn1 * (trK * Ayy - TWO * fyy)  + &
-
+           TWO * (  Axy * betaxy +   Ayy * betayy +   Ayz * betazy )- &
-    trK_rhs(i,j,k) = chin_loc * trK_rhs(i,j,k)
+             F2o3 * Ayy * div_beta
-
+
-    Axx_rhs(i,j,k) = chin_loc * Axx_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axx(i,j,k) - TWO * fxx(i,j,k)) + &
+  Azz_rhs =           f * Azz_rhs+ alpn1 * (trK * Azz - TWO * fzz)  + &
-                     TWO * (Axx(i,j,k) * betaxx(i,j,k) + Axy(i,j,k) * betayx(i,j,k) + Axz(i,j,k) * betazx(i,j,k)) - &
+           TWO * (  Axz * betaxz +   Ayz * betayz +   Azz * betazz )- &
-                     F2o3 * Axx(i,j,k) * divb_loc
+             F2o3 * Azz * div_beta
-    Ayy_rhs(i,j,k) = chin_loc * Ayy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayy(i,j,k) - TWO * fyy(i,j,k)) + &
+
-                     TWO * (Axy(i,j,k) * betaxy(i,j,k) + Ayy(i,j,k) * betayy(i,j,k) + Ayz(i,j,k) * betazy(i,j,k)) - &
+  Axy_rhs =           f * Axy_rhs+ alpn1 *( trK * Axy  - TWO * fxy )+ &
-                     F2o3 * Ayy(i,j,k) * divb_loc
+                    Axx * betaxy                  +   Axz * betazy  + &
-    Azz_rhs(i,j,k) = chin_loc * Azz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Azz(i,j,k) - TWO * fzz(i,j,k)) + &
+                                     Ayy * betayx +   Ayz * betazx  + &
-                     TWO * (Axz(i,j,k) * betaxz(i,j,k) + Ayz(i,j,k) * betayz(i,j,k) + Azz(i,j,k) * betazz(i,j,k)) - &
+             F1o3 * Axy * div_beta                -   Axy * betazz
-                     F2o3 * Azz(i,j,k) * divb_loc
+
-    Axy_rhs(i,j,k) = chin_loc * Axy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axy(i,j,k) - TWO * fxy(i,j,k)) + &
+  Ayz_rhs =           f * Ayz_rhs+ alpn1 *( trK * Ayz  - TWO * fyz )+ &
-                     Axx(i,j,k) * betaxy(i,j,k) + Axz(i,j,k) * betazy(i,j,k) + Ayy(i,j,k) * betayx(i,j,k) + &
+                    Axy * betaxz +   Ayy * betayz                   + &
-                     Ayz(i,j,k) * betazx(i,j,k) + F1o3 * Axy(i,j,k) * divb_loc - Axy(i,j,k) * betazz(i,j,k)
+                    Axz * betaxy                  +   Azz * betazy  + &
-    Ayz_rhs(i,j,k) = chin_loc * Ayz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayz(i,j,k) - TWO * fyz(i,j,k)) + &
+             F1o3 * Ayz * div_beta                -   Ayz * betaxx
-                     Axy(i,j,k) * betaxz(i,j,k) + Ayy(i,j,k) * betayz(i,j,k) + Axz(i,j,k) * betaxy(i,j,k) + &
+ 
-                     Azz(i,j,k) * betazy(i,j,k) + F1o3 * Ayz(i,j,k) * divb_loc - Ayz(i,j,k) * betaxx(i,j,k)
+  Axz_rhs =           f * Axz_rhs+ alpn1 *( trK * Axz  - TWO * fxz )+ &
-    Axz_rhs(i,j,k) = chin_loc * Axz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axz(i,j,k) - TWO * fxz(i,j,k)) + &
+                    Axx * betaxz +   Axy * betayz                   + &
-                     Axx(i,j,k) * betaxz(i,j,k) + Axy(i,j,k) * betayz(i,j,k) + Ayz(i,j,k) * betayx(i,j,k) + &
+                                     Ayz * betayx +   Azz * betazx  + &
-                     Azz(i,j,k) * betazx(i,j,k) + F1o3 * Axz(i,j,k) * divb_loc - Axz(i,j,k) * betayy(i,j,k)
+             F1o3 * Axz * div_beta                -   Axz * betayy      !rhs for Aij
-
+
-    trK_rhs(i,j,k) = - trK_rhs(i,j,k) + alpn1(i,j,k) * ( F1o3 * trK(i,j,k) * trK(i,j,k) + &
+! Compute trace of S_ij
-                    gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
+
-                    TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + &
+  S =  f * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + &
-                    FOUR * PI * (rho(i,j,k) + S_loc) )
+     TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
-  enddo
+
-  enddo
+  trK_rhs = - trK_rhs + alpn1 *( F1o3 * trK * trK         + &
-  enddo
+                gupxx * fxx + gupyy * fyy + gupzz * fzz   + &
        TWO * ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + &
       FOUR * PI * ( rho + S ))                                !rhs for trK
 !!!! gauge variable part
@@ -1000,15 +948,15 @@
 !!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
 ! lopsided_kodis shares the symmetry_bd buffer between advection and
 ! dissipation, eliminating redundant full-grid copies. For metric variables
-! gxx/gyy/gzz (=dxx/dyy/dzz+1): stencil coefficients sum to zero,
+! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
-! so the constant offset has no effect on dissipation.
+! so the constant offset has no effect on dissipation.
-
+
-  call lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
-  call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
+  call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
-  call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
+  call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
-  call lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
-  call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
+  call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
-  call lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
  call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
  call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
--- a/AMSS_NCKU_source/bssn_rhs.h
+++ b/AMSS_NCKU_source/bssn_rhs.h
@@ -22,32 +22,19 @@
 #define f_compute_rhs_Z4c_ss COMPUTE_RHS_Z4C_SS
 #define f_compute_constraint_fr COMPUTE_CONSTRAINT_FR
 #endif
-#ifdef fortran3
+#ifdef fortran3
-#define f_compute_rhs_bssn compute_rhs_bssn_
+#define f_compute_rhs_bssn compute_rhs_bssn_
 #define f_compute_rhs_bssn_ss compute_rhs_bssn_ss_
 #define f_compute_rhs_bssn_escalar compute_rhs_bssn_escalar_
 #define f_compute_rhs_bssn_escalar_ss compute_rhs_bssn_escalar_ss_
 #define f_compute_rhs_Z4c compute_rhs_z4c_
 #define f_compute_rhs_Z4cnot compute_rhs_z4cnot_
 #define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_
-#define f_compute_constraint_fr compute_constraint_fr_
+#define f_compute_constraint_fr compute_constraint_fr_
-#endif
+#endif
-
+extern "C"
-#ifdef __cplusplus
+{
-extern "C"
+        int f_compute_rhs_bssn(int *, double &, double *, double *, double *,                                                      // ex,T,X,Y,Z
 {
 #endif
        void f_bssn_rhs_kernel_timing_reset();
        int f_bssn_rhs_kernel_timing_bucket_count();
        const double *f_bssn_rhs_kernel_timing_local_seconds();
        const char *f_bssn_rhs_kernel_timing_label(int);
 #ifdef __cplusplus
 }
 #endif
 extern "C"
 {
        int f_compute_rhs_bssn(int *, double &, double *, double *, double *,                                                      // ex,T,X,Y,Z
                               double *, double *,                                                                                 // chi, trK
                               double *, double *, double *, double *, double *, double *,                                         // gij
                               double *, double *, double *, double *, double *, double *,                                         // Aij
@@ -63,34 +50,13 @@ extern "C"
                               double *, double *, double *, double *, double *, double *,                                         // Christoffel
                               double *, double *, double *, double *, double *, double *,                                         // Christoffel
                               double *, double *, double *, double *, double *, double *,                                         // Ricci
-                               double *, double *, double *, double *, double *, double *, double *,                               // constraint violation
+                               double *, double *, double *, double *, double *, double *, double *,                               // constraint violation
-                               int &, int &, double &, int &);
+                               int &, int &, double &, int &);
-}
+}
-
+
-int f_compute_rhs_bssn_escalar_c(int *, double &, double *, double *, double *,                                                      // ex,T,X,Y,Z
+extern "C"
-                                 double *, double *,                                                                                 // chi, trK
+{
-                                 double *, double *, double *, double *, double *, double *,                                         // gij
+        int f_compute_rhs_bssn_ss(int *, double &, double *, double *, double *,                                                      // ex,T,rho,sigma,R
                                 double *, double *, double *, double *, double *, double *,                                         // Aij
                                 double *, double *, double *,                                                                       // Gam
                                 double *, double *, double *, double *, double *, double *, double *,                               // Gauge
                                 double *, double *,                                                                                 // Sphi, Spi
                                 double *, double *,                                                                                 // chi, trK
                                 double *, double *, double *, double *, double *, double *,                                         // gij
                                 double *, double *, double *, double *, double *, double *,                                         // Aij
                                 double *, double *, double *,                                                                       // Gam
                                 double *, double *, double *, double *, double *, double *, double *,                               // Gauge
                                 double *, double *,                                                                                 // Sphi, Spi
                                 double *, double *, double *, double *, double *, double *, double *, double *, double *, double *, // stress-energy
                                 double *, double *, double *, double *, double *, double *,                                         // Christoffel
                                 double *, double *, double *, double *, double *, double *,                                         // Christoffel
                                 double *, double *, double *, double *, double *, double *,                                         // Christoffel
                                 double *, double *, double *, double *, double *, double *,                                         // Ricci
                                 double *, double *, double *, double *, double *, double *, double *,                               // constraint violation
                                 int &, int &, double &, int &);
 extern "C"
 {
        int f_compute_rhs_bssn_ss(int *, double &, double *, double *, double *,                                                      // ex,T,rho,sigma,R
                                  double *, double *, double *,                                                                       // X,Y,Z
                                  double *, double *, double *,                                                                       // drhodx,drhody,drhodz
                                  double *, double *, double *,                                                                       // dsigmadx,dsigmady,dsigmadz
@@ -117,10 +83,10 @@ extern "C"
                                  int &, int &, double &, int &, int &);
 }
-extern "C"
+extern "C"
-{
+{
-        int f_compute_rhs_bssn_escalar(int *, double &, double *, double *, double *,                                                      // ex,T,X,Y,Z
+        int f_compute_rhs_bssn_escalar(int *, double &, double *, double *, double *,                                                      // ex,T,X,Y,Z
-                                       double *, double *,                                                                                 // chi, trK
+                                       double *, double *,                                                                                 // chi, trK
                                       double *, double *, double *, double *, double *, double *,                                         // gij
                                       double *, double *, double *, double *, double *, double *,                                         // Aij
                                       double *, double *, double *,                                                                       // Gam
@@ -137,14 +103,14 @@ extern "C"
                                       double *, double *, double *, double *, double *, double *,                                         // Christoffel
                                       double *, double *, double *, double *, double *, double *,                                         // Christoffel
                                       double *, double *, double *, double *, double *, double *,                                         // Ricci
-                                       double *, double *, double *, double *, double *, double *, double *,                               // constraint violation
+                                       double *, double *, double *, double *, double *, double *, double *,                               // constraint violation
-                                       int &, int &, double &, int &);
+                                       int &, int &, double &, int &);
-}
+}
-
+
-extern "C"
+extern "C"
-{
+{
-        int f_compute_rhs_bssn_escalar_ss(int *, double &, double *, double *, double *,                                                      // ex,T,rho,sigma,R
+        int f_compute_rhs_bssn_escalar_ss(int *, double &, double *, double *, double *,                                                      // ex,T,rho,sigma,R
-                                          double *, double *, double *,                                                                       // X,Y,Z
+                                          double *, double *, double *,                                                                       // X,Y,Z
                                          double *, double *, double *,                                                                       // drhodx,drhody,drhodz
                                          double *, double *, double *,                                                                       // dsigmadx,dsigmady,dsigmadz
                                          double *, double *, double *,                                                                       // dRdx,dRdy,dRdz
--- a/AMSS_NCKU_source/bssn_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_rhs_c.C
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -0,0 +1,36 @@
 #ifndef BSSN_RHS_CUDA_H
 #define BSSN_RHS_CUDA_H
 /*
  * Prototype of f_compute_rhs_bssn, declared with C linkage when this header
  * is compiled as C++.
  *
  * NOTE(review): several parameters are C++ references (double &T,
  * int &Symmetry, int &Lev, double &eps, int &co), so this prototype cannot be
  * parsed by a plain C compiler even though the linkage block is guarded by
  * __cplusplus. Confirm that the header is only ever included from C++/CUDA
  * translation units.
  *
  * Argument groups, in declaration order (grouping is by parameter name only;
  * which arrays are inputs and which are outputs is not visible from this
  * header - presumably the *_rhs, Gam***, R** and *_Res arrays are written by
  * the callee; verify against the implementation):
  *   ex, T                      - ex is passed as int*, T by reference
  *   X, Y, Z                    - coordinate arrays
  *   chi, trK                   - scalar fields
  *   dxx .. dzz                 - six metric-named components
  *   Axx .. Azz                 - six A-named components
  *   Gamx, Gamy, Gamz
  *   Lap, betax..betaz, dtSfx..dtSfz
  *   chi_rhs .. dtSfz_rhs       - one *_rhs array per field listed above
  *   rho, Sx..Sz, Sxx..Szz      - source-term-named arrays
  *   Gamxxx .. Gamzzz           - eighteen three-index Gam arrays
  *   Rxx .. Rzz                 - six R-named components
  *   ham_Res, mov*_Res, Gm*_Res - seven *_Res arrays
  *   Symmetry, Lev, eps, co     - scalar controls passed by reference
  * Returns int; the meaning of the return value is not shown here.
  */
 #ifdef __cplusplus
 extern "C" {
 #endif
 int f_compute_rhs_bssn(int *ex, double &T,
                        double *X, double *Y, double *Z,
                        double *chi, double *trK,
                        double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                        double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                        double *Gamx, double *Gamy, double *Gamz,
                        double *Lap, double *betax, double *betay, double *betaz,
                        double *dtSfx, double *dtSfy, double *dtSfz,
                        double *chi_rhs, double *trK_rhs,
                        double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                        double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                        double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                        double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                        double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                        double *rho, double *Sx, double *Sy, double *Sz,
                        double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                        double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                        double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                        double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                        double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                        double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
                        double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
                        int &Symmetry, int &Lev, double &eps, int &co);
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/AMSS_NCKU_source/diff_newwb.f90
+++ b/AMSS_NCKU_source/diff_newwb.f90
@@ -33,7 +33,7 @@
  real*8 :: dX,dY,dZ
  real*8,dimension(0:ex(1),0:ex(2),0:ex(3))   :: fh
  real*8, dimension(3) :: SoA
-  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
+  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: d2dx,d2dy,d2dz
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0, F60=6.d1
@@ -137,7 +137,7 @@
  real*8 :: dX
  real*8,dimension(0:ex(1),0:ex(2),0:ex(3))   :: fh
  real*8, dimension(3) :: SoA
-  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
+  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: d2dx
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0, F60=6.d1
@@ -1512,9 +1512,8 @@
  real*8 :: dX,dY,dZ
  real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3))   :: fh
  real*8, dimension(3) :: SoA
-  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
+  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
-  integer :: i_core_min,i_core_max,j_core_min,j_core_max,k_core_min,k_core_max
+  real*8  :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
  real*8  :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
  real*8  :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8, parameter :: ZEO=0.d0, ONE=1.d0, TWO=2.d0, F1o4=2.5d-1, F9=9.d0,  F45=4.5d1
@@ -1561,55 +1560,17 @@
  fxx = ZEO
  fyy = ZEO
-  fzz = ZEO
+  fzz = ZEO
-  fxy = ZEO
+  fxy = ZEO
-  fxz = ZEO
+  fxz = ZEO
-  fyz = ZEO
+  fyz = ZEO
-
+
-  i_core_min = max(1, imin+2)
+  do k=1,ex(3)
-  i_core_max = min(ex(1), imax-2)
+  do j=1,ex(2)
-  j_core_min = max(1, jmin+2)
+  do i=1,ex(1)
-  j_core_max = min(ex(2), jmax-2)
+!~~~~~~ fxx
-  k_core_min = max(1, kmin+2)
+        if(i+2 <= imax .and. i-2 >= imin)then
-  k_core_max = min(ex(3), kmax-2)
+!
  if(i_core_min <= i_core_max .and. j_core_min <= j_core_max .and. k_core_min <= k_core_max)then
   do k=k_core_min,k_core_max
   do j=j_core_min,j_core_max
   do i=i_core_min,i_core_max
 ! interior points always use 4th-order stencils without branch checks
      fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
                          -fh(i+2,j,k)+F16*fh(i+1,j,k)              )
      fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
                          -fh(i,j+2,k)+F16*fh(i,j+1,k)              )
      fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
                          -fh(i,j,k+2)+F16*fh(i,j,k+1)              )
      fxy(i,j,k) = Fdxdy*(     (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k))  &
                          -F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k))  &
                          +F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k))  &
                          -    (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
      fxz(i,j,k) = Fdxdz*(     (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2))  &
                          -F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1))  &
                          +F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1))  &
                          -    (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
      fyz(i,j,k) = Fdydz*(     (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2))  &
                          -F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1))  &
                          +F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1))  &
                          -    (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
   enddo
   enddo
   enddo
  endif
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
      if(i>=i_core_min .and. i<=i_core_max .and. &
         j>=j_core_min .and. j<=j_core_max .and. &
         k>=k_core_min .and. k<=k_core_max) cycle
 !~~~~~~ fxx
        if(i+2 <= imax .and. i-2 >= imin)then
 !
 !               - f(i-2) + 16 f(i-1) - 30 f(i) + 16 f(i+1) - f(i+2)
 !  fxx(i) = ----------------------------------------------------------
 !                                  12 dx^2 
--- a/AMSS_NCKU_source/fderivs_c.C
+++ b/AMSS_NCKU_source/fderivs_c.C
@@ -81,63 +81,26 @@ void fderivs(const int ex[3],
    }
    /*
-     * 两段式：
+     * Fortran loops:
-     * 1) 先在二阶可用区域计算二阶模板
+     * do k=1,ex3-1
-     * 2) 再在高阶可用区域覆盖为四阶模板
+     * do j=1,ex2-1
     * do i=1,ex1-1
     *
-     * 与原 if/elseif 逻辑等价，但减少逐点分支判断。
+     * C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
     */
-    const int i2_lo = (iminF > 0) ? iminF : 0;
+    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
-    const int j2_lo = (jminF > 0) ? jminF : 0;
+        const int kF = k0 + 1;
-    const int k2_lo = (kminF > 0) ? kminF : 0;
+        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
-    const int i2_hi = ex1 - 2;
+            const int jF = j0 + 1;
-    const int j2_hi = ex2 - 2;
+            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
-    const int k2_hi = ex3 - 2;
+                const int iF = i0 + 1;
-
+                const size_t p = idx_ex(i0, j0, k0, ex);
    const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
    const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
    const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
    const int i4_hi = ex1 - 3;
    const int j4_hi = ex2 - 3;
    const int k4_hi = ex3 - 3;
    if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
        for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);
                    fx[p] = d2dx * (
                        -fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
                         fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fy[p] = d2dy * (
                        -fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
                         fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fz[p] = d2dz * (
                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
                         fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                }
            }
        }
    }
    if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
        for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);
                // if(i+2 <= imax .and. i-2 >= imin ... )  (全是 Fortran 索引)
                if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
                    (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
                    (kF + 2) <= kmaxF && (kF - 2) >= kminF)
                {
                    fx[p] = d12dx * (
                        fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] -
                        EIT * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
@@ -159,9 +122,29 @@ void fderivs(const int ex[3],
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)]
                    );
                }
                // elseif(i+1 <= imax .and. i-1 >= imin ...)
                else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
                         (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
                         (kF + 1) <= kmaxF && (kF - 1) >= kminF)
                {
                    fx[p] = d2dx * (
                        -fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
                         fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fy[p] = d2dy * (
                        -fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
                         fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fz[p] = d2dz * (
                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
                         fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                }
            }
        }
    }
    // free(fh);
-}
+}
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -1327,6 +1327,35 @@ end subroutine d2dump
  return
  end subroutine polint
  subroutine polint0(xa, ya, y, ordn)
  ! Lagrange interpolation of the samples (xa(j), ya(j)), j = 1..ordn,
  ! evaluated at the fixed abscissa x = 0.
  !
  !   y = sum_j ya(j) * prod_{k /= j} xa(k) / (xa(k) - xa(j))
  !
  ! Arguments:
  !   xa   (in)  interpolation nodes, length ordn
  !   ya   (in)  function values at the nodes, length ordn
  !   y    (out) interpolated value at x = 0
  !   ordn (in)  number of nodes
  !
  ! Cost is O(ordn**2): the basis weight of every node is rebuilt with an
  ! inner loop over all nodes (the direct formula, no recurrence tableau).
  ! There is no guard against coincident nodes: xa(k) == xa(j) for k /= j
  ! makes the denominator below zero.
  implicit none
  integer, intent(in) :: ordn
  real*8, dimension(ordn), intent(in) :: xa, ya
  real*8, intent(out) :: y
  integer :: j, k
  real*8 :: wj
  y = 0.d0
  do j = 1, ordn
    ! wj accumulates the Lagrange basis polynomial L_j evaluated at x = 0
    wj = 1.d0
    do k = 1, ordn
      if (k .ne. j) then
        ! (0 - xa(k)) / (xa(j) - xa(k)) written with both signs flipped
        wj = wj * xa(k) / (xa(k) - xa(j))
      endif
    enddo
    y = y + wj * ya(j)
  enddo
  return
  end subroutine polint0
 !------------------------------------------------------------------------------
 !
 ! interpolation in 2 dimensions, follow yx order
 !
 !------------------------------------------------------------------------------
 !------------------------------------------------------------------------------
 ! Compute Lagrange interpolation basis weights for one target point.
 !------------------------------------------------------------------------------
@@ -1511,88 +1540,13 @@ deallocate(f_flat)
 f_out = f_out*dX*dY*dZ
-  return
+  return
-
+
-  end subroutine l2normhelper
+  end subroutine l2normhelper
-!--------------------------------------------------------------------------------------
+!--------------------------------------------------------------------------------------
-  subroutine l2normhelper7(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
+! calculate L2norm especially for shell Blocks
-                           f1,f2,f3,f4,f5,f6,f7,f_out,gw)
+  subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
-
+                          f,f_out,gw,ogw,Symmetry)
  implicit none
 !~~~~~~> Input parameters:
  integer,intent(in ):: ex(1:3)
  real*8, intent(in ):: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3)),xmin,ymin,zmin,xmax,ymax,zmax
  integer,intent(in)::gw
  real*8, dimension(ex(1),ex(2),ex(3)),intent(in) :: f1,f2,f3,f4,f5,f6,f7
  real*8, intent(out) :: f_out(7)
 !~~~~~~> Other variables:
  real*8            :: dX, dY, dZ
  integer::imin,jmin,kmin
  integer::imax,jmax,kmax
  integer::i,j,k
  real*8 :: s1,s2,s3,s4,s5,s6,s7
  dX = X(2) - X(1)
  dY = Y(2) - Y(1)
  dZ = Z(2) - Z(1)
 ! for ghost zone
   imin = gw+1
   jmin = gw+1
   kmin = gw+1
   imax = ex(1) - gw
   jmax = ex(2) - gw
   kmax = ex(3) - gw
 !for patch boundary (i.e., not ghost boundary)
 if(dabs(X(ex(1))-xmax) < dX) imax = ex(1)
 if(dabs(Y(ex(2))-ymax) < dY) jmax = ex(2)
 if(dabs(Z(ex(3))-zmax) < dZ) kmax = ex(3)
 if(dabs(X(1)-xmin) < dX) imin = 1
 if(dabs(Y(1)-ymin) < dY) jmin = 1
 if(dabs(Z(1)-zmin) < dZ) kmin = 1
  s1 = 0.d0
  s2 = 0.d0
  s3 = 0.d0
  s4 = 0.d0
  s5 = 0.d0
  s6 = 0.d0
  s7 = 0.d0
  do k=kmin,kmax
    do j=jmin,jmax
 !DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7)
      do i=imin,imax
        s1 = s1 + f1(i,j,k)*f1(i,j,k)
        s2 = s2 + f2(i,j,k)*f2(i,j,k)
        s3 = s3 + f3(i,j,k)*f3(i,j,k)
        s4 = s4 + f4(i,j,k)*f4(i,j,k)
        s5 = s5 + f5(i,j,k)*f5(i,j,k)
        s6 = s6 + f6(i,j,k)*f6(i,j,k)
        s7 = s7 + f7(i,j,k)*f7(i,j,k)
      enddo
    enddo
  enddo
  f_out(1) = s1*dX*dY*dZ
  f_out(2) = s2*dX*dY*dZ
  f_out(3) = s3*dX*dY*dZ
  f_out(4) = s4*dX*dY*dZ
  f_out(5) = s5*dX*dY*dZ
  f_out(6) = s6*dX*dY*dZ
  f_out(7) = s7*dX*dY*dZ
  return
  end subroutine l2normhelper7
 !--------------------------------------------------------------------------------------
 ! calculate L2norm especially for shell Blocks
  subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
                          f,f_out,gw,ogw,Symmetry)
  implicit none
 !~~~~~~> Input parameters:
--- a/AMSS_NCKU_source/fmisc.h
+++ b/AMSS_NCKU_source/fmisc.h
@@ -12,10 +12,9 @@
 #define f_global_interpind global_interpind
 #define f_global_interpind2d global_interpind2d
 #define f_global_interpind1d global_interpind1d
-#define f_l2normhelper l2normhelper
+#define f_l2normhelper l2normhelper
-#define f_l2normhelper7 l2normhelper7
+#define f_l2normhelper_sh l2normhelper_sh
-#define f_l2normhelper_sh l2normhelper_sh
+#define f_l2normhelper_sh_rms l2normhelper_sh_rms
 #define f_l2normhelper_sh_rms l2normhelper_sh_rms
 #define f_average average
 #define f_average3 average3
 #define f_average2 average2
@@ -42,10 +41,9 @@
 #define f_global_interpind GLOBAL_INTERPIND
 #define f_global_interpind2d GLOBAL_INTERPIND2D
 #define f_global_interpind1d GLOBAL_INTERPIND1D
-#define f_l2normhelper L2NORMHELPER
+#define f_l2normhelper L2NORMHELPER
-#define f_l2normhelper7 L2NORMHELPER7
+#define f_l2normhelper_sh L2NORMHELPER_SH
-#define f_l2normhelper_sh L2NORMHELPER_SH
+#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
 #define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
 #define f_average AVERAGE
 #define f_average3 AVERAGE3
 #define f_average2 AVERAGE2
@@ -72,10 +70,9 @@
 #define f_global_interpind global_interpind_
 #define f_global_interpind2d global_interpind2d_
 #define f_global_interpind1d global_interpind1d_
-#define f_l2normhelper l2normhelper_
+#define f_l2normhelper l2normhelper_
-#define f_l2normhelper7 l2normhelper7_
+#define f_l2normhelper_sh l2normhelper_sh_
-#define f_l2normhelper_sh l2normhelper_sh_
+#define f_l2normhelper_sh_rms l2normhelper_sh_rms_
 #define f_l2normhelper_sh_rms l2normhelper_sh_rms_
 #define f_average average_
 #define f_average3 average3_
 #define f_average2 average2_
@@ -159,29 +156,20 @@ extern "C"
 							  int *, double *, int &, int &);
 }
-extern "C"
+extern "C"
-{
+{
-	void f_l2normhelper(int *, double *, double *, double *,
+	void f_l2normhelper(int *, double *, double *, double *,
-						double &, double &, double &,
+						double &, double &, double &,
-						double &, double &, double &,
+						double &, double &, double &,
-						double *, double &, int &);
+						double *, double &, int &);
-}
+}
-
+
-extern "C"
+extern "C"
-{
+{
-	void f_l2normhelper7(int *, double *, double *, double *,
+	void f_l2normhelper_sh(int *, double *, double *, double *,
-						 double &, double &, double &,
+						   double &, double &, double &,
-						 double &, double &, double &,
+						   double &, double &, double &,
-						 double *, double *, double *, double *,
+						   double *, double &, int &, int &, int &);
 						 double *, double *, double *, double *, int &);
 }
 extern "C"
 {
 	void f_l2normhelper_sh(int *, double *, double *, double *,
 						   double &, double &, double &,
 						   double &, double &, double &,
 						   double *, double &, int &, int &, int &);
 }
 extern "C"
--- a/AMSS_NCKU_source/interp_lb_profile_data.h
+++ b/AMSS_NCKU_source/interp_lb_profile_data.h
@@ -1,5 +1,3 @@
 /* 本头文件由自订profile框架自动生成并非人工硬编码针对Case优化 */
 /* 更新：负载均衡问题已经通过优化插值函数解决，此profile静态均衡方案已弃用，本头文件现在未参与编译 */
 /* Auto-generated from interp_lb_profile.bin — do not edit */
 #ifndef INTERP_LB_PROFILE_DATA_H
 #define INTERP_LB_PROFILE_DATA_H
--- a/AMSS_NCKU_source/kodiss_c.C
+++ b/AMSS_NCKU_source/kodiss_c.C
@@ -63,28 +63,19 @@ void kodis(const int ex[3],
     * C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
     * 并定义 Fortran index: iF=i0+1, ...
     */
-    // 收紧循环范围：只遍历满足 iF±3/jF±3/kF±3 条件的内部点
+    for (int k0 = 0; k0 < ex3; ++k0) {
    // iF-3 >= iminF => iF >= iminF+3 => i0 >= iminF+2 (因为 iF=i0+1)
    // iF+3 <= imaxF => iF <= imaxF-3 => i0 <= imaxF-4
    const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
    const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
    const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
    const int i0_hi = imaxF - 4;  // inclusive
    const int j0_hi = jmaxF - 4;
    const int k0_hi = kmaxF - 4;
    if (i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi) {
        free(fh);
        return;
    }
    for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
        const int kF = k0 + 1;
-        for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
+        for (int j0 = 0; j0 < ex2; ++j0) {
            const int jF = j0 + 1;
-            for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
+            for (int i0 = 0; i0 < ex1; ++i0) {
                const int iF = i0 + 1;
                // Fortran if 条件：
                // i-3 >= imin .and. i+3 <= imax  等（都是 Fortran 索引）
                if ((iF - 3) >= iminF && (iF + 3) <= imaxF &&
                    (jF - 3) >= jminF && (jF + 3) <= jmaxF &&
                    (kF - 3) >= kminF && (kF + 3) <= kmaxF)
                {
                    const size_t p = idx_ex(i0, j0, k0, ex);
                    // 三个方向各一份同型的 7 点组合（实际上是对称的 6th-order dissipation/filter 核）
@@ -109,6 +100,7 @@ void kodis(const int ex[3],
                    // Fortran:
                    // f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
                    f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
                }
            }
        }
    }
--- a/AMSS_NCKU_source/lopsided_kodis_c.C
+++ b/AMSS_NCKU_source/lopsided_kodis_c.C
@@ -1,248 +0,0 @@
 #include "tool.h"
 /*
 * Combined advection (lopsided) + KO dissipation (kodis).
 * Uses one shared symmetry_bd buffer per call.
 *
 * Both contributions are accumulated into f_rhs (+=/-=); f itself is only
 * read. The two passes share a single scratch copy fh of f that is indexed
 * with Fortran-style indices starting at -2 in every direction.
 *
 * Parameters:
 *   ex            grid extents (ex[0], ex[1], ex[2])
 *   X, Y, Z       coordinate arrays; the spacing is taken from the first two
 *                 entries of each (assumes uniform spacing and at least two
 *                 points per axis - TODO confirm with callers)
 *   f             field being differentiated
 *   f_rhs         accumulator updated in place
 *   Sfx, Sfy, Sfz per-point advection speeds; their sign selects the
 *                 stencil direction
 *   Symmetry      compared against NO_SYMM (0) and EQ_SYMM (1) to decide
 *                 which lower faces may use indices below 1
 *   SoA           forwarded unchanged to symmetry_bd
 *   eps           dissipation strength; the KO pass is skipped unless eps > 0
 *
 * If the scratch buffer cannot be allocated the function returns without
 * modifying f_rhs and without reporting an error.
 */
 void lopsided_kodis(const int ex[3],
                    const double *X, const double *Y, const double *Z,
                    const double *f, double *f_rhs,
                    const double *Sfx, const double *Sfy, const double *Sfz,
                    int Symmetry, const double SoA[3], double eps)
 {
    const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
    const double F6 = 6.0, F18 = 18.0;
    const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
    const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
    const double cof = 64.0; // 2^6
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // Grid spacing from the first two coordinates of each axis.
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    // Common 1/(12 h) prefactor of all advection stencils below.
    const double d12dx = ONE / F12 / dX;
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;
    // Upper/lower admissible Fortran indices for stencil points.
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
    int iminF = 1, jminF = 1, kminF = 1;
    // When a symmetry is active and the first coordinate lies within one cell
    // of zero, stencils may reach down to index -2 on that face.
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
    // fh for Fortran-style domain (-2:ex1,-2:ex2,-2:ex3)
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;
    double *fh = (double*)malloc(fh_size * sizeof(double));
    // Allocation failure: bail out silently, f_rhs is left untouched.
    if (!fh) return;
    // Fill fh from f; presumably this also populates the index <= 0 layers
    // according to SoA - confirm in symmetry_bd.
    symmetry_bd(3, ex, f, fh, SoA);
    // Advection (same stencil logic as lopsided_c.C)
    // Loops stop at ex-2 (0-based), so the last point along each axis is not
    // updated by the advection pass.
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                // x direction: for sfx > 0 prefer the stencil biased towards
                // +x, falling back to centered and then to the mirrored
                // stencil as the upper edge gets closer.
                const double sfx = Sfx[p];
                if (sfx > ZEO) {
                    if (i0 <= ex1 - 4) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    } else if (i0 <= ex1 - 3) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    } else if (i0 <= ex1 - 2) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                } else if (sfx < ZEO) {
                    // sfx < 0: mirrored choice, limited by the lower bound
                    // iminF. Note i0 - 2 == iF - 3, i0 - 1 == iF - 2 and
                    // i0 == iF - 1.
                    if ((i0 - 2) >= iminF) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    } else if ((i0 - 1) >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    } else if (i0 >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
                }
                // y direction: same selection logic with j/Sfy/d12dy.
                const double sfy = Sfy[p];
                if (sfy > ZEO) {
                    if (j0 <= ex2 - 4) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    } else if (j0 <= ex2 - 3) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 <= ex2 - 2) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    }
                } else if (sfy < ZEO) {
                    if ((j0 - 2) >= jminF) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    } else if ((j0 - 1) >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    }
                }
                // z direction: same selection logic with k/Sfz/d12dz.
                const double sfz = Sfz[p];
                if (sfz > ZEO) {
                    if (k0 <= ex3 - 4) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    } else if (k0 <= ex3 - 3) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 <= ex3 - 2) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    }
                } else if (sfz < ZEO) {
                    if ((k0 - 2) >= kminF) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    } else if ((k0 - 1) >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    }
                }
            }
        }
    }
    // KO dissipation (same domain restriction as kodiss_c.C)
    if (eps > ZEO) {
        // Restrict to points whose +-3 neighbours stay inside [min, max] in
        // all three directions: 0-based lower bound is minF + 2 (clamped at
        // 0), 0-based upper bound is maxF - 4.
        const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
        const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
        const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
        const int i0_hi = imaxF - 4; // inclusive
        const int j0_hi = jmaxF - 4;
        const int k0_hi = kmaxF - 4;
        if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
            for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        // Symmetric 7-point combination (1, -6, 15, -20, 15,
                        // -6, 1) per direction, each divided by its spacing.
                        const double Dx_term =
                            ((fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
                             SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
                             FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
                             TWT *  fh[idx_fh_F(iF,     jF, kF, ex)]) / dX;
                        const double Dy_term =
                            ((fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
                             SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
                             FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
                             TWT *  fh[idx_fh_F(iF, jF,     kF, ex)]) / dY;
                        const double Dz_term =
                            ((fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
                             SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
                             FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
                             TWT *  fh[idx_fh_F(iF, jF, kF,     ex)]) / dZ;
                        f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
                    }
                }
            }
        }
    }
    free(fh);
 }
--- a/AMSS_NCKU_source/macrodef.h
+++ b/AMSS_NCKU_source/macrodef.h
@@ -29,16 +29,6 @@
 #define REGLEV 0
 #define BSSN_FINE_TIMING 0
 #define BSSN_FINE_TIMING_EVERY 1
 #define BSSN_FINE_TIMING_TOPN 8
 #define BSSN_KERNEL_FINE_TIMING 0
 #define BSSN_ENABLE_STDIN_ABORT_POLL 0
 //#define USE_GPU
 //#define CHECKDETAIL
@@ -98,21 +88,6 @@
 //     0: for every level;
 //     1: for all
 //
 // define BSSN_FINE_TIMING
 //     enable fine-grained per-timestep timing monitor
 //
 // define BSSN_FINE_TIMING_EVERY
 //     report timing every N coarse timesteps
 //
 // define BSSN_FINE_TIMING_TOPN
 //     number of hottest timing buckets shown in stdout
 //
 // define BSSN_KERNEL_FINE_TIMING
 //     enable split timing inside compute_rhs_bssn
 //
 // define BSSN_ENABLE_STDIN_ABORT_POLL
 //     poll stdin and broadcast abort flag every coarse step
 //
 // define USE_GPU
 //     use gpu or not
 //
@@ -167,3 +142,4 @@
 #define TINY 1e-10
 #endif   /* MICRODEF_H */
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -1,69 +1,33 @@
-include makefile.inc
+include makefile.inc
-
+
-include AMSS_NCKU_build.mk
+## polint(ordn=6) kernel selector:
-
+##   1 (default): barycentric fast path
-ABE_TYPE ?= $(shell awk '/^[[:space:]]*\#define[[:space:]]+ABEtype/ {print $$3; exit}' macrodef.h 2>/dev/null)
+##   0          : fallback to Neville path
-
+POLINT6_USE_BARY ?= 1
-ifeq ($(USE_TRANSFER_CACHE),auto)
+POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
-ifeq ($(ABE_TYPE),0)
+
-EFFECTIVE_USE_TRANSFER_CACHE = 1
+## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
-else
+##   make                        -> opt  (PGO-guided, maximum performance)
-EFFECTIVE_USE_TRANSFER_CACHE = 0
+##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)
-endif
+PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
 else
 EFFECTIVE_USE_TRANSFER_CACHE = $(USE_TRANSFER_CACHE)
 endif
 ifeq ($(USE_CXX_ESCALAR_KERNEL),1)
 ifeq ($(ABE_TYPE),1)
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 1
 else
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
 endif
 else
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
 endif
 ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
 ifeq ($(USE_CXX_KERNELS),0)
 $(error USE_CXX_ESCALAR_KERNEL=1 requires USE_CXX_KERNELS=1 because bssn_escalar_rhs_c.C reuses the C BSSN kernel)
 endif
 endif
 ## polint(ordn=6) kernel selector:
 ##   1 (default): barycentric fast path
 ##   0          : fallback to Neville path
 POLINT6_USE_BARY ?= 1
 POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
 TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
 ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
 ## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
 ##   make                        -> opt  (PGO-guided, maximum performance)
 ##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)
 PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
 ifeq ($(PGO_MODE),instrument)
 ## Phase 1: instrumentation — omit -fp-model fast=2 for numerical stability (note: -ipo is still passed in the flags below)
-CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
+CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
+              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
-              $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG)
+f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
+              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
-              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
+else
-else
+## opt (default): maximum performance with PGO profile data
-## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
+CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-## PGO has been turned off, now tested and found to be negative optimization
+              -fprofile-instr-use=$(PROFDATA) \
-## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
+              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
-
+f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-
+              -fprofile-instr-use=$(PROFDATA) \
-CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
+              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
-              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
+endif
              $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG)
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 endif
 .SUFFIXES: .o .f90 .C .for .cu
@@ -79,12 +43,16 @@ endif
 .cu.o:
 	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
 # CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
 bssn_rhs_cuda.o: bssn_rhs_cuda.cu macrodef.h
 	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
 # C rewrite of BSSN RHS kernel and helpers
-bssn_rhs_c.o: bssn_rhs_c.C
+bssn_rhs_c.o: bssn_rhs_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-
+
-fderivs_c.o: fderivs_c.C
+fderivs_c.o: fderivs_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 fdderivs_c.o: fdderivs_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
@@ -92,14 +60,11 @@ fdderivs_c.o: fdderivs_c.C
 kodiss_c.o: kodiss_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-lopsided_c.o: lopsided_c.C
+lopsided_c.o: lopsided_c.C
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-
+
-lopsided_kodis_c.o: lopsided_kodis_c.C
+interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
 #	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
 TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
@@ -116,24 +81,25 @@ TwoPunctureABE.o: TwoPunctureABE.C
 # Input files
 ## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
-ifeq ($(USE_CXX_KERNELS),0)
+ifeq ($(USE_CXX_KERNELS),0)
-# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
+# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
-CFILES =
+CFILES =
-else
+else
-# C++ mode (default): C rewrite of bssn/bssn-escalar rhs and helper kernels
+# C++ mode (default): C rewrite of bssn_rhs and helper kernels
-CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
+CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o
-ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
+endif
-CFILES += bssn_escalar_rhs_c.o
+
-endif
+# CUDA rewrite: bssn_rhs_cuda.o replaces all CFILES (stencils are built-in)
-endif
+CFILES_CUDA = bssn_rhs_cuda.o
-
+
-## RK4 kernel switch (independent from USE_CXX_KERNELS)
+## RK4 kernel switch (independent from USE_CXX_KERNELS)
-ifeq ($(USE_CXX_RK4),1)
+ifeq ($(USE_CXX_RK4),1)
-CFILES += rungekutta4_rout_c.o
+CFILES += rungekutta4_rout_c.o
-RK4_F90_OBJ =
+CFILES_CUDA += rungekutta4_rout_c.o
-else
+RK4_F90_OBJ =
-RK4_F90_OBJ = rungekutta4_rout.o
+else
-endif
+RK4_F90_OBJ = rungekutta4_rout.o
 endif
 C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
@@ -150,12 +116,12 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
 	   NullShellPatch2_Evo.o \
 	   bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
-F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
+F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
-	   prolongrestrict_cell.o prolongrestrict_vertex.o\
+	   prolongrestrict_cell.o prolongrestrict_vertex.o\
-	   $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
+	   $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
-	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
+	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
-	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
+	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
-           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
+           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
           fadmquantites_bssn.o Z4c_rhs.o Z4c_rhs_ss.o point_diff_new_sh.o\
 	   cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\
 	   getnpem2.o empart.o NullNews.o fourdcurvature.o\
@@ -215,9 +181,12 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
 misc.o : zbesh.o
 # projects
-ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
+ABE: $(C++FILES) $(CFILES_CUDA) $(F90FILES) $(F77FILES) $(AHFDOBJS)
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
+	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES_CUDA) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
-	
+
 ABE_CUDA: $(C++FILES) $(CFILES_CUDA) $(F90FILES) $(F77FILES) $(AHFDOBJS)
 	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES_CUDA) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
 ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
 	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
@@ -225,4 +194,4 @@ TwoPunctureABE: $(TwoPunctureFILES)
 	$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
 clean:
-	rm *.o ABE ABEGPU TwoPunctureABE make.log -f
+	rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -48,18 +48,6 @@ endif
 ##   0           : fall back to original Fortran kernels
 USE_CXX_KERNELS ?= 1
 ## BSSN-EScalar RHS switch
 ##   1 (default) : use BSSN-EScalar C wrapper on the normal patch path
 ##   0           : keep the original Fortran BSSN-EScalar RHS for precision-safe runs
 ## Note: this requires USE_CXX_KERNELS=1 because the wrapper reuses the C BSSN kernel.
 USE_CXX_ESCALAR_KERNEL ?= 1
 ## Cached transfer switch
 ##   auto (default): enable for BSSN vacuum, keep other paths on the safe uncached path
 ##   1             : force cached Sync/Restrict/OutBd transfer on evolution hot paths
 ##   0             : force the original uncached transfer path
 USE_TRANSFER_CACHE ?= auto
 ## RK4 kernel implementation switch
 ##   1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
 ##   0           : use original Fortran rungekutta4_rout.o
@@ -74,4 +62,4 @@ CLINKER      = mpiicpx
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
 #CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
-CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
+CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=sm_80
--- a/AMSS_NCKU_source/prolongrestrict_cell.f90
+++ b/AMSS_NCKU_source/prolongrestrict_cell.f90
@@ -217,7 +217,6 @@
  real*8,dimension(2*ghost_width) :: X,Y,Z
  real*8, dimension(2*ghost_width,2*ghost_width) :: tmp2
  real*8, dimension(2*ghost_width) :: tmp1
  real*8 :: ddy
  real*8,dimension(3) :: ccp
 #if (ghost_width == 2)
@@ -580,7 +579,7 @@
            tmp1(ghost_width-cxI(1)+cxB(1)  :ghost_width-cxI(1)+cxT(1)  ) = funf(cxB(1):cxT(1),j,k)
         endif
-         call polint(X,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
+         call polint0(X,tmp1,funf(i,j,k),2*ghost_width)
 ! for y direction
       elseif(sum(fg).eq.2.and.fg(2) .eq. 0.and. &
@@ -690,7 +689,7 @@
            tmp1(ghost_width-cxI(2)+cxB(2)  :ghost_width-cxI(2)+cxT(2)  ) = funf(i,cxB(2):cxT(2),k)
         endif
-         call polint(Y,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
+         call polint0(Y,tmp1,funf(i,j,k),2*ghost_width)
 ! for z direction
       elseif(sum(fg).eq.2.and.fg(3) .eq. 0.and. &
@@ -802,7 +801,7 @@
            tmp1(ghost_width-cxI(3)+cxB(3)  :ghost_width-cxI(3)+cxT(3)  ) = funf(i,j,cxB(3):cxT(3))
         endif
-         call polint(Z,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
+         call polint0(Z,tmp1,funf(i,j,k),2*ghost_width)
 #else
--- a/AMSS_NCKU_source/prolongrestrict_vertex.f90
+++ b/AMSS_NCKU_source/prolongrestrict_vertex.f90
@@ -217,7 +217,6 @@
  real*8,dimension(2*ghost_width) :: X,Y,Z
  real*8, dimension(2*ghost_width,2*ghost_width) :: tmp2
  real*8, dimension(2*ghost_width) :: tmp1
  real*8 :: ddy
 #if (ghost_width == 2)
  real*8, parameter :: C1=-1.d0/16,C2=9.d0/16
@@ -470,7 +469,7 @@
         tmp1(cxB(1)+ghost_width-i+1:cxT(1)+ghost_width-i+1) = fh(cxB(1):cxT(1),j,k)
-         call polint(X,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
+         call polint0(X,tmp1,funf(i,j,k),2*ghost_width)
 ! for y direction
       elseif (fg(2) .eq. 0)then
@@ -529,7 +528,7 @@
         tmp1(cxB(2)+ghost_width-j+1:cxT(2)+ghost_width-j+1) = fh(i,cxB(2):cxT(2),k)
-         call polint(Y,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
+         call polint0(Y,tmp1,funf(i,j,k),2*ghost_width)
 ! for z direction
       else
@@ -588,7 +587,7 @@
         tmp1(cxB(3)+ghost_width-k+1:cxT(3)+ghost_width-k+1) = fh(i,j,cxB(3):cxT(3))
-         call polint(Z,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
+         call polint0(Z,tmp1,funf(i,j,k),2*ghost_width)
       endif
--- a/AMSS_NCKU_source/rungekutta4_rout_c.C
+++ b/AMSS_NCKU_source/rungekutta4_rout_c.C
@@ -2,7 +2,6 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstddef>
 #include <complex>
 #include <immintrin.h>
 namespace {
@@ -118,62 +117,6 @@ inline void rk4_stage3(std::size_t n,
 extern "C" {
 // Apply one stage of the four-stage Runge-Kutta update to a single scalar.
 //   dT    : time step
 //   f0    : value at the start of the step (only read here)
 //   f1    : overwritten with the value produced by this stage
 //   f_rhs : right-hand side; stages 1 and 2 add 2*f1 to it before f1 is updated
 //   RK4   : stage index 0..3; any other value prints a message and aborts
 void f_rungekutta4_scalar(double &dT, double &f0, double &f1, double &f_rhs, int &RK4) {
    constexpr double kSixth = 1.0 / 6.0;
    constexpr double kHalf = 0.5;
    constexpr double kTwo = 2.0;

    if (RK4 == 0) {
        // First stage: half step using the incoming right-hand side.
        f1 = f0 + kHalf * dT * f_rhs;
    } else if (RK4 == 1) {
        // Accumulate before f1 is overwritten; the half step then uses the old f1.
        f_rhs += kTwo * f1;
        f1 = f0 + kHalf * dT * f1;
    } else if (RK4 == 2) {
        // Same accumulation, followed by a full step.
        f_rhs += kTwo * f1;
        f1 = f0 + dT * f1;
    } else if (RK4 == 3) {
        // Final combination of the accumulated right-hand side and f1.
        f1 = f0 + kSixth * dT * (f1 + f_rhs);
    } else {
        std::fprintf(stderr, "rungekutta4_scalar_c: invalid RK4 stage %d\n", RK4);
        std::abort();
    }
 }
 // Complex-valued counterpart of the scalar RK4 stage update.
 //   dT    : (real) time step
 //   f0    : value at the start of the step (only read here)
 //   f1    : overwritten with the value produced by this stage
 //   f_rhs : right-hand side; stages 1 and 2 add 2*f1 to it before f1 is updated
 //   RK4   : stage index 0..3; any other value prints a message and aborts
 void rungekutta4_cplxscalar_(double &dT,
                             std::complex<double> &f0,
                             std::complex<double> &f1,
                             std::complex<double> &f_rhs,
                             int &RK4) {
    constexpr double kSixth = 1.0 / 6.0;
    constexpr double kHalf = 0.5;
    constexpr double kTwo = 2.0;

    const int stage = RK4;
    if (stage < 0 || stage > 3) {
        std::fprintf(stderr, "rungekutta4_cplxscalar_c: invalid RK4 stage %d\n", stage);
        std::abort();
    }

    if (stage == 0) {
        // First stage: half step using the incoming right-hand side.
        f1 = f0 + kHalf * dT * f_rhs;
        return;
    }
    if (stage == 3) {
        // Final combination of the accumulated right-hand side and f1.
        f1 = f0 + kSixth * dT * (f1 + f_rhs);
        return;
    }

    // Stages 1 and 2 share the accumulation; it must happen before f1 changes.
    f_rhs += kTwo * f1;
    if (stage == 1) {
        f1 = f0 + kHalf * dT * f1;
    } else {
        f1 = f0 + dT * f1;
    }
 }
 int f_rungekutta4_rout(int *ex, double &dT,
                       double *f0, double *f1, double *f_rhs,
                       int &RK4) {
--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
--- a/AMSS_NCKU_source/surface_integral.h
+++ b/AMSS_NCKU_source/surface_integral.h
@@ -27,24 +27,19 @@ using namespace std;
 class surface_integral
 {
-private:
+private:
-	int Symmetry, factor;
+	int Symmetry, factor;
-	int N_theta, N_phi; // Number of points in Theta & Phi directions
+	int N_theta, N_phi; // Number of points in Theta & Phi directions
-	double dphi, dcostheta;
+	double dphi, dcostheta;
-	double *arcostheta, *wtcostheta;
+	double *arcostheta, *wtcostheta;
-	int n_tot; // size of arrays
+	int n_tot; // size of arrays
-
+
-	double *nx_g, *ny_g, *nz_g; // global list of unit normals
+	double *nx_g, *ny_g, *nz_g; // global list of unit normals
-	int myrank, cpusize;
+	int myrank, cpusize;
-	int wave_cache_spinw, wave_cache_maxl, wave_cache_modes;
+
-	double *wave_theta_pos, *wave_theta_neg;
+public:
-	double *wave_phi_cos, *wave_phi_sin;
+	surface_integral(int iSymmetry);
-	void clear_wave_cache();
+	~surface_integral();
 	void build_wave_cache(int spinw, int maxl);
 public:
 	surface_integral(int iSymmetry);
 	~surface_integral();
 	void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
 				   int spinw, int maxl, int NN, double *RP, double *IP,
@@ -82,37 +77,21 @@ public:
 								 double &, double &, double &, double &, double &, double &, double &,
 								 double &, double &, double &, double &, double &, double &,
 								 double &, double &)); // NN is the length of RP and IP
-	void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
+	void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
-					   var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
+					   var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
-					   var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
+					   var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
-					   var *Gmx, var *Gmy, var *Gmz,
+					   var *Gmx, var *Gmy, var *Gmz,
-					   var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
+					   var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
-					   double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
+					   double *Rout, monitor *Monitor);
-	void surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK,
+	void surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK,
-					   var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
+					   var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
-					   var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
+					   var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
-					   var *Gmx, var *Gmy, var *Gmz,
+					   var *Gmx, var *Gmy, var *Gmz,
-					   var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
+					   var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
-					   double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
+					   double *Rout, monitor *Monitor);
-	void surf_WaveMassPAng(double rex, int lev, cgh *GH,
+	void surf_Wave(double rex, cgh *GH, ShellPatch *SH,
-					   var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
+				   var *chi, var *trK,
-					   var *chi, var *trK,
+				   var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
 					   var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
 					   var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
 					   var *Gmx, var *Gmy, var *Gmz,
 					   var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
 					   double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
 	void surf_WaveMassPAng(double rex, int lev, ShellPatch *GH,
 					   var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
 					   var *chi, var *trK,
 					   var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
 					   var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
 					   var *Gmx, var *Gmy, var *Gmz,
 					   var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
 					   double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
 	void surf_Wave(double rex, cgh *GH, ShellPatch *SH,
 				   var *chi, var *trK,
 				   var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
 				   var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
 				   var *chix, var *chiy, var *chiz,
 				   var *trKx, var *trKy, var *trKz,
@@ -131,12 +110,12 @@ public:
 	bool SR_Interp_Points(MyList<var> *VarList, cgh *GH, ShellPatch *SH,
 						  int NN, double **XX, double *Shellf);
-	void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
+	void surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
-					   var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
+					   var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
-					   var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
+					   var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
-					   var *Gmx, var *Gmy, var *Gmz,
+					   var *Gmx, var *Gmy, var *Gmz,
-					   var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
+					   var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
-					   double *Rout, monitor *Monitor, MPI_Comm Comm_here, bool refresh_mass_fields = true);
+					   double *Rout, monitor *Monitor, MPI_Comm Comm_here);
 	void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
 				   int spinw, int maxl, int NN, double *RP, double *IP,
 				   monitor *Monitor, MPI_Comm Comm_here);
--- a/AMSS_NCKU_source/tool.h
+++ b/AMSS_NCKU_source/tool.h
@@ -24,10 +24,4 @@ void lopsided(const int ex[3],
              const double *X, const double *Y, const double *Z,
              const double *f, double *f_rhs,
              const double *Sfx, const double *Sfy, const double *Sfz,
-              int Symmetry, const double SoA[3]);
+              int Symmetry, const double SoA[3]);
 void lopsided_kodis(const int ex[3],
                    const double *X, const double *Y, const double *Z,
                    const double *f, double *f_rhs,
                    const double *Sfx, const double *Sfy, const double *Sfz,
                    int Symmetry, const double SoA[3], double eps);
--- a/BSSN_BUILD_CONFIG_MIGRATION.md
+++ b/BSSN_BUILD_CONFIG_MIGRATION.md
@@ -1,211 +0,0 @@
 # BSSN Build Config Migration
 This note records the build-configuration fix needed when replacing
 `AMSS_NCKU_Input.py` or `generate_macrodef.py` with a newer upstream version.
 ## Problem
 `AMSS_NCKU_source/macrodef.h` is not the authoritative file used by normal
 runs. `AMSS_NCKU_Program.py` first generates macro files under
 `input_data.File_directory`, copies `AMSS_NCKU_source` to
 `<File_directory>/AMSS_NCKU_source_copy`, then copies the generated macro files
 into that copied source tree and compiles there.
 Therefore, makefile logic must not depend only on the stale
 `AMSS_NCKU_source/macrodef.h`. The actual equation path must be passed to the
 copied build tree from the same generation step that creates `macrodef.h`.
 The performance regression was caused by compiling/linking the
 `BSSN-EScalar` C wrapper into BSSN vacuum builds. For BSSN vacuum (`ABEtype=0`),
 the build must use:
 ```make
 BSSN_USE_TRANSFER_CACHE=1
 BSSN_USE_ESCALAR_C_KERNEL=0
 ```
 and must not link `bssn_escalar_rhs_c.o`.
 ## Required Migration Steps
 ### 1. Add an ABE type helper in `generate_macrodef.py`
 Add a helper that maps `input_data.Equation_Class` to the numeric `ABEtype`.
 Use the same mapping as `macrodef.h`:
 ```python
 def get_abe_type():
    """Return the numeric ABEtype code for ``input_data.Equation_Class``.

    Raises ValueError when the equation class is not one of the listed names.
    """
    # (Equation_Class name, ABEtype code), checked in this order.
    equation_codes = (
        ("BSSN", 0),
        ("BSSN-EScalar", 1),
        ("BSSN-EM", 3),
        ("Z4C", 2),
    )
    for equation_name, abe_code in equation_codes:
        if input_data.Equation_Class == equation_name:
            return abe_code
    raise ValueError("Equation_Class setting error!!!")
 ```
 Update `generate_macrodef_h()` to print `#define ABEtype {get_abe_type()}`
 instead of duplicating the if/elif mapping.
 ### 2. Generate a makefile fragment
 In `generate_macrodef.py`, add:
 ```python
 def generate_build_config():
    """Write the ``AMSS_NCKU_build.mk`` fragment into ``input_data.File_directory``.

    The fragment contains a header comment and one ``ABE_TYPE := <n>`` line.
    Any exception raised by ``get_abe_type()`` propagates to the caller.
    """
    # Resolve the equation type first so a bad Equation_Class fails before
    # the output file is created or truncated.
    abe_type = get_abe_type()
    build_config_path = os.path.join(input_data.File_directory, "AMSS_NCKU_build.mk")
    # The context manager closes the handle even if a write fails.
    with open(build_config_path, "w") as file1:
        print("# Generated by generate_macrodef.py; do not edit manually.", file=file1)
        print(f"ABE_TYPE := {abe_type}", file=file1)
 ```
 This file is the build-time authority for the equation path.
 ### 3. Call and copy the generated build config
 In `AMSS_NCKU_Program.py`, after generating `macrodef.h` and `macrodef.fh`, call:
 ```python
 generate_macrodef.generate_build_config()
 print(" AMSS-NCKU build config AMSS_NCKU_build.mk has been generated. ")
 ```
 When copying generated files into `AMSS_NCKU_source_copy`, also copy:
 ```python
 build_config_path = os.path.join(File_directory, "AMSS_NCKU_build.mk")
 shutil.copy2(build_config_path, AMSS_NCKU_source_copy)
 ```
 ### 4. Make the source makefile consume the generated config
 At the top of `AMSS_NCKU_source/makefile`, after `include makefile.inc`, add:
 ```make
 -include AMSS_NCKU_build.mk
 ABE_TYPE ?= $(shell awk '/^[[:space:]]*\#define[[:space:]]+ABEtype/ {print $$3; exit}' macrodef.h 2>/dev/null)
 ```
 The generated `AMSS_NCKU_build.mk` is used during normal Python-driven builds.
 The fallback keeps manual source-tree builds usable.
 ### 5. Gate path-specific build options by `ABE_TYPE`
 Use effective build switches:
 ```make
 ifeq ($(USE_TRANSFER_CACHE),auto)
 ifeq ($(ABE_TYPE),0)
 EFFECTIVE_USE_TRANSFER_CACHE = 1
 else
 EFFECTIVE_USE_TRANSFER_CACHE = 0
 endif
 else
 EFFECTIVE_USE_TRANSFER_CACHE = $(USE_TRANSFER_CACHE)
 endif
 ifeq ($(USE_CXX_ESCALAR_KERNEL),1)
 ifeq ($(ABE_TYPE),1)
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 1
 else
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
 endif
 else
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
 endif
 TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
 ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
 ```
 Only add `bssn_escalar_rhs_c.o` when the effective EScalar C kernel switch is
 enabled:
 ```make
 ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
 CFILES += bssn_escalar_rhs_c.o
 endif
 ```
 ### 6. Use safe transfer-cache default
 In `AMSS_NCKU_source/makefile.inc`, keep:
 ```make
 USE_TRANSFER_CACHE ?= auto
 ```
 With the effective switch logic above, this enables cached transfer for BSSN
 vacuum while keeping non-BSSN paths on the uncached path by default.
 ## Verification Checklist
 Run these checks after migrating:
 ```bash
 python3 -c "import generate_macrodef; generate_macrodef.generate_build_config()"
 cat GW150914/AMSS_NCKU_build.mk
 ```
 For BSSN, the generated file should contain:
 ```make
 ABE_TYPE := 0
 ```
 Dry-run the copied or source makefile:
 ```bash
 make -n -B INTERP_LB_MODE=off ABE | grep -E 'BSSN_USE_TRANSFER_CACHE|BSSN_USE_ESCALAR_C_KERNEL|bssn_escalar_rhs_c'
 ```
 Expected BSSN result:
 ```text
 -DBSSN_USE_TRANSFER_CACHE=1 -DBSSN_USE_ESCALAR_C_KERNEL=0
 ```
 and no `bssn_escalar_rhs_c.o` in the final link command.
 Run the full workflow:
 ```bash
 python3 AMSS_NCKU_Program.py
 ```
 For the 10-step BSSN test, compare coordinate output:
 ```bash
 python3 - <<'PY'
 from pathlib import Path
 old = Path('../GW150914-06457/AMSS_NCKU_output/bssn_BH.dat')
 new = Path('GW150914/AMSS_NCKU_output/bssn_BH.dat')
 def rows(path):
    out = []
    for line in path.read_text().splitlines():
        if not line.strip() or line.lstrip().startswith('#'):
            continue
        out.append([float(x) for x in line.split()])
    return out
 ro, rn = rows(old), rows(new)
 n = min(len(ro), len(rn))
 max_abs = 0.0
 for i in range(n):
    for a, b in zip(ro[i], rn[i]):
        max_abs = max(max_abs, abs(a - b))
 print(f"old_rows={len(ro)} new_rows={len(rn)} compared_rows={n}")
 print(f"max_abs_diff={max_abs:.17g}")
 PY
 ```
 For the validated migration, the first 10 rows matched exactly:
 ```text
 max_abs_diff=0
 ```
--- a/generate_macrodef.py
+++ b/generate_macrodef.py
@@ -12,37 +12,6 @@ import os
 import AMSS_NCKU_Input as input_data          ## import program input file
 ##################################################################
 def get_abe_type():
    """Return the numeric ABEtype code for ``input_data.Equation_Class``.

    Raises ValueError when the equation class is not one of the listed names.
    """
    # (Equation_Class name, ABEtype code), checked in this order.
    equation_codes = (
        ( "BSSN",         0 ),
        ( "BSSN-EScalar", 1 ),
        ( "BSSN-EM",      3 ),
        ( "Z4C",          2 ),
    )
    for equation_name, abe_code in equation_codes:
        if ( input_data.Equation_Class == equation_name ):
            return abe_code
    raise ValueError("Equation_Class setting error!!!")
 ##################################################################
 ## Generate the makefile fragment used by the copied source tree.
 ## The source-tree macrodef.h is not authoritative because macro files
 ## are regenerated under File_directory for each run.
 def generate_build_config():
    """Write the ``AMSS_NCKU_build.mk`` fragment into ``input_data.File_directory``.

    The fragment contains a header comment and one ``ABE_TYPE := <n>`` line.
    Any exception raised by ``get_abe_type()`` propagates to the caller.
    """
    # Resolve the equation type first so a bad Equation_Class fails before
    # the output file is created or truncated.
    abe_type = get_abe_type()
    build_config_path = os.path.join( input_data.File_directory, "AMSS_NCKU_build.mk" )
    # The context manager closes the handle even if a write fails.
    with open( build_config_path, "w" ) as file1:
        print( "# Generated by generate_macrodef.py; do not edit manually.", file=file1 )
        print( f"ABE_TYPE := {abe_type}",                                   file=file1 )
 ##################################################################
 ## Generate the macro file macrodef.h according to user settings
@@ -89,10 +58,19 @@ def generate_macrodef_h():
    # 2: Z4c vacuum
    # 3: coupled to Maxwell field
-    try:
+    if ( input_data.Equation_Class == "BSSN" ):
-        print( f"#define ABEtype {get_abe_type()}", file=file1 )
+        print( "#define ABEtype 0", file=file1 )
-        print(                                      file=file1 )
+        print(                      file=file1 )
-    except ValueError:
+    elif ( input_data.Equation_Class == "BSSN-EScalar" ):
        print( "#define ABEtype 1", file=file1 )
        print(                      file=file1 )
    elif ( input_data.Equation_Class == "BSSN-EM" ):
        print( "#define ABEtype 3", file=file1 )
        print(                      file=file1 )
    elif ( input_data.Equation_Class == "Z4C" ):
        print( "#define ABEtype 2", file=file1 )
        print(                      file=file1 )
    else:
        print( "Equation_Class setting error!!!"                )
        print()
        print( "# Equation type #define ABEtype setting error!!!", file=file1 )
@@ -166,62 +144,6 @@ def generate_macrodef_h():
    print( "#define REGLEV 0",      file=file1 )
    print(                          file=file1 )
    # Define fine-grained timing/debug macros.
    # All of them default to OFF so production builds do not pay profiling overhead.
    fine_timing = getattr(input_data, "Fine_Timing",
                  getattr(input_data, "Finegrained_Timing", "no"))
    kernel_fine_timing = getattr(input_data, "Kernel_Fine_Timing",
                          getattr(input_data, "BSSN_Kernel_Fine_Timing", "no"))
    stdin_abort_poll = getattr(input_data, "Enable_Stdin_Abort_Poll",
                       getattr(input_data, "Stdin_Abort_Poll", "no"))
    timing_report_every = max(1, int(getattr(
        input_data, "Timing_Every_Steps",
        getattr(input_data, "Timing_Report_Every", 1))))
    timing_top_hotspots = max(1, int(getattr(
        input_data, "Timing_Top_Hotspots", 8)))
    if ( fine_timing == "yes" ):
        print( "#define BSSN_FINE_TIMING 1", file=file1 )
        print(                               file=file1 )
    elif ( fine_timing == "no" ):
        print( "#define BSSN_FINE_TIMING 0", file=file1 )
        print(                               file=file1 )
    else:
        print( "Fine_Timing setting error!!!" )
        print()
        print( "# Fine_Timing setting error!!!", file=file1 )
        print(                                   file=file1 )
    print( f"#define BSSN_FINE_TIMING_EVERY {timing_report_every}", file=file1 )
    print(                                                          file=file1 )
    print( f"#define BSSN_FINE_TIMING_TOPN {timing_top_hotspots}",  file=file1 )
    print(                                                          file=file1 )
    if ( kernel_fine_timing == "yes" ):
        print( "#define BSSN_KERNEL_FINE_TIMING 1", file=file1 )
        print(                                      file=file1 )
    elif ( kernel_fine_timing == "no" ):
        print( "#define BSSN_KERNEL_FINE_TIMING 0", file=file1 )
        print(                                      file=file1 )
    else:
        print( "Kernel_Fine_Timing setting error!!!" )
        print()
        print( "# Kernel_Fine_Timing setting error!!!", file=file1 )
        print(                                          file=file1 )
    if ( stdin_abort_poll == "yes" ):
        print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 1", file=file1 )
        print(                                           file=file1 )
    elif ( stdin_abort_poll == "no" ):
        print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 0", file=file1 )
        print(                                           file=file1 )
    else:
        print( "Enable_Stdin_Abort_Poll setting error!!!" )
        print()
        print( "# Enable_Stdin_Abort_Poll setting error!!!", file=file1 )
        print(                                               file=file1 )
    # Define macro USE_GPU
    # use GPU or not
@@ -302,21 +224,6 @@ def generate_macrodef_h():
    print( "//     0: for every level;",                                                 file=file1 ) 
    print( "//     1: for all",                                                          file=file1 )
    print( "//",                                                                         file=file1 )
    print( "// define BSSN_FINE_TIMING",                                                 file=file1 )
    print( "//     enable fine-grained per-timestep timing monitor",                     file=file1 )
    print( "//",                                                                         file=file1 )
    print( "// define BSSN_FINE_TIMING_EVERY",                                           file=file1 )
    print( "//     report timing every N coarse timesteps",                              file=file1 )
    print( "//",                                                                         file=file1 )
    print( "// define BSSN_FINE_TIMING_TOPN",                                            file=file1 )
    print( "//     number of hottest timing buckets shown in stdout",                    file=file1 )
    print( "//",                                                                         file=file1 )
    print( "// define BSSN_KERNEL_FINE_TIMING",                                          file=file1 )
    print( "//     enable split timing inside compute_rhs_bssn",                         file=file1 )
    print( "//",                                                                         file=file1 )
    print( "// define BSSN_ENABLE_STDIN_ABORT_POLL",                                     file=file1 )
    print( "//     poll stdin and broadcast abort flag every coarse step",               file=file1 )
    print( "//",                                                                         file=file1 )
    print( "// define USE_GPU",                                                          file=file1 )
    print( "//     use gpu or not",                                                      file=file1 )
    print( "//",                                                                         file=file1 )
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -70,7 +70,7 @@ def makefile_ABE():
    ## Build command with CPU binding to nohz_full cores
    if (input_data.GPU_Calculation == "no"):
-        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off ABE"
+        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=optimize ABE"
    elif (input_data.GPU_Calculation == "yes"):
        makefile_command  = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
    else:
--- a/pgo_profile/PGO_Profile_Analysis.md
+++ b/pgo_profile/PGO_Profile_Analysis.md
@@ -0,0 +1,97 @@
 # AMSS-NCKU PGO Profile Analysis Report
 ## 1. Profiling Environment
 | Item | Value |
 |------|-------|
 | Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
 | Instrumentation Flag | `-fprofile-instr-generate` |
 | Optimization Level (instrumented) | `-O2 -xHost -fma` |
 | MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
 | Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
 | Merged Profile | `default.profdata` (394 KB) |
 | llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
 ## 2. Reduced Simulation Parameters (for profiling run)
 | Parameter | Production Value | Profiling Value |
 |-----------|-----------------|-----------------|
 | MPI_processes | 64 | 1 |
 | grid_level | 9 | 4 |
 | static_grid_level | 5 | 3 |
 | static_grid_number | 96 | 24 |
 | moving_grid_number | 48 | 16 |
 | largest_box_xyz_max | 320^3 | 160^3 |
 | Final_Evolution_Time | 1000.0 | 10.0 |
 | Evolution_Step_Number | 10,000,000 | 1,000 |
 | Detector_Number | 12 | 2 |
 ## 3. Profile Summary
 | Metric | Value |
 |--------|-------|
 | Total instrumented functions | 1,392 |
 | Functions with non-zero counts | 117 (8.4%) |
 | Functions with zero counts | 1,275 (91.6%) |
 | Maximum function entry count | 386,459,248 |
 | Maximum internal block count | 370,477,680 |
 | Total block count | 4,198,023,118 |
 ## 4. Top 20 Hotspot Functions
 | Rank | Total Count | Max Block Count | Function | Category |
 |------|------------|-----------------|----------|----------|
 | 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
 | 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
 | 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
 | 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
 | 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
 | 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
 | 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
 | 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
 | 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
 | 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
 | 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
 | 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
 | 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
 | 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
 | 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
 | 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
 | 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
 | 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
 | 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
 | 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
 ## 5. Hotspot Category Breakdown
 Top 20 functions account for ~99% of total execution counts:
 | Category | Functions | Combined Count | Share |
 |----------|-----------|---------------|-------|
 | Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
 | BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
 | Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
 | Time integration | rungekutta4_rout_ | ~119M | ~3% |
 | Dissipation | kodis_ | ~92M | ~2% |
 | Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% |
 ## 6. Conclusions
 1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
 2. **Hotspot concentration is high**: Top 5 functions alone account for ~83% of all counts, which is ideal for PGO — the compiler has strong branch/layout optimization targets.
 3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
 4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
 5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
 ## 7. PGO Phase 2 Usage
 To apply the profile, use the following flags in `makefile.inc`:
 ```makefile
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
              -align array64byte -fpp -I${MKLROOT}/include
 ```
--- a/pgo_profile/default.profdata
+++ b/pgo_profile/default.profdata
--- a/pgo_profile/default_9726853898452064389_0.profdata
+++ b/pgo_profile/default_9726853898452064389_0.profdata
--- a/pgo_profile/default.profdata.backup
+++ b/pgo_profile/default.profdata.backup
--- a/pgo_profile/default.profdata.backup2
+++ b/pgo_profile/default.profdata.backup2
--- a/pgo_profile/default.profdatabackup3
+++ b/pgo_profile/default.profdatabackup3
--- a/pgo_profile/default_15874826282416242821_0_58277.profraw
+++ b/pgo_profile/default_15874826282416242821_0_58277.profraw
--- a/pgo_profile/default_9725750769337483397_0.profraw
+++ b/pgo_profile/default_9725750769337483397_0.profraw
--- a/pgo_profile/default_9725923726611433605_0.profraw
+++ b/pgo_profile/default_9725923726611433605_0.profraw
--- a/pgo_profile/default_9726420327935033477_0.profraw
+++ b/pgo_profile/default_9726420327935033477_0.profraw
Author	SHA1	Message	Date
ianchb	43975017eb	prolong3 改为先算实际 stencil 窗口；只有窗口触及对称边界时才走全域 symmetry_bd，否则只复制必需窗口。restrict3 同样改成窗口判定，无触边时仅填 ii/jj/kk 必需窗口。	2026-03-02 18:10:38 +08:00
ianchb	485667ef4c	perf(restrict3): shrink X-pass ii sweep to required overlap window - compute fi_min/fi_max from output i-range and derive ii_lo/ii_hi - replace full ii sweep (-1:extf(1)) with windowed sweep in Z/Y precompute passes - keep stencil math unchanged; add bounds sanity check for ii window	2026-03-02 18:10:38 +08:00
ianchb	2a977ce82e	perf(MPatch): 用空间 bin 索引加速 Interp_Points 的 block 归属查找 - 为 Patch::Interp_Points 三个重载引入 BlockBinIndex（候选筛选 + 全扫回退） - 保持原 point-in-block 判定与后续插值/通信流程不变 - 将逐点线性扫块从 O(N_points × N_blocks) 降为近似 O(N_points × k) - 测试：bin 上限如果太大，会引入不必要的索引构建开销。将 bins 上限设为 16。 Co-authored-by: gpt-5.3-codex	2026-03-02 18:10:35 +08:00
CGH0S7	160e2a0369	fix prolong/restrict index bounds after cherry-pick `12e1f63`	2026-03-02 14:11:52 +08:00
ianchb	01410de05a	refactor(Parallel): streamline MPI communication by consolidating request handling and memory management	2026-03-02 14:08:27 +08:00
jaunatisblue	83c826eb49	prolong3: 减少Z-pass 冗余计算	2026-03-02 14:08:13 +08:00
ianchb	43ddaab903	fix: add C RK4 kernel to CFILES_CUDA	2026-03-02 12:19:52 +08:00
ianchb	5839755c2f	compute div_beta on-the-fly to remove temp array	2026-03-02 12:12:58 +08:00
ianchb	a893b4007c	merge lopsided+kodis	2026-03-02 12:12:26 +08:00
CGH0S7	ad5ff03615	build: switch allocator option to oneTBB tbbmalloc (cherry picked from commit `e29ca2dca9`)	2026-03-02 11:53:30 +08:00
CGH0S7	b4bc0ef269	先关闭绑核心，发现速度对比：不绑定核心+SCX>绑核心+SCX (cherry picked from commit `1eba73acbe`)	2026-03-02 11:53:13 +08:00
CGH0S7	b185f84cce	Add switchable C RK4 kernel and build toggle (cherry picked from commit `b91cfff301`)	2026-03-02 11:53:00 +08:00
ianchb	71f6eb7b44	Remove profiling code	2026-03-02 11:29:48 +08:00
CGH0S7	90620c2aec	Optimize fdderivs: skip redundant 2nd-order work in 4th-order overlap	2026-03-02 11:04:04 +08:00
jaunatisblue	f561522d89	prolong3：提升cache命中率	2026-03-02 11:02:19 +08:00
jaunatisblue	3f4715b8cc	修改prolong	2026-03-02 11:02:17 +08:00
jaunatisblue	710ea8f76b	对prolong3做访存优化	2026-03-02 11:02:12 +08:00
CGH0S7	5cf891359d	Optimize symmetry_bd with stride-based fast paths (cherry picked from commit `16013081e0`)	2026-03-02 11:01:49 +08:00
CGH0S7	222747449a	Optimize average2: use DO CONCURRENT loop form (cherry picked from commit `1a518cd3f6`)	2026-03-02 11:01:45 +08:00
CGH0S7	14de4d535e	Optimize average2: replace array expression with explicit loops (cherry picked from commit `1dc622e516`)	2026-03-02 11:01:42 +08:00
CGH0S7	787295692a	Optimize prolong3: hoist bounds check out of inner loop (cherry picked from commit `3046a0ccde`)	2026-03-02 11:01:39 +08:00
CGH0S7	335f2f23fe	Optimize prolong3: replace parity branches with coefficient lookup (cherry picked from commit `d4ec69c98a`)	2026-03-02 11:01:37 +08:00
CGH0S7	7109474a14	Optimize prolong3: precompute coarse index/parity maps (cherry picked from commit `2c0a3055d4`)	2026-03-02 11:01:31 +08:00
CGH0S7	e7a02e8f72	perf(polint): add uniform-grid fast path for barycentric n=6	2026-03-01 14:13:51 +08:00
CGH0S7	8dad910c6c	perf(polint): add switchable barycentric ordn=6 path	2026-03-01 14:13:51 +08:00
CGH0S7	01b4cf71d1	perf(polin3): switch to lagrange-weight tensor contraction	2026-03-01 14:13:04 +08:00
CGH0S7	66dabe8cc4	perf(polint): add ordn=6 specialized neville path	2026-03-01 14:12:22 +08:00
ianchb	abf2f640e4	add fused symmetry packing kernels for orders 2 and 3 in BSSN RHS	2026-02-28 15:35:14 +08:00
ianchb	94f40627aa	refine GPU dispatch initialization and optimize H2D/D2H data transfers	2026-02-28 15:23:41 +08:00
ianchb	d94c31c5c4	[WIP]Implement multi-GPU support in BSSN RHS and add profiling for H2D/D2H transfers	2026-02-28 11:12:14 +08:00
ianchb	724e9cd415	[WIP]Add CUDA support for BSSN RHS with new kernel and update makefiles	2026-02-28 11:12:13 +08:00
ianchb	c001939461	Add Lagrange interpolation subroutine and update calls in prolongrestrict modules	2026-02-28 11:12:13 +08:00
ianchb	94d236385d	Revert "skip redundant MPI ghost cell syncs for stages 0, 1 & 2" This reverts commit `f7ada421cf`.	2026-02-28 11:12:12 +08:00
ianchb	780f1c80d0	skip redundant MPI ghost cell syncs for stages 0, 1 & 2 BSSN 每个 RK4 时间步执行 4 次 MPI ghost zone 同步： Stage 0（预测）结束后：Parallel::Sync(SynchList_pre) Stage 1（校正 1）结束后：Parallel::Sync(SynchList_cor) Stage 2（校正 2）结束后：Parallel::Sync(SynchList_cor) Stage 3（校正 3）结束后：Parallel::Sync(SynchList_cor) ← 必要（为下一步提供 ghost） bssnEM_class.C、Z4c_class.C 结构相同，一起修改了	2026-02-28 11:12:09 +08:00