Switch legacy build to GCC and OpenMPI

2026-04-13 19:39:30 +08:00
39 changed files with 5547 additions and 12199 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 __pycache__
 GW150914
-GW150914*
+GW150914-origin
 docs
 *.tmp
-.codex
+
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -177,9 +177,6 @@ print( " AMSS-NCKU macro file macrodef.h has been generated. " )
 generate_macrodef.generate_macrodef_fh()
 print( " AMSS-NCKU macro file macrodef.fh has been generated. " )
 generate_macrodef.generate_build_config()
 print( " AMSS-NCKU build config AMSS_NCKU_build.mk has been generated. " )
 ##################################################################
@@ -222,11 +219,9 @@ shutil.copytree(AMSS_NCKU_source_path, AMSS_NCKU_source_copy)
 macrodef_h_path  = os.path.join(File_directory, "macrodef.h") 
 macrodef_fh_path = os.path.join(File_directory, "macrodef.fh") 
 build_config_path = os.path.join(File_directory, "AMSS_NCKU_build.mk")
 shutil.copy2(macrodef_h_path,  AMSS_NCKU_source_copy)
 shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)
 shutil.copy2(build_config_path, AMSS_NCKU_source_copy)
 # Notes on copying files:
 # shutil.copy2 preserves file metadata such as modification time.
--- a/AMSS_NCKU_Program_Plot.py
+++ b/AMSS_NCKU_Program_Plot.py
@@ -1,100 +0,0 @@
 ##################################################################
 ##
 ## AMSS-NCKU Plot-Only Restart Script
 ## Author: Xiaoqu / Claude
 ## 2026/05/12
 ##
 ## This script checks for existing output data from AMSS_NCKU_Program.py.
 ## If data exists, it skips all computation and goes directly to plotting,
 ## saving time when plotting was interrupted.
 ## If no data is found, it exits with a message.
 ##
 ##################################################################
 ## Guard against re-execution by multiprocessing child processes.
 if __name__ != '__main__':
    import sys as _sys
    _sys.exit(0)
 import os
 import sys
 import AMSS_NCKU_Input as input_data
 ##################################################################
 ## Construct paths from input configuration
 File_directory         = os.path.join(input_data.File_directory)
 output_directory       = os.path.join(File_directory, "AMSS_NCKU_output")
 binary_results_directory = os.path.join(output_directory, input_data.Output_directory)
 figure_directory       = os.path.join(File_directory, "figure")
 ##################################################################
 ## Check whether the required output data files exist
 required_files = [
    os.path.join(binary_results_directory, "bssn_BH.dat"),
    os.path.join(binary_results_directory, "bssn_ADMQs.dat"),
    os.path.join(binary_results_directory, "bssn_psi4.dat"),
    os.path.join(binary_results_directory, "bssn_constraint.dat"),
 ]
 missing_files = [f for f in required_files if not os.path.exists(f)]
 if missing_files:
    print(" No existing AMSS_NCKU_Program.py output data found. ")
    print(" The following required files are missing: ")
    for f in missing_files:
        print(f"   {f}")
    print()
    print(" Please run AMSS_NCKU_Program.py first to generate the simulation data. ")
    print(" Exiting. ")
    sys.exit(1)
 print(" Found existing AMSS_NCKU_Program.py output data. "          )
 print(" Skipping all computation and going directly to plotting. " )
 print()
 ## Ensure the figure directory exists (it should, but be safe)
 os.makedirs(figure_directory, exist_ok=True)
 ##################################################################
 ## Plot the AMSS-NCKU program results
 import plot_xiaoqu
 import plot_GW_strain_amplitude_xiaoqu
 from parallel_plot_helper import run_plot_tasks_parallel
 plot_tasks = []
 ## Plot black hole trajectory
 plot_tasks.append((plot_xiaoqu.generate_puncture_orbit_plot,   (binary_results_directory, figure_directory)))
 plot_tasks.append((plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory)))
 ## Plot black hole separation vs. time
 plot_tasks.append((plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory)))
 ## Plot gravitational waveforms (psi4 and strain amplitude)
 for i in range(input_data.Detector_Number):
    plot_tasks.append((plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i)))
    plot_tasks.append((plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i)))
 ## Plot ADM mass evolution
 for i in range(input_data.Detector_Number):
    plot_tasks.append((plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i)))
 ## Plot Hamiltonian constraint violation over time
 for i in range(input_data.grid_level):
    plot_tasks.append((plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i)))
 run_plot_tasks_parallel(plot_tasks)
 ## Plot stored binary data (runs serially, not in the parallel pool)
 plot_xiaoqu.generate_binary_data_plot(binary_results_directory, figure_directory)
 print()
 print(" Plotting completed successfully. ")
 print()
--- a/AMSS_NCKU_Verify_ASC26.py
+++ b/AMSS_NCKU_Verify_ASC26.py
@@ -9,11 +9,6 @@ Verification Requirements:
   - Y Component RMS
   - Z Component RMS
 2. ADM constraint violation < 2 (Grid Level 0)
 3. The following figure PDFs must match GW150914-origin exactly after rasterization:
   - ADM_Constraint_Grid_Level_0.pdf
   - BH_Trajectory_21_XY.pdf
   - BH_Trajectory_XY.pdf
   The script also reports the percentage of differing pixels for each figure.
 RMS Calculation Method:
 - Computes trajectory deviation on the XY plane independently for BH1 and BH2
@@ -28,10 +23,6 @@ Reference: GW150914-origin (baseline simulation)
 import numpy as np
 import sys
 import os
 import shutil
 import subprocess
 import tempfile
 from PIL import Image
 # ANSI Color Codes
 class Color:
@@ -70,132 +61,6 @@ def load_constraint_data(filepath):
                data.append([float(x) for x in parts[:8]])
    return np.array(data)
 def resolve_figure_dir(path):
    """Resolve the sibling figure directory from an output or figure path."""
    normalized = os.path.normpath(path)
    if os.path.basename(normalized) == "figure":
        return normalized
    return os.path.join(os.path.dirname(normalized), "figure")
 def render_pdf_to_images(pdf_path, dpi=150):
    """Render a PDF to RGB images using Ghostscript."""
    gs_path = shutil.which("gs")
    if gs_path is None:
        raise RuntimeError("Ghostscript executable 'gs' was not found in PATH")
    with tempfile.TemporaryDirectory(prefix="amss_verify_pdf_") as temp_dir:
        output_pattern = os.path.join(temp_dir, "page-%03d.ppm")
        cmd = [
            gs_path,
            "-q",
            "-dSAFER",
            "-dBATCH",
            "-dNOPAUSE",
            "-sDEVICE=ppmraw",
            f"-r{dpi}",
            f"-o{output_pattern}",
            pdf_path
        ]
        try:
            subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
        except subprocess.CalledProcessError as exc:
            message = exc.stderr.strip() or str(exc)
            raise RuntimeError(f"Failed to render PDF '{pdf_path}': {message}") from exc
        ppm_files = sorted(
            os.path.join(temp_dir, filename)
            for filename in os.listdir(temp_dir)
            if filename.endswith(".ppm")
        )
        if not ppm_files:
            raise RuntimeError(f"No rendered pages were produced for '{pdf_path}'")
        images = []
        for ppm_file in ppm_files:
            with Image.open(ppm_file) as img:
                images.append(np.array(img.convert("RGB"), dtype=np.uint8))
        return images
 def compare_rendered_pages(ref_img, target_img):
    """Return (different_pixels, total_pixels) for two rendered RGB pages."""
    ref_h, ref_w = ref_img.shape[:2]
    tgt_h, tgt_w = target_img.shape[:2]
    total_pixels = max(ref_h, tgt_h) * max(ref_w, tgt_w)
    if ref_h == tgt_h and ref_w == tgt_w:
        different_pixels = int(np.count_nonzero(np.any(ref_img != target_img, axis=2)))
        return different_pixels, total_pixels
    diff_mask = np.ones((max(ref_h, tgt_h), max(ref_w, tgt_w)), dtype=bool)
    overlap_h = min(ref_h, tgt_h)
    overlap_w = min(ref_w, tgt_w)
    overlap_diff = np.any(ref_img[:overlap_h, :overlap_w] != target_img[:overlap_h, :overlap_w], axis=2)
    diff_mask[:overlap_h, :overlap_w] = overlap_diff
    different_pixels = int(np.count_nonzero(diff_mask))
    return different_pixels, total_pixels
 def compare_pdf_images(ref_pdf, target_pdf, dpi=150, threshold_percent=0.001):
    """Compare two PDFs by rasterizing them and counting differing pixels."""
    ref_pages = render_pdf_to_images(ref_pdf, dpi=dpi)
    target_pages = render_pdf_to_images(target_pdf, dpi=dpi)
    total_pixels = 0
    different_pixels = 0
    max_pages = max(len(ref_pages), len(target_pages))
    for page_idx in range(max_pages):
        if page_idx < len(ref_pages) and page_idx < len(target_pages):
            page_diff, page_total = compare_rendered_pages(ref_pages[page_idx], target_pages[page_idx])
        else:
            existing_page = ref_pages[page_idx] if page_idx < len(ref_pages) else target_pages[page_idx]
            page_total = existing_page.shape[0] * existing_page.shape[1]
            page_diff = page_total
        total_pixels += page_total
        different_pixels += page_diff
    diff_percent = (different_pixels / total_pixels * 100.0) if total_pixels else 0.0
    return {
        "different_pixels": different_pixels,
        "total_pixels": total_pixels,
        "diff_percent": diff_percent,
        "pages_ref": len(ref_pages),
        "pages_target": len(target_pages),
        "passed": diff_percent < threshold_percent
    }
 def compare_required_figures(reference_figure_dir, target_figure_dir):
    """Compare the required GW150914 figure PDFs."""
    figure_names = [
        "ADM_Constraint_Grid_Level_0.pdf",
        "BH_Trajectory_21_XY.pdf",
        "BH_Trajectory_XY.pdf"
    ]
    results = []
    for figure_name in figure_names:
        ref_pdf = os.path.join(reference_figure_dir, figure_name)
        target_pdf = os.path.join(target_figure_dir, figure_name)
        if not os.path.exists(ref_pdf):
            raise FileNotFoundError(f"Reference figure not found: {ref_pdf}")
        if not os.path.exists(target_pdf):
            raise FileNotFoundError(f"Target figure not found: {target_pdf}")
        comparison = compare_pdf_images(ref_pdf, target_pdf)
        comparison["name"] = figure_name
        results.append(comparison)
    return results
 def calculate_all_rms_errors(bh_data_ref, bh_data_target):
    """
    Calculate 3D Vector RMS and component-wise RMS (X, Y, Z) independently.
@@ -319,45 +184,18 @@ def print_constraint_results(results, threshold=2.0):
    return passed
-def print_figure_results(results, threshold_percent=0.001):
+def print_summary(rms_passed, constraint_passed):
    print(f"\n{Color.BOLD}3. Figure Pixel Comparison (PDF Rasterization){Color.RESET}")
    print("-" * 65)
    print(f"   Requirement: < {threshold_percent:.3f}% differing pixels\n")
    all_passed = True
    for result in results:
        passed = result["passed"]
        all_passed = all_passed and passed
        status = get_status_text(passed)
        print(f"   {result['name']:32}: {result['diff_percent']:10.6f}%   |   Status: {status}")
        if result["pages_ref"] != result["pages_target"]:
            print(f"   {'':32}  pages(ref/target): {result['pages_ref']}/{result['pages_target']}")
    return all_passed
 def print_figure_error(error_message):
    print(f"\n{Color.BOLD}3. Figure Pixel Comparison (PDF Rasterization){Color.RESET}")
    print("-" * 65)
    print(f"   {Color.RED}Error: {error_message}{Color.RESET}")
    return False
 def print_summary(rms_passed, constraint_passed, figure_passed):
    print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
    print(Color.BOLD + "Verification Summary" + Color.RESET)
    print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
-    all_passed = rms_passed and constraint_passed and figure_passed
+    all_passed = rms_passed and constraint_passed
    res_rms = get_status_text(rms_passed)
    res_con = get_status_text(constraint_passed)
    res_fig = get_status_text(figure_passed)
    print(f"   [1] Comprehensive RMS check:      {res_rms}")
    print(f"   [2] ADM constraint check:         {res_con}")
    print(f"   [3] Figure pixel comparison:      {res_fig}")
    final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
    print(f"\n   Overall result: {final_status}")
@@ -374,8 +212,6 @@ def main():
    script_dir = os.path.dirname(os.path.abspath(__file__))
    reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
    target_figure_dir = resolve_figure_dir(target_dir)
    reference_figure_dir = os.path.join(script_dir, "GW150914-origin/figure")
    bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
    bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
@@ -394,8 +230,6 @@ def main():
    print_header()
    print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
    print(f"{Color.BOLD}Target (Optimized):  {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
    print(f"{Color.BOLD}Reference Figures:   {Color.RESET} {Color.BLUE}{reference_figure_dir}{Color.RESET}")
    print(f"{Color.BOLD}Target Figures:      {Color.RESET} {Color.BLUE}{target_figure_dir}{Color.RESET}")
    bh_data_ref = load_bh_trajectory(bh_file_ref)
    bh_data_target = load_bh_trajectory(bh_file_target)
@@ -409,13 +243,7 @@ def main():
    constraint_results = analyze_constraint_violation(constraint_data)
    constraint_passed = print_constraint_results(constraint_results)
-    try:
+    all_passed = print_summary(rms_passed, constraint_passed)
        figure_results = compare_required_figures(reference_figure_dir, target_figure_dir)
        figure_passed = print_figure_results(figure_results)
    except (FileNotFoundError, RuntimeError) as exc:
        figure_passed = print_figure_error(str(exc))
    all_passed = print_summary(rms_passed, constraint_passed, figure_passed)
    sys.exit(0 if all_passed else 1)
 if __name__ == "__main__":
--- a/AMSS_NCKU_source/FFT.f90
+++ b/AMSS_NCKU_source/FFT.f90
@@ -37,51 +37,56 @@ close(77)
 end program checkFFT
 #endif
 !-------------
 ! Optimized FFT using Intel oneMKL DFTI
 ! Mathematical equivalence: Standard DFT definition
 !   Forward (isign=1):  X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
 !   Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
 ! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
 !-------------
 SUBROUTINE four1(dataa,nn,isign)
 use MKL_DFTI
 implicit none
-INTEGER, intent(in) :: isign, nn
+INTEGER::isign,nn
-DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa
+double precision,dimension(2*nn)::dataa
-
+INTEGER::i,istep,j,m,mmax,n
-type(DFTI_DESCRIPTOR), pointer :: desc
+double precision::tempi,tempr
-integer :: status
+DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
-
+n=2*nn
-! Create DFTI descriptor for 1D complex-to-complex transform
+j=1
-status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn)
+do i=1,n,2
-if (status /= 0) return
+  if(j.gt.i)then
-
+     tempr=dataa(j)
-! Set input/output storage as interleaved complex (default)
+     tempi=dataa(j+1)
-status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
+     dataa(j)=dataa(i)
-if (status /= 0) then
+     dataa(j+1)=dataa(i+1)
-   status = DftiFreeDescriptor(desc)
+     dataa(i)=tempr
-   return
+     dataa(i+1)=tempi
  endif
  m=nn
 1 if ((m.ge.2).and.(j.gt.m)) then
  j=j-m
  m=m/2
 goto 1
  endif
 j=j+m
 enddo
 mmax=2
 2  if (n.gt.mmax) then
     istep=2*mmax
     theta=6.28318530717959d0/(isign*mmax)
     wpr=-2.d0*sin(0.5d0*theta)**2
     wpi=sin(theta)
     wr=1.d0
     wi=0.d0
     do m=1,mmax,2
       do i=m,n,istep
         j=i+mmax
         tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
         tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
         dataa(j)=dataa(i)-tempr
         dataa(j+1)=dataa(i+1)-tempi
         dataa(i)=dataa(i)+tempr
         dataa(i+1)=dataa(i+1)+tempi
       enddo
          wtemp=wr
          wr=wr*wpr-wi*wpi+wr
          wi=wi*wpr+wtemp*wpi+wi
     enddo
 mmax=istep
 goto 2
 endif
 ! Commit the descriptor
 status = DftiCommitDescriptor(desc)
 if (status /= 0) then
   status = DftiFreeDescriptor(desc)
   return
 endif
 ! Execute FFT based on direction
 if (isign == 1) then
   ! Forward FFT: exp(-2*pi*i*k*n/N)
   status = DftiComputeForward(desc, dataa)
 else
   ! Backward FFT: exp(+2*pi*i*k*n/N)
   status = DftiComputeBackward(desc, dataa)
 endif
 ! Free descriptor
 status = DftiFreeDescriptor(desc)
 return
 END SUBROUTINE four1
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -5,42 +5,6 @@
 #include "misc.h"
 #include "parameters.h"
 namespace
 {
 enum { MAX_DATA_PACKER_VARS = 64 };
 int expand_var_list_pack_info(MyList<var> *src_list, MyList<var> *dst_list,
                              int *src_sgfn, int *dst_sgfn, double **src_soa)
 {
  int count = 0;
  MyList<var> *src_it = src_list;
  MyList<var> *dst_it = dst_list;
  while (src_it && dst_it)
  {
    if (count >= MAX_DATA_PACKER_VARS)
    {
      cout << "Parallel::data_packer: too many variables in communication list." << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    src_sgfn[count] = src_it->data->sgfn;
    dst_sgfn[count] = dst_it->data->sgfn;
    src_soa[count] = src_it->data->SoA;
    count++;
    src_it = src_it->next;
    dst_it = dst_it->next;
  }
  if (src_it || dst_it)
  {
    cout << "error in short data packer, var lists does not match." << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  return count;
 }
 }
 int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion
 {
  nx = Mymax(1, shape / min_width);
@@ -3766,10 +3730,21 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
  if (!src || !dst)
    return size_out;
-  int src_sgfn[MAX_DATA_PACKER_VARS];
+  MyList<var> *varls, *varld;
-  int dst_sgfn[MAX_DATA_PACKER_VARS];
+
-  double *src_soa[MAX_DATA_PACKER_VARS];
+  varls = VarLists;
-  const int var_count = expand_var_list_pack_info(VarLists, VarListd, src_sgfn, dst_sgfn, src_soa);
+  varld = VarListd;
  while (varls && varld)
  {
    varls = varls->next;
    varld = varld->next;
  }
  if (varls || varld)
  {
    cout << "error in short data packer, var lists does not match." << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  int type; /* 1 copy, 2 restrict, 3 prolong */
  if (src->data->Bg->lev == dst->data->Bg->lev)
@@ -3781,57 +3756,43 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
  while (src && dst)
  {
-    const bool rank_match =
+    if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
-        (dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
+        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank))
        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank);
    if (rank_match)
    {
-      const int segment_size = dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
+      varls = VarLists;
-      int offset = size_out;
+      varld = VarListd;
-
+      while (varls && varld)
      if (data)
      {
-        if (dir == PACK)
+        if (data)
        {
-          switch (type)
+          if (dir == PACK)
-          {
+            switch (type)
            {
              // attention must be paied to the difference between src's llb,uub and dst's llb,uub
            case 1:
-              for (int iv = 0; iv < var_count; iv++, offset += segment_size)
+              f_copy(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
-                f_copy(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + offset,
+                     src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                       src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape,
+                     dst->data->llb, dst->data->uub);
                       src->data->Bg->fgfs[src_sgfn[iv]], dst->data->llb, dst->data->uub);
              break;
            case 2:
-              for (int iv = 0; iv < var_count; iv++, offset += segment_size)
+              f_restrict3(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
-                f_restrict3(DIM, dst->data->llb, dst->data->uub, dst->data->shape, data + offset,
+                          src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                            src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape,
+                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
                            src->data->Bg->fgfs[src_sgfn[iv]], dst->data->llb, dst->data->uub,
                            src_soa[iv], Symmetry);
              break;
            case 3:
-              for (int iv = 0; iv < var_count; iv++, offset += segment_size)
+              f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape,
+                         dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
-                           src->data->Bg->fgfs[src_sgfn[iv]], dst->data->llb, dst->data->uub,
+                         dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
-                           dst->data->shape, data + offset, dst->data->llb, dst->data->uub,
+            }
-                           src_soa[iv], Symmetry);
+          if (dir == UNPACK) // from target data to corresponding grid
-              break;
+            f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
-            default:
+                   dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
-              break;
+                   dst->data->llb, dst->data->uub);
          }
        }
        else
        {
          for (int iv = 0; iv < var_count; iv++, offset += segment_size)
            f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape,
                   dst->data->Bg->fgfs[dst_sgfn[iv]], dst->data->llb, dst->data->uub,
                   dst->data->shape, data + offset, dst->data->llb, dst->data->uub);
        }
        size_out += dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
        varls = varls->next;
        varld = varld->next;
      }
      size_out = offset + ((!data) ? segment_size * var_count : 0);
      if (data)
        size_out = offset;
    }
    dst = dst->next;
    src = src->next;
@@ -3858,10 +3819,21 @@ int Parallel::data_packermix(double *data, MyList<Parallel::gridseg> *src, MyLis
  if (!src || !dst)
    return size_out;
-  int src_sgfn[MAX_DATA_PACKER_VARS];
+  MyList<var> *varls, *varld;
-  int dst_sgfn[MAX_DATA_PACKER_VARS];
+
-  double *src_soa[MAX_DATA_PACKER_VARS];
+  varls = VarLists;
-  const int var_count = expand_var_list_pack_info(VarLists, VarListd, src_sgfn, dst_sgfn, src_soa);
+  varld = VarListd;
  while (varls && varld)
  {
    varls = varls->next;
    varld = varld->next;
  }
  if (varls || varld)
  {
    cout << "error in short data packer, var lists does not match." << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  int type; /* 1 copy, 2 restrict, 3 prolong */
  if (src->data->Bg->lev == dst->data->Bg->lev)
@@ -3879,41 +3851,30 @@ int Parallel::data_packermix(double *data, MyList<Parallel::gridseg> *src, MyLis
  while (src && dst)
  {
-    const bool rank_match =
+    if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
-        (dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
+        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank))
        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank);
    if (rank_match)
    {
-      const int segment_size =
+      varls = VarLists;
-          (src->data->shape[0] + 2 * ghost_width) *
+      varld = VarListd;
-          (src->data->shape[1] + 2 * ghost_width) *
+      while (varls && varld)
          (src->data->shape[2] + 2 * ghost_width);
      int offset = size_out;
      if (data)
      {
-        if (dir == PACK)
+        if (data)
        {
-          for (int iv = 0; iv < var_count; iv++, offset += segment_size)
+          if (dir == PACK)
-            f_prolongcopy3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape,
+            f_prolongcopy3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
-                           src->data->Bg->fgfs[src_sgfn[iv]], dst->data->llb, dst->data->uub,
+                           dst->data->llb, dst->data->uub, src->data->shape, data + size_out,
-                           src->data->shape, data + offset, src->data->llb, src->data->uub,
+                           src->data->llb, src->data->uub, varls->data->SoA, Symmetry);
-                           src_soa[iv], Symmetry);
+          if (dir == UNPACK) // from target data to corresponding grid
-        }
+            f_prolongmix3(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
-        else
+                          src->data->llb, src->data->uub, src->data->shape, data + size_out,
-        {
+                          dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry, dst->data->illb, dst->data->iuub);
          for (int iv = 0; iv < var_count; iv++, offset += segment_size)
            f_prolongmix3(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape,
                          dst->data->Bg->fgfs[dst_sgfn[iv]], src->data->llb, src->data->uub,
                          src->data->shape, data + offset, dst->data->llb, dst->data->uub,
                          src_soa[iv], Symmetry, dst->data->illb, dst->data->iuub);
        }
        // the symmetry problem should be dealt in prolongcopy3,
        // so we always have ghost_width for both sides
        size_out += (src->data->shape[0] + 2 * ghost_width) * (src->data->shape[1] + 2 * ghost_width) * (src->data->shape[2] + 2 * ghost_width);
        varls = varls->next;
        varld = varld->next;
      }
      size_out = offset + ((!data) ? segment_size * var_count : 0);
      if (data)
        size_out = offset;
    }
    dst = dst->next;
    src = src->next;
--- a/AMSS_NCKU_source/ShellPatch.C
+++ b/AMSS_NCKU_source/ShellPatch.C
--- a/AMSS_NCKU_source/ShellPatch.h
+++ b/AMSS_NCKU_source/ShellPatch.h
@@ -102,16 +102,6 @@ public:
                 //-1: means no dumy dimension at all; 0: means rho; 1: means sigma
   };
   // Thread-safe search result (no pointers to shared mutable state)
   struct PointSearchResult
   {
      bool found;
      Block *Bg;
      double gx, gy, gz; // global Cartesian coordinates
      double lx, ly, lz; // local coordinates within the found block
      int ssst;          // source shell-patch type (-1 = Cartesian)
   };
   int myrank;
   int shape[dim];   // for (rho, sigma, R), for rho and sigma means number of points for every pi/2
   double Rrange[2]; // for Rmin and Rmax
@@ -185,12 +175,6 @@ public:
                         MyList<Patch> *Pp, double CDH[dim], MyList<pointstru> *pss);
   bool prolongpointstru(MyList<pointstru> *&psul, bool ssyn, int tsst, MyList<ss_patch> *sPp, double DH[dim],
                         MyList<Patch> *Pp, double CDH[dim], double x, double y, double z, int Symmetry, int rank_in);
   // Read-only point search — thread-safe (no shared mutable state modified)
   PointSearchResult prolongpointstru_search(bool ssyn, int tsst, MyList<ss_patch> *sPp, double DH[dim],
                                             MyList<Patch> *Pp, double CDH[dim], double x, double y, double z,
                                             int Symmetry, int rank_in);
   // Append a search result to a linked list — use inside omp critical section
   void prolongpointstru_append(MyList<pointstru> *&psul, const PointSearchResult &sr, int tsst);
   void setupintintstuff(int cpusize, MyList<Patch> *CPatL, int Symmetry);
   void intertransfer(MyList<pointstru> **src, MyList<pointstru> **dst,
                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
@@ -27,7 +27,21 @@ using namespace std;
 #endif
 #include "TwoPunctures.h"
-#include <mkl_cblas.h>
+
 extern "C" {
 double cblas_ddot(const int, const double *, const int, const double *, const int);
 double cblas_dnrm2(const int, const double *, const int);
 void cblas_dgemm(const int, const int, const int,
                 const int, const int, const int,
                 const double, const double *, const int,
                 const double *, const int, const double,
                 double *, const int);
 }
 enum {
  CblasRowMajor = 101,
  CblasNoTrans = 111
 };
 TwoPunctures::TwoPunctures(double mp, double mm, double b,
                           double P_plusx, double P_plusy, double P_plusz,
--- a/AMSS_NCKU_source/Z4c_rhs.f90
+++ b/AMSS_NCKU_source/Z4c_rhs.f90
@@ -94,31 +94,29 @@
               Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon,                    &            
               Symmetry,Lev,eps,co)
  if (co == 0) then
 #if (ABV == 0)  
-    call ricci_gamma(ex, X, Y, Z,                                      &
+  call ricci_gamma(ex, X, Y, Z,                                      &
-                 chi,                                                  &
+               chi,                                                  &
-                 dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
+               dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
-                 Gamx   ,  Gamy    ,  Gamz    , &
+               Gamx   ,  Gamy    ,  Gamz    , &
-                 Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
+               Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
-                 Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
+               Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
-                 Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
+               Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
-                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
+               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
-                 Symmetry)
+               Symmetry)
 #endif
-    call constraint_bssn(ex, X, Y, Z,&
+  call constraint_bssn(ex, X, Y, Z,&
-                 chi,trK, &
+               chi,trK, &
-                 dxx,gxy,gxz,dyy,gyz,dzz, &
+               dxx,gxy,gxz,dyy,gyz,dzz, &
-                 Axx,Axy,Axz,Ayy,Ayz,Azz, &
+               Axx,Axy,Axz,Ayy,Ayz,Azz, &
-                 Gamx,Gamy,Gamz,&
+               Gamx,Gamy,Gamz,&
-                 Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
+               Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
-                 Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
+               Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
-                 Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
+               Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
-                 Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
+               Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
-                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
+               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
-                 Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
+               Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
-                 Symmetry)
+               Symmetry)
  endif
  return
@@ -229,7 +227,6 @@
  call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta)
 !!! sanity check
 #ifdef DEBUG
  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
@@ -264,7 +261,6 @@
     gont = 1
     return
  endif
 #endif
  PI = dacos(-ONE)
@@ -1267,32 +1263,30 @@
  endif
  if (co == 0) then
 #if (ABV == 0)  
-    call ricci_gamma(ex, X, Y, Z,                                      &
+  call ricci_gamma(ex, X, Y, Z,                                      &
-                 chi,                                                  &
+               chi,                                                  &
-                 dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
+               dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
-                 Gamx   ,  Gamy    ,  Gamz    , &
+               Gamx   ,  Gamy    ,  Gamz    , &
-                 Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
+               Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
-                 Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
+               Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
-                 Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
+               Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
-                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
+               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
-                 Symmetry)
+               Symmetry)
 #endif
-    call constraint_bssn(ex, X, Y, Z,&
+  call constraint_bssn(ex, X, Y, Z,&
-                 chi,trK, &
+               chi,trK, &
-                 dxx,gxy,gxz,dyy,gyz,dzz, &
+               dxx,gxy,gxz,dyy,gyz,dzz, &
-                 Axx,Axy,Axz,Ayy,Ayz,Azz, &
+               Axx,Axy,Axz,Ayy,Ayz,Azz, &
-                 Gamx,Gamy,Gamz,&
+               Gamx,Gamy,Gamz,&
-                 Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
+               Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
-                 Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
+               Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
-                 Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
+               Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
-                 Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
+               Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
-                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
+               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
-                 Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
+               Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
-                 Symmetry)
+               Symmetry)
  endif
  gont = 0
--- a/AMSS_NCKU_source/Z4c_rhs_ss.f90
+++ b/AMSS_NCKU_source/Z4c_rhs_ss.f90
@@ -122,7 +122,6 @@
  call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta)
 !!! sanity check
 #ifdef DEBUG
  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
@@ -157,7 +156,6 @@
     gont = 1
     return
  endif
 #endif
  PI = dacos(-ONE)
@@ -1390,43 +1388,41 @@
  call kodis_sh(ex,crho,sigma,R,TZ,TZ_rhs,SSS,Symmetry,eps,sst)
  endif
  if (co == 0) then
 #if (ABV == 1)  
-    call ricci_gamma_ss(ex,crho,sigma,R,X, Y, Z,                               &
+  call ricci_gamma_ss(ex,crho,sigma,R,X, Y, Z,                                 &
-                 drhodx, drhody, drhodz,                                       &
+               drhodx, drhody, drhodz,                                         &
-                 dsigmadx,dsigmady,dsigmadz,                                   &
+               dsigmadx,dsigmady,dsigmadz,                                     &
-                 dRdx,dRdy,dRdz,                                               &
+               dRdx,dRdy,dRdz,                                                 &
-                 drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz,              &
+               drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz,                &
-                 dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz,  &
+               dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz,    &
-                 dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz,                          &
+               dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz,                            &
-                 chi,                                                          &
+               chi,                                                  &
-                 dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
+               dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
-                 Gamx   ,  Gamy    ,  Gamz    , &
+               Gamx   ,  Gamy    ,  Gamz    , &
-                 Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
+               Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
-                 Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
+               Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
-                 Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
+               Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
-                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
+               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
-                 Symmetry,Lev,sst)
+               Symmetry,Lev,sst)
  call constraint_bssn_ss(ex,crho,sigma,R,X, Y, Z,  &
               drhodx, drhody, drhodz,                                         &
               dsigmadx,dsigmady,dsigmadz,                                     &
               dRdx,dRdy,dRdz,                                                 &
               drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz,                &
               dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz,    &
               dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz,                            &
               chi,trK, &
               dxx,gxy,gxz,dyy,gyz,dzz, &
               Axx,Axy,Axz,Ayy,Ayz,Azz, &
               Gamx,Gamy,Gamz,&
               Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
               Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
               Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
               Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
               Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
               Symmetry,Lev,sst)
 #endif
    call constraint_bssn_ss(ex,crho,sigma,R,X, Y, Z,  &
                 drhodx, drhody, drhodz,                                       &
                 dsigmadx,dsigmady,dsigmadz,                                   &
                 dRdx,dRdy,dRdz,                                               &
                 drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz,              &
                 dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz,  &
                 dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz,                          &
                 chi,trK, &
                 dxx,gxy,gxz,dyy,gyz,dzz, &
                 Axx,Axy,Axz,Ayy,Ayz,Azz, &
                 Gamx,Gamy,Gamz,&
                 Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
                 Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
                 Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
                 Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
                 Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
                 Symmetry,Lev,sst)
  endif
  gont = 0
--- a/AMSS_NCKU_source/bssnEM_class.C
+++ b/AMSS_NCKU_source/bssnEM_class.C
@@ -258,8 +258,6 @@ void bssnEM_class::Initialize()
    PhysTime = StartTime;
    Setup_Black_Hole_position();
  }
  setup_transfer_caches();
 }
 //================================================================================================
--- a/AMSS_NCKU_source/bssnEScalar_class.C
+++ b/AMSS_NCKU_source/bssnEScalar_class.C
@@ -26,12 +26,6 @@ using namespace std;
 #include "shellfunctions.h"
 #include "parameters.h"
 #if BSSN_USE_ESCALAR_C_KERNEL
 #define BSSN_ESCALAR_RHS f_compute_rhs_bssn_escalar_c
 #else
 #define BSSN_ESCALAR_RHS f_compute_rhs_bssn_escalar
 #endif
 #ifdef With_AHF
 #include "derivatives.h"
 #include "myglobal.h"
@@ -139,9 +133,6 @@ void bssnEScalar_class::Initialize()
    } 
  GH = new cgh(0, ngfs, Symmetry, pname, checkrun, ErrorMonitor);
  ConstraintRefreshLevels = new int[GH->levels];
  for (int il = 0; il < GH->levels; il++)
    ConstraintRefreshLevels[il] = 0;
  if (checkrun)
    CheckPoint->readcheck_cgh(PhysTime, GH, myrank, nprocs, Symmetry);
  else
@@ -174,8 +165,6 @@ void bssnEScalar_class::Initialize()
    PhysTime = StartTime;
    Setup_Black_Hole_position();
  }
  setup_transfer_caches();
 }
 //================================================================================================
@@ -241,9 +230,6 @@ void bssnEScalar_class::Read_Ansorg()
    }
    int BH_NM;
    double *Porg_here;
    double *pmom_local;
    double *spin_local;
    double *mass_local;
    // read parameter from file
    {
      const int LEN = 256;
@@ -285,9 +271,9 @@ void bssnEScalar_class::Read_Ansorg()
    }
    Porg_here = new double[3 * BH_NM];
-    pmom_local = new double[3 * BH_NM];
+    Pmom = new double[3 * BH_NM];
-    spin_local = new double[3 * BH_NM];
+    Spin = new double[3 * BH_NM];
-    mass_local = new double[BH_NM];
+    Mass = new double[BH_NM];
    // read parameter from file
    {
      const int LEN = 256;
@@ -322,7 +308,7 @@ void bssnEScalar_class::Read_Ansorg()
        if (sgrp == "BSSN" && sind < BH_NM)
        {
          if (skey == "Mass")
-            mass_local[sind] = atof(sval.c_str());
+            Mass[sind] = atof(sval.c_str());
          else if (skey == "Porgx")
            Porg_here[sind * 3] = atof(sval.c_str());
          else if (skey == "Porgy")
@@ -330,17 +316,17 @@ void bssnEScalar_class::Read_Ansorg()
          else if (skey == "Porgz")
            Porg_here[sind * 3 + 2] = atof(sval.c_str());
          else if (skey == "Spinx")
-            spin_local[sind * 3] = atof(sval.c_str());
+            Spin[sind * 3] = atof(sval.c_str());
          else if (skey == "Spiny")
-            spin_local[sind * 3 + 1] = atof(sval.c_str());
+            Spin[sind * 3 + 1] = atof(sval.c_str());
          else if (skey == "Spinz")
-            spin_local[sind * 3 + 2] = atof(sval.c_str());
+            Spin[sind * 3 + 2] = atof(sval.c_str());
          else if (skey == "Pmomx")
-            pmom_local[sind * 3] = atof(sval.c_str());
+            Pmom[sind * 3] = atof(sval.c_str());
          else if (skey == "Pmomy")
-            pmom_local[sind * 3 + 1] = atof(sval.c_str());
+            Pmom[sind * 3 + 1] = atof(sval.c_str());
          else if (skey == "Pmomz")
-            pmom_local[sind * 3 + 2] = atof(sval.c_str());
+            Pmom[sind * 3 + 2] = atof(sval.c_str());
        }
      }
      inf.close();
@@ -376,7 +362,7 @@ void bssnEScalar_class::Read_Ansorg()
                                      cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
                                      cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
                                      cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
-                                      mass_local, Porg_here, pmom_local, spin_local, BH_NM);
+                                      Mass, Porg_here, Pmom, Spin, BH_NM);
          }
          if (BL == Pp->data->ble)
            break;
@@ -418,7 +404,7 @@ void bssnEScalar_class::Read_Ansorg()
                                       cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
                                       cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
                                       cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
-                                       mass_local, Porg_here, pmom_local, spin_local, BH_NM);
+                                       Mass, Porg_here, Pmom, Spin, BH_NM);
        }
        if (BL == Pp->data->ble)
          break;
@@ -429,9 +415,6 @@ void bssnEScalar_class::Read_Ansorg()
 #endif
    delete[] Porg_here;
    delete[] pmom_local;
    delete[] spin_local;
    delete[] mass_local;
    // dump read_in initial data
    //   for(int lev=0;lev<GH->levels;lev++) Parallel::Dump_Data(GH->PatL[lev],StateList,0,PhysTime,dT);
  }
@@ -472,9 +455,6 @@ void bssnEScalar_class::Read_Pablo()
    }
    int BH_NM;
    double *Porg_here;
    double *pmom_local;
    double *spin_local;
    double *mass_local;
    // read parameter from file
    {
      const int LEN = 256;
@@ -516,9 +496,9 @@ void bssnEScalar_class::Read_Pablo()
    }
    Porg_here = new double[3 * BH_NM];
-    pmom_local = new double[3 * BH_NM];
+    Pmom = new double[3 * BH_NM];
-    spin_local = new double[3 * BH_NM];
+    Spin = new double[3 * BH_NM];
-    mass_local = new double[BH_NM];
+    Mass = new double[BH_NM];
    // read parameter from file
    {
      const int LEN = 256;
@@ -553,7 +533,7 @@ void bssnEScalar_class::Read_Pablo()
        if (sgrp == "BSSN" && sind < BH_NM)
        {
          if (skey == "Mass")
-            mass_local[sind] = atof(sval.c_str());
+            Mass[sind] = atof(sval.c_str());
          else if (skey == "Porgx")
            Porg_here[sind * 3] = atof(sval.c_str());
          else if (skey == "Porgy")
@@ -561,17 +541,17 @@ void bssnEScalar_class::Read_Pablo()
          else if (skey == "Porgz")
            Porg_here[sind * 3 + 2] = atof(sval.c_str());
          else if (skey == "Spinx")
-            spin_local[sind * 3] = atof(sval.c_str());
+            Spin[sind * 3] = atof(sval.c_str());
          else if (skey == "Spiny")
-            spin_local[sind * 3 + 1] = atof(sval.c_str());
+            Spin[sind * 3 + 1] = atof(sval.c_str());
          else if (skey == "Spinz")
-            spin_local[sind * 3 + 2] = atof(sval.c_str());
+            Spin[sind * 3 + 2] = atof(sval.c_str());
          else if (skey == "Pmomx")
-            pmom_local[sind * 3] = atof(sval.c_str());
+            Pmom[sind * 3] = atof(sval.c_str());
          else if (skey == "Pmomy")
-            pmom_local[sind * 3 + 1] = atof(sval.c_str());
+            Pmom[sind * 3 + 1] = atof(sval.c_str());
          else if (skey == "Pmomz")
-            pmom_local[sind * 3 + 2] = atof(sval.c_str());
+            Pmom[sind * 3 + 2] = atof(sval.c_str());
        }
      }
      inf.close();
@@ -618,7 +598,7 @@ void bssnEScalar_class::Read_Pablo()
                                        cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
                                        cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
                                        cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
-                                        mass_local, Porg_here, pmom_local, spin_local, BH_NM);
+                                        Mass, Porg_here, Pmom, Spin, BH_NM);
            }
            if (BL == Pp->data->ble)
              break;
@@ -682,7 +662,7 @@ void bssnEScalar_class::Read_Pablo()
                                         cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
                                         cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
                                         cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
-                                         mass_local, Porg_here, pmom_local, spin_local, BH_NM);
+                                         Mass, Porg_here, Pmom, Spin, BH_NM);
          }
          if (BL == Pp->data->ble)
            break;
@@ -706,9 +686,6 @@ void bssnEScalar_class::Read_Pablo()
 #endif
    delete[] Porg_here;
    delete[] pmom_local;
    delete[] spin_local;
    delete[] mass_local;
    if (flag && myrank == 0)
      MPI_Abort(MPI_COMM_WORLD, 1);
    // dump read_in initial data
@@ -762,7 +739,7 @@ void bssnEScalar_class::Step(int lev, int YN)
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
-        if (BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+        if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                       cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                       cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -1016,8 +993,7 @@ void bssnEScalar_class::Step(int lev, int YN)
  }
 #endif
-  Parallel::AsyncSyncState async_pre;
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
  sync_predictor_start(lev, SynchList_pre, async_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -1036,7 +1012,6 @@ void bssnEScalar_class::Step(int lev, int YN)
    }
  }
 #endif
  sync_predictor_finish(lev, async_pre, SynchList_pre);
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -1106,7 +1081,7 @@ void bssnEScalar_class::Step(int lev, int YN)
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif
-          if (BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+          if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                         cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -1374,8 +1349,7 @@ void bssnEScalar_class::Step(int lev, int YN)
    }
 #endif
-    Parallel::AsyncSyncState async_cor;
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
    sync_corrector_start(lev, SynchList_cor, async_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -1394,7 +1368,6 @@ void bssnEScalar_class::Step(int lev, int YN)
      }
    }
 #endif
    sync_corrector_finish(lev, async_cor, SynchList_cor);
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
    {
@@ -1862,11 +1835,8 @@ void bssnEScalar_class::AnalysisStuff_EScalar(int lev, double dT_lev)
 //================================================================================================
-void bssnEScalar_class::Interp_Constraint(bool infg)
+void bssnEScalar_class::Interp_Constraint()
 {
  if (!infg)
    return;
  // we do not support a_lev != 0 yet.
  if (a_lev > 0)
    return;
@@ -1888,7 +1858,7 @@ void bssnEScalar_class::Interp_Constraint(bool infg)
          if (myrank == cg->rank)
          {
            if (lev > 0)
-              BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+              f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                         cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                                         cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                         cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -2108,7 +2078,7 @@ void bssnEScalar_class::Constraint_Out()
            if (myrank == cg->rank)
            {
              if (lev > 0)
-                BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+                f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                           cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                                           cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                           cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
--- a/AMSS_NCKU_source/bssnEScalar_class.h
+++ b/AMSS_NCKU_source/bssnEScalar_class.h
@@ -51,7 +51,7 @@ public:
     void Compute_Psi4(int lev);
     void Step(int lev, int YN);
     void AnalysisStuff_EScalar(int lev, double dT_lev);
-     void Interp_Constraint(bool infg);
+     void Interp_Constraint();
     void Constraint_Out(); 
 protected:
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -299,28 +299,6 @@ bssn_class::bssn_class(double Couranti, double StartTimei, double TotalTimei,
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  // Derived classes override Initialize(), so ownership-sensitive members must
  // be in a known state before any specialized setup path runs.
  GH = 0;
  SH = 0;
  PhysTime = 0.0;
  BH_num = 0;
  BH_num_input = 0;
  Porg0 = 0;
  Porgbr = 0;
  Porg = 0;
  Porg1 = 0;
  Porg_rhs = 0;
  Mass = 0;
  Pmom = 0;
  Spin = 0;
  sync_cache_pre = 0;
  sync_cache_cor = 0;
  sync_cache_rp_coarse = 0;
  sync_cache_rp_fine = 0;
  sync_cache_restrict = 0;
  sync_cache_outbd = 0;
  // setup Monitors
  {
    stringstream a_stream;
@@ -1008,7 +986,13 @@ void bssn_class::Initialize()
    Setup_Black_Hole_position();
  }
-  setup_transfer_caches();
+  // Initialize sync caches (per-level, for predictor and corrector)
  sync_cache_pre = new Parallel::SyncCache[GH->levels];
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
 }
 //================================================================================================
@@ -1263,7 +1247,30 @@ bssn_class::~bssn_class()
 #endif
  // Destroy sync caches before GH
-  destroy_transfer_caches();
+  if (sync_cache_pre)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_pre[i].destroy();
    delete[] sync_cache_pre;
  }
  if (sync_cache_cor)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_cor[i].destroy();
    delete[] sync_cache_cor;
  }
  if (sync_cache_rp_coarse)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_rp_coarse[i].destroy();
    delete[] sync_cache_rp_coarse;
  }
  if (sync_cache_rp_fine)
  {
    for (int i = 0; i < GH->levels; i++)
      sync_cache_rp_fine[i].destroy();
    delete[] sync_cache_rp_fine;
  }
  delete GH;
 #ifdef WithShell
@@ -2482,7 +2489,7 @@ void bssn_class::Evolve(int Steps)
    GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
               SynchList_cor, OldStateList, StateList, SynchList_pre,
               fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
-    invalidate_transfer_caches();
+    for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
    STEP_TIMER_ADD(TB_REGRID, timer_regrid);
 #endif
@@ -2723,7 +2730,7 @@ void bssn_class::RecursiveStep(int lev)
  {
  if (ConstraintRefreshLevels)
    ConstraintRefreshLevels[lev] = 1;
-  invalidate_transfer_caches();
+  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
  }
  STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel);
 #endif
@@ -2904,7 +2911,7 @@ void bssn_class::ParallelStep()
  if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-    invalidate_transfer_caches();
+  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
 }
@@ -3068,10 +3075,10 @@ void bssn_class::ParallelStep()
      if (lev + 1 >= GH->movls)
      {
        //	       GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0,
-          if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
+        if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
-                              SynchList_cor, OldStateList, StateList, SynchList_pre,
+                            SynchList_cor, OldStateList, StateList, SynchList_pre,
-                              fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
+                            fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
-            invalidate_transfer_caches();
+        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
        //               a_stream.clear();
        //               a_stream.str("");
@@ -3086,7 +3093,7 @@ void bssn_class::ParallelStep()
      if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                          SynchList_cor, OldStateList, StateList, SynchList_pre,
                          fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-        invalidate_transfer_caches();
+      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
      //               a_stream.clear();
      //               a_stream.str("");
@@ -3105,7 +3112,7 @@ void bssn_class::ParallelStep()
          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-            invalidate_transfer_caches();
+          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
          //               a_stream.clear();
          //               a_stream.str("");
@@ -3121,7 +3128,7 @@ void bssn_class::ParallelStep()
          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-            invalidate_transfer_caches();
+          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
          //               a_stream.clear();
          //               a_stream.str("");
@@ -3652,7 +3659,7 @@ void bssn_class::Step(int lev, int YN)
  STEP_TIMER_DECL(timer_predictor_sync);
  Parallel::AsyncSyncState async_pre;
-  sync_predictor_start(lev, SynchList_pre, async_pre);
+  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -3671,7 +3678,7 @@ void bssn_class::Step(int lev, int YN)
    }
  }
 #endif
-  sync_predictor_finish(lev, async_pre, SynchList_pre);
+  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #ifdef WithShell
  // Complete non-blocking error reduction and check
@@ -4017,7 +4024,7 @@ void bssn_class::Step(int lev, int YN)
    STEP_TIMER_DECL(timer_corrector_sync);
    Parallel::AsyncSyncState async_cor;
-    sync_corrector_start(lev, SynchList_cor, async_cor);
+    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -4036,7 +4043,7 @@ void bssn_class::Step(int lev, int YN)
      }
    }
 #endif
-    sync_corrector_finish(lev, async_cor, SynchList_cor);
+    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #ifdef WithShell
    // Complete non-blocking error reduction and check
@@ -4525,7 +4532,7 @@ void bssn_class::Step(int lev, int YN)
 #endif
  Parallel::AsyncSyncState async_pre;
-  sync_predictor_start(lev, SynchList_pre, async_pre);
+  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -4544,7 +4551,7 @@ void bssn_class::Step(int lev, int YN)
    }
  }
 #endif
-  sync_predictor_finish(lev, async_pre, SynchList_pre);
+  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #ifdef WithShell
  // Complete non-blocking error reduction and check
@@ -4873,7 +4880,7 @@ void bssn_class::Step(int lev, int YN)
 #endif
    Parallel::AsyncSyncState async_cor;
-    sync_corrector_start(lev, SynchList_cor, async_cor);
+    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -4892,7 +4899,7 @@ void bssn_class::Step(int lev, int YN)
      }
    }
 #endif
-    sync_corrector_finish(lev, async_cor, SynchList_cor);
+    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #ifdef WithShell
    // Complete non-blocking error reduction and check
@@ -5284,7 +5291,7 @@ void bssn_class::Step(int lev, int YN)
  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");
-  sync_evolution(lev, SynchList_pre, sync_cache_pre);
+  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
  // Complete non-blocking error reduction and check
  MPI_Wait(&err_req, MPI_STATUS_IGNORE);
@@ -5485,7 +5492,7 @@ void bssn_class::Step(int lev, int YN)
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");
-    sync_evolution(lev, SynchList_cor, sync_cache_cor);
+    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync");
@@ -6074,92 +6081,6 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //
 // SynchList_cor  old -----------
 {
 #if (ABEtype == 1 || ABEtype == 2)
 #if (PSTR == 1 || PSTR == 2)
 //  stringstream a_stream;
 //  a_stream.setf(ios::left);
 #endif
  if (lev > 0)
  {
    MyList<Patch> *Pp, *Ppc;
    if (lev > trfls && YN == 0)
    {
      Pp = GH->PatL[lev - 1];
      while (Pp)
      {
        if (BB)
          Parallel::prepare_inter_time_level(Pp->data, SL, OL, corL,
                                             SynchList_pre, 0);
        else
          Parallel::prepare_inter_time_level(Pp->data, SL, OL,
                                             SynchList_pre, 0);
        Pp = Pp->next;
      }
 #if (RPB == 0)
      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
 #elif (RPB == 1)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif
      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
 #endif
    }
    else
    {
 #if (RPB == 0)
      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #elif (RPB == 1)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
 #endif
      Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
 #endif
    }
    Parallel::Sync(GH->PatL[lev], SL, Symmetry);
  }
  return;
 #endif
  STEP_TIMER_DECL(timer_restrict_prolong);
 #if (PSTR == 1 || PSTR == 2)
 //  stringstream a_stream;
@@ -6202,7 +6123,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
-      restrict_evolution(lev, SL, SynchList_pre);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -6215,7 +6136,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //       misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
 #endif
-      sync_evolution(lev - 1, SynchList_pre, sync_cache_rp_coarse);
+      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
 #if (PSTR == 1 || PSTR == 2)
 //       a_stream.clear();
@@ -6226,7 +6147,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      outbdlow2hi_evolution(lev, SynchList_pre, SL);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #endif
@@ -6253,7 +6174,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
-      restrict_evolution(lev, SL, SL);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
@@ -6266,7 +6187,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //       misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
 #endif
-      sync_evolution(lev - 1, SL, sync_cache_rp_coarse);
+      Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
 #if (PSTR == 1 || PSTR == 2)
 //       a_stream.clear();
@@ -6277,7 +6198,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      outbdlow2hi_evolution(lev, SL, SL);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #endif
@@ -6294,7 +6215,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
    }
-    sync_evolution(lev, SL, sync_cache_rp_fine);
+    Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
 #if (PSTR == 1 || PSTR == 2)
 //    a_stream.clear();
@@ -6323,91 +6244,6 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
 //
 // SynchList_cor  old -----------
 {
 #if (ABEtype == 1 || ABEtype == 2)
  if (lev >= GH->levels - 1)
    return;
  lev = lev + 1;
  if (lev > 0)
  {
    MyList<Patch> *Pp, *Ppc;
    if (lev > trfls && YN == 0)
    {
      Pp = GH->PatL[lev - 1];
      while (Pp)
      {
        if (BB)
          Parallel::prepare_inter_time_level(Pp->data, SL, OL, corL,
                                             SynchList_pre, 0);
        else
          Parallel::prepare_inter_time_level(Pp->data, SL, OL,
                                             SynchList_pre, 0);
        Pp = Pp->next;
      }
 #if (RPB == 0)
      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
 #elif (RPB == 1)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif
      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
 #endif
    }
    else
    {
 #if (RPB == 0)
      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #elif (RPB == 1)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
 #endif
      Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
 #endif
    }
    Parallel::Sync(GH->PatL[lev], SL, Symmetry);
  }
  return;
 #endif
  STEP_TIMER_DECL(timer_restrict_prolong);
  //  misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"starting RestrictProlong_aux");
@@ -6433,17 +6269,17 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      }
 #if (RPB == 0)
-      restrict_evolution(lev, SL, SynchList_pre);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif
-      sync_evolution(lev - 1, SynchList_pre, sync_cache_rp_coarse);
+      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      outbdlow2hi_evolution(lev, SynchList_pre, SL);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #endif
@@ -6455,17 +6291,17 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
    else // no time refinement levels and for all same time levels
    {
 #if (RPB == 0)
-      restrict_evolution(lev, SL, SL);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
 #endif
-      sync_evolution(lev - 1, SL, sync_cache_rp_coarse);
+      Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      outbdlow2hi_evolution(lev, SL, SL);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #endif
@@ -6475,11 +6311,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
 #endif
    }
-#if (ABEtype == 1 || ABEtype == 2)
+    Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
    Parallel::Sync(GH->PatL[lev], SL, Symmetry);
 #else
    sync_evolution(lev, SL, sync_cache_rp_fine);
 #endif
  }
  STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
 }
@@ -6492,93 +6324,8 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
 void bssn_class::RestrictProlong(int lev, int YN, bool BB)
 {
  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
 #if (ABEtype == 1 || ABEtype == 2)
  if (lev > 0)
  {
    MyList<Patch> *Pp, *Ppc;
    if (lev > trfls && YN == 0)
    {
      if (myrank == 0)
        cout << "/=: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
      Pp = GH->PatL[lev - 1];
      while (Pp)
      {
        if (BB)
          Parallel::prepare_inter_time_level(Pp->data, StateList, OldStateList, SynchList_cor,
                                             SynchList_pre, 0);
        else
          Parallel::prepare_inter_time_level(Pp->data, StateList, OldStateList,
                                             SynchList_pre, 0);
        Pp = Pp->next;
      }
 #if (RPB == 0)
      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
 #elif (RPB == 1)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif
      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
 #endif
    }
    else
    {
      if (myrank == 0)
        cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
 #if (RPB == 0)
      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
 #elif (RPB == 1)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
 #endif
      Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
 #endif
    }
    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
  }
  return;
 #endif
  STEP_TIMER_DECL(timer_restrict_prolong);
  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
  // we assume  for fine
  // SynchList_cor 1   -----------
  //
@@ -6611,17 +6358,17 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      }
 #if (RPB == 0)
-      restrict_evolution(lev, SynchList_cor, SynchList_pre);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif
-      sync_evolution(lev - 1, SynchList_pre, sync_cache_rp_coarse);
+      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      outbdlow2hi_evolution(lev, SynchList_pre, SynchList_cor);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #endif
@@ -6635,17 +6382,17 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      if (myrank == 0)
        cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
 #if (RPB == 0)
-      restrict_evolution(lev, SynchList_cor, StateList);
+      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,StateList,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
 #endif
-      sync_evolution(lev - 1, StateList, sync_cache_rp_coarse);
+      Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      outbdlow2hi_evolution(lev, StateList, SynchList_cor);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #endif
@@ -6655,7 +6402,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
 #endif
    }
-    sync_evolution(lev, SynchList_cor, sync_cache_rp_fine);
+    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
  }
  STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
 }
@@ -6687,7 +6434,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      outbdlow2hi_evolution(lev, SynchList_pre, SynchList_cor);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #endif
@@ -6700,7 +6447,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
    {
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      outbdlow2hi_evolution(lev, StateList, SynchList_cor);
+      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #endif
@@ -6719,10 +6466,10 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
 #else
      Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
 #endif
-      sync_evolution(lev - 1, StateList, sync_cache_rp_coarse);
+      Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
    }
-    sync_evolution(lev, SynchList_cor, sync_cache_rp_fine);
+    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
  }
 }
 #undef MIXOUTB
@@ -7452,169 +7199,6 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
    }
  }
 }
 // Report whether the cached transfer path is compiled in.
 // Returns true when the preprocessor macro BSSN_USE_TRANSFER_CACHE evaluates
 // to non-zero, false otherwise. The decision is made at compile time; no
 // object state is read.
 bool bssn_class::use_transfer_cache() const
 {
 #if BSSN_USE_TRANSFER_CACHE
  return true;
 #else
  return false;
 #endif
 }
 // Reset the six per-level SyncCache array pointers and, when caching is
 // enabled and GH is available, allocate each array with GH->levels entries.
 // NOTE(review): the pointers are overwritten with 0 without freeing whatever
 // they pointed to before -- presumably callers run destroy_transfer_caches()
 // first when re-initializing; confirm against the call sites.
 void bssn_class::setup_transfer_caches()
 {
  // Start from a known "no cache" state.
  sync_cache_pre = 0;
  sync_cache_cor = 0;
  sync_cache_rp_coarse = 0;
  sync_cache_rp_fine = 0;
  sync_cache_restrict = 0;
  sync_cache_outbd = 0;
  // Leave every pointer null when caching is compiled out or GH is null.
  if (!use_transfer_cache() || !GH)
    return;
  // One cache slot per level for each kind of transfer.
  sync_cache_pre = new Parallel::SyncCache[GH->levels];
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
 }
 // Call invalidate() on every per-level entry of all six SyncCache arrays.
 // Does nothing unless caching is enabled, GH is non-null and all six arrays
 // have been allocated.
 void bssn_class::invalidate_transfer_caches()
 {
  // All-or-nothing guard: a single missing array skips the whole pass.
  if (!use_transfer_cache() || !GH || !sync_cache_pre || !sync_cache_cor ||
      !sync_cache_rp_coarse || !sync_cache_rp_fine || !sync_cache_restrict || !sync_cache_outbd)
    return;
  for (int il = 0; il < GH->levels; il++)
  {
    sync_cache_pre[il].invalidate();
    sync_cache_cor[il].invalidate();
    sync_cache_rp_coarse[il].invalidate();
    sync_cache_rp_fine[il].invalidate();
    sync_cache_restrict[il].invalidate();
    sync_cache_outbd[il].invalidate();
  }
 }
 // Release the six per-level SyncCache arrays.
 // For each non-null array: call destroy() on its GH->levels entries (only
 // when caching is enabled and GH is non-null), delete[] the array, and reset
 // the pointer to 0 so a repeated call is a no-op for that array.
 // NOTE(review): the element loops use the current GH->levels; this assumes
 // the level count has not changed since setup_transfer_caches() -- confirm.
 void bssn_class::destroy_transfer_caches()
 {
  // Predictor-sync caches.
  if (sync_cache_pre)
  {
    if (use_transfer_cache() && GH)
      for (int i = 0; i < GH->levels; i++)
        sync_cache_pre[i].destroy();
    delete[] sync_cache_pre;
    sync_cache_pre = 0;
  }
  // Corrector-sync caches.
  if (sync_cache_cor)
  {
    if (use_transfer_cache() && GH)
      for (int i = 0; i < GH->levels; i++)
        sync_cache_cor[i].destroy();
    delete[] sync_cache_cor;
    sync_cache_cor = 0;
  }
  // Caches passed to sync_evolution() as sync_cache_rp_coarse.
  if (sync_cache_rp_coarse)
  {
    if (use_transfer_cache() && GH)
      for (int i = 0; i < GH->levels; i++)
        sync_cache_rp_coarse[i].destroy();
    delete[] sync_cache_rp_coarse;
    sync_cache_rp_coarse = 0;
  }
  // Caches passed to sync_evolution() as sync_cache_rp_fine.
  if (sync_cache_rp_fine)
  {
    if (use_transfer_cache() && GH)
      for (int i = 0; i < GH->levels; i++)
        sync_cache_rp_fine[i].destroy();
    delete[] sync_cache_rp_fine;
    sync_cache_rp_fine = 0;
  }
  // Caches used by restrict_evolution().
  if (sync_cache_restrict)
  {
    if (use_transfer_cache() && GH)
      for (int i = 0; i < GH->levels; i++)
        sync_cache_restrict[i].destroy();
    delete[] sync_cache_restrict;
    sync_cache_restrict = 0;
  }
  // Caches used by outbdlow2hi_evolution().
  if (sync_cache_outbd)
  {
    if (use_transfer_cache() && GH)
      for (int i = 0; i < GH->levels; i++)
        sync_cache_outbd[i].destroy();
    delete[] sync_cache_outbd;
    sync_cache_outbd = 0;
  }
 }
 // Begin the predictor-step synchronization of VarList on level `lev`.
 // With caching enabled this calls Parallel::Sync_start using
 // sync_cache_pre[lev] and the caller-owned async_state; otherwise it calls
 // Parallel::Sync and async_state is left untouched.
 // The matching call is sync_predictor_finish().
 void bssn_class::sync_predictor_start(int lev, MyList<var> *VarList, Parallel::AsyncSyncState &async_state)
 {
  if (use_transfer_cache())
    Parallel::Sync_start(GH->PatL[lev], VarList, Symmetry, sync_cache_pre[lev], async_state);
  else
    Parallel::Sync(GH->PatL[lev], VarList, Symmetry);
 }
 // Finish the predictor-step synchronization started by
 // sync_predictor_start() on level `lev`.
 // With caching enabled this calls Parallel::Sync_finish on
 // sync_cache_pre[lev]; with caching disabled it does nothing, because the
 // start routine already called Parallel::Sync in that configuration.
 void bssn_class::sync_predictor_finish(int lev, Parallel::AsyncSyncState &async_state, MyList<var> *VarList)
 {
  if (use_transfer_cache())
    Parallel::Sync_finish(sync_cache_pre[lev], async_state, VarList, Symmetry);
 }
 // Begin the corrector-step synchronization of VarList on level `lev`.
 // Same structure as sync_predictor_start(), but uses sync_cache_cor[lev]:
 // Parallel::Sync_start when caching is enabled, Parallel::Sync otherwise.
 // The matching call is sync_corrector_finish().
 void bssn_class::sync_corrector_start(int lev, MyList<var> *VarList, Parallel::AsyncSyncState &async_state)
 {
  if (use_transfer_cache())
    Parallel::Sync_start(GH->PatL[lev], VarList, Symmetry, sync_cache_cor[lev], async_state);
  else
    Parallel::Sync(GH->PatL[lev], VarList, Symmetry);
 }
 // Finish the corrector-step synchronization started by
 // sync_corrector_start() on level `lev`.
 // With caching enabled this calls Parallel::Sync_finish on
 // sync_cache_cor[lev]; with caching disabled it does nothing, because the
 // start routine already called Parallel::Sync in that configuration.
 void bssn_class::sync_corrector_finish(int lev, Parallel::AsyncSyncState &async_state, MyList<var> *VarList)
 {
  if (use_transfer_cache())
    Parallel::Sync_finish(sync_cache_cor[lev], async_state, VarList, Symmetry);
 }
 // Synchronize VarList on level `lev`.
 // Uses Parallel::Sync_cached with cache_array[lev] when caching is enabled
 // and a cache array was supplied; otherwise falls back to Parallel::Sync.
 // cache_array is a per-level array (indexed by `lev`), e.g.
 // sync_cache_rp_coarse or sync_cache_rp_fine; passing a null pointer
 // selects the uncached path.
 void bssn_class::sync_evolution(int lev, MyList<var> *VarList, Parallel::SyncCache *cache_array)
 {
  if (use_transfer_cache() && cache_array)
    Parallel::Sync_cached(GH->PatL[lev], VarList, Symmetry, cache_array[lev]);
  else
    Parallel::Sync(GH->PatL[lev], VarList, Symmetry);
 }
 // Run the restriction between the patch lists of level `lev - 1` and
 // level `lev`, passing src_var_list and dst_var_list through unchanged.
 // Uses Parallel::Restrict_cached with sync_cache_restrict[lev] when caching
 // is enabled, Parallel::Restrict otherwise. Requires lev >= 1 because
 // GH->PatL[lev - 1] is accessed.
 // NOTE(review): unlike sync_evolution(), the cached path does not check that
 // sync_cache_restrict is non-null before indexing it -- confirm that
 // setup_transfer_caches() always ran with a valid GH before this is called.
 void bssn_class::restrict_evolution(int lev, MyList<var> *src_var_list, MyList<var> *dst_var_list)
 {
  if (use_transfer_cache())
    Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], src_var_list, dst_var_list, Symmetry, sync_cache_restrict[lev]);
  else
    Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], src_var_list, dst_var_list, Symmetry);
 }
 // Run the OutBdLow2Hi transfer between the patches of level `lev - 1` and
 // level `lev`, passing src_var_list and dst_var_list through unchanged.
 // With caching enabled a single Parallel::OutBdLow2Hi_cached call handles
 // both patch lists using sync_cache_outbd[lev]; otherwise
 // Parallel::OutBdLow2Hi is invoked once for every (level lev - 1 patch,
 // level lev patch) pair. Requires lev >= 1.
 // NOTE(review): the cached path indexes sync_cache_outbd without a null
 // check -- confirm it is always allocated when caching is enabled.
 void bssn_class::outbdlow2hi_evolution(int lev, MyList<var> *src_var_list, MyList<var> *dst_var_list)
 {
  if (use_transfer_cache())
  {
    Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], src_var_list, dst_var_list, Symmetry, sync_cache_outbd[lev]);
    return;
  }
  // Uncached fallback: outer loop walks the level lev - 1 patch list.
  MyList<Patch> *Ppc = GH->PatL[lev - 1];
  while (Ppc)
  {
    // Inner loop restarts at the head of the level lev patch list each time.
    MyList<Patch> *Pp = GH->PatL[lev];
    while (Pp)
    {
      Parallel::OutBdLow2Hi(Ppc->data, Pp->data, src_var_list, dst_var_list, Symmetry);
      Pp = Pp->next;
    }
    Ppc = Ppc->next;
  }
 }
 #endif
 //================================================================================================
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -33,14 +33,6 @@ using namespace std;
 extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN);
 #ifndef BSSN_USE_TRANSFER_CACHE
 #define BSSN_USE_TRANSFER_CACHE 1
 #endif
 #ifndef BSSN_USE_ESCALAR_C_KERNEL
 #define BSSN_USE_ESCALAR_C_KERNEL 1
 #endif
 class bssn_class
 {
 public:
@@ -179,17 +171,6 @@ public:
       void testOutBd();
       bool check_Stdin_Abort(); 
       bool use_transfer_cache() const;
       void setup_transfer_caches();
       void invalidate_transfer_caches();
       void destroy_transfer_caches();
       void sync_predictor_start(int lev, MyList<var> *VarList, Parallel::AsyncSyncState &async_state);
       void sync_predictor_finish(int lev, Parallel::AsyncSyncState &async_state, MyList<var> *VarList);
       void sync_corrector_start(int lev, MyList<var> *VarList, Parallel::AsyncSyncState &async_state);
       void sync_corrector_finish(int lev, Parallel::AsyncSyncState &async_state, MyList<var> *VarList);
       void sync_evolution(int lev, MyList<var> *VarList, Parallel::SyncCache *cache_array = 0);
       void restrict_evolution(int lev, MyList<var> *src_var_list, MyList<var> *dst_var_list);
       void outbdlow2hi_evolution(int lev, MyList<var> *src_var_list, MyList<var> *dst_var_list);
       virtual void Setup_Initial_Data_Cao();
       virtual void Setup_Initial_Data_Lousto();
--- a/AMSS_NCKU_source/bssn_em_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_em_rhs_c.C
@@ -1,323 +0,0 @@
 #include "macrodef.h"
 #include "bssn_rhs.h"
 #include "share_func.h"
 #include "tool.h"
 #include <cstddef>
 /*
 * C version of the BSSN-EM RHS kernel — replaces empart.f90 + bssn_rhs.f90 for BSSN+Maxwell.
 *
 * Computes:
 *   1. All metric and EM field derivatives
 *   2. Physical metric, Christoffel-like terms
 *   3. EM field RHS (E, B, Kpsi, Kphi)
 *   4. Stress-energy tensor (rho, Si, Sij)
 *   5. Calls f_compute_rhs_bssn (C BSSN RHS) with stress-energy
 *   6. Advection + KO dissipation for EM fields
 *   7. NaN check
 */
 int f_compute_rhs_bssn_em_c(int *ex, double &T,
                            double *X, double *Y, double *Z,
                            double *chi, double *trK,
                            double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                            double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                            double *Gamx, double *Gamy, double *Gamz,
                            double *Lap, double *betax, double *betay, double *betaz,
                            double *dtSfx, double *dtSfy, double *dtSfz,
                            double *Ex,  double *Ey,  double *Ez,
                            double *Bx,  double *By,  double *Bz,
                            double *Kpsi, double *Kphi,
                            double *Jx, double *Jy, double *Jz, double *qchar,
                            double *chi_rhs, double *trK_rhs,
                            double *gxx_rhs, double *gxy_rhs, double *gxz_rhs,
                            double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                            double *Axx_rhs, double *Axy_rhs, double *Axz_rhs,
                            double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                            double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                            double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                            double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                            double *Ex_rhs,  double *Ey_rhs,  double *Ez_rhs,
                            double *Bx_rhs,  double *By_rhs,  double *Bz_rhs,
                            double *Kpsi_rhs, double *Kphi_rhs,
                            double *rho, double *Sx, double *Sy, double *Sz,
                            double *Sxx, double *Sxy, double *Sxz,
                            double *Syy, double *Syz, double *Szz,
                            double *Gamxxx, double *Gamxxy, double *Gamxxz,
                            double *Gamxyy, double *Gamxyz, double *Gamxzz,
                            double *Gamyxx, double *Gamyxy, double *Gamyxz,
                            double *Gamyyy, double *Gamyyz, double *Gamyzz,
                            double *Gamzxx, double *Gamzxy, double *Gamzxz,
                            double *Gamzyy, double *Gamzyz, double *Gamzzz,
                            double *Rxx, double *Rxy, double *Rxz,
                            double *Ryy, double *Ryz, double *Rzz,
                            double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
                            double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
                            int &Symmetry, int &Lev, double &eps, int &co)
 {
    (void)T;
    int gont = 0;
    const int nx = ex[0], ny = ex[1], nz = ex[2];
    const int all = nx * ny * nz;
    const size_t n = (size_t)all;
    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, FOUR = 4.0, EIT = 8.0;
    const double HALF = 0.5, THR = 3.0, F3o2 = 1.5, PI = 3.14159265358979323846;
    const double SYM = 1.0, ANTI = -1.0;
    const double kappa = 1.0;
    const double SSS[3]={SYM,SYM,SYM},   AAS[3]={ANTI,ANTI,SYM};
    const double ASA[3]={ANTI,SYM,ANTI}, SAA[3]={SYM,ANTI,ANTI};
    const double ASS[3]={ANTI,SYM,SYM},   SAS[3]={SYM,ANTI,SYM};
    const double SSA[3]={SYM,SYM,ANTI};
    /* ---- allocate temporary arrays ---- */
    double *chix = (double*)malloc(n*sizeof(double));
    double *chiy = (double*)malloc(n*sizeof(double));
    double *chiz = (double*)malloc(n*sizeof(double));
    double *Exx=(double*)malloc(n*sizeof(double)),*Exy=(double*)malloc(n*sizeof(double)),*Exz=(double*)malloc(n*sizeof(double));
    double *Eyx=(double*)malloc(n*sizeof(double)),*Eyy=(double*)malloc(n*sizeof(double)),*Eyz=(double*)malloc(n*sizeof(double));
    double *Ezx=(double*)malloc(n*sizeof(double)),*Ezy=(double*)malloc(n*sizeof(double)),*Ezz=(double*)malloc(n*sizeof(double));
    double *Bxx=(double*)malloc(n*sizeof(double)),*Bxy=(double*)malloc(n*sizeof(double)),*Bxz=(double*)malloc(n*sizeof(double));
    double *Byx=(double*)malloc(n*sizeof(double)),*Byy=(double*)malloc(n*sizeof(double)),*Byz=(double*)malloc(n*sizeof(double));
    double *Bzx=(double*)malloc(n*sizeof(double)),*Bzy=(double*)malloc(n*sizeof(double)),*Bzz=(double*)malloc(n*sizeof(double));
    double *Kpsix=(double*)malloc(n*sizeof(double)),*Kpsiy=(double*)malloc(n*sizeof(double)),*Kpsiz=(double*)malloc(n*sizeof(double));
    double *Kphix=(double*)malloc(n*sizeof(double)),*Kphiy=(double*)malloc(n*sizeof(double)),*Kphiz=(double*)malloc(n*sizeof(double));
    double *Lapx=(double*)malloc(n*sizeof(double)),*Lapy=(double*)malloc(n*sizeof(double)),*Lapz=(double*)malloc(n*sizeof(double));
    double *betaxx=(double*)malloc(n*sizeof(double)),*betaxy=(double*)malloc(n*sizeof(double)),*betaxz=(double*)malloc(n*sizeof(double));
    double *betayx=(double*)malloc(n*sizeof(double)),*betayy=(double*)malloc(n*sizeof(double)),*betayz=(double*)malloc(n*sizeof(double));
    double *betazx=(double*)malloc(n*sizeof(double)),*betazy=(double*)malloc(n*sizeof(double)),*betazz=(double*)malloc(n*sizeof(double));
    double *gxxx=(double*)malloc(n*sizeof(double)),*gxxy=(double*)malloc(n*sizeof(double)),*gxxz=(double*)malloc(n*sizeof(double));
    double *gxyx=(double*)malloc(n*sizeof(double)),*gxyy=(double*)malloc(n*sizeof(double)),*gxyz=(double*)malloc(n*sizeof(double));
    double *gxzx=(double*)malloc(n*sizeof(double)),*gxzy=(double*)malloc(n*sizeof(double)),*gxzz=(double*)malloc(n*sizeof(double));
    double *gyyx=(double*)malloc(n*sizeof(double)),*gyyy=(double*)malloc(n*sizeof(double)),*gyyz=(double*)malloc(n*sizeof(double));
    double *gyzx=(double*)malloc(n*sizeof(double)),*gyzy=(double*)malloc(n*sizeof(double)),*gyzz=(double*)malloc(n*sizeof(double));
    double *gzzx=(double*)malloc(n*sizeof(double)),*gzzy=(double*)malloc(n*sizeof(double)),*gzzz=(double*)malloc(n*sizeof(double));
    double *gupxx=(double*)malloc(n*sizeof(double)),*gupxy=(double*)malloc(n*sizeof(double)),*gupxz=(double*)malloc(n*sizeof(double));
    double *gupyy=(double*)malloc(n*sizeof(double)),*gupyz=(double*)malloc(n*sizeof(double)),*gupzz=(double*)malloc(n*sizeof(double));
    if (!chix||!chiy||!chiz||!Exx||!Exy||!Exz||!Eyx||!Eyy||!Eyz||!Ezx||!Ezy||!Ezz||
        !Bxx||!Bxy||!Bxz||!Byx||!Byy||!Byz||!Bzx||!Bzy||!Bzz||
        !Kpsix||!Kpsiy||!Kpsiz||!Kphix||!Kphiy||!Kphiz||
        !Lapx||!Lapy||!Lapz||
        !betaxx||!betaxy||!betaxz||!betayx||!betayy||!betayz||!betazx||!betazy||!betazz||
        !gxxx||!gxxy||!gxxz||!gxyx||!gxyy||!gxyz||!gxzx||!gxzy||!gxzz||
        !gyyx||!gyyy||!gyyz||!gyzx||!gyzy||!gyzz||!gzzx||!gzzy||!gzzz||
        !gupxx||!gupxy||!gupxz||!gupyy||!gupyz||!gupzz) {
        gont = 1;
    }
    /* ==== 1. Compute all derivatives ==== */
    if (!gont) {
    /* metric derivatives */
    fderivs(ex, Lap, Lapx, Lapy, Lapz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, betax, betaxx, betaxy, betaxz, X, Y, Z, ANTI, SYM, SYM, Symmetry, Lev);
    fderivs(ex, betay, betayx, betayy, betayz, X, Y, Z, SYM, ANTI, SYM, Symmetry, Lev);
    fderivs(ex, betaz, betazx, betazy, betazz, X, Y, Z, SYM, SYM, ANTI, Symmetry, Lev);
    fderivs(ex, chi, chix, chiy, chiz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, dxx, gxxx, gxxy, gxxz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, gxy, gxyx, gxyy, gxyz, X, Y, Z, ANTI, ANTI, SYM, Symmetry, Lev);
    fderivs(ex, gxz, gxzx, gxzy, gxzz, X, Y, Z, ANTI, SYM, ANTI, Symmetry, Lev);
    fderivs(ex, dyy, gyyx, gyyy, gyyz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, gyz, gyzx, gyzy, gyzz, X, Y, Z, SYM, ANTI, ANTI, Symmetry, Lev);
    fderivs(ex, dzz, gzzx, gzzy, gzzz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    /* EM field derivatives */
    fderivs(ex, Kpsi, Kpsix, Kpsiy, Kpsiz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, Kphi, Kphix, Kphiy, Kphiz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, Ex, Exx, Exy, Exz, X, Y, Z, ANTI, SYM, SYM, Symmetry, Lev);
    fderivs(ex, Ey, Eyx, Eyy, Eyz, X, Y, Z, SYM, ANTI, SYM, Symmetry, Lev);
    fderivs(ex, Ez, Ezx, Ezy, Ezz, X, Y, Z, SYM, SYM, ANTI, Symmetry, Lev);
    fderivs(ex, Bx, Bxx, Bxy, Bxz, X, Y, Z, SYM, ANTI, ANTI, Symmetry, Lev);
    fderivs(ex, By, Byx, Byy, Byz, X, Y, Z, ANTI, SYM, ANTI, Symmetry, Lev);
    fderivs(ex, Bz, Bzx, Bzy, Bzz, X, Y, Z, ANTI, ANTI, SYM, Symmetry, Lev);
    /* ==== 2. Compute EM RHS and stress-energy ==== */
    const double F1o4PI = ONE / (FOUR * PI);
    for (size_t i = 0; i < n; ++i) {
        const double alpn1 = Lap[i] + ONE;
        const double chin1 = chi[i] + ONE;
        const double chi3o2 = sqrt(chin1) * chin1;  // chi^{3/2}
        const double ichi = ONE / chin1;
        /* physical metric */
        const double pgxx = (dxx[i] + ONE) * ichi;
        const double pgyy = (dyy[i] + ONE) * ichi;
        const double pgzz = (dzz[i] + ONE) * ichi;
        const double pgxy = gxy[i] * ichi;
        const double pgxz = gxz[i] * ichi;
        const double pgyz = gyz[i] * ichi;
        /* inverse physical metric */
        const double det = pgxx * pgyy * pgzz + pgxy * pgyz * pgxz + pgxz * pgxy * pgyz
                         - pgxz * pgyy * pgxz - pgxy * pgxy * pgzz - pgxx * pgyz * pgyz;
        const double idet = ONE / det;
        const double upxx = (pgyy * pgzz - pgyz * pgyz) * idet;
        const double upxy = -(pgxy * pgzz - pgyz * pgxz) * idet;
        const double upxz = (pgxy * pgyz - pgyy * pgxz) * idet;
        const double upyy = (pgxx * pgzz - pgxz * pgxz) * idet;
        const double upyz = -(pgxx * pgyz - pgxy * pgxz) * idet;
        const double upzz = (pgxx * pgyy - pgxy * pgxy) * idet;
        gupxx[i]=upxx; gupxy[i]=upxy; gupxz[i]=upxz;
        gupyy[i]=upyy; gupyz[i]=upyz; gupzz[i]=upzz;
        /* E-field RHS */
        /* curl(B) part: epsilon^{ijk} ∂_j (alpha * B_k) in coordinate basis */
        /* Using lower-index B fields: B_i_lower = pg_{ij} * B^j */
        const double BxL = pgxx*Bx[i] + pgxy*By[i] + pgxz*Bz[i];
        const double ByL = pgxy*Bx[i] + pgyy*By[i] + pgyz*Bz[i];
        const double BzL = pgxz*Bx[i] + pgyz*By[i] + pgzz*Bz[i];
        /* Physical metric derivatives (chain rule from conformal) */
        const double pgxx_x = (gxxx[i] - pgxx*chix[i]) * ichi;
        /* const double pgxx_y = (gxxy[i] - pgxx*chiy[i]) * ichi; */
        const double pgxy_x = (gxyx[i] - pgxy*chix[i]) * ichi;
        const double pgxy_y = (gxyy[i] - pgxy*chiy[i]) * ichi;
        const double pgxz_x = (gxzx[i] - pgxz*chix[i]) * ichi;
        const double pgxz_z = (gxzz[i] - pgxz*chiz[i]) * ichi;
        const double pgyy_y = (gyyy[i] - pgyy*chiy[i]) * ichi;
        const double pgyz_y = (gyzy[i] - pgyz*chiy[i]) * ichi;
        const double pgyz_z = (gyzz[i] - pgyz*chiz[i]) * ichi;
        const double pgzz_z = (gzzz[i] - pgzz*chiz[i]) * ichi;
        /* Curl_x(B) = ∂_y (alpha*BzL) - ∂_z (alpha*ByL) */
        const double aBx = alpn1*BxL, aBy = alpn1*ByL, aBz = alpn1*BzL;
        const double curlBx = (aBz*Lapy[i] + alpn1*(pgxz*Bxy[i]+pgyz*Byy[i]+pgzz*Bzy[i]) + alpn1*(Bx[i]*gxzy[i]+By[i]*gyzy[i]+Bz[i]*gzzy[i]))
                            - (aBy*Lapz[i] + alpn1*(pgxy*Bxz[i]+pgyy*Byz[i]+pgyz*Bzz[i]) + alpn1*(Bx[i]*gxyz[i]+By[i]*gyyz[i]+Bz[i]*gyzz[i]));
        double curlBy = (aBx*Lapz[i] + alpn1*(pgxx*Bxz[i]+pgxy*Byz[i]+pgxz*Bzz[i]) + alpn1*(Bx[i]*gxxz[i]+By[i]*gxyz[i]+Bz[i]*gxzz[i]))
                      - (aBz*Lapx[i] + alpn1*(pgxz*Bxx[i]+pgyz*Byx[i]+pgzz*Bzx[i]) + alpn1*(Bx[i]*gxzx[i]+By[i]*gyzx[i]+Bz[i]*gzzx[i]));
        double curlBz = (aBy*Lapx[i] + alpn1*(pgxy*Bxx[i]+pgyy*Byx[i]+pgyz*Bzx[i]) + alpn1*(Bx[i]*gxyx[i]+By[i]*gyyx[i]+Bz[i]*gyzx[i]))
                      - (aBx*Lapy[i] + alpn1*(pgxx*Bxy[i]+pgxy*Byy[i]+pgxz*Bzy[i]) + alpn1*(Bx[i]*gxxy[i]+By[i]*gxyy[i]+Bz[i]*gxzy[i]));
        /* Advection part: -beta^j * ∂_j E^i */
        const double advEx = Ex[i]*betaxx[i] + Ey[i]*betaxy[i] + Ez[i]*betaxz[i];
        const double advEy = Ex[i]*betayx[i] + Ey[i]*betayy[i] + Ez[i]*betayz[i];
        const double advEz = Ex[i]*betazx[i] + Ey[i]*betazy[i] + Ez[i]*betazz[i];
        /* grad(Kpsi) contracted with inverse metric */
        const double gupKx = upxx*Kpsix[i] + upxy*Kpsiy[i] + upxz*Kpsiz[i];
        const double gupKy = upxy*Kpsix[i] + upyy*Kpsiy[i] + upyz*Kpsiz[i];
        const double gupKz = upxz*Kpsix[i] + upyz*Kpsiy[i] + upzz*Kpsiz[i];
        Ex_rhs[i] = alpn1*trK[i]*Ex[i] - advEx - FOUR*PI*alpn1*Jx[i] - alpn1*gupKx + chi3o2*curlBx;
        Ey_rhs[i] = alpn1*trK[i]*Ey[i] - advEy - FOUR*PI*alpn1*Jy[i] - alpn1*gupKy + chi3o2*curlBy;
        Ez_rhs[i] = alpn1*trK[i]*Ez[i] - advEz - FOUR*PI*alpn1*Jz[i] - alpn1*gupKz + chi3o2*curlBz;
        /* B-field RHS: similar but with -chi^{3/2} * curl(E) and grad(Kphi) */
        const double ExL = pgxx*Ex[i] + pgxy*Ey[i] + pgxz*Ez[i];
        const double EyL = pgxy*Ex[i] + pgyy*Ey[i] + pgyz*Ez[i];
        const double EzL = pgxz*Ex[i] + pgyz*Ey[i] + pgzz*Ez[i];
        const double aEx = alpn1*ExL, aEy = alpn1*EyL, aEz = alpn1*EzL;
        const double curlEx = (aEz*Lapy[i] + alpn1*(pgxz*Exy[i]+pgyz*Eyy[i]+pgzz*Ezy[i]) + alpn1*(Ex[i]*gxzy[i]+Ey[i]*gyzy[i]+Ez[i]*gzzy[i]))
                            - (aEy*Lapz[i] + alpn1*(pgxy*Exz[i]+pgyy*Eyz[i]+pgyz*Ezz[i]) + alpn1*(Ex[i]*gxyz[i]+Ey[i]*gyyz[i]+Ez[i]*gyzz[i]));
        double curlEy = (aEx*Lapz[i] + alpn1*(pgxx*Exz[i]+pgxy*Eyz[i]+pgxz*Ezz[i]) + alpn1*(Ex[i]*gxxz[i]+Ey[i]*gxyz[i]+Ez[i]*gxzz[i]))
                      - (aEz*Lapx[i] + alpn1*(pgxz*Exx[i]+pgyz*Eyx[i]+pgzz*Ezx[i]) + alpn1*(Ex[i]*gxzx[i]+Ey[i]*gyzx[i]+Ez[i]*gzzx[i]));
        double curlEz = (aEy*Lapx[i] + alpn1*(pgxy*Exx[i]+pgyy*Eyx[i]+pgyz*Ezx[i]) + alpn1*(Ex[i]*gxyx[i]+Ey[i]*gyyx[i]+Ez[i]*gyzx[i]))
                      - (aEx*Lapy[i] + alpn1*(pgxx*Exy[i]+pgxy*Eyy[i]+pgxz*Ezy[i]) + alpn1*(Ex[i]*gxxy[i]+Ey[i]*gxyy[i]+Ez[i]*gxzy[i]));
        const double advBx = Bx[i]*betaxx[i] + By[i]*betaxy[i] + Bz[i]*betaxz[i];
        const double advBy = Bx[i]*betayx[i] + By[i]*betayy[i] + Bz[i]*betayz[i];
        const double advBz = Bx[i]*betazx[i] + By[i]*betazy[i] + Bz[i]*betazz[i];
        const double gupKphix = upxx*Kphix[i] + upxy*Kphiy[i] + upxz*Kphiz[i];
        const double gupKphiy = upxy*Kphix[i] + upyy*Kphiy[i] + upyz*Kphiz[i];
        const double gupKphiz = upxz*Kphix[i] + upyz*Kphiy[i] + upzz*Kphiz[i];
        Bx_rhs[i] = alpn1*trK[i]*Bx[i] - advBx - alpn1*gupKphix - chi3o2*curlEx;
        By_rhs[i] = alpn1*trK[i]*By[i] - advBy - alpn1*gupKphiy - chi3o2*curlEy;
        Bz_rhs[i] = alpn1*trK[i]*Bz[i] - advBz - alpn1*gupKphiz - chi3o2*curlEz;
        /* Scalar potential RHS */
        const double divE = Exx[i] + Eyy[i] + Ezz[i];
        const double divB = Bxx[i] + Byy[i] + Bzz[i];
        const double chiCont = F3o2 * ichi * (chix[i]*Ex[i] + chiy[i]*Ey[i] + chiz[i]*Ez[i]);
        Kpsi_rhs[i] = FOUR*PI*alpn1*qchar[i] - alpn1*kappa*Kpsi[i] - alpn1*(divE - chiCont);
        Kphi_rhs[i] = -alpn1*kappa*Kphi[i] - alpn1*(divB - F3o2*ichi*(chix[i]*Bx[i] + chiy[i]*By[i] + chiz[i]*Bz[i]));
        /* Stress-energy tensor */
        const double E2 = pgxx*Ex[i]*Ex[i] + pgyy*Ey[i]*Ey[i] + pgzz*Ez[i]*Ez[i]
                        + TWO*(pgxy*Ex[i]*Ey[i] + pgxz*Ex[i]*Ez[i] + pgyz*Ey[i]*Ez[i]);
        const double B2 = pgxx*Bx[i]*Bx[i] + pgyy*By[i]*By[i] + pgzz*Bz[i]*Bz[i]
                        + TWO*(pgxy*Bx[i]*By[i] + pgxz*Bx[i]*Bz[i] + pgyz*By[i]*Bz[i]);
        rho[i] = (E2 + B2) / (EIT * PI);
        const double ichi3o2 = ONE / chi3o2;
        Sx[i] = (Ey[i]*Bz[i] - Ez[i]*By[i]) * F1o4PI * ichi3o2;
        Sy[i] = (Ez[i]*Bx[i] - Ex[i]*Bz[i]) * F1o4PI * ichi3o2;
        Sz[i] = (Ex[i]*By[i] - Ey[i]*Bx[i]) * F1o4PI * ichi3o2;
        const double lExi = pgxx*Ex[i] + pgxy*Ey[i] + pgxz*Ez[i];
        const double lEyi = pgxy*Ex[i] + pgyy*Ey[i] + pgyz*Ez[i];
        const double lEzi = pgxz*Ex[i] + pgyz*Ey[i] + pgzz*Ez[i];
        const double lBxi = pgxx*Bx[i] + pgxy*By[i] + pgxz*Bz[i];
        const double lByi = pgxy*Bx[i] + pgyy*By[i] + pgyz*Bz[i];
        const double lBzi = pgxz*Bx[i] + pgyz*By[i] + pgzz*Bz[i];
        Sxx[i] = rho[i]*pgxx - (lExi*lExi + lBxi*lBxi) * F1o4PI;
        Sxy[i] = rho[i]*pgxy - (lExi*lEyi + lBxi*lByi) * F1o4PI;
        Sxz[i] = rho[i]*pgxz - (lExi*lEzi + lBxi*lBzi) * F1o4PI;
        Syy[i] = rho[i]*pgyy - (lEyi*lEyi + lByi*lByi) * F1o4PI;
        Syz[i] = rho[i]*pgyz - (lEyi*lEzi + lByi*lBzi) * F1o4PI;
        Szz[i] = rho[i]*pgzz - (lEzi*lEzi + lBzi*lBzi) * F1o4PI;
    }
    /* ==== 3. Call BSSN RHS with EM stress-energy ==== */
    gont = f_compute_rhs_bssn(ex, T, X, Y, Z,
        chi, trK, dxx, gxy, gxz, dyy, gyz, dzz,
        Axx, Axy, Axz, Ayy, Ayz, Azz,
        Gamx, Gamy, Gamz, Lap, betax, betay, betaz, dtSfx, dtSfy, dtSfz,
        chi_rhs, trK_rhs,
        gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
        Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
        Gamx_rhs, Gamy_rhs, Gamz_rhs, Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
        dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
        rho, Sx, Sy, Sz, Sxx, Sxy, Sxz, Syy, Syz, Szz,
        Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
        Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
        Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
        Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
        ham_Res, movx_Res, movy_Res, movz_Res,
        Gmx_Res, Gmy_Res, Gmz_Res,
        Symmetry, Lev, eps, co);
    if (!gont) {
    /* ==== 4. Advection terms for EM fields ==== */
    lopsided(ex, X, Y, Z, Kpsi, Kpsi_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Kphi, Kphi_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Ex,   Ex_rhs,   betax, betay, betaz, Symmetry, ASS);
    lopsided(ex, X, Y, Z, Ey,   Ey_rhs,   betax, betay, betaz, Symmetry, SAS);
    lopsided(ex, X, Y, Z, Ez,   Ez_rhs,   betax, betay, betaz, Symmetry, SSA);
    lopsided(ex, X, Y, Z, Bx,   Bx_rhs,   betax, betay, betaz, Symmetry, SAA);
    lopsided(ex, X, Y, Z, By,   By_rhs,   betax, betay, betaz, Symmetry, ASA);
    lopsided(ex, X, Y, Z, Bz,   Bz_rhs,   betax, betay, betaz, Symmetry, AAS);
    /* ==== 5. KO dissipation for EM fields ==== */
    if (eps > ZEO) {
        kodis(ex, X, Y, Z, Kpsi, Kpsi_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Kphi, Kphi_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Ex,   Ex_rhs,   ASS, Symmetry, eps);
        kodis(ex, X, Y, Z, Ey,   Ey_rhs,   SAS, Symmetry, eps);
        kodis(ex, X, Y, Z, Ez,   Ez_rhs,   SSA, Symmetry, eps);
        kodis(ex, X, Y, Z, Bx,   Bx_rhs,   SAA, Symmetry, eps);
        kodis(ex, X, Y, Z, By,   By_rhs,   ASA, Symmetry, eps);
        kodis(ex, X, Y, Z, Bz,   Bz_rhs,   AAS, Symmetry, eps);
    }
    /* ==== 6. NaN check ==== */
        for (int i = 0; i < all; ++i) {
            if (!isfinite(Ex_rhs[i]+Ey_rhs[i]+Ez_rhs[i]+Bx_rhs[i]+By_rhs[i]+Bz_rhs[i]+Kpsi_rhs[i]+Kphi_rhs[i])) {
                gont = 1; break;
            }
        }
        } /* inner if (!gont) */
    } /* outer if (!gont) */
    free(chix);free(chiy);free(chiz);
    free(Exx);free(Exy);free(Exz);free(Eyx);free(Eyy);free(Eyz);free(Ezx);free(Ezy);free(Ezz);
    free(Bxx);free(Bxy);free(Bxz);free(Byx);free(Byy);free(Byz);free(Bzx);free(Bzy);free(Bzz);
    free(Kpsix);free(Kpsiy);free(Kpsiz);
    free(Kphix);free(Kphiy);free(Kphiz);
    free(Lapx);free(Lapy);free(Lapz);
    free(betaxx);free(betaxy);free(betaxz);free(betayx);free(betayy);free(betayz);free(betazx);free(betazy);free(betazz);
    free(gxxx);free(gxxy);free(gxxz);free(gxyx);free(gxyy);free(gxyz);free(gxzx);free(gxzy);free(gxzz);
    free(gyyx);free(gyyy);free(gyyz);free(gyzx);free(gyzy);free(gyzz);free(gzzx);free(gzzy);free(gzzz);
    free(gupxx);free(gupxy);free(gupxz);free(gupyy);free(gupyz);free(gupzz);
    return gont;
 }
--- a/AMSS_NCKU_source/bssn_escalar_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_escalar_rhs_c.C
@@ -1,169 +0,0 @@
 #include "macrodef.h"
 #include "bssn_rhs.h"
 #include "share_func.h"
 #include "tool.h"
 #include <vector>
 namespace
 {
    // Reuse the temporary workspace across block calls to avoid repeated heap churn
    // in the EScalar wrapper. MPI ranks execute this path sequentially, so a single
    // process-local buffer is sufficient here.
    //
    // The buffer only ever grows: f_compute_rhs_bssn_escalar_c resizes it when the
    // current block needs more than is already available and never shrinks it.
    // NOTE(review): a single shared buffer is not safe if the wrapper is ever
    // entered by more than one thread of the same process at once - confirm that
    // no threaded caller exists.
    std::vector<double> g_escalar_tmp_store;
 }
 /*
  * Map the portable name f_frpotential onto the linker symbol of the external
  * routine. Exactly which of fortran1/fortran2/fortran3 is defined is decided
  * elsewhere (presumably in macrodef.h - confirm):
  *   fortran1 -> lowercase, no decoration        (frpotential)
  *   fortran2 -> uppercase, no decoration        (FRPOTENTIAL)
  *   fortran3 -> lowercase, trailing underscore  (frpotential_)
  */
 #ifdef fortran1
 #define f_frpotential frpotential
 #endif
 #ifdef fortran2
 #define f_frpotential FRPOTENTIAL
 #endif
 #ifdef fortran3
 #define f_frpotential frpotential_
 #endif
 extern "C"
 {
    // Called below as f_frpotential(ex, Sphi, V, dVdSphi): grid extents, then
    // three double arrays. Presumably the last two are outputs (potential and
    // its derivative with respect to Sphi) - verify against the implementation.
    void f_frpotential(int *, double *, double *, double *);
 }
 /*
  * Right-hand side of the BSSN system coupled to a scalar field (Sphi, Spi).
  *
  * Steps, in order:
  *   1. carve 17 scratch arrays (one grid block each) out of the shared
  *      g_escalar_tmp_store buffer;
  *   2. take first derivatives of chi, Lap and Sphi and second derivatives
  *      of Sphi, and evaluate the potential through f_frpotential;
  *   3. per grid point, fill Sphi_rhs, Spi_rhs and the matter source arrays
  *      rho, Sx..Sz, Sxx..Szz;
  *   4. call f_compute_rhs_bssn with those source arrays;
  *   5. apply lopsided_kodis to Sphi and Spi;
  *   6. scan Sphi_rhs, Spi_rhs and rho for NaN.
  *
  * Inputs are the evolved fields and coordinates; every *_rhs array, the
  * matter arrays, and (via f_compute_rhs_bssn) the Christoffel, Ricci and
  * constraint arrays are passed through as writable buffers.
  *
  * Returns 1 when f_compute_rhs_bssn reports a nonzero status or a NaN is
  * found in step 6, otherwise 0.
  */
 int f_compute_rhs_bssn_escalar_c(int *ex, double &T,
                                 double *X, double *Y, double *Z,
                                 double *chi, double *trK,
                                 double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                                 double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                                 double *Gamx, double *Gamy, double *Gamz,
                                 double *Lap, double *betax, double *betay, double *betaz,
                                 double *dtSfx, double *dtSfy, double *dtSfz,
                                 double *Sphi, double *Spi,
                                 double *chi_rhs, double *trK_rhs,
                                 double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                                 double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                                 double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                                 double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                                 double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                                 double *Sphi_rhs, double *Spi_rhs,
                                 double *rho, double *Sx, double *Sy, double *Sz,
                                 double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                                 double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                                 double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                                 double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                                 double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                                 double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
                                 double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
                                 int &Symmetry, int &Lev, double &eps, int &co)
 {
    // Number of grid points in this block (product of the three extents).
    const int nx = ex[0], ny = ex[1], nz = ex[2];
    const int all = nx * ny * nz;
    // 17 scratch arrays are handed out below; grow the shared buffer only
    // when this block is larger than anything seen so far.
    const size_t workspace_size = size_t(all) * 17;
    if (g_escalar_tmp_store.size() < workspace_size)
        g_escalar_tmp_store.resize(workspace_size);
    // Bump allocator over the shared buffer: each call returns the current
    // cursor and advances it by n blocks of `all` doubles. Nothing is freed;
    // the buffer is simply reused from the start on the next call.
    double *tmp_ptr = g_escalar_tmp_store.data();
    auto alloc_tmp = [&](int n = 1) -> double *
    {
        double *ptr = tmp_ptr;
        tmp_ptr += size_t(all) * n;
        return ptr;
    };
    // 3 + 3 + 6 + 3 + 2 = 17 arrays, matching workspace_size above.
    double *chix = alloc_tmp(), *chiy = alloc_tmp(), *chiz = alloc_tmp();
    double *Kx = alloc_tmp(), *Ky = alloc_tmp(), *Kz = alloc_tmp();
    double *fxx = alloc_tmp(), *fxy = alloc_tmp(), *fxz = alloc_tmp();
    double *fyy = alloc_tmp(), *fyz = alloc_tmp(), *fzz = alloc_tmp();
    double *Lapx = alloc_tmp(), *Lapy = alloc_tmp(), *Lapz = alloc_tmp();
    double *V = alloc_tmp(), *dVdSphi = alloc_tmp();
    // NOTE: ZEO is declared but not referenced anywhere in this function.
    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, HALF = 0.5;
    const double SSS[3] = {1.0, 1.0, 1.0};
    // All derivative calls pass 1.0 for each of the three symmetry factors.
    // Kx, Ky, Kz hold the first derivatives of Sphi; fxx..fzz its second
    // derivatives.
    fderivs(ex, chi, chix, chiy, chiz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Lap, Lapx, Lapy, Lapz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Sphi, Kx, Ky, Kz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, Sphi, fxx, fxy, fxz, fyy, fyz, fzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    // Presumably fills V and dVdSphi from Sphi - verify against frpotential.
    f_frpotential(ex, Sphi, V, dVdSphi);
    for (int i = 0; i < all; ++i)
    {
        // Lap, chi and the diagonal metric components are stored as offsets
        // from one; add it back before use.
        const double alpn1 = Lap[i] + ONE;
        const double chin1 = chi[i] + ONE;
        const double gxx = dxx[i] + ONE;
        const double gyy = dyy[i] + ONE;
        const double gzz = dzz[i] + ONE;
        // Determinant and cofactor inverse of the symmetric 3x3 matrix
        // (gxx, gxy, gxz, gyy, gyz, gzz).
        const double det = gxx * gyy * gzz + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i]
                         - gxz[i] * gyy * gxz[i] - gxy[i] * gxy[i] * gzz - gxx * gyz[i] * gyz[i];
        const double gupxx = (gyy * gzz - gyz[i] * gyz[i]) / det;
        const double gupxy = -(gxy[i] * gzz - gyz[i] * gxz[i]) / det;
        const double gupxz = (gxy[i] * gyz[i] - gyy * gxz[i]) / det;
        const double gupyy = (gxx * gzz - gxz[i] * gxz[i]) / det;
        const double gupyz = -(gxx * gyz[i] - gxy[i] * gxz[i]) / det;
        const double gupzz = (gxx * gyy - gxy[i] * gxy[i]) / det;
        Sphi_rhs[i] = alpn1 * Spi[i];
        // Spi_rhs is accumulated in three stages.
        // Stage 1: inverse-metric contraction of the second derivatives of
        // Sphi, minus (Gam^k + gup^{kj} chi_j / (2 chin1)) * d_k Sphi.
        Spi_rhs[i] = gupxx * fxx[i] + gupyy * fyy[i] + gupzz * fzz[i]
                   + TWO * (gupxy * fxy[i] + gupxz * fxz[i] + gupyz * fyz[i])
                   - ((Gamx[i] + (gupxx * chix[i] + gupxy * chiy[i] + gupxz * chiz[i]) / TWO / chin1) * Kx[i]
                   +  (Gamy[i] + (gupxy * chix[i] + gupyy * chiy[i] + gupyz * chiz[i]) / TWO / chin1) * Ky[i]
                   +  (Gamz[i] + (gupxz * chix[i] + gupyz * chiy[i] + gupzz * chiz[i]) / TWO / chin1) * Kz[i]);
        // Stage 2: scale by alpn1 and add gup^{jk} d_j Lap d_k Sphi.
        Spi_rhs[i] = Spi_rhs[i] * alpn1
                   + gupxx * Lapx[i] * Kx[i] + gupxy * Lapx[i] * Ky[i] + gupxz * Lapx[i] * Kz[i]
                   + gupxy * Lapy[i] * Kx[i] + gupyy * Lapy[i] * Ky[i] + gupyz * Lapy[i] * Kz[i]
                   + gupxz * Lapz[i] * Kx[i] + gupyz * Lapz[i] * Ky[i] + gupzz * Lapz[i] * Kz[i];
        // Stage 3: scale by chin1 and add alpn1 * (trK * Spi - dV/dSphi).
        Spi_rhs[i] = Spi_rhs[i] * chin1 + alpn1 * (trK[i] * Spi[i] - dVdSphi[i]);
        // rho = chin1 * (1/2) gup^{jk} d_j Sphi d_k Sphi + (1/2) Spi^2 + V;
        // the off-diagonal terms appear once because the factor two from
        // symmetry cancels the one half.
        rho[i] = chin1 * ((gupxx * Kx[i] * Kx[i] + gupyy * Ky[i] * Ky[i] + gupzz * Kz[i] * Kz[i]) * HALF
               + gupxy * Kx[i] * Ky[i] + gupxz * Kx[i] * Kz[i] + gupyz * Ky[i] * Kz[i])
               + Spi[i] * Spi[i] * HALF + V[i];
        Sx[i] = -Spi[i] * Kx[i];
        Sy[i] = -Spi[i] * Ky[i];
        Sz[i] = -Spi[i] * Kz[i];
        // Common factor multiplying the metric in S_ij: (rho - Spi^2) / chin1.
        const double pressure = (rho[i] - Spi[i] * Spi[i]) / chin1;
        Sxx[i] = Kx[i] * Kx[i] - pressure * gxx;
        Sxy[i] = Kx[i] * Ky[i] - pressure * gxy[i];
        Sxz[i] = Kx[i] * Kz[i] - pressure * gxz[i];
        Syy[i] = Ky[i] * Ky[i] - pressure * gyy;
        Syz[i] = Ky[i] * Kz[i] - pressure * gyz[i];
        Szz[i] = Kz[i] * Kz[i] - pressure * gzz;
    }
    // Hand the matter arrays filled above to the BSSN RHS routine; a nonzero
    // return aborts immediately. No cleanup is needed because the scratch
    // arrays live in the shared buffer.
    if (f_compute_rhs_bssn(ex, T, X, Y, Z,
                           chi, trK,
                           dxx, gxy, gxz, dyy, gyz, dzz,
                           Axx, Axy, Axz, Ayy, Ayz, Azz,
                           Gamx, Gamy, Gamz,
                           Lap, betax, betay, betaz,
                           dtSfx, dtSfy, dtSfz,
                           chi_rhs, trK_rhs,
                           gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
                           Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
                           Gamx_rhs, Gamy_rhs, Gamz_rhs,
                           Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
                           dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
                           rho, Sx, Sy, Sz,
                           Sxx, Sxy, Sxz, Syy, Syz, Szz,
                           Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                           Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                           Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                           Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                           ham_Res, movx_Res, movy_Res, movz_Res,
                           Gmx_Res, Gmy_Res, Gmz_Res,
                           Symmetry, Lev, eps, co))
        return 1;
    // Presumably adds the shift advection and dissipation contributions to
    // the two scalar RHS arrays - confirm against lopsided_kodis.
    lopsided_kodis(ex, X, Y, Z, Sphi, Sphi_rhs, betax, betay, betaz, Symmetry, SSS, eps);
    lopsided_kodis(ex, X, Y, Z, Spi, Spi_rhs, betax, betay, betaz, Symmetry, SSS, eps);
    // x != x holds only for NaN, so this flags NaN but not infinities.
    for (int i = 0; i < all; ++i)
    {
        if (Sphi_rhs[i] != Sphi_rhs[i] || Spi_rhs[i] != Spi_rhs[i] || rho[i] != rho[i])
            return 1;
    }
    return 0;
 }
--- a/AMSS_NCKU_source/bssn_rhs.h
+++ b/AMSS_NCKU_source/bssn_rhs.h
@@ -67,27 +67,6 @@ extern "C"
                               int &, int &, double &, int &);
 }
 // BSSN + scalar field RHS wrapper. The trailing comments name the parameters
 // of each group in the order used by the definition in bssn_escalar_rhs_c.C.
 // Returns 1 on failure and 0 otherwise.
 int f_compute_rhs_bssn_escalar_c(int *, double &, double *, double *, double *,                                                      // ex, T, X, Y, Z
                                 double *, double *,                                                                                 // chi, trK
                                 double *, double *, double *, double *, double *, double *,                                         // dxx, gxy, gxz, dyy, gyz, dzz
                                 double *, double *, double *, double *, double *, double *,                                         // Axx, Axy, Axz, Ayy, Ayz, Azz
                                 double *, double *, double *,                                                                       // Gamx, Gamy, Gamz
                                 double *, double *, double *, double *, double *, double *, double *,                               // Lap, beta{x,y,z}, dtSf{x,y,z}
                                 double *, double *,                                                                                 // Sphi, Spi
                                 double *, double *,                                                                                 // chi_rhs, trK_rhs
                                 double *, double *, double *, double *, double *, double *,                                         // g{xx,xy,xz,yy,yz,zz}_rhs
                                 double *, double *, double *, double *, double *, double *,                                         // A{xx,xy,xz,yy,yz,zz}_rhs
                                 double *, double *, double *,                                                                       // Gam{x,y,z}_rhs
                                 double *, double *, double *, double *, double *, double *, double *,                               // Lap_rhs, beta{x,y,z}_rhs, dtSf{x,y,z}_rhs
                                 double *, double *,                                                                                 // Sphi_rhs, Spi_rhs
                                 double *, double *, double *, double *, double *, double *, double *, double *, double *, double *, // rho, Sx..Sz, Sxx..Szz
                                 double *, double *, double *, double *, double *, double *,                                         // Gamx{xx,xy,xz,yy,yz,zz}
                                 double *, double *, double *, double *, double *, double *,                                         // Gamy{xx,xy,xz,yy,yz,zz}
                                 double *, double *, double *, double *, double *, double *,                                         // Gamz{xx,xy,xz,yy,yz,zz}
                                 double *, double *, double *, double *, double *, double *,                                         // Rxx, Rxy, Rxz, Ryy, Ryz, Rzz
                                 double *, double *, double *, double *, double *, double *, double *,                               // ham_Res, mov{x,y,z}_Res, Gm{x,y,z}_Res
                                 int &, int &, double &, int &);                                                                     // Symmetry, Lev, eps, co
 extern "C"
 {
        int f_compute_rhs_bssn_ss(int *, double &, double *, double *, double *,                                                      // ex,T,rho,sigma,R
@@ -262,31 +241,4 @@ extern "C"
                                     double *);
 } // FR_cons
 // BSSN-EM C kernel (replaces empart.f90 + bssn_rhs.f90 for BSSN+Maxwell)
 // The parameter groups below carry no labels in this header.
 // NOTE(review): presumably they follow the same layout as the other RHS
 // wrappers (grid/coordinates, evolved fields, *_rhs outputs, stress-energy,
 // Christoffel, Ricci, constraints, then Symmetry, Lev, eps, co) with the
 // Maxwell fields inserted - confirm against the definition before editing,
 // since a misplaced double* would still compile.
 int f_compute_rhs_bssn_em_c(int *, double &, double *, double *, double *,
                            double *, double *,
                            double *, double *, double *, double *, double *, double *,
                            double *, double *, double *, double *, double *, double *,
                            double *, double *, double *,
                            double *, double *, double *, double *, double *, double *, double *,
                            double *, double *, double *,
                            double *, double *, double *, double *, double *, double *, double *, double *,
                            double *, double *, double *,
                            double *, double *,
                            double *, double *,
                            double *, double *, double *, double *, double *, double *,
                            double *, double *, double *, double *, double *, double *,
                            double *, double *, double *,
                            double *, double *, double *, double *, double *, double *, double *,
                            double *, double *, double *, double *, double *, double *, double *, double *,
                            double *, double *, double *,
                            double *, double *, double *, double *,
                            double *, double *, double *, double *, double *, double *,
                            double *, double *, double *, double *, double *, double *,
                            double *, double *, double *, double *, double *, double *,
                            double *, double *, double *, double *, double *, double *,
                            double *, double *, double *, double *, double *, double *,
                            double *, double *, double *, double *, double *, double *,
                            int &, int &, double &, int &);
 #endif /* BSSN_H */
--- a/AMSS_NCKU_source/fdderivs_c.C
+++ b/AMSS_NCKU_source/fdderivs_c.C
--- a/AMSS_NCKU_source/fdderivs_sh_c.C
+++ b/AMSS_NCKU_source/fdderivs_sh_c.C
@@ -1,321 +0,0 @@
 #include "macrodef.h"
 #include "share_func.h"
 /*
 * fdderivs_sh — second derivatives on shell patch in (rho, sigma, R) coords.
 * Same stencil coefficients as Cartesian fdderivs. Uses symmetry_stbd.
 */
 extern "C" void fdderivs_sh_(const int ex[3],
                 const double *f,
                 double *fxx, double *fxy, double *fxz,
                 double *fyy, double *fyz, double *fzz,
                 const double *X, const double *Y, const double *Z,
                 double SYM1, double SYM2, double SYM3,
                 int Symmetry, int onoff, int sst)
 {
    (void)SYM3; (void)onoff; (void)sst;
    const int NO_SYMM=0, EQ_SYMM=1, OCTANT=2;
    const double ZEO=0.0, ONE=1.0, TWO=2.0, F1o4=2.5e-1;
    const double F8=8.0, F16=16.0, F30=30.0, F1o12=ONE/12.0, F1o144=ONE/144.0;
    const double F9=9.0, F45=45.0, F60=60.0, F27=27.0, F270=270.0, F490=490.0;
    const double F1o180=ONE/180.0, F1o3600=ONE/3600.0;
    const double F32=32.0, F128=128.0, F168=168.0, F672=672.0, F840=840.0;
    const double F1008=1008.0, F8064=8064.0, F14350=14350.0;
    const double F1o5040=ONE/5040.0, F1o705600=ONE/705600.0;
    const int ex1=ex[0], ex2=ex[1], ex3=ex[2];
    const double dX=X[1]-X[0], dY=Y[1]-Y[0], dZ=Z[1]-Z[0];
    const int imaxF=ex1, jmaxF=ex2, kmaxF=ex3;
    const double SoA[2]={SYM1,SYM2};
 #if (ghost_width == 2)
    {
        const int ord=1;
        int iminF=1,jminF=1,kminF=1;
        if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=0;
        if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=0;
        if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=0;
        const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
        static double *fh_buf=NULL;static size_t cap=0;
        if(fh_size>cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;}
        double *fh=fh_buf;if(!fh)return;
        symmetry_stbd(ord,ex,f,fh,SoA);
        const double Sdxdx=ONE/(dX*dX),Sdydy=ONE/(dY*dY),Sdzdz=ONE/(dZ*dZ);
        const double Sdxdy=F1o4/(dX*dY),Sdxdz=F1o4/(dX*dZ),Sdydz=F1o4/(dY*dZ);
        const size_t all=(size_t)ex1*ex2*ex3;
        for(size_t p=0;p<all;++p){fxx[p]=fyy[p]=fzz[p]=ZEO;fxy[p]=fxz[p]=fyz[p]=ZEO;}
        const int i2_lo=(iminF>0)?iminF:0,j2_lo=(jminF>0)?jminF:0,k2_lo=1,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2;
        #define FH(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)]
        if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){
            for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1;
            for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1;
            for(int i0=i2_lo;i0<=i2_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
                fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF));
                fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF));
                fzz[p]=Sdzdz*(FH(iF,jF,kF-1)-TWO*FH(iF,jF,kF)+FH(iF,jF,kF+1));
                fxy[p]=Sdxdy*(FH(iF-1,jF-1,kF)-FH(iF+1,jF-1,kF)-FH(iF-1,jF+1,kF)+FH(iF+1,jF+1,kF));
                fxz[p]=Sdxdz*(FH(iF-1,jF,kF-1)-FH(iF+1,jF,kF-1)-FH(iF-1,jF,kF+1)+FH(iF+1,jF,kF+1));
                fyz[p]=Sdydz*(FH(iF,jF-1,kF-1)-FH(iF,jF+1,kF-1)-FH(iF,jF-1,kF+1)+FH(iF,jF+1,kF+1));
            }}}
        }
        #undef FH
        return;
    }
 #elif (ghost_width == 3)
    {
        const int ord=2;
        int iminF=1,jminF=1,kminF=1;
        if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-1;
        if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-1;
        if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-1;
        const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
        static double *fh_buf=NULL;static size_t cap=0;
        if(fh_size>cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;}
        double *fh=fh_buf;if(!fh)return;
        symmetry_stbd(ord,ex,f,fh,SoA);
        const double Sdxdx=ONE/(dX*dX),Sdydy=ONE/(dY*dY),Sdzdz=ONE/(dZ*dZ);
        const double Fdxdx=F1o12/(dX*dX),Fdydy=F1o12/(dY*dY),Fdzdz=F1o12/(dZ*dZ);
        const double Sdxdy=F1o4/(dX*dY),Sdxdz=F1o4/(dX*dZ),Sdydz=F1o4/(dY*dZ);
        const double Fdxdy=F1o144/(dX*dY),Fdxdz=F1o144/(dX*dZ),Fdydz=F1o144/(dY*dZ);
        const size_t all=(size_t)ex1*ex2*ex3;
        for(size_t p=0;p<all;++p){fxx[p]=fyy[p]=fzz[p]=fxy[p]=fxz[p]=fyz[p]=ZEO;}
        const int i2_lo=(iminF>0)?iminF:0,j2_lo=(jminF>0)?jminF:0,k2_lo=1,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2;
        const int i4_lo=(iminF+1>0)?iminF+1:0,j4_lo=(jminF+1>0)?jminF+1:0,k4_lo=2,i4_hi=ex1-3,j4_hi=ex2-3,k4_hi=ex3-3;
        const int has4=(i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi);
        #define FH(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)]
        if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){
            for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1;
            for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1;
            for(int i0=i2_lo;i0<=i2_hi;++i0){
                if(has4&&i0>=i4_lo&&i0<=i4_hi&&j0>=j4_lo&&j0<=j4_hi&&k0>=k4_lo&&k0<=k4_hi)continue;
                const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
                fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF));
                fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF));
                fzz[p]=Sdzdz*(FH(iF,jF,kF-1)-TWO*FH(iF,jF,kF)+FH(iF,jF,kF+1));
                fxy[p]=Sdxdy*(FH(iF-1,jF-1,kF)-FH(iF+1,jF-1,kF)-FH(iF-1,jF+1,kF)+FH(iF+1,jF+1,kF));
                fxz[p]=Sdxdz*(FH(iF-1,jF,kF-1)-FH(iF+1,jF,kF-1)-FH(iF-1,jF,kF+1)+FH(iF+1,jF,kF+1));
                fyz[p]=Sdydz*(FH(iF,jF-1,kF-1)-FH(iF,jF+1,kF-1)-FH(iF,jF-1,kF+1)+FH(iF,jF+1,kF+1));
            }}}
        }
        if(has4){
            for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1;
            for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1;
            for(int i0=i4_lo;i0<=i4_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
                fxx[p]=Fdxdx*(-FH(iF-2,jF,kF)+F16*FH(iF-1,jF,kF)-F30*FH(iF,jF,kF)-FH(iF+2,jF,kF)+F16*FH(iF+1,jF,kF));
                fyy[p]=Fdydy*(-FH(iF,jF-2,kF)+F16*FH(iF,jF-1,kF)-F30*FH(iF,jF,kF)-FH(iF,jF+2,kF)+F16*FH(iF,jF+1,kF));
                fzz[p]=Fdzdz*(-FH(iF,jF,kF-2)+F16*FH(iF,jF,kF-1)-F30*FH(iF,jF,kF)-FH(iF,jF,kF+2)+F16*FH(iF,jF,kF+1));
                {const double t_jm2=(FH(iF-2,jF-2,kF)-F8*FH(iF-1,jF-2,kF)+F8*FH(iF+1,jF-2,kF)-FH(iF+2,jF-2,kF));
                 const double t_jm1=(FH(iF-2,jF-1,kF)-F8*FH(iF-1,jF-1,kF)+F8*FH(iF+1,jF-1,kF)-FH(iF+2,jF-1,kF));
                 const double t_jp1=(FH(iF-2,jF+1,kF)-F8*FH(iF-1,jF+1,kF)+F8*FH(iF+1,jF+1,kF)-FH(iF+2,jF+1,kF));
                 const double t_jp2=(FH(iF-2,jF+2,kF)-F8*FH(iF-1,jF+2,kF)+F8*FH(iF+1,jF+2,kF)-FH(iF+2,jF+2,kF));
                 fxy[p]=Fdxdy*(t_jm2-F8*t_jm1+F8*t_jp1-t_jp2);}
                {const double t_km2=(FH(iF-2,jF,kF-2)-F8*FH(iF-1,jF,kF-2)+F8*FH(iF+1,jF,kF-2)-FH(iF+2,jF,kF-2));
                 const double t_km1=(FH(iF-2,jF,kF-1)-F8*FH(iF-1,jF,kF-1)+F8*FH(iF+1,jF,kF-1)-FH(iF+2,jF,kF-1));
                 const double t_kp1=(FH(iF-2,jF,kF+1)-F8*FH(iF-1,jF,kF+1)+F8*FH(iF+1,jF,kF+1)-FH(iF+2,jF,kF+1));
                 const double t_kp2=(FH(iF-2,jF,kF+2)-F8*FH(iF-1,jF,kF+2)+F8*FH(iF+1,jF,kF+2)-FH(iF+2,jF,kF+2));
                 fxz[p]=Fdxdz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);}
                {const double t_km2=(FH(iF,jF-2,kF-2)-F8*FH(iF,jF-1,kF-2)+F8*FH(iF,jF+1,kF-2)-FH(iF,jF+2,kF-2));
                 const double t_km1=(FH(iF,jF-2,kF-1)-F8*FH(iF,jF-1,kF-1)+F8*FH(iF,jF+1,kF-1)-FH(iF,jF+2,kF-1));
                 const double t_kp1=(FH(iF,jF-2,kF+1)-F8*FH(iF,jF-1,kF+1)+F8*FH(iF,jF+1,kF+1)-FH(iF,jF+2,kF+1));
                 const double t_kp2=(FH(iF,jF-2,kF+2)-F8*FH(iF,jF-1,kF+2)+F8*FH(iF,jF+1,kF+2)-FH(iF,jF+2,kF+2));
                 fyz[p]=Fdydz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);}
            }}}
        }
        #undef FH
        return;
    }
 #elif (ghost_width == 4)
    {
        const int ord=3;
        int iminF=1,jminF=1,kminF=1;
        if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-2;
        if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-2;
        if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-2;
        const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
        static double *fh_buf=NULL;static size_t cap=0;
        if(fh_size>cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;}
        double *fh=fh_buf;if(!fh)return;
        symmetry_stbd(ord,ex,f,fh,SoA);
        const double Sdxdx=ONE/(dX*dX),Sdydy=ONE/(dY*dY),Sdzdz=ONE/(dZ*dZ);
        const double Fdxdx=F1o12/(dX*dX),Fdydy=F1o12/(dY*dY),Fdzdz=F1o12/(dZ*dZ);
        const double Xdxdx=F1o180/(dX*dX),Xdydy=F1o180/(dY*dY),Xdzdz=F1o180/(dZ*dZ);
        const double Sdxdy=F1o4/(dX*dY),Sdxdz=F1o4/(dX*dZ),Sdydz=F1o4/(dY*dZ);
        const double Fdxdy=F1o144/(dX*dY),Fdxdz=F1o144/(dX*dZ),Fdydz=F1o144/(dY*dZ);
        const double Xdxdy=F1o3600/(dX*dY),Xdxdz=F1o3600/(dX*dZ),Xdydz=F1o3600/(dY*dZ);
        const size_t all=(size_t)ex1*ex2*ex3;
        for(size_t p=0;p<all;++p){fxx[p]=fyy[p]=fzz[p]=fxy[p]=fxz[p]=fyz[p]=ZEO;}
        const int i2_lo=(iminF>0)?iminF:0,j2_lo=(jminF>0)?jminF:0,k2_lo=1,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2;
        const int i4_lo=(iminF+1>0)?iminF+1:0,j4_lo=(jminF+1>0)?jminF+1:0,k4_lo=2,i4_hi=ex1-3,j4_hi=ex2-3,k4_hi=ex3-3;
        const int i6_lo=(iminF+2>0)?iminF+2:0,j6_lo=(jminF+2>0)?jminF+2:0,k6_lo=3,i6_hi=ex1-4,j6_hi=ex2-4,k6_hi=ex3-4;
        const int has4=(i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi),has6=(i6_lo<=i6_hi&&j6_lo<=j6_hi&&k6_lo<=k6_hi);
        #define FH(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)]
        if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1;
        for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1;
        for(int i0=i2_lo;i0<=i2_hi;++i0){_Bool in4=has4&&i0>=i4_lo&&i0<=i4_hi&&j0>=j4_lo&&j0<=j4_hi&&k0>=k4_lo&&k0<=k4_hi;if(in4)continue;
            const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
            fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF));
            fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF));
            fzz[p]=Sdzdz*(FH(iF,jF,kF-1)-TWO*FH(iF,jF,kF)+FH(iF,jF,kF+1));
            fxy[p]=Sdxdy*(FH(iF-1,jF-1,kF)-FH(iF+1,jF-1,kF)-FH(iF-1,jF+1,kF)+FH(iF+1,jF+1,kF));
            fxz[p]=Sdxdz*(FH(iF-1,jF,kF-1)-FH(iF+1,jF,kF-1)-FH(iF-1,jF,kF+1)+FH(iF+1,jF,kF+1));
            fyz[p]=Sdydz*(FH(iF,jF-1,kF-1)-FH(iF,jF+1,kF-1)-FH(iF,jF-1,kF+1)+FH(iF,jF+1,kF+1));
        }}}}
        if(has4){for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1;
        for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1;
        for(int i0=i4_lo;i0<=i4_hi;++i0){if(has6&&i0>=i6_lo&&i0<=i6_hi&&j0>=j6_lo&&j0<=j6_hi&&k0>=k6_lo&&k0<=k6_hi)continue;
            const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
            fxx[p]=Fdxdx*(-FH(iF-2,jF,kF)+F16*FH(iF-1,jF,kF)-F30*FH(iF,jF,kF)-FH(iF+2,jF,kF)+F16*FH(iF+1,jF,kF));
            fyy[p]=Fdydy*(-FH(iF,jF-2,kF)+F16*FH(iF,jF-1,kF)-F30*FH(iF,jF,kF)-FH(iF,jF+2,kF)+F16*FH(iF,jF+1,kF));
            fzz[p]=Fdzdz*(-FH(iF,jF,kF-2)+F16*FH(iF,jF,kF-1)-F30*FH(iF,jF,kF)-FH(iF,jF,kF+2)+F16*FH(iF,jF,kF+1));
            {const double t_jm2=(FH(iF-2,jF-2,kF)-F8*FH(iF-1,jF-2,kF)+F8*FH(iF+1,jF-2,kF)-FH(iF+2,jF-2,kF));
             const double t_jm1=(FH(iF-2,jF-1,kF)-F8*FH(iF-1,jF-1,kF)+F8*FH(iF+1,jF-1,kF)-FH(iF+2,jF-1,kF));
             const double t_jp1=(FH(iF-2,jF+1,kF)-F8*FH(iF-1,jF+1,kF)+F8*FH(iF+1,jF+1,kF)-FH(iF+2,jF+1,kF));
             const double t_jp2=(FH(iF-2,jF+2,kF)-F8*FH(iF-1,jF+2,kF)+F8*FH(iF+1,jF+2,kF)-FH(iF+2,jF+2,kF));
             fxy[p]=Fdxdy*(t_jm2-F8*t_jm1+F8*t_jp1-t_jp2);}
            {const double t_km2=(FH(iF-2,jF,kF-2)-F8*FH(iF-1,jF,kF-2)+F8*FH(iF+1,jF,kF-2)-FH(iF+2,jF,kF-2));
             const double t_km1=(FH(iF-2,jF,kF-1)-F8*FH(iF-1,jF,kF-1)+F8*FH(iF+1,jF,kF-1)-FH(iF+2,jF,kF-1));
             const double t_kp1=(FH(iF-2,jF,kF+1)-F8*FH(iF-1,jF,kF+1)+F8*FH(iF+1,jF,kF+1)-FH(iF+2,jF,kF+1));
             const double t_kp2=(FH(iF-2,jF,kF+2)-F8*FH(iF-1,jF,kF+2)+F8*FH(iF+1,jF,kF+2)-FH(iF+2,jF,kF+2));
             fxz[p]=Fdxdz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);}
            {const double t_km2=(FH(iF,jF-2,kF-2)-F8*FH(iF,jF-1,kF-2)+F8*FH(iF,jF+1,kF-2)-FH(iF,jF+2,kF-2));
             const double t_km1=(FH(iF,jF-2,kF-1)-F8*FH(iF,jF-1,kF-1)+F8*FH(iF,jF+1,kF-1)-FH(iF,jF+2,kF-1));
             const double t_kp1=(FH(iF,jF-2,kF+1)-F8*FH(iF,jF-1,kF+1)+F8*FH(iF,jF+1,kF+1)-FH(iF,jF+2,kF+1));
             const double t_kp2=(FH(iF,jF-2,kF+2)-F8*FH(iF,jF-1,kF+2)+F8*FH(iF,jF+1,kF+2)-FH(iF,jF+2,kF+2));
             fyz[p]=Fdydz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);}
        }}}}
        if(has6){for(int k0=k6_lo;k0<=k6_hi;++k0){const int kF=k0+1;
        for(int j0=j6_lo;j0<=j6_hi;++j0){const int jF=j0+1;
        for(int i0=i6_lo;i0<=i6_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
            fxx[p]=Xdxdx*(TWO*FH(iF-3,jF,kF)-F27*FH(iF-2,jF,kF)+F270*FH(iF-1,jF,kF)-F490*FH(iF,jF,kF)+F270*FH(iF+1,jF,kF)-F27*FH(iF+2,jF,kF)+TWO*FH(iF+3,jF,kF));
            fyy[p]=Xdydy*(TWO*FH(iF,jF-3,kF)-F27*FH(iF,jF-2,kF)+F270*FH(iF,jF-1,kF)-F490*FH(iF,jF,kF)+F270*FH(iF,jF+1,kF)-F27*FH(iF,jF+2,kF)+TWO*FH(iF,jF+3,kF));
            fzz[p]=Xdzdz*(TWO*FH(iF,jF,kF-3)-F27*FH(iF,jF,kF-2)+F270*FH(iF,jF,kF-1)-F490*FH(iF,jF,kF)+F270*FH(iF,jF,kF+1)-F27*FH(iF,jF,kF+2)+TWO*FH(iF,jF,kF+3));
            #define XS6(JF,KFDUMMY) (-FH(iF-3,JF,KFDUMMY)+F9*FH(iF-2,JF,KFDUMMY)-F45*FH(iF-1,JF,KFDUMMY)+F45*FH(iF+1,JF,KFDUMMY)-F9*FH(iF+2,JF,KFDUMMY)+FH(iF+3,JF,KFDUMMY))
            fxy[p]=Xdxdy*(-XS6(jF-3,kF)+F9*XS6(jF-2,kF)-F45*XS6(jF-1,kF)+F45*XS6(jF+1,kF)-F9*XS6(jF+2,kF)+XS6(jF+3,kF));
            fxz[p]=Xdxdz*(-XS6(jF,kF-3)+F9*XS6(jF,kF-2)-F45*XS6(jF,kF-1)+F45*XS6(jF,kF+1)-F9*XS6(jF,kF+2)+XS6(jF,kF+3));
            #undef XS6
            #define YS6(JF,KFDUMMY) (-FH(iF,JF-3,KFDUMMY)+F9*FH(iF,JF-2,KFDUMMY)-F45*FH(iF,JF-1,KFDUMMY)+F45*FH(iF,JF+1,KFDUMMY)-F9*FH(iF,JF+2,KFDUMMY)+FH(iF,JF+3,KFDUMMY))
            fyz[p]=Xdydz*(-YS6(jF,kF-3)+F9*YS6(jF,kF-2)-F45*YS6(jF,kF-1)+F45*YS6(jF,kF+1)-F9*YS6(jF,kF+2)+YS6(jF,kF+3));
            #undef YS6
        }}}}
        #undef FH
        return;
    }
 #elif (ghost_width == 5)
    {
        /* 8th-order shell second derivatives — inherits 8th-order stencil coeffs from Cartesian */
        const int ord=4;
        int iminF=1,jminF=1,kminF=1;
        if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-3;
        if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-3;
        if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-3;
        const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
        static double *fh_buf=NULL;static size_t cap=0;
        if(fh_size>cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;}
        double *fh=fh_buf;if(!fh)return;
        symmetry_stbd(ord,ex,f,fh,SoA);
        const double Sdxdx=ONE/(dX*dX),Sdydy=ONE/(dY*dY),Sdzdz=ONE/(dZ*dZ);
        const double Fdxdx=F1o12/(dX*dX),Fdydy=F1o12/(dY*dY),Fdzdz=F1o12/(dZ*dZ);
        const double Xdxdx=F1o180/(dX*dX),Xdydy=F1o180/(dY*dY),Xdzdz=F1o180/(dZ*dZ);
        const double Edxdx=F1o5040/(dX*dX),Edydy=F1o5040/(dY*dY),Edzdz=F1o5040/(dZ*dZ);
        const double Sdxdy=F1o4/(dX*dY),Sdxdz=F1o4/(dX*dZ),Sdydz=F1o4/(dY*dZ);
        const double Fdxdy=F1o144/(dX*dY),Fdxdz=F1o144/(dX*dZ),Fdydz=F1o144/(dY*dZ);
        const double Xdxdy=F1o3600/(dX*dY),Xdxdz=F1o3600/(dX*dZ),Xdydz=F1o3600/(dY*dZ);
        const double Edxdy=F1o705600/(dX*dY),Edxdz=F1o705600/(dX*dZ),Edydz=F1o705600/(dY*dZ);
        const size_t all=(size_t)ex1*ex2*ex3;
        for(size_t p=0;p<all;++p){fxx[p]=fyy[p]=fzz[p]=fxy[p]=fxz[p]=fyz[p]=ZEO;}
        const int i2_lo=(iminF>0)?iminF:0,j2_lo=(jminF>0)?jminF:0,k2_lo=1,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2;
        const int i4_lo=(iminF+1>0)?iminF+1:0,j4_lo=(jminF+1>0)?jminF+1:0,k4_lo=2,i4_hi=ex1-3,j4_hi=ex2-3,k4_hi=ex3-3;
        const int i6_lo=(iminF+2>0)?iminF+2:0,j6_lo=(jminF+2>0)?jminF+2:0,k6_lo=3,i6_hi=ex1-4,j6_hi=ex2-4,k6_hi=ex3-4;
        const int i8_lo=(iminF+3>0)?iminF+3:0,j8_lo=(jminF+3>0)?jminF+3:0,k8_lo=4,i8_hi=ex1-5,j8_hi=ex2-5,k8_hi=ex3-5;
        const int has4=(i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi),has6=(i6_lo<=i6_hi&&j6_lo<=j6_hi&&k6_lo<=k6_hi),has8=(i8_lo<=i8_hi&&j8_lo<=j8_hi&&k8_lo<=k8_hi);
        #define FH(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)]
        /* 2nd-order pass */
        if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1;
        for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1;
        for(int i0=i2_lo;i0<=i2_hi;++i0){_Bool in4=has4&&i0>=i4_lo&&i0<=i4_hi&&j0>=j4_lo&&j0<=j4_hi&&k0>=k4_lo&&k0<=k4_hi;if(in4)continue;
            const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
            fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF));
            fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF));
            fzz[p]=Sdzdz*(FH(iF,jF,kF-1)-TWO*FH(iF,jF,kF)+FH(iF,jF,kF+1));
            fxy[p]=Sdxdy*(FH(iF-1,jF-1,kF)-FH(iF+1,jF-1,kF)-FH(iF-1,jF+1,kF)+FH(iF+1,jF+1,kF));
            fxz[p]=Sdxdz*(FH(iF-1,jF,kF-1)-FH(iF+1,jF,kF-1)-FH(iF-1,jF,kF+1)+FH(iF+1,jF,kF+1));
            fyz[p]=Sdydz*(FH(iF,jF-1,kF-1)-FH(iF,jF+1,kF-1)-FH(iF,jF-1,kF+1)+FH(iF,jF+1,kF+1));
        }}}}
        /* 4th-order pass */
        if(has4){for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1;
        for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1;
        for(int i0=i4_lo;i0<=i4_hi;++i0){_Bool in6=has6&&i0>=i6_lo&&i0<=i6_hi&&j0>=j6_lo&&j0<=j6_hi&&k0>=k6_lo&&k0<=k6_hi;if(in6)continue;
            const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
            fxx[p]=Fdxdx*(-FH(iF-2,jF,kF)+F16*FH(iF-1,jF,kF)-F30*FH(iF,jF,kF)-FH(iF+2,jF,kF)+F16*FH(iF+1,jF,kF));
            fyy[p]=Fdydy*(-FH(iF,jF-2,kF)+F16*FH(iF,jF-1,kF)-F30*FH(iF,jF,kF)-FH(iF,jF+2,kF)+F16*FH(iF,jF+1,kF));
            fzz[p]=Fdzdz*(-FH(iF,jF,kF-2)+F16*FH(iF,jF,kF-1)-F30*FH(iF,jF,kF)-FH(iF,jF,kF+2)+F16*FH(iF,jF,kF+1));
            {const double t_jm2=(FH(iF-2,jF-2,kF)-F8*FH(iF-1,jF-2,kF)+F8*FH(iF+1,jF-2,kF)-FH(iF+2,jF-2,kF));
             const double t_jm1=(FH(iF-2,jF-1,kF)-F8*FH(iF-1,jF-1,kF)+F8*FH(iF+1,jF-1,kF)-FH(iF+2,jF-1,kF));
             const double t_jp1=(FH(iF-2,jF+1,kF)-F8*FH(iF-1,jF+1,kF)+F8*FH(iF+1,jF+1,kF)-FH(iF+2,jF+1,kF));
             const double t_jp2=(FH(iF-2,jF+2,kF)-F8*FH(iF-1,jF+2,kF)+F8*FH(iF+1,jF+2,kF)-FH(iF+2,jF+2,kF));
             fxy[p]=Fdxdy*(t_jm2-F8*t_jm1+F8*t_jp1-t_jp2);}
            {const double t_km2=(FH(iF-2,jF,kF-2)-F8*FH(iF-1,jF,kF-2)+F8*FH(iF+1,jF,kF-2)-FH(iF+2,jF,kF-2));
             const double t_km1=(FH(iF-2,jF,kF-1)-F8*FH(iF-1,jF,kF-1)+F8*FH(iF+1,jF,kF-1)-FH(iF+2,jF,kF-1));
             const double t_kp1=(FH(iF-2,jF,kF+1)-F8*FH(iF-1,jF,kF+1)+F8*FH(iF+1,jF,kF+1)-FH(iF+2,jF,kF+1));
             const double t_kp2=(FH(iF-2,jF,kF+2)-F8*FH(iF-1,jF,kF+2)+F8*FH(iF+1,jF,kF+2)-FH(iF+2,jF,kF+2));
             fxz[p]=Fdxdz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);}
            {const double t_km2=(FH(iF,jF-2,kF-2)-F8*FH(iF,jF-1,kF-2)+F8*FH(iF,jF+1,kF-2)-FH(iF,jF+2,kF-2));
             const double t_km1=(FH(iF,jF-2,kF-1)-F8*FH(iF,jF-1,kF-1)+F8*FH(iF,jF+1,kF-1)-FH(iF,jF+2,kF-1));
             const double t_kp1=(FH(iF,jF-2,kF+1)-F8*FH(iF,jF-1,kF+1)+F8*FH(iF,jF+1,kF+1)-FH(iF,jF+2,kF+1));
             const double t_kp2=(FH(iF,jF-2,kF+2)-F8*FH(iF,jF-1,kF+2)+F8*FH(iF,jF+1,kF+2)-FH(iF,jF+2,kF+2));
             fyz[p]=Fdydz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);}
        }}}}
        /* 6th-order pass */
        if(has6){for(int k0=k6_lo;k0<=k6_hi;++k0){const int kF=k0+1;
        for(int j0=j6_lo;j0<=j6_hi;++j0){const int jF=j0+1;
        for(int i0=i6_lo;i0<=i6_hi;++i0){if(has8&&i0>=i8_lo&&i0<=i8_hi&&j0>=j8_lo&&j0<=j8_hi&&k0>=k8_lo&&k0<=k8_hi)continue;
            const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
            fxx[p]=Xdxdx*(TWO*FH(iF-3,jF,kF)-F27*FH(iF-2,jF,kF)+F270*FH(iF-1,jF,kF)-F490*FH(iF,jF,kF)+F270*FH(iF+1,jF,kF)-F27*FH(iF+2,jF,kF)+TWO*FH(iF+3,jF,kF));
            fyy[p]=Xdydy*(TWO*FH(iF,jF-3,kF)-F27*FH(iF,jF-2,kF)+F270*FH(iF,jF-1,kF)-F490*FH(iF,jF,kF)+F270*FH(iF,jF+1,kF)-F27*FH(iF,jF+2,kF)+TWO*FH(iF,jF+3,kF));
            fzz[p]=Xdzdz*(TWO*FH(iF,jF,kF-3)-F27*FH(iF,jF,kF-2)+F270*FH(iF,jF,kF-1)-F490*FH(iF,jF,kF)+F270*FH(iF,jF,kF+1)-F27*FH(iF,jF,kF+2)+TWO*FH(iF,jF,kF+3));
            #define XS6_8(JF,KFDUMMY) (-FH(iF-3,JF,KFDUMMY)+F9*FH(iF-2,JF,KFDUMMY)-F45*FH(iF-1,JF,KFDUMMY)+F45*FH(iF+1,JF,KFDUMMY)-F9*FH(iF+2,JF,KFDUMMY)+FH(iF+3,JF,KFDUMMY))
            fxy[p]=Xdxdy*(-XS6_8(jF-3,kF)+F9*XS6_8(jF-2,kF)-F45*XS6_8(jF-1,kF)+F45*XS6_8(jF+1,kF)-F9*XS6_8(jF+2,kF)+XS6_8(jF+3,kF));
            fxz[p]=Xdxdz*(-XS6_8(jF,kF-3)+F9*XS6_8(jF,kF-2)-F45*XS6_8(jF,kF-1)+F45*XS6_8(jF,kF+1)-F9*XS6_8(jF,kF+2)+XS6_8(jF,kF+3));
            #undef XS6_8
            #define YS6_8(JF,KFDUMMY) (-FH(iF,JF-3,KFDUMMY)+F9*FH(iF,JF-2,KFDUMMY)-F45*FH(iF,JF-1,KFDUMMY)+F45*FH(iF,JF+1,KFDUMMY)-F9*FH(iF,JF+2,KFDUMMY)+FH(iF,JF+3,KFDUMMY))
            fyz[p]=Xdydz*(-YS6_8(jF,kF-3)+F9*YS6_8(jF,kF-2)-F45*YS6_8(jF,kF-1)+F45*YS6_8(jF,kF+1)-F9*YS6_8(jF,kF+2)+YS6_8(jF,kF+3));
            #undef YS6_8
        }}}}
        /* 8th-order pass */
        if(has8){for(int k0=k8_lo;k0<=k8_hi;++k0){const int kF=k0+1;
        for(int j0=j8_lo;j0<=j8_hi;++j0){const int jF=j0+1;
        for(int i0=i8_lo;i0<=i8_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
            fxx[p]=Edxdx*(-(double)9*FH(iF-4,jF,kF)+F128*FH(iF-3,jF,kF)-F1008*FH(iF-2,jF,kF)+F8064*FH(iF-1,jF,kF)-F14350*FH(iF,jF,kF)+F8064*FH(iF+1,jF,kF)-F1008*FH(iF+2,jF,kF)+F128*FH(iF+3,jF,kF)-(double)9*FH(iF+4,jF,kF));
            fyy[p]=Edydy*(-(double)9*FH(iF,jF-4,kF)+F128*FH(iF,jF-3,kF)-F1008*FH(iF,jF-2,kF)+F8064*FH(iF,jF-1,kF)-F14350*FH(iF,jF,kF)+F8064*FH(iF,jF+1,kF)-F1008*FH(iF,jF+2,kF)+F128*FH(iF,jF+3,kF)-(double)9*FH(iF,jF+4,kF));
            fzz[p]=Edzdz*(-(double)9*FH(iF,jF,kF-4)+F128*FH(iF,jF,kF-3)-F1008*FH(iF,jF,kF-2)+F8064*FH(iF,jF,kF-1)-F14350*FH(iF,jF,kF)+F8064*FH(iF,jF,kF+1)-F1008*FH(iF,jF,kF+2)+F128*FH(iF,jF,kF+3)-(double)9*FH(iF,jF,kF+4));
            #define XS8(JF,KFDUMMY) (+(double)3*FH(iF-4,JF,KFDUMMY)-F32*FH(iF-3,JF,KFDUMMY)+F168*FH(iF-2,JF,KFDUMMY)-F672*FH(iF-1,JF,KFDUMMY)+F672*FH(iF+1,JF,KFDUMMY)-F168*FH(iF+2,JF,KFDUMMY)+F32*FH(iF+3,JF,KFDUMMY)-(double)3*FH(iF+4,JF,KFDUMMY))
            fxy[p]=Edxdy*(+(double)3*XS8(jF-4,kF)-F32*XS8(jF-3,kF)+F168*XS8(jF-2,kF)-F672*XS8(jF-1,kF)+F672*XS8(jF+1,kF)-F168*XS8(jF+2,kF)+F32*XS8(jF+3,kF)-(double)3*XS8(jF+4,kF));
            fxz[p]=Edxdz*(+(double)3*XS8(jF,kF-4)-F32*XS8(jF,kF-3)+F168*XS8(jF,kF-2)-F672*XS8(jF,kF-1)+F672*XS8(jF,kF+1)-F168*XS8(jF,kF+2)+F32*XS8(jF,kF+3)-(double)3*XS8(jF,kF+4));
            #undef XS8
            #define YS8(JF,KFDUMMY) (+(double)3*FH(iF,JF-4,KFDUMMY)-F32*FH(iF,JF-3,KFDUMMY)+F168*FH(iF,JF-2,KFDUMMY)-F672*FH(iF,JF-1,KFDUMMY)+F672*FH(iF,JF+1,KFDUMMY)-F168*FH(iF,JF+2,KFDUMMY)+F32*FH(iF,JF+3,KFDUMMY)-(double)3*FH(iF,JF+4,KFDUMMY))
            fyz[p]=Edydz*(+(double)3*YS8(jF,kF-4)-F32*YS8(jF,kF-3)+F168*YS8(jF,kF-2)-F672*YS8(jF,kF-1)+F672*YS8(jF,kF+1)-F168*YS8(jF,kF+2)+F32*YS8(jF,kF+3)-(double)3*YS8(jF,kF+4));
            #undef YS8
        }}}}
        #undef FH
        return;
    }
 #else
 #error "fdderivs_sh_c.C: unsupported ghost_width"
 #endif
 }
--- a/AMSS_NCKU_source/fdderivs_shc_c.C
+++ b/AMSS_NCKU_source/fdderivs_shc_c.C
@@ -1,107 +0,0 @@
 #include "macrodef.h"
 #include "share_func.h"
 #include <cstddef>
 /* Forward declarations — Fortran-mangled names from shell C kernels */
 extern "C" {
 /*
  * Kernels defined in other translation units.  Both take the grid extents,
  * the field f, output arrays of ex[0]*ex[1]*ex[2] doubles, three coordinate
  * arrays, three symmetry factors and three integer switches.
  */
 void fderivs_sh_(const int ex[3], const double *f,
                 double *fx, double *fy, double *fz,
                 const double *X, const double *Y, const double *Z,
                 double SYM1, double SYM2, double SYM3,
                 int Symmetry, int onoff, int sst);
 void fdderivs_sh_(const int ex[3], const double *f,
                  double *fxx, double *fxy, double *fxz,
                  double *fyy, double *fyz, double *fzz,
                  const double *X, const double *Y, const double *Z,
                  double SYM1, double SYM2, double SYM3,
                  int Symmetry, int onoff, int sst);
 /*
  * fdderivs_shc_ -- second derivatives of f with respect to (x, y, z), built
  * from derivatives taken along the (crho, sigma, R) grid coordinates.
  *
  * The first and second derivatives of f along the three grid coordinates are
  * obtained from fderivs_sh_ and fdderivs_sh_, then combined point by point
  * with the supplied coordinate derivatives using the chain rule:
  *
  *   f_ab = sum_ij (d_a q_i)(d_b q_j) G_ij  +  sum_i (d_a d_b q_i) G_i
  *
  * where q = (rho, sigma, R), G_i / G_ij are the grid-coordinate derivatives
  * of f, and a, b run over x, y, z.
  *
  * Parameters
  *   ex                      grid extents; ex[0]*ex[1]*ex[2] points in total
  *   f                       input field, one value per grid point
  *   fxx..fzz                outputs, one value per grid point each
  *   crho, sigma, R          coordinate arrays forwarded to the two kernels
  *   SYM1, SYM2, SYM3        symmetry factors forwarded to the two kernels
  *   Symmetry, Lev, sst      forwarded to the two kernels; Lev is passed in
  *                           the slot the kernels name `onoff`
  *   drhodx..dRdz            first derivatives of (rho, sigma, R) w.r.t. x,y,z
  *   drhodxx..dRdzz          second derivatives of (rho, sigma, R) w.r.t. x,y,z
  *
  * If any extent is non-positive, or the scratch workspace cannot be
  * allocated, the function returns without touching the output arrays.
  */
 void fdderivs_shc_(int *ex,
                    double *f,
                    double *fxx, double *fxy, double *fxz,
                    double *fyy, double *fyz, double *fzz,
                    double *crho, double *sigma, double *R,
                    double &SYM1, double &SYM2, double &SYM3,
                    int &Symmetry, int &Lev, int &sst,
                    double *drhodx, double *drhody, double *drhodz,
                    double *dsigmadx, double *dsigmady, double *dsigmadz,
                    double *dRdx, double *dRdy, double *dRdz,
                    double *drhodxx, double *drhodxy, double *drhodxz,
                    double *drhodyy, double *drhodyz, double *drhodzz,
                    double *dsigmadxx, double *dsigmadxy, double *dsigmadxz,
                    double *dsigmadyy, double *dsigmadyz, double *dsigmadzz,
                    double *dRdxx, double *dRdxy, double *dRdxz,
                    double *dRdyy, double *dRdyz, double *dRdzz)
 {
    /* An empty or malformed grid has nothing to compute; rejecting it here
       also keeps the size_t casts below from wrapping on negative extents. */
    if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0) return;
    const int ex3[3] = { ex[0], ex[1], ex[2] };
    const size_t n = (size_t)ex[0] * (size_t)ex[1] * (size_t)ex[2];
    /* One workspace holds all nine scratch fields (3 first + 6 second
       derivatives), so there is a single allocation and a single free. */
    const size_t nbuf = 9;
    if (n > (size_t)-1 / (nbuf * sizeof(double))) return; /* byte count would overflow */
    double *work = (double*)malloc(nbuf * n * sizeof(double));
    if (!work) return;
    double *gx  = work;
    double *gy  = gx  + n;
    double *gz  = gy  + n;
    double *gxx = gz  + n;
    double *gxy = gxx + n;
    double *gxz = gxy + n;
    double *gyy = gxz + n;
    double *gyz = gyy + n;
    double *gzz = gyz + n;
    /* Derivatives of f along the grid coordinates. */
    fderivs_sh_(ex3, f, gx, gy, gz, crho, sigma, R, SYM1, SYM2, SYM3, Symmetry, Lev, sst);
    fdderivs_sh_(ex3, f, gxx, gxy, gxz, gyy, gyz, gzz, crho, sigma, R, SYM1, SYM2, SYM3, Symmetry, Lev, sst);
    for (size_t i = 0; i < n; ++i) {
        /* Coordinate first derivatives at this point. */
        const double rx=drhodx[i], ry=drhody[i], rz=drhodz[i];
        const double sx=dsigmadx[i], sy=dsigmady[i], sz=dsigmadz[i];
        const double Rx=dRdx[i], Ry=dRdy[i], Rz=dRdz[i];
        /* Coordinate second derivatives at this point. */
        const double rxx=drhodxx[i], rxy=drhodxy[i], rxz=drhodxz[i];
        const double ryy=drhodyy[i], ryz=drhodyz[i], rzz=drhodzz[i];
        const double sxx=dsigmadxx[i], sxy=dsigmadxy[i], sxz=dsigmadxz[i];
        const double syy=dsigmadyy[i], syz=dsigmadyz[i], szz=dsigmadzz[i];
        const double Rxx=dRdxx[i], Rxy=dRdxy[i], Rxz=dRdxz[i];
        const double Ryy=dRdyy[i], Ryz=dRdyz[i], Rzz=dRdzz[i];
        /* Grid-coordinate derivatives of f (r = rho, s = sigma, R = R). */
        const double Gr=gx[i], Gs=gy[i], GR=gz[i];
        const double Grr=gxx[i], Grs=gxy[i], GrR=gxz[i];
        const double Gss=gyy[i], GsR=gyz[i], GRR=gzz[i];
        /* fxx */
        fxx[i] = rx*rx*Grr + sx*sx*Gss + Rx*Rx*GRR
               + 2.0*(rx*sx*Grs + rx*Rx*GrR + sx*Rx*GsR)
               + rxx*Gr + sxx*Gs + Rxx*GR;
        /* fxy */
        fxy[i] = rx*ry*Grr + sx*sy*Gss + Rx*Ry*GRR
               + rx*sy*Grs + ry*sx*Grs + rx*Ry*GrR + ry*Rx*GrR + sx*Ry*GsR + sy*Rx*GsR
               + rxy*Gr + sxy*Gs + Rxy*GR;
        /* fxz */
        fxz[i] = rx*rz*Grr + sx*sz*Gss + Rx*Rz*GRR
               + rx*sz*Grs + rz*sx*Grs + rx*Rz*GrR + rz*Rx*GrR + sx*Rz*GsR + sz*Rx*GsR
               + rxz*Gr + sxz*Gs + Rxz*GR;
        /* fyy */
        fyy[i] = ry*ry*Grr + sy*sy*Gss + Ry*Ry*GRR
               + 2.0*(ry*sy*Grs + ry*Ry*GrR + sy*Ry*GsR)
               + ryy*Gr + syy*Gs + Ryy*GR;
        /* fyz */
        fyz[i] = ry*rz*Grr + sy*sz*Gss + Ry*Rz*GRR
               + ry*sz*Grs + rz*sy*Grs + ry*Rz*GrR + rz*Ry*GrR + sy*Rz*GsR + sz*Ry*GsR
               + ryz*Gr + syz*Gs + Ryz*GR;
        /* fzz */
        fzz[i] = rz*rz*Grr + sz*sz*Gss + Rz*Rz*GRR
               + 2.0*(rz*sz*Grs + rz*Rz*GrR + sz*Rz*GsR)
               + rzz*Gr + szz*Gs + Rzz*GR;
    }
    free(work);
 }
 } // extern "C"
--- a/AMSS_NCKU_source/fderivs_c.C
+++ b/AMSS_NCKU_source/fderivs_c.C
@@ -1,18 +1,14 @@
 #include "macrodef.h"
 #include "tool.h"
 /*
- * C 版 fderivs — first derivatives df/dx, df/dy, df/dz.
+ * C 版 fderivs
 *
- * Finite difference order is selected at compile time via the ghost_width macro
+ * Fortran:
- * (defined in macrodef.fh):
+ * subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
 *   ghost_width = 2 → 2nd-order
 *   ghost_width = 3 → 4th-order
 *   ghost_width = 4 → 6th-order
 *   ghost_width = 5 → 8th-order
 *
- * Multi-pass overwrite strategy: compute the widest (lowest-order) stencil first,
+ * 约定：
- * then overwrite interior regions with progressively higher-order stencils.
+ *   f, fx, fy, fz: ex1*ex2*ex3，按 idx_ex 布局
 *   X: ex1, Y: ex2, Z: ex3
 */
 void fderivs(const int ex[3],
             const double *f,
@@ -21,596 +17,151 @@ void fderivs(const int ex[3],
             double SYM1, double SYM2, double SYM3,
             int Symmetry, int onoff)
 {
-    (void)onoff;
+    (void)onoff; // Fortran 里没用到
-    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, EIT = 8.0;
+    const double ZEO = 0.0, ONE = 1.0;
-    const double F9 = 9.0, F12 = 12.0, F45 = 45.0, F60 = 60.0;
+    const double TWO = 2.0, EIT = 8.0;
-    const double F32 = 32.0, F168 = 168.0, F672 = 672.0, F840 = 840.0;
+    const double F12 = 12.0;
-    const int NO_SYMM = 0, EQ_SYMM = 1;
+    const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 在本子程序里不直接用
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // dX = X(2)-X(1) -> C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
-    const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
+    // Fortran 1-based bounds
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
-    const int gw = ghost_width; // compile-time constant
+    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
-#if (ghost_width == 2)
+    // SoA(1:3) = SYM1,SYM2,SYM3
-    /* ---- 2nd-order ------------------------------------------------------ */
+    const double SoA[3] = { SYM1, SYM2, SYM3 };
    {
        const int ord = 1; // symmetry_bd ord = ghost_width - 1
-        int iminF = 1, jminF = 1, kminF = 1;
+    // fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2
-        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = 0;
+    const size_t nx = (size_t)ex1 + 2;
-        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = 0;
+    const size_t ny = (size_t)ex2 + 2;
-        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = 0;
+    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;
    static double *fh = NULL;
    static size_t cap = 0;
-        const double SoA[3] = { SYM1, SYM2, SYM3 };
+    if (fh_size > cap) {
-
+        free(fh);
-        const size_t nx = (size_t)ex1 + ord;
+        fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
-        const size_t ny = (size_t)ex2 + ord;
+        cap = fh_size;
        const size_t nz = (size_t)ex3 + ord;
        const size_t fh_size = nx * ny * nz;
        static double *fh_buf = NULL;
        static size_t cap = 0;
        if (fh_size > cap) {
            free(fh_buf);
            fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double));
            cap = fh_size;
        }
        double *fh = fh_buf;
        if (!fh) return;
        symmetry_bd(ord, ex, f, fh, SoA);
        const double d2dx = ONE / TWO / dX;
        const double d2dy = ONE / TWO / dY;
        const double d2dz = ONE / TWO / dZ;
        const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
        for (size_t p = 0; p < all; ++p) {
            fx[p] = ZEO; fy[p] = ZEO; fz[p] = ZEO;
        }
        /* 2nd-order pass: [-1, 0, +1] / (2*dx) */
        const int i2_lo = (iminF > 0) ? iminF : 0;
        const int j2_lo = (jminF > 0) ? jminF : 0;
        const int k2_lo = (kminF > 0) ? kminF : 0;
        const int i2_hi = ex1 - 2;
        const int j2_hi = ex2 - 2;
        const int k2_hi = ex3 - 2;
        if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
            for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        fx[p] = d2dx * (
                            -fh[idx_fh_F_ord1(iF - 1, jF,     kF,     ex)] +
                             fh[idx_fh_F_ord1(iF + 1, jF,     kF,     ex)]
                        );
                        fy[p] = d2dy * (
                            -fh[idx_fh_F_ord1(iF,     jF - 1, kF,     ex)] +
                             fh[idx_fh_F_ord1(iF,     jF + 1, kF,     ex)]
                        );
                        fz[p] = d2dz * (
                            -fh[idx_fh_F_ord1(iF,     jF,     kF - 1, ex)] +
                             fh[idx_fh_F_ord1(iF,     jF,     kF + 1, ex)]
                        );
                    }
                }
            }
        }
        return;
    }
-#elif (ghost_width == 3)
+    // double *fh = (double*)malloc(fh_size * sizeof(double));
-    /* ---- 4th-order (original code) ------------------------------------ */
+    if (!fh) return;
    {
        const int ord = 2; // symmetry_bd ord
-        int iminF = 1, jminF = 1, kminF = 1;
+    // call symmetry_bd(2,ex,f,fh,SoA)
-        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
+    symmetry_bd(2, ex, f, fh, SoA);
        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
-        const double SoA[3] = { SYM1, SYM2, SYM3 };
+    const double d12dx = ONE / F12 / dX;
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;
-        const size_t nx = (size_t)ex1 + ord;
+    const double d2dx  = ONE / TWO / dX;
-        const size_t ny = (size_t)ex2 + ord;
+    const double d2dy  = ONE / TWO / dY;
-        const size_t nz = (size_t)ex3 + ord;
+    const double d2dz  = ONE / TWO / dZ;
        const size_t fh_size = nx * ny * nz;
-        static double *fh_buf = NULL;
+    // fx = fy = fz = 0
-        static size_t cap = 0;
+    const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
-        if (fh_size > cap) {
+    for (size_t p = 0; p < all; ++p) {
-            free(fh_buf);
+        fx[p] = ZEO;
-            fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double));
+        fy[p] = ZEO;
-            cap = fh_size;
+        fz[p] = ZEO;
        }
        double *fh = fh_buf;
        if (!fh) return;
        symmetry_bd(ord, ex, f, fh, SoA);
        const double d12dx = ONE / F12 / dX;
        const double d12dy = ONE / F12 / dY;
        const double d12dz = ONE / F12 / dZ;
        const double d2dx  = ONE / TWO / dX;
        const double d2dy  = ONE / TWO / dY;
        const double d2dz  = ONE / TWO / dZ;
        const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
        for (size_t p = 0; p < all; ++p) {
            fx[p] = ZEO; fy[p] = ZEO; fz[p] = ZEO;
        }
        const int i2_lo = (iminF > 0) ? iminF : 0;
        const int j2_lo = (jminF > 0) ? jminF : 0;
        const int k2_lo = (kminF > 0) ? kminF : 0;
        const int i2_hi = ex1 - 2;
        const int j2_hi = ex2 - 2;
        const int k2_hi = ex3 - 2;
        const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
        const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
        const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
        const int i4_hi = ex1 - 3;
        const int j4_hi = ex2 - 3;
        const int k4_hi = ex3 - 3;
        if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
            for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        fx[p] = d2dx * (
                            -fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
                             fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                        );
                        fy[p] = d2dy * (
                            -fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
                             fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                        );
                        fz[p] = d2dz * (
                            -fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
                             fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                        );
                    }
                }
            }
        }
        if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
            for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        fx[p] = d12dx * (
                            fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] -
                            EIT * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
                            EIT * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)] -
                            fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)]
                        );
                        fy[p] = d12dy * (
                            fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] -
                            EIT * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
                            EIT * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)] -
                            fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)]
                        );
                        fz[p] = d12dz * (
                            fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] -
                            EIT * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
                            EIT * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)] -
                            fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)]
                        );
                    }
                }
            }
        }
        return;
    }
 #elif (ghost_width == 4)
    /* ---- 6th-order ----------------------------------------------------- */
    {
        const int ord = 3;
-        int iminF = 1, jminF = 1, kminF = 1;
+    /*
-        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
+     * 两段式：
-        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
+     * 1) 先在二阶可用区域计算二阶模板
-        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
+     * 2) 再在高阶可用区域覆盖为四阶模板
     *
     * 与原 if/elseif 逻辑等价，但减少逐点分支判断。
     */
    const int i2_lo = (iminF > 0) ? iminF : 0;
    const int j2_lo = (jminF > 0) ? jminF : 0;
    const int k2_lo = (kminF > 0) ? kminF : 0;
    const int i2_hi = ex1 - 2;
    const int j2_hi = ex2 - 2;
    const int k2_hi = ex3 - 2;
-        const double SoA[3] = { SYM1, SYM2, SYM3 };
+    const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
    const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
    const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
    const int i4_hi = ex1 - 3;
    const int j4_hi = ex2 - 3;
    const int k4_hi = ex3 - 3;
-        const size_t nx = (size_t)ex1 + ord;
+    if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
-        const size_t ny = (size_t)ex2 + ord;
+        for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
-        const size_t nz = (size_t)ex3 + ord;
+            const int kF = k0 + 1;
-        const size_t fh_size = nx * ny * nz;
+            for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);
-        static double *fh_buf = NULL;
+                    fx[p] = d2dx * (
-        static size_t cap = 0;
+                        -fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
-        if (fh_size > cap) {
+                         fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
-            free(fh_buf);
+                    );
            fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double));
            cap = fh_size;
        }
        double *fh = fh_buf;
        if (!fh) return;
-        symmetry_bd(ord, ex, f, fh, SoA);
+                    fy[p] = d2dy * (
                        -fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
                         fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
-        /* Denominators */
+                    fz[p] = d2dz * (
-        const double d60dx = ONE / F60 / dX;
+                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
-        const double d60dy = ONE / F60 / dY;
+                         fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
-        const double d60dz = ONE / F60 / dZ;
+                    );
        const double d12dx = ONE / F12 / dX;
        const double d12dy = ONE / F12 / dY;
        const double d12dz = ONE / F12 / dZ;
        const double d2dx  = ONE / TWO / dX;
        const double d2dy  = ONE / TWO / dY;
        const double d2dz  = ONE / TWO / dZ;
        const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
        for (size_t p = 0; p < all; ++p) {
            fx[p] = ZEO; fy[p] = ZEO; fz[p] = ZEO;
        }
        /* 2nd-order pass: 3pt, widest */
        const int i2_lo = (iminF > 0) ? iminF : 0;
        const int j2_lo = (jminF > 0) ? jminF : 0;
        const int k2_lo = (kminF > 0) ? kminF : 0;
        const int i2_hi = ex1 - 2;
        const int j2_hi = ex2 - 2;
        const int k2_hi = ex3 - 2;
        /* 4th-order pass: 5pt */
        const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
        const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
        const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
        const int i4_hi = ex1 - 3;
        const int j4_hi = ex2 - 3;
        const int k4_hi = ex3 - 3;
        /* 6th-order pass: 7pt, narrowest interior */
        const int i6_lo = (iminF + 2 > 0) ? (iminF + 2) : 0;
        const int j6_lo = (jminF + 2 > 0) ? (jminF + 2) : 0;
        const int k6_lo = (kminF + 2 > 0) ? (kminF + 2) : 0;
        const int i6_hi = ex1 - 4;
        const int j6_hi = ex2 - 4;
        const int k6_hi = ex3 - 4;
        /* 2nd-order */
        if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
            for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        fx[p] = d2dx * (
                            -fh[idx_fh_F(iF - 1, jF,     kF,     ex)] +
                             fh[idx_fh_F(iF + 1, jF,     kF,     ex)]);
                        fy[p] = d2dy * (
                            -fh[idx_fh_F(iF,     jF - 1, kF,     ex)] +
                             fh[idx_fh_F(iF,     jF + 1, kF,     ex)]);
                        fz[p] = d2dz * (
                            -fh[idx_fh_F(iF,     jF,     kF - 1, ex)] +
                             fh[idx_fh_F(iF,     jF,     kF + 1, ex)]);
                    }
                }
            }
        }
        /* 4th-order overwrite */
        if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
            for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        fx[p] = d12dx * (
                            fh[idx_fh_F(iF - 2, jF,     kF,     ex)] -
                            EIT * fh[idx_fh_F(iF - 1, jF,     kF,     ex)] +
                            EIT * fh[idx_fh_F(iF + 1, jF,     kF,     ex)] -
                            fh[idx_fh_F(iF + 2, jF,     kF,     ex)]);
                        fy[p] = d12dy * (
                            fh[idx_fh_F(iF,     jF - 2, kF,     ex)] -
                            EIT * fh[idx_fh_F(iF,     jF - 1, kF,     ex)] +
                            EIT * fh[idx_fh_F(iF,     jF + 1, kF,     ex)] -
                            fh[idx_fh_F(iF,     jF + 2, kF,     ex)]);
                        fz[p] = d12dz * (
                            fh[idx_fh_F(iF,     jF,     kF - 2, ex)] -
                            EIT * fh[idx_fh_F(iF,     jF,     kF - 1, ex)] +
                            EIT * fh[idx_fh_F(iF,     jF,     kF + 1, ex)] -
                            fh[idx_fh_F(iF,     jF,     kF + 2, ex)]);
                    }
                }
            }
        }
        /* 6th-order overwrite: [-1,+9,-45,0,+45,-9,+1] / (60*dx) */
        if (i6_lo <= i6_hi && j6_lo <= j6_hi && k6_lo <= k6_hi) {
            for (int k0 = k6_lo; k0 <= k6_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j6_lo; j0 <= j6_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i6_lo; i0 <= i6_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        fx[p] = d60dx * (
                            -fh[idx_fh_F(iF - 3, jF,     kF,     ex)] +
                            F9 * fh[idx_fh_F(iF - 2, jF,     kF,     ex)] -
                            F45 * fh[idx_fh_F(iF - 1, jF,     kF,     ex)] +
                            F45 * fh[idx_fh_F(iF + 1, jF,     kF,     ex)] -
                            F9 * fh[idx_fh_F(iF + 2, jF,     kF,     ex)] +
                            fh[idx_fh_F(iF + 3, jF,     kF,     ex)]);
                        fy[p] = d60dy * (
                            -fh[idx_fh_F(iF,     jF - 3, kF,     ex)] +
                            F9 * fh[idx_fh_F(iF,     jF - 2, kF,     ex)] -
                            F45 * fh[idx_fh_F(iF,     jF - 1, kF,     ex)] +
                            F45 * fh[idx_fh_F(iF,     jF + 1, kF,     ex)] -
                            F9 * fh[idx_fh_F(iF,     jF + 2, kF,     ex)] +
                            fh[idx_fh_F(iF,     jF + 3, kF,     ex)]);
                        fz[p] = d60dz * (
                            -fh[idx_fh_F(iF,     jF,     kF - 3, ex)] +
                            F9 * fh[idx_fh_F(iF,     jF,     kF - 2, ex)] -
                            F45 * fh[idx_fh_F(iF,     jF,     kF - 1, ex)] +
                            F45 * fh[idx_fh_F(iF,     jF,     kF + 1, ex)] -
                            F9 * fh[idx_fh_F(iF,     jF,     kF + 2, ex)] +
                            fh[idx_fh_F(iF,     jF,     kF + 3, ex)]);
                    }
                }
            }
        }
        return;
    }
 #elif (ghost_width == 5)
    /* ---- 8th-order ----------------------------------------------------- */
    {
        const int ord = 4;
-        int iminF = 1, jminF = 1, kminF = 1;
+    if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
-        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3;
+        for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
-        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3;
+            const int kF = k0 + 1;
-        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3;
+            for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);
-        const double SoA[3] = { SYM1, SYM2, SYM3 };
+                    fx[p] = d12dx * (
                        fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] -
                        EIT * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
                        EIT * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)]
                    );
-        const size_t nx = (size_t)ex1 + ord;
+                    fy[p] = d12dy * (
-        const size_t ny = (size_t)ex2 + ord;
+                        fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] -
-        const size_t nz = (size_t)ex3 + ord;
+                        EIT * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
-        const size_t fh_size = nx * ny * nz;
+                        EIT * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)]
                    );
-        static double *fh_buf = NULL;
+                    fz[p] = d12dz * (
-        static size_t cap = 0;
+                        fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] -
-        if (fh_size > cap) {
+                        EIT * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
-            free(fh_buf);
+                        EIT * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)] -
-            fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double));
+                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)]
-            cap = fh_size;
+                    );
        }
        double *fh = fh_buf;
        if (!fh) return;
        symmetry_bd(ord, ex, f, fh, SoA);
        const double d840dx = ONE / F840 / dX;
        const double d840dy = ONE / F840 / dY;
        const double d840dz = ONE / F840 / dZ;
        const double d60dx  = ONE / F60  / dX;
        const double d60dy  = ONE / F60  / dY;
        const double d60dz  = ONE / F60  / dZ;
        const double d12dx  = ONE / F12  / dX;
        const double d12dy  = ONE / F12  / dY;
        const double d12dz  = ONE / F12  / dZ;
        const double d2dx   = ONE / TWO  / dX;
        const double d2dy   = ONE / TWO  / dY;
        const double d2dz   = ONE / TWO  / dZ;
        const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
        for (size_t p = 0; p < all; ++p) {
            fx[p] = ZEO; fy[p] = ZEO; fz[p] = ZEO;
        }
        /* 2nd: 3pt, widest */
        const int i2_lo = (iminF > 0) ? iminF : 0;
        const int j2_lo = (jminF > 0) ? jminF : 0;
        const int k2_lo = (kminF > 0) ? kminF : 0;
        const int i2_hi = ex1 - 2;
        const int j2_hi = ex2 - 2;
        const int k2_hi = ex3 - 2;
        /* 4th: 5pt */
        const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
        const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
        const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
        const int i4_hi = ex1 - 3;
        const int j4_hi = ex2 - 3;
        const int k4_hi = ex3 - 3;
        /* 6th: 7pt */
        const int i6_lo = (iminF + 2 > 0) ? (iminF + 2) : 0;
        const int j6_lo = (jminF + 2 > 0) ? (jminF + 2) : 0;
        const int k6_lo = (kminF + 2 > 0) ? (kminF + 2) : 0;
        const int i6_hi = ex1 - 4;
        const int j6_hi = ex2 - 4;
        const int k6_hi = ex3 - 4;
        /* 8th: 9pt, narrowest */
        const int i8_lo = (iminF + 3 > 0) ? (iminF + 3) : 0;
        const int j8_lo = (jminF + 3 > 0) ? (jminF + 3) : 0;
        const int k8_lo = (kminF + 3 > 0) ? (kminF + 3) : 0;
        const int i8_hi = ex1 - 5;
        const int j8_hi = ex2 - 5;
        const int k8_hi = ex3 - 5;
        if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
            for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        fx[p] = d2dx * (
                            -fh[idx_fh_F_ord4(iF - 1, jF,     kF,     ex)] +
                             fh[idx_fh_F_ord4(iF + 1, jF,     kF,     ex)]);
                        fy[p] = d2dy * (
                            -fh[idx_fh_F_ord4(iF,     jF - 1, kF,     ex)] +
                             fh[idx_fh_F_ord4(iF,     jF + 1, kF,     ex)]);
                        fz[p] = d2dz * (
                            -fh[idx_fh_F_ord4(iF,     jF,     kF - 1, ex)] +
                             fh[idx_fh_F_ord4(iF,     jF,     kF + 1, ex)]);
                    }
                }
            }
        }
        if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
            for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        fx[p] = d12dx * (
                            fh[idx_fh_F_ord4(iF - 2, jF,     kF,     ex)] -
                            EIT * fh[idx_fh_F_ord4(iF - 1, jF,     kF,     ex)] +
                            EIT * fh[idx_fh_F_ord4(iF + 1, jF,     kF,     ex)] -
                            fh[idx_fh_F_ord4(iF + 2, jF,     kF,     ex)]);
                        fy[p] = d12dy * (
                            fh[idx_fh_F_ord4(iF,     jF - 2, kF,     ex)] -
                            EIT * fh[idx_fh_F_ord4(iF,     jF - 1, kF,     ex)] +
                            EIT * fh[idx_fh_F_ord4(iF,     jF + 1, kF,     ex)] -
                            fh[idx_fh_F_ord4(iF,     jF + 2, kF,     ex)]);
                        fz[p] = d12dz * (
                            fh[idx_fh_F_ord4(iF,     jF,     kF - 2, ex)] -
                            EIT * fh[idx_fh_F_ord4(iF,     jF,     kF - 1, ex)] +
                            EIT * fh[idx_fh_F_ord4(iF,     jF,     kF + 1, ex)] -
                            fh[idx_fh_F_ord4(iF,     jF,     kF + 2, ex)]);
                    }
                }
            }
        }
        if (i6_lo <= i6_hi && j6_lo <= j6_hi && k6_lo <= k6_hi) {
            for (int k0 = k6_lo; k0 <= k6_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j6_lo; j0 <= j6_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i6_lo; i0 <= i6_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        fx[p] = d60dx * (
                            -fh[idx_fh_F_ord4(iF - 3, jF,     kF,     ex)] +
                            F9 * fh[idx_fh_F_ord4(iF - 2, jF,     kF,     ex)] -
                            F45 * fh[idx_fh_F_ord4(iF - 1, jF,     kF,     ex)] +
                            F45 * fh[idx_fh_F_ord4(iF + 1, jF,     kF,     ex)] -
                            F9 * fh[idx_fh_F_ord4(iF + 2, jF,     kF,     ex)] +
                            fh[idx_fh_F_ord4(iF + 3, jF,     kF,     ex)]);
                        fy[p] = d60dy * (
                            -fh[idx_fh_F_ord4(iF,     jF - 3, kF,     ex)] +
                            F9 * fh[idx_fh_F_ord4(iF,     jF - 2, kF,     ex)] -
                            F45 * fh[idx_fh_F_ord4(iF,     jF - 1, kF,     ex)] +
                            F45 * fh[idx_fh_F_ord4(iF,     jF + 1, kF,     ex)] -
                            F9 * fh[idx_fh_F_ord4(iF,     jF + 2, kF,     ex)] +
                            fh[idx_fh_F_ord4(iF,     jF + 3, kF,     ex)]);
                        fz[p] = d60dz * (
                            -fh[idx_fh_F_ord4(iF,     jF,     kF - 3, ex)] +
                            F9 * fh[idx_fh_F_ord4(iF,     jF,     kF - 2, ex)] -
                            F45 * fh[idx_fh_F_ord4(iF,     jF,     kF - 1, ex)] +
                            F45 * fh[idx_fh_F_ord4(iF,     jF,     kF + 1, ex)] -
                            F9 * fh[idx_fh_F_ord4(iF,     jF,     kF + 2, ex)] +
                            fh[idx_fh_F_ord4(iF,     jF,     kF + 3, ex)]);
                    }
                }
            }
        }
        /* 8th-order overwrite: [+3,-32,+168,-672,0,+672,-168,+32,-3] / (840*dx) */
        if (i8_lo <= i8_hi && j8_lo <= j8_hi && k8_lo <= k8_hi) {
            for (int k0 = k8_lo; k0 <= k8_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j8_lo; j0 <= j8_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i8_lo; i0 <= i8_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        fx[p] = d840dx * (
                            +(double)3 * fh[idx_fh_F_ord4(iF - 4, jF,     kF,     ex)] -
                            F32  * fh[idx_fh_F_ord4(iF - 3, jF,     kF,     ex)] +
                            F168 * fh[idx_fh_F_ord4(iF - 2, jF,     kF,     ex)] -
                            F672 * fh[idx_fh_F_ord4(iF - 1, jF,     kF,     ex)] +
                            F672 * fh[idx_fh_F_ord4(iF + 1, jF,     kF,     ex)] -
                            F168 * fh[idx_fh_F_ord4(iF + 2, jF,     kF,     ex)] +
                            F32  * fh[idx_fh_F_ord4(iF + 3, jF,     kF,     ex)] -
                            (double)3 * fh[idx_fh_F_ord4(iF + 4, jF,     kF,     ex)]);
                        fy[p] = d840dy * (
                            +(double)3 * fh[idx_fh_F_ord4(iF,     jF - 4, kF,     ex)] -
                            F32  * fh[idx_fh_F_ord4(iF,     jF - 3, kF,     ex)] +
                            F168 * fh[idx_fh_F_ord4(iF,     jF - 2, kF,     ex)] -
                            F672 * fh[idx_fh_F_ord4(iF,     jF - 1, kF,     ex)] +
                            F672 * fh[idx_fh_F_ord4(iF,     jF + 1, kF,     ex)] -
                            F168 * fh[idx_fh_F_ord4(iF,     jF + 2, kF,     ex)] +
                            F32  * fh[idx_fh_F_ord4(iF,     jF + 3, kF,     ex)] -
                            (double)3 * fh[idx_fh_F_ord4(iF,     jF + 4, kF,     ex)]);
                        fz[p] = d840dz * (
                            +(double)3 * fh[idx_fh_F_ord4(iF,     jF,     kF - 4, ex)] -
                            F32  * fh[idx_fh_F_ord4(iF,     jF,     kF - 3, ex)] +
                            F168 * fh[idx_fh_F_ord4(iF,     jF,     kF - 2, ex)] -
                            F672 * fh[idx_fh_F_ord4(iF,     jF,     kF - 1, ex)] +
                            F672 * fh[idx_fh_F_ord4(iF,     jF,     kF + 1, ex)] -
                            F168 * fh[idx_fh_F_ord4(iF,     jF,     kF + 2, ex)] +
                            F32  * fh[idx_fh_F_ord4(iF,     jF,     kF + 3, ex)] -
                            (double)3 * fh[idx_fh_F_ord4(iF,     jF,     kF + 4, ex)]);
                    }
                }
            }
        }
        return;
    }
-#else
+
-#error "fderivs_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)"
+    // free(fh);
 #endif
 }
--- a/AMSS_NCKU_source/fderivs_sh_c.C
+++ b/AMSS_NCKU_source/fderivs_sh_c.C
@@ -1,234 +0,0 @@
 #include "macrodef.h"
 #include "share_func.h"
 /*
  * Return a reusable, 64-byte-aligned scratch buffer with room for at least
  * `count` doubles, or NULL when the allocation fails.
  *
  * The buffer is a function-local static that only ever grows; it is shared by
  * every call, so concurrent callers would use the same storage.
  */
 static double *fderivs_sh_scratch(size_t count)
 {
    static double *buf = NULL;
    static size_t cap = 0;
    if (count > cap) {
        free(buf);
        // aligned_alloc expects the byte count to be a multiple of the
        // alignment, so round the request up to the next multiple of 64.
        const size_t bytes = (count * sizeof(double) + 63) & ~(size_t)63;
        buf = (double*)aligned_alloc(64, bytes);
        // Record the capacity only on success so that a later call retries
        // the allocation instead of reusing a NULL buffer forever.
        cap = buf ? count : 0;
    }
    return buf;
 }
 /*
  * C version of fderivs_sh — first derivatives on a shell patch in
  * (rho, sigma, R) coords.
  *
  * Same stencil coefficients as Cartesian fderivs, but:
  * - Uses symmetry_stbd (ghost on BOTH sides of x/y, none in z)
  * - fh buffer: (-ord+1:ex+ord) in x/y, (1:ex) in z
  * - SoA is 2-element only (x/y), no z-symmetry
  *
  * Parameters:
  *   ex          grid extents {ex1, ex2, ex3}
  *   f           input field, ex1*ex2*ex3 values
  *   fx, fy, fz  outputs; zeroed first, then filled by successive passes in
  *               which each higher-order stencil overwrites a narrower interior
  *   X, Y, Z     coordinate arrays; only the first two entries of each are
  *               read (spacing) plus X[0]/Y[0] for the boundary tests
  *   SYM1, SYM2  symmetry factors forwarded to symmetry_stbd; SYM3 is ignored
  *   Symmetry    only the OCTANT value changes the lower x/y loop bounds
  *   onoff       ignored
  *   sst         values 2 and 4 extend the lower y bound when |Y[0]| < dY
  *
  * If the scratch buffer cannot be allocated the function returns without
  * writing fx/fy/fz.
  */
 extern "C" void fderivs_sh_(const int ex[3],
                const double *f,
                double *fx, double *fy, double *fz,
                const double *X, const double *Y, const double *Z,
                double SYM1, double SYM2, double SYM3,
                int Symmetry, int onoff, int sst)
 {
    (void)SYM3; (void)onoff;
    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, EIT = 8.0;
    const double F9 = 9.0, F12 = 12.0, F45 = 45.0, F60 = 60.0;
    const double F32 = 32.0, F168 = 168.0, F672 = 672.0, F840 = 840.0;
    const int OCTANT = 2;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    const double SoA[2] = { SYM1, SYM2 };
 #if (ghost_width == 2)
    /* ---- 2nd-order only ------------------------------------------------- */
    {
        const int ord = 1;
        int iminF = 1, jminF = 1;
        if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = 0;
        if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = 0;
        if ((sst==2||sst==4) && fabs(Y[0]) < dY) jminF = 0; // EQ reflection
        const size_t nx = (size_t)ex1 + 2 * ord;
        const size_t ny = (size_t)ex2 + 2 * ord;
        const size_t nz = (size_t)ex3;
        double *fh = fderivs_sh_scratch(nx * ny * nz);
        if (!fh) return;
        symmetry_stbd(ord, ex, f, fh, SoA);
        const double d2dx = ONE/TWO/dX, d2dy = ONE/TWO/dY, d2dz = ONE/TWO/dZ;
        const size_t all = (size_t)ex1*ex2*ex3;
        for (size_t p=0;p<all;++p) { fx[p]=ZEO; fy[p]=ZEO; fz[p]=ZEO; }
        // z has no ghost layer, so the lower z bound is fixed rather than symmetry-dependent.
        const int i2_lo=(iminF>0)?iminF:0, j2_lo=(jminF>0)?jminF:0, k2_lo=1;
        const int i2_hi=ex1-2, j2_hi=ex2-2, k2_hi=ex3-2;
        if (i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi) {
            for (int k0=k2_lo;k0<=k2_hi;++k0) { const int kF=k0+1;
            for (int j0=j2_lo;j0<=j2_hi;++j0) { const int jF=j0+1;
            for (int i0=i2_lo;i0<=i2_hi;++i0) { const int iF=i0+1;
                const size_t p=idx_ex(i0,j0,k0,ex);
                fx[p]=d2dx*(-fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)]);
                fy[p]=d2dy*(-fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)]);
                fz[p]=d2dz*(-fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)]);
            }}}
        }
        return;
    }
 #elif (ghost_width == 3)
    /* ---- 2nd-order pass, then 4th-order overwrite ----------------------- */
    {
        const int ord = 2;
        int iminF = 1, jminF = 1;
        if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -1;
        if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -1;
        if ((sst==2||sst==4) && fabs(Y[0]) < dY) jminF = -1;
        const size_t nx=(size_t)ex1+2*ord, ny=(size_t)ex2+2*ord, nz=(size_t)ex3;
        double *fh=fderivs_sh_scratch(nx*ny*nz); if(!fh)return;
        symmetry_stbd(ord,ex,f,fh,SoA);
        const double d12dx=ONE/F12/dX, d12dy=ONE/F12/dY, d12dz=ONE/F12/dZ;
        const double d2dx=ONE/TWO/dX, d2dy=ONE/TWO/dY, d2dz=ONE/TWO/dZ;
        const size_t all=(size_t)ex1*ex2*ex3;
        for(size_t p=0;p<all;++p){fx[p]=ZEO;fy[p]=ZEO;fz[p]=ZEO;}
        const int i2_lo=(iminF>0)?iminF:0, j2_lo=(jminF>0)?jminF:0, k2_lo=1;
        const int i2_hi=ex1-2, j2_hi=ex2-2, k2_hi=ex3-2;
        const int i4_lo=(iminF+1>0)?iminF+1:0, j4_lo=(jminF+1>0)?jminF+1:0, k4_lo=2;
        const int i4_hi=ex1-3, j4_hi=ex2-3, k4_hi=ex3-3;
        if (i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi) {
            for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1;
            for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1;
            for(int i0=i2_lo;i0<=i2_hi;++i0){const int iF=i0+1;
                const size_t p=idx_ex(i0,j0,k0,ex);
                fx[p]=d2dx*(-fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)]);
                fy[p]=d2dy*(-fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)]);
                fz[p]=d2dz*(-fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)]);
            }}}
        }
        if (i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi) {
            for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1;
            for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1;
            for(int i0=i4_lo;i0<=i4_hi;++i0){const int iF=i0+1;
                const size_t p=idx_ex(i0,j0,k0,ex);
                fx[p]=d12dx*(fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]-EIT*fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+EIT*fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)]-fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)]);
                fy[p]=d12dy*(fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]-EIT*fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+EIT*fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)]-fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)]);
                fz[p]=d12dz*(fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]-EIT*fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+EIT*fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)]-fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)]);
            }}}
        }
        return;
    }
 #elif (ghost_width == 4)
    /* ---- 2nd, 4th, then 6th-order overwrite ----------------------------- */
    {
        const int ord = 3;
        int iminF=1,jminF=1;
        if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-2;
        if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-2;
        if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-2;
        const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3;
        double *fh=fderivs_sh_scratch(nx*ny*nz);if(!fh)return;
        symmetry_stbd(ord,ex,f,fh,SoA);
        const double d60dx=ONE/F60/dX,d60dy=ONE/F60/dY,d60dz=ONE/F60/dZ;
        const double d12dx=ONE/F12/dX,d12dy=ONE/F12/dY,d12dz=ONE/F12/dZ;
        const double d2dx=ONE/TWO/dX,d2dy=ONE/TWO/dY,d2dz=ONE/TWO/dZ;
        const size_t all=(size_t)ex1*ex2*ex3;
        for(size_t p=0;p<all;++p){fx[p]=ZEO;fy[p]=ZEO;fz[p]=ZEO;}
        const int i2_lo=(iminF>0)?iminF:0,j2_lo=(jminF>0)?jminF:0,k2_lo=1,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2;
        const int i4_lo=(iminF+1>0)?iminF+1:0,j4_lo=(jminF+1>0)?jminF+1:0,k4_lo=2,i4_hi=ex1-3,j4_hi=ex2-3,k4_hi=ex3-3;
        const int i6_lo=(iminF+2>0)?iminF+2:0,j6_lo=(jminF+2>0)?jminF+2:0,k6_lo=3,i6_hi=ex1-4,j6_hi=ex2-4,k6_hi=ex3-4;
        if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){
            for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1;
            for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1;
            for(int i0=i2_lo;i0<=i2_hi;++i0){const int iF=i0+1;
                const size_t p=idx_ex(i0,j0,k0,ex);
                fx[p]=d2dx*(-fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)]);
                fy[p]=d2dy*(-fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)]);
                fz[p]=d2dz*(-fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)]);
            }}}
        }
        if(i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi){
            for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1;
            for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1;
            for(int i0=i4_lo;i0<=i4_hi;++i0){const int iF=i0+1;
                const size_t p=idx_ex(i0,j0,k0,ex);
                fx[p]=d12dx*(fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]-EIT*fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+EIT*fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)]-fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)]);
                fy[p]=d12dy*(fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]-EIT*fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+EIT*fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)]-fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)]);
                fz[p]=d12dz*(fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]-EIT*fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+EIT*fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)]-fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)]);
            }}}
        }
        if(i6_lo<=i6_hi&&j6_lo<=j6_hi&&k6_lo<=k6_hi){
            for(int k0=k6_lo;k0<=k6_hi;++k0){const int kF=k0+1;
            for(int j0=j6_lo;j0<=j6_hi;++j0){const int jF=j0+1;
            for(int i0=i6_lo;i0<=i6_hi;++i0){const int iF=i0+1;
                const size_t p=idx_ex(i0,j0,k0,ex);
                fx[p]=d60dx*(-fh[idx_fh_stbd(iF-3,jF,kF,ord,ex)]+F9*fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]-F45*fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+F45*fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)]-F9*fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+3,jF,kF,ord,ex)]);
                fy[p]=d60dy*(-fh[idx_fh_stbd(iF,jF-3,kF,ord,ex)]+F9*fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]-F45*fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+F45*fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)]-F9*fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+3,kF,ord,ex)]);
                fz[p]=d60dz*(-fh[idx_fh_stbd(iF,jF,kF-3,ord,ex)]+F9*fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]-F45*fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+F45*fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)]-F9*fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+3,ord,ex)]);
            }}}
        }
        return;
    }
 #elif (ghost_width == 5)
    /* ---- 2nd, 4th, 6th, then 8th-order overwrite ------------------------ */
    {
        const int ord = 4;
        int iminF=1,jminF=1;
        if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-3;
        if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-3;
        if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-3;
        const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3;
        double *fh=fderivs_sh_scratch(nx*ny*nz);if(!fh)return;
        symmetry_stbd(ord,ex,f,fh,SoA);
        const double d840dx=ONE/F840/dX,d840dy=ONE/F840/dY,d840dz=ONE/F840/dZ;
        const double d60dx=ONE/F60/dX,d60dy=ONE/F60/dY,d60dz=ONE/F60/dZ;
        const double d12dx=ONE/F12/dX,d12dy=ONE/F12/dY,d12dz=ONE/F12/dZ;
        const double d2dx=ONE/TWO/dX,d2dy=ONE/TWO/dY,d2dz=ONE/TWO/dZ;
        const size_t all=(size_t)ex1*ex2*ex3;
        for(size_t p=0;p<all;++p){fx[p]=ZEO;fy[p]=ZEO;fz[p]=ZEO;}
        const int i2_lo=(iminF>0)?iminF:0,j2_lo=(jminF>0)?jminF:0,k2_lo=1,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2;
        const int i4_lo=(iminF+1>0)?iminF+1:0,j4_lo=(jminF+1>0)?jminF+1:0,k4_lo=2,i4_hi=ex1-3,j4_hi=ex2-3,k4_hi=ex3-3;
        const int i6_lo=(iminF+2>0)?iminF+2:0,j6_lo=(jminF+2>0)?jminF+2:0,k6_lo=3,i6_hi=ex1-4,j6_hi=ex2-4,k6_hi=ex3-4;
        const int i8_lo=(iminF+3>0)?iminF+3:0,j8_lo=(jminF+3>0)?jminF+3:0,k8_lo=4,i8_hi=ex1-5,j8_hi=ex2-5,k8_hi=ex3-5;
        // Shorthand for a ghost-padded lookup; undefined again before leaving the branch.
        #define FH_S(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)]
        if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1;
        for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1;
        for(int i0=i2_lo;i0<=i2_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
            fx[p]=d2dx*(-FH_S(iF-1,jF,kF)+FH_S(iF+1,jF,kF));
            fy[p]=d2dy*(-FH_S(iF,jF-1,kF)+FH_S(iF,jF+1,kF));
            fz[p]=d2dz*(-FH_S(iF,jF,kF-1)+FH_S(iF,jF,kF+1));}}}}
        if(i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi){for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1;
        for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1;
        for(int i0=i4_lo;i0<=i4_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
            fx[p]=d12dx*(FH_S(iF-2,jF,kF)-EIT*FH_S(iF-1,jF,kF)+EIT*FH_S(iF+1,jF,kF)-FH_S(iF+2,jF,kF));
            fy[p]=d12dy*(FH_S(iF,jF-2,kF)-EIT*FH_S(iF,jF-1,kF)+EIT*FH_S(iF,jF+1,kF)-FH_S(iF,jF+2,kF));
            fz[p]=d12dz*(FH_S(iF,jF,kF-2)-EIT*FH_S(iF,jF,kF-1)+EIT*FH_S(iF,jF,kF+1)-FH_S(iF,jF,kF+2));}}}}
        if(i6_lo<=i6_hi&&j6_lo<=j6_hi&&k6_lo<=k6_hi){for(int k0=k6_lo;k0<=k6_hi;++k0){const int kF=k0+1;
        for(int j0=j6_lo;j0<=j6_hi;++j0){const int jF=j0+1;
        for(int i0=i6_lo;i0<=i6_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
            fx[p]=d60dx*(-FH_S(iF-3,jF,kF)+F9*FH_S(iF-2,jF,kF)-F45*FH_S(iF-1,jF,kF)+F45*FH_S(iF+1,jF,kF)-F9*FH_S(iF+2,jF,kF)+FH_S(iF+3,jF,kF));
            fy[p]=d60dy*(-FH_S(iF,jF-3,kF)+F9*FH_S(iF,jF-2,kF)-F45*FH_S(iF,jF-1,kF)+F45*FH_S(iF,jF+1,kF)-F9*FH_S(iF,jF+2,kF)+FH_S(iF,jF+3,kF));
            fz[p]=d60dz*(-FH_S(iF,jF,kF-3)+F9*FH_S(iF,jF,kF-2)-F45*FH_S(iF,jF,kF-1)+F45*FH_S(iF,jF,kF+1)-F9*FH_S(iF,jF,kF+2)+FH_S(iF,jF,kF+3));}}}}
        if(i8_lo<=i8_hi&&j8_lo<=j8_hi&&k8_lo<=k8_hi){for(int k0=k8_lo;k0<=k8_hi;++k0){const int kF=k0+1;
        for(int j0=j8_lo;j0<=j8_hi;++j0){const int jF=j0+1;
        for(int i0=i8_lo;i0<=i8_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
            fx[p]=d840dx*(+(double)3*FH_S(iF-4,jF,kF)-F32*FH_S(iF-3,jF,kF)+F168*FH_S(iF-2,jF,kF)-F672*FH_S(iF-1,jF,kF)+F672*FH_S(iF+1,jF,kF)-F168*FH_S(iF+2,jF,kF)+F32*FH_S(iF+3,jF,kF)-(double)3*FH_S(iF+4,jF,kF));
            fy[p]=d840dy*(+(double)3*FH_S(iF,jF-4,kF)-F32*FH_S(iF,jF-3,kF)+F168*FH_S(iF,jF-2,kF)-F672*FH_S(iF,jF-1,kF)+F672*FH_S(iF,jF+1,kF)-F168*FH_S(iF,jF+2,kF)+F32*FH_S(iF,jF+3,kF)-(double)3*FH_S(iF,jF+4,kF));
            fz[p]=d840dz*(+(double)3*FH_S(iF,jF,kF-4)-F32*FH_S(iF,jF,kF-3)+F168*FH_S(iF,jF,kF-2)-F672*FH_S(iF,jF,kF-1)+F672*FH_S(iF,jF,kF+1)-F168*FH_S(iF,jF,kF+2)+F32*FH_S(iF,jF,kF+3)-(double)3*FH_S(iF,jF,kF+4));}}}}
        #undef FH_S
        return;
    }
 #else
 #error "fderivs_sh_c.C: unsupported ghost_width"
 #endif
 }
--- a/AMSS_NCKU_source/fderivs_shc_c.C
+++ b/AMSS_NCKU_source/fderivs_shc_c.C
@@ -1,54 +0,0 @@
 #include "macrodef.h"
 #include "share_func.h"
 #include <cstddef>
 /*
 * fderivs_shc — shell first derivatives converted to Cartesian via chain rule.
 *
 * Calls fderivs_sh internally, then:
 *   fx = drhodx * df/drho + dsigmadx * df/dsigma + dRdx * df/dR
 *   fy = drhody * df/drho + dsigmady * df/dsigma + dRdy * df/dR
 *   fz = drhodz * df/drho + dsigmadz * df/dsigma + dRdz * df/dR
 */
 // Forward declaration (defined in fderivs_sh_c.C with extern "C" name fderivs_sh_)
 extern "C" {
 // Shell-coordinate first-derivative routine; its definition is in fderivs_sh_c.C.
 void fderivs_sh_(const int ex[3], const double *f,
                  double *fx, double *fy, double *fz,
                  const double *X, const double *Y, const double *Z,
                  double SYM1, double SYM2, double SYM3,
                  int Symmetry, int onoff, int sst);
 /*
  * fderivs_shc_ -- first derivatives of f on a shell patch, expressed in
  * Cartesian components.
  *
  * The derivatives of f with respect to the shell coordinates (rho, sigma, R)
  * are obtained from fderivs_sh_ into scratch buffers and then combined with
  * the supplied Jacobian arrays point by point:
  *   fx = drhodx*df/drho + dsigmadx*df/dsigma + dRdx*df/dR   (same for fy, fz)
  *
  * ex                    : grid extents; ex[0]*ex[1]*ex[2] points are processed
  * f                     : input field
  * fx, fy, fz            : output Cartesian derivatives
  * crho, sigma, R        : shell coordinate arrays, forwarded to fderivs_sh_
  * SYM1, SYM2, SYM3       : forwarded to fderivs_sh_ unchanged
  * Symmetry, sst         : forwarded to fderivs_sh_ unchanged
  * Lev                   : forwarded to fderivs_sh_ in its `onoff` slot
  * d{rho,sigma,R}d{x,y,z}: Jacobian entries, one value per grid point
  *
  * If any scratch buffer cannot be allocated the routine returns without
  * writing fx, fy or fz.
  */
 void fderivs_shc_(int *ex,
                   double *f,
                   double *fx, double *fy, double *fz,
                   double *crho, double *sigma, double *R,
                   double &SYM1, double &SYM2, double &SYM3,
                   int &Symmetry, int &Lev, int &sst,
                   double *drhodx, double *drhody, double *drhodz,
                   double *dsigmadx, double *dsigmady, double *dsigmadz,
                   double *dRdx, double *dRdy, double *dRdz)
 {
     const size_t npts = (size_t)ex[0] * (size_t)ex[1] * (size_t)ex[2];
     const size_t nbytes = npts * sizeof(double);
     // Scratch storage for df/drho, df/dsigma and df/dR.
     double *df_drho   = (double*)malloc(nbytes);
     double *df_dsigma = (double*)malloc(nbytes);
     double *df_dR     = (double*)malloc(nbytes);
     const bool have_scratch = df_drho && df_dsigma && df_dR;
     if (have_scratch) {
         // fderivs_sh_ expects a const int[3]; hand it a local copy of the extents.
         const int dims[3] = { ex[0], ex[1], ex[2] };
         fderivs_sh_(dims, f, df_drho, df_dsigma, df_dR,
                     crho, sigma, R,
                     SYM1, SYM2, SYM3,
                     Symmetry, Lev, sst);
         // Chain rule: contract the shell-coordinate gradient with the Jacobian.
         for (size_t p = 0; p != npts; ++p) {
             const double g_rho   = df_drho[p];
             const double g_sigma = df_dsigma[p];
             const double g_R     = df_dR[p];
             fx[p] = drhodx[p] * g_rho + dsigmadx[p] * g_sigma + dRdx[p] * g_R;
             fy[p] = drhody[p] * g_rho + dsigmady[p] * g_sigma + dRdy[p] * g_R;
             fz[p] = drhodz[p] * g_rho + dsigmadz[p] * g_sigma + dRdz[p] * g_R;
         }
     }
     // free(NULL) is a no-op, so this also covers the allocation-failure path.
     free(df_drho);
     free(df_dsigma);
     free(df_dR);
 }
 } // extern "C"
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -324,9 +324,6 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
 #if USE_FMISC_SAFE_MODE
  funcc = 0.d0
 #endif
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -352,9 +349,6 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
  integer::i
 #if USE_FMISC_SAFE_MODE
  funcc = 0.d0
 #endif
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -383,9 +377,6 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
  integer::i
 #if USE_FMISC_SAFE_MODE
  funcc = 0.d0
 #endif
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -1128,10 +1119,6 @@ end subroutine d2dump
 #define POLINT6_USE_BARYCENTRIC 1
 #endif
 #ifndef USE_FMISC_SAFE_MODE
 #define USE_FMISC_SAFE_MODE 0
 #endif
 !DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
  subroutine polint6_neville(xa, ya, x, y, dy)
  implicit none
@@ -1284,9 +1271,7 @@ end subroutine d2dump
  real*8 :: dif, dift, hp, h, den_val
  if (ordn == 6) then
-#if USE_FMISC_SAFE_MODE
+#if POLINT6_USE_BARYCENTRIC
    call polint6_neville(xa, ya, x, y, dy)
 #elif POLINT6_USE_BARYCENTRIC
    call polint6_barycentric(xa, ya, x, y, dy)
 #else
    call polint6_neville(xa, ya, x, y, dy)
@@ -1391,7 +1376,7 @@ end subroutine d2dump
  real*8, intent(in) :: x1,x2
  real*8, intent(out) :: y,dy
-#if USE_FMISC_SAFE_MODE || defined(POLINT_LEGACY_ORDER)
+#ifdef POLINT_LEGACY_ORDER
  integer  :: i,m
  real*8, dimension(ordn) :: ymtmp
  real*8, dimension(ordn) :: yntmp
@@ -1429,7 +1414,7 @@ end subroutine d2dump
  real*8, intent(in) :: x1,x2,x3
  real*8, intent(out) :: y,dy
-#if USE_FMISC_SAFE_MODE || defined(POLINT_LEGACY_ORDER)
+#ifdef POLINT_LEGACY_ORDER
  integer  :: i,j,m,n
  real*8, dimension(ordn,ordn) :: yatmp
  real*8, dimension(ordn) :: ymtmp
@@ -1517,23 +1502,12 @@ if(dabs(X(1)-xmin) < dX) imin = 1
 if(dabs(Y(1)-ymin) < dY) jmin = 1
 if(dabs(Z(1)-zmin) < dZ) kmin = 1
 #if USE_FMISC_SAFE_MODE
  f_out = 0.d0
  do k = kmin, kmax
    do j = jmin, jmax
      do i = imin, imax
        f_out = f_out + f(i,j,k)*f(i,j,k)
      end do
    end do
  end do
 #else
 ! Optimized with oneMKL BLAS DDOT for dot product
-  n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
+n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
-  allocate(f_flat(n_elements))
+allocate(f_flat(n_elements))
-  f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
+f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
-  f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
+f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
-  deallocate(f_flat)
+deallocate(f_flat)
 #endif
 f_out = f_out*dX*dY*dZ
@@ -1591,9 +1565,7 @@ if(dabs(Z(1)-zmin) < dZ) kmin = 1
  do k=kmin,kmax
    do j=jmin,jmax
 #if !USE_FMISC_SAFE_MODE
 !DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7)
 #endif
      do i=imin,imax
        s1 = s1 + f1(i,j,k)*f1(i,j,k)
        s2 = s2 + f2(i,j,k)*f2(i,j,k)
@@ -1700,23 +1672,12 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
 #if USE_FMISC_SAFE_MODE
  f_out = 0.d0
  do k = kmin, kmax
    do j = jmin, jmax
      do i = imin, imax
        f_out = f_out + f(i,j,k)*f(i,j,k)
      end do
    end do
  end do
 #else
 ! Optimized with oneMKL BLAS DDOT for dot product
-  n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
+n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
-  allocate(f_flat(n_elements))
+allocate(f_flat(n_elements))
-  f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
+f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
-  f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
+f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
-  deallocate(f_flat)
+deallocate(f_flat)
 #endif
 f_out = f_out*dX*dY*dZ
@@ -1808,23 +1769,12 @@ if(Symmetry==2)then
  if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
 endif
 #if USE_FMISC_SAFE_MODE
  f_out = 0.d0
  do k = kmin, kmax
    do j = jmin, jmax
      do i = imin, imax
        f_out = f_out + f(i,j,k)*f(i,j,k)
      end do
    end do
  end do
 #else
 ! Optimized with oneMKL BLAS DDOT for dot product
-  Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
+Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
-  allocate(f_flat(Nout))
+allocate(f_flat(Nout))
-  f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
+f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
-  f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
+f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
-  deallocate(f_flat)
+deallocate(f_flat)
 #endif
  return
@@ -1928,19 +1878,9 @@ endif
  real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
  integer :: i,j,k
 #if USE_FMISC_SAFE_MODE
  do k=1,ext(3)
   do j=1,ext(2)
    do i=1,ext(1)
      fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
    enddo
   enddo
  enddo
 #else
  do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1))
    fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
  end do
 #endif
  return
@@ -2084,15 +2024,8 @@ endif
    tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
  enddo
 #if USE_FMISC_SAFE_MODE
  f_int = 0.d0
  do m = 1, ORDN
    f_int = f_int + coef(m) * tmp1(m)
  end do
 #else
  ! Third dimension: x-direction weighted sum using BLAS DDOT
  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
 #endif
  return
@@ -2158,15 +2091,8 @@ endif
    tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
  enddo
 #if USE_FMISC_SAFE_MODE
  f_int = 0.d0
  do m = 1, ORDN
    f_int = f_int + coef(m) * tmp1(m)
  end do
 #else
  ! Use BLAS DDOT for final weighted sum
  f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
 #endif
  return
@@ -2258,15 +2184,8 @@ endif
          write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
  endif
 #if USE_FMISC_SAFE_MODE
  f_int = 0.d0
  do m = 1, ORDN
    f_int = f_int + coef(m) * ya(m)
  end do
 #else
  ! Optimized with BLAS DDOT for weighted sum
  f_int = DDOT(ORDN, coef, 1, ya, 1)
 #endif
  return
--- a/AMSS_NCKU_source/gaussj.C
+++ b/AMSS_NCKU_source/gaussj.C
@@ -17,65 +17,103 @@ using namespace std;
 #include <math.h>
 #endif
-// Intel oneMKL LAPACK interface
+/* Linear equation solution by Gauss-Jordan elimination.
 #include <mkl_lapacke.h>
 /* Linear equation solution using Intel oneMKL LAPACK.
 a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
 containing the right-hand side vectors. On output a is
 replaced by its matrix inverse, and b is replaced by the
-corresponding set of solution vectors.
+corresponding set of solution vectors. */
 Mathematical equivalence:
  Solves: A * x = b  =>  x = A^(-1) * b
  Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
  within numerical precision. */
 int gaussj(double *a, double *b, int n)
 {
-  // Allocate pivot array and workspace
+  double swap;
  lapack_int *ipiv = new lapack_int[n];
  lapack_int info;
-  // Make a copy of matrix a for solving (dgesv modifies it to LU form)
+  int *indxc, *indxr, *ipiv;
-  double *a_copy = new double[n * n];
+  indxc = new int[n];
-  for (int i = 0; i < n * n; i++) {
+  indxr = new int[n];
-    a_copy[i] = a[i];
+  ipiv = new int[n];
  int i, icol, irow, j, k, l, ll;
  double big, dum, pivinv;
  for (j = 0; j < n; j++)
    ipiv[j] = 0;
  for (i = 0; i < n; i++)
  {
    big = 0.0;
    for (j = 0; j < n; j++)
      if (ipiv[j] != 1)
        for (k = 0; k < n; k++)
        {
          if (ipiv[k] == 0)
          {
            if (fabs(a[j * n + k]) >= big)
            {
              big = fabs(a[j * n + k]);
              irow = j;
              icol = k;
            }
          }
          else if (ipiv[k] > 1)
          {
            cout << "gaussj: Singular Matrix-1" << endl;
            return 1;
          }
        }
    ipiv[icol] = ipiv[icol] + 1;
    if (irow != icol)
    {
      for (l = 0; l < n; l++)
      {
        swap = a[irow * n + l];
        a[irow * n + l] = a[icol * n + l];
        a[icol * n + l] = swap;
      }
      swap = b[irow];
      b[irow] = b[icol];
      b[icol] = swap;
    }
    indxr[i] = irow;
    indxc[i] = icol;
    if (a[icol * n + icol] == 0.0)
    {
      cout << "gaussj: Singular Matrix-2" << endl;
      return 1;
    }
    pivinv = 1.0 / a[icol * n + icol];
    a[icol * n + icol] = 1.0;
    for (l = 0; l < n; l++)
      a[icol * n + l] *= pivinv;
    b[icol] *= pivinv;
    for (ll = 0; ll < n; ll++)
      if (ll != icol)
      {
        dum = a[ll * n + icol];
        a[ll * n + icol] = 0.0;
        for (l = 0; l < n; l++)
          a[ll * n + l] -= a[icol * n + l] * dum;
        b[ll] -= b[icol] * dum;
      }
  }
-  // Step 1: Solve linear system A*x = b using LU decomposition
+  for (l = n - 1; l >= 0; l--)
-  // LAPACKE_dgesv uses column-major by default, but we use row-major
+  {
-  info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1);
+    if (indxr[l] != indxc[l])
-
+      for (k = 0; k < n; k++)
-  if (info != 0) {
+      {
-    cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl;
+        swap = a[k * n + indxr[l]];
-    delete[] ipiv;
+        a[k * n + indxr[l]] = a[k * n + indxc[l]];
-    delete[] a_copy;
+        a[k * n + indxc[l]] = swap;
-    return 1;
+      }
  }
  // Step 2: Compute matrix inverse A^(-1) using LU factorization
  // First do LU factorization of original matrix a
  info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
  if (info != 0) {
    cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
    delete[] ipiv;
    delete[] a_copy;
    return 1;
  }
  // Then compute inverse from LU factorization
  info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
  if (info != 0) {
    cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
    delete[] ipiv;
    delete[] a_copy;
    return 1;
  }
  delete[] indxc;
  delete[] indxr;
  delete[] ipiv;
  delete[] a_copy;
  return 0;
 }
--- a/AMSS_NCKU_source/kodiss_c.C
+++ b/AMSS_NCKU_source/kodiss_c.C
@@ -1,16 +1,16 @@
 #include "macrodef.h"
 #include "tool.h"
 /*
- * C 版 kodis — Kreiss-Oliger numerical dissipation (Cartesian patches).
+ * C 版 kodis
 *
- * The KO operator is (D₊D₋)^r applied to f_rhs with alternating sign (-1)^(r-1).
+ * Fortran signature:
 * subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps)
 *
- * FD order → r → cof=2^(2r) mapping:
+ * 约定：
- *   ghost_width=2 (2nd) → r=2, cof=16,  sign=-
+ *   X: ex1, Y: ex2, Z: ex3
- *   ghost_width=3 (4th) → r=3, cof=64,  sign=+
+ *   f, f_rhs: ex1*ex2*ex3 按 idx_ex 布局
- *   ghost_width=4 (6th) → r=4, cof=256, sign=-
+ *   SoA[3]
- *   ghost_width=5 (8th) → r=5, cof=1024,sign=+
+ *   eps: double
 */
 void kodis(const int ex[3],
           const double *X, const double *Y, const double *Z,
@@ -18,304 +18,100 @@ void kodis(const int ex[3],
           const double SoA[3],
           int Symmetry, double eps)
 {
-    const double ZEO = 0.0;
+    const double ONE = 1.0, SIX = 6.0, FIT = 15.0, TWT = 20.0;
    const double cof = 64.0;             // 2^6
    const int NO_SYMM = 0, OCTANT = 2;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // Fortran: dX = X(2)-X(1) -> C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    (void)ONE; // ONE 在原 Fortran 里只是参数，这里不一定用得上
-    const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
+    // Fortran: imax=ex(1) 等是 1-based 上界
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
-#if (ghost_width == 2)
+    // Fortran: imin=jmin=kmin=1，某些对称情况变 -2
-    /* ---- r=2, cof=16, sign=-, 5pt stencil ----------------------------- */
+    int iminF = 1, jminF = 1, kminF = 1;
    {
        const int ord = 2;
        const int r = 2;
        const double cof = 16.0;
        const double F4 = 4.0, F6 = 6.0;
        const int NO_SYMM = 0, EQ_SYMM = 1;
-        int iminF = 1, jminF = 1, kminF = 1;
+    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
-        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
+    if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2;
-        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
+    if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2;
        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
-        const size_t nx = (size_t)ex1 + ord;
+    // 分配 fh：大小 (ex1+3)*(ex2+3)*(ex3+3)，对应 ord=3
-        const size_t ny = (size_t)ex2 + ord;
+    const size_t nx = (size_t)ex1 + 3;
-        const size_t nz = (size_t)ex3 + ord;
+    const size_t ny = (size_t)ex2 + 3;
-        const size_t fh_size = nx * ny * nz;
+    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;
-        double *fh = (double*)malloc(fh_size * sizeof(double));
+    double *fh = (double*)malloc(fh_size * sizeof(double));
-        if (!fh) return;
+    if (!fh) return;
-        symmetry_bd(ord, ex, f, fh, SoA);
+    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
    symmetry_bd(3, ex, f, fh, SoA);
-        /* i±2 must be valid: i-2 >= iminF && i+2 <= imaxF
+    /*
-           C 0-based: i0 >= iminF+1, i0 <= ex1-3 */
+     * Fortran loops:
-        const int i0_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
+     * do k=1,ex3
-        const int j0_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
+     * do j=1,ex2
-        const int k0_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
+     * do i=1,ex1
-        const int i0_hi = imaxF - 3;
+     *
-        const int j0_hi = jmaxF - 3;
+     * C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
-        const int k0_hi = kmaxF - 3;
+     * 并定义 Fortran index: iF=i0+1, ...
     */
    // 收紧循环范围：只遍历满足 iF±3/jF±3/kF±3 条件的内部点
    // iF-3 >= iminF => iF >= iminF+3 => i0 >= iminF+2 (因为 iF=i0+1)
    // iF+3 <= imaxF => iF <= imaxF-3 => i0 <= imaxF-4
    const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
    const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
    const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
    const int i0_hi = imaxF - 4;  // inclusive
    const int j0_hi = jmaxF - 4;
    const int k0_hi = kmaxF - 4;
-        if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
+    if (i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi) {
            for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        const double Dx = (
                            (fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] + fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)]) -
                            F4 * (fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]) +
                            F6 *  fh[idx_fh_F_ord2(iF,     jF, kF, ex)]
                        ) / dX;
                        const double Dy = (
                            (fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] + fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)]) -
                            F4 * (fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]) +
                            F6 *  fh[idx_fh_F_ord2(iF, jF,     kF, ex)]
                        ) / dY;
                        const double Dz = (
                            (fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] + fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)]) -
                            F4 * (fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]) +
                            F6 *  fh[idx_fh_F_ord2(iF, jF, kF,     ex)]
                        ) / dZ;
                        f_rhs[p] -= (eps / cof) * (Dx + Dy + Dz); /* sign=- */
                    }
                }
            }
        }
        free(fh);
        return;
    }
 #elif (ghost_width == 3)
    /* ---- r=3, cof=64, sign=+, 7pt stencil (current default) ---------- */
    {
        const int ord = 3;
        const int r = 3;
        const double cof = 64.0;
        const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
        const int NO_SYMM = 0, OCTANT = 2;
-        int iminF = 1, jminF = 1, kminF = 1;
+    for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
-        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
+        const int kF = k0 + 1;
-        if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2;
+        for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
-        if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2;
+            const int jF = j0 + 1;
            for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
                const int iF = i0 + 1;
-        const size_t nx = (size_t)ex1 + ord;
+                    const size_t p = idx_ex(i0, j0, k0, ex);
        const size_t ny = (size_t)ex2 + ord;
        const size_t nz = (size_t)ex3 + ord;
        const size_t fh_size = nx * ny * nz;
-        double *fh = (double*)malloc(fh_size * sizeof(double));
+                    // 三个方向各一份同型的 7 点组合（实际上是对称的 6th-order dissipation/filter 核）
-        if (!fh) return;
+                    const double Dx_term =
                        ( (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
                          SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
                          FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
                          TWT *  fh[idx_fh_F(iF    , jF, kF, ex)] ) / dX;
-        symmetry_bd(ord, ex, f, fh, SoA);
+                    const double Dy_term =
                        ( (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
                          SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
                          FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
                          TWT *  fh[idx_fh_F(iF, jF    , kF, ex)] ) / dY;
-        const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
+                    const double Dz_term =
-        const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
+                        ( (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
-        const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
+                          SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
-        const int i0_hi = imaxF - 4;
+                          FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
-        const int j0_hi = jmaxF - 4;
+                          TWT *  fh[idx_fh_F(iF, jF, kF    , ex)] ) / dZ;
        const int k0_hi = kmaxF - 4;
-        if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
+                    // Fortran:
-            for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
+                    // f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
-                const int kF = k0 + 1;
+                    f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
                for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        const double Dx = (
                            (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
                            SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
                            FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
                            TWT *  fh[idx_fh_F(iF,     jF, kF, ex)]
                        ) / dX;
                        const double Dy = (
                            (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
                            SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
                            FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
                            TWT *  fh[idx_fh_F(iF, jF,     kF, ex)]
                        ) / dY;
                        const double Dz = (
                            (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
                            SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
                            FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
                            TWT *  fh[idx_fh_F(iF, jF, kF,     ex)]
                        ) / dZ;
                        f_rhs[p] += (eps / cof) * (Dx + Dy + Dz); /* sign=+ */
                    }
                }
            }
        }
        free(fh);
        return;
    }
 #elif (ghost_width == 4)
    /* ---- r=4, cof=256, sign=-, 9pt stencil ---------------------------- */
    {
        const int ord = 4;
        const int r = 4;
        const double cof = 256.0;
        const double F8 = 8.0, F28 = 28.0, F56 = 56.0, F70 = 70.0;
        const int NO_SYMM = 0, EQ_SYMM = 1;
-        int iminF = 1, jminF = 1, kminF = 1;
+    free(fh);
        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3;
        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3;
        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3;
        const size_t nx = (size_t)ex1 + ord;
        const size_t ny = (size_t)ex2 + ord;
        const size_t nz = (size_t)ex3 + ord;
        const size_t fh_size = nx * ny * nz;
        double *fh = (double*)malloc(fh_size * sizeof(double));
        if (!fh) return;
        symmetry_bd(ord, ex, f, fh, SoA);
        /* i±4 valid: i-4>=iminF → i0>=iminF+3, i+4<=imaxF → i0<=ex1-5 */
        const int i0_lo = (iminF + 3 > 0) ? iminF + 3 : 0;
        const int j0_lo = (jminF + 3 > 0) ? jminF + 3 : 0;
        const int k0_lo = (kminF + 3 > 0) ? kminF + 3 : 0;
        const int i0_hi = imaxF - 5;
        const int j0_hi = jmaxF - 5;
        const int k0_hi = kmaxF - 5;
        if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
            for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        /* Stencil: [1,-8,28,-56,70,-56,28,-8,1] */
                        const double Dx = (
                            (fh[idx_fh_F_ord4(iF - 4, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 4, jF, kF, ex)]) -
                            F8 * (fh[idx_fh_F_ord4(iF - 3, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 3, jF, kF, ex)]) +
                            F28* (fh[idx_fh_F_ord4(iF - 2, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 2, jF, kF, ex)]) -
                            F56* (fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)]) +
                            F70*  fh[idx_fh_F_ord4(iF,     jF, kF, ex)]
                        ) / dX;
                        const double Dy = (
                            (fh[idx_fh_F_ord4(iF, jF - 4, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 4, kF, ex)]) -
                            F8 * (fh[idx_fh_F_ord4(iF, jF - 3, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 3, kF, ex)]) +
                            F28* (fh[idx_fh_F_ord4(iF, jF - 2, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 2, kF, ex)]) -
                            F56* (fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)]) +
                            F70*  fh[idx_fh_F_ord4(iF, jF,     kF, ex)]
                        ) / dY;
                        const double Dz = (
                            (fh[idx_fh_F_ord4(iF, jF, kF - 4, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 4, ex)]) -
                            F8 * (fh[idx_fh_F_ord4(iF, jF, kF - 3, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 3, ex)]) +
                            F28* (fh[idx_fh_F_ord4(iF, jF, kF - 2, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 2, ex)]) -
                            F56* (fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)]) +
                            F70*  fh[idx_fh_F_ord4(iF, jF, kF,     ex)]
                        ) / dZ;
                        f_rhs[p] -= (eps / cof) * (Dx + Dy + Dz); /* sign=- */
                    }
                }
            }
        }
        free(fh);
        return;
    }
 #elif (ghost_width == 5)
    /* ---- r=5, cof=1024, sign=+, 11pt stencil ------------------------- */
    {
        const int ord = 5;
        const int r = 5;
        const double cof = 1024.0;
        const double F10 = 10.0, F45 = 45.0, F120 = 120.0;
        const double F210 = 210.0, F252 = 252.0;
        const int NO_SYMM = 0, EQ_SYMM = 1;
        int iminF = 1, jminF = 1, kminF = 1;
        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -4;
        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -4;
        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -4;
        const size_t nx = (size_t)ex1 + ord;
        const size_t ny = (size_t)ex2 + ord;
        const size_t nz = (size_t)ex3 + ord;
        const size_t fh_size = nx * ny * nz;
        double *fh = (double*)malloc(fh_size * sizeof(double));
        if (!fh) return;
        symmetry_bd(ord, ex, f, fh, SoA);
        /* i±5 valid: i0>=iminF+4, i0<=ex1-6 */
        const int i0_lo = (iminF + 4 > 0) ? iminF + 4 : 0;
        const int j0_lo = (jminF + 4 > 0) ? jminF + 4 : 0;
        const int k0_lo = (kminF + 4 > 0) ? kminF + 4 : 0;
        const int i0_hi = imaxF - 6;
        const int j0_hi = jmaxF - 6;
        const int k0_hi = kmaxF - 6;
        if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
            for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
                        /* Stencil: [1,-10,45,-120,210,-252,210,-120,45,-10,1] */
                        const double Dx = (
                            (fh[idx_fh_F_ord5(iF - 5, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 5, jF, kF, ex)]) -
                            F10 * (fh[idx_fh_F_ord5(iF - 4, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 4, jF, kF, ex)]) +
                            F45 * (fh[idx_fh_F_ord5(iF - 3, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 3, jF, kF, ex)]) -
                            F120* (fh[idx_fh_F_ord5(iF - 2, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 2, jF, kF, ex)]) +
                            F210* (fh[idx_fh_F_ord5(iF - 1, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 1, jF, kF, ex)]) -
                            F252*  fh[idx_fh_F_ord5(iF,     jF, kF, ex)]
                        ) / dX;
                        const double Dy = (
                            (fh[idx_fh_F_ord5(iF, jF - 5, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 5, kF, ex)]) -
                            F10 * (fh[idx_fh_F_ord5(iF, jF - 4, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 4, kF, ex)]) +
                            F45 * (fh[idx_fh_F_ord5(iF, jF - 3, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 3, kF, ex)]) -
                            F120* (fh[idx_fh_F_ord5(iF, jF - 2, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 2, kF, ex)]) +
                            F210* (fh[idx_fh_F_ord5(iF, jF - 1, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 1, kF, ex)]) -
                            F252*  fh[idx_fh_F_ord5(iF, jF,     kF, ex)]
                        ) / dY;
                        const double Dz = (
                            (fh[idx_fh_F_ord5(iF, jF, kF - 5, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 5, ex)]) -
                            F10 * (fh[idx_fh_F_ord5(iF, jF, kF - 4, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 4, ex)]) +
                            F45 * (fh[idx_fh_F_ord5(iF, jF, kF - 3, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 3, ex)]) -
                            F120* (fh[idx_fh_F_ord5(iF, jF, kF - 2, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 2, ex)]) +
                            F210* (fh[idx_fh_F_ord5(iF, jF, kF - 1, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 1, ex)]) -
                            F252*  fh[idx_fh_F_ord5(iF, jF, kF,     ex)]
                        ) / dZ;
                        f_rhs[p] += (eps / cof) * (Dx + Dy + Dz); /* sign=+ */
                    }
                }
            }
        }
        free(fh);
        return;
    }
 #else
 #error "kodiss_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)"
 #endif
 }
--- a/AMSS_NCKU_source/kodiss_sh_c.C
+++ b/AMSS_NCKU_source/kodiss_sh_c.C
@@ -1,136 +0,0 @@
 #include "macrodef.h"
 #include "share_func.h"
 /*
 * kodis_sh — Kreiss-Oliger dissipation on shell patches.
 * Same stencil coefficients as Cartesian kodis. Uses symmetry_stbd.
 */
 /*
 * kodis_sh_ — add a dissipation term built from f to f_rhs, in place.
 *
 * The stencil half-width is fixed at compile time by the ghost_width macro
 * (2, 3, 4 or 5); any other value stops the build with #error. For
 * half-width n each directional stencil uses the alternating binomial
 * weights of row 2n (for n = 2: 1, -4, 6, -4, 1), is divided by the grid
 * spacing of that direction, and the x + y + z sum is scaled by
 * eps / 2^(2n). The result is subtracted for n = 2, 4 and added for
 * n = 3, 5.
 *
 * ex        grid extents in the three index directions
 * X, Y, Z   coordinate arrays; only elements 0 and 1 are read here, to get
 *           the spacing and to test whether the patch starts within one
 *           spacing of zero
 * f         input field, passed to symmetry_stbd to build the scratch copy
 * f_rhs     right-hand side, updated only at the points the loops visit
 * SoAi      two values copied into SoA and forwarded to symmetry_stbd
 * Symmetry  only compared against OCTANT (2) in this routine
 * eps       dissipation strength
 * sst       patch selector; values 2 and 4 also lower the j bound
 *           (NOTE(review): the meaning of these ids is not visible here)
 *
 * Returns nothing. If the scratch allocation fails, the function returns
 * without modifying f_rhs and without reporting the failure.
 */
 extern "C" void kodis_sh_(const int ex[3],
              const double *X, const double *Y, const double *Z,
              const double *f, double *f_rhs,
              const double SoAi[2],
              int Symmetry, double eps, int sst)
 {
    /* NOTE(review): sst IS read in every branch below; this cast only
       silences an unused-parameter warning and could be dropped. */
    (void)sst;
    /* NOTE(review): ZEO is never referenced in this function. */
    const double ZEO=0.0;
    const int ex1=ex[0], ex2=ex[1], ex3=ex[2];
    /* Spacing from the first two coordinates of each axis
       (assumes a uniform grid — TODO confirm with the caller). */
    const double dX=X[1]-X[0], dY=Y[1]-Y[0], dZ=Z[1]-Z[0];
    const int imaxF=ex1, jmaxF=ex2, kmaxF=ex3;
    /* Local copy of the two values forwarded to symmetry_stbd. */
    const double SoA[2]={SoAi[0],SoAi[1]};
 #if (ghost_width == 2)
    {
        /* Half-width 2: weights 1,-4,6,-4,1 and scale eps/16. */
        /* NOTE(review): r, NO_SYMM and kminF are declared in every branch
           but never read. */
        const int ord=2, r=2;
        const double cof=16.0, F4=4.0, F6=6.0;
        const int NO_SYMM=0, OCTANT=2;
        /* iminF/jminF stay 1 unless the patch starts within one spacing of
           zero; then they drop to 1-ord. For j this also happens when sst
           is 2 or 4. */
        int iminF=1,jminF=1,kminF=1;
        if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-1;
        if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-1;
        if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-1;
        /* Scratch buffer fh: 2*ord extra entries in the first two
           dimensions, none in the third. */
        const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
        double *fh=(double*)malloc(fh_size*sizeof(double));if(!fh)return;
        /* symmetry_stbd is defined elsewhere; presumably it copies f into
           fh and fills the padding from SoA — confirm. */
        symmetry_stbd(ord,ex,f,fh,SoA);
        /* 0-based loop bounds: i/j start at ord when iminF/jminF are 1 and
           at 0 when they were lowered; k always starts at ord. The last ord
           points in each direction are skipped. */
        const int i0_lo=(iminF+1>0)?iminF+1:0,j0_lo=(jminF+1>0)?jminF+1:0,k0_lo=2;
        const int i0_hi=imaxF-3,j0_hi=jmaxF-3,k0_hi=kmaxF-3;
        if(!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)){
            /* iF/jF/kF are the loop indices shifted by one, the form passed
               to idx_fh_stbd; p is the offset into f_rhs from idx_ex. */
            for(int k0=k0_lo;k0<=k0_hi;++k0){const int kF=k0+1;
            for(int j0=j0_lo;j0<=j0_hi;++j0){const int jF=j0+1;
            for(int i0=i0_lo;i0<=i0_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
                const double Dx=((fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)])-F4*(fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)])+F6*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dX;
                const double Dy=((fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)])-F4*(fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)])+F6*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dY;
                const double Dz=((fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)])-F4*(fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)])+F6*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dZ;
                /* Even half-width: the term is subtracted. */
                f_rhs[p]-=(eps/cof)*(Dx+Dy+Dz);
            }}}
        }
        free(fh);return;
    }
 #elif (ghost_width == 3)
    {
        /* Half-width 3: weights 1,-6,15,-20,15,-6,1 and scale eps/64.
           Same layout as the ghost_width == 2 branch. */
        const int ord=3, r=3;
        const double cof=64.0,SIX=6.0,FIT=15.0,TWT=20.0;
        const int NO_SYMM=0,OCTANT=2;
        /* Lower bounds drop to 1-ord = -2 near zero, as in the first branch. */
        int iminF=1,jminF=1,kminF=1;
        if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-2;
        if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-2;
        if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-2;
        const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
        double *fh=(double*)malloc(fh_size*sizeof(double));if(!fh)return;
        symmetry_stbd(ord,ex,f,fh,SoA);
        /* i/j start at 3 (or 0 when lowered), k at 3; the last 3 points
           in each direction are skipped. */
        const int i0_lo=(iminF+2>0)?iminF+2:0,j0_lo=(jminF+2>0)?jminF+2:0,k0_lo=3;
        const int i0_hi=imaxF-4,j0_hi=jmaxF-4,k0_hi=kmaxF-4;
        if(!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)){
            for(int k0=k0_lo;k0<=k0_hi;++k0){const int kF=k0+1;
            for(int j0=j0_lo;j0<=j0_hi;++j0){const int jF=j0+1;
            for(int i0=i0_lo;i0<=i0_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
                const double Dx=((fh[idx_fh_stbd(iF-3,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+3,jF,kF,ord,ex)])-SIX*(fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)])+FIT*(fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)])-TWT*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dX;
                const double Dy=((fh[idx_fh_stbd(iF,jF-3,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+3,kF,ord,ex)])-SIX*(fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)])+FIT*(fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)])-TWT*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dY;
                const double Dz=((fh[idx_fh_stbd(iF,jF,kF-3,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+3,ord,ex)])-SIX*(fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)])+FIT*(fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)])-TWT*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dZ;
                /* Odd half-width: the term is added. */
                f_rhs[p]+=(eps/cof)*(Dx+Dy+Dz);
            }}}
        }
        free(fh);return;
    }
 #elif (ghost_width == 4)
    {
        /* Half-width 4: weights 1,-8,28,-56,70,... and scale eps/256.
           Same layout as the ghost_width == 2 branch. */
        const int ord=4, r=4;
        const double cof=256.0,F8=8.0,F28=28.0,F56=56.0,F70=70.0;
        const int NO_SYMM=0,OCTANT=2;
        /* Lower bounds drop to 1-ord = -3 near zero, as in the first branch. */
        int iminF=1,jminF=1,kminF=1;
        if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-3;
        if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-3;
        if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-3;
        const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
        double *fh=(double*)malloc(fh_size*sizeof(double));if(!fh)return;
        symmetry_stbd(ord,ex,f,fh,SoA);
        /* i/j start at 4 (or 0 when lowered), k at 4; the last 4 points
           in each direction are skipped. */
        const int i0_lo=(iminF+3>0)?iminF+3:0,j0_lo=(jminF+3>0)?jminF+3:0,k0_lo=4;
        const int i0_hi=imaxF-5,j0_hi=jmaxF-5,k0_hi=kmaxF-5;
        if(!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)){
            for(int k0=k0_lo;k0<=k0_hi;++k0){const int kF=k0+1;
            for(int j0=j0_lo;j0<=j0_hi;++j0){const int jF=j0+1;
            for(int i0=i0_lo;i0<=i0_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
                const double Dx=((fh[idx_fh_stbd(iF-4,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+4,jF,kF,ord,ex)])-F8*(fh[idx_fh_stbd(iF-3,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+3,jF,kF,ord,ex)])+F28*(fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)])-F56*(fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)])+F70*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dX;
                const double Dy=((fh[idx_fh_stbd(iF,jF-4,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+4,kF,ord,ex)])-F8*(fh[idx_fh_stbd(iF,jF-3,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+3,kF,ord,ex)])+F28*(fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)])-F56*(fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)])+F70*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dY;
                const double Dz=((fh[idx_fh_stbd(iF,jF,kF-4,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+4,ord,ex)])-F8*(fh[idx_fh_stbd(iF,jF,kF-3,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+3,ord,ex)])+F28*(fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)])-F56*(fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)])+F70*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dZ;
                /* Even half-width: the term is subtracted. */
                f_rhs[p]-=(eps/cof)*(Dx+Dy+Dz);
            }}}
        }
        free(fh);return;
    }
 #elif (ghost_width == 5)
    {
        /* Half-width 5: weights 1,-10,45,-120,210,-252,... and scale
           eps/1024. Same layout as the ghost_width == 2 branch. */
        const int ord=5, r=5;
        const double cof=1024.0,F10=10.0,F45k=45.0,F120=120.0,F210=210.0,F252=252.0;
        const int NO_SYMM=0,OCTANT=2;
        /* Lower bounds drop to 1-ord = -4 near zero, as in the first branch. */
        int iminF=1,jminF=1,kminF=1;
        if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-4;
        if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-4;
        if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-4;
        const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
        double *fh=(double*)malloc(fh_size*sizeof(double));if(!fh)return;
        symmetry_stbd(ord,ex,f,fh,SoA);
        /* i/j start at 5 (or 0 when lowered), k at 5; the last 5 points
           in each direction are skipped. */
        const int i0_lo=(iminF+4>0)?iminF+4:0,j0_lo=(jminF+4>0)?jminF+4:0,k0_lo=5;
        const int i0_hi=imaxF-6,j0_hi=jmaxF-6,k0_hi=kmaxF-6;
        if(!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)){
            for(int k0=k0_lo;k0<=k0_hi;++k0){const int kF=k0+1;
            for(int j0=j0_lo;j0<=j0_hi;++j0){const int jF=j0+1;
            for(int i0=i0_lo;i0<=i0_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
                const double Dx=((fh[idx_fh_stbd(iF-5,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+5,jF,kF,ord,ex)])-F10*(fh[idx_fh_stbd(iF-4,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+4,jF,kF,ord,ex)])+F45k*(fh[idx_fh_stbd(iF-3,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+3,jF,kF,ord,ex)])-F120*(fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)])+F210*(fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)])-F252*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dX;
                const double Dy=((fh[idx_fh_stbd(iF,jF-5,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+5,kF,ord,ex)])-F10*(fh[idx_fh_stbd(iF,jF-4,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+4,kF,ord,ex)])+F45k*(fh[idx_fh_stbd(iF,jF-3,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+3,kF,ord,ex)])-F120*(fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)])+F210*(fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)])-F252*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dY;
                const double Dz=((fh[idx_fh_stbd(iF,jF,kF-5,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+5,ord,ex)])-F10*(fh[idx_fh_stbd(iF,jF,kF-4,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+4,ord,ex)])+F45k*(fh[idx_fh_stbd(iF,jF,kF-3,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+3,ord,ex)])-F120*(fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)])+F210*(fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)])-F252*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dZ;
                /* Odd half-width: the term is added. */
                f_rhs[p]+=(eps/cof)*(Dx+Dy+Dz);
            }}}
        }
        free(fh);return;
    }
 #else
 #error "kodiss_sh_c.C: unsupported ghost_width"
 #endif
 }
--- a/AMSS_NCKU_source/lopsided_c.C
+++ b/AMSS_NCKU_source/lopsided_c.C
@@ -1,13 +1,14 @@
 #include "macrodef.h"
 #include "tool.h"
 /*
- * C 版 lopsided — upwind (lopsided) advection derivatives.
+ * 你需要提供 symmetry_bd 的 C 版本（或 Fortran 绑到 C 的接口）。
 * Fortran: call symmetry_bd(3,ex,f,fh,SoA)
 *
- * Adds advection terms to f_rhs for all three spatial directions.
+ * 约定：
- * Uses sign-biased (one-sided) stencils with centered fallbacks.
+ *   nghost = 3
- *
+ *   ex[3]  = {ex1,ex2,ex3}
- * For lopsided, symmetry_bd ord = ghost_width (same as kodiss).
+ *   f      = 原始网格 (ex1*ex2*ex3)
 *   fh     = 扩展网格 ((ex1+3)*(ex2+3)*(ex3+3))，对应 Fortran 的 (-2:ex1, ...)
 *   SoA[3] = 输入参数
 */
 void lopsided(const int ex[3],
              const double *X, const double *Y, const double *Z,
@@ -15,577 +16,240 @@ void lopsided(const int ex[3],
              const double *Sfx, const double *Sfy, const double *Sfz,
              int Symmetry, const double SoA[3])
 {
-    const double ZEO = 0.0, ONE = 1.0;
+    const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
-    const double TWO = 2.0, F6 = 6.0, EIT = 8.0;
+    const double TWO = 2.0, F6 = 6.0, F18 = 18.0;
-    const double F3 = 3.0, F4 = 4.0, F5 = 5.0, F10 = 10.0, F12 = 12.0, F18 = 18.0;
+    const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
-    const double F9 = 9.0, F45 = 45.0, F60 = 60.0;
+
-    const double F2 = 2.0, F15 = 15.0, F24 = 24.0, F30 = 30.0, F35 = 35.0;
+    const int NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2;
-    const double F50 = 50.0, F77 = 77.0, F80 = 80.0, F100 = 100.0, F150 = 150.0;
+    (void)OCTANT; // 这里和 Fortran 一样只是定义了不用也没关系
    const double F32 = 32.0, F168 = 168.0, F672 = 672.0, F840 = 840.0;
    const double F140=140.0, F378=378.0, F420=420.0, F1050=1050.0;
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // 对应 Fortran: dX = X(2)-X(1)  （Fortran 1-based）
    // C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
-#if (ghost_width == 2)
+    const double d12dx = ONE / F12 / dX;
-    /* ---- 2nd-order lopsided --------------------------------------------- */
+    const double d12dy = ONE / F12 / dY;
-    {
+    const double d12dz = ONE / F12 / dZ;
        const int ord = 2;
        int iminF = 1, jminF = 1, kminF = 1;
        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
-        const size_t nx = (size_t)ex1 + ord;
+    // Fortran 里算了 d2dx/d2dy/d2dz 但本 subroutine 里没用到（保持一致也算出来）
-        const size_t ny = (size_t)ex2 + ord;
+    const double d2dx  = ONE / TWO / dX;
-        const size_t nz = (size_t)ex3 + ord;
+    const double d2dy  = ONE / TWO / dY;
-        const size_t fh_size = nx * ny * nz;
+    const double d2dz  = ONE / TWO / dZ;
    (void)d2dx; (void)d2dy; (void)d2dz;
-        double *fh = (double*)malloc(fh_size * sizeof(double));
+    // Fortran:
-        if (!fh) return;
+    // imax = ex(1); jmax = ex(2); kmax = ex(3)
-        symmetry_bd(ord, ex, f, fh, SoA);
+    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
-        const double d2dx  = ONE / TWO / dX;
+    // Fortran:
-        const double d2dy  = ONE / TWO / dY;
+    // imin=jmin=kmin=1; 若满足对称条件则设为 -2
-        const double d2dz  = ONE / TWO / dZ;
+    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
-        const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
+    // 分配 fh：大小 (ex1+3)*(ex2+3)*(ex3+3)
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;
-        for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
+    double *fh = (double*)malloc(fh_size * sizeof(double));
-            const int kF = k0 + 1;
+    if (!fh) return; // 内存不足：直接返回（你也可以改成 abort/报错）
            for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);
-                    /* x-direction */
+    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
-                    const double sfx = Sfx[p];
+    symmetry_bd(3, ex, f, fh, SoA);
-                    if (sfx > ZEO) {
+
-                        if (i0 <= ex1 - 3) // i+2 <= imax
+    /*
-                            f_rhs[p] += sfx * d2dx * (
+     * Fortran 主循环：
-                                -F3*fh[idx_fh_F_ord2(iF,   jF, kF, ex)] +
+     * do k=1,ex(3)-1
-                                 F4*fh[idx_fh_F_ord2(iF+1, jF, kF, ex)] -
+     * do j=1,ex(2)-1
-                                    fh[idx_fh_F_ord2(iF+2, jF, kF, ex)]);
+     * do i=1,ex(1)-1
-                        else if (i0 <= ex1 - 2) // i+1 <= imax
+     *
-                            f_rhs[p] += sfx * d2dx * (
+     * 转成 C 0-based：
-                                -fh[idx_fh_F_ord2(iF,   jF, kF, ex)] +
+     * k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2
-                                 fh[idx_fh_F_ord2(iF+1, jF, kF, ex)]);
+     *
-                    } else if (sfx < ZEO) {
+     * 并且 Fortran 里的 i/j/k 在 fh 访问时，仍然是 Fortran 索引值：
-                        if ((i0 - 1) >= iminF) // i-2 >= imin
+     * iF=i0+1, jF=j0+1, kF=k0+1
-                            f_rhs[p] -= sfx * d2dx * (
+     */
-                                -F3*fh[idx_fh_F_ord2(iF,   jF, kF, ex)] +
+    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
-                                 F4*fh[idx_fh_F_ord2(iF-1, jF, kF, ex)] -
+        const int kF = k0 + 1;
-                                    fh[idx_fh_F_ord2(iF-2, jF, kF, ex)]);
+        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
-                        else if (i0 >= iminF) // i-1 >= imin
+            const int jF = j0 + 1;
-                            f_rhs[p] -= sfx * d2dx * (
+            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
-                                -fh[idx_fh_F_ord2(iF,   jF, kF, ex)] +
+                const int iF = i0 + 1;
-                                 fh[idx_fh_F_ord2(iF-1, jF, kF, ex)]);
+
                const size_t p = idx_ex(i0, j0, k0, ex);
                // ---------------- x direction ----------------
                const double sfx = Sfx[p];
                if (sfx > ZEO) {
                    // Fortran: if(i+3 <= imax)
                    // iF+3 <= ex1  <=> i0+4 <= ex1 <=> i0 <= ex1-4
                    if (i0 <= ex1 - 4) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
-
+                    // elseif(i+2 <= imax)  <=> i0 <= ex1-3
-                    /* y-direction */
+                    else if (i0 <= ex1 - 3) {
-                    const double sfy = Sfy[p];
+                        f_rhs[p] += sfx * d12dx *
-                    if (sfy > ZEO) {
+                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
-                        if (j0 <= ex2-3)
+                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-                            f_rhs[p] += sfy * d2dy * (
+                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-                                -F3*fh[idx_fh_F_ord2(iF, jF,   kF, ex)] +
+                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                                 F4*fh[idx_fh_F_ord2(iF, jF+1, kF, ex)] -
                                    fh[idx_fh_F_ord2(iF, jF+2, kF, ex)]);
                        else if (j0 <= ex2-2)
                            f_rhs[p] += sfy * d2dy * (
                                -fh[idx_fh_F_ord2(iF, jF,   kF, ex)] +
                                 fh[idx_fh_F_ord2(iF, jF+1, kF, ex)]);
                    } else if (sfy < ZEO) {
                        if ((j0-1) >= jminF)
                            f_rhs[p] -= sfy * d2dy * (
                                -F3*fh[idx_fh_F_ord2(iF, jF,   kF, ex)] +
                                 F4*fh[idx_fh_F_ord2(iF, jF-1, kF, ex)] -
                                    fh[idx_fh_F_ord2(iF, jF-2, kF, ex)]);
                        else if (j0 >= jminF)
                            f_rhs[p] -= sfy * d2dy * (
                                -fh[idx_fh_F_ord2(iF, jF,   kF, ex)] +
                                 fh[idx_fh_F_ord2(iF, jF-1, kF, ex)]);
                    }
                    // elseif(i+1 <= imax)  <=> i0 <= ex1-2（循环里总成立）
                    else if (i0 <= ex1 - 2) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                } else if (sfx < ZEO) {
                    // Fortran: if(i-3 >= imin)
                    // (iF-3) >= iminF  <=> (i0-2) >= iminF
                    if ((i0 - 2) >= iminF) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                    // elseif(i-2 >= imin) <=> (i0-1) >= iminF
                    else if ((i0 - 1) >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    }
                    // elseif(i-1 >= imin) <=> i0 >= iminF
                    else if (i0 >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
                }
-                    /* z-direction */
+                // ---------------- y direction ----------------
-                    const double sfz = Sfz[p];
+                const double sfy = Sfy[p];
-                    if (sfz > ZEO) {
+                if (sfy > ZEO) {
-                        if (k0 <= ex3-3)
+                    // jF+3 <= ex2 <=> j0+4 <= ex2 <=> j0 <= ex2-4
-                            f_rhs[p] += sfz * d2dz * (
+                    if (j0 <= ex2 - 4) {
-                                -F3*fh[idx_fh_F_ord2(iF, jF, kF,   ex)] +
+                        f_rhs[p] += sfy * d12dy *
-                                 F4*fh[idx_fh_F_ord2(iF, jF, kF+1, ex)] -
+                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-                                    fh[idx_fh_F_ord2(iF, jF, kF+2, ex)]);
+                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
-                        else if (k0 <= ex3-2)
+                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-                            f_rhs[p] += sfz * d2dz * (
+                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
-                                -fh[idx_fh_F_ord2(iF, jF, kF,   ex)] +
+                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
-                                 fh[idx_fh_F_ord2(iF, jF, kF+1, ex)]);
+                    } else if (j0 <= ex2 - 3) {
-                    } else if (sfz < ZEO) {
+                        f_rhs[p] += sfy * d12dy *
-                        if ((k0-1) >= kminF)
+                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
-                            f_rhs[p] -= sfz * d2dz * (
+                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
-                                -F3*fh[idx_fh_F_ord2(iF, jF, kF,   ex)] +
+                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-                                 F4*fh[idx_fh_F_ord2(iF, jF, kF-1, ex)] -
+                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
-                                    fh[idx_fh_F_ord2(iF, jF, kF-2, ex)]);
+                    } else if (j0 <= ex2 - 2) {
-                        else if (k0 >= kminF)
+                        f_rhs[p] -= sfy * d12dy *
-                            f_rhs[p] -= sfz * d2dz * (
+                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
-                                -fh[idx_fh_F_ord2(iF, jF, kF,   ex)] +
+                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
-                                 fh[idx_fh_F_ord2(iF, jF, kF-1, ex)]);
+                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    }
                } else if (sfy < ZEO) {
                    if ((j0 - 2) >= jminF) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    } else if ((j0 - 1) >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    }
                }
                // ---------------- z direction ----------------
                const double sfz = Sfz[p];
                if (sfz > ZEO) {
                    if (k0 <= ex3 - 4) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    } else if (k0 <= ex3 - 3) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 <= ex3 - 2) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    }
                } else if (sfz < ZEO) {
                    if ((k0 - 2) >= kminF) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    } else if ((k0 - 1) >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    }
                }
            }
        }
        free(fh);
        return;
    }
-#elif (ghost_width == 3)
+    free(fh);
    /* ---- 4th-order lopsided (original code) ---------------------------- */
    {
        const int ord = 3;
        int iminF = 1, jminF = 1, kminF = 1;
        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
        const size_t nx = (size_t)ex1 + ord;
        const size_t ny = (size_t)ex2 + ord;
        const size_t nz = (size_t)ex3 + ord;
        const size_t fh_size = nx * ny * nz;
        double *fh = (double*)malloc(fh_size * sizeof(double));
        if (!fh) return;
        symmetry_bd(ord, ex, f, fh, SoA);
        const double d12dx = ONE / F12 / dX;
        const double d12dy = ONE / F12 / dY;
        const double d12dz = ONE / F12 / dZ;
        const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
        for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);
                    const double sfx = Sfx[p];
                    if (sfx > ZEO) {
                        if (i0 <= ex1 - 4) // i+3 <= imax
                            f_rhs[p] += sfx * d12dx * (
                                -F3 *fh[idx_fh_F(iF-1, jF, kF, ex)]
                                -F10*fh[idx_fh_F(iF,   jF, kF, ex)]
                                +F18*fh[idx_fh_F(iF+1, jF, kF, ex)]
                                -F6 *fh[idx_fh_F(iF+2, jF, kF, ex)]
                                +    fh[idx_fh_F(iF+3, jF, kF, ex)]);
                        else if (i0 <= ex1 - 3) // i+2 <= imax
                            f_rhs[p] += sfx * d12dx * (
                                fh[idx_fh_F(iF-2, jF, kF, ex)]
                                -EIT*fh[idx_fh_F(iF-1, jF, kF, ex)]
                                +EIT*fh[idx_fh_F(iF+1, jF, kF, ex)]
                                -    fh[idx_fh_F(iF+2, jF, kF, ex)]);
                        else if (i0 <= ex1 - 2) // i+1 <= imax → mirrored
                            f_rhs[p] -= sfx * d12dx * (
                                -F3 *fh[idx_fh_F(iF+1, jF, kF, ex)]
                                -F10*fh[idx_fh_F(iF,   jF, kF, ex)]
                                +F18*fh[idx_fh_F(iF-1, jF, kF, ex)]
                                -F6 *fh[idx_fh_F(iF-2, jF, kF, ex)]
                                +    fh[idx_fh_F(iF-3, jF, kF, ex)]);
                    } else if (sfx < ZEO) {
                        if ((i0 - 2) >= iminF) // i-3 >= imin
                            f_rhs[p] -= sfx * d12dx * (
                                -F3 *fh[idx_fh_F(iF+1, jF, kF, ex)]
                                -F10*fh[idx_fh_F(iF,   jF, kF, ex)]
                                +F18*fh[idx_fh_F(iF-1, jF, kF, ex)]
                                -F6 *fh[idx_fh_F(iF-2, jF, kF, ex)]
                                +    fh[idx_fh_F(iF-3, jF, kF, ex)]);
                        else if ((i0 - 1) >= iminF) // i-2 >= imin
                            f_rhs[p] += sfx * d12dx * (
                                fh[idx_fh_F(iF-2, jF, kF, ex)]
                                -EIT*fh[idx_fh_F(iF-1, jF, kF, ex)]
                                +EIT*fh[idx_fh_F(iF+1, jF, kF, ex)]
                                -    fh[idx_fh_F(iF+2, jF, kF, ex)]);
                        else if (i0 >= iminF) // i-1 >= imin → mirrored
                            f_rhs[p] += sfx * d12dx * (
                                -F3 *fh[idx_fh_F(iF-1, jF, kF, ex)]
                                -F10*fh[idx_fh_F(iF,   jF, kF, ex)]
                                +F18*fh[idx_fh_F(iF+1, jF, kF, ex)]
                                -F6 *fh[idx_fh_F(iF+2, jF, kF, ex)]
                                +    fh[idx_fh_F(iF+3, jF, kF, ex)]);
                    }
                    const double sfy = Sfy[p];
                    if (sfy > ZEO) {
                        if (j0 <= ex2-4)
                            f_rhs[p] += sfy * d12dy * (
                                -F3*fh[idx_fh_F(iF,jF-1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
                                +F18*fh[idx_fh_F(iF,jF+1,kF,ex)]-F6*fh[idx_fh_F(iF,jF+2,kF,ex)]
                                +fh[idx_fh_F(iF,jF+3,kF,ex)]);
                        else if (j0 <= ex2-3)
                            f_rhs[p] += sfy * d12dy * (fh[idx_fh_F(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F(iF,jF+1,kF,ex)]-fh[idx_fh_F(iF,jF+2,kF,ex)]);
                        else if (j0 <= ex2-2)
                            f_rhs[p] -= sfy * d12dy * (
                                -F3*fh[idx_fh_F(iF,jF+1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
                                +F18*fh[idx_fh_F(iF,jF-1,kF,ex)]-F6*fh[idx_fh_F(iF,jF-2,kF,ex)]
                                +fh[idx_fh_F(iF,jF-3,kF,ex)]);
                    } else if (sfy < ZEO) {
                        if ((j0-2) >= jminF)
                            f_rhs[p] -= sfy * d12dy * (
                                -F3*fh[idx_fh_F(iF,jF+1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
                                +F18*fh[idx_fh_F(iF,jF-1,kF,ex)]-F6*fh[idx_fh_F(iF,jF-2,kF,ex)]
                                +fh[idx_fh_F(iF,jF-3,kF,ex)]);
                        else if ((j0-1) >= jminF)
                            f_rhs[p] += sfy * d12dy * (fh[idx_fh_F(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F(iF,jF+1,kF,ex)]-fh[idx_fh_F(iF,jF+2,kF,ex)]);
                        else if (j0 >= jminF)
                            f_rhs[p] += sfy * d12dy * (
                                -F3*fh[idx_fh_F(iF,jF-1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
                                +F18*fh[idx_fh_F(iF,jF+1,kF,ex)]-F6*fh[idx_fh_F(iF,jF+2,kF,ex)]
                                +fh[idx_fh_F(iF,jF+3,kF,ex)]);
                    }
                    const double sfz = Sfz[p];
                    if (sfz > ZEO) {
                        if (k0 <= ex3-4)
                            f_rhs[p] += sfz * d12dz * (
                                -F3*fh[idx_fh_F(iF,jF,kF-1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
                                +F18*fh[idx_fh_F(iF,jF,kF+1,ex)]-F6*fh[idx_fh_F(iF,jF,kF+2,ex)]
                                +fh[idx_fh_F(iF,jF,kF+3,ex)]);
                        else if (k0 <= ex3-3)
                            f_rhs[p] += sfz * d12dz * (fh[idx_fh_F(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F(iF,jF,kF+1,ex)]-fh[idx_fh_F(iF,jF,kF+2,ex)]);
                        else if (k0 <= ex3-2)
                            f_rhs[p] -= sfz * d12dz * (
                                -F3*fh[idx_fh_F(iF,jF,kF+1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
                                +F18*fh[idx_fh_F(iF,jF,kF-1,ex)]-F6*fh[idx_fh_F(iF,jF,kF-2,ex)]
                                +fh[idx_fh_F(iF,jF,kF-3,ex)]);
                    } else if (sfz < ZEO) {
                        if ((k0-2) >= kminF)
                            f_rhs[p] -= sfz * d12dz * (
                                -F3*fh[idx_fh_F(iF,jF,kF+1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
                                +F18*fh[idx_fh_F(iF,jF,kF-1,ex)]-F6*fh[idx_fh_F(iF,jF,kF-2,ex)]
                                +fh[idx_fh_F(iF,jF,kF-3,ex)]);
                        else if ((k0-1) >= kminF)
                            f_rhs[p] += sfz * d12dz * (fh[idx_fh_F(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F(iF,jF,kF+1,ex)]-fh[idx_fh_F(iF,jF,kF+2,ex)]);
                        else if (k0 >= kminF)
                            f_rhs[p] += sfz * d12dz * (
                                -F3*fh[idx_fh_F(iF,jF,kF-1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
                                +F18*fh[idx_fh_F(iF,jF,kF+1,ex)]-F6*fh[idx_fh_F(iF,jF,kF+2,ex)]
                                +fh[idx_fh_F(iF,jF,kF+3,ex)]);
                    }
                }
            }
        }
        free(fh);
        return;
    }
 #elif (ghost_width == 4)
    /* ---- 6th-order lopsided --------------------------------------------- */
    {
        const int ord = 4;
        int iminF = 1, jminF = 1, kminF = 1;
        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3;
        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3;
        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3;
        const size_t nx = (size_t)ex1 + ord;
        const size_t ny = (size_t)ex2 + ord;
        const size_t nz = (size_t)ex3 + ord;
        const size_t fh_size = nx * ny * nz;
        double *fh = (double*)malloc(fh_size * sizeof(double));
        if (!fh) return;
        symmetry_bd(ord, ex, f, fh, SoA);
        const double d60dx = ONE / F60 / dX;
        const double d60dy = ONE / F60 / dY;
        const double d60dz = ONE / F60 / dZ;
        const double d12dx = ONE / F12 / dX;
        const double d12dy = ONE / F12 / dY;
        const double d12dz = ONE / F12 / dZ;
        const double d2dx  = ONE / TWO / dX;
        const double d2dy  = ONE / TWO / dY;
        const double d2dz  = ONE / TWO / dZ;
        const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
        for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);
                    /* ---- x-direction ---- */
                    const double sfx = Sfx[p];
                    if (sfx > ZEO) {
                        /* Primary biased: 2*f(i-2)-24*f(i-1)-35*f(i)+80*f(i+1)-30*f(i+2)+8*f(i+3)-f(i+4) */
                        if (i0 <= ex1-5 && (i0-1)>=iminF) // i+4<=imax && i-2>=imin
                            f_rhs[p] += sfx * d60dx * (
                                +F2*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]
                                -F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]
                                -F30*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]
                                -fh[idx_fh_F_ord4(iF+4,jF,kF,ex)]);
                        /* Boundary-adapted: -10*f(i-1)-77*f(i)+150*f(i+1)-100*f(i+2)+50*f(i+3)-15*f(i+4)+2*f(i+5) */
                        else if (i0 <= ex1-6 && i0 >= iminF) // i+5<=imax && i-1>=imin
                            f_rhs[p] += sfx * d60dx * (
                                -F10*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]
                                +F150*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]
                                +F50*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF+4,jF,kF,ex)]
                                +F2*fh[idx_fh_F_ord4(iF+5,jF,kF,ex)]);
                        /* Centered fallbacks */
                        else if (i0 <= ex1-4 && (i0-2)>=iminF) // 6th: i+3<=imax && i-3>=imin
                            f_rhs[p] += sfx * d60dx * (
                                -fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]
                                -F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]
                                -F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]);
                        else if (i0 <= ex1-3 && (i0-1)>=iminF) // 4th
                            f_rhs[p] += sfx * d12dx * (
                                fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]
                                +EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]);
                        else if (i0 <= ex1-2 && i0>=iminF) // 2nd
                            f_rhs[p] += sfx * d2dx * (
                                -fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
                    } else if (sfx < ZEO) {
                        if ((i0-4)>=iminF && i0<=ex1-2) // i-4>=imin && i+2<=imax
                            f_rhs[p] -= sfx * d60dx * (
                                +F2*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]
                                -F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]
                                -F30*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]
                                -fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]);
                        else if ((i0-5)>=iminF && i0<=ex1-2) // i-5>=imin && i+1<=imax
                            f_rhs[p] -= sfx * d60dx * (
                                -F10*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]
                                +F150*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]
                                +F50*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]
                                +F2*fh[idx_fh_F_ord4(iF-5,jF,kF,ex)]);
                        else if ((i0-3)>=iminF && i0<=ex1-2) // 6th centered
                            f_rhs[p] -= sfx * d60dx * (
                                -fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]
                                -F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]
                                -F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]);
                        else if ((i0-2)>=iminF && i0<=ex1-2) // 4th
                            f_rhs[p] -= sfx * d12dx * (
                                fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]
                                +EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]);
                        else if ((i0-1)>=iminF && i0<=ex1-2) // 2nd
                            f_rhs[p] -= sfx * d2dx * (
                                -fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
                    }
                    /* ---- y-direction ---- */
                    const double sfy = Sfy[p];
                    if (sfy > ZEO) {
                        if (j0<=ex2-5 && (j0-1)>=jminF)
                            f_rhs[p] += sfy * d60dy*(F2*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+4,kF,ex)]);
                        else if (j0<=ex2-6 && j0>=jminF)
                            f_rhs[p] += sfy * d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF+4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF+5,kF,ex)]);
                        else if (j0<=ex2-4 && (j0-2)>=jminF)
                            f_rhs[p] += sfy * d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
                        else if (j0<=ex2-3 && (j0-1)>=jminF)
                            f_rhs[p] += sfy * d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]);
                        else if (j0<=ex2-2 && j0>=jminF)
                            f_rhs[p] += sfy * d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
                    } else if (sfy < ZEO) {
                        if ((j0-4)>=jminF && j0<=ex2-2)
                            f_rhs[p] -= sfy * d60dy*(F2*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]);
                        else if ((j0-5)>=jminF && j0<=ex2-2)
                            f_rhs[p] -= sfy * d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF-5,kF,ex)]);
                        else if ((j0-3)>=jminF && j0<=ex2-2)
                            f_rhs[p] -= sfy * d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
                        else if ((j0-2)>=jminF && j0<=ex2-2)
                            f_rhs[p] -= sfy * d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]);
                        else if ((j0-1)>=jminF && j0<=ex2-2)
                            f_rhs[p] -= sfy * d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
                    }
                    /* ---- z-direction ---- */
                    const double sfz = Sfz[p];
                    if (sfz > ZEO) {
                        if (k0<=ex3-5 && (k0-1)>=kminF)
                            f_rhs[p] += sfz * d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+4,ex)]);
                        else if (k0<=ex3-6 && k0>=kminF)
                            f_rhs[p] += sfz * d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF+4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF+5,ex)]);
                        else if (k0<=ex3-4 && (k0-2)>=kminF)
                            f_rhs[p] += sfz * d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
                        else if (k0<=ex3-3 && (k0-1)>=kminF)
                            f_rhs[p] += sfz * d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]);
                        else if (k0<=ex3-2 && k0>=kminF)
                            f_rhs[p] += sfz * d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
                    } else if (sfz < ZEO) {
                        if ((k0-4)>=kminF && k0<=ex3-2)
                            f_rhs[p] -= sfz * d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]);
                        else if ((k0-5)>=kminF && k0<=ex3-2)
                            f_rhs[p] -= sfz * d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF-5,ex)]);
                        else if ((k0-3)>=kminF && k0<=ex3-2)
                            f_rhs[p] -= sfz * d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
                        else if ((k0-2)>=kminF && k0<=ex3-2)
                            f_rhs[p] -= sfz * d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]);
                        else if ((k0-1)>=kminF && k0<=ex3-2)
                            f_rhs[p] -= sfz * d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
                    }
                }
            }
        }
        free(fh);
        return;
    }
 #elif (ghost_width == 5)
    /* ---- 8th-order lopsided --------------------------------------------- */
    {
        const int ord = 5;
        int iminF = 1, jminF = 1, kminF = 1;
        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -4;
        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -4;
        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -4;
        const size_t nx = (size_t)ex1 + ord;
        const size_t ny = (size_t)ex2 + ord;
        const size_t nz = (size_t)ex3 + ord;
        const size_t fh_size = nx * ny * nz;
        double *fh = (double*)malloc(fh_size * sizeof(double));
        if (!fh) return;
        symmetry_bd(ord, ex, f, fh, SoA);
        const double d840dx = ONE / F840 / dX;
        const double d840dy = ONE / F840 / dY;
        const double d840dz = ONE / F840 / dZ;
        const double d60dx = ONE / F60 / dX;
        const double d60dy = ONE / F60 / dY;
        const double d60dz = ONE / F60 / dZ;
        const double d12dx = ONE / F12 / dX;
        const double d12dy = ONE / F12 / dY;
        const double d12dz = ONE / F12 / dZ;
        const double d2dx  = ONE / TWO / dX;
        const double d2dy  = ONE / TWO / dY;
        const double d2dz  = ONE / TWO / dZ;
        const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
        for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);
                    const double sfx = Sfx[p];
                    if (sfx > ZEO) {
                        /* 8th biased: -5*f(i-3)+60*f(i-2)-420*f(i-1)-378*f(i)+1050*f(i+1)-420*f(i+2)+140*f(i+3)-30*f(i+4)+3*f(i+5) */
                        if (i0 <= ex1-6 && (i0-2)>=iminF) // i+5<=imax && i-3>=imin
                            f_rhs[p] += sfx * d840dx * (
                                -F5*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]
                                -F420*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]
                                +F1050*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]
                                +F140*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]
                                +F3*fh[idx_fh_F_ord5(iF+5,jF,kF,ex)]);
                        /* 8th centered: +3*f(i-4)-32*f(i-3)+168*f(i-2)-672*f(i-1)+672*f(i+1)-168*f(i+2)+32*f(i+3)-3*f(i+4) */
                        else if (i0 <= ex1-5 && (i0-3)>=iminF)
                            f_rhs[p] += sfx * d840dx * (
                                +F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]
                                +F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]
                                +F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]
                                +F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]);
                        else if (i0 <= ex1-4 && (i0-2)>=iminF) // 6th centered
                            f_rhs[p] += sfx * d60dx * (
                                -fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]
                                -F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]
                                -F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]);
                        else if (i0 <= ex1-3 && (i0-1)>=iminF) // 4th centered
                            f_rhs[p] += sfx * d12dx * (
                                fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]
                                +EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]);
                        else if (i0 <= ex1-2 && i0>=iminF) // 2nd centered
                            f_rhs[p] += sfx * d2dx * (
                                -fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
                    } else if (sfx < ZEO) {
                        if ((i0-5)>=iminF && i0<=ex1-2) // i-5>=imin && i+3<=imax
                            f_rhs[p] -= sfx * d840dx * (
                                -F5*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]
                                -F420*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]
                                +F1050*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]
                                +F140*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]
                                +F3*fh[idx_fh_F_ord5(iF-5,jF,kF,ex)]);
                        else if ((i0-4)>=iminF && i0<=ex1-2) // 8th centered
                            f_rhs[p] -= sfx * d840dx * (
                                +F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]
                                +F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]
                                +F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]
                                +F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]);
                        else if ((i0-3)>=iminF && i0<=ex1-2) // 6th centered
                            f_rhs[p] -= sfx * d60dx * (
                                -fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]
                                -F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]
                                -F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]);
                        else if ((i0-2)>=iminF && i0<=ex1-2) // 4th centered
                            f_rhs[p] -= sfx * d12dx * (
                                fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]
                                +EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]);
                        else if ((i0-1)>=iminF && i0<=ex1-2) // 2nd centered
                            f_rhs[p] -= sfx * d2dx * (
                                -fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
                    }
                    const double sfy = Sfy[p];
                    if (sfy > ZEO) {
                        if (j0<=ex2-6 && (j0-2)>=jminF)
                            f_rhs[p] += sfy * d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF+5,kF,ex)]);
                        else if (j0<=ex2-5 && (j0-3)>=jminF)
                            f_rhs[p] += sfy * d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]);
                        else if (j0<=ex2-4 && (j0-2)>=jminF)
                            f_rhs[p] += sfy * d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]);
                        else if (j0<=ex2-3 && (j0-1)>=jminF)
                            f_rhs[p] += sfy * d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]);
                        else if (j0<=ex2-2 && j0>=jminF)
                            f_rhs[p] += sfy * d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
                    } else if (sfy < ZEO) {
                        if ((j0-5)>=jminF && j0<=ex2-2)
                            f_rhs[p] -= sfy * d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF-5,kF,ex)]);
                        else if ((j0-4)>=jminF && j0<=ex2-2)
                            f_rhs[p] -= sfy * d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]);
                        else if ((j0-3)>=jminF && j0<=ex2-2)
                            f_rhs[p] -= sfy * d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]);
                        else if ((j0-2)>=jminF && j0<=ex2-2)
                            f_rhs[p] -= sfy * d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]);
                        else if ((j0-1)>=jminF && j0<=ex2-2)
                            f_rhs[p] -= sfy * d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
                    }
                    const double sfz = Sfz[p];
                    if (sfz > ZEO) {
                        if (k0<=ex3-6 && (k0-2)>=kminF)
                            f_rhs[p] += sfz * d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF+5,ex)]);
                        else if (k0<=ex3-5 && (k0-3)>=kminF)
                            f_rhs[p] += sfz * d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]);
                        else if (k0<=ex3-4 && (k0-2)>=kminF)
                            f_rhs[p] += sfz * d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]);
                        else if (k0<=ex3-3 && (k0-1)>=kminF)
                            f_rhs[p] += sfz * d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]);
                        else if (k0<=ex3-2 && k0>=kminF)
                            f_rhs[p] += sfz * d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
                    } else if (sfz < ZEO) {
                        if ((k0-5)>=kminF && k0<=ex3-2)
                            f_rhs[p] -= sfz * d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF-5,ex)]);
                        else if ((k0-4)>=kminF && k0<=ex3-2)
                            f_rhs[p] -= sfz * d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]);
                        else if ((k0-3)>=kminF && k0<=ex3-2)
                            f_rhs[p] -= sfz * d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]);
                        else if ((k0-2)>=kminF && k0<=ex3-2)
                            f_rhs[p] -= sfz * d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]);
                        else if ((k0-1)>=kminF && k0<=ex3-2)
                            f_rhs[p] -= sfz * d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
                    }
                }
            }
        }
        free(fh);
        return;
    }
 #else
 #error "lopsided_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)"
 #endif
 }
--- a/AMSS_NCKU_source/lopsided_kodis_c.C
+++ b/AMSS_NCKU_source/lopsided_kodis_c.C
@@ -1,15 +1,8 @@
 #include "macrodef.h"
 #include "tool.h"
 /*
- * C 版 lopsided_kodis — combined upwind advection + KO dissipation.
+ * Combined advection (lopsided) + KO dissipation (kodis).
- * Uses one shared symmetry_bd buffer (ord = ghost_width for both components).
+ * Uses one shared symmetry_bd buffer per call.
 *
 * FD order selection via ghost_width:
 *   2 → 2nd-order advection + r=2 KO (cof=16, sign=-)
 *   3 → 4th-order advection + r=3 KO (cof=64, sign=+)
 *   4 → 6th-order advection + r=4 KO (cof=256, sign=-)
 *   5 → 8th-order advection + r=5 KO (cof=1024, sign=+)
 */
 void lopsided_kodis(const int ex[3],
                    const double *X, const double *Y, const double *Z,
@@ -17,370 +10,239 @@ void lopsided_kodis(const int ex[3],
                    const double *Sfx, const double *Sfy, const double *Sfz,
                    int Symmetry, const double SoA[3], double eps)
 {
-    const double ZEO = 0.0, ONE = 1.0;
+    const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
-    const double TWO = 2.0, F6 = 6.0, EIT = 8.0;
+    const double F6 = 6.0, F18 = 18.0;
-    const double F3 = 3.0, F4 = 4.0, F5 = 5.0, F10 = 10.0, F12 = 12.0, F18 = 18.0;
+    const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
-    const double F9 = 9.0, F45 = 45.0, F60 = 60.0;
+    const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
-    const double F2 = 2.0, F15 = 15.0, F24 = 24.0, F30 = 30.0, F35 = 35.0;
+    const double cof = 64.0; // 2^6
    const double F50 = 50.0, F77 = 77.0, F80 = 80.0, F100 = 100.0, F150 = 150.0;
    const double F32 = 32.0, F168 = 168.0, F672 = 672.0, F840 = 840.0;
    const double F140=140.0, F378=378.0, F420=420.0, F1050=1050.0;
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
-    const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
+    const double d12dx = ONE / F12 / dX;
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;
-#if (ghost_width == 2)
+    const int imaxF = ex1;
-    {
+    const int jmaxF = ex2;
-        const int ord = 2;
+    const int kmaxF = ex3;
        int iminF = 1, jminF = 1, kminF = 1;
        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
-        const size_t nx = (size_t)ex1 + ord;
+    int iminF = 1, jminF = 1, kminF = 1;
-        const size_t ny = (size_t)ex2 + ord;
+    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
-        const size_t nz = (size_t)ex3 + ord;
+    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
-        double *fh = (double*)malloc(nx*ny*nz*sizeof(double));
+    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
        if (!fh) return;
        symmetry_bd(ord, ex, f, fh, SoA);
-        const double d2dx = ONE/TWO/dX, d2dy = ONE/TWO/dY, d2dz = ONE/TWO/dZ;
+    // fh for Fortran-style domain (-2:ex1,-2:ex2,-2:ex3)
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;
-        /* ---- advection (2nd-order) ---- */
+    double *fh = (double*)malloc(fh_size * sizeof(double));
-        for (int k0 = 0; k0 <= ex3-2; ++k0) {
+    if (!fh) return;
            const int kF = k0+1;
            for (int j0 = 0; j0 <= ex2-2; ++j0) {
                const int jF = j0+1;
                for (int i0 = 0; i0 <= ex1-2; ++i0) {
                    const int iF = i0+1;
                    const size_t p = idx_ex(i0,j0,k0,ex);
-                    const double sfx = Sfx[p];
+    symmetry_bd(3, ex, f, fh, SoA);
-                    if (sfx > ZEO) {
+
-                        if (i0<=ex1-3) f_rhs[p] += sfx*d2dx*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord2(iF+2,jF,kF,ex)]);
+    // Advection (same stencil logic as lopsided_c.C)
-                        else if (i0<=ex1-2) f_rhs[p] += sfx*d2dx*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF+1,jF,kF,ex)]);
+    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
-                    } else if (sfx < ZEO) {
+        const int kF = k0 + 1;
-                        if ((i0-1)>=iminF) f_rhs[p] -= sfx*d2dx*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF-1,jF,kF,ex)]-fh[idx_fh_F_ord2(iF-2,jF,kF,ex)]);
+        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
-                        else if (i0>=iminF) f_rhs[p] -= sfx*d2dx*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF-1,jF,kF,ex)]);
+            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                const double sfx = Sfx[p];
                if (sfx > ZEO) {
                    if (i0 <= ex1 - 4) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    } else if (i0 <= ex1 - 3) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    } else if (i0 <= ex1 - 2) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
-                    const double sfy = Sfy[p];
+                } else if (sfx < ZEO) {
-                    if (sfy > ZEO) {
+                    if ((i0 - 2) >= iminF) {
-                        if (j0<=ex2-3) f_rhs[p] += sfy*d2dy*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord2(iF,jF+2,kF,ex)]);
+                        f_rhs[p] -= sfx * d12dx *
-                        else if (j0<=ex2-2) f_rhs[p] += sfy*d2dy*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF,jF+1,kF,ex)]);
+                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
-                    } else if (sfy < ZEO) {
+                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
-                        if ((j0-1)>=jminF) f_rhs[p] -= sfy*d2dy*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF,jF-1,kF,ex)]-fh[idx_fh_F_ord2(iF,jF-2,kF,ex)]);
+                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
-                        else if (j0>=jminF) f_rhs[p] -= sfy*d2dy*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF,jF-1,kF,ex)]);
+                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    } else if ((i0 - 1) >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    } else if (i0 >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
-                    const double sfz = Sfz[p];
+                }
-                    if (sfz > ZEO) {
+
-                        if (k0<=ex3-3) f_rhs[p] += sfz*d2dz*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord2(iF,jF,kF+2,ex)]);
+                const double sfy = Sfy[p];
-                        else if (k0<=ex3-2) f_rhs[p] += sfz*d2dz*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF,jF,kF+1,ex)]);
+                if (sfy > ZEO) {
-                    } else if (sfz < ZEO) {
+                    if (j0 <= ex2 - 4) {
-                        if ((k0-1)>=kminF) f_rhs[p] -= sfz*d2dz*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF,jF,kF-1,ex)]-fh[idx_fh_F_ord2(iF,jF,kF-2,ex)]);
+                        f_rhs[p] += sfy * d12dy *
-                        else if (k0>=kminF) f_rhs[p] -= sfz*d2dz*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF,jF,kF-1,ex)]);
+                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    } else if (j0 <= ex2 - 3) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 <= ex2 - 2) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    }
                } else if (sfy < ZEO) {
                    if ((j0 - 2) >= jminF) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    } else if ((j0 - 1) >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if (j0 >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    }
                }
                const double sfz = Sfz[p];
                if (sfz > ZEO) {
                    if (k0 <= ex3 - 4) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    } else if (k0 <= ex3 - 3) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 <= ex3 - 2) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    }
                } else if (sfz < ZEO) {
                    if ((k0 - 2) >= kminF) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    } else if ((k0 - 1) >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if (k0 >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    }
                }
            }
        }
        /* ---- KO dissipation (r=2, cof=16, sign=-) ---- */
        if (eps > ZEO) {
            const double cof = 16.0;
            const double F4k = 4.0, F6k = 6.0;
            const int i0_lo = (iminF+1>0)?iminF+1:0, j0_lo=(jminF+1>0)?jminF+1:0, k0_lo=(kminF+1>0)?kminF+1:0;
            const int i0_hi=imaxF-3, j0_hi=jmaxF-3, k0_hi=kmaxF-3;
            if (!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)) {
                for (int k0=k0_lo;k0<=k0_hi;++k0) { const int kF=k0+1;
                for (int j0=j0_lo;j0<=j0_hi;++j0) { const int jF=j0+1;
                for (int i0=i0_lo;i0<=i0_hi;++i0) { const int iF=i0+1;
                    const size_t p=idx_ex(i0,j0,k0,ex);
                    const double Dx=((fh[idx_fh_F_ord2(iF-2,jF,kF,ex)]+fh[idx_fh_F_ord2(iF+2,jF,kF,ex)])-F4k*(fh[idx_fh_F_ord2(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord2(iF+1,jF,kF,ex)])+F6k*fh[idx_fh_F_ord2(iF,jF,kF,ex)])/dX;
                    const double Dy=((fh[idx_fh_F_ord2(iF,jF-2,kF,ex)]+fh[idx_fh_F_ord2(iF,jF+2,kF,ex)])-F4k*(fh[idx_fh_F_ord2(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord2(iF,jF+1,kF,ex)])+F6k*fh[idx_fh_F_ord2(iF,jF,kF,ex)])/dY;
                    const double Dz=((fh[idx_fh_F_ord2(iF,jF,kF-2,ex)]+fh[idx_fh_F_ord2(iF,jF,kF+2,ex)])-F4k*(fh[idx_fh_F_ord2(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord2(iF,jF,kF+1,ex)])+F6k*fh[idx_fh_F_ord2(iF,jF,kF,ex)])/dZ;
                    f_rhs[p] -= (eps/cof)*(Dx+Dy+Dz);
                }}}
            }
        }
        free(fh);
        return;
    }
 #elif (ghost_width == 3)
    /* ---- 4th-order advection + r=3 KO (original code) ----------------- */
    {
        const int ord = 3;
        int iminF = 1, jminF = 1, kminF = 1;
        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
-        const size_t nx = (size_t)ex1 + ord;
+    // KO dissipation (same domain restriction as kodiss_c.C)
-        const size_t ny = (size_t)ex2 + ord;
+    if (eps > ZEO) {
-        const size_t nz = (size_t)ex3 + ord;
+        const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
-        double *fh = (double*)malloc(nx*ny*nz*sizeof(double));
+        const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
-        if (!fh) return;
+        const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
-        symmetry_bd(ord, ex, f, fh, SoA);
+        const int i0_hi = imaxF - 4; // inclusive
        const int j0_hi = jmaxF - 4;
        const int k0_hi = kmaxF - 4;
-        const double d12dx = ONE/F12/dX, d12dy = ONE/F12/dY, d12dz = ONE/F12/dZ;
+        if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
            for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
                const int kF = k0 + 1;
                for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
                    const int jF = j0 + 1;
                    for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
                        const int iF = i0 + 1;
                        const size_t p = idx_ex(i0, j0, k0, ex);
-        /* ---- advection ---- */
+                        const double Dx_term =
-        for (int k0 = 0; k0 <= ex3-2; ++k0) {
+                            ((fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
-            const int kF = k0+1;
+                             SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
-            for (int j0 = 0; j0 <= ex2-2; ++j0) {
+                             FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
-                const int jF = j0+1;
+                             TWT *  fh[idx_fh_F(iF,     jF, kF, ex)]) / dX;
                for (int i0 = 0; i0 <= ex1-2; ++i0) {
                    const int iF = i0+1;
                    const size_t p = idx_ex(i0,j0,k0,ex);
-                    const double sfx = Sfx[p];
+                        const double Dy_term =
-                    if (sfx > ZEO) {
+                            ((fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
-                        if (i0 <= ex1-4)
+                             SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
-                            f_rhs[p] += sfx*d12dx*(-F3*fh[idx_fh_F(iF-1,jF,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF+1,jF,kF,ex)]-F6*fh[idx_fh_F(iF+2,jF,kF,ex)]+fh[idx_fh_F(iF+3,jF,kF,ex)]);
+                             FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
-                        else if (i0 <= ex1-3)
+                             TWT *  fh[idx_fh_F(iF, jF,     kF, ex)]) / dY;
-                            f_rhs[p] += sfx*d12dx*(fh[idx_fh_F(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F(iF+1,jF,kF,ex)]-fh[idx_fh_F(iF+2,jF,kF,ex)]);
+
-                        else if (i0 <= ex1-2)
+                        const double Dz_term =
-                            f_rhs[p] -= sfx*d12dx*(-F3*fh[idx_fh_F(iF+1,jF,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF-1,jF,kF,ex)]-F6*fh[idx_fh_F(iF-2,jF,kF,ex)]+fh[idx_fh_F(iF-3,jF,kF,ex)]);
+                            ((fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
-                    } else if (sfx < ZEO) {
+                             SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
-                        if ((i0-2) >= iminF)
+                             FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
-                            f_rhs[p] -= sfx*d12dx*(-F3*fh[idx_fh_F(iF+1,jF,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF-1,jF,kF,ex)]-F6*fh[idx_fh_F(iF-2,jF,kF,ex)]+fh[idx_fh_F(iF-3,jF,kF,ex)]);
+                             TWT *  fh[idx_fh_F(iF, jF, kF,     ex)]) / dZ;
-                        else if ((i0-1) >= iminF)
+
-                            f_rhs[p] += sfx*d12dx*(fh[idx_fh_F(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F(iF+1,jF,kF,ex)]-fh[idx_fh_F(iF+2,jF,kF,ex)]);
+                        f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
                        else if (i0 >= iminF)
                            f_rhs[p] += sfx*d12dx*(-F3*fh[idx_fh_F(iF-1,jF,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF+1,jF,kF,ex)]-F6*fh[idx_fh_F(iF+2,jF,kF,ex)]+fh[idx_fh_F(iF+3,jF,kF,ex)]);
                    }
                    const double sfy = Sfy[p];
                    if (sfy > ZEO) {
                        if (j0<=ex2-4) f_rhs[p] += sfy*d12dy*(-F3*fh[idx_fh_F(iF,jF-1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF+1,kF,ex)]-F6*fh[idx_fh_F(iF,jF+2,kF,ex)]+fh[idx_fh_F(iF,jF+3,kF,ex)]);
                        else if (j0<=ex2-3) f_rhs[p] += sfy*d12dy*(fh[idx_fh_F(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F(iF,jF+1,kF,ex)]-fh[idx_fh_F(iF,jF+2,kF,ex)]);
                        else if (j0<=ex2-2) f_rhs[p] -= sfy*d12dy*(-F3*fh[idx_fh_F(iF,jF+1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF-1,kF,ex)]-F6*fh[idx_fh_F(iF,jF-2,kF,ex)]+fh[idx_fh_F(iF,jF-3,kF,ex)]);
                    } else if (sfy < ZEO) {
                        if ((j0-2)>=jminF) f_rhs[p] -= sfy*d12dy*(-F3*fh[idx_fh_F(iF,jF+1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF-1,kF,ex)]-F6*fh[idx_fh_F(iF,jF-2,kF,ex)]+fh[idx_fh_F(iF,jF-3,kF,ex)]);
                        else if ((j0-1)>=jminF) f_rhs[p] += sfy*d12dy*(fh[idx_fh_F(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F(iF,jF+1,kF,ex)]-fh[idx_fh_F(iF,jF+2,kF,ex)]);
                        else if (j0>=jminF) f_rhs[p] += sfy*d12dy*(-F3*fh[idx_fh_F(iF,jF-1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF+1,kF,ex)]-F6*fh[idx_fh_F(iF,jF+2,kF,ex)]+fh[idx_fh_F(iF,jF+3,kF,ex)]);
                    }
                    const double sfz = Sfz[p];
                    if (sfz > ZEO) {
                        if (k0<=ex3-4) f_rhs[p] += sfz*d12dz*(-F3*fh[idx_fh_F(iF,jF,kF-1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF,kF+1,ex)]-F6*fh[idx_fh_F(iF,jF,kF+2,ex)]+fh[idx_fh_F(iF,jF,kF+3,ex)]);
                        else if (k0<=ex3-3) f_rhs[p] += sfz*d12dz*(fh[idx_fh_F(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F(iF,jF,kF+1,ex)]-fh[idx_fh_F(iF,jF,kF+2,ex)]);
                        else if (k0<=ex3-2) f_rhs[p] -= sfz*d12dz*(-F3*fh[idx_fh_F(iF,jF,kF+1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF,kF-1,ex)]-F6*fh[idx_fh_F(iF,jF,kF-2,ex)]+fh[idx_fh_F(iF,jF,kF-3,ex)]);
                    } else if (sfz < ZEO) {
                        if ((k0-2)>=kminF) f_rhs[p] -= sfz*d12dz*(-F3*fh[idx_fh_F(iF,jF,kF+1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF,kF-1,ex)]-F6*fh[idx_fh_F(iF,jF,kF-2,ex)]+fh[idx_fh_F(iF,jF,kF-3,ex)]);
                        else if ((k0-1)>=kminF) f_rhs[p] += sfz*d12dz*(fh[idx_fh_F(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F(iF,jF,kF+1,ex)]-fh[idx_fh_F(iF,jF,kF+2,ex)]);
                        else if (k0>=kminF) f_rhs[p] += sfz*d12dz*(-F3*fh[idx_fh_F(iF,jF,kF-1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF,kF+1,ex)]-F6*fh[idx_fh_F(iF,jF,kF+2,ex)]+fh[idx_fh_F(iF,jF,kF+3,ex)]);
                    }
                }
            }
        }
        /* ---- KO dissipation (r=3, cof=64, sign=+) ---- */
        if (eps > ZEO) {
            const double cof = 64.0;
            const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
            const int i0_lo=(iminF+2>0)?iminF+2:0, j0_lo=(jminF+2>0)?jminF+2:0, k0_lo=(kminF+2>0)?kminF+2:0;
            const int i0_hi=imaxF-4, j0_hi=jmaxF-4, k0_hi=kmaxF-4;
            if (!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)) {
                for (int k0=k0_lo;k0<=k0_hi;++k0) { const int kF=k0+1;
                for (int j0=j0_lo;j0<=j0_hi;++j0) { const int jF=j0+1;
                for (int i0=i0_lo;i0<=i0_hi;++i0) { const int iF=i0+1;
                    const size_t p=idx_ex(i0,j0,k0,ex);
                    const double Dx=((fh[idx_fh_F(iF-3,jF,kF,ex)]+fh[idx_fh_F(iF+3,jF,kF,ex)])-SIX*(fh[idx_fh_F(iF-2,jF,kF,ex)]+fh[idx_fh_F(iF+2,jF,kF,ex)])+FIT*(fh[idx_fh_F(iF-1,jF,kF,ex)]+fh[idx_fh_F(iF+1,jF,kF,ex)])-TWT*fh[idx_fh_F(iF,jF,kF,ex)])/dX;
                    const double Dy=((fh[idx_fh_F(iF,jF-3,kF,ex)]+fh[idx_fh_F(iF,jF+3,kF,ex)])-SIX*(fh[idx_fh_F(iF,jF-2,kF,ex)]+fh[idx_fh_F(iF,jF+2,kF,ex)])+FIT*(fh[idx_fh_F(iF,jF-1,kF,ex)]+fh[idx_fh_F(iF,jF+1,kF,ex)])-TWT*fh[idx_fh_F(iF,jF,kF,ex)])/dY;
                    const double Dz=((fh[idx_fh_F(iF,jF,kF-3,ex)]+fh[idx_fh_F(iF,jF,kF+3,ex)])-SIX*(fh[idx_fh_F(iF,jF,kF-2,ex)]+fh[idx_fh_F(iF,jF,kF+2,ex)])+FIT*(fh[idx_fh_F(iF,jF,kF-1,ex)]+fh[idx_fh_F(iF,jF,kF+1,ex)])-TWT*fh[idx_fh_F(iF,jF,kF,ex)])/dZ;
                    f_rhs[p] += (eps/cof)*(Dx+Dy+Dz);
                }}}
            }
        }
        free(fh);
        return;
    }
 #elif (ghost_width == 4)
    {
        const int ord = 4;
        int iminF = 1, jminF = 1, kminF = 1;
        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3;
        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3;
        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3;
-        const size_t nx = (size_t)ex1 + ord;
+    free(fh);
        const size_t ny = (size_t)ex2 + ord;
        const size_t nz = (size_t)ex3 + ord;
        double *fh = (double*)malloc(nx*ny*nz*sizeof(double));
        if (!fh) return;
        symmetry_bd(ord, ex, f, fh, SoA);
        const double d60dx=ONE/F60/dX, d60dy=ONE/F60/dY, d60dz=ONE/F60/dZ;
        const double d12dx=ONE/F12/dX, d12dy=ONE/F12/dY, d12dz=ONE/F12/dZ;
        const double d2dx=ONE/TWO/dX, d2dy=ONE/TWO/dY, d2dz=ONE/TWO/dZ;
        /* ---- advection (6th-order lopsided) ---- */
        for (int k0=0;k0<=ex3-2;++k0) { const int kF=k0+1;
        for (int j0=0;j0<=ex2-2;++j0) { const int jF=j0+1;
        for (int i0=0;i0<=ex1-2;++i0) { const int iF=i0+1;
            const size_t p=idx_ex(i0,j0,k0,ex);
            /* x */
            const double sfx=Sfx[p];
            if (sfx>ZEO) {
                if (i0<=ex1-5&&(i0-1)>=iminF) f_rhs[p]+=sfx*d60dx*(+F2*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F30*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+4,jF,kF,ex)]);
                else if (i0<=ex1-6&&i0>=iminF) f_rhs[p]+=sfx*d60dx*(-F10*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+F50*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF+4,jF,kF,ex)]+F2*fh[idx_fh_F_ord4(iF+5,jF,kF,ex)]);
                else if (i0<=ex1-4&&(i0-2)>=iminF) f_rhs[p]+=sfx*d60dx*(-fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]);
                else if (i0<=ex1-3&&(i0-1)>=iminF) f_rhs[p]+=sfx*d12dx*(fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]);
                else if (i0<=ex1-2&&i0>=iminF) f_rhs[p]+=sfx*d2dx*(-fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
            } else if (sfx<ZEO) {
                if ((i0-4)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(+F2*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F30*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]);
                else if ((i0-5)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(-F10*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+F50*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]+F2*fh[idx_fh_F_ord4(iF-5,jF,kF,ex)]);
                else if ((i0-3)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(-fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]);
                else if ((i0-2)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d12dx*(fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]);
                else if ((i0-1)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d2dx*(-fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
            }
            /* y */
            const double sfy=Sfy[p];
            if (sfy>ZEO) {
                if (j0<=ex2-5&&(j0-1)>=jminF) f_rhs[p]+=sfy*d60dy*(F2*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+4,kF,ex)]);
                else if (j0<=ex2-6&&j0>=jminF) f_rhs[p]+=sfy*d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF+4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF+5,kF,ex)]);
                else if (j0<=ex2-4&&(j0-2)>=jminF) f_rhs[p]+=sfy*d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
                else if (j0<=ex2-3&&(j0-1)>=jminF) f_rhs[p]+=sfy*d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]);
                else if (j0<=ex2-2&&j0>=jminF) f_rhs[p]+=sfy*d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
            } else if (sfy<ZEO) {
                if ((j0-4)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(F2*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]);
                else if ((j0-5)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF-5,kF,ex)]);
                else if ((j0-3)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
                else if ((j0-2)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]);
                else if ((j0-1)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
            }
            /* z */
            const double sfz=Sfz[p];
            if (sfz>ZEO) {
                if (k0<=ex3-5&&(k0-1)>=kminF) f_rhs[p]+=sfz*d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+4,ex)]);
                else if (k0<=ex3-6&&k0>=kminF) f_rhs[p]+=sfz*d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF+4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF+5,ex)]);
                else if (k0<=ex3-4&&(k0-2)>=kminF) f_rhs[p]+=sfz*d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
                else if (k0<=ex3-3&&(k0-1)>=kminF) f_rhs[p]+=sfz*d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]);
                else if (k0<=ex3-2&&k0>=kminF) f_rhs[p]+=sfz*d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
            } else if (sfz<ZEO) {
                if ((k0-4)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]);
                else if ((k0-5)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF-5,ex)]);
                else if ((k0-3)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
                else if ((k0-2)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]);
                else if ((k0-1)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
            }
        }}}
        /* ---- KO dissipation (r=4, cof=256, sign=-) ---- */
        if (eps > ZEO) {
            const double cof = 256.0;
            const double F8k = 8.0, F28 = 28.0, F56 = 56.0, F70 = 70.0;
            const int i0_lo=(iminF+3>0)?iminF+3:0, j0_lo=(jminF+3>0)?jminF+3:0, k0_lo=(kminF+3>0)?kminF+3:0;
            const int i0_hi=imaxF-5, j0_hi=jmaxF-5, k0_hi=kmaxF-5;
            if (!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)) {
                for (int k0=k0_lo;k0<=k0_hi;++k0) { const int kF=k0+1;
                for (int j0=j0_lo;j0<=j0_hi;++j0) { const int jF=j0+1;
                for (int i0=i0_lo;i0<=i0_hi;++i0) { const int iF=i0+1;
                    const size_t p=idx_ex(i0,j0,k0,ex);
                    const double Dx=((fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+4,jF,kF,ex)])-F8k*(fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)])+F28*(fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+2,jF,kF,ex)])-F56*(fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)])+F70*fh[idx_fh_F_ord4(iF,jF,kF,ex)])/dX;
                    const double Dy=((fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+4,kF,ex)])-F8k*(fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)])+F28*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+2,kF,ex)])-F56*(fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)])+F70*fh[idx_fh_F_ord4(iF,jF,kF,ex)])/dY;
                    const double Dz=((fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+4,ex)])-F8k*(fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)])+F28*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+2,ex)])-F56*(fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)])+F70*fh[idx_fh_F_ord4(iF,jF,kF,ex)])/dZ;
                    f_rhs[p] -= (eps/cof)*(Dx+Dy+Dz);
                }}}
            }
        }
        free(fh);
        return;
    }
 #elif (ghost_width == 5)
    {
        const int ord = 5;
        int iminF = 1, jminF = 1, kminF = 1;
        if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -4;
        if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -4;
        if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -4;
        const size_t nx = (size_t)ex1 + ord;
        const size_t ny = (size_t)ex2 + ord;
        const size_t nz = (size_t)ex3 + ord;
        double *fh = (double*)malloc(nx*ny*nz*sizeof(double));
        if (!fh) return;
        symmetry_bd(ord, ex, f, fh, SoA);
        const double d840dx=ONE/F840/dX, d840dy=ONE/F840/dY, d840dz=ONE/F840/dZ;
        const double d60dx=ONE/F60/dX, d60dy=ONE/F60/dY, d60dz=ONE/F60/dZ;
        const double d12dx=ONE/F12/dX, d12dy=ONE/F12/dY, d12dz=ONE/F12/dZ;
        const double d2dx=ONE/TWO/dX, d2dy=ONE/TWO/dY, d2dz=ONE/TWO/dZ;
        /* ---- advection (8th-order lopsided) ---- */
        for (int k0=0;k0<=ex3-2;++k0) { const int kF=k0+1;
        for (int j0=0;j0<=ex2-2;++j0) { const int jF=j0+1;
        for (int i0=0;i0<=ex1-2;++i0) { const int iF=i0+1;
            const size_t p=idx_ex(i0,j0,k0,ex);
            const double sfx=Sfx[p];
            if (sfx>ZEO) {
                if (i0<=ex1-6&&(i0-2)>=iminF) f_rhs[p]+=sfx*d840dx*(-F5*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+F140*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]+F3*fh[idx_fh_F_ord5(iF+5,jF,kF,ex)]);
                else if (i0<=ex1-5&&(i0-3)>=iminF) f_rhs[p]+=sfx*d840dx*(+F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]);
                else if (i0<=ex1-4&&(i0-2)>=iminF) f_rhs[p]+=sfx*d60dx*(-fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]);
                else if (i0<=ex1-3&&(i0-1)>=iminF) f_rhs[p]+=sfx*d12dx*(fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]);
                else if (i0<=ex1-2&&i0>=iminF) f_rhs[p]+=sfx*d2dx*(-fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
            } else if (sfx<ZEO) {
                if ((i0-5)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d840dx*(-F5*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]+F140*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]+F3*fh[idx_fh_F_ord5(iF-5,jF,kF,ex)]);
                else if ((i0-4)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d840dx*(+F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]);
                else if ((i0-3)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(-fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]);
                else if ((i0-2)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d12dx*(fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]);
                else if ((i0-1)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d2dx*(-fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
            }
            const double sfy=Sfy[p];
            if (sfy>ZEO) {
                if (j0<=ex2-6&&(j0-2)>=jminF) f_rhs[p]+=sfy*d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF+5,kF,ex)]);
                else if (j0<=ex2-5&&(j0-3)>=jminF) f_rhs[p]+=sfy*d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]);
                else if (j0<=ex2-4&&(j0-2)>=jminF) f_rhs[p]+=sfy*d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]);
                else if (j0<=ex2-3&&(j0-1)>=jminF) f_rhs[p]+=sfy*d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]);
                else if (j0<=ex2-2&&j0>=jminF) f_rhs[p]+=sfy*d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
            } else if (sfy<ZEO) {
                if ((j0-5)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF-5,kF,ex)]);
                else if ((j0-4)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]);
                else if ((j0-3)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]);
                else if ((j0-2)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]);
                else if ((j0-1)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
            }
            const double sfz=Sfz[p];
            if (sfz>ZEO) {
                if (k0<=ex3-6&&(k0-2)>=kminF) f_rhs[p]+=sfz*d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF+5,ex)]);
                else if (k0<=ex3-5&&(k0-3)>=kminF) f_rhs[p]+=sfz*d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]);
                else if (k0<=ex3-4&&(k0-2)>=kminF) f_rhs[p]+=sfz*d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]);
                else if (k0<=ex3-3&&(k0-1)>=kminF) f_rhs[p]+=sfz*d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]);
                else if (k0<=ex3-2&&k0>=kminF) f_rhs[p]+=sfz*d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
            } else if (sfz<ZEO) {
                if ((k0-5)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF-5,ex)]);
                else if ((k0-4)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]);
                else if ((k0-3)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]);
                else if ((k0-2)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]);
                else if ((k0-1)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
            }
        }}}
        /* ---- KO dissipation (r=5, cof=1024, sign=+) ---- */
        if (eps > ZEO) {
            const double cof = 1024.0;
            const double F10k=10.0, F45k=45.0, F120=120.0, F210=210.0, F252=252.0;
            const int i0_lo=(iminF+4>0)?iminF+4:0, j0_lo=(jminF+4>0)?jminF+4:0, k0_lo=(kminF+4>0)?kminF+4:0;
            const int i0_hi=imaxF-6, j0_hi=jmaxF-6, k0_hi=kmaxF-6;
            if (!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)) {
                for (int k0=k0_lo;k0<=k0_hi;++k0) { const int kF=k0+1;
                for (int j0=j0_lo;j0<=j0_hi;++j0) { const int jF=j0+1;
                for (int i0=i0_lo;i0<=i0_hi;++i0) { const int iF=i0+1;
                    const size_t p=idx_ex(i0,j0,k0,ex);
                    const double Dx=((fh[idx_fh_F_ord5(iF-5,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+5,jF,kF,ex)])-F10k*(fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+4,jF,kF,ex)])+F45k*(fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)])-F120*(fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+2,jF,kF,ex)])+F210*(fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)])-F252*fh[idx_fh_F_ord5(iF,jF,kF,ex)])/dX;
                    const double Dy=((fh[idx_fh_F_ord5(iF,jF-5,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+5,kF,ex)])-F10k*(fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+4,kF,ex)])+F45k*(fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)])-F120*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+2,kF,ex)])+F210*(fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)])-F252*fh[idx_fh_F_ord5(iF,jF,kF,ex)])/dY;
                    const double Dz=((fh[idx_fh_F_ord5(iF,jF,kF-5,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+5,ex)])-F10k*(fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+4,ex)])+F45k*(fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)])-F120*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+2,ex)])+F210*(fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)])-F252*fh[idx_fh_F_ord5(iF,jF,kF,ex)])/dZ;
                    f_rhs[p] += (eps/cof)*(Dx+Dy+Dz);
                }}}
            }
        }
        free(fh);
        return;
    }
 #else
 #error "lopsided_kodis_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)"
 #endif
 }
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -2,90 +2,22 @@
 include makefile.inc
 -include AMSS_NCKU_build.mk
 ABE_TYPE ?= $(shell awk '/^[[:space:]]*\#define[[:space:]]+ABEtype/ {print $$3; exit}' macrodef.h 2>/dev/null)
 ifeq ($(USE_TRANSFER_CACHE),auto)
 ifeq ($(ABE_TYPE),0)
 EFFECTIVE_USE_TRANSFER_CACHE = 1
 else
 EFFECTIVE_USE_TRANSFER_CACHE = 0
 endif
 else
 EFFECTIVE_USE_TRANSFER_CACHE = $(USE_TRANSFER_CACHE)
 endif
 ifeq ($(USE_CXX_ESCALAR_KERNEL),1)
 ifeq ($(ABE_TYPE),1)
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 1
 else
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
 endif
 else
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
 endif
 ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
 ifeq ($(USE_CXX_KERNELS),0)
 $(error USE_CXX_ESCALAR_KERNEL=1 requires USE_CXX_KERNELS=1 because bssn_escalar_rhs_c.C reuses the C BSSN kernel)
 endif
 endif
 ifeq ($(USE_CXX_EM_KERNEL),1)
 ifeq ($(ABE_TYPE),3)
 EFFECTIVE_USE_CXX_EM_KERNEL = 1
 else
 EFFECTIVE_USE_CXX_EM_KERNEL = 0
 endif
 else
 EFFECTIVE_USE_CXX_EM_KERNEL = 0
 endif
 ifeq ($(EFFECTIVE_USE_CXX_EM_KERNEL),1)
 ifeq ($(USE_CXX_KERNELS),0)
 $(error USE_CXX_EM_KERNEL=1 requires USE_CXX_KERNELS=1 because bssn_em_rhs_c.C reuses the C BSSN kernel)
 endif
 endif
 EM_KERNEL_FLAG = -DBSSN_USE_EM_C_KERNEL=$(EFFECTIVE_USE_CXX_EM_KERNEL)
 ## polint(ordn=6) kernel selector:
 ##   1 (default): barycentric fast path
 ##   0          : fallback to Neville path
 POLINT6_USE_BARY ?= 1
 POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
 FMISC_SAFE_FLAG = -DUSE_FMISC_SAFE_MODE=$(USE_FMISC_SAFE_MODE)
 TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
 ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
-## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
+## Legacy GNU/OpenMPI flags
-##   make                        -> opt  (PGO-guided, maximum performance)
+CXXBASEFLAGS = -O3 -march=native -Wno-deprecated -Dfortran3 -Dnewc $(INTERP_LB_FLAGS)
-##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)
+F90BASEFLAGS = -O3 -march=native -cpp -fallow-argument-mismatch $(POLINT6_FLAG)
 PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
 ifeq ($(PGO_MODE),instrument)
-## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
+CXXAPPFLAGS = $(CXXBASEFLAGS)
-CXXAPPFLAGS = -O3 -march=x86-64-v4 -fma -fprofile-instr-generate -ipo \
+f90appflags = $(F90BASEFLAGS)
              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
              $(FMISC_SAFE_FLAG) \
              $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
 f90appflags = -O3 -march=x86-64-v4 -fma -fprofile-instr-generate -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) \
              $(FMISC_SAFE_FLAG)
 else
-## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
+CXXAPPFLAGS = $(CXXBASEFLAGS)
-## PGO has been turned off, now tested and found to be negative optimization
+f90appflags = $(F90BASEFLAGS)
 ## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
 CXXAPPFLAGS = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
              $(FMISC_SAFE_FLAG) \
              $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
 f90appflags = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) \
              $(FMISC_SAFE_FLAG)
 endif
 .SUFFIXES: .o .f90 .C .for .cu
@@ -96,10 +28,6 @@ endif
 .C.o:
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 # ShellPatch.C uses OpenMP for setupintintstuff search loops
 ShellPatch.o: ShellPatch.C
 	${CXX} $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
 .for.o:
 	$(f77) -c $< -o $@
@@ -125,42 +53,17 @@ lopsided_c.o: lopsided_c.C
 lopsided_kodis_c.o: lopsided_kodis_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 # C rewrite of shell-patch derivative kernels
 fderivs_sh_c.o: fderivs_sh_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 fdderivs_sh_c.o: fdderivs_sh_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 fderivs_shc_c.o: fderivs_shc_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 fdderivs_shc_c.o: fdderivs_shc_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 kodiss_sh_c.o: kodiss_sh_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 bssn_em_rhs_c.o: bssn_em_rhs_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 z4c_rhs_c.o: z4c_rhs_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
 #	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
-TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
+TP_OPTFLAGS = $(CXXBASEFLAGS) $(TP_OPENMP_FLAGS)
 TP_OPTFLAGS = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=$(TP_PROFDATA) \
              -Dfortran3 -Dnewc -I${MKLROOT}/include
 TwoPunctures.o: TwoPunctures.C
-	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) -c $< -o $@
 TwoPunctureABE.o: TwoPunctureABE.C
-	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) -c $< -o $@
 # Input files
@@ -169,21 +72,8 @@ ifeq ($(USE_CXX_KERNELS),0)
 # Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
 CFILES =
 else
-# C++ mode (default): C rewrite of bssn/bssn-escalar rhs and helper kernels
+# C++ mode (default): C rewrite of bssn_rhs and helper kernels
 CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
 ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
 CFILES += bssn_escalar_rhs_c.o
 endif
 ifeq ($(EFFECTIVE_USE_CXX_EM_KERNEL),1)
 CFILES += bssn_em_rhs_c.o
 endif
 endif
 ifeq ($(USE_CXX_Z4C_KERNELS),1)
 CFILES += z4c_rhs_c.o
 Z4C_F90_OBJ =
 else
 Z4C_F90_OBJ = Z4c_rhs.o
 endif
 ## RK4 kernel switch (independent from USE_CXX_KERNELS)
@@ -194,17 +84,6 @@ else
 RK4_F90_OBJ = rungekutta4_rout.o
 endif
 ## Shell-patch derivative kernel switch (independent from USE_CXX_KERNELS)
 ##   1 : use C++ rewrite of shell derivative functions (experimental)
 ##   0 : use original Fortran diff_new_sh.o and kodiss_sh.o (default)
 USE_CXX_SHELL_KERNELS ?= 0
 ifeq ($(USE_CXX_SHELL_KERNELS),1)
 CFILES += fderivs_sh_c.o fdderivs_sh_c.o fderivs_shc_c.o fdderivs_shc_c.o kodiss_sh_c.o
 SH_F90_OBJ =
 else
 SH_F90_OBJ = diff_new_sh.o kodiss_sh.o point_diff_new_sh.o
 endif
 C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
 	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
@@ -222,11 +101,11 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
 F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   prolongrestrict_cell.o prolongrestrict_vertex.o\
-	   $(RK4_F90_OBJ) diff_new.o kodiss.o\
+	   $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
-	   lopsidediff.o sommerfeld_rout.o getnp4.o $(SH_F90_OBJ)\
+	   lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
 	   shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
           getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
-           fadmquantites_bssn.o $(Z4C_F90_OBJ) Z4c_rhs_ss.o\
+           fadmquantites_bssn.o Z4c_rhs.o Z4c_rhs_ss.o point_diff_new_sh.o\
 	   cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\
 	   getnpem2.o empart.o NullNews.o fourdcurvature.o\
 	   bssn2adm.o adm_constraint.o adm_ricci_gamma.o\
@@ -292,7 +171,7 @@ ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILE
 	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
 TwoPunctureABE: $(TwoPunctureFILES)
-	$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
+	$(CLINKER) $(TP_OPTFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
 clean:
 	rm *.o ABE ABEGPU TwoPunctureABE make.log -f
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,33 +1,27 @@
-## GCC version (commented out)
+## Legacy GNU/OpenMPI toolchain configuration
 ## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
-## Intel oneAPI version with oneMKL (Optimized for performance)
+## OpenMPI wrappers are installed but may not be on PATH.
-filein  = -I/usr/include/ -I${MKLROOT}/include
+OMPI_BIN ?= /usr/lib64/openmpi/bin
-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
+## Wrapper compilers
-## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
+f90          = $(OMPI_BIN)/mpifort
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
+f77          = $(OMPI_BIN)/mpifort
 CXX          = $(OMPI_BIN)/mpicxx
 CC           = $(OMPI_BIN)/mpicc
 CLINKER      = $(OMPI_BIN)/mpicxx
-## Memory allocator switch
+## Extra include flags are not needed when using the OpenMPI wrappers.
-##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
+filein       =
 ##   0           : use system default allocator (ptmalloc)
 USE_TBBMALLOC ?= 1
 TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
 ifneq ($(wildcard $(TBBMALLOC_SO)),)
 TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
 else
 TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
 endif
 ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
-## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
+## BLAS/LAPACK backend:
-##   opt        : (default) maximum performance with PGO profile-guided optimization
+## OpenBLAS on this system provides BLAS, CBLAS and LAPACK symbols.
-##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
+BLAS_LAPACK_LIB ?= /lib64/libopenblaso.so.0
-PGO_MODE ?= opt
+LDLIBS  = $(BLAS_LAPACK_LIB) -lgfortran -lpthread -lm -ldl
 ## PGO build mode switch
 ##   off        : default legacy GNU build without PGO
 ##   instrument : accepted for compatibility, currently same as off
 PGO_MODE ?= off
 ## Interp_Points load balance profiling mode
 ##   off        : (default) no load balance instrumentation
@@ -44,50 +38,18 @@ INTERP_LB_FLAGS =
 endif
 ## Kernel implementation switch
-##   1           : use C++ rewrite of bssn_rhs and helper kernels (faster)
+##   1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
-##   0 (default): fall back to original Fortran kernels
+##   0           : fall back to original Fortran kernels
-USE_CXX_KERNELS ?= 0
+USE_CXX_KERNELS ?= 1
 ## Z4C Cartesian RHS kernel switch
 ##   1           : use C++ rewrite of Z4c_rhs (main Cartesian path faster)
 ##   0 (default): use original Fortran Z4c_rhs.o
 USE_CXX_Z4C_KERNELS ?= 0
 ## BSSN-EScalar RHS switch
 ##   1           : use BSSN-EScalar C wrapper on the normal patch path
 ##   0           : keep the original Fortran BSSN-EScalar RHS for precision-safe runs
 ## Note: this requires USE_CXX_KERNELS=1 because the wrapper reuses the C BSSN kernel.
 USE_CXX_ESCALAR_KERNEL ?= 0
 ## BSSN-EM RHS switch
 ##   1 : use BSSN-EM C kernel (bssn_em_rhs_c.C) on the normal patch path
 ##   0 : keep the original Fortran empart.f90 RHS for the EM fields (default)
 ## Note: experimental, requires USE_CXX_KERNELS=1
 USE_CXX_EM_KERNEL ?= 0
 ## Cached transfer switch
 ##   auto (default): enable for BSSN vacuum, keep other paths on the safe uncached path
 ##   1             : force cached Sync/Restrict/OutBd transfer on evolution hot paths
 ##   0             : force the original uncached transfer path
 USE_TRANSFER_CACHE ?= auto
 ## RK4 kernel implementation switch
-##   1           : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
+##   1 (default) : use C/C++ rewrite of rungekutta4_rout
-##   0 (default): use original Fortran rungekutta4_rout.o
+##   0           : use original Fortran rungekutta4_rout.o
-USE_CXX_RK4 ?= 0
+USE_CXX_RK4 ?= 1
-## fmisc conservative mode switch
+## OpenMP is only used for TwoPunctures on the legacy toolchain.
-##   1           : restore lower-optimization / legacy fmisc numerics
+TP_OPENMP_FLAGS ?= -fopenmp
 ##   0 (default): keep the optimized fmisc paths
 USE_FMISC_SAFE_MODE ?= 0
 f90          = ifx
 f77          = ifx
 CXX          = icpx
 CC           = icx
 CLINKER      = mpiicpx
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
 #CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
 CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
--- a/AMSS_NCKU_source/share_func.h
+++ b/AMSS_NCKU_source/share_func.h
@@ -46,45 +46,6 @@ static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
    return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
 }
 /*
 * Linear offset into the padded buffer fh, which mirrors the Fortran array
 * fh(0:ex1, 0:ex2, 0:ex3).
 * ord=1 => shift=0, so the Fortran indices are used as-is.
 * iF/jF/kF are Fortran indices (0..ex); iF is the fastest-varying axis.
 */
 static inline size_t idx_fh_F_ord1(int iF, int jF, int kF, const int ex[3]) {
    const size_t row   = (size_t)(ex[0] + 1);        // x extent: ex1 + ord
    const size_t plane = row * (size_t)(ex[1] + 1);  // elements per k-slab
    return (size_t)iF + (size_t)jF * row + (size_t)kF * plane;
 }
 /*
 * Linear offset into the padded buffer fh, which mirrors the Fortran array
 * fh(-3:ex1, -3:ex2, -3:ex3).
 * ord=4 => shift=3: each Fortran index is moved up by 3 so that the lowest
 * ghost index (-3) lands on offset 0 along its axis.
 */
 static inline size_t idx_fh_F_ord4(int iF, int jF, int kF, const int ex[3]) {
    const int ghost = 3;                              // ord - 1
    const size_t row   = (size_t)(ex[0] + 4);         // x extent: ex1 + ord
    const size_t plane = row * (size_t)(ex[1] + 4);   // elements per k-slab
    const size_t ix = (size_t)(iF + ghost);           // 0..ex1+3
    const size_t iy = (size_t)(jF + ghost);           // 0..ex2+3
    const size_t iz = (size_t)(kF + ghost);           // 0..ex3+3
    return ix + iy * row + iz * plane;
 }
 /*
 * Linear offset into the padded buffer fh, which mirrors the Fortran array
 * fh(-4:ex1, -4:ex2, -4:ex3).
 * ord=5 => shift=4: each Fortran index is moved up by 4 so that the lowest
 * ghost index (-4) lands on offset 0 along its axis.
 */
 static inline size_t idx_fh_F_ord5(int iF, int jF, int kF, const int ex[3]) {
    const int ghost = 4;                              // ord - 1
    const size_t row   = (size_t)(ex[0] + 5);         // x extent: ex1 + ord
    const size_t plane = row * (size_t)(ex[1] + 5);   // elements per k-slab
    const size_t ix = (size_t)(iF + ghost);           // 0..ex1+4
    const size_t iy = (size_t)(jF + ghost);           // 0..ex2+4
    const size_t iz = (size_t)(kF + ghost);           // 0..ex3+4
    return ix + iy * row + iz * plane;
 }
 /*
 * func:  (1..extc1, 1..extc2, 1..extc3)   1-based in Fortran
 * funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran
@@ -270,10 +231,7 @@ static inline void symmetry_bd(int ord,
 {
    if (ord <= 0) return;
-    if (ord == 1) {
+    /* Fast paths used by current C kernels: ord=2 (derivs), ord=3 (lopsided/KO). */
        symmetry_bd_impl(1, 0, extc, func, funcc, SoA);
        return;
    }
    if (ord == 2) {
        symmetry_bd_impl(2, 1, extc, func, funcc, SoA);
        return;
@@ -282,91 +240,7 @@ static inline void symmetry_bd(int ord,
        symmetry_bd_impl(3, 2, extc, func, funcc, SoA);
        return;
    }
    if (ord == 4) {
        symmetry_bd_impl(4, 3, extc, func, funcc, SoA);
        return;
    }
    symmetry_bd_impl(ord, ord - 1, extc, func, funcc, SoA);
 }
 /*
 * symmetry_stbd — shell-patch (staggered boundary) ghost fill.
 *
 * Copies func into the interior of the padded buffer funcc, then fills the
 * ghost layers on BOTH sides of x and y with mirrored interior values scaled
 * by a per-direction factor. There is no padding and no fill along z.
 *
 * Layout (Fortran view): funcc(-ord+1:extc1+ord, -ord+1:extc2+ord, extc3)
 *   nx = extc1 + 2*ord, ny = extc2 + 2*ord, x index fastest;
 *   Fortran index iF maps to C offset iF + (ord - 1).
 *
 * Parameters:
 *   ord   — number of ghost layers on each side of x and y
 *   extc  — interior extents {extc1, extc2, extc3}
 *   func  — input, extc1*extc2*extc3 values, x fastest
 *   funcc — output, nx*ny*extc3 values, x fastest
 *   SoA   — two factors: SoA[0] scales x reflections, SoA[1] scales y reflections
 *
 * Reflection rule, for i = 0..ord-1 (skips the boundary point, i+2 not i+1):
 *   low side : funcc(-i)        = funcc(i+2)      * s
 *   high side: funcc(ext+1+i)   = funcc(ext-1-i)  * s
 * The mirror sources lie inside the interior only when extc1 and extc2 are
 * at least ord + 1; this is not checked here.
 *
 * The work is done one z-plane at a time and every inner loop walks a row
 * with unit stride. Columns and planes never interact, so the values written
 * are the same as with any other traversal order that keeps the per-column
 * sequence (layer 0 low, layer 0 high, layer 1 low, ...).
 */
 static inline void symmetry_stbd(int ord,
                                 const int extc[3],
                                 const double *func,
                                 double *funcc,
                                 const double SoA[2])
 {
    const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
    const int nx = extc1 + 2 * ord;
    const int ny = extc2 + 2 * ord;
    const int sh = ord - 1;              /* Fortran index -> C offset shift */
    const size_t snx = (size_t)nx;
    const size_t splane = snx * (size_t)ny;
    const double s1 = SoA[0];
    const double s2 = SoA[1];
    for (int k0 = 0; k0 < extc3; ++k0) {
        double *plane = funcc + (size_t)k0 * splane;
        const double *src_plane = func + (size_t)k0 * (size_t)extc2 * (size_t)extc1;
        /* 1) + 2) per interior row: copy func, then fill that row's x ghosts */
        for (int j0 = 0; j0 < extc2; ++j0) {
            double *row = plane + (size_t)(sh + j0 + 1) * snx;
            const double *s = src_plane + (size_t)j0 * (size_t)extc1;
            /* interior: funcc(1:extc1, j, k) = func(:, j, k) */
            for (int i0 = 0; i0 < extc1; ++i0) row[sh + 1 + i0] = s[i0];
            for (int i = 0; i < ord; ++i) {
                /* left side: funcc(-i) = funcc(i+2) * s1 */
                row[sh - i] = row[sh + i + 2] * s1;
                /* right side: funcc(extc1+1+i) = funcc(extc1-1-i) * s1 */
                row[sh + extc1 + 1 + i] = row[sh + extc1 - 1 - i] * s1;
            }
        }
        /* 3) y ghosts: whole rows (including their x ghosts), unit stride */
        for (int jj = 0; jj < ord; ++jj) {
            /* bottom: funcc(:,-jj,:) = funcc(:,jj+2,:) * s2 */
            double *lo_dst = plane + (size_t)(sh - jj) * snx;
            const double *lo_src = plane + (size_t)(sh + jj + 2) * snx;
            for (int i = 0; i < nx; ++i) lo_dst[i] = lo_src[i] * s2;
            /* top: funcc(:,extc2+1+jj,:) = funcc(:,extc2-1-jj,:) * s2 */
            double *hi_dst = plane + (size_t)(sh + extc2 + 1 + jj) * snx;
            const double *hi_src = plane + (size_t)(sh + extc2 - 1 - jj) * snx;
            for (int i = 0; i < nx; ++i) hi_dst[i] = hi_src[i] * s2;
        }
    }
 }
 /*
 * Linear offset into a shell fh buffer laid out like Fortran
 * fh(-ord+1:extc1+ord, -ord+1:extc2+ord, extc3).
 * x and y carry ord ghost layers on each side, so the 0-based offset is
 * iF + ord - 1 (same for jF); z is a plain 1-based Fortran index, kF - 1.
 * Row length nx = extc1 + 2*ord, plane size nx * (extc2 + 2*ord).
 */
 static inline size_t idx_fh_stbd(int iF, int jF, int kF, int ord, const int extc[3]) {
    const int ghost_shift = ord - 1;
    const size_t row_len = (size_t)(extc[0] + 2 * ord);
    const size_t plane_len = row_len * (size_t)(extc[1] + 2 * ord);
    const size_t col = (size_t)(iF + ghost_shift);
    const size_t row = (size_t)(jF + ghost_shift);
    const size_t slab = (size_t)(kF - 1);   /* Fortran 1-based kF -> C 0-based */
    return col + row * row_len + slab * plane_len;
 }
 #endif
--- a/AMSS_NCKU_source/z4c_rhs_c.C
+++ b/AMSS_NCKU_source/z4c_rhs_c.C
@@ -1,901 +0,0 @@
 #include "macrodef.h"
 #include "bssn_rhs.h"
 #include "fmisc.h"
 #include "ricci_gamma.h"
 #include "share_func.h"
 #include "tool.h"
 #include <vector>
 #ifdef fortran1
 #define f_constraint_bssn constraint_bssn
 #define f_z4c_rhs_point z4c_rhs_point
 #endif
 #ifdef fortran2
 #define f_constraint_bssn CONSTRAINT_BSSN
 #define f_z4c_rhs_point Z4C_RHS_POINT
 #endif
 #ifdef fortran3
 #define f_constraint_bssn constraint_bssn_
 #define f_z4c_rhs_point z4c_rhs_point_
 #endif
 /*
 * Declaration of the externally defined constraint routine.
 * f_constraint_bssn is a macro: the fortran1/fortran2/fortran3 blocks above
 * map it to constraint_bssn, CONSTRAINT_BSSN or constraint_bssn_ so the C
 * linkage name matches the compiler's symbol convention.
 * Signature: one int pointer, then double pointers only, then one int
 * passed by reference. The meaning of the individual arguments is not
 * visible in this file — consult the routine's definition.
 * NOTE(review): presumably a Fortran subroutine, hence all arguments are
 * addresses — confirm against the definition.
 */
 extern "C" void f_constraint_bssn(int *, double *, double *, double *,
                                  double *, double *,
                                  double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *,
                                  double *, double *, double *, double *, double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *, double *, double *, double *, double *, double *,
                                  double *, double *, double *,
                                  int &);
 /*
 * Declaration of the externally defined single-grid-point Z4c RHS routine.
 * f_z4c_rhs_point is a macro: the fortran1/fortran2/fortran3 blocks above
 * map it to z4c_rhs_point, Z4C_RHS_POINT or z4c_rhs_point_.
 * Every argument is a double passed by reference. The argument order is
 * positional and must be matched exactly by callers.
 *
 * As bound at the call in compute_rhs_z4c_cartesian:
 *   - A11..A33, g11..g33, G1..G3, beta1..3, B1..3, Khat, Theta, alpha, chi
 *     receive the field values at one point (alpha and chi get Lap+1, chi+1);
 *   - d*, dd*, del*, deldel* receive first/second derivative values;
 *   - rA11..rA33, rchi, rG1..rG3, rg11..rg33, rKhat, rTheta are bound to the
 *     corresponding *_rhs array elements.
 * NOTE(review): the r* arguments are presumably the outputs and all others
 * inputs — confirm against the routine's definition.
 */
 extern "C" void f_z4c_rhs_point(
    double &A11,
    double &A12,
    double &A13,
    double &A22,
    double &A23,
    double &A33,
    double &alpha,
    double &B1,
    double &B2,
    double &B3,
    double &beta1,
    double &beta2,
    double &beta3,
    double &chi,
    double &chiDivFloor,
    double &da1,
    double &dA111,
    double &dA112,
    double &dA113,
    double &dA122,
    double &dA123,
    double &dA133,
    double &da2,
    double &dA211,
    double &dA212,
    double &dA213,
    double &dA222,
    double &dA223,
    double &dA233,
    double &da3,
    double &dA311,
    double &dA312,
    double &dA313,
    double &dA322,
    double &dA323,
    double &dA333,
    double &db11,
    double &dB11,
    double &db12,
    double &dB12,
    double &db13,
    double &dB13,
    double &db21,
    double &dB21,
    double &db22,
    double &dB22,
    double &db23,
    double &dB23,
    double &db31,
    double &dB31,
    double &db32,
    double &dB32,
    double &db33,
    double &dB33,
    double &dchi1,
    double &dchi2,
    double &dchi3,
    double &dda11,
    double &dda12,
    double &dda13,
    double &dda22,
    double &dda23,
    double &dda33,
    double &ddb111,
    double &ddb112,
    double &ddb113,
    double &ddb121,
    double &ddb122,
    double &ddb123,
    double &ddb131,
    double &ddb132,
    double &ddb133,
    double &ddb221,
    double &ddb222,
    double &ddb223,
    double &ddb231,
    double &ddb232,
    double &ddb233,
    double &ddb331,
    double &ddb332,
    double &ddb333,
    double &ddchi11,
    double &ddchi12,
    double &ddchi13,
    double &ddchi22,
    double &ddchi23,
    double &ddchi33,
    double &deldelg1111,
    double &deldelg1112,
    double &deldelg1113,
    double &deldelg1122,
    double &deldelg1123,
    double &deldelg1133,
    double &deldelg1211,
    double &deldelg1212,
    double &deldelg1213,
    double &deldelg1222,
    double &deldelg1223,
    double &deldelg1233,
    double &deldelg1311,
    double &deldelg1312,
    double &deldelg1313,
    double &deldelg1322,
    double &deldelg1323,
    double &deldelg1333,
    double &deldelg2211,
    double &deldelg2212,
    double &deldelg2213,
    double &deldelg2222,
    double &deldelg2223,
    double &deldelg2233,
    double &deldelg2311,
    double &deldelg2312,
    double &deldelg2313,
    double &deldelg2322,
    double &deldelg2323,
    double &deldelg2333,
    double &deldelg3311,
    double &deldelg3312,
    double &deldelg3313,
    double &deldelg3322,
    double &deldelg3323,
    double &deldelg3333,
    double &delG11,
    double &delg111,
    double &delg112,
    double &delg113,
    double &delG12,
    double &delg122,
    double &delg123,
    double &delG13,
    double &delg133,
    double &delG21,
    double &delg211,
    double &delg212,
    double &delg213,
    double &delG22,
    double &delg222,
    double &delg223,
    double &delG23,
    double &delg233,
    double &delG31,
    double &delg311,
    double &delg312,
    double &delg313,
    double &delG32,
    double &delg322,
    double &delg323,
    double &delG33,
    double &delg333,
    double &dKhat1,
    double &dKhat2,
    double &dKhat3,
    double &dTheta1,
    double &dTheta2,
    double &dTheta3,
    double &G1,
    double &g11,
    double &g12,
    double &g13,
    double &G2,
    double &g22,
    double &g23,
    double &G3,
    double &g33,
    double &kappa1,
    double &kappa2,
    double &Khat,
    double &rA11,
    double &rA12,
    double &rA13,
    double &rA22,
    double &rA23,
    double &rA33,
    double &rchi,
    double &rG1,
    double &rg11,
    double &rg12,
    double &rg13,
    double &rG2,
    double &rg22,
    double &rg23,
    double &rG3,
    double &rg33,
    double &rKhat,
    double &rTheta,
    double &Theta);
 /*
 * z4c_contract_gamma — contracted connection from a 3-metric and its first
 * derivatives at a single point.
 *
 * Inputs:
 *   gxx..gzz         six independent components of the symmetric metric
 *   g<ab>x/y/z       first-derivative values of component g<ab>; the trailing
 *                    letter presumably names the differentiation direction
 *                    (same naming as the fderivs outputs in this file) — confirm
 * Outputs (by reference):
 *   Gamxa/Gamya/Gamza = sum over i,j of gup^{ij} * Gam^{x|y|z}_{ij}
 *
 * Steps: determinant -> inverse metric -> lowered index combinations ->
 * raised connection coefficients -> contraction with the inverse metric.
 * No check is made for a vanishing determinant.
 */
 static inline void z4c_contract_gamma(
    const double gxx, const double gxy, const double gxz,
    const double gyy, const double gyz, const double gzz,
    const double gxxx, const double gxyx, const double gxzx,
    const double gyyx, const double gyzx, const double gzzx,
    const double gxxy, const double gxyy, const double gxzy,
    const double gyyy, const double gyzy, const double gzzy,
    const double gxxz, const double gxyz, const double gxzz,
    const double gyyz, const double gyzz, const double gzzz,
    double &Gamxa, double &Gamya, double &Gamza)
 {
    /* Determinant of the symmetric 3x3 metric. */
    const double metric_det = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz -
                              gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz;

    /* Inverse metric: cofactors divided by the determinant. */
    const double ixx = (gyy * gzz - gyz * gyz) / metric_det;
    const double ixy = -(gxy * gzz - gyz * gxz) / metric_det;
    const double ixz = (gxy * gyz - gyy * gxz) / metric_det;
    const double iyy = (gxx * gzz - gxz * gxz) / metric_det;
    const double iyz = -(gxx * gyz - gxy * gxz) / metric_det;
    const double izz = (gxx * gyy - gxy * gxy) / metric_det;

    /* Lowered combinations lo_<k>_<ij>: the derivative terms multiplying
       gup^{.k} in Gam^{.}_{ij}, before the common factor 0.5. */
    const double lo_x_xx = gxxx;
    const double lo_y_xx = 2.0 * gxyx - gxxy;
    const double lo_z_xx = 2.0 * gxzx - gxxz;

    const double lo_x_yy = 2.0 * gxyy - gyyx;
    const double lo_y_yy = gyyy;
    const double lo_z_yy = 2.0 * gyzy - gyyz;

    const double lo_x_zz = 2.0 * gxzz - gzzx;
    const double lo_y_zz = 2.0 * gyzz - gzzy;
    const double lo_z_zz = gzzz;

    const double lo_x_xy = gxxy;
    const double lo_y_xy = gyyx;
    const double lo_z_xy = gxzy + gyzx - gxyz;

    const double lo_x_xz = gxxz;
    const double lo_y_xz = gxyz + gyzx - gxzy;
    const double lo_z_xz = gzzx;

    const double lo_x_yz = gxyz + gxzy - gyzx;
    const double lo_y_yz = gyyz;
    const double lo_z_yz = gzzy;

    /* Raised coefficients up_<a>_<ij> = 0.5 * gup^{ak} * lo_<k>_<ij>. */
    const double up_x_xx = 0.5 * (ixx * lo_x_xx + ixy * lo_y_xx + ixz * lo_z_xx);
    const double up_y_xx = 0.5 * (ixy * lo_x_xx + iyy * lo_y_xx + iyz * lo_z_xx);
    const double up_z_xx = 0.5 * (ixz * lo_x_xx + iyz * lo_y_xx + izz * lo_z_xx);

    const double up_x_yy = 0.5 * (ixx * lo_x_yy + ixy * lo_y_yy + ixz * lo_z_yy);
    const double up_y_yy = 0.5 * (ixy * lo_x_yy + iyy * lo_y_yy + iyz * lo_z_yy);
    const double up_z_yy = 0.5 * (ixz * lo_x_yy + iyz * lo_y_yy + izz * lo_z_yy);

    const double up_x_zz = 0.5 * (ixx * lo_x_zz + ixy * lo_y_zz + ixz * lo_z_zz);
    const double up_y_zz = 0.5 * (ixy * lo_x_zz + iyy * lo_y_zz + iyz * lo_z_zz);
    const double up_z_zz = 0.5 * (ixz * lo_x_zz + iyz * lo_y_zz + izz * lo_z_zz);

    const double up_x_xy = 0.5 * (ixx * lo_x_xy + ixy * lo_y_xy + ixz * lo_z_xy);
    const double up_y_xy = 0.5 * (ixy * lo_x_xy + iyy * lo_y_xy + iyz * lo_z_xy);
    const double up_z_xy = 0.5 * (ixz * lo_x_xy + iyz * lo_y_xy + izz * lo_z_xy);

    const double up_x_xz = 0.5 * (ixx * lo_x_xz + ixy * lo_y_xz + ixz * lo_z_xz);
    const double up_y_xz = 0.5 * (ixy * lo_x_xz + iyy * lo_y_xz + iyz * lo_z_xz);
    const double up_z_xz = 0.5 * (ixz * lo_x_xz + iyz * lo_y_xz + izz * lo_z_xz);

    const double up_x_yz = 0.5 * (ixx * lo_x_yz + ixy * lo_y_yz + ixz * lo_z_yz);
    const double up_y_yz = 0.5 * (ixy * lo_x_yz + iyy * lo_y_yz + iyz * lo_z_yz);
    const double up_z_yz = 0.5 * (ixz * lo_x_yz + iyz * lo_y_yz + izz * lo_z_yz);

    /* Contract with the inverse metric; off-diagonal pairs count twice. */
    Gamxa = ixx * up_x_xx + iyy * up_x_yy + izz * up_x_zz +
            2.0 * (ixy * up_x_xy + ixz * up_x_xz + iyz * up_x_yz);
    Gamya = ixx * up_y_xx + iyy * up_y_yy + izz * up_y_zz +
            2.0 * (ixy * up_y_xy + ixz * up_y_xz + iyz * up_y_yz);
    Gamza = ixx * up_z_xx + iyy * up_z_yy + izz * up_z_zz +
            2.0 * (ixy * up_z_xy + ixz * up_z_xz + iyz * up_z_yz);
 }
 static int compute_rhs_z4c_cartesian(
    int *ex, double &T, double *X, double *Y, double *Z,
    double *chi_state, double *chi_constraints, double *trK,
    double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
    double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
    double *Gamx, double *Gamy, double *Gamz,
    double *Lap, double *betax, double *betay, double *betaz,
    double *dtSfx, double *dtSfy, double *dtSfz,
    double *TZ,
    double *chi_rhs, double *trK_rhs,
    double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
    double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
    double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
    double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
    double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
    double *TZ_rhs,
    double *rho, double *Sx, double *Sy, double *Sz,
    double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
    double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
    double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
    double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
    double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
    double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
    int &Symmetry, int &Lev, double &eps, int &co)
 {
    (void)T;
    const int nx = ex[0];
    const int ny = ex[1];
    const int nz = ex[2];
    const int all = nx * ny * nz;
    double alpn1[all], chin1[all], gxx[all], gyy[all], gzz[all];
    double chix[all], chiy[all], chiz[all], chixx[all], chixy[all], chixz[all], chiyy[all], chiyz[all], chizz[all];
    double gxxx[all], gxyx[all], gxzx[all], gyyx[all], gyzx[all], gzzx[all];
    double gxxy[all], gxyy[all], gxzy[all], gyyy[all], gyzy[all], gzzy[all];
    double gxxz[all], gxyz[all], gxzz[all], gyyz[all], gyzz[all], gzzz[all];
    double gxxxx[all], gxxxy[all], gxxxz[all], gxxyy[all], gxxyz[all], gxxzz[all];
    double gxyxx[all], gxyxy[all], gxyxz[all], gxyyy[all], gxyyz[all], gxyzz[all];
    double gxzxx[all], gxzxy[all], gxzxz[all], gxzyy[all], gxzyz[all], gxzzz[all];
    double gyyxx[all], gyyxy[all], gyyxz[all], gyyyy[all], gyyyz[all], gyyzz[all];
    double gyzxx[all], gyzxy[all], gyzxz[all], gyzyy[all], gyzyz[all], gyzzz[all];
    double gzzxx[all], gzzxy[all], gzzxz[all], gzzyy[all], gzzyz[all], gzzzz[all];
    double Lapx[all], Lapy[all], Lapz[all], Lapxx[all], Lapxy[all], Lapxz[all], Lapyy[all], Lapyz[all], Lapzz[all];
    double betaxx[all], betaxy[all], betaxz[all], betayx[all], betayy[all], betayz[all], betazx[all], betazy[all], betazz[all];
    double dBxx[all], dBxy[all], dBxz[all], dByx[all], dByy[all], dByz[all], dBzx[all], dBzy[all], dBzz[all];
    double sfxxx[all], sfxxy[all], sfxxz[all], sfxyy[all], sfxyz[all], sfxzz[all];
    double sfyxx[all], sfyxy[all], sfyxz[all], sfyyy[all], sfyyz[all], sfyzz[all];
    double sfzxx[all], sfzxy[all], sfzxz[all], sfzyy[all], sfzyz[all], sfzzz[all];
    double Gamxx[all], Gamxy[all], Gamxz[all], Gamyx[all], Gamyy[all], Gamyz[all], Gamzx[all], Gamzy[all], Gamzz[all];
    double Kx[all], Ky[all], Kz[all], TZx[all], TZy[all], TZz[all];
    double Axxx[all], Axxy[all], Axxz[all], Axyx[all], Axyy[all], Axyz[all];
    double Axzx[all], Axzy[all], Axzz[all], Ayyx[all], Ayyy[all], Ayyz[all];
    double Ayzx[all], Ayzy[all], Ayzz[all], Azzx[all], Azzy[all], Azzz[all];
 #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
    double reta[all];
 #endif
    const double SSS[3] = {1.0, 1.0, 1.0};
    const double AAS[3] = {-1.0, -1.0, 1.0};
    const double ASA[3] = {-1.0, 1.0, -1.0};
    const double SAA[3] = {1.0, -1.0, -1.0};
    const double ASS[3] = {-1.0, 1.0, 1.0};
    const double SAS[3] = {1.0, -1.0, 1.0};
    const double SSA[3] = {1.0, 1.0, -1.0};
    const double ONE = 1.0;
    const double TWO = 2.0;
    const double ZEO = 0.0;
    double chiDivfloor = 1.0e-5;
    double kappa1 = 2.0e-2;
    double kappa2 = 0.0;
    double FF = 0.75;
    double eta = 2.0;
    for (int idx = 0; idx < all; ++idx)
    {
        alpn1[idx] = Lap[idx] + ONE;
        chin1[idx] = chi_state[idx] + ONE;
        gxx[idx] = dxx[idx] + ONE;
        gyy[idx] = dyy[idx] + ONE;
        gzz[idx] = dzz[idx] + ONE;
    }
    fderivs(ex, betax, betaxx, betaxy, betaxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, betay, betayx, betayy, betayz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, betaz, betazx, betazy, betazz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, dtSfx, dBxx, dBxy, dBxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, dtSfy, dByx, dByy, dByz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, dtSfz, dBzx, dBzy, dBzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, chi_state, chix, chiy, chiz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, dxx, gxxx, gxxy, gxxz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, gxy, gxyx, gxyy, gxyz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, gxz, gxzx, gxzy, gxzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, dyy, gyyx, gyyy, gyyz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, gyz, gyzx, gyzy, gyzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
    fderivs(ex, dzz, gzzx, gzzy, gzzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, dxx, gxxxx, gxxxy, gxxxz, gxxyy, gxxyz, gxxzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, dyy, gyyxx, gyyxy, gyyxz, gyyyy, gyyyz, gyyzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, dzz, gzzxx, gzzxy, gzzxz, gzzyy, gzzyz, gzzzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, gxy, gxyxx, gxyxy, gxyxz, gxyyy, gxyyz, gxyzz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, gxz, gxzxx, gxzxy, gxzxz, gxzyy, gxzyz, gxzzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
    fdderivs(ex, gyz, gyzxx, gyzxy, gyzxz, gyzyy, gyzyz, gyzzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
    fderivs(ex, Gamx, Gamxx, Gamxy, Gamxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Gamy, Gamyx, Gamyy, Gamyz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Gamz, Gamzx, Gamzy, Gamzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, Lap, Lapx, Lapy, Lapz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, trK, Kx, Ky, Kz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, TZ, TZx, TZy, TZz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, betax, sfxxx, sfxxy, sfxxz, sfxyy, sfxyz, sfxzz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, betay, sfyxx, sfyxy, sfyxz, sfyyy, sfyyz, sfyzz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, betaz, sfzxx, sfzxy, sfzxz, sfzyy, sfzyz, sfzzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
    fdderivs(ex, chi_state, chixx, chixy, chixz, chiyy, chiyz, chizz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, Lap, Lapxx, Lapxy, Lapxz, Lapyy, Lapyz, Lapzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Axx, Axxx, Axxy, Axxz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Axy, Axyx, Axyy, Axyz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Axz, Axzx, Axzy, Axzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, Ayy, Ayyx, Ayyy, Ayyz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Ayz, Ayzx, Ayzy, Ayzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
    fderivs(ex, Azz, Azzx, Azzy, Azzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    for (int idx = 0; idx < all; ++idx)
    {
        double point_kappa1 = 0.0;
        f_z4c_rhs_point(
            Axx[idx], Axy[idx], Axz[idx], Ayy[idx], Ayz[idx], Azz[idx],
            alpn1[idx], dtSfx[idx], dtSfy[idx], dtSfz[idx],
            betax[idx], betay[idx], betaz[idx],
            chin1[idx], chiDivfloor,
            Lapx[idx],
            Axxx[idx], Axyx[idx], Axzx[idx], Ayyx[idx], Ayzx[idx], Azzx[idx],
            Lapy[idx],
            Axxy[idx], Axyy[idx], Axzy[idx], Ayyy[idx], Ayzy[idx], Azzy[idx],
            Lapz[idx],
            Axxz[idx], Axyz[idx], Axzz[idx], Ayyz[idx], Ayzz[idx], Azzz[idx],
            betaxx[idx], dBxx[idx], betayx[idx], dByx[idx], betazx[idx], dBzx[idx],
            betaxy[idx], dBxy[idx], betayy[idx], dByy[idx], betazy[idx], dBzy[idx],
            betaxz[idx], dBxz[idx], betayz[idx], dByz[idx], betazz[idx], dBzz[idx],
            chix[idx], chiy[idx], chiz[idx],
            Lapxx[idx], Lapxy[idx], Lapxz[idx], Lapyy[idx], Lapyz[idx], Lapzz[idx],
            sfxxx[idx], sfyxx[idx], sfzxx[idx],
            sfxxy[idx], sfyxy[idx], sfzxy[idx],
            sfxxz[idx], sfyxz[idx], sfzxz[idx],
            sfxyy[idx], sfyyy[idx], sfzyy[idx],
            sfxyz[idx], sfyyz[idx], sfzyz[idx],
            sfxzz[idx], sfyzz[idx], sfzzz[idx],
            chixx[idx], chixy[idx], chixz[idx], chiyy[idx], chiyz[idx], chizz[idx],
            gxxxx[idx], gxyxx[idx], gxzxx[idx], gyyxx[idx], gyzxx[idx], gzzxx[idx],
            gxxxy[idx], gxyxy[idx], gxzxy[idx], gyyxy[idx], gyzxy[idx], gzzxy[idx],
            gxxxz[idx], gxyxz[idx], gxzxz[idx], gyyxz[idx], gyzxz[idx], gzzxz[idx],
            gxxyy[idx], gxyyy[idx], gxzyy[idx], gyyyy[idx], gyzyy[idx], gzzyy[idx],
            gxxyz[idx], gxyyz[idx], gxzyz[idx], gyyyz[idx], gyzyz[idx], gzzyz[idx],
            gxxzz[idx], gxyzz[idx], gxzzz[idx], gyyzz[idx], gyzzz[idx], gzzzz[idx],
            Gamxx[idx], gxxx[idx], gxyx[idx], gxzx[idx],
            Gamyx[idx], gyyx[idx], gyzx[idx],
            Gamzx[idx], gzzx[idx],
            Gamxy[idx], gxxy[idx], gxyy[idx], gxzy[idx],
            Gamyy[idx], gyyy[idx], gyzy[idx],
            Gamzy[idx], gzzy[idx],
            Gamxz[idx], gxxz[idx], gxyz[idx], gxzz[idx],
            Gamyz[idx], gyyz[idx], gyzz[idx],
            Gamzz[idx], gzzz[idx],
            Kx[idx], Ky[idx], Kz[idx],
            TZx[idx], TZy[idx], TZz[idx],
            Gamx[idx], gxx[idx], gxy[idx], gxz[idx],
            Gamy[idx], gyy[idx], gyz[idx],
            Gamz[idx], gzz[idx],
            point_kappa1, kappa2,
            trK[idx],
            Axx_rhs[idx], Axy_rhs[idx], Axz_rhs[idx], Ayy_rhs[idx], Ayz_rhs[idx], Azz_rhs[idx],
            chi_rhs[idx],
            Gamx_rhs[idx], gxx_rhs[idx], gxy_rhs[idx], gxz_rhs[idx],
            Gamy_rhs[idx], gyy_rhs[idx], gyz_rhs[idx],
            Gamz_rhs[idx], gzz_rhs[idx], trK_rhs[idx], TZ_rhs[idx], TZ[idx]);
    }
    for (int idx = 0; idx < all; ++idx)
        Lap_rhs[idx] = -TWO * alpn1[idx] * trK[idx];
 #if (GAUGE == 0)
    for (int idx = 0; idx < all; ++idx)
    {
        betax_rhs[idx] = FF * dtSfx[idx];
        betay_rhs[idx] = FF * dtSfy[idx];
        betaz_rhs[idx] = FF * dtSfz[idx];
        dtSfx_rhs[idx] = Gamx_rhs[idx] - eta * dtSfx[idx];
        dtSfy_rhs[idx] = Gamy_rhs[idx] - eta * dtSfy[idx];
        dtSfz_rhs[idx] = Gamz_rhs[idx] - eta * dtSfz[idx];
    }
 #elif (GAUGE == 1)
    for (int idx = 0; idx < all; ++idx)
    {
        betax_rhs[idx] = Gamx[idx] - eta * betax[idx];
        betay_rhs[idx] = Gamy[idx] - eta * betay[idx];
        betaz_rhs[idx] = Gamz[idx] - eta * betaz[idx];
        dtSfx_rhs[idx] = ZEO;
        dtSfy_rhs[idx] = ZEO;
        dtSfz_rhs[idx] = ZEO;
    }
 #elif (GAUGE == 2)
    /* Variable-eta gamma-driver, chi-sqrt denominator */
    for (int idx = 0; idx < all; ++idx)
    {
        const double chin1i = chin1[idx];
        const double det = gxx[idx] * gyy[idx] * gzz[idx]
                         + gxy[idx] * gyz[idx] * gxz[idx] * 2.0
                         - gxz[idx] * gyy[idx] * gxz[idx]
                         - gxy[idx] * gxy[idx] * gzz[idx]
                         - gxx[idx] * gyz[idx] * gyz[idx];
        const double idet = ONE / det;
        const double upxx = (gyy[idx] * gzz[idx] - gyz[idx] * gyz[idx]) * idet;
        const double upxy = -(gxy[idx] * gzz[idx] - gyz[idx] * gxz[idx]) * idet;
        const double upxz = (gxy[idx] * gyz[idx] - gyy[idx] * gxz[idx]) * idet;
        const double upyy = (gxx[idx] * gzz[idx] - gxz[idx] * gxz[idx]) * idet;
        const double upyz = -(gxx[idx] * gyz[idx] - gxy[idx] * gxz[idx]) * idet;
        const double upzz = (gxx[idx] * gyy[idx] - gxy[idx] * gxy[idx]) * idet;
        const double grdchi2 =
            upxx * chix[idx] * chix[idx] + upyy * chiy[idx] * chiy[idx] + upzz * chiz[idx] * chiz[idx]
          + TWO * (upxy * chix[idx] * chiy[idx] + upxz * chix[idx] * chiz[idx] + upyz * chiy[idx] * chiz[idx]);
        const double sqchi = sqrt(chin1i);
        reta[idx] = 1.31 / TWO * sqrt(grdchi2 / chin1i) / ((ONE - sqchi) * (ONE - sqchi));
        betax_rhs[idx] = FF * dtSfx[idx];
        betay_rhs[idx] = FF * dtSfy[idx];
        betaz_rhs[idx] = FF * dtSfz[idx];
        dtSfx_rhs[idx] = Gamx_rhs[idx] - reta[idx] * dtSfx[idx];
        dtSfy_rhs[idx] = Gamy_rhs[idx] - reta[idx] * dtSfy[idx];
        dtSfz_rhs[idx] = Gamz_rhs[idx] - reta[idx] * dtSfz[idx];
    }
 #elif (GAUGE == 3)
    /* Variable-eta gamma-driver, chi-linear denominator */
    for (int idx = 0; idx < all; ++idx)
    {
        const double chin1i = chin1[idx];
        const double det = gxx[idx] * gyy[idx] * gzz[idx]
                         + gxy[idx] * gyz[idx] * gxz[idx] * 2.0
                         - gxz[idx] * gyy[idx] * gxz[idx]
                         - gxy[idx] * gxy[idx] * gzz[idx]
                         - gxx[idx] * gyz[idx] * gyz[idx];
        const double idet = ONE / det;
        const double upxx = (gyy[idx] * gzz[idx] - gyz[idx] * gyz[idx]) * idet;
        const double upxy = -(gxy[idx] * gzz[idx] - gyz[idx] * gxz[idx]) * idet;
        const double upxz = (gxy[idx] * gyz[idx] - gyy[idx] * gxz[idx]) * idet;
        const double upyy = (gxx[idx] * gzz[idx] - gxz[idx] * gxz[idx]) * idet;
        const double upyz = -(gxx[idx] * gyz[idx] - gxy[idx] * gxz[idx]) * idet;
        const double upzz = (gxx[idx] * gyy[idx] - gxy[idx] * gxy[idx]) * idet;
        const double grdchi2 =
            upxx * chix[idx] * chix[idx] + upyy * chiy[idx] * chiy[idx] + upzz * chiz[idx] * chiz[idx]
          + TWO * (upxy * chix[idx] * chiy[idx] + upxz * chix[idx] * chiz[idx] + upyz * chiy[idx] * chiz[idx]);
        reta[idx] = 1.31 / TWO * sqrt(grdchi2 / chin1i) / ((ONE - chin1i) * (ONE - chin1i));
        betax_rhs[idx] = FF * dtSfx[idx];
        betay_rhs[idx] = FF * dtSfy[idx];
        betaz_rhs[idx] = FF * dtSfz[idx];
        dtSfx_rhs[idx] = Gamx_rhs[idx] - reta[idx] * dtSfx[idx];
        dtSfy_rhs[idx] = Gamy_rhs[idx] - reta[idx] * dtSfy[idx];
        dtSfz_rhs[idx] = Gamz_rhs[idx] - reta[idx] * dtSfz[idx];
    }
 #elif (GAUGE == 4)
    /* Variable-eta gamma-driver, first-order, chi-sqrt denominator */
    for (int idx = 0; idx < all; ++idx)
    {
        const double chin1i = chin1[idx];
        const double det = gxx[idx] * gyy[idx] * gzz[idx]
                         + gxy[idx] * gyz[idx] * gxz[idx] * 2.0
                         - gxz[idx] * gyy[idx] * gxz[idx]
                         - gxy[idx] * gxy[idx] * gzz[idx]
                         - gxx[idx] * gyz[idx] * gyz[idx];
        const double idet = ONE / det;
        const double upxx = (gyy[idx] * gzz[idx] - gyz[idx] * gyz[idx]) * idet;
        const double upxy = -(gxy[idx] * gzz[idx] - gyz[idx] * gxz[idx]) * idet;
        const double upxz = (gxy[idx] * gyz[idx] - gyy[idx] * gxz[idx]) * idet;
        const double upyy = (gxx[idx] * gzz[idx] - gxz[idx] * gxz[idx]) * idet;
        const double upyz = -(gxx[idx] * gyz[idx] - gxy[idx] * gxz[idx]) * idet;
        const double upzz = (gxx[idx] * gyy[idx] - gxy[idx] * gxy[idx]) * idet;
        const double grdchi2 =
            upxx * chix[idx] * chix[idx] + upyy * chiy[idx] * chiy[idx] + upzz * chiz[idx] * chiz[idx]
          + TWO * (upxy * chix[idx] * chiy[idx] + upxz * chix[idx] * chiz[idx] + upyz * chiy[idx] * chiz[idx]);
        const double sqchi = sqrt(chin1i);
        reta[idx] = 1.31 / TWO * sqrt(grdchi2 / chin1i) / ((ONE - sqchi) * (ONE - sqchi));
        betax_rhs[idx] = Gamx_rhs[idx] - reta[idx] * betax[idx];
        betay_rhs[idx] = Gamy_rhs[idx] - reta[idx] * betay[idx];
        betaz_rhs[idx] = Gamz_rhs[idx] - reta[idx] * betaz[idx];
        dtSfx_rhs[idx] = ZEO;
        dtSfy_rhs[idx] = ZEO;
        dtSfz_rhs[idx] = ZEO;
    }
 #elif (GAUGE == 5)
    /* Variable-eta gamma-driver, first-order, chi-linear denominator */
    for (int idx = 0; idx < all; ++idx)
    {
        const double chin1i = chin1[idx];
        const double det = gxx[idx] * gyy[idx] * gzz[idx]
                         + gxy[idx] * gyz[idx] * gxz[idx] * 2.0
                         - gxz[idx] * gyy[idx] * gxz[idx]
                         - gxy[idx] * gxy[idx] * gzz[idx]
                         - gxx[idx] * gyz[idx] * gyz[idx];
        const double idet = ONE / det;
        const double upxx = (gyy[idx] * gzz[idx] - gyz[idx] * gyz[idx]) * idet;
        const double upxy = -(gxy[idx] * gzz[idx] - gyz[idx] * gxz[idx]) * idet;
        const double upxz = (gxy[idx] * gyz[idx] - gyy[idx] * gxz[idx]) * idet;
        const double upyy = (gxx[idx] * gzz[idx] - gxz[idx] * gxz[idx]) * idet;
        const double upyz = -(gxx[idx] * gyz[idx] - gxy[idx] * gxz[idx]) * idet;
        const double upzz = (gxx[idx] * gyy[idx] - gxy[idx] * gxy[idx]) * idet;
        const double grdchi2 =
            upxx * chix[idx] * chix[idx] + upyy * chiy[idx] * chiy[idx] + upzz * chiz[idx] * chiz[idx]
          + TWO * (upxy * chix[idx] * chiy[idx] + upxz * chix[idx] * chiz[idx] + upyz * chiy[idx] * chiz[idx]);
        reta[idx] = 1.31 / TWO * sqrt(grdchi2 / chin1i) / ((ONE - chin1i) * (ONE - chin1i));
        betax_rhs[idx] = Gamx_rhs[idx] - reta[idx] * betax[idx];
        betay_rhs[idx] = Gamy_rhs[idx] - reta[idx] * betay[idx];
        betaz_rhs[idx] = Gamz_rhs[idx] - reta[idx] * betaz[idx];
        dtSfx_rhs[idx] = ZEO;
        dtSfy_rhs[idx] = ZEO;
        dtSfz_rhs[idx] = ZEO;
    }
 #elif (GAUGE == 6 || GAUGE == 7)
    {
        /* Jason's position-dependent damping: rational (6) or exponential (7) */
        int BHN = 0;
        double Porg[9] = {0.0};
        double Mass[3] = {0.0};
    #ifdef fortran1
        extern "C" { void getpbh(int &, double *, double *); }
    #elif defined(fortran2)
        extern "C" { void GETPBH(int &, double *, double *); }
    #else
        extern "C" { void getpbh_(int &, double *, double *); }
    #endif
        {
    #ifdef fortran1
            getpbh(BHN, Porg, Mass);
    #elif defined(fortran2)
            GETPBH(BHN, Porg, Mass);
    #else
            getpbh_(BHN, Porg, Mass);
    #endif
        }
        if (BHN == 2)
        {
            const double M = Mass[0] + Mass[1];
            const double A = 2.0 / M;
            const double w1 = 12.0, w2 = 12.0;
            const double C1 = 1.0 / Mass[0] - A;
            const double C2 = 1.0 / Mass[1] - A;
            const double BH_sep2 = (Porg[3] - Porg[0]) * (Porg[3] - Porg[0])
                                 + (Porg[4] - Porg[1]) * (Porg[4] - Porg[1])
                                 + (Porg[5] - Porg[2]) * (Porg[5] - Porg[2]);
            const double inv_BH_sep2 = 1.0 / BH_sep2;
            for (int k0 = 0; k0 < nz; ++k0) {
            for (int j0 = 0; j0 < ny; ++j0) {
            for (int i0 = 0; i0 < nx; ++i0) {
                const size_t idx = idx_ex(i0, j0, k0, ex);
                const double xp = X[i0], yp = Y[j0], zp = Z[k0];
                const double r1 = ((Porg[0]-xp)*(Porg[0]-xp) + (Porg[1]-yp)*(Porg[1]-yp) + (Porg[2]-zp)*(Porg[2]-zp)) * inv_BH_sep2;
                const double r2 = ((Porg[3]-xp)*(Porg[3]-xp) + (Porg[4]-yp)*(Porg[4]-yp) + (Porg[5]-zp)*(Porg[5]-zp)) * inv_BH_sep2;
            #if (GAUGE == 6)
                const double reta_val = A + C1 / (1.0 + w1 * r1) + C2 / (1.0 + w2 * r2);
            #else
                const double reta_val = A + C1 * exp(-w1 * r1) + C2 * exp(-w2 * r2);
            #endif
                betax_rhs[idx] = FF * dtSfx[idx];
                betay_rhs[idx] = FF * dtSfy[idx];
                betaz_rhs[idx] = FF * dtSfz[idx];
                dtSfx_rhs[idx] = Gamx_rhs[idx] - reta_val * dtSfx[idx];
                dtSfy_rhs[idx] = Gamy_rhs[idx] - reta_val * dtSfy[idx];
                dtSfz_rhs[idx] = Gamz_rhs[idx] - reta_val * dtSfz[idx];
            }}}
        }
        else
        {
            fprintf(stderr, "z4c_rhs_c: GAUGE %d requires BHN=2, got BHN=%d\n", (int)GAUGE, BHN);
            return 1;
        }
    }
 #else
 #error "z4c_rhs_c.C: unsupported GAUGE value"
 #endif
    lopsided(ex, X, Y, Z, gxx, gxx_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, gxy, gxy_rhs, betax, betay, betaz, Symmetry, AAS);
    lopsided(ex, X, Y, Z, gxz, gxz_rhs, betax, betay, betaz, Symmetry, ASA);
    lopsided(ex, X, Y, Z, gyy, gyy_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, gyz, gyz_rhs, betax, betay, betaz, Symmetry, SAA);
    lopsided(ex, X, Y, Z, gzz, gzz_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Axx, Axx_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Axy, Axy_rhs, betax, betay, betaz, Symmetry, AAS);
    lopsided(ex, X, Y, Z, Axz, Axz_rhs, betax, betay, betaz, Symmetry, ASA);
    lopsided(ex, X, Y, Z, Ayy, Ayy_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Ayz, Ayz_rhs, betax, betay, betaz, Symmetry, SAA);
    lopsided(ex, X, Y, Z, Azz, Azz_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, chi_state, chi_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, trK, trK_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Gamx, Gamx_rhs, betax, betay, betaz, Symmetry, ASS);
    lopsided(ex, X, Y, Z, Gamy, Gamy_rhs, betax, betay, betaz, Symmetry, SAS);
    lopsided(ex, X, Y, Z, Gamz, Gamz_rhs, betax, betay, betaz, Symmetry, SSA);
    lopsided(ex, X, Y, Z, Lap, Lap_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, betax, betax_rhs, betax, betay, betaz, Symmetry, ASS);
    lopsided(ex, X, Y, Z, betay, betay_rhs, betax, betay, betaz, Symmetry, SAS);
    lopsided(ex, X, Y, Z, betaz, betaz_rhs, betax, betay, betaz, Symmetry, SSA);
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
    lopsided(ex, X, Y, Z, dtSfx, dtSfx_rhs, betax, betay, betaz, Symmetry, ASS);
    lopsided(ex, X, Y, Z, dtSfy, dtSfy_rhs, betax, betay, betaz, Symmetry, SAS);
    lopsided(ex, X, Y, Z, dtSfz, dtSfz_rhs, betax, betay, betaz, Symmetry, SSA);
 #endif
    lopsided(ex, X, Y, Z, TZ, TZ_rhs, betax, betay, betaz, Symmetry, SSS);
    for (int idx = 0; idx < all; ++idx)
    {
        double Gamxa = 0.0, Gamya = 0.0, Gamza = 0.0;
        z4c_contract_gamma(
            gxx[idx], gxy[idx], gxz[idx], gyy[idx], gyz[idx], gzz[idx],
            gxxx[idx], gxyx[idx], gxzx[idx], gyyx[idx], gyzx[idx], gzzx[idx],
            gxxy[idx], gxyy[idx], gxzy[idx], gyyy[idx], gyzy[idx], gzzy[idx],
            gxxz[idx], gxyz[idx], gxzz[idx], gyyz[idx], gyzz[idx], gzzz[idx],
            Gamxa, Gamya, Gamza);
        TZ_rhs[idx] -= alpn1[idx] * (TWO + kappa2) * kappa1 * TZ[idx];
        trK_rhs[idx] += alpn1[idx] * kappa1 * (ONE - kappa2) * TZ[idx];
        Gamx_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamx[idx] - Gamxa);
        Gamy_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamy[idx] - Gamya);
        Gamz_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamz[idx] - Gamza);
    }
    if (eps > 0.0)
    {
        kodis(ex, X, Y, Z, chi_state, chi_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, trK, trK_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, gxx, gxx_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, gxy, gxy_rhs, AAS, Symmetry, eps);
        kodis(ex, X, Y, Z, gxz, gxz_rhs, ASA, Symmetry, eps);
        kodis(ex, X, Y, Z, gyy, gyy_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, gyz, gyz_rhs, SAA, Symmetry, eps);
        kodis(ex, X, Y, Z, gzz, gzz_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Axx, Axx_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Axy, Axy_rhs, AAS, Symmetry, eps);
        kodis(ex, X, Y, Z, Axz, Axz_rhs, ASA, Symmetry, eps);
        kodis(ex, X, Y, Z, Ayy, Ayy_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Ayz, Ayz_rhs, SAA, Symmetry, eps);
        kodis(ex, X, Y, Z, Azz, Azz_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Gamx, Gamx_rhs, ASS, Symmetry, eps);
        kodis(ex, X, Y, Z, Gamy, Gamy_rhs, SAS, Symmetry, eps);
        kodis(ex, X, Y, Z, Gamz, Gamz_rhs, SSA, Symmetry, eps);
        kodis(ex, X, Y, Z, Lap, Lap_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, betax, betax_rhs, ASS, Symmetry, eps);
        kodis(ex, X, Y, Z, betay, betay_rhs, SAS, Symmetry, eps);
        kodis(ex, X, Y, Z, betaz, betaz_rhs, SSA, Symmetry, eps);
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
        kodis(ex, X, Y, Z, dtSfx, dtSfx_rhs, ASS, Symmetry, eps);
        kodis(ex, X, Y, Z, dtSfy, dtSfy_rhs, SAS, Symmetry, eps);
        kodis(ex, X, Y, Z, dtSfz, dtSfz_rhs, SSA, Symmetry, eps);
 #endif
        kodis(ex, X, Y, Z, TZ, TZ_rhs, SSS, Symmetry, eps);
    }
    if (co == 0)
    {
 #if (ABV == 0)
        f_ricci_gamma(ex, X, Y, Z,
                      chi_constraints,
                      dxx, gxy, gxz, dyy, gyz, dzz,
                      Gamx, Gamy, Gamz,
                      Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                      Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                      Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                      Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                      Symmetry);
 #endif
        f_constraint_bssn(ex, X, Y, Z,
                          chi_constraints, trK,
                          dxx, gxy, gxz, dyy, gyz, dzz,
                          Axx, Axy, Axz, Ayy, Ayz, Azz,
                          Gamx, Gamy, Gamz,
                          Lap, betax, betay, betaz, rho, Sx, Sy, Sz,
                          Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                          Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                          Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                          Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                          Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
                          Symmetry);
    }
    return 0;
 }
 /*
  * f_compute_rhs_Z4c -- C-linkage wrapper around compute_rhs_z4c_cartesian.
  *
  * Every argument is forwarded unchanged. The only thing this wrapper adds is
  * that the single chi array is supplied for both chi arguments of the core
  * routine (its sixth and seventh parameters), so the same field is used
  * wherever the core distinguishes between the two.
  *
  * Arguments (all passed straight through):
  *   ex, T, X, Y, Z        grid extents, a scalar T and the coordinate arrays.
  *   chi ... TZ            evolved field arrays.
  *   *_rhs                 output arrays for the right-hand sides.
  *   rho, S*, Gam***, R**  source, connection and Ricci work arrays.
  *   Hcon ... Gmzcon       constraint output arrays.
  *   Symmetry, Lev         forwarded as-is.
  *   eps                   in the visible tail of the core, eps > 0 enables
  *                         the kodis() calls.
  *   co                    in the visible tail of the core, co == 0 triggers
  *                         the constraint evaluation.
  *
  * Returns the core routine's result: 0 at the end of its visible tail, 1 from
  * its GAUGE 6/7 branch when BHN != 2.
  * NOTE(review): the beginning of the core routine is not shown here; other
  * return codes may exist -- confirm.
  */
 extern "C" int f_compute_rhs_Z4c(int *ex, double &T,
                                 double *X, double *Y, double *Z,
                                 double *chi, double *trK,
                                 double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                                 double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                                 double *Gamx, double *Gamy, double *Gamz,
                                 double *Lap, double *betax, double *betay, double *betaz,
                                 double *dtSfx, double *dtSfy, double *dtSfz,
                                 double *TZ,
                                 double *chi_rhs, double *trK_rhs,
                                 double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                                 double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                                 double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                                 double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                                 double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                                 double *TZ_rhs,
                                 double *rho, double *Sx, double *Sy, double *Sz,
                                 double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                                 double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                                 double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                                 double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                                 double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                                 double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
                                 int &Symmetry, int &Lev, double &eps, int &co)
 {
    return compute_rhs_z4c_cartesian(
        ex, T, X, Y, Z,
        /* chi is passed twice: presumably the chi_state and chi_constraints
           slots of the core routine -- TODO confirm against its signature. */
        chi, chi, trK,
        dxx, gxy, gxz, dyy, gyz, dzz,
        Axx, Axy, Axz, Ayy, Ayz, Azz,
        Gamx, Gamy, Gamz,
        Lap, betax, betay, betaz,
        dtSfx, dtSfy, dtSfz,
        TZ,
        chi_rhs, trK_rhs,
        gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
        Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
        Gamx_rhs, Gamy_rhs, Gamz_rhs,
        Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
        dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
        TZ_rhs,
        rho, Sx, Sy, Sz,
        Sxx, Sxy, Sxz, Syy, Syz, Szz,
        Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
        Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
        Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
        Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
        Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
        Symmetry, Lev, eps, co);
 }
 /*
  * f_compute_rhs_Z4cnot -- C-linkage Z4c right-hand-side entry point that runs
  * the core routine on a private, lower-bounded copy of chi.
  *
  * Steps:
  *   1. Copy chi into a scratch vector and hand the copy to f_lowerboundset
  *      together with chitiny, so the caller's chi array is left untouched.
  *   2. Call compute_rhs_z4c_cartesian with the scratch copy as its sixth
  *      argument and the caller's chi as its seventh.
  *   3. If that call succeeded (returned 0) and co == 0, evaluate
  *      f_ricci_gamma (only when ABV == 0) and f_constraint_bssn on the
  *      caller's original chi.
  *
  * Returns the status of compute_rhs_z4c_cartesian.
  *
  * NOTE(review): the visible tail of compute_rhs_z4c_cartesian already makes
  * the same two calls when co == 0, using its chi_constraints argument. If that
  * argument is the raw chi passed in step 2, step 3 repeats identical work --
  * confirm against the core's signature before removing anything.
  */
 extern "C" int f_compute_rhs_Z4cnot(int *ex, double &T,
                                    double *X, double *Y, double *Z,
                                    double *chi, double *trK,
                                    double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                                    double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                                    double *Gamx, double *Gamy, double *Gamz,
                                    double *Lap, double *betax, double *betay, double *betaz,
                                    double *dtSfx, double *dtSfy, double *dtSfz,
                                    double *TZ,
                                    double *chi_rhs, double *trK_rhs,
                                    double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                                    double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                                    double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                                    double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                                    double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                                    double *TZ_rhs,
                                    double *rho, double *Sx, double *Sy, double *Sz,
                                    double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                                    double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                                    double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                                    double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                                    double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                                    double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
                                    int &Symmetry, int &Lev, double &eps, int &co, double &chitiny)
 {
    /* Number of chi entries to copy: product of the three extents in ex. */
    const int npts = ex[0] * ex[1] * ex[2];
    /* Scratch copy of chi; only this copy is handed to f_lowerboundset. */
    std::vector<double> chi_floored;
    chi_floored.assign(chi, chi + npts);
    /* presumably raises entries below chitiny up to chitiny -- TODO confirm */
    f_lowerboundset(ex, chi_floored.data(), chitiny);
    double *const chi_for_rhs = chi_floored.data();
    const int status = compute_rhs_z4c_cartesian(
        ex, T, X, Y, Z,
        chi_for_rhs, chi, trK,
        dxx, gxy, gxz, dyy, gyz, dzz,
        Axx, Axy, Axz, Ayy, Ayz, Azz,
        Gamx, Gamy, Gamz,
        Lap, betax, betay, betaz,
        dtSfx, dtSfy, dtSfz,
        TZ,
        chi_rhs, trK_rhs,
        gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
        Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
        Gamx_rhs, Gamy_rhs, Gamz_rhs,
        Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
        dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
        TZ_rhs,
        rho, Sx, Sy, Sz,
        Sxx, Sxy, Sxz, Syy, Syz, Szz,
        Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
        Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
        Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
        Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
        Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
        Symmetry, Lev, eps, co);
    /* The constraint pass below runs only after a successful RHS call with co == 0. */
    const bool rhs_succeeded = (status == 0);
    const bool constraints_requested = (co == 0);
    if (!(rhs_succeeded && constraints_requested))
    {
        return status;
    }
 #if (ABV == 0)
    /* Ricci/connection evaluation on the caller's original (unmodified) chi. */
    f_ricci_gamma(ex, X, Y, Z,
                  chi,
                  dxx, gxy, gxz, dyy, gyz, dzz,
                  Gamx, Gamy, Gamz,
                  Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                  Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                  Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                  Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                  Symmetry);
 #endif
    /* Constraint evaluation, also on the original chi. */
    f_constraint_bssn(ex, X, Y, Z,
                      chi, trK,
                      dxx, gxy, gxz, dyy, gyz, dzz,
                      Axx, Axy, Axz, Ayy, Ayz, Azz,
                      Gamx, Gamy, Gamz,
                      Lap, betax, betay, betaz, rho, Sx, Sy, Sz,
                      Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                      Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                      Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                      Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                      Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
                      Symmetry);
    return status;
 }
--- a/BSSN_BUILD_CONFIG_MIGRATION.md
+++ b/BSSN_BUILD_CONFIG_MIGRATION.md
@@ -1,211 +0,0 @@
 # BSSN Build Config Migration
 This note records the build-configuration fix needed when replacing
 `AMSS_NCKU_Input.py` or `generate_macrodef.py` with a newer upstream version.
 ## Problem
 `AMSS_NCKU_source/macrodef.h` is not the authoritative file used by normal
 runs. `AMSS_NCKU_Program.py` first generates macro files under
 `input_data.File_directory`, copies `AMSS_NCKU_source` to
 `<File_directory>/AMSS_NCKU_source_copy`, then copies the generated macro files
 into that copied source tree and compiles there.
 Therefore, makefile logic must not depend only on the stale
 `AMSS_NCKU_source/macrodef.h`. The actual equation path must be passed to the
 copied build tree from the same generation step that creates `macrodef.h`.
 The performance regression was caused by compiling/linking the
 `BSSN-EScalar` C wrapper into BSSN vacuum builds. For BSSN vacuum (`ABEtype=0`),
 the build must use:
 ```make
 BSSN_USE_TRANSFER_CACHE=1
 BSSN_USE_ESCALAR_C_KERNEL=0
 ```
 and must not link `bssn_escalar_rhs_c.o`.
 ## Required Migration Steps
 ### 1. Add an ABE type helper in `generate_macrodef.py`
 Add a helper that maps `input_data.Equation_Class` to the numeric `ABEtype`.
 Use the same mapping as `macrodef.h`:
 ```python
 def get_abe_type():
    if ( input_data.Equation_Class == "BSSN" ):
        return 0
    elif ( input_data.Equation_Class == "BSSN-EScalar" ):
        return 1
    elif ( input_data.Equation_Class == "BSSN-EM" ):
        return 3
    elif ( input_data.Equation_Class == "Z4C" ):
        return 2
    else:
        raise ValueError("Equation_Class setting error!!!")
 ```
 Update `generate_macrodef_h()` to print `#define ABEtype {get_abe_type()}`
 instead of duplicating the if/elif mapping.
 ### 2. Generate a makefile fragment
 In `generate_macrodef.py`, add:
 ```python
 def generate_build_config():
    file1 = open(os.path.join(input_data.File_directory, "AMSS_NCKU_build.mk"), "w")
    print("# Generated by generate_macrodef.py; do not edit manually.", file=file1)
    print(f"ABE_TYPE := {get_abe_type()}", file=file1)
    file1.close()
 ```
 This file is the build-time authority for the equation path.
 ### 3. Call and copy the generated build config
 In `AMSS_NCKU_Program.py`, after generating `macrodef.h` and `macrodef.fh`, call:
 ```python
 generate_macrodef.generate_build_config()
 print(" AMSS-NCKU build config AMSS_NCKU_build.mk has been generated. ")
 ```
 When copying generated files into `AMSS_NCKU_source_copy`, also copy:
 ```python
 build_config_path = os.path.join(File_directory, "AMSS_NCKU_build.mk")
 shutil.copy2(build_config_path, AMSS_NCKU_source_copy)
 ```
 ### 4. Make the source makefile consume the generated config
 At the top of `AMSS_NCKU_source/makefile`, after `include makefile.inc`, add:
 ```make
 -include AMSS_NCKU_build.mk
 ABE_TYPE ?= $(shell awk '/^[[:space:]]*\#define[[:space:]]+ABEtype/ {print $$3; exit}' macrodef.h 2>/dev/null)
 ```
 The generated `AMSS_NCKU_build.mk` is used during normal Python-driven builds.
 The fallback keeps manual source-tree builds usable.
 ### 5. Gate path-specific build options by `ABE_TYPE`
 Use effective build switches:
 ```make
 ifeq ($(USE_TRANSFER_CACHE),auto)
 ifeq ($(ABE_TYPE),0)
 EFFECTIVE_USE_TRANSFER_CACHE = 1
 else
 EFFECTIVE_USE_TRANSFER_CACHE = 0
 endif
 else
 EFFECTIVE_USE_TRANSFER_CACHE = $(USE_TRANSFER_CACHE)
 endif
 ifeq ($(USE_CXX_ESCALAR_KERNEL),1)
 ifeq ($(ABE_TYPE),1)
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 1
 else
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
 endif
 else
 EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
 endif
 TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
 ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
 ```
 Only add `bssn_escalar_rhs_c.o` when the effective EScalar C kernel switch is
 enabled:
 ```make
 ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
 CFILES += bssn_escalar_rhs_c.o
 endif
 ```
 ### 6. Use a safe transfer-cache default
 In `AMSS_NCKU_source/makefile.inc`, keep:
 ```make
 USE_TRANSFER_CACHE ?= auto
 ```
 With the effective switch logic above, this enables cached transfer for BSSN
 vacuum while keeping non-BSSN paths on the uncached path by default.
 ## Verification Checklist
 Run these checks after migrating:
 ```bash
 python3 -c "import generate_macrodef; generate_macrodef.generate_build_config()"
 cat GW150914/AMSS_NCKU_build.mk
 ```
 For BSSN, the generated file should contain:
 ```make
 ABE_TYPE := 0
 ```
 Dry-run the copied or source makefile:
 ```bash
 make -n -B INTERP_LB_MODE=off ABE | grep -E 'BSSN_USE_TRANSFER_CACHE|BSSN_USE_ESCALAR_C_KERNEL|bssn_escalar_rhs_c'
 ```
 Expected BSSN result:
 ```text
 -DBSSN_USE_TRANSFER_CACHE=1 -DBSSN_USE_ESCALAR_C_KERNEL=0
 ```
 and no `bssn_escalar_rhs_c.o` in the final link command.
 Run the full workflow:
 ```bash
 python3 AMSS_NCKU_Program.py
 ```
 For the 10-step BSSN test, compare coordinate output:
 ```bash
 python3 - <<'PY'
 from pathlib import Path
 old = Path('../GW150914-06457/AMSS_NCKU_output/bssn_BH.dat')
 new = Path('GW150914/AMSS_NCKU_output/bssn_BH.dat')
 def rows(path):
    out = []
    for line in path.read_text().splitlines():
        if not line.strip() or line.lstrip().startswith('#'):
            continue
        out.append([float(x) for x in line.split()])
    return out
 ro, rn = rows(old), rows(new)
 n = min(len(ro), len(rn))
 max_abs = 0.0
 for i in range(n):
    for a, b in zip(ro[i], rn[i]):
        max_abs = max(max_abs, abs(a - b))
 print(f"old_rows={len(ro)} new_rows={len(rn)} compared_rows={n}")
 print(f"max_abs_diff={max_abs:.17g}")
 PY
 ```
 For the validated migration, the first 10 rows matched exactly:
 ```text
 max_abs_diff=0
 ```
--- a/README.md
+++ b/README.md
@@ -97,7 +97,9 @@ Here, we take the Ubuntu 22.04 system as an example
    Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer.
-    The settings for the Ubuntu 22.04 system do not need to be modified.
+    The default configuration in this branch uses GNU compilers through the OpenMPI wrappers under `/usr/lib64/openmpi/bin`.
    If your OpenMPI installation is in another location, update `OMPI_BIN` in `AMSS_NCKU_source/makefile.inc` or export `AMSS_OPENMPI_BIN` before running the Python launcher.
 1.  Enter the AMSS-NCKU Python code folder and modify the input.
--- a/generate_macrodef.py
+++ b/generate_macrodef.py
@@ -12,37 +12,6 @@ import os
 import AMSS_NCKU_Input as input_data          ## import program input file
 ##################################################################
 def get_abe_type():
    """Return the numeric ABEtype code for ``input_data.Equation_Class``.

    Codes: "BSSN" -> 0, "BSSN-EScalar" -> 1, "BSSN-EM" -> 3, "Z4C" -> 2.

    Raises:
        ValueError: if ``Equation_Class`` matches none of the names above.
    """
    # Ordered (name, code) pairs, tested in the same order as before.
    abe_codes = (
        ( "BSSN",         0 ),
        ( "BSSN-EScalar", 1 ),
        ( "BSSN-EM",      3 ),
        ( "Z4C",          2 ),
    )
    for class_name, abe_code in abe_codes:
        if ( input_data.Equation_Class == class_name ):
            return abe_code
    raise ValueError("Equation_Class setting error!!!")
 ##################################################################
 ## Generate the makefile fragment used by the copied source tree.
 ## The source-tree macrodef.h is not authoritative because macro files
 ## are regenerated under File_directory for each run.
 def generate_build_config():
    """Write the makefile fragment ``AMSS_NCKU_build.mk``.

    The file is created (or overwritten) under ``input_data.File_directory``
    and contains a header comment followed by one ``ABE_TYPE := <n>`` line,
    where ``<n>`` is the value returned by ``get_abe_type()``.

    Raises:
        ValueError: propagated from ``get_abe_type()`` when
            ``input_data.Equation_Class`` is not recognised.
        OSError: if the output file cannot be opened for writing.
    """
    build_config_path = os.path.join(input_data.File_directory, "AMSS_NCKU_build.mk")
    # Use a context manager so the handle is closed even when get_abe_type()
    # raises part-way through writing.
    with open( build_config_path, "w" ) as file1:
        print( "# Generated by generate_macrodef.py; do not edit manually.", file=file1 )
        print( f"ABE_TYPE := {get_abe_type()}",                             file=file1 )
 ##################################################################
 ## Generate the macro file macrodef.h according to user settings
@@ -89,10 +58,19 @@ def generate_macrodef_h():
    # 2: Z4c vacuum
    # 3: coupled to Maxwell field
-    try:
+    if ( input_data.Equation_Class == "BSSN" ):
-        print( f"#define ABEtype {get_abe_type()}", file=file1 )
+        print( "#define ABEtype 0", file=file1 )
-        print(                                      file=file1 )
+        print(                      file=file1 )
-    except ValueError:
+    elif ( input_data.Equation_Class == "BSSN-EScalar" ):
        print( "#define ABEtype 1", file=file1 )
        print(                      file=file1 )
    elif ( input_data.Equation_Class == "BSSN-EM" ):
        print( "#define ABEtype 3", file=file1 )
        print(                      file=file1 )
    elif ( input_data.Equation_Class == "Z4C" ):
        print( "#define ABEtype 2", file=file1 )
        print(                      file=file1 )
    else:
        print( "Equation_Class setting error!!!"                )
        print()
        print( "# Equation type #define ABEtype setting error!!!", file=file1 )
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -9,6 +9,7 @@
 import AMSS_NCKU_Input as input_data
 import os
 import subprocess
 import time
@@ -52,6 +53,8 @@ NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
 ## Build parallelism: match the number of bound cores
 BUILD_JOBS = 64
 OPENMPI_BIN = os.environ.get("AMSS_OPENMPI_BIN", "/usr/lib64/openmpi/bin")
 MPI_RUNNER = os.path.join(OPENMPI_BIN, "mpirun")
 ##################################################################
@@ -147,11 +150,11 @@ def run_ABE():
    ## Define the command to run; cast other values to strings as needed
    if (input_data.GPU_Calculation == "no"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        mpi_command         = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABE"
        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command         = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABEGPU"
        mpi_command_outfile = "ABEGPU_out.log"
    ## Execute the MPI command and stream output