Case closed.

Refine GPU runtime controls and input checker
Fix CUDA AMR symmetry drift
2026-05-20 11:16:56 +08:00 · 2026-05-18 01:02:55 +08:00 · 2026-05-17 23:46:15 +08:00 · 2026-05-17 12:05:09 +08:00 · 2026-05-17 11:52:50 +08:00 · 2026-05-12 15:01:25 +08:00
228 changed files with 186990 additions and 171010 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 __pycache__
 GW150914
-GW150914-origin
+GW150914*
 docs
 *.tmp
-
+.codex
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -1,6 +0,0 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
 </project>
--- a/AMSS_NCKU_GPUCheck.py
+++ b/AMSS_NCKU_GPUCheck.py
@@ -0,0 +1,559 @@
 #!/usr/bin/env python3
 #
 # Current most stable GPU-branch baseline:
 #   GPU_Calculation="yes"
 #   Equation_Class="BSSN"
 #   Initial_Data_Method="Ansorg-TwoPuncture"
 #   puncture_data_set="Manually"
 #   basic_grid_set="Patch"
 #   grid_center_set="Cell"
 #   Symmetry="equatorial-symmetry"
 #   Time_Evolution_Method="runge-kutta-45"
 #   Finite_Diffenence_Method="4th-order"
 #   boundary_choice="BAM-choice"
 #   gauge_choice=0
 #   tetrad_type=2
 #   AHF_Find="no"
 #   devide_factor=2.0
 #   static_grid_type="Linear"
 #   moving_grid_type="Linear"
 #   AMSS_Z4C_MRBD=0
 #   Do not enable AMSS_CUDA_BH_INTERP_RESIDENT unless a dedicated
 #   CPU/GPU trajectory comparison has been run for that configuration.
 """
 Check whether AMSS_NCKU_Input.py is suitable for the current GPU branch.
 Usage:
  python3 AMSS_NCKU_GPUCheck.py
  python3 AMSS_NCKU_GPUCheck.py -f /path/to/AMSS_NCKU_Input.py
 """
 from __future__ import annotations
 import argparse
 import importlib.util
 import os
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Iterable, List, Sequence
 SUPPORTED_EQUATIONS = {"BSSN", "BSSN-EScalar", "BSSN-EM", "Z4C"}
 SUPPORTED_INITIAL_DATA = {
    "Ansorg-TwoPuncture",
    "Lousto-Analytical",
    "Cao-Analytical",
    "KerrSchild-Analytical",
 }
 SUPPORTED_SYMMETRIES = {
    "no-symmetry",
    "equatorial-symmetry",
    "octant-symmetry",
 }
 SUPPORTED_GRIDS = {"Patch", "Shell-Patch"}
 SUPPORTED_CENTERS = {"Cell", "Vertex"}
 SUPPORTED_FD = {"2nd-order", "4th-order", "6th-order", "8th-order"}
 SUPPORTED_GAUGES = {0, 1, 2, 3, 4, 5, 6, 7}
 SUPPORTED_TETRADS = {0, 1, 2}
 SUPPORTED_AHF = {"yes", "no"}
 SUPPORTED_BOUNDARIES = {"BAM-choice", "Shibata-choice"}
 SUPPORTED_PUNCTURE_DATA = {"Manually", "Automatically-BBH"}
 STABLE_BASELINE = {
    "GPU_Calculation": "yes",
    "Equation_Class": "BSSN",
    "Initial_Data_Method": "Ansorg-TwoPuncture",
    "puncture_data_set": "Manually",
    "basic_grid_set": "Patch",
    "grid_center_set": "Cell",
    "Symmetry": "equatorial-symmetry",
    "Time_Evolution_Method": "runge-kutta-45",
    "Finite_Diffenence_Method": "4th-order",
    "boundary_choice": "BAM-choice",
    "gauge_choice": 0,
    "tetrad_type": 2,
    "AHF_Find": "no",
    "devide_factor": 2.0,
    "static_grid_type": "Linear",
    "moving_grid_type": "Linear",
    "AMSS_Z4C_MRBD": 0,
 }
@dataclass
 class CheckResult:
    ok: bool = True
    warnings: List[str] = field(default_factory=list)
    risks: List[str] = field(default_factory=list)
    notes: List[str] = field(default_factory=list)
    def add_warning(self, msg: str) -> None:
        self.warnings.append(msg)
    def add_risk(self, msg: str) -> None:
        self.ok = False
        self.risks.append(msg)
    def add_note(self, msg: str) -> None:
        self.notes.append(msg)
    def extend_notes(self, messages: Iterable[str]) -> None:
        self.notes.extend(messages)
 def load_input_module(path: Path):
    spec = importlib.util.spec_from_file_location("amss_ncku_input", str(path))
    if spec is None or spec.loader is None:
        raise RuntimeError(f"cannot load input module from {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)  # type: ignore[union-attr]
    return module
 def get_attr(mod: Any, name: str, default: Any = None) -> Any:
    return getattr(mod, name, default)
 def as_text(value: Any) -> str:
    if isinstance(value, str):
        return value.strip()
    return str(value).strip()
 def as_lower_text(value: Any) -> str:
    return as_text(value).lower()
 def as_float(value: Any, default: float | None = None) -> float | None:
    try:
        return float(value)
    except (TypeError, ValueError):
        return default
 def as_int(value: Any, default: int | None = None) -> int | None:
    try:
        return int(value)
    except (TypeError, ValueError):
        return default
 def sequence_len(value: Any) -> int | None:
    try:
        return len(value)
    except TypeError:
        return None
 def sequence_values(value: Any) -> List[float] | None:
    try:
        return [float(v) for v in value]
    except (TypeError, ValueError):
        return None
 def approx_equal(a: Any, b: float, tol: float = 1.0e-12) -> bool:
    value = as_float(a)
    return value is not None and abs(value - b) <= tol
 def env_truthy(name: str) -> bool:
    value = os.environ.get(name)
    return value is not None and value.strip().lower() in {
        "1",
        "yes",
        "y",
        "true",
        "on",
        "enable",
        "enabled",
    }
 def stable_baseline_differences(mod: Any) -> List[str]:
    diffs = []
    for name, expected in STABLE_BASELINE.items():
        if not hasattr(mod, name):
            continue
        actual = get_attr(mod, name, None)
        if isinstance(expected, float):
            if not approx_equal(actual, expected):
                diffs.append(f"{name}={actual!r} (stable baseline: {expected!r})")
        elif actual != expected:
            diffs.append(f"{name}={actual!r} (stable baseline: {expected!r})")
    return diffs
 def add_membership_check(
    r: CheckResult,
    name: str,
    value: Any,
    supported: Sequence[Any] | set[Any],
    *,
    risk_message: str | None = None,
    note_message: str | None = None,
 ) -> None:
    if value not in supported:
        r.add_risk(risk_message or f"Unsupported {name}: {value!r}")
    elif note_message:
        r.add_note(note_message)
 def check_positive_int(r: CheckResult, name: str, value: Any) -> None:
    parsed = as_int(value)
    if parsed is None or parsed <= 0:
        r.add_risk(f"{name} must be a positive integer; got {value!r}")
 def check_nonnegative_number(r: CheckResult, name: str, value: Any) -> None:
    parsed = as_float(value)
    if parsed is None or parsed < 0.0:
        r.add_risk(f"{name} must be a non-negative number; got {value!r}")
 def check_grid_geometry(r: CheckResult, mod: Any, grid: str) -> None:
    grid_level = as_int(get_attr(mod, "grid_level", None))
    static_grid_level = as_int(get_attr(mod, "static_grid_level", None))
    moving_grid_level = as_int(get_attr(mod, "moving_grid_level", None))
    refinement_level = as_int(get_attr(mod, "refinement_level", None))
    analysis_level = as_int(get_attr(mod, "analysis_level", 0))
    for name in (
        "grid_level",
        "static_grid_level",
        "moving_grid_level",
        "static_grid_number",
        "moving_grid_number",
        "quarter_sphere_number",
    ):
        check_positive_int(r, name, get_attr(mod, name, None))
    if grid_level is not None and static_grid_level is not None:
        if static_grid_level > grid_level:
            r.add_risk("static_grid_level cannot exceed grid_level.")
        if moving_grid_level is not None and moving_grid_level != grid_level - static_grid_level:
            r.add_risk(
                "moving_grid_level should equal grid_level - static_grid_level; "
                f"got {moving_grid_level}, expected {grid_level - static_grid_level}."
            )
    if grid_level is not None:
        if refinement_level is None or refinement_level < 0 or refinement_level > grid_level:
            r.add_risk(f"refinement_level must be in [0, grid_level]; got {refinement_level!r}")
        if analysis_level is None or analysis_level < 0 or analysis_level >= grid_level:
            r.add_risk(f"analysis_level must be in [0, grid_level); got {analysis_level!r}")
    largest_max = sequence_values(get_attr(mod, "largest_box_xyz_max", None))
    largest_min = sequence_values(get_attr(mod, "largest_box_xyz_min", None))
    if largest_max is None or len(largest_max) != 3:
        r.add_risk("largest_box_xyz_max must contain three numeric values.")
    elif any(v <= 0.0 for v in largest_max):
        r.add_risk(f"largest_box_xyz_max values must be positive; got {largest_max!r}")
    if largest_min is None or len(largest_min) != 3:
        r.add_risk("largest_box_xyz_min must contain three numeric values.")
    elif largest_max is not None and len(largest_max) == 3:
        for idx, (lo, hi) in enumerate(zip(largest_min, largest_max)):
            if lo >= hi:
                r.add_risk(
                    f"largest_box_xyz_min[{idx}] must be smaller than largest_box_xyz_max[{idx}]."
                )
    if grid == "Shell-Patch" and largest_max is not None and len(largest_max) == 3:
        if max(largest_max) - min(largest_max) > 1.0e-12:
            r.add_risk("Shell-Patch requires a cubic largest_box_xyz_max.")
    if not approx_equal(get_attr(mod, "devide_factor", None), 2.0):
        r.add_risk("devide_factor must remain 2.0; the AMR code documents only this ratio as supported.")
    if as_text(get_attr(mod, "static_grid_type", "")) != "Linear":
        r.add_risk("static_grid_type must remain 'Linear'.")
    if as_text(get_attr(mod, "moving_grid_type", "")) != "Linear":
        r.add_risk("moving_grid_type must remain 'Linear'.")
    shell_shape = sequence_values(get_attr(mod, "shell_grid_number", None))
    if grid == "Shell-Patch":
        if shell_shape is None or len(shell_shape) != 3:
            r.add_risk("Shell-Patch requires shell_grid_number with three numeric values.")
        elif any(int(v) <= 0 for v in shell_shape):
            r.add_risk(f"shell_grid_number values must be positive; got {shell_shape!r}")
 def check_punctures(r: CheckResult, mod: Any, init: str, puncture_data: str) -> None:
    puncture_number = as_int(get_attr(mod, "puncture_number", None))
    if puncture_number is None or puncture_number <= 0:
        r.add_risk(f"puncture_number must be a positive integer; got {puncture_number!r}")
        return
    if init == "Ansorg-TwoPuncture" and puncture_number != 2:
        r.add_warning(
            "Ansorg-TwoPuncture is validated on the GPU branch mainly for puncture_number=2."
        )
    if puncture_data == "Automatically-BBH":
        r.add_risk("puncture_data_set='Automatically-BBH' is documented as still developing.")
    for name in ("position_BH", "parameter_BH", "dimensionless_spin_BH", "momentum_BH"):
        value = get_attr(mod, name, None)
        outer = sequence_len(value)
        if outer != puncture_number:
            r.add_risk(f"{name} must have puncture_number rows; got {outer!r}.")
            continue
        for idx in range(puncture_number):
            if sequence_len(value[idx]) != 3:
                r.add_risk(f"{name}[{idx}] must contain three values.")
                break
    if init == "Ansorg-TwoPuncture":
        for name in ("parameter_BH", "position_BH", "momentum_BH"):
            if get_attr(mod, name, None) is None:
                r.add_risk(f"Ansorg-TwoPuncture requires {name}.")
 def check_output_and_time(r: CheckResult, mod: Any) -> None:
    for name in (
        "Final_Evolution_Time",
        "Check_Time",
        "Dump_Time",
        "D2_Dump_Time",
        "Analysis_Time",
        "Courant_Factor",
        "Dissipation",
    ):
        check_nonnegative_number(r, name, get_attr(mod, name, None))
    check_positive_int(r, "Evolution_Step_Number", get_attr(mod, "Evolution_Step_Number", None))
    start_time = as_float(get_attr(mod, "Start_Evolution_Time", None))
    final_time = as_float(get_attr(mod, "Final_Evolution_Time", None))
    if start_time is None:
        r.add_risk("Start_Evolution_Time must be numeric.")
    elif final_time is not None and final_time <= start_time:
        r.add_risk("Final_Evolution_Time must be greater than Start_Evolution_Time.")
    for name in ("GW_L_max", "GW_M_max", "Detector_Number"):
        check_positive_int(r, name, get_attr(mod, name, None))
    detector_min = as_float(get_attr(mod, "Detector_Rmin", None))
    detector_max = as_float(get_attr(mod, "Detector_Rmax", None))
    if detector_min is None or detector_min <= 0.0:
        r.add_risk(f"Detector_Rmin must be positive; got {detector_min!r}")
    if detector_max is None or detector_max <= 0.0:
        r.add_risk(f"Detector_Rmax must be positive; got {detector_max!r}")
    if detector_min is not None and detector_max is not None and detector_max <= detector_min:
        r.add_risk("Detector_Rmax must be greater than Detector_Rmin.")
 def check_equation_specific(r: CheckResult, mod: Any, eq: str, grid: str, fd: str) -> None:
    if eq == "BSSN":
        r.add_note("Equation_Class=BSSN is the current validated GPU baseline.")
    elif eq == "BSSN-EScalar":
        r.add_warning("BSSN-EScalar has a CUDA path, but it is less broadly validated than BSSN.")
        fr_choice = as_int(get_attr(mod, "FR_Choice", None))
        if fr_choice not in {1, 2, 3, 4, 5}:
            r.add_risk(f"FR_Choice must be one of 1..5 for BSSN-EScalar; got {fr_choice!r}")
        if approx_equal(get_attr(mod, "FR_a2", None), 0.0):
            r.add_risk("CUDA BSSN-EScalar requires nonzero FR_a2.")
        elif not approx_equal(get_attr(mod, "FR_a2", None), 3.0):
            r.add_warning("CUDA BSSN-EScalar now passes FR_a2 to the kernel, but non-3.0 values need CPU/GPU regression.")
        for name in ("FR_l2", "FR_phi0", "FR_r0", "FR_sigma0"):
            check_nonnegative_number(r, name, get_attr(mod, name, None))
    elif eq == "BSSN-EM":
        r.add_warning(
            "BSSN-EM is accepted by the build, but this checker cannot certify its physics/output "
            "without a CPU/GPU regression run."
        )
        if fd == "8th-order":
            r.add_note("BSSN-EM with 8th-order enables extra CUDA AMR batching defaults.")
    elif eq == "Z4C":
        r.add_warning(
            "Z4C has CUDA support, but the resident path and Shell/CPBC combinations are more constrained."
        )
        if grid == "Patch":
            r.add_warning("Z4C+Patch avoids Shell CPBC, but still needs a dedicated regression test.")
        else:
            r.add_warning("Z4C+Shell-Patch uses CPBC/Shell logic and is not the stable BSSN baseline.")
 def check_runtime_environment(r: CheckResult, mod: Any, eq: str, grid: str, fd: str) -> None:
    if env_truthy("AMSS_CUDA_BH_INTERP_RESIDENT"):
        r.add_risk(
            "AMSS_CUDA_BH_INTERP_RESIDENT is enabled in the environment; this option previously caused "
            "late-time trajectory drift and should stay off unless explicitly revalidated."
        )
    else:
        r.add_note("AMSS_CUDA_BH_INTERP_RESIDENT is not enabled; this matches the fixed stable default.")
    if eq in {"BSSN", "BSSN-EScalar", "Z4C"}:
        r.add_note("makefile_and_run.py will default AMSS_CUDA_AMR_RESTRICT_DEVICE=1 for this equation.")
    if fd in {"2nd-order", "8th-order"}:
        r.add_warning(
            f"{fd} disables some interpolation/CUDA-aware MPI fast paths by default; validate performance and output."
        )
    if grid == "Shell-Patch":
        r.add_warning(
            "Shell-Patch changes runtime defaults and MPI process handling; use at least the script-adjusted 4 MPI ranks."
        )
    z4c_mrbd = as_int(get_attr(mod, "AMSS_Z4C_MRBD", 0), 0)
    if z4c_mrbd not in {0, 1, 2}:
        r.add_risk(f"AMSS_Z4C_MRBD must be 0, 1, or 2; got {z4c_mrbd!r}")
    elif eq == "Z4C" and z4c_mrbd == 2:
        r.add_risk("Z4C GPU resident path does not support AMSS_Z4C_MRBD=2.")
    elif eq == "Z4C" and z4c_mrbd in {0, 1}:
        r.add_note(f"Z4C will build with AMSS_Z4C_MRBD={z4c_mrbd}.")
 def check_stable_profile(r: CheckResult, mod: Any) -> None:
    diffs = stable_baseline_differences(mod)
    if not diffs:
        r.add_note("This input matches the documented most stable GPU baseline.")
        return
    r.add_warning(
        "This input differs from the documented most stable GPU baseline: " + "; ".join(diffs)
    )
 def check_input(mod: Any) -> CheckResult:
    r = CheckResult()
    gpu_text = as_lower_text(get_attr(mod, "GPU_Calculation", "no"))
    gpu = gpu_text == "yes"
    eq = as_text(get_attr(mod, "Equation_Class", ""))
    init = as_text(get_attr(mod, "Initial_Data_Method", ""))
    symmetry = as_text(get_attr(mod, "Symmetry", ""))
    time_method = as_text(get_attr(mod, "Time_Evolution_Method", ""))
    grid = as_text(get_attr(mod, "basic_grid_set", ""))
    center = as_text(get_attr(mod, "grid_center_set", ""))
    fd = as_text(get_attr(mod, "Finite_Diffenence_Method", ""))
    gauge = get_attr(mod, "gauge_choice", None)
    tetrad = get_attr(mod, "tetrad_type", None)
    ahf = as_text(get_attr(mod, "AHF_Find", "no")).lower()
    boundary = as_text(get_attr(mod, "boundary_choice", ""))
    puncture_data = as_text(get_attr(mod, "puncture_data_set", ""))
    cpu_part = get_attr(mod, "CPU_Part", None)
    gpu_part = get_attr(mod, "GPU_Part", None)
    if gpu_text not in {"yes", "no"}:
        r.add_risk(f"GPU_Calculation must be 'yes' or 'no'; got {get_attr(mod, 'GPU_Calculation', None)!r}")
    if not gpu:
        r.add_note("GPU_Calculation=no; this check only targets the GPU branch.")
        return r
    r.add_note("GPU_Calculation=yes detected.")
    add_membership_check(r, "Equation_Class", eq, SUPPORTED_EQUATIONS)
    add_membership_check(r, "Symmetry", symmetry, SUPPORTED_SYMMETRIES)
    add_membership_check(r, "Initial_Data_Method", init, SUPPORTED_INITIAL_DATA)
    add_membership_check(r, "basic_grid_set", grid, SUPPORTED_GRIDS)
    add_membership_check(r, "grid_center_set", center, SUPPORTED_CENTERS)
    add_membership_check(r, "Finite_Diffenence_Method", fd, SUPPORTED_FD)
    add_membership_check(r, "gauge_choice", gauge, SUPPORTED_GAUGES)
    add_membership_check(r, "tetrad_type", tetrad, SUPPORTED_TETRADS)
    add_membership_check(r, "AHF_Find", ahf, SUPPORTED_AHF)
    add_membership_check(r, "boundary_choice", boundary, SUPPORTED_BOUNDARIES)
    add_membership_check(r, "puncture_data_set", puncture_data, SUPPORTED_PUNCTURE_DATA)
    if init != "Ansorg-TwoPuncture":
        r.add_risk(
            f"Initial_Data_Method={init!r} is not validated as safe on this GPU branch; "
            "the stable path is Ansorg-TwoPuncture."
        )
    else:
        r.add_note("Initial_Data_Method=Ansorg-TwoPuncture is supported.")
    if time_method != "runge-kutta-45":
        r.add_risk(f"Only Time_Evolution_Method='runge-kutta-45' is supported; got {time_method!r}.")
    if grid == "Patch":
        r.add_note("basic_grid_set=Patch is the current stable GPU grid path.")
    elif grid == "Shell-Patch":
        r.add_warning("basic_grid_set=Shell-Patch has GPU support but is outside the stable BSSN baseline.")
    if center == "Vertex":
        r.add_warning("grid_center_set=Vertex is compiled by macros, but the stable GPU baseline is Cell.")
    if symmetry != "equatorial-symmetry":
        r.add_warning("The stable validation case uses equatorial-symmetry; other symmetries need regression tests.")
    if fd != "4th-order":
        r.add_warning("The stable validation case uses 4th-order finite differences.")
    if gauge not in {0, 1}:
        r.add_warning("Input comments recommend gauge_choice 0 or 1; other gauges need dedicated validation.")
    if tetrad != 2:
        r.add_warning("Input comments recommend tetrad_type=2; other tetrads affect wave extraction conventions.")
    if ahf == "yes":
        r.add_warning("AHF_Find=yes is supported by macros, but it is outside the current stable GPU baseline.")
    if boundary == "Shibata-choice":
        r.add_risk("Shibata-choice is not faithfully distinguished in the current macro generator; it maps to the BAM branch.")
    elif boundary == "BAM-choice":
        r.add_note("boundary_choice=BAM-choice is supported.")
    if cpu_part is not None or gpu_part is not None:
        r.add_warning("CPU_Part/GPU_Part are printed and propagated, but they do not control a real mixed CPU/GPU split in this branch.")
    check_output_and_time(r, mod)
    check_grid_geometry(r, mod, grid)
    check_punctures(r, mod, init, puncture_data)
    check_equation_specific(r, mod, eq, grid, fd)
    check_runtime_environment(r, mod, eq, grid, fd)
    check_stable_profile(r, mod)
    return r
 def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f",
        "--file",
        "--input",
        dest="input_file",
        default="AMSS_NCKU_Input.py",
        help="path to AMSS_NCKU_Input.py",
    )
    args = parser.parse_args()
    path = Path(args.input_file).resolve()
    if not path.exists():
        print(f"ERROR: input file not found: {path}")
        return 2
    try:
        mod = load_input_module(path)
    except Exception as exc:
        print(f"ERROR: failed to load input file: {exc}")
        return 2
    result = check_input(mod)
    print(f"Input: {path}")
    print(f"GPU_Calculation: {get_attr(mod, 'GPU_Calculation', 'no')}")
    print(f"Symmetry: {get_attr(mod, 'Symmetry', '')}")
    print(f"Equation_Class: {get_attr(mod, 'Equation_Class', '')}")
    print(f"Initial_Data_Method: {get_attr(mod, 'Initial_Data_Method', '')}")
    print(f"puncture_data_set: {get_attr(mod, 'puncture_data_set', '')}")
    print(f"basic_grid_set: {get_attr(mod, 'basic_grid_set', '')}")
    print(f"grid_center_set: {get_attr(mod, 'grid_center_set', '')}")
    print(f"Finite_Diffenence_Method: {get_attr(mod, 'Finite_Diffenence_Method', '')}")
    print(f"gauge_choice: {get_attr(mod, 'gauge_choice', '')}")
    print(f"tetrad_type: {get_attr(mod, 'tetrad_type', '')}")
    print(f"boundary_choice: {get_attr(mod, 'boundary_choice', '')}")
    print(f"AHF_Find: {get_attr(mod, 'AHF_Find', '')}")
    print(f"AMSS_Z4C_MRBD: {get_attr(mod, 'AMSS_Z4C_MRBD', 0)}")
    print("")
    for msg in result.notes:
        print(f"NOTE: {msg}")
    for msg in result.warnings:
        print(f"WARNING: {msg}")
    for msg in result.risks:
        print(f"RISK: {msg}")
    print("")
    if result.risks:
        print("Verdict: review the risks above before running.")
        return 1
    if result.warnings:
        print("Verdict: runnable on the current GPU branch, but keep the warnings in mind.")
        return 0
    print("Verdict: OK to run on the current GPU branch.")
    return 0
 if __name__ == "__main__":
    raise SystemExit(main())
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -13,16 +13,32 @@ import numpy
 ## Setting MPI processes and the output file directory
-File_directory   = "GW150914"                    ## output file directory
+File_directory   = "case3"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 64                             ## number of mpi processes used in the simulation
+MPI_processes    = 2                             ## number of mpi processes used in the simulation
-GPU_Calculation  = "no"                          ## Use GPU or not 
+GPU_Calculation  = "yes"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
 CPU_Part         = 1.0
 GPU_Part         = 0.0
 ## Aggressive runtime overrides for fastest low-accuracy GPU runs.
 AMSS_EVOLVE_TIMING                 = 0
 AMSS_ANALYSIS_MAP_EVERY            = 1000000000
 AMSS_INTERP_FAST                   = 1
 AMSS_INTERP_GPU                    = 1
 AMSS_CUDA_AWARE_MPI                = 1
 AMSS_CUDA_RESIDENT_SYNC            = 1
 AMSS_CUDA_BSSN_RESIDENT_SYNC       = 1
 AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP = 1
 AMSS_CUDA_KEEP_ALL_LEVELS          = 1
 AMSS_CUDA_AMR_RESTRICT_DEVICE      = 1
 AMSS_CUDA_AMR_RESTRICT_BATCH       = 1
 AMSS_CUDA_DEVICE_SEGMENT_BATCH     = 1
 AMSS_CUDA_UNCACHED_DEVICE_BUFFERS  = 1
 AMSS_CUDA_AMR_HOST_STAGED          = 1
 #################################################
@@ -45,13 +61,13 @@ Finite_Diffenence_Method = "4th-order"             ## finite-difference method:
 ## Setting the time evolutionary information
 Start_Evolution_Time     = 0.0                    ## start evolution time t0
-Final_Evolution_Time     = 1000.0                 ## final evolution time t1
+Final_Evolution_Time     = 200.0                 ## final evolution time t1
-Check_Time               = 100.0
+Check_Time               = 1000000000.0
-Dump_Time                = 100.0                  ## time inteval dT for dumping binary data
+Dump_Time                = 1000000000.0           ## time inteval dT for dumping binary data
-D2_Dump_Time             = 100.0                  ## dump the ascii data for 2d surface after dT'
+D2_Dump_Time             = 1000000000.0           ## dump the ascii data for 2d surface after dT'
-Analysis_Time            = 0.1                    ## dump the puncture position and GW psi4 after dT"
+Analysis_Time            = 1000000000.0           ## dump the puncture position and GW psi4 after dT"
-Evolution_Step_Number    = 10000000               ## stop the calculation after the maximal step number
+Evolution_Step_Number    = 1000000                ## stop the calculation after the maximal step number
-Courant_Factor           = 0.5                    ## Courant Factor
+Courant_Factor           = 0.8                    ## Courant Factor
 Dissipation              = 0.15                   ## Kreiss-Oliger Dissipation Strength
 #################################################
@@ -64,21 +80,21 @@ Dissipation              = 0.15                   ## Kreiss-Oliger Dissipation S
 basic_grid_set    = "Patch"                          ## grid structure: choose "Patch" or "Shell-Patch"
 grid_center_set   = "Cell"                           ## grid center: chose "Cell" or "Vertex"
-grid_level        = 9                                ## total number of AMR grid levels
+grid_level        = 7                                ## total number of AMR grid levels
-static_grid_level = 5                                ## number of AMR static grid levels
+static_grid_level = 4                                ## number of AMR static grid levels
 moving_grid_level = grid_level - static_grid_level   ## number of AMR moving grid levels
 analysis_level    = 0
-refinement_level  = 3                                ## time refinement start from this grid level
+refinement_level  = 2                                ## time refinement start from this grid level
 largest_box_xyz_max = [320.0, 320.0, 320.0]          ## scale of the largest box
                                                     ## not ne cess ary to be cubic for "Patch" grid s tructure
                                                     ## need to be a cubic box for "Shell-Patch" grid structure
 largest_box_xyz_min = - numpy.array(largest_box_xyz_max)  
-static_grid_number = 96                              ## grid points of each static AMR grid (in x direction)
+static_grid_number = 64                               ## grid points of each static AMR grid (in x direction)
                                                     ## (grid points in y and z directions are automatically adjusted)
-moving_grid_number = 48                              ## grid points of each moving AMR grid
+moving_grid_number = 32                               ## grid points of each moving AMR grid
 shell_grid_number  = [32, 32, 100]                   ## grid points of Shell-Patch grid
                                                     ## in (phi, theta, r) direction
 devide_factor      = 2.0                             ## resolution between different grid levels dh0/dh1, only support 2.0 now
@@ -87,7 +103,7 @@ devide_factor      = 2.0                             ## resolution between diffe
 static_grid_type   = 'Linear'                        ## AMR static grid structure , only supports "Linear"
 moving_grid_type   = 'Linear'                        ## AMR moving grid structure , only supports "Linear"
-quarter_sphere_number = 96                           ## grid number of 1/4 s pher ical surface
+quarter_sphere_number = 16                           ## grid number of 1/4 s pher ical surface
                                                     ## (which is needed for evaluating the spherical surface integral)
 #################################################
@@ -110,15 +126,15 @@ puncture_data_set     = "Manually"                       ## Method to give Punct
 ## initial orbital distance and ellipticity for BBHs system
 ## ( needed for "Automatically-BBH" case , not affect the "Manually" case )
-Distance = 10.0
+Distance = 12.0
 e0       = 0.0
 ## black hole parameter (M Q* a*)
-parameter_BH[0] = [ 36.0/(36.0+29.0),  0.0,  +0.31 ]   
+parameter_BH[0] = [ 0.5,  0.0,  0.0 ]   
-parameter_BH[1] = [ 29.0/(36.0+29.0),  0.0,  -0.46 ]  
+parameter_BH[1] = [ 0.5,  0.0,  0.0 ]  
 ## dimensionless spin in each direction
-dimensionless_spin_BH[0] = [ 0.0,  0.0,  +0.31 ]   
+dimensionless_spin_BH[0] = [ 0.0,  0.0,  0.0 ]   
-dimensionless_spin_BH[1] = [ 0.0,  0.0,  -0.46 ]  
+dimensionless_spin_BH[1] = [ 0.0,  0.0,  0.0 ]  
 ## use Brugmann's convention
 ##  -----0-----> y
@@ -129,13 +145,13 @@ dimensionless_spin_BH[1] = [ 0.0,  0.0,  -0.46 ]
 ## If puncture_data_set is chosen to be "Manually", it is necessary to set the position and momentum of each puncture manually
 ## initial position for each puncture
-position_BH[0]  = [  0.0,  10.0*29.0/(36.0+29.0), 0.0 ]  
+position_BH[0]  = [  0.0,  6.0, 0.0 ]  
-position_BH[1]  = [  0.0, -10.0*36.0/(36.0+29.0), 0.0 ] 
+position_BH[1]  = [  0.0, -6.0, 0.0 ] 
 ## initial mumentum for each puncture
 ## (needed for "Manually" case, does not affect the "Automatically-BBH" case)
-momentum_BH[0]  = [ -0.09530152296974252,  -0.00084541526517121,   0.0 ]
+momentum_BH[0]  = [ -0.06,  -0.01,   0.0 ]
-momentum_BH[1]  = [ +0.09530152296974252,  +0.00084541526517121,   0.0 ]
+momentum_BH[1]  = [ +0.06,  +0.01,   0.0 ]
 #################################################
@@ -145,11 +161,11 @@ momentum_BH[1]  = [ +0.09530152296974252,  +0.00084541526517121,   0.0 ]
 ## Setting the gravitational wave information
-GW_L_max        = 4                      ## maximal L number in gravitational wave
+GW_L_max        = 2                      ## maximal L number in gravitational wave
-GW_M_max        = 4                      ## maximal M number in gravitational wave
+GW_M_max        = 2                      ## maximal M number in gravitational wave
-Detector_Number = 12                     ## number of dector
+Detector_Number = 2                     ## number of dector
 Detector_Rmin   = 50.0                   ## nearest dector distance
-Detector_Rmax   = 160.0                  ## farest dector distance
+Detector_Rmax   = 100.0                  ## farest dector distance
 #################################################
@@ -160,8 +176,8 @@ Detector_Rmax   = 160.0                  ## farest dector distance
 AHF_Find       = "no"                    ## whether to find the apparent horizon: choose "yes" or "no"
-AHF_Find_Every = 24
+AHF_Find_Every = 1000000000
-AHF_Dump_Time  = 20.0
+AHF_Dump_Time  = 1000000000.0
 #################################################
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -58,31 +58,36 @@ File_directory = os.path.join(input_data.File_directory)
 ## If the specified output directory exists, ask the user whether to continue
 if os.path.exists(File_directory):
-    print( " Output dictionary has been existed !!!  "                                                              )
+    auto_overwrite = str(getattr(input_data, "Auto_Overwrite_Output", "yes")).strip().lower()
-    print( " If you want to overwrite the existing file directory, please input 'continue' in the terminal !! "     ) 
+    if auto_overwrite in ("1", "yes", "y", "true", "on", "continue"):
-    print( " If you want to retain the existing file directory, please input 'stop' in the terminal to stop the "   ) 
+        print( " Output dictionary has been existed; Auto_Overwrite_Output=yes, continue the calculation. " )
-    print( " simulation. Then you can reset the output dictionary in the input script file AMSS_NCKU_Input.py !!! " )
+        print(                                                                                              )
-    print(                                                                                                          )
+    else:
-    ## Prompt whether to overwrite the existing directory
+        print( " Output dictionary has been existed !!!  "                                                              )
-    while True:
+        print( " If you want to overwrite the existing file directory, please input 'continue' in the terminal !! "     ) 
-        try:
+        print( " If you want to retain the existing file directory, please input 'stop' in the terminal to stop the "   ) 
-            inputvalue = input()
+        print( " simulation. Then you can reset the output dictionary in the input script file AMSS_NCKU_Input.py !!! " )
-            ## If the user agrees to overwrite, proceed and remove the existing directory
+        print(                                                                                                          )
-            if ( inputvalue == "continue" ):
+        ## Prompt whether to overwrite the existing directory
-                print( " Continue the calculation !!! " )
+        while True:
-                print(                                  )
+            try:
-                break  
+                inputvalue = input()
-            ## If the user chooses not to overwrite, exit and keep the existing directory
+                ## If the user agrees to overwrite, proceed and remove the existing directory
-            elif ( inputvalue == "stop" ):
+                if ( inputvalue == "continue" ):
-                print( " Stop the calculation !!! "    )
+                    print( " Continue the calculation !!! " )
-                sys.exit() 
+                    print(                                  )
-            ## If the user input is invalid, prompt again
+                    break  
-            else:
+                ## If the user chooses not to overwrite, exit and keep the existing directory
                elif ( inputvalue == "stop" ):
                    print( " Stop the calculation !!! "    )
                    sys.exit() 
                ## If the user input is invalid, prompt again
                else:
                    print( " Please input your choice !!! "                   )
                    print( " Input 'continue' or 'stop' in the terminal !!! " )
            except ValueError:
                print( " Please input your choice !!! "                   )
                print( " Input 'continue' or 'stop' in the terminal !!! " )
        except ValueError:
            print( " Please input your choice !!! "                   )
            print( " Input 'continue' or 'stop' in the terminal !!! " )
 ## Remove the existing output directory if present
 shutil.rmtree(File_directory, ignore_errors=True)
@@ -126,6 +131,11 @@ setup.generate_AMSSNCKU_input()
 #inputvalue = input()  ## Wait for user input (press Enter) to proceed
 #print()
 setup.print_puncture_information()
 ##################################################################
 ## Generate AMSS-NCKU program input files based on the configured parameters
 print(                                                                                   )
@@ -253,7 +263,7 @@ print()
 if (input_data.GPU_Calculation == "no"):
    ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE")
 elif (input_data.GPU_Calculation == "yes"):
-    ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABEGPU")
+    ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE_CUDA")
 if not os.path.exists( ABE_file ):
    print(                                                                                                  )
@@ -307,7 +317,7 @@ if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
    import generate_TwoPuncture_input
-    generate_TwoPuncture_input.generate_AMSSNCKU_TwoPuncture_input(numerical_grid.puncture_data)
+    generate_TwoPuncture_input.generate_AMSSNCKU_TwoPuncture_input()
    print(                                                                                              )
    print( " The input parfile for the TwoPunctureABE executable has been generated. " )
@@ -349,7 +359,7 @@ if (input_data.Initial_Data_Method == "Ansorg-TwoPuncture" ):
 import renew_puncture_parameter
-renew_puncture_parameter.append_AMSSNCKU_BSSN_input(File_directory, output_directory, numerical_grid.puncture_data)
+renew_puncture_parameter.append_AMSSNCKU_BSSN_input(File_directory, output_directory)
 ## Generated AMSS-NCKU input filename
--- a/AMSS_NCKU_Program_Plot.py
+++ b/AMSS_NCKU_Program_Plot.py
@@ -0,0 +1,100 @@
 ##################################################################
 ##
 ## AMSS-NCKU Plot-Only Restart Script
 ## Author: Xiaoqu / Claude
 ## 2026/05/12
 ##
 ## This script checks for existing output data from AMSS_NCKU_Program.py.
 ## If data exists, it skips all computation and goes directly to plotting,
 ## saving time when plotting was interrupted.
 ## If no data is found, it exits with a message.
 ##
 ##################################################################
 ## Guard against re-execution by multiprocessing child processes.
 if __name__ != '__main__':
    import sys as _sys
    _sys.exit(0)
 import os
 import sys
 import AMSS_NCKU_Input as input_data
 ##################################################################
 ## Construct paths from input configuration
 File_directory         = os.path.join(input_data.File_directory)
 output_directory       = os.path.join(File_directory, "AMSS_NCKU_output")
 binary_results_directory = os.path.join(output_directory, input_data.Output_directory)
 figure_directory       = os.path.join(File_directory, "figure")
 ##################################################################
 ## Check whether the required output data files exist
 required_files = [
    os.path.join(binary_results_directory, "bssn_BH.dat"),
    os.path.join(binary_results_directory, "bssn_ADMQs.dat"),
    os.path.join(binary_results_directory, "bssn_psi4.dat"),
    os.path.join(binary_results_directory, "bssn_constraint.dat"),
 ]
 missing_files = [f for f in required_files if not os.path.exists(f)]
 if missing_files:
    print(" No existing AMSS_NCKU_Program.py output data found. ")
    print(" The following required files are missing: ")
    for f in missing_files:
        print(f"   {f}")
    print()
    print(" Please run AMSS_NCKU_Program.py first to generate the simulation data. ")
    print(" Exiting. ")
    sys.exit(1)
 print(" Found existing AMSS_NCKU_Program.py output data. "          )
 print(" Skipping all computation and going directly to plotting. " )
 print()
 ## Ensure the figure directory exists (it should, but be safe)
 os.makedirs(figure_directory, exist_ok=True)
 ##################################################################
 ## Plot the AMSS-NCKU program results
 import plot_xiaoqu
 import plot_GW_strain_amplitude_xiaoqu
 from parallel_plot_helper import run_plot_tasks_parallel
 plot_tasks = []
 ## Plot black hole trajectory
 plot_tasks.append((plot_xiaoqu.generate_puncture_orbit_plot,   (binary_results_directory, figure_directory)))
 plot_tasks.append((plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory)))
 ## Plot black hole separation vs. time
 plot_tasks.append((plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory)))
 ## Plot gravitational waveforms (psi4 and strain amplitude)
 for i in range(input_data.Detector_Number):
    plot_tasks.append((plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i)))
    plot_tasks.append((plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i)))
 ## Plot ADM mass evolution
 for i in range(input_data.Detector_Number):
    plot_tasks.append((plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i)))
 ## Plot Hamiltonian constraint violation over time
 for i in range(input_data.grid_level):
    plot_tasks.append((plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i)))
 run_plot_tasks_parallel(plot_tasks)
 ## Plot stored binary data (runs serially, not in the parallel pool)
 plot_xiaoqu.generate_binary_data_plot(binary_results_directory, figure_directory)
 print()
 print(" Plotting completed successfully. ")
 print()
--- a/AMSS_NCKU_source/ABE.C
+++ b/AMSS_NCKU_source/ABE.C
@@ -198,16 +198,16 @@ int main(int argc, char *argv[])
      if (myrank == 0)
      {
            string out_dir;
-            char filename[50];
+            string filename;
            map<string, string>::iterator iter;
            iter = parameters::str_par.find("output dir");
            if (iter != parameters::str_par.end())
            {
                  out_dir = iter->second;
            }
-            sprintf(filename, "%s/setting.par", out_dir.c_str());
+            filename = out_dir + "/setting.par";
            ofstream setfile;
-            setfile.open(filename, ios::trunc);
+            setfile.open(filename.c_str(), ios::trunc);
            if (!setfile.good())
            {
@@ -484,7 +484,11 @@ int main(int argc, char *argv[])
            cout << endl;
      }
-      delete ADM;
+      // Let the process teardown reclaim the simulation object.  Some derived
      // equation classes keep MPI/CUDA-backed state whose destructor ordering
      // is fragile at program shutdown.
      if (getenv("AMSS_DELETE_ADM_ON_EXIT"))
            delete ADM;
      //=======================caculation done=============================================================
--- a/AMSS_NCKU_source/Initial_Data_Solver/Ansorg.C
+++ b/AMSS_NCKU_source/Initial_Data_Solver/Ansorg.C
--- a/AMSS_NCKU_source/Initial_Data_Solver/Ansorg.h
+++ b/AMSS_NCKU_source/Initial_Data_Solver/Ansorg.h
--- a/AMSS_NCKU_source/AHF_Direct/BH_diagnostics.C
+++ b/AMSS_NCKU_source/AHF_Direct/BH_diagnostics.C
--- a/AMSS_NCKU_source/AHF_Direct/BH_diagnostics.h
+++ b/AMSS_NCKU_source/AHF_Direct/BH_diagnostics.h
--- a/AMSS_NCKU_source/BSSN_GPU/bssn_gpu.cu
+++ b/AMSS_NCKU_source/BSSN_GPU/bssn_gpu.cu
--- a/AMSS_NCKU_source/BSSN_GPU/bssn_gpu_class.C
+++ b/AMSS_NCKU_source/BSSN_GPU/bssn_gpu_class.C
--- a/AMSS_NCKU_source/BSSN_GPU/bssn_gpu_class.h
+++ b/AMSS_NCKU_source/BSSN_GPU/bssn_gpu_class.h
@@ -1,210 +0,0 @@
 #ifndef BSSN_GPU_CLASS_H
 #define BSSN_GPU_CLASS_H
 #ifdef newc
 #include <iostream>
 #include <iomanip>
 #include <fstream>
 #include <cstdlib>
 #include <string>
 #include <cmath>
 using namespace std;
 #else
 #include <iostream.h>
 #include <iomanip.h>
 #include <fstream.h>
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
 #endif
 #include <mpi.h>
 #include "macrodef.h"
 #include "cgh.h"
 #include "ShellPatch.h"
 #include "misc.h"
 #include "var.h"
 #include "MyList.h"
 #include "monitor.h"
 #include "surface_integral.h"
 #include "checkpoint.h"
 // added by yangquan
 #include "bssn_macro.h"
 extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN);
 class bssn_class
 {
 public:
       // added by yangquan
       //----------------------
       int gpu_num_mynode;
       int cpu_core_num_mynode;
       int mpi_process_num_mynode;
       int my_sequence_mynode;
       int mynode_id;
       int use_gpu;
       virtual void Step_GPU(int lev, int YN);
       virtual void Get_runtime_envirment();
       // virtual void Step_OPENMP(int lev,int YN);
       //----------------------
       int ngfs;
       int nprocs, myrank;
       cgh *GH;
       ShellPatch *SH;
       double PhysTime;
       int checkrun;
       char checkfilename[50];
       int Steps;
       double StartTime, TotalTime;
       double AnasTime, DumpTime, d2DumpTime, CheckTime;
       double LastAnas, LastConsOut;
       double Courant;
       double numepss, numepsb, numepsh;
       int Symmetry;
       int maxl, decn;
       double maxrex, drex;
       int trfls, a_lev;
       double dT;
       double chitiny;
       double **Porg0, **Porgbr, **Porg, **Porg1, **Porg_rhs;
       int BH_num, BH_num_input;
       double *Mass, *Pmom, *Spin;
       double ADMMass;
       var *phio, *trKo;
       var *gxxo, *gxyo, *gxzo, *gyyo, *gyzo, *gzzo;
       var *Axxo, *Axyo, *Axzo, *Ayyo, *Ayzo, *Azzo;
       var *Gmxo, *Gmyo, *Gmzo;
       var *Lapo, *Sfxo, *Sfyo, *Sfzo;
       var *dtSfxo, *dtSfyo, *dtSfzo;
       var *phi0, *trK0;
       var *gxx0, *gxy0, *gxz0, *gyy0, *gyz0, *gzz0;
       var *Axx0, *Axy0, *Axz0, *Ayy0, *Ayz0, *Azz0;
       var *Gmx0, *Gmy0, *Gmz0;
       var *Lap0, *Sfx0, *Sfy0, *Sfz0;
       var *dtSfx0, *dtSfy0, *dtSfz0;
       var *phi, *trK;
       var *gxx, *gxy, *gxz, *gyy, *gyz, *gzz;
       var *Axx, *Axy, *Axz, *Ayy, *Ayz, *Azz;
       var *Gmx, *Gmy, *Gmz;
       var *Lap, *Sfx, *Sfy, *Sfz;
       var *dtSfx, *dtSfy, *dtSfz;
       var *phi1, *trK1;
       var *gxx1, *gxy1, *gxz1, *gyy1, *gyz1, *gzz1;
       var *Axx1, *Axy1, *Axz1, *Ayy1, *Ayz1, *Azz1;
       var *Gmx1, *Gmy1, *Gmz1;
       var *Lap1, *Sfx1, *Sfy1, *Sfz1;
       var *dtSfx1, *dtSfy1, *dtSfz1;
       var *phi_rhs, *trK_rhs;
       var *gxx_rhs, *gxy_rhs, *gxz_rhs, *gyy_rhs, *gyz_rhs, *gzz_rhs;
       var *Axx_rhs, *Axy_rhs, *Axz_rhs, *Ayy_rhs, *Ayz_rhs, *Azz_rhs;
       var *Gmx_rhs, *Gmy_rhs, *Gmz_rhs;
       var *Lap_rhs, *Sfx_rhs, *Sfy_rhs, *Sfz_rhs;
       var *dtSfx_rhs, *dtSfy_rhs, *dtSfz_rhs;
       var *rho, *Sx, *Sy, *Sz, *Sxx, *Sxy, *Sxz, *Syy, *Syz, *Szz;
       var *Gamxxx, *Gamxxy, *Gamxxz, *Gamxyy, *Gamxyz, *Gamxzz;
       var *Gamyxx, *Gamyxy, *Gamyxz, *Gamyyy, *Gamyyz, *Gamyzz;
       var *Gamzxx, *Gamzxy, *Gamzxz, *Gamzyy, *Gamzyz, *Gamzzz;
       var *Rxx, *Rxy, *Rxz, *Ryy, *Ryz, *Rzz;
       var *Rpsi4, *Ipsi4;
       var *t1Rpsi4, *t1Ipsi4, *t2Rpsi4, *t2Ipsi4;
       var *Cons_Ham, *Cons_Px, *Cons_Py, *Cons_Pz, *Cons_Gx, *Cons_Gy, *Cons_Gz;
 #ifdef Point_Psi4
       var *phix, *phiy, *phiz;
       var *trKx, *trKy, *trKz;
       var *Axxx, *Axxy, *Axxz;
       var *Axyx, *Axyy, *Axyz;
       var *Axzx, *Axzy, *Axzz;
       var *Ayyx, *Ayyy, *Ayyz;
       var *Ayzx, *Ayzy, *Ayzz;
       var *Azzx, *Azzy, *Azzz;
 #endif
       // FIXME: uc = StateList, up = OldStateList, upp = SynchList_cor; so never touch these three data
       MyList<var> *StateList, *SynchList_pre, *SynchList_cor, *RHSList;
       MyList<var> *OldStateList, *DumpList;
       MyList<var> *ConstraintList;
       monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
       monitor *ConVMonitor;
       surface_integral *Waveshell;
       checkpoint *CheckPoint;
 public:
       bssn_class(double Couranti, double StartTimei, double TotalTimei, double DumpTimei, double d2DumpTimei, double CheckTimei, double AnasTimei,
                  int Symmetryi, int checkruni, char *checkfilenamei, double numepssi, double numepsbi, double numepshi,
                  int a_levi, int maxli, int decni, double maxrexi, double drexi);
       ~bssn_class();
       void Evolve(int Steps);
       void RecursiveStep(int lev);
 #if (PSTR == 1)
       void ParallelStep();
       void SHStep();
 #endif
       void RestrictProlong(int lev, int YN, bool BB, MyList<var> *SL, MyList<var> *OL, MyList<var> *corL);
       void RestrictProlong_aux(int lev, int YN, bool BB, MyList<var> *SL, MyList<var> *OL, MyList<var> *corL);
       void RestrictProlong(int lev, int YN, bool BB);
       void ProlongRestrict(int lev, int YN, bool BB);
       void Setup_Black_Hole_position();
       void compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, var *fory, var *forz, int lev);
       bool read_Pablo_file(int *ext, double *datain, char *filename);
       void write_Pablo_file(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax,
                             char *filename);
       void AnalysisStuff(int lev, double dT_lev);
       void Setup_KerrSchild();
       void Enforce_algcon(int lev, int fg);
       void testRestrict();
       void testOutBd();
       virtual void Setup_Initial_Data_Lousto();
       virtual void Setup_Initial_Data_Cao();
       virtual void Initialize();
       virtual void Read_Ansorg();
       virtual void Read_Pablo() {};
       virtual void Compute_Psi4(int lev);
       virtual void Step(int lev, int YN);
       virtual void Interp_Constraint(bool infg);
       virtual void Constraint_Out();
       virtual void Compute_Constraint();
 #ifdef With_AHF
 protected:
       MyList<var> *AHList, *AHDList, *GaugeList;
       int AHfindevery;
       double AHdumptime;
       int *lastahdumpid, HN_num; // number of possible horizons
       int *findeveryl;
       double *xc, *yc, *zc, *xr, *yr, *zr;
       bool *trigger;
       double *dTT;
       int *dumpid;
 public:
       void AH_Prepare_derivatives();
       bool AH_Interp_Points(MyList<var> *VarList,
                             int NN, double **XX,
                             double *Shellf, int Symmetryi);
       void AH_Step_Find(int lev, double dT_lev);
 #endif
 };
 #endif /* BSSN_GPU_CLASS_H */
--- a/AMSS_NCKU_source/BSSN_GPU/bssn_step_gpu.C
+++ b/AMSS_NCKU_source/BSSN_GPU/bssn_step_gpu.C
--- a/AMSS_NCKU_source/cgh/Block.C
+++ b/AMSS_NCKU_source/cgh/Block.C
@@ -12,7 +12,61 @@ using namespace std;
 #include "Block.h"
 #include "misc.h"
-Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
+#if USE_CUDA_BSSN || USE_CUDA_Z4C
 #include <cuda_runtime_api.h>
 #endif
 namespace {
 bool cuda_pin_gridfuncs_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_PIN_GRIDFUNCS");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 double *alloc_gridfunc(size_t count, unsigned char &pinned)
 {
  pinned = 0;
 #if USE_CUDA_BSSN || USE_CUDA_Z4C
  if (cuda_pin_gridfuncs_enabled())
  {
    double *ptr = 0;
    cudaError_t err = cudaMallocHost((void **)&ptr, count * sizeof(double));
    if (err == cudaSuccess)
    {
      pinned = 1;
      return ptr;
    }
    cudaGetLastError();
  }
 #endif
  return (double *)malloc(sizeof(double) * count);
 }
 void free_gridfunc(double *ptr, unsigned char pinned)
 {
  if (!ptr)
    return;
 #if USE_CUDA_BSSN || USE_CUDA_Z4C
  if (pinned)
  {
    cudaFreeHost(ptr);
    return;
  }
 #else
  (void)pinned;
 #endif
  free(ptr);
 }
 }
 Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), lev(levi), cgpu(cgpui), ingfs(ingfsi), fngfs(fngfsi), igfs(0), fgfs(0), fgfs_pinned(0)
 {
  for (int i = 0; i < dim; i++)
    X[i] = 0;
@@ -70,9 +124,10 @@ Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fng
    int nn = shape[0] * shape[1] * shape[2];
    fgfs = new double *[fngfs];
    fgfs_pinned = new unsigned char[fngfs];
    for (int i = 0; i < fngfs; i++)
    {
-      fgfs[i] = (double *)malloc(sizeof(double) * nn);
+      fgfs[i] = alloc_gridfunc((size_t)nn, fgfs_pinned[i]);
      if (!(fgfs[i]))
      {
        cout << "on node#" << rank << ", out of memory when constructing Block." << endl;
@@ -107,11 +162,13 @@ Block::~Block()
      free(igfs[i]);
    delete[] igfs;
    for (int i = 0; i < fngfs; i++)
-      free(fgfs[i]);
+      free_gridfunc(fgfs[i], fgfs_pinned ? fgfs_pinned[i] : 0);
    delete[] fgfs;
    delete[] fgfs_pinned;
    X[0] = X[1] = X[2] = 0;
    igfs = 0;
    fgfs = 0;
    fgfs_pinned = 0;
  }
 }
 void Block::checkBlock()
@@ -187,6 +244,8 @@ void Block::swapList(MyList<var> *VarList1, MyList<var> *VarList2, int myrank)
    while (varl1 && varl2)
    {
      misc::swap<double *>(fgfs[varl1->data->sgfn], fgfs[varl2->data->sgfn]);
      if (fgfs_pinned)
        misc::swap<unsigned char>(fgfs_pinned[varl1->data->sgfn], fgfs_pinned[varl2->data->sgfn]);
      varl1 = varl1->next;
      varl2 = varl2->next;
    }
--- a/AMSS_NCKU_source/cgh/Block.h
+++ b/AMSS_NCKU_source/cgh/Block.h
@@ -18,9 +18,10 @@ public:
   int ingfs, fngfs;
   int *(*igfs);
   double *(*fgfs);
   unsigned char *fgfs_pinned;
 public:
-   Block() {};
+   Block() : rank(0), lev(0), cgpu(0), ingfs(0), fngfs(0), igfs(0), fgfs(0), fgfs_pinned(0) {};
   Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfs, int levi, const int cgpui = 0);
   ~Block();
--- a/AMSS_NCKU_source/Read_and_Write/DataCT.C
+++ b/AMSS_NCKU_source/Read_and_Write/DataCT.C
--- a/AMSS_NCKU_source/AHF_Direct/FFT.f90
+++ b/AMSS_NCKU_source/AHF_Direct/FFT.f90
--- a/AMSS_NCKU_source/AHF_Direct/IntPnts.C
+++ b/AMSS_NCKU_source/AHF_Direct/IntPnts.C
--- a/AMSS_NCKU_source/AHF_Direct/IntPnts0.C
+++ b/AMSS_NCKU_source/AHF_Direct/IntPnts0.C
--- a/AMSS_NCKU_source/AHF_Direct/Jacobian.C
+++ b/AMSS_NCKU_source/AHF_Direct/Jacobian.C
--- a/AMSS_NCKU_source/AHF_Direct/Jacobian.h
+++ b/AMSS_NCKU_source/AHF_Direct/Jacobian.h
--- a/AMSS_NCKU_source/Patch/MPatch.C
+++ b/AMSS_NCKU_source/Patch/MPatch.C
@@ -14,6 +14,9 @@ using namespace std;
 #include "MPatch.h"
 #include "Parallel.h"
 #include "fmisc.h"
 #if USE_CUDA_BSSN
 #include "bssn_rhs_cuda.h"
 #endif
 #ifdef INTERP_LB_PROFILE
 #include "interp_lb_profile.h"
 #endif
@@ -178,6 +181,444 @@ int find_block_index_for_point(const BlockBinIndex &index, const double *pox, co
  return -1;
 }
 inline int fortran_idint_local(double x)
 {
  return int(x);
 }
 bool interp_fast_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_INTERP_FAST");
    enabled = (!env || atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool interp_gpu_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_INTERP_GPU");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool interp_fast_compare_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_INTERP_FAST_COMPARE");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 double interp_fast_compare_tol()
 {
  static double tol = -1.0;
  if (tol < 0.0)
  {
    const char *env = getenv("AMSS_INTERP_FAST_COMPARE_TOL");
    tol = (env && atof(env) > 0.0) ? atof(env) : 1.0e-11;
  }
  return tol;
 }
 long long interp_fast_compare_limit()
 {
  static long long limit = -1;
  if (limit < 0)
  {
    const char *env = getenv("AMSS_INTERP_FAST_COMPARE_LIMIT");
    limit = (env && atoll(env) > 0) ? atoll(env) : 4096;
  }
  return limit;
 }
 struct FastInterpStencil
 {
  int cxB[dim];
  double cx[dim];
  double wx[8];
  double wy[8];
  double wz[8];
  int nsamples;
  int loc[512];
  unsigned char sign_mask[512];
  double weight[512];
 };
 inline void lagrange_unit_weights(double x, int ordn, double *w)
 {
  for (int i = 0; i < ordn; i++)
  {
    double num = 1.0;
    double den = 1.0;
    for (int j = 0; j < ordn; j++)
    {
      if (j == i)
        continue;
      num *= (x - double(j));
      den *= double(i - j);
    }
    w[i] = num / den;
  }
 }
 inline void z_unit_weights(double x, int ordn, double *w)
 {
  if (ordn == 6)
  {
    static const double c_uniform[6] = {-1.0, 5.0, -10.0, 10.0, -5.0, 1.0};
    for (int i = 0; i < 6; i++)
    {
      if (x == double(i))
      {
        for (int j = 0; j < 6; j++)
          w[j] = (j == i) ? 1.0 : 0.0;
        return;
      }
    }
    double den = 0.0;
    for (int i = 0; i < 6; i++)
    {
      w[i] = c_uniform[i] / (x - double(i));
      den += w[i];
    }
    for (int i = 0; i < 6; i++)
      w[i] /= den;
    return;
  }
  lagrange_unit_weights(x, ordn, w);
 }
 inline bool fast_interp_map_index(int idx, int extent, int d,
                                  int &mapped, unsigned char &mask)
 {
  if (idx > 0)
    mapped = idx;
  else
  {
    mask |= (unsigned char)(1u << d);
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
    mapped = 2 - idx;
 #else
 #ifdef Cell
    mapped = 1 - idx;
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
  }
  return mapped >= 1 && mapped <= extent;
 }
 bool prepare_fast_interp_stencil(Block *BP, const double *pox, int ordn,
                                 int Symmetry, FastInterpStencil &st)
 {
  if (!BP || ordn <= 0 || ordn > 8)
    return false;
  st.nsamples = 0;
  const int NO_SYMM = 0;
  const int OCTANT = 2;
  int cmin[dim], cmax[dim], cxT[dim];
  for (int d = 0; d < dim; d++)
  {
    const double *X = BP->X[d];
    const double dX = X[1] - X[0];
    const int cxI = fortran_idint_local((pox[d] - X[0]) / dX + 0.4) + 1;
    st.cxB[d] = cxI - ordn / 2 + 1;
    cxT[d] = st.cxB[d] + ordn - 1;
    cmin[d] = 1;
    cmax[d] = BP->shape[d];
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
    if (Symmetry == OCTANT && d < 2 && fabs(X[0]) < dX)
      cmin[d] = -ordn / 2 + 2;
    if (Symmetry != NO_SYMM && d == 2 && fabs(X[0]) < dX)
      cmin[d] = -ordn / 2 + 2;
 #else
 #ifdef Cell
    if (Symmetry == OCTANT && d < 2 && fabs(X[0]) < dX)
      cmin[d] = -ordn / 2 + 1;
    if (Symmetry != NO_SYMM && d == 2 && fabs(X[0]) < dX)
      cmin[d] = -ordn / 2 + 1;
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
    if (st.cxB[d] < cmin[d])
    {
      st.cxB[d] = cmin[d];
      cxT[d] = st.cxB[d] + ordn - 1;
    }
    if (cxT[d] > cmax[d])
    {
      cxT[d] = cmax[d];
      st.cxB[d] = cxT[d] + 1 - ordn;
    }
    if (st.cxB[d] > 0)
      st.cx[d] = (pox[d] - X[st.cxB[d] - 1]) / dX;
    else
    {
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
      st.cx[d] = (pox[d] + X[1 - st.cxB[d]]) / dX;
 #else
 #ifdef Cell
      st.cx[d] = (pox[d] + X[-st.cxB[d]]) / dX;
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
    }
  }
  lagrange_unit_weights(st.cx[0], ordn, st.wx);
  lagrange_unit_weights(st.cx[1], ordn, st.wy);
  z_unit_weights(st.cx[2], ordn, st.wz);
  for (int kk = 0; kk < ordn; kk++)
  {
    for (int jj = 0; jj < ordn; jj++)
    {
      for (int ii = 0; ii < ordn; ii++)
      {
        unsigned char mask = 0;
        int ix, iy, iz;
        if (!fast_interp_map_index(st.cxB[0] + ii, BP->shape[0], 0, ix, mask) ||
            !fast_interp_map_index(st.cxB[1] + jj, BP->shape[1], 1, iy, mask) ||
            !fast_interp_map_index(st.cxB[2] + kk, BP->shape[2], 2, iz, mask))
          return false;
        const int s = st.nsamples++;
        st.loc[s] = (ix - 1) + (iy - 1) * BP->shape[0] +
                    (iz - 1) * BP->shape[0] * BP->shape[1];
        st.sign_mask[s] = mask;
        st.weight[s] = st.wx[ii] * st.wy[jj] * st.wz[kk];
      }
    }
  }
  return true;
 }
 bool interpolate_var_list_with_stencil(Block *BP, MyList<var> *VarList,
                                       int num_var, const double *pox,
                                       int ordn, int Symmetry,
                                       const FastInterpStencil &st,
                                       double *out)
 {
  if (num_var <= 0 || num_var > 128)
    return false;
  double *data_ptrs[128];
  double *soa_ptrs[128];
  var *vars[128];
  MyList<var> *varl = VarList;
  int k = 0;
  while (varl)
  {
    if (k >= num_var)
      return false;
    vars[k] = varl->data;
    data_ptrs[k] = BP->fgfs[vars[k]->sgfn];
    soa_ptrs[k] = vars[k]->SoA;
    out[k] = 0.0;
    varl = varl->next;
    k++;
  }
  if (k != num_var)
    return false;
  for (int s = 0; s < st.nsamples; s++)
  {
    const int loc = st.loc[s];
    const double w = st.weight[s];
    const unsigned char mask = st.sign_mask[s];
    if (mask == 0)
    {
      for (int v = 0; v < num_var; v++)
        out[v] += w * data_ptrs[v][loc];
    }
    else
    {
      for (int v = 0; v < num_var; v++)
      {
        const double *SoA = soa_ptrs[v];
        double sgn = 1.0;
        if (mask & 1u)
          sgn *= SoA[0];
        if (mask & 2u)
          sgn *= SoA[1];
        if (mask & 4u)
          sgn *= SoA[2];
        out[v] += w * sgn * data_ptrs[v][loc];
      }
    }
  }
  if (interp_fast_compare_enabled())
  {
    static int report_count = 0;
    static long long compare_calls = 0;
    if (compare_calls++ >= interp_fast_compare_limit())
      return true;
    const double tol = interp_fast_compare_tol();
    varl = VarList;
    k = 0;
    while (varl)
    {
      var *vp = vars[k];
      double ref = 0.0;
      double x = pox[0], y = pox[1], z = pox[2];
      f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2],
                      BP->fgfs[vp->sgfn], ref,
                      x, y, z, ordn, vp->SoA, Symmetry);
      const double diff = fabs(ref - out[k]);
      const double scale = 1.0 + fabs(ref);
      if (diff > tol * scale && report_count < 32)
      {
        int rank = 0;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        fprintf(stderr,
                "[AMSS-INTERP-CMP][rank %d] var=%s diff=%.17e ref=%.17e fast=%.17e p=(%.17e,%.17e,%.17e)\n",
                rank, vp->name, diff, ref, out[k], pox[0], pox[1], pox[2]);
        report_count++;
      }
      varl = varl->next;
      k++;
    }
  }
  return true;
 }
 bool interpolate_var_list_fast(Block *BP, MyList<var> *VarList, int num_var,
                               const double *pox, int ordn, int Symmetry,
                               double *out)
 {
  if (!interp_fast_enabled())
    return false;
  FastInterpStencil st;
  if (!prepare_fast_interp_stencil(BP, pox, ordn, Symmetry, st))
    return false;
  return interpolate_var_list_with_stencil(BP, VarList, num_var, pox,
                                           ordn, Symmetry, st, out);
 }
 struct CachedInterpPoint
 {
  Block *bp;
  int owner_rank;
  FastInterpStencil stencil;
 };
 struct SurfaceInterpCache
 {
  Patch *patch;
  int NN;
  int symmetry;
  double key[9];
  vector<CachedInterpPoint> points;
  SurfaceInterpCache() : patch(0), NN(0), symmetry(-1) {}
 };
 bool surface_cache_key_matches(const SurfaceInterpCache &cache, Patch *patch,
                               int NN, double **XX, int Symmetry)
 {
  if (cache.patch != patch || cache.NN != NN || cache.symmetry != Symmetry ||
      int(cache.points.size()) != NN || NN <= 0)
    return false;
  const int mid = NN / 2;
  const int last = NN - 1;
  const int ids[3] = {0, mid, last};
  int p = 0;
  for (int q = 0; q < 3; q++)
    for (int d = 0; d < dim; d++)
      if (cache.key[p++] != XX[d][ids[q]])
        return false;
  return true;
 }
 SurfaceInterpCache *find_surface_cache(Patch *patch, int NN, double **XX,
                                       int Symmetry)
 {
  static vector<SurfaceInterpCache> caches;
  for (size_t i = 0; i < caches.size(); i++)
    if (surface_cache_key_matches(caches[i], patch, NN, XX, Symmetry))
      return &caches[i];
  if (caches.size() >= 24)
    caches.erase(caches.begin());
  caches.push_back(SurfaceInterpCache());
  return &caches.back();
 }
 bool build_surface_cache(SurfaceInterpCache &cache, Patch *patch, int NN,
                         double **XX, int Symmetry, const double *DH,
                         const BlockBinIndex &block_index, int ordn)
 {
  int myrank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  cache.patch = patch;
  cache.NN = NN;
  cache.symmetry = Symmetry;
  cache.points.clear();
  cache.points.resize(NN);
  const int mid = NN / 2;
  const int last = NN - 1;
  const int ids[3] = {0, mid, last};
  int p = 0;
  for (int q = 0; q < 3; q++)
    for (int d = 0; d < dim; d++)
      cache.key[p++] = XX[d][ids[q]];
  for (int j = 0; j < NN; j++)
  {
    double pox[dim];
    for (int d = 0; d < dim; d++)
      pox[d] = XX[d][j];
    const int block_i = find_block_index_for_point(block_index, pox, DH);
    if (block_i < 0)
    {
      cache.points[j].bp = 0;
      cache.points[j].owner_rank = -1;
      continue;
    }
    Block *BP = block_index.views[block_i].bp;
    cache.points[j].bp = BP;
    cache.points[j].owner_rank = BP->rank;
    cache.points[j].stencil.nsamples = 0;
    if (BP->rank == myrank)
    {
      if (!prepare_fast_interp_stencil(BP, pox, ordn, Symmetry,
                                       cache.points[j].stencil))
        return false;
    }
  }
  return true;
 }
 } // namespace
 Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
@@ -565,14 +1006,18 @@ void Patch::Interp_Points(MyList<var> *VarList,
      if (myrank == BP->rank)
      {
        //---> interpolation
-        varl = VarList;
+        if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
-        int k = 0;
+                                       Symmetry, Shellf + j * num_var))
        while (varl) // run along variables
        {
-          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+          varl = VarList;
-                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+          int k = 0;
-          varl = varl->next;
+          while (varl) // run along variables
-          k++;
+          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
      }
    }
@@ -659,8 +1104,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
    varl = varl->next;
  }
  memset(Shellf, 0, sizeof(double) * NN * num_var);
  // owner_rank[j] records which MPI rank owns point j
  int *owner_rank;
  owner_rank = new int[NN];
@@ -672,8 +1115,113 @@ void Patch::Interp_Points(MyList<var> *VarList,
    DH[i] = getdX(i);
  BlockBinIndex block_index;
  build_block_bin_index(this, DH, block_index);
  SurfaceInterpCache *surface_cache = 0;
  bool use_surface_cache = false;
  if (interp_fast_enabled())
  {
    surface_cache = find_surface_cache(this, NN, XX, Symmetry);
    use_surface_cache = surface_cache_key_matches(*surface_cache, this, NN, XX, Symmetry);
    if (!use_surface_cache)
      use_surface_cache = build_surface_cache(*surface_cache, this, NN, XX,
                                              Symmetry, DH, block_index, ordn);
  }
  // --- Interpolation phase (identical to original) ---
 #if USE_CUDA_BSSN
  const bool use_gpu_interp = interp_gpu_enabled() && use_surface_cache && num_var == 2 &&
                              VarList && VarList->next && !VarList->next->next;
 #else
  const bool use_gpu_interp = false;
 #endif
  if (use_gpu_interp)
  {
 #if USE_CUDA_BSSN
    vector<vector<int> > local_points(block_index.views.size());
    for (int j = 0; j < NN; j++)
    {
      for (int i = 0; i < dim; i++)
      {
        if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
        {
          cout << "Patch::Interp_Points: point (";
          for (int k = 0; k < dim; k++)
          {
            cout << XX[k][j];
            if (k < dim - 1)
              cout << ",";
            else
              cout << ") is out of current Patch." << endl;
          }
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }
      CachedInterpPoint &cp = surface_cache->points[j];
      Block *BP = cp.bp;
      owner_rank[j] = cp.owner_rank;
      if (BP && myrank == BP->rank)
      {
        for (size_t bi = 0; bi < block_index.views.size(); bi++)
        {
          if (block_index.views[bi].bp == BP)
          {
            local_points[bi].push_back(j);
            break;
          }
        }
      }
    }
    var *v0 = VarList->data;
    var *v1 = VarList->next->data;
    double soa6[6] = {
        v0->SoA[0], v0->SoA[1], v0->SoA[2],
        v1->SoA[0], v1->SoA[1], v1->SoA[2]};
    for (size_t bi = 0; bi < local_points.size(); bi++)
    {
      const int count = int(local_points[bi].size());
      if (count <= 0)
        continue;
      Block *BP = block_index.views[bi].bp;
      vector<double> px(count), py(count), pz(count), out(2 * count);
      for (int q = 0; q < count; q++)
      {
        const int j = local_points[bi][q];
        px[q] = XX[0][j];
        py[q] = XX[1][j];
        pz[q] = XX[2][j];
      }
      const double dx = BP->X[0][1] - BP->X[0][0];
      const double dy = BP->X[1][1] - BP->X[1][0];
      const double dz = BP->X[2][1] - BP->X[2][0];
      const int ok = bssn_cuda_interp_host_two_fields(
          BP, BP->shape,
          BP->fgfs[v0->sgfn], BP->fgfs[v1->sgfn],
          BP->X[0][0], BP->X[1][0], BP->X[2][0],
          dx, dy, dz,
          &px[0], &py[0], &pz[0], count,
          ordn, Symmetry, soa6, &out[0]);
      if (ok != 0)
      {
        if (myrank == 0)
          cout << "Patch::Interp_Points: CUDA two-field interpolation failed" << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      for (int q = 0; q < count; q++)
      {
        const int j = local_points[bi][q];
        Shellf[j * num_var] = out[2 * q];
        Shellf[j * num_var + 1] = out[2 * q + 1];
      }
    }
 #endif
  }
  else
  {
  for (int j = 0; j < NN; j++)
  {
    double pox[dim];
@@ -695,24 +1243,55 @@ void Patch::Interp_Points(MyList<var> *VarList,
      }
    }
-    const int block_i = find_block_index_for_point(block_index, pox, DH);
+    if (use_surface_cache)
    if (block_i >= 0)
    {
-      Block *BP = block_index.views[block_i].bp;
+      CachedInterpPoint &cp = surface_cache->points[j];
-      owner_rank[j] = BP->rank;
+      Block *BP = cp.bp;
-      if (myrank == BP->rank)
+      owner_rank[j] = cp.owner_rank;
      if (BP && myrank == BP->rank)
      {
-        varl = VarList;
+        if (!interpolate_var_list_with_stencil(BP, VarList, num_var, pox,
-        int k = 0;
+                                               ordn, Symmetry, cp.stencil,
-        while (varl)
+                                               Shellf + j * num_var))
        {
-          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+          MyList<var> *varl_fallback = VarList;
-                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+          int k = 0;
-          varl = varl->next;
+          while (varl_fallback)
-          k++;
+          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl_fallback->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl_fallback->data->SoA, Symmetry);
            varl_fallback = varl_fallback->next;
            k++;
          }
        }
      }
    }
    else
    {
      const int block_i = find_block_index_for_point(block_index, pox, DH);
      if (block_i >= 0)
      {
        Block *BP = block_index.views[block_i].bp;
        owner_rank[j] = BP->rank;
      if (myrank == BP->rank)
      {
        if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
                                       Symmetry, Shellf + j * num_var))
        {
          MyList<var> *varl_fallback = VarList;
          int k = 0;
          while (varl_fallback)
          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl_fallback->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl_fallback->data->SoA, Symmetry);
            varl_fallback = varl_fallback->next;
            k++;
          }
        }
      }
      }
    }
  }
  }
 #ifdef INTERP_LB_PROFILE
@@ -969,14 +1548,18 @@ void Patch::Interp_Points(MyList<var> *VarList,
      if (myrank == BP->rank)
      {
        //---> interpolation
-        varl = VarList;
+        if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
-        int k = 0;
+                                       Symmetry, Shellf + j * num_var))
        while (varl) // run along variables
        {
-          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+          varl = VarList;
-                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+          int k = 0;
-          varl = varl->next;
+          while (varl) // run along variables
-          k++;
+          {
            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
        }
      }
    }
--- a/AMSS_NCKU_source/Patch/MPatch.h
+++ b/AMSS_NCKU_source/Patch/MPatch.h
--- a/AMSS_NCKU_source/Variable/MyList.h
+++ b/AMSS_NCKU_source/Variable/MyList.h
--- a/AMSS_NCKU_source/AHF_Direct/Newton.C
+++ b/AMSS_NCKU_source/AHF_Direct/Newton.C
--- a/AMSS_NCKU_source/Null_Evolve/NullEvol.f90
+++ b/AMSS_NCKU_source/Null_Evolve/NullEvol.f90
--- a/AMSS_NCKU_source/Null_Evolve/NullEvol.h
+++ b/AMSS_NCKU_source/Null_Evolve/NullEvol.h
--- a/AMSS_NCKU_source/Null_Evolve/NullEvol2.f90
+++ b/AMSS_NCKU_source/Null_Evolve/NullEvol2.f90
--- a/AMSS_NCKU_source/Null_Evolve/NullNews.f90
+++ b/AMSS_NCKU_source/Null_Evolve/NullNews.f90
--- a/AMSS_NCKU_source/Null_Evolve/NullNews.h
+++ b/AMSS_NCKU_source/Null_Evolve/NullNews.h
--- a/AMSS_NCKU_source/Null_Evolve/NullNews2.f90
+++ b/AMSS_NCKU_source/Null_Evolve/NullNews2.f90
--- a/AMSS_NCKU_source/Null_Evolve/NullShellPatch.C
+++ b/AMSS_NCKU_source/Null_Evolve/NullShellPatch.C
--- a/AMSS_NCKU_source/Null_Evolve/NullShellPatch.h
+++ b/AMSS_NCKU_source/Null_Evolve/NullShellPatch.h
--- a/AMSS_NCKU_source/Null_Evolve/NullShellPatch2.C
+++ b/AMSS_NCKU_source/Null_Evolve/NullShellPatch2.C
--- a/AMSS_NCKU_source/Null_Evolve/NullShellPatch2.h
+++ b/AMSS_NCKU_source/Null_Evolve/NullShellPatch2.h
--- a/AMSS_NCKU_source/Null_Evolve/NullShellPatch2_Evo.C
+++ b/AMSS_NCKU_source/Null_Evolve/NullShellPatch2_Evo.C
--- a/AMSS_NCKU_source/Parallel/Parallel.C
+++ b/AMSS_NCKU_source/Parallel/Parallel.C
--- a/AMSS_NCKU_source/Parallel/Parallel.h
+++ b/AMSS_NCKU_source/Parallel/Parallel.h
@@ -104,6 +104,14 @@ namespace Parallel
    double **recv_bufs;
    int *send_buf_caps;
    int *recv_buf_caps;
    unsigned char *send_buf_pinned;
    unsigned char *recv_buf_pinned;
    unsigned char *send_buf_is_dev;
    unsigned char *recv_buf_is_dev;
    int *send_buf_caps_dev;
    int *recv_buf_caps_dev;
    double **send_bufs_dev;
    double **recv_bufs_dev;
    MPI_Request *reqs;
    MPI_Status *stats;
    int max_reqs;
@@ -111,12 +119,14 @@ namespace Parallel
    int *tc_req_node;
    int *tc_req_is_recv;
    int *tc_completed;
    bool cuda_aware_mode;
    SyncCache();
    void invalidate();
    void destroy();
  };
  void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
  void Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache);
  void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);
--- a/AMSS_NCKU_source/Parallel/Parallel_bam.C
+++ b/AMSS_NCKU_source/Parallel/Parallel_bam.C
--- a/AMSS_NCKU_source/Parallel/Parallel_bam.h
+++ b/AMSS_NCKU_source/Parallel/Parallel_bam.h
--- a/AMSS_NCKU_source/Scalar/Set_Rho_ADM.f90
+++ b/AMSS_NCKU_source/Scalar/Set_Rho_ADM.f90
--- a/AMSS_NCKU_source/Shell_Patch/ShellPatch.C
+++ b/AMSS_NCKU_source/Shell_Patch/ShellPatch.C
--- a/AMSS_NCKU_source/Shell_Patch/ShellPatch.h
+++ b/AMSS_NCKU_source/Shell_Patch/ShellPatch.h
@@ -102,6 +102,16 @@ public:
                 //-1: means no dumy dimension at all; 0: means rho; 1: means sigma
   };
   // Thread-safe search result (no pointers to shared mutable state)
   struct PointSearchResult
   {
      bool found;
      Block *Bg;
      double gx, gy, gz; // global Cartesian coordinates
      double lx, ly, lz; // local coordinates within the found block
      int ssst;          // source shell-patch type (-1 = Cartesian)
   };
   int myrank;
   int shape[dim];   // for (rho, sigma, R), for rho and sigma means number of points for every pi/2
   double Rrange[2]; // for Rmin and Rmax
@@ -175,6 +185,12 @@ public:
                         MyList<Patch> *Pp, double CDH[dim], MyList<pointstru> *pss);
   bool prolongpointstru(MyList<pointstru> *&psul, bool ssyn, int tsst, MyList<ss_patch> *sPp, double DH[dim],
                         MyList<Patch> *Pp, double CDH[dim], double x, double y, double z, int Symmetry, int rank_in);
   // Read-only point search — thread-safe (no shared mutable state modified)
   PointSearchResult prolongpointstru_search(bool ssyn, int tsst, MyList<ss_patch> *sPp, double DH[dim],
                                             MyList<Patch> *Pp, double CDH[dim], double x, double y, double z,
                                             int Symmetry, int rank_in);
   // Append a search result to a linked list — use inside omp critical section
   void prolongpointstru_append(MyList<pointstru> *&psul, const PointSearchResult &sr, int tsst);
   void setupintintstuff(int cpusize, MyList<Patch> *CPatL, int Symmetry);
   void intertransfer(MyList<pointstru> **src, MyList<pointstru> **dst,
                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
--- a/AMSS_NCKU_source/Two_Puncture/TwoPunctureABE.C
+++ b/AMSS_NCKU_source/Two_Puncture/TwoPunctureABE.C
--- a/AMSS_NCKU_source/Two_Puncture/TwoPunctures.C
+++ b/AMSS_NCKU_source/Two_Puncture/TwoPunctures.C
--- a/AMSS_NCKU_source/Two_Puncture/TwoPunctures.h
+++ b/AMSS_NCKU_source/Two_Puncture/TwoPunctures.h
--- a/AMSS_NCKU_source/Z4C/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4C/Z4c_class.C
@@ -3,6 +3,7 @@
 #include <sstream>
 #include <cstdio>
 #include <map>
 #include <string>
 using namespace std;
 #else
 #include <stdio.h>
@@ -28,6 +29,20 @@ using namespace std;
 #include "kodiss.h"
 #include "parameters.h"
 #ifndef USE_CUDA_Z4C
 #define USE_CUDA_Z4C 0
 #endif
 #if USE_CUDA_Z4C && (ABEtype == 2)
 #include "z4c_rhs_cuda.h"
 #endif
 #if USE_CUDA_BSSN
 #include "bssn_rhs_cuda.h"
 #ifdef WithShell
 #include "bssn_gpu.h"
 #endif
 #endif
 #ifdef With_AHF
 #include "derivatives.h"
 #include "myglobal.h"
@@ -37,6 +52,81 @@ using namespace std;
 // Define Z4c_class
 #if USE_CUDA_Z4C && (ABEtype == 2) && defined(WithShell)
 // GPU-accelerated Z4C shell RHS: same parameter signature as f_compute_rhs_Z4c_ss.
 // Internally calls gpu_rhs_z4c_ss which modifies trK→trKd before upload,
 // runs BSSN algebraic kernels, then applies Z4C post-processing (TZ_rhs, damping).
 extern "C" {
 static int cuda_compute_rhs_z4c_ss(
    int *ex, double &T, double *crho, double *sigma, double *R,
    double *X, double *Y, double *Z,
    double *drhodx, double *drhody, double *drhodz,
    double *dsigmadx, double *dsigmady, double *dsigmadz,
    double *dRdx, double *dRdy, double *dRdz,
    double *drhodxx, double *drhodxy, double *drhodxz, double *drhodyy, double *drhodyz, double *drhodzz,
    double *dsigmadxx, double *dsigmadxy, double *dsigmadxz, double *dsigmadyy, double *dsigmadyz, double *dsigmadzz,
    double *dRdxx, double *dRdxy, double *dRdxz, double *dRdyy, double *dRdyz, double *dRdzz,
    double *chi, double *trK,
    double *gxx, double *gxy, double *gxz, double *gyy, double *gyz, double *gzz,
    double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
    double *Gamx, double *Gamy, double *Gamz,
    double *Lap, double *betax, double *betay, double *betaz,
    double *dtSfx, double *dtSfy, double *dtSfz,
    double *TZ,
    double *chi_rhs, double *trK_rhs,
    double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
    double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
    double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
    double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
    double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
    double *TZ_rhs,
    double *rho_mat, double *Sx, double *Sy, double *Sz,
    double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
    double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
    double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
    double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
    double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
    double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
    double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
    int &Symmetry, int &Lev, double &eps, int &sst, int &co)
 {
    return gpu_rhs_z4c_ss(0, 0,  // calledby=ABE_main, mpi_rank=device_0
                     ex, T, crho, sigma, R, X, Y, Z,
                     drhodx, drhody, drhodz,
                     dsigmadx, dsigmady, dsigmadz,
                     dRdx, dRdy, dRdz,
                     drhodxx, drhodxy, drhodxz, drhodyy, drhodyz, drhodzz,
                     dsigmadxx, dsigmadxy, dsigmadxz, dsigmadyy, dsigmadyz, dsigmadzz,
                     dRdxx, dRdxy, dRdxz, dRdyy, dRdyz, dRdzz,
                     chi, trK,
                     gxx, gxy, gxz, gyy, gyz, gzz,
                     Axx, Axy, Axz, Ayy, Ayz, Azz,
                     Gamx, Gamy, Gamz,
                     Lap, betax, betay, betaz,
                     dtSfx, dtSfy, dtSfz,
                     TZ,
                     chi_rhs, trK_rhs,
                     gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
                     Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
                     Gamx_rhs, Gamy_rhs, Gamz_rhs,
                     Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
                     dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
                     TZ_rhs,
                     rho_mat, Sx, Sy, Sz,
                     Sxx, Sxy, Sxz, Syy, Syz, Szz,
                     Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                     Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                     Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                     Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                     ham_Res, movx_Res, movy_Res, movz_Res,
                     Gmx_Res, Gmy_Res, Gmz_Res,
                     Symmetry, Lev, eps, sst, co);
 }
 }
 // Redirect all Z4C shell RHS calls in Step/SHStep to GPU
 #define f_compute_rhs_Z4c_ss cuda_compute_rhs_z4c_ss
 #endif
 // This class inherits some members and methods from the parent `bssn_class` and modifies others.
 // The modified members and methods are defined below (and in the header Z4c_class.h).
 // The remaining members/methods are inherited from `bssn_class` (declared in bssn_class.h).
@@ -132,6 +222,13 @@ void Z4c_class::Initialize()
    PhysTime = StartTime;
    Setup_Black_Hole_position();
  }
  sync_cache_pre = new Parallel::SyncCache[GH->levels];
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
 }
 //================================================================================================
@@ -165,13 +262,581 @@ Z4c_class::~Z4c_class()
 //================================================================================================
-#define MRBD 0 // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them
+#ifndef AMSS_Z4C_MRBD
 #define AMSS_Z4C_MRBD 0
 #endif
 #define MRBD AMSS_Z4C_MRBD // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them
 #ifndef CPBC
 // for sommerfeld boundary
 #if USE_CUDA_Z4C && (ABEtype == 2)
 #if (MRBD == 2)
 #error "USE_CUDA_Z4C resident path does not support MRBD == 2"
 #endif
 namespace {
 static const int k_z4c_cuda_bh_state_indices[3] = {18, 19, 20};
 bool fill_z4c_cuda_views(Block *cg, MyList<var> *vars,
                         double **host_views,
                         double *propspeeds = 0,
                         double *soa_flat = 0)
 {
  int idx = 0;
  while (vars && idx < Z4C_CUDA_STATE_COUNT)
  {
    host_views[idx] = cg->fgfs[vars->data->sgfn];
    if (propspeeds)
      propspeeds[idx] = vars->data->propspeed;
    if (soa_flat)
    {
      soa_flat[3 * idx + 0] = vars->data->SoA[0];
      soa_flat[3 * idx + 1] = vars->data->SoA[1];
      soa_flat[3 * idx + 2] = vars->data->SoA[2];
    }
    vars = vars->next;
    ++idx;
  }
  return idx == Z4C_CUDA_STATE_COUNT && vars == 0;
 }
 void z4c_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank && z4c_cuda_has_resident_state(cg))
      {
        double *state_out[Z4C_CUDA_STATE_COUNT];
        if (!fill_z4c_cuda_views(cg, vars, state_out))
        {
          cout << "CUDA Z4C state list mismatch on resident state download" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (z4c_cuda_download_resident_state(cg, cg->shape, state_out))
        {
          cout << "CUDA Z4C resident state download failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (release_ctx)
          z4c_cuda_release_step_ctx(cg);
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 }
 bool z4c_cuda_patch_contains_point(Patch *patch, const double *point)
 {
  if (!patch)
    return false;
  for (int d = 0; d < dim; d++)
  {
    const double h = patch->getdX(d);
    const double lo = patch->bbox[d] + patch->lli[d] * h;
    const double hi = patch->bbox[dim + d] - patch->uui[d] * h;
    if (point[d] < lo || point[d] > hi)
      return false;
  }
  return true;
 }
 bool z4c_cuda_point_in_block(Patch *patch, Block *block,
                             const double *point, const double *DH)
 {
  if (!patch || !block)
    return false;
  for (int d = 0; d < dim; d++)
  {
    double llb;
    double uub;
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
    llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
              ? block->bbox[d] + patch->lli[d] * DH[d]
              : block->bbox[d] + (ghost_width - 0.5) * DH[d];
    uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
              ? block->bbox[dim + d] - patch->uui[d] * DH[d]
              : block->bbox[dim + d] - (ghost_width - 0.5) * DH[d];
 #else
 #ifdef Cell
    llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
              ? block->bbox[d] + patch->lli[d] * DH[d]
              : block->bbox[d] + ghost_width * DH[d];
    uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
              ? block->bbox[dim + d] - patch->uui[d] * DH[d]
              : block->bbox[dim + d] - ghost_width * DH[d];
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
    if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2)
      return false;
  }
  return true;
 }
 int z4c_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn)
 {
  if (!coords || n <= ordn)
    return 0;
  int cxi = int((x - coords[0]) / dx + 0.4) + 1;
  int start = cxi - ordn / 2;
  if (start < 0)
    start = 0;
  const int max_start = n - ordn;
  if (start > max_start)
    start = max_start;
  return start;
 }
 bool z4c_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
                                       int myrank,
                                       const double *point,
                                       var *forx, var *fory, var *forz,
                                       int Symmetry,
                                       double *shellf)
 {
  const int ordn = 2 * ghost_width;
  int owner_rank = -1;
  shellf[0] = shellf[1] = shellf[2] = 0.0;
  MyList<Patch> *PL = PatL;
  while (PL)
  {
    Patch *patch = PL->data;
    if (!z4c_cuda_patch_contains_point(patch, point))
    {
      PL = PL->next;
      continue;
    }
    double DH[dim];
    for (int d = 0; d < dim; d++)
      DH[d] = patch->getdX(d);
    MyList<Block> *BP = patch->blb;
    while (BP)
    {
      Block *block = BP->data;
      if (z4c_cuda_point_in_block(patch, block, point, DH))
      {
        owner_rank = block->rank;
        if (myrank == owner_rank)
        {
          int interp_ordn = ordn;
          int interp_sym = Symmetry;
          double x = point[0];
          double y = point[1];
          double z = point[2];
          if (z4c_cuda_has_resident_state(block) &&
              block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
          {
            const int sx = ordn;
            const int sy = ordn;
            const int sz = ordn;
            const int region_all = sx * sy * sz;
            const int i0 = z4c_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
            const int j0 = z4c_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
            const int k0 = z4c_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
            double *packed_fields = new double[3 * region_all];
            var *vars[3] = {forx, fory, forz};
            for (int f = 0; f < 3; f++)
            {
              if (z4c_cuda_pack_state_region_to_host_buffer(block,
                                                            k_z4c_cuda_bh_state_indices[f],
                                                            packed_fields + f * region_all,
                                                            block->shape,
                                                            i0, j0, k0,
                                                            sx, sy, sz) != 0)
              {
                delete[] packed_fields;
                cout << "CUDA Z4C BH tile download failed" << endl;
                MPI_Abort(MPI_COMM_WORLD, 1);
              }
              int tile_shape[3] = {sx, sy, sz};
              f_global_interp(tile_shape,
                              block->X[0] + i0,
                              block->X[1] + j0,
                              block->X[2] + k0,
                              packed_fields + f * region_all,
                              shellf[f],
                              x, y, z,
                              interp_ordn,
                              vars[f]->SoA,
                              interp_sym);
            }
            delete[] packed_fields;
          }
          else
          {
            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
                            block->fgfs[forx->sgfn], shellf[0],
                            x, y, z, interp_ordn, forx->SoA, interp_sym);
            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
                            block->fgfs[fory->sgfn], shellf[1],
                            x, y, z, interp_ordn, fory->SoA, interp_sym);
            f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
                            block->fgfs[forz->sgfn], shellf[2],
                            x, y, z, interp_ordn, forz->SoA, interp_sym);
          }
        }
        break;
      }
      if (BP == patch->ble)
        break;
      BP = BP->next;
    }
    if (owner_rank >= 0)
      break;
    PL = PL->next;
  }
  if (owner_rank < 0)
    return false;
  MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD);
  return true;
 }
 bool z4c_cuda_compute_porg_rhs_resident(cgh *GH,
                                        int ilev,
                                        int myrank,
                                        int BH_num,
                                        double **BH_PS,
                                        double **BH_RHS,
                                        var *forx, var *fory, var *forz,
                                        int Symmetry)
 {
  for (int n = 0; n < BH_num; n++)
  {
    double shellf[3] = {0.0, 0.0, 0.0};
    int lev = ilev;
    while (lev >= 0 &&
           !z4c_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n],
                                              forx, fory, forz, Symmetry, shellf))
    {
      --lev;
    }
    if (lev < 0)
      return false;
    BH_RHS[n][0] = -shellf[0];
    BH_RHS[n][1] = -shellf[1];
    BH_RHS[n][2] = -shellf[2];
  }
  return true;
 }
 bool z4c_cuda_resident_step_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_Z4C_CUDA_RESIDENT");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 } // namespace
 #endif
 void Z4c_class::Step(int lev, int YN)
 {
 #if USE_CUDA_Z4C && (ABEtype == 2)
  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
 #ifdef With_AHF
  AH_Step_Find(lev, dT_lev);
 #endif
  bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
  double ndeps = numepss;
  if (lev < GH->movls)
    ndeps = numepsb;
  double TRK4 = PhysTime;
  int iter_count = 0;
  int pre = 0, cor = 1;
  int ERROR = 0;
 #ifdef WithShell
  if (bssn_cuda_use_resident_sync(lev))
  {
    for (int dl = 0; dl < GH->levels; dl++)
      bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
  }
 #endif
  MyList<Patch> *Pp = GH->PatL[lev];
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank)
      {
        double *state_in[Z4C_CUDA_STATE_COUNT];
        double *state_out[Z4C_CUDA_STATE_COUNT];
        double propspeed[Z4C_CUDA_STATE_COUNT];
        double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
        if (!fill_z4c_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
            !fill_z4c_cuda_views(cg, SynchList_pre, state_out))
        {
          cout << "CUDA Z4C state list mismatch on predictor step" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        int apply_bam_bc = 0;
 #if (MRBD == 0)
 #if (SommerType == 0)
        apply_bam_bc = (lev == 0) ? 1 : 0;
 #endif
 #elif (MRBD == 1)
        apply_bam_bc = 1;
 #endif
        int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0;
        int apply_enforce_ga = 0;
 #if (AGM == 0)
        apply_enforce_ga = 1;
 #endif
        if (z4c_cuda_rk4_substep(cg,
                                 cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                 state_in, state_out,
                                 propspeed, soa_flat, Pp->data->bbox,
                                 dT_lev, TRK4, iter_count, apply_bam_bc,
                                 Symmetry, lev, ndeps, pre,
                                 keep_resident_state, apply_enforce_ga, chitiny))
        {
          cout << "CUDA Z4C predictor substep failed in domain: ("
               << cg->bbox[0] << ":" << cg->bbox[3] << ","
               << cg->bbox[1] << ":" << cg->bbox[4] << ","
               << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
          ERROR = 1;
        }
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
  {
    int erh = ERROR;
    MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  }
  if (ERROR)
  {
    if (myrank == 0 && ErrorMonitor->outfile)
      ErrorMonitor->outfile << "CUDA Z4C failed in predictor at t = " << PhysTime
                            << ", lev = " << lev << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
      if (Symmetry > 0)
        Porg[ithBH][2] = fabs(Porg[ithBH][2]);
      if (Symmetry == 2)
      {
        Porg[ithBH][0] = fabs(Porg[ithBH][0]);
        Porg[ithBH][1] = fabs(Porg[ithBH][1]);
      }
    }
  }
  if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime))
    z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
  if (lev == a_lev)
    AnalysisStuff(lev, dT_lev);
  for (iter_count = 1; iter_count < 4; iter_count++)
  {
    if (iter_count == 1 || iter_count == 3)
      TRK4 += dT_lev / 2;
    Pp = GH->PatL[lev];
    while (Pp)
    {
      MyList<Block> *BP = Pp->data->blb;
      while (BP)
      {
        Block *cg = BP->data;
        if (myrank == cg->rank)
        {
          double *state_in[Z4C_CUDA_STATE_COUNT];
          double *state_out[Z4C_CUDA_STATE_COUNT];
          double propspeed[Z4C_CUDA_STATE_COUNT];
          double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
          if (!fill_z4c_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
              !fill_z4c_cuda_views(cg, SynchList_cor, state_out))
          {
            cout << "CUDA Z4C state list mismatch on corrector step" << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          int apply_bam_bc = 0;
 #if (MRBD == 0)
 #if (SommerType == 0)
          apply_bam_bc = (lev == 0) ? 1 : 0;
 #endif
 #elif (MRBD == 1)
          apply_bam_bc = 1;
 #endif
          int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0;
          int apply_enforce_ga = 0;
 #if (AGM == 0)
          apply_enforce_ga = 1;
 #elif (AGM == 1)
          apply_enforce_ga = (iter_count == 3) ? 1 : 0;
 #endif
          if (z4c_cuda_rk4_substep(cg,
                                   cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                   state_in, state_out,
                                   propspeed, soa_flat, Pp->data->bbox,
                                   dT_lev, TRK4, iter_count, apply_bam_bc,
                                   Symmetry, lev, ndeps, cor,
                                   keep_resident_state, apply_enforce_ga, chitiny))
          {
            cout << "CUDA Z4C corrector substep failed in domain: ("
                 << cg->bbox[0] << ":" << cg->bbox[3] << ","
                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
        }
        if (BP == Pp->data->ble)
          break;
        BP = BP->next;
      }
      Pp = Pp->next;
    }
    {
      int erh = ERROR;
      MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }
    if (ERROR)
    {
      if (myrank == 0 && ErrorMonitor->outfile)
        ErrorMonitor->outfile << "CUDA Z4C failed in RK4 substep#" << iter_count
                              << " at t = " << PhysTime
                              << ", lev = " << lev << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
    if (BH_num > 0 && lev == GH->levels - 1)
    {
      if (z4c_cuda_resident_step_enabled())
      {
        if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num,
                                                Porg, Porg1,
                                                Sfx, Sfy, Sfz, Symmetry))
        {
          if (myrank == 0 && ErrorMonitor->outfile)
            ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = "
                                  << PhysTime << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }
      else
      {
        compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
      }
      for (int ithBH = 0; ithBH < BH_num; ithBH++)
      {
        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
        if (Symmetry > 0)
          Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
        if (Symmetry == 2)
        {
          Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
          Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
        }
      }
    }
    if (iter_count < 3)
    {
      Pp = GH->PatL[lev];
      while (Pp)
      {
        MyList<Block> *BP = Pp->data->blb;
        while (BP)
        {
          Block *cg = BP->data;
          cg->swapList(SynchList_pre, SynchList_cor, myrank);
          if (BP == Pp->data->ble)
            break;
          BP = BP->next;
        }
        Pp = Pp->next;
      }
      if (BH_num > 0 && lev == GH->levels - 1)
      {
        for (int ithBH = 0; ithBH < BH_num; ithBH++)
        {
          Porg[ithBH][0] = Porg1[ithBH][0];
          Porg[ithBH][1] = Porg1[ithBH][1];
          Porg[ithBH][2] = Porg1[ithBH][2];
        }
      }
    }
  }
  z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, false);
 #if (RPS == 0)
  RestrictProlong(lev, YN, BB);
 #endif
  Pp = GH->PatL[lev];
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      cg->swapList(StateList, SynchList_cor, myrank);
      cg->swapList(OldStateList, SynchList_cor, myrank);
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
      Porg0[ithBH][0] = Porg1[ithBH][0];
      Porg0[ithBH][1] = Porg1[ithBH][1];
      Porg0[ithBH][2] = Porg1[ithBH][2];
    }
  }
 #else
  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
 #ifdef With_AHF
  AH_Step_Find(lev, dT_lev);
@@ -339,6 +1004,13 @@ void Z4c_class::Step(int lev, int YN)
  }
 #ifdef WithShell
 #if USE_CUDA_Z4C
  if (bssn_cuda_use_resident_sync(lev))
  {
    for (int dl = 0; dl < GH->levels; dl++)
      bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
  }
 #endif
  // evolve Shell Patches
  if (lev == 0)
  {
@@ -1042,9 +1714,11 @@ void Z4c_class::Step(int lev, int YN)
      Porg0[ithBH][2] = Porg1[ithBH][2];
    }
  }
 #endif
 }
 #else
 // for constraint preserving boundary (CPBC)
 // Note: CPBC path uses CPU Fortran RHS; GPU resident sync is a no-op here.
 #ifndef WithShell
 #error "CPBC only supports Shell"
 #endif
@@ -1074,6 +1748,14 @@ void Z4c_class::Step(int lev, int YN)
  int pre = 0, cor = 1;
  int ERROR = 0;
 #if USE_CUDA_Z4C && defined(WithShell)
  if (bssn_cuda_use_resident_sync(lev))
  {
    for (int dl = 0; dl < GH->levels; dl++)
      bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank);
  }
 #endif
  MyList<ss_patch> *sPp;
  // Predictor
  MyList<Patch> *Pp = GH->PatL[lev];
@@ -2404,6 +3086,11 @@ void Z4c_class::Check_extrop()
 //================================================================================================
 #if USE_CUDA_Z4C && (ABEtype == 2) && defined(WithShell)
 #undef f_compute_rhs_Z4c_ss
 #define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_
 #endif
 // this member function is used to compute and output constraint violation
 //================================================================================================
@@ -2679,11 +3366,12 @@ void Z4c_class::Interp_Constraint()
  }
  ofstream outfile;
-  char filename[50];
+  char suffix[64];
-  sprintf(filename, "%s/interp_constraint_%05d.dat", ErrorMonitor->out_dir.c_str(), int(PhysTime / dT + 0.5)); 
+  sprintf(suffix, "/interp_constraint_%05d.dat", int(PhysTime / dT + 0.5));
  string filename = ErrorMonitor->out_dir + suffix;
  // 0.5 for round off
-  outfile.open(filename);
+  outfile.open(filename.c_str());
  outfile << "#  corrdinate, H_Res, Px_Res, Py_Res, Pz_Res, Gx_Res, Gy_Res, Gz_Res, ...." << endl;
  for (int i = 0; i < n; i++)
  {
--- a/AMSS_NCKU_source/Z4C/Z4c_class.h
+++ b/AMSS_NCKU_source/Z4C/Z4c_class.h
--- a/AMSS_NCKU_source/Z4C/Z4c_rhs.f90
+++ b/AMSS_NCKU_source/Z4C/Z4c_rhs.f90
@@ -94,29 +94,31 @@
               Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon,                    &            
               Symmetry,Lev,eps,co)
  if (co == 0) then
 #if (ABV == 0)  
-  call ricci_gamma(ex, X, Y, Z,                                      &
+    call ricci_gamma(ex, X, Y, Z,                                      &
-               chi,                                                  &
+                 chi,                                                  &
-               dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
+                 dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
-               Gamx   ,  Gamy    ,  Gamz    , &
+                 Gamx   ,  Gamy    ,  Gamz    , &
-               Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
+                 Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
-               Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
+                 Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
-               Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
+                 Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
-               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
+                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
-               Symmetry)
+                 Symmetry)
 #endif
-  call constraint_bssn(ex, X, Y, Z,&
+    call constraint_bssn(ex, X, Y, Z,&
-               chi,trK, &
+                 chi,trK, &
-               dxx,gxy,gxz,dyy,gyz,dzz, &
+                 dxx,gxy,gxz,dyy,gyz,dzz, &
-               Axx,Axy,Axz,Ayy,Ayz,Azz, &
+                 Axx,Axy,Axz,Ayy,Ayz,Azz, &
-               Gamx,Gamy,Gamz,&
+                 Gamx,Gamy,Gamz,&
-               Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
+                 Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
-               Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
+                 Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
-               Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
+                 Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
-               Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
+                 Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
-               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
+                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
-               Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
+                 Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
-               Symmetry)
+                 Symmetry)
  endif
  return
@@ -227,6 +229,7 @@
  call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta)
 !!! sanity check
 #ifdef DEBUG
  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
@@ -261,6 +264,7 @@
     gont = 1
     return
  endif
 #endif
  PI = dacos(-ONE)
@@ -1263,30 +1267,32 @@
  endif
  if (co == 0) then
 #if (ABV == 0)  
-  call ricci_gamma(ex, X, Y, Z,                                      &
+    call ricci_gamma(ex, X, Y, Z,                                      &
-               chi,                                                  &
+                 chi,                                                  &
-               dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
+                 dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
-               Gamx   ,  Gamy    ,  Gamz    , &
+                 Gamx   ,  Gamy    ,  Gamz    , &
-               Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
+                 Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
-               Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
+                 Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
-               Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
+                 Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
-               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
+                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
-               Symmetry)
+                 Symmetry)
 #endif
-  call constraint_bssn(ex, X, Y, Z,&
+    call constraint_bssn(ex, X, Y, Z,&
-               chi,trK, &
+                 chi,trK, &
-               dxx,gxy,gxz,dyy,gyz,dzz, &
+                 dxx,gxy,gxz,dyy,gyz,dzz, &
-               Axx,Axy,Axz,Ayy,Ayz,Azz, &
+                 Axx,Axy,Axz,Ayy,Ayz,Azz, &
-               Gamx,Gamy,Gamz,&
+                 Gamx,Gamy,Gamz,&
-               Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
+                 Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
-               Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
+                 Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
-               Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
+                 Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
-               Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
+                 Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
-               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
+                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
-               Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
+                 Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
-               Symmetry)
+                 Symmetry)
  endif
  gont = 0
--- a/AMSS_NCKU_source/Z4C/Z4c_rhs_ss.f90
+++ b/AMSS_NCKU_source/Z4C/Z4c_rhs_ss.f90
@@ -122,6 +122,7 @@
  call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta)
 !!! sanity check
 #ifdef DEBUG
  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
@@ -156,6 +157,7 @@
     gont = 1
     return
  endif
 #endif
  PI = dacos(-ONE)
@@ -1388,41 +1390,43 @@
  call kodis_sh(ex,crho,sigma,R,TZ,TZ_rhs,SSS,Symmetry,eps,sst)
  endif
  if (co == 0) then
 #if (ABV == 1)  
-  call ricci_gamma_ss(ex,crho,sigma,R,X, Y, Z,                                 &
+    call ricci_gamma_ss(ex,crho,sigma,R,X, Y, Z,                               &
-               drhodx, drhody, drhodz,                                         &
+                 drhodx, drhody, drhodz,                                       &
-               dsigmadx,dsigmady,dsigmadz,                                     &
+                 dsigmadx,dsigmady,dsigmadz,                                   &
-               dRdx,dRdy,dRdz,                                                 &
+                 dRdx,dRdy,dRdz,                                               &
-               drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz,                &
+                 drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz,              &
-               dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz,    &
+                 dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz,  &
-               dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz,                            &
+                 dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz,                          &
-               chi,                                                  &
+                 chi,                                                          &
-               dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
+                 dxx    ,   gxy    ,   gxz    ,   dyy    ,   gyz    ,   dzz,&
-               Gamx   ,  Gamy    ,  Gamz    , &
+                 Gamx   ,  Gamy    ,  Gamz    , &
-               Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
+                 Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
-               Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
+                 Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
-               Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
+                 Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
-               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
+                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
-               Symmetry,Lev,sst)
+                 Symmetry,Lev,sst)
  call constraint_bssn_ss(ex,crho,sigma,R,X, Y, Z,  &
               drhodx, drhody, drhodz,                                         &
               dsigmadx,dsigmady,dsigmadz,                                     &
               dRdx,dRdy,dRdz,                                                 &
               drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz,                &
               dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz,    &
               dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz,                            &
               chi,trK, &
               dxx,gxy,gxz,dyy,gyz,dzz, &
               Axx,Axy,Axz,Ayy,Ayz,Azz, &
               Gamx,Gamy,Gamz,&
               Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
               Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
               Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
               Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
               Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
               Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
               Symmetry,Lev,sst)
 #endif
    call constraint_bssn_ss(ex,crho,sigma,R,X, Y, Z,  &
                 drhodx, drhody, drhodz,                                       &
                 dsigmadx,dsigmady,dsigmadz,                                   &
                 dRdx,dRdy,dRdz,                                               &
                 drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz,              &
                 dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz,  &
                 dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz,                          &
                 chi,trK, &
                 dxx,gxy,gxz,dyy,gyz,dzz, &
                 Axx,Axy,Axz,Ayy,Ayz,Azz, &
                 Gamx,Gamy,Gamz,&
                 Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
                 Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
                 Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
                 Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
                 Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
                 Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
                 Symmetry,Lev,sst)
  endif
  gont = 0
--- a/AMSS_NCKU_source/BSSN/adm_constraint.f90
+++ b/AMSS_NCKU_source/BSSN/adm_constraint.f90
--- a/AMSS_NCKU_source/Psi4/adm_ricci_gamma.f90
+++ b/AMSS_NCKU_source/Psi4/adm_ricci_gamma.f90
--- a/AMSS_NCKU_source/AHF_Direct/array.C
+++ b/AMSS_NCKU_source/AHF_Direct/array.C
--- a/AMSS_NCKU_source/AHF_Direct/array.h
+++ b/AMSS_NCKU_source/AHF_Direct/array.h
--- a/AMSS_NCKU_source/BSSN/bssn2adm.f90
+++ b/AMSS_NCKU_source/BSSN/bssn2adm.f90
--- a/AMSS_NCKU_source/BSSN/bssnEM_class.C
+++ b/AMSS_NCKU_source/BSSN/bssnEM_class.C
@@ -2,7 +2,9 @@
 #ifdef newc
 #include <sstream>
 #include <cstdio>
 #include <cstdlib>
 #include <map>
 #include <string>
 using namespace std;
 #else
 #include <stdio.h>
@@ -10,6 +12,7 @@ using namespace std;
 #endif
 #include <time.h>
 #include <cstring>
 #include "macrodef.h"
 #include "misc.h"
@@ -19,6 +22,9 @@ using namespace std;
 #include "bssnEM_class.h"
 #include "bssn_rhs.h"
 #include "empart.h"
 #if USE_CUDA_BSSN
 #include "bssn_rhs_cuda.h"
 #endif
 #include "initial_puncture.h"
 #include "initial_maxwell.h"
 #include "enforce_algebra.h"
@@ -36,6 +42,397 @@ using namespace std;
 //================================================================================================
 namespace
 {
 MyList<var> *advance_var_list(MyList<var> *vars, int count)
 {
  while (vars && count > 0)
  {
    vars = vars->next;
    --count;
  }
  return vars;
 }
 bool bssn_em_step_timing_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_EM_STEP_TIMING");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool bssn_em_step_timing_all_levels_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_EM_STEP_TIMING_ALL_LEVELS");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 #if USE_CUDA_BSSN
 bool bssn_em_zero_analysis_fastpath_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_EM_ZERO_ANALYSIS_FASTPATH");
    enabled = (!env || atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool bssn_em_zero_resident_download_fastpath_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_EM_ZERO_RESIDENT_DOWNLOAD_FASTPATH");
    enabled = (!env || atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool bssn_em_resident_zero_fastpath_ready(MyList<Patch> *PatL,
 #ifdef WithShell
                                          ShellPatch *shell,
 #else
                                          ShellPatch * /*shell*/,
 #endif
                                          int rank)
 {
  int local_ok = 1;
  int local_seen = 0;
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (rank == cg->rank)
      {
        local_seen = 1;
        if (!bssn_em_cuda_resident_zero_fast_state(cg))
          local_ok = 0;
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 #ifdef WithShell
  if (shell && shell->PatL)
  {
    MyList<ss_patch> *SP = shell->PatL;
    while (SP)
    {
      MyList<Block> *BP = SP->data->blb;
      while (BP)
      {
        Block *cg = BP->data;
        if (rank == cg->rank)
        {
          local_seen = 1;
          if (!bssn_em_cuda_resident_zero_fast_state(cg))
            local_ok = 0;
        }
        if (BP == SP->data->ble)
          break;
        BP = BP->next;
      }
      SP = SP->next;
    }
  }
 #endif
  int global_ok = 0;
  int global_seen = 0;
  MPI_Allreduce(&local_ok, &global_ok, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(&local_seen, &global_seen, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
  return global_seen && global_ok;
 }
 bool bssn_em_analysis_zero_fastpath_ready(MyList<Patch> *PatL,
 #ifdef WithShell
                                          ShellPatch *shell,
 #else
                                          ShellPatch *shell,
 #endif
                                          int rank)
 {
  if (!bssn_em_zero_analysis_fastpath_enabled())
    return false;
  return bssn_em_resident_zero_fastpath_ready(PatL, shell, rank);
 }
 void zero_em_analysis_outputs(MyList<Patch> *PatL,
 #ifdef WithShell
                              ShellPatch *shell,
 #else
                              ShellPatch * /*shell*/,
 #endif
                              int rank,
                              var *Rphi2_var, var *Iphi2_var,
                              var *Rphi1_var, var *Iphi1_var)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (rank == cg->rank)
      {
        const size_t all = (size_t)cg->shape[0] * cg->shape[1] * cg->shape[2];
        std::memset(cg->fgfs[Rphi2_var->sgfn], 0, all * sizeof(double));
        std::memset(cg->fgfs[Iphi2_var->sgfn], 0, all * sizeof(double));
        std::memset(cg->fgfs[Rphi1_var->sgfn], 0, all * sizeof(double));
        std::memset(cg->fgfs[Iphi1_var->sgfn], 0, all * sizeof(double));
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 #ifdef WithShell
  if (shell && shell->PatL)
  {
    MyList<ss_patch> *SP = shell->PatL;
    while (SP)
    {
      MyList<Block> *BP = SP->data->blb;
      while (BP)
      {
        Block *cg = BP->data;
        if (rank == cg->rank)
        {
          const size_t all = (size_t)cg->shape[0] * cg->shape[1] * cg->shape[2];
          std::memset(cg->fgfs[Rphi2_var->sgfn], 0, all * sizeof(double));
          std::memset(cg->fgfs[Iphi2_var->sgfn], 0, all * sizeof(double));
          std::memset(cg->fgfs[Rphi1_var->sgfn], 0, all * sizeof(double));
          std::memset(cg->fgfs[Iphi1_var->sgfn], 0, all * sizeof(double));
        }
        if (BP == SP->data->ble)
          break;
        BP = BP->next;
      }
      SP = SP->next;
    }
  }
 #endif
 }
 #endif
 int bssn_em_step_timing_every()
 {
  static int every = -1;
  if (every < 0)
  {
    const char *env = getenv("AMSS_EM_STEP_TIMING_EVERY");
    every = (env && atoi(env) > 0) ? atoi(env) : 1;
  }
  return every;
 }
 #if USE_CUDA_BSSN
 bool fill_bssn_em_bssn_cuda_views(Block *cg, MyList<var> *vars,
                                  double **host_views,
                                  double *propspeeds = 0,
                                  double *soa_flat = 0)
 {
  int idx = 0;
  while (vars && idx < BSSN_CUDA_STATE_COUNT)
  {
    host_views[idx] = cg->fgfs[vars->data->sgfn];
    if (propspeeds)
      propspeeds[idx] = vars->data->propspeed;
    if (soa_flat)
    {
      soa_flat[3 * idx + 0] = vars->data->SoA[0];
      soa_flat[3 * idx + 1] = vars->data->SoA[1];
      soa_flat[3 * idx + 2] = vars->data->SoA[2];
    }
    vars = vars->next;
    ++idx;
  }
  return idx == BSSN_CUDA_STATE_COUNT;
 }
 bool fill_bssn_em_cuda_views(Block *cg, MyList<var> *vars,
                             double **host_views,
                             double *propspeeds = 0,
                             double *soa_flat = 0)
 {
  int idx = 0;
  while (vars && idx < BSSN_EM_CUDA_STATE_COUNT)
  {
    host_views[idx] = cg->fgfs[vars->data->sgfn];
    if (propspeeds)
      propspeeds[idx] = vars->data->propspeed;
    if (soa_flat)
    {
      soa_flat[3 * idx + 0] = vars->data->SoA[0];
      soa_flat[3 * idx + 1] = vars->data->SoA[1];
      soa_flat[3 * idx + 2] = vars->data->SoA[2];
    }
    vars = vars->next;
    ++idx;
  }
  return idx == BSSN_EM_CUDA_STATE_COUNT && vars == 0;
 }
 void fill_bssn_em_fixed_source_cuda_views(Block *cg, double **sources,
                                          var *Jx, var *Jy, var *Jz, var *qchar)
 {
  sources[0] = cg->fgfs[Jx->sgfn];
  sources[1] = cg->fgfs[Jy->sgfn];
  sources[2] = cg->fgfs[Jz->sgfn];
  sources[3] = cg->fgfs[qchar->sgfn];
 }
 void fill_bssn_em_matter_cuda_views(Block *cg, double **matter,
                                    var *rho, var *Sx, var *Sy, var *Sz,
                                    var *Sxx, var *Sxy, var *Sxz,
                                    var *Syy, var *Syz, var *Szz)
 {
  matter[0] = cg->fgfs[rho->sgfn];
  matter[1] = cg->fgfs[Sx->sgfn];
  matter[2] = cg->fgfs[Sy->sgfn];
  matter[3] = cg->fgfs[Sz->sgfn];
  matter[4] = cg->fgfs[Sxx->sgfn];
  matter[5] = cg->fgfs[Sxy->sgfn];
  matter[6] = cg->fgfs[Sxz->sgfn];
  matter[7] = cg->fgfs[Syy->sgfn];
  matter[8] = cg->fgfs[Syz->sgfn];
  matter[9] = cg->fgfs[Szz->sgfn];
 }
 bool bssn_em_cuda_use_resident_sync(int lev)
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_RESIDENT_SYNC");
    if (!env)
      env = getenv("AMSS_CUDA_EM_RESIDENT_SYNC");
    enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
  }
  if (!enabled)
    return false;
 #ifdef WithShell
  (void)lev;
  return false;
 #else
  return true;
 #endif
 }
 bool bssn_em_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
 {
  static int keep_all_levels = -1;
  if (keep_all_levels < 0)
  {
    const char *env = getenv("AMSS_CUDA_EM_KEEP_ALL_LEVELS");
    keep_all_levels = (env && atoi(env) != 0) ? 1 : 0;
  }
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_EM_KEEP_RESIDENT_AFTER_STEP");
    if (!env)
      env = getenv("AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  if (!enabled)
    return false;
  if (lev == analysis_lev)
    return false;
  if (keep_all_levels)
    return true;
  return lev < trfls_in;
 }
 void bssn_em_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                       int myrank, bool release_ctx)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
      {
        double *state_out[BSSN_EM_CUDA_STATE_COUNT];
        if (!fill_bssn_em_cuda_views(cg, vars, state_out))
        {
          cout << "CUDA BSSN-EM resident state list mismatch during download" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_cuda_download_resident_state_count_if_present(cg, cg->shape,
                                                                state_out,
                                                                BSSN_EM_CUDA_STATE_COUNT))
        {
          cout << "CUDA BSSN-EM resident state download failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (release_ctx)
          bssn_cuda_release_step_ctx(cg);
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 }
 void bssn_em_cuda_keep_only_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                        int myrank)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
      {
        double *state_key[BSSN_EM_CUDA_STATE_COUNT];
        if (!fill_bssn_em_cuda_views(cg, vars, state_key))
        {
          cout << "CUDA BSSN-EM resident state list mismatch during prune" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_cuda_keep_only_resident_state_count(cg, cg->shape,
                                                     state_key,
                                                     BSSN_EM_CUDA_STATE_COUNT))
        {
          cout << "CUDA BSSN-EM keep-only resident state failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 }
 #endif
 }
 // Define bssnEM_class
 // It inherits some members and methods from the parent class bssn_class and modifies others.
@@ -258,6 +655,13 @@ void bssnEM_class::Initialize()
    PhysTime = StartTime;
    Setup_Black_Hole_position();
  }
  sync_cache_pre = new Parallel::SyncCache[GH->levels];
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
 }
 //================================================================================================
@@ -833,9 +1237,25 @@ void bssnEM_class::Step(int lev, int YN)
  int iter_count = 0; // count RK4 substeps
  int pre = 0, cor = 1;
  int ERROR = 0;
 #if USE_CUDA_BSSN
  const bool use_cuda_resident_sync = bssn_em_cuda_use_resident_sync(lev);
 #endif
  const bool em_step_timing = bssn_em_step_timing_enabled();
  const double em_step_t0 = em_step_timing ? MPI_Wtime() : 0.0;
  double em_t0 = 0.0;
  double em_t_predictor = 0.0;
  double em_t_predictor_sync = 0.0;
  double em_t_corrector = 0.0;
  double em_t_corrector_sync = 0.0;
  double em_t_analysis = 0.0;
  double em_t_bh = 0.0;
  double em_t_swap = 0.0;
  double em_t_resident = 0.0;
  double em_t_rp = 0.0;
  MyList<ss_patch> *sPp;
  // Predictor
  em_t0 = em_step_timing ? MPI_Wtime() : 0.0;
  MyList<Patch> *Pp = GH->PatL[lev];
  while (Pp)
  {
@@ -845,15 +1265,20 @@ void bssnEM_class::Step(int lev, int YN)
      Block *cg = BP->data;
      if (myrank == cg->rank)
      {
 #if !USE_CUDA_BSSN
 #if (AGM == 0)
        f_enforce_ga(cg->shape,
                     cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                     cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                     cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
 #endif
-        if (
+        int em_rhs_error = 0;
        bool used_gpu_substep = false;
 #if !USE_CUDA_BSSN
        em_rhs_error =
            f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi0->sgfn],
                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
@@ -873,8 +1298,52 @@ void bssnEM_class::Step(int lev, int YN)
                                 cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
                                 cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
                                 cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
-                                 Symmetry, lev, ndeps) ||
+                                 Symmetry, lev, ndeps);
-            f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+#endif
 #if USE_CUDA_BSSN
        if (!em_rhs_error)
        {
          double *state_in[BSSN_EM_CUDA_STATE_COUNT];
          double *state_out[BSSN_EM_CUDA_STATE_COUNT];
          double *sources[BSSN_EM_CUDA_SOURCE_COUNT];
          double propspeed[BSSN_EM_CUDA_STATE_COUNT];
          double soa_flat[3 * BSSN_EM_CUDA_STATE_COUNT];
          if (!fill_bssn_em_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
              !fill_bssn_em_cuda_views(cg, SynchList_pre, state_out))
          {
            cout << "CUDA BSSN-EM state list mismatch on predictor step" << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          fill_bssn_em_fixed_source_cuda_views(cg, sources, Jx, Jy, Jz, qchar);
          int apply_bam_bc = 0;
 #if (SommerType == 0)
 #ifndef WithShell
          apply_bam_bc = (lev == 0) ? 1 : 0;
 #endif
 #endif
          int apply_enforce_ga = 0;
 #if (AGM == 0)
          apply_enforce_ga = 1;
 #endif
          int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
          if (bssn_em_cuda_rk4_substep(cg,
                                       cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                       state_in, state_out, sources,
                                       propspeed, soa_flat, Pp->data->bbox,
                                       dT_lev, TRK4, iter_count, apply_bam_bc,
                                       Symmetry, lev, ndeps, pre,
                                       keep_resident_state, apply_enforce_ga, chitiny))
          {
            ERROR = 1;
          }
          used_gpu_substep = true;
        }
 #endif
        if (em_rhs_error ||
            (!used_gpu_substep &&
             f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                               cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                               cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                               cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -907,7 +1376,7 @@ void bssnEM_class::Step(int lev, int YN)
                               cg->fgfs[Cons_Ham->sgfn],
                               cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
                               cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                               Symmetry, lev, ndeps, pre))
+                               Symmetry, lev, ndeps, pre)))
        {
          cout << "find NaN in domain: (" 
               << cg->bbox[0] << ":" << cg->bbox[3] << "," 
@@ -916,6 +1385,8 @@ void bssnEM_class::Step(int lev, int YN)
          ERROR = 1;
        }
        if (!used_gpu_substep)
        {
        // rk4 substep and boundary
        {
          MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; 
@@ -957,6 +1428,7 @@ void bssnEM_class::Step(int lev, int YN)
          }
        }
        f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
        }
      }
      if (BP == Pp->data->ble)
        break;
@@ -964,6 +1436,8 @@ void bssnEM_class::Step(int lev, int YN)
    }
    Pp = Pp->next;
  }
  if (em_step_timing)
    em_t_predictor += MPI_Wtime() - em_t0;
  // check error information
  {
    int erh = ERROR;
@@ -1221,7 +1695,11 @@ void bssnEM_class::Step(int lev, int YN)
  }
 #endif
-  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
+  if (em_step_timing)
    em_t0 = MPI_Wtime();
  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
  if (em_step_timing)
    em_t_predictor_sync += MPI_Wtime() - em_t0;
 #ifdef WithShell
  if (lev == 0)
@@ -1244,6 +1722,8 @@ void bssnEM_class::Step(int lev, int YN)
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    if (em_step_timing)
      em_t0 = MPI_Wtime();
    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
@@ -1272,16 +1752,24 @@ void bssnEM_class::Step(int lev, int YN)
        DG_List->clearList();
      }
    }
    if (em_step_timing)
      em_t_bh += MPI_Wtime() - em_t0;
  }
  // data analysis part
  // Warning NOTE: the variables1 are used as temp storege room
  if (lev == a_lev)
  {
    if (em_step_timing)
      em_t0 = MPI_Wtime();
    AnalysisStuff_EM(lev, dT_lev);
    if (em_step_timing)
      em_t_analysis += MPI_Wtime() - em_t0;
  }
  // corrector
  for (iter_count = 1; iter_count < 4; iter_count++)
  {
    if (em_step_timing)
      em_t0 = MPI_Wtime();
    // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
    if (iter_count == 1 || iter_count == 3)
      TRK4 += dT_lev / 2;
@@ -1294,6 +1782,7 @@ void bssnEM_class::Step(int lev, int YN)
        Block *cg = BP->data;
        if (myrank == cg->rank)
        {
 #if !USE_CUDA_BSSN
 #if (AGM == 0)
          f_enforce_ga(cg->shape,
                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
@@ -1307,9 +1796,13 @@ void bssnEM_class::Step(int lev, int YN)
                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif
 #endif
-          if (
+          int em_rhs_error = 0;
          bool used_gpu_substep = false;
 #if !USE_CUDA_BSSN
          em_rhs_error =
              f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                   cg->fgfs[phi->sgfn],
                                   cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
@@ -1329,8 +1822,55 @@ void bssnEM_class::Step(int lev, int YN)
                                   cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
                                   cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
                                   cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
-                                   Symmetry, lev, ndeps) ||
+                                   Symmetry, lev, ndeps);
-              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+#endif
 #if USE_CUDA_BSSN
          if (!em_rhs_error)
          {
            double *state_in[BSSN_EM_CUDA_STATE_COUNT];
            double *state_out[BSSN_EM_CUDA_STATE_COUNT];
            double *sources[BSSN_EM_CUDA_SOURCE_COUNT];
            double propspeed[BSSN_EM_CUDA_STATE_COUNT];
            double soa_flat[3 * BSSN_EM_CUDA_STATE_COUNT];
            if (!fill_bssn_em_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
                !fill_bssn_em_cuda_views(cg, SynchList_cor, state_out))
            {
              cout << "CUDA BSSN-EM state list mismatch on corrector step" << endl;
              MPI_Abort(MPI_COMM_WORLD, 1);
            }
            fill_bssn_em_fixed_source_cuda_views(cg, sources, Jx, Jy, Jz, qchar);
            int apply_bam_bc = 0;
 #if (SommerType == 0)
 #ifndef WithShell
            apply_bam_bc = (lev == 0) ? 1 : 0;
 #endif
 #endif
            int apply_enforce_ga = 0;
 #if (AGM == 0)
            apply_enforce_ga = 1;
 #elif (AGM == 1)
            if (iter_count == 3)
              apply_enforce_ga = 1;
 #endif
            int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
            if (bssn_em_cuda_rk4_substep(cg,
                                         cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                         state_in, state_out, sources,
                                         propspeed, soa_flat, Pp->data->bbox,
                                         dT_lev, TRK4, iter_count, apply_bam_bc,
                                         Symmetry, lev, ndeps, cor,
                                         keep_resident_state, apply_enforce_ga, chitiny))
            {
              ERROR = 1;
            }
            used_gpu_substep = true;
          }
 #endif
          if (em_rhs_error ||
              (!used_gpu_substep &&
               f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                 cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                 cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -1362,7 +1902,7 @@ void bssnEM_class::Step(int lev, int YN)
                                 cg->fgfs[Cons_Ham->sgfn],
                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                                 Symmetry, lev, ndeps, cor))
+                                 Symmetry, lev, ndeps, cor)))
          {
            cout << "find NaN in domain: (" 
                 << cg->bbox[0] << ":" << cg->bbox[3] << "," 
@@ -1370,6 +1910,8 @@ void bssnEM_class::Step(int lev, int YN)
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
          if (!used_gpu_substep)
          {
          // rk4 substep and boundary
          {
            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; 
@@ -1412,6 +1954,7 @@ void bssnEM_class::Step(int lev, int YN)
            }
          }
          f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
          }
        }
        if (BP == Pp->data->ble)
          break;
@@ -1683,7 +2226,13 @@ void bssnEM_class::Step(int lev, int YN)
    }
 #endif
-    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
+    if (em_step_timing)
      em_t_corrector += MPI_Wtime() - em_t0;
    if (em_step_timing)
      em_t0 = MPI_Wtime();
    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
    if (em_step_timing)
      em_t_corrector_sync += MPI_Wtime() - em_t0;
 #ifdef WithShell
    if (lev == 0)
@@ -1705,6 +2254,8 @@ void bssnEM_class::Step(int lev, int YN)
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
    {
      if (em_step_timing)
        em_t0 = MPI_Wtime();
      compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
      for (int ithBH = 0; ithBH < BH_num; ithBH++)
      {
@@ -1733,10 +2284,14 @@ void bssnEM_class::Step(int lev, int YN)
          DG_List->clearList();
        }
      }
      if (em_step_timing)
        em_t_bh += MPI_Wtime() - em_t0;
    }
    // swap time level
    if (iter_count < 3)
    {
      if (em_step_timing)
        em_t0 = MPI_Wtime();
      Pp = GH->PatL[lev];
      while (Pp)
      {
@@ -1780,12 +2335,32 @@ void bssnEM_class::Step(int lev, int YN)
          Porg[ithBH][2] = Porg1[ithBH][2];
        }
      }
      if (em_step_timing)
        em_t_swap += MPI_Wtime() - em_t0;
    }
  }
 #if USE_CUDA_BSSN
  if (use_cuda_resident_sync)
  {
    if (em_step_timing)
      em_t0 = MPI_Wtime();
    const bool needs_resident_download =
        !bssn_em_cuda_keep_resident_after_step(lev, trfls, a_lev);
    if (needs_resident_download)
      bssn_em_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
    if (em_step_timing)
      em_t_resident += MPI_Wtime() - em_t0;
  }
 #endif
 #if (RPS == 0)
  // mesh refinement boundary part
  if (em_step_timing)
    em_t0 = MPI_Wtime();
  RestrictProlong(lev, YN, BB);
  if (em_step_timing)
    em_t_rp += MPI_Wtime() - em_t0;
 #ifdef WithShell
  if (lev == 0)
@@ -1813,6 +2388,8 @@ void bssnEM_class::Step(int lev, int YN)
  //
  // OldStateList  old -----------
  // update
  if (em_step_timing)
    em_t0 = MPI_Wtime();
  Pp = GH->PatL[lev];
  while (Pp)
  {
@@ -1858,6 +2435,26 @@ void bssnEM_class::Step(int lev, int YN)
      Porg0[ithBH][2] = Porg1[ithBH][2];
    }
  }
  if (em_step_timing)
  {
    em_t_swap += MPI_Wtime() - em_t0;
    static int em_step_report_count = 0;
    const int em_timing_every = bssn_em_step_timing_every();
    const bool report_all_levels = bssn_em_step_timing_all_levels_enabled();
    if (lev == GH->levels - 1)
      ++em_step_report_count;
    if ((report_all_levels || lev == GH->levels - 1) &&
        (em_timing_every <= 1 || em_step_report_count % em_timing_every == 0))
    {
      fprintf(stderr,
              "[AMSS-EM-STEP-TIMING] lev=%d wall=%.6f predictor=%.6f pre_sync=%.6f "
              "analysis=%.6f corrector=%.6f cor_sync=%.6f bh=%.6f swap=%.6f resident=%.6f rp=%.6f\n",
              lev, MPI_Wtime() - em_step_t0,
              em_t_predictor, em_t_predictor_sync,
              em_t_analysis, em_t_corrector, em_t_corrector_sync,
              em_t_bh, em_t_swap, em_t_resident, em_t_rp);
    }
  }
 }
 //================================================================================================
@@ -2036,6 +2633,59 @@ void bssnEM_class::AnalysisStuff_EM(int lev, double dT_lev)
  if (LastAnas >= AnasTime)
  {
 #if USE_CUDA_BSSN
    const bool zero_em_analysis =
        bssn_em_analysis_zero_fastpath_ready(GH->PatL[lev],
 #ifdef WithShell
                                             SH
 #else
                                             0
 #endif
                                             , myrank
        );
 #else
    const bool zero_em_analysis = false;
 #endif
    if (zero_em_analysis)
    {
 #if USE_CUDA_BSSN
      zero_em_analysis_outputs(GH->PatL[lev],
 #ifdef WithShell
                               SH,
 #else
                               0,
 #endif
                               myrank,
                               Rphi2, Iphi2, Rphi1, Iphi1);
 #endif
      int NN = 0;
      for (int pl = 1; pl < maxl + 1; pl++)
        for (int pm = -pl; pm < pl + 1; pm++)
          NN++;
      double *RP = new double[NN];
      double *IP = new double[NN];
      std::memset(RP, 0, NN * sizeof(double));
      std::memset(IP, 0, NN * sizeof(double));
      for (int i = 0; i < decn; i++)
        Phi2Monitor->writefile(PhysTime, NN, RP, IP);
      delete[] RP;
      delete[] IP;
      NN = 0;
      for (int pl = 0; pl < maxl + 1; pl++)
        for (int pm = -pl; pm < pl + 1; pm++)
          NN++;
      RP = new double[NN];
      IP = new double[NN];
      std::memset(RP, 0, NN * sizeof(double));
      std::memset(IP, 0, NN * sizeof(double));
      for (int i = 0; i < decn; i++)
        Phi1Monitor->writefile(PhysTime, NN, RP, IP);
      delete[] RP;
      delete[] IP;
    }
    else
    {
    Compute_Phi2(lev);
    double *RP, *IP;
    int NN = 0;
@@ -2124,6 +2774,7 @@ void bssnEM_class::AnalysisStuff_EM(int lev, double dT_lev)
    }
    delete[] RP;
    delete[] IP;
    }
  }
  AnalysisStuff(lev, dT_lev); // LastAnas need and only need control here
@@ -2304,11 +2955,12 @@ void bssnEM_class::Interp_Constraint()
  }
  ofstream outfile;
-  char filename[50];
+  char suffix[64];
-  sprintf(filename, "%s/interp_constraint_%05d.dat", ErrorMonitor->out_dir.c_str(), int(PhysTime / dT + 0.5)); 
+  sprintf(suffix, "/interp_constraint_%05d.dat", int(PhysTime / dT + 0.5));
  string filename = ErrorMonitor->out_dir + suffix;
  // 0.5 for round off
-  outfile.open(filename);
+  outfile.open(filename.c_str());
  outfile << "#  corrdinate, H_Res, Px_Res, Py_Res, Pz_Res, Gx_Res, Gy_Res, Gz_Res, ...." << endl;
  for (int i = 0; i < n; i++)
  {
--- a/AMSS_NCKU_source/BSSN/bssnEM_class.h
+++ b/AMSS_NCKU_source/BSSN/bssnEM_class.h
--- a/AMSS_NCKU_source/Scalar/bssnEScalar_class.C
+++ b/AMSS_NCKU_source/Scalar/bssnEScalar_class.C
@@ -3,6 +3,7 @@
 #include <sstream>
 #include <cstdio>
 #include <map>
 #include <string>
 using namespace std;
 #else
 #include <stdio.h>
@@ -25,6 +26,9 @@ using namespace std;
 #include "getnp4.h"
 #include "shellfunctions.h"
 #include "parameters.h"
 #if USE_CUDA_BSSN
 #include "bssn_rhs_cuda.h"
 #endif
 #ifdef With_AHF
 #include "derivatives.h"
@@ -33,6 +37,310 @@ using namespace std;
 //================================================================================================
 namespace
 {
 #if USE_CUDA_BSSN
 bool fill_bssn_escalar_cuda_views(Block *cg, MyList<var> *vars,
                                  double **host_views,
                                  double *propspeeds = 0,
                                  double *soa_flat = 0)
 {
  int idx = 0;
  while (vars && idx < BSSN_ESCALAR_CUDA_STATE_COUNT)
  {
    host_views[idx] = cg->fgfs[vars->data->sgfn];
    if (propspeeds)
      propspeeds[idx] = vars->data->propspeed;
    if (soa_flat)
    {
      soa_flat[3 * idx + 0] = vars->data->SoA[0];
      soa_flat[3 * idx + 1] = vars->data->SoA[1];
      soa_flat[3 * idx + 2] = vars->data->SoA[2];
    }
    vars = vars->next;
    ++idx;
  }
  return idx == BSSN_ESCALAR_CUDA_STATE_COUNT && vars == 0;
 }
 bool bssn_escalar_cuda_use_resident_sync(int lev)
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_RESIDENT_SYNC");
    if (!env)
      env = getenv("AMSS_CUDA_ESCALAR_RESIDENT_SYNC");
    enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
  }
  if (!enabled)
    return false;
 #ifdef WithShell
  (void)lev;
  return false;
 #else
  return true;
 #endif
 }
 bool bssn_escalar_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
 {
  static int keep_all_levels = -1;
  if (keep_all_levels < 0)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS");
    keep_all_levels = (env && atoi(env) != 0) ? 1 : 0;
  }
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  if (!enabled)
    return false;
  if (lev == analysis_lev)
    return false;
  static int release_only_level = -2;
  if (release_only_level == -2)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_RELEASE_ONLY_LEVEL");
    release_only_level = (env && atoi(env) >= 0) ? atoi(env) : -1;
  }
  if (release_only_level >= 0)
    return lev != release_only_level;
  static int keep_level_limit = -2;
  if (keep_level_limit == -2)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_KEEP_LEVELS_BELOW");
    keep_level_limit = (env && atoi(env) >= 0) ? atoi(env) : -1;
  }
  if (keep_level_limit >= 0)
    return lev < keep_level_limit;
  if (keep_all_levels)
    return true;
  return lev < trfls_in;
 }
 bool bssn_escalar_sync_merged_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_ESCALAR_SYNC_MERGED");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 void bssn_escalar_sync_level(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
 {
  if (bssn_escalar_sync_merged_enabled())
    Parallel::Sync_merged(PatL, VarList, Symmetry);
  else
    Parallel::Sync(PatL, VarList, Symmetry);
 }
 bool bssn_escalar_timing_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_ESCALAR_STEP_TIMING");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool bssn_escalar_cuda_post_rp_download_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_POST_RP_DOWNLOAD");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool bssn_escalar_cuda_post_rp_download_level_enabled(int lev)
 {
  if (!bssn_escalar_cuda_post_rp_download_enabled())
    return false;
  static int min_level = -2;
  if (min_level == -2)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_POST_RP_MIN_LEVEL");
    min_level = (env && atoi(env) >= 0) ? atoi(env) : -1;
  }
  return min_level < 0 || lev >= min_level;
 }
 bool bssn_escalar_cuda_post_swap_release_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_POST_SWAP_RELEASE");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 bool bssn_escalar_cuda_pre_rp_release_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_PRE_RP_RELEASE");
    enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
  }
  return enabled != 0;
 }
 bool bssn_escalar_cuda_bh_interp_resident_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_BH_INTERP_RESIDENT");
    enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 0;
  }
  return enabled != 0;
 }
 bool bssn_escalar_cuda_prune_after_swap_enabled()
 {
  static int enabled = -1;
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_ESCALAR_PRUNE_AFTER_SWAP");
    enabled = (env && atoi(env) != 0) ? 1 : 0;
  }
  return enabled != 0;
 }
 void bssn_escalar_cuda_upload_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                          int myrank)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
      {
        double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
        if (!fill_bssn_escalar_cuda_views(cg, vars, state_in))
        {
          cout << "CUDA BSSN-EScalar resident state list mismatch during upload" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_escalar_cuda_upload_resident_state(cg, cg->shape, state_in))
        {
          cout << "CUDA BSSN-EScalar resident state upload failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 }
 void bssn_escalar_cuda_keep_only_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                             int myrank)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
      {
        double *state_key[BSSN_ESCALAR_CUDA_STATE_COUNT];
        if (!fill_bssn_escalar_cuda_views(cg, vars, state_key))
        {
          cout << "CUDA BSSN-EScalar resident state list mismatch during prune" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_escalar_cuda_keep_only_resident_state(cg, cg->shape, state_key))
        {
          cout << "CUDA BSSN-EScalar resident state prune failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 }
 void bssn_escalar_timing_report(int myrank, int lev, int YN, double total, double rhs,
                                double sync, double bh, double analysis, double swap,
                                double resident, double rp)
 {
  if (!bssn_escalar_timing_enabled())
    return;
  double local[8] = {total, rhs, sync, bh, analysis, swap, resident, rp};
  double maxv[8] = {};
  MPI_Reduce(local, maxv, 8, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
  if (myrank == 0)
    fprintf(stderr,
            "[AMSS-ESCALAR-STEP] lev=%d YN=%d total=%.6f rhs=%.6f sync=%.6f "
            "bh=%.6f analysis=%.6f swap=%.6f resident=%.6f rp=%.6f other=%.6f\n",
            lev, YN, maxv[0], maxv[1], maxv[2], maxv[3], maxv[4], maxv[5],
            maxv[6], maxv[7],
            maxv[0] - maxv[1] - maxv[2] - maxv[3] - maxv[4] - maxv[5] - maxv[6] - maxv[7]);
 }
 void bssn_escalar_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                            int myrank, bool release_ctx)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
      {
        double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
        if (!fill_bssn_escalar_cuda_views(cg, vars, state_out))
        {
          cout << "CUDA BSSN-EScalar resident state list mismatch during download" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out))
        {
          cout << "CUDA BSSN-EScalar resident state download failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (release_ctx)
          bssn_cuda_release_step_ctx(cg);
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 }
 #endif
 }
 //================================================================================================
 // Define bssnEScalar_class
 // It inherits some members and methods from the parent class bssn_class and modifies others.
@@ -179,6 +487,11 @@ void bssnEScalar_class::Initialize()
 bssnEScalar_class::~bssnEScalar_class()
 {
 #if USE_CUDA_BSSN
  for (int lev = 0; GH && lev < GH->levels; ++lev)
    bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, true);
 #endif
  delete Sphio;
  delete Spio;
  delete Sphi0;
@@ -708,6 +1021,11 @@ void bssnEScalar_class::Read_Pablo()
 void bssnEScalar_class::Step(int lev, int YN)
 {
  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
 #if USE_CUDA_BSSN
  const bool use_cuda_resident_sync = bssn_escalar_cuda_use_resident_sync(lev);
 #else
  const bool use_cuda_resident_sync = false;
 #endif
 #ifdef With_AHF
  AH_Step_Find(lev, dT_lev);
 #endif
@@ -719,9 +1037,19 @@ void bssnEScalar_class::Step(int lev, int YN)
  int iter_count = 0; // count RK4 substeps
  int pre = 0, cor = 1;
  int ERROR = 0;
  const bool escalar_step_timing = bssn_escalar_timing_enabled();
  const double escalar_step_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
  double escalar_t_rhs = 0.0;
  double escalar_t_sync = 0.0;
  double escalar_t_bh = 0.0;
  double escalar_t_analysis = 0.0;
  double escalar_t_swap = 0.0;
  double escalar_t_resident = 0.0;
  double escalar_t_rp = 0.0;
  MyList<ss_patch> *sPp;
  // Predictor
  double escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
  MyList<Patch> *Pp = GH->PatL[lev];
  while (Pp)
  {
@@ -732,14 +1060,59 @@ void bssnEScalar_class::Step(int lev, int YN)
      if (myrank == cg->rank)
      {
 #if (AGM == 0)
 #if !USE_CUDA_BSSN
        f_enforce_ga(cg->shape,
                     cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                     cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                     cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
 #endif
-        if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+        bool used_gpu_substep = false;
 #if USE_CUDA_BSSN
        {
          double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
          double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
          double propspeed[BSSN_ESCALAR_CUDA_STATE_COUNT];
          double soa_flat[3 * BSSN_ESCALAR_CUDA_STATE_COUNT];
          if (!fill_bssn_escalar_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
              !fill_bssn_escalar_cuda_views(cg, SynchList_pre, state_out))
          {
            cout << "CUDA BSSN-EScalar state list mismatch on predictor step" << endl;
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          int apply_bam_bc = 0;
          int apply_enforce_ga = 0;
 #if (AGM == 0)
          apply_enforce_ga = 1;
 #endif
 #if (SommerType == 0)
 #ifndef WithShell
          apply_bam_bc = (lev == 0) ? 1 : 0;
 #endif
 #endif
          int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
          if (bssn_escalar_cuda_rk4_substep(cg,
                                            cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                            state_in, state_out,
                                            propspeed, soa_flat, Pp->data->bbox,
                                            dT_lev, TRK4, iter_count, apply_bam_bc,
                                            Symmetry, lev, ndeps, pre,
                                            keep_resident_state, apply_enforce_ga, chitiny))
          {
            cout << "CUDA BSSN-EScalar predictor substep failed in domain: ("
                 << cg->bbox[0] << ":" << cg->bbox[3] << ","
                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
          used_gpu_substep = true;
        }
 #endif
        if (!used_gpu_substep &&
            f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                       cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                       cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -783,6 +1156,8 @@ void bssnEScalar_class::Step(int lev, int YN)
          ERROR = 1;
        }
        if (!used_gpu_substep)
        {
        // rk4 substep and boundary
        {
          MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here
@@ -822,6 +1197,7 @@ void bssnEScalar_class::Step(int lev, int YN)
          }
        }
        f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
        }
      }
      if (BP == Pp->data->ble)
        break;
@@ -845,6 +1221,8 @@ void bssnEScalar_class::Step(int lev, int YN)
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
  }
  if (escalar_step_timing)
    escalar_t_rhs += MPI_Wtime() - escalar_t0;
 #ifdef WithShell
  // evolve Shell Patches
@@ -993,7 +1371,14 @@ void bssnEScalar_class::Step(int lev, int YN)
  }
 #endif
  escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
 #if USE_CUDA_BSSN
  bssn_escalar_sync_level(GH->PatL[lev], SynchList_pre, Symmetry);
 #else
  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
 #endif
  if (escalar_step_timing)
    escalar_t_sync += MPI_Wtime() - escalar_t0;
 #ifdef WithShell
  if (lev == 0)
@@ -1016,6 +1401,11 @@ void bssnEScalar_class::Step(int lev, int YN)
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
  {
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
 #if USE_CUDA_BSSN
    if (use_cuda_resident_sync && !bssn_escalar_cuda_bh_interp_resident_enabled())
      bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, false);
 #endif
    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
    for (int ithBH = 0; ithBH < BH_num; ithBH++)
    {
@@ -1044,16 +1434,26 @@ void bssnEScalar_class::Step(int lev, int YN)
        DG_List->clearList();
      }
    }
    if (escalar_step_timing)
      escalar_t_bh += MPI_Wtime() - escalar_t0;
  }
  // data analysis part
  // Warning NOTE: the variables1 are used as temp storege room
  if (lev == a_lev)
  {
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
 #if USE_CUDA_BSSN
    if (use_cuda_resident_sync)
      bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
 #endif
    AnalysisStuff_EScalar(lev, dT_lev);
    if (escalar_step_timing)
      escalar_t_analysis += MPI_Wtime() - escalar_t0;
  }
  // corrector
  for (iter_count = 1; iter_count < 4; iter_count++)
  {
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
    // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
    if (iter_count == 1 || iter_count == 3)
      TRK4 += dT_lev / 2;
@@ -1067,11 +1467,13 @@ void bssnEScalar_class::Step(int lev, int YN)
        if (myrank == cg->rank)
        {
 #if (AGM == 0)
 #if !USE_CUDA_BSSN
          f_enforce_ga(cg->shape,
                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                       cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
                       cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
                       cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif
 #elif (AGM == 1)
          if (iter_count == 3)
            f_enforce_ga(cg->shape,
@@ -1081,7 +1483,50 @@ void bssnEScalar_class::Step(int lev, int YN)
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif
-          if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+          bool used_gpu_substep = false;
 #if USE_CUDA_BSSN
          {
            double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
            double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
            double propspeed[BSSN_ESCALAR_CUDA_STATE_COUNT];
            double soa_flat[3 * BSSN_ESCALAR_CUDA_STATE_COUNT];
            if (!fill_bssn_escalar_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
                !fill_bssn_escalar_cuda_views(cg, SynchList_cor, state_out))
            {
              cout << "CUDA BSSN-EScalar state list mismatch on corrector step" << endl;
              MPI_Abort(MPI_COMM_WORLD, 1);
            }
            int apply_bam_bc = 0;
            int apply_enforce_ga = 0;
 #if (AGM == 0)
            apply_enforce_ga = 1;
 #endif
 #if (SommerType == 0)
 #ifndef WithShell
            apply_bam_bc = (lev == 0) ? 1 : 0;
 #endif
 #endif
            int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
            if (bssn_escalar_cuda_rk4_substep(cg,
                                              cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                              state_in, state_out,
                                              propspeed, soa_flat, Pp->data->bbox,
                                              dT_lev, TRK4, iter_count, apply_bam_bc,
                                              Symmetry, lev, ndeps, cor,
                                              keep_resident_state, apply_enforce_ga, chitiny))
            {
              cout << "CUDA BSSN-EScalar corrector substep failed in domain: ("
                   << cg->bbox[0] << ":" << cg->bbox[3] << ","
                   << cg->bbox[1] << ":" << cg->bbox[4] << ","
                   << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
              ERROR = 1;
            }
            used_gpu_substep = true;
          }
 #endif
          if (!used_gpu_substep &&
              f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                         cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -1125,6 +1570,8 @@ void bssnEScalar_class::Step(int lev, int YN)
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
          if (!used_gpu_substep)
          {
          // rk4 substep and boundary
          {
            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; 
@@ -1167,6 +1614,7 @@ void bssnEScalar_class::Step(int lev, int YN)
            }
          }
          f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
          }
        }
        if (BP == Pp->data->ble)
          break;
@@ -1192,6 +1640,8 @@ void bssnEScalar_class::Step(int lev, int YN)
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }
    if (escalar_step_timing)
      escalar_t_rhs += MPI_Wtime() - escalar_t0;
 #ifdef WithShell
    // evolve Shell Patches
@@ -1349,7 +1799,14 @@ void bssnEScalar_class::Step(int lev, int YN)
    }
 #endif
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
 #if USE_CUDA_BSSN
    bssn_escalar_sync_level(GH->PatL[lev], SynchList_cor, Symmetry);
 #else
    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
 #endif
    if (escalar_step_timing)
      escalar_t_sync += MPI_Wtime() - escalar_t0;
 #ifdef WithShell
    if (lev == 0)
@@ -1371,6 +1828,11 @@ void bssnEScalar_class::Step(int lev, int YN)
    // for black hole position
    if (BH_num > 0 && lev == GH->levels - 1)
    {
      escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
 #if USE_CUDA_BSSN
      if (use_cuda_resident_sync && !bssn_escalar_cuda_bh_interp_resident_enabled())
        bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
 #endif
      compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
      for (int ithBH = 0; ithBH < BH_num; ithBH++)
      {
@@ -1399,10 +1861,13 @@ void bssnEScalar_class::Step(int lev, int YN)
          DG_List->clearList();
        }
      }
      if (escalar_step_timing)
        escalar_t_bh += MPI_Wtime() - escalar_t0;
    }
    // swap time level
    if (iter_count < 3)
    {
      escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
      Pp = GH->PatL[lev];
      while (Pp)
      {
@@ -1446,12 +1911,29 @@ void bssnEScalar_class::Step(int lev, int YN)
          Porg[ithBH][2] = Porg1[ithBH][2];
        }
      }
      if (escalar_step_timing)
        escalar_t_swap += MPI_Wtime() - escalar_t0;
    }
  }
 #if USE_CUDA_BSSN
  if (use_cuda_resident_sync)
  {
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
    if (!bssn_escalar_cuda_keep_resident_after_step(lev, trfls, a_lev))
      bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank,
                                             bssn_escalar_cuda_pre_rp_release_enabled());
    if (escalar_step_timing)
      escalar_t_resident += MPI_Wtime() - escalar_t0;
  }
 #endif
 #if (RPS == 0)
  // mesh refinement boundary part
  escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
  RestrictProlong(lev, YN, BB);
  if (escalar_step_timing)
    escalar_t_rp += MPI_Wtime() - escalar_t0;
 #ifdef WithShell
  if (lev == 0)
@@ -1478,6 +1960,7 @@ void bssnEScalar_class::Step(int lev, int YN)
  //
  // OldStateList  old -----------
  // update
  escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
  Pp = GH->PatL[lev];
  while (Pp)
  {
@@ -1512,6 +1995,25 @@ void bssnEScalar_class::Step(int lev, int YN)
      sPp = sPp->next;
    }
  }
 #endif
  #if USE_CUDA_BSSN
  bool release_after_sync = false;
  if (use_cuda_resident_sync && bssn_escalar_cuda_post_rp_download_level_enabled(lev))
  {
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
    release_after_sync = bssn_escalar_cuda_post_swap_release_enabled();
    bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, release_after_sync);
    if (escalar_step_timing)
      escalar_t_resident += MPI_Wtime() - escalar_t0;
  }
  if (use_cuda_resident_sync && !release_after_sync &&
      bssn_escalar_cuda_prune_after_swap_enabled())
  {
    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
    bssn_escalar_cuda_keep_only_level_state(GH->PatL[lev], StateList, myrank);
    if (escalar_step_timing)
      escalar_t_resident += MPI_Wtime() - escalar_t0;
  }
 #endif
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -1523,6 +2025,14 @@ void bssnEScalar_class::Step(int lev, int YN)
      Porg0[ithBH][2] = Porg1[ithBH][2];
    }
  }
  if (escalar_step_timing)
  {
    escalar_t_swap += MPI_Wtime() - escalar_t0;
    bssn_escalar_timing_report(myrank, lev, YN, MPI_Wtime() - escalar_step_t0,
                               escalar_t_rhs, escalar_t_sync, escalar_t_bh,
                               escalar_t_analysis, escalar_t_swap,
                               escalar_t_resident, escalar_t_rp);
  }
 }
 //================================================================================================
@@ -2024,11 +2534,12 @@ void bssnEScalar_class::Interp_Constraint()
  }
  ofstream outfile;
-  char filename[50];
+  char suffix[64];
-  sprintf(filename, "%s/interp_constraint_%05d.dat", ErrorMonitor->out_dir.c_str(), int(PhysTime / dT + 0.5)); 
+  sprintf(suffix, "/interp_constraint_%05d.dat", int(PhysTime / dT + 0.5));
  string filename = ErrorMonitor->out_dir + suffix;
  // 0.5 for round off
-  outfile.open(filename);
+  outfile.open(filename.c_str());
  outfile << "#  corrdinate, H_Res, Px_Res, Py_Res, Pz_Res, Gx_Res, Gy_Res, Gz_Res, fR_Res, ...." << endl;
  for (int i = 0; i < n; i++)
  {
@@ -2077,7 +2588,37 @@ void bssnEScalar_class::Constraint_Out()
            Block *cg = BP->data;
            if (myrank == cg->rank)
            {
-              if (lev > 0)
+              bool used_cuda_constraints = false;
 #if USE_CUDA_BSSN
              {
                double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
                if (!fill_bssn_escalar_cuda_views(cg, StateList, state_in))
                {
                  cout << "CUDA BSSN-EScalar constraint state list mismatch" << endl;
                  MPI_Abort(MPI_COMM_WORLD, 1);
                }
                double *constraint_out[8] = {
                    cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn],
                    cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
                    cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn],
                    cg->fgfs[Cons_Gz->sgfn], cg->fgfs[Cons_fR->sgfn]};
                int lev_arg = lev;
                int sym_arg = Symmetry;
                double eps_arg = ndeps;
                if (bssn_escalar_cuda_compute_constraints(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                                          state_in, constraint_out,
                                                          sym_arg, lev_arg, eps_arg))
                {
                  cout << "CUDA BSSN-EScalar constraint compute failed in domain: ("
                       << cg->bbox[0] << ":" << cg->bbox[3] << ","
                       << cg->bbox[1] << ":" << cg->bbox[4] << ","
                       << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
                  MPI_Abort(MPI_COMM_WORLD, 1);
                }
                used_cuda_constraints = true;
              }
 #endif
              if (!used_cuda_constraints && lev > 0)
                f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                           cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                                           cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
@@ -2114,7 +2655,8 @@ void bssnEScalar_class::Constraint_Out()
                                           cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
                                           cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
                                           Symmetry, lev, ndeps, pre);
-              f_compute_constraint_fr(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+              if (!used_cuda_constraints)
                f_compute_constraint_fr(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                      cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], 
                                      cg->fgfs[rho->sgfn], cg->fgfs[Sphi0->sgfn],
                                      cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
--- a/AMSS_NCKU_source/Scalar/bssnEScalar_class.h
+++ b/AMSS_NCKU_source/Scalar/bssnEScalar_class.h
--- a/AMSS_NCKU_source/Scalar/bssnEScalar_rhs.f90
+++ b/AMSS_NCKU_source/Scalar/bssnEScalar_rhs.f90
--- a/AMSS_NCKU_source/BSSN/bssn_class.C
+++ b/AMSS_NCKU_source/BSSN/bssn_class.C
--- a/AMSS_NCKU_source/BSSN/bssn_class.h
+++ b/AMSS_NCKU_source/BSSN/bssn_class.h
@@ -48,6 +48,7 @@ public:
       double StartTime, TotalTime;
       double AnasTime, DumpTime, d2DumpTime, CheckTime;
       double LastAnas, LastConsOut;
       bool cuda_level0_constraint_cache_valid;
       int *ConstraintRefreshLevels;
       double Courant;
       double numepss, numepsb, numepsh;
@@ -143,7 +144,7 @@ public:
       bssn_class(double Couranti, double StartTimei, double TotalTimei, double DumpTimei, double d2DumpTimei, double CheckTimei, double AnasTimei,
                  int Symmetryi, int checkruni, char *checkfilenamei, double numepssi, double numepsbi, double numepshi,
                  int a_levi, int maxli, int decni, double maxrexi, double drexi);
-       ~bssn_class();
+       virtual ~bssn_class();
       void Evolve(int Steps);
       void RecursiveStep(int lev);
--- a/AMSS_NCKU_source/BSSN/bssn_constraint.f90
+++ b/AMSS_NCKU_source/BSSN/bssn_constraint.f90
--- a/AMSS_NCKU_source/BSSN_GPU/bssn_gpu.h
+++ b/AMSS_NCKU_source/BSSN_GPU/bssn_gpu.h
@@ -2,7 +2,7 @@
 #ifndef BSSN_GPU_H_
 #define BSSN_GPU_H_
 #include "bssn_macro.h"
-#include "macrodef.fh"
+#include "macrodef.h"
 #define DEVICE_ID 0
 // #define DEVICE_ID_BY_MPI_RANK
@@ -25,49 +25,32 @@
 /**  main function */
 int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,
            double *X, double *Y, double *Z,
            double *chi, double *trK,
            double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
            double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
            double *Gamx, double *Gamy, double *Gamz,
            double *Lap, double *betax, double *betay, double *betaz,
            double *dtSfx, double *dtSfy, double *dtSfz,
            double *chi_rhs, double *trK_rhs,
            double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
            double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
            double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
            double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
            double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
            double *rho, double *Sx, double *Sy, double *Sz, double *Sxx,
            double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
            double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
            double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
            double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
            double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
            double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
            double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
            int &Symmetry, int &Lev, double &eps, int &co);
 int gpu_rhs_ss(RHS_SS_PARA);
-/** Init GPU side data in GPUMeta. */
+#define Z4C_SS_PARA int calledby, int mpi_rank, int *ex, double &T, double *crho, double *sigma, double *R, double *X, double *Y, double *Z, double *drhodx, double *drhody, double *drhodz, double *dsigmadx, double *dsigmady, double *dsigmadz, double *dRdx, double *dRdy, double *dRdz, double *drhodxx, double *drhodxy, double *drhodxz, double *drhodyy, double *drhodyz, double *drhodzz, double *dsigmadxx, double *dsigmadxy, double *dsigmadxz, double *dsigmadyy, double *dsigmadyz, double *dsigmadzz, double *dRdxx, double *dRdxy, double *dRdxz, double *dRdyy, double *dRdyz, double *dRdzz, double *chi, double *trK, double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, double *Gamx, double *Gamy, double *Gamz, double *Lap, double *betax, double *betay, double *betaz, double *dtSfx, double *dtSfy, double *dtSfz, double *TZ, double *chi_rhs, double *trK_rhs, double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, double *TZ_rhs, double *rho, double *Sx, double *Sy, double *Sz, double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, int &Symmetry, int &Lev, double &eps, int &sst, int &co
-// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
+
 int gpu_rhs_z4c_ss(Z4C_SS_PARA);
 #endif
--- a/AMSS_NCKU_source/BSSN_GPU/bssn_gpu_rhs_ss.cu
+++ b/AMSS_NCKU_source/BSSN_GPU/bssn_gpu_rhs_ss.cu
@@ -20,12 +20,14 @@ using namespace std;
 __device__ volatile unsigned int global_count = 0;
 #ifdef RESULT_CHECK
 void compare_result_gpu(int ftag1,double * datac,int data_num){
 	double * data = (double*)malloc(sizeof(double)*data_num);
 	cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
 	compare_result(ftag1,data,data_num);
 	free(data);
 }
 #endif
 __global__ void sub_symmetry_bd_ss_partF(int ord, double * func, double *funcc)
 {
@@ -153,11 +155,11 @@ __global__ void sub_symmetry_bd_ss_partJ(int ord,double * func, double * funcc,d
 inline void sub_symmetry_bd_ss(int ord,double * func, double * funcc,double * SoA){
 	sub_symmetry_bd_ss_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_ss_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_symmetry_bd_ss_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void sub_fderivs_shc_part1(double *fx,double *fy,double *fz){
@@ -247,13 +249,13 @@ inline void sub_fderivs_shc(int& sst,double * f,double * fh,double *fx,double *f
 	//cudaMemset(Msh_ gy,0,h_3D_SIZE[0] * sizeof(double));
 	//cudaMemset(Msh_ gz,0,h_3D_SIZE[0] * sizeof(double));
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(0,fh,h_3D_SIZE[2]);	
 	sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fx,fy,fz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(1,fx,h_3D_SIZE[0]);
 			//compare_result_gpu(2,fy,h_3D_SIZE[0]);
 			//compare_result_gpu(3,fz,h_3D_SIZE[0]);
@@ -451,17 +453,17 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,
 	//fderivs_sh
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(1,fh,h_3D_SIZE[2]);	
 	sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	//fdderivs_sh
 	sub_symmetry_bd_ss(2,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(21,fh,h_3D_SIZE[2]);
 	sub_fdderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gxx,Msh_ gxy,Msh_ gxz,Msh_ gyy,Msh_ gyz,Msh_ gzz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			/*compare_result_gpu(11,Msh_ gx,h_3D_SIZE[0]);
 			compare_result_gpu(12,Msh_ gy,h_3D_SIZE[0]);
 			compare_result_gpu(13,Msh_ gz,h_3D_SIZE[0]);
@@ -472,7 +474,7 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,
 			compare_result_gpu(5,Msh_ gyz,h_3D_SIZE[0]);
 			compare_result_gpu(6,Msh_ gzz,h_3D_SIZE[0]);*/
 	sub_fdderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fxx,fxy,fxz,fyy,fyz,fzz);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			/*compare_result_gpu(1,fxx,h_3D_SIZE[0]);
 			compare_result_gpu(2,fxy,h_3D_SIZE[0]);
 			compare_result_gpu(3,fxz,h_3D_SIZE[0]);
@@ -496,9 +498,9 @@ __global__ void computeRicci_ss_part1(double * dst)
 inline void computeRicci_ss(int &sst,double * src,double* dst,double * SoA, Meta* meta)
 {
 	sub_fdderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 }
 __global__ void sub_lopsided_ss_part1(double * dst)
@@ -516,9 +518,9 @@ __global__ void sub_lopsided_ss_part1(double * dst)
 inline void sub_lopsided_ss(int& sst,double *src,double* dst,double *SoA)
 {
 		sub_fderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,SoA);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 		sub_lopsided_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 }
 __global__ void sub_kodis_sh_part1(double *f,double *fh,double *f_rhs)
@@ -590,11 +592,11 @@ inline void sub_kodis_ss(int &sst,double *f,double *fh,double *f_rhs,double *SoA
 	}
 			//compare_result_gpu(10,f,h_3D_SIZE[0]);
 	sub_symmetry_bd_ss(3,f,fh,SoA1);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(0,fh,h_3D_SIZE[3]);
 	sub_kodis_sh_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 			//compare_result_gpu(1,f_rhs,h_3D_SIZE[0]);
 }
@@ -1699,7 +1701,7 @@ void destroy_meta(Meta *meta,Metass *metass)
 	if(Msh_ gzz) cudaFree(Msh_ gzz);
 #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
-	if(Mh_ reta) CUDA_SAFE_CALL(cudaFree(Mh_ reta));
+	if(Mh_ reta) cudaFree(Mh_ reta);
 #endif	
@@ -1895,7 +1897,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	//1.2 local Data
 	cudaMalloc((void**)&(Mh_ gxx), matrix_size * sizeof(double));
-	CUDA_SAFE_CALL( cudaMalloc((void**)&(Mh_ gyy), matrix_size * sizeof(double)));
+	cudaMalloc((void**)&(Mh_ gyy), matrix_size * sizeof(double));
 	cudaMalloc((void**)&(Mh_ gzz), matrix_size * sizeof(double));
 	cudaMalloc((void**)&(Mh_ chix), matrix_size * sizeof(double));
 	cudaMalloc((void**)&(Mh_ chiy), matrix_size * sizeof(double));
@@ -2160,7 +2162,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	double tmp_con2 = 1/Mass[0] - tmp_con;
 	cudaMemcpyToSymbol(C1, &tmp_con2, sizeof(double));
-	double tmp_con2 = 1/Mass[1] - tmp_con;
+	tmp_con2 = 1/Mass[1] - tmp_con;
 	cudaMemcpyToSymbol(C2, &tmp_con2, sizeof(double));
@@ -2233,7 +2235,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
  		if((sst == 2 || sst == 4) && abs[1] < dYh) 
  		{
  			ijkmin_h[1] = -2; 
-  			ijkmin_h[1] = -3;
+  			ijkmin3_h[1] = -3;
  		}
  		if((sst == 3 || sst == 5) && abs_Y_ex2 < dYh)
  		{
@@ -2287,13 +2289,13 @@ int gpu_rhs_ss(RHS_SS_PARA)
 #ifdef TIMING1
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"TIME USED"<<TimeBetween(tv1, tv2)<<endl; 
 #endif	
 	//cout<<"GPU meta data ready.\n";
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 //-------------get device info-------------------------------------
@@ -2306,7 +2308,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	//sub_enforce_ga(matrix_size);
 	//4.1-----compute rhs---------
 	compute_rhs_ss_part1<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
 	sub_fderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -2322,7 +2324,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	sub_fderivs_shc(sst,Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
 	compute_rhs_ss_part2<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
 	sub_fdderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -2332,7 +2334,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	sub_fderivs_shc( sst,Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
 	compute_rhs_ss_part3<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	computeRicci_ss(sst,Mh_ dxx,Mh_ Rxx,sss, meta);
 	computeRicci_ss(sst,Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -2340,25 +2342,25 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	computeRicci_ss(sst,Mh_ gxy,Mh_ Rxy,aas, meta);
 	computeRicci_ss(sst,Mh_ gxz,Mh_ Rxz,asa, meta);
 	computeRicci_ss(sst,Mh_ gyz,Mh_ Ryz,saa, meta);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	compute_rhs_ss_part4<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs_shc(sst,Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
-	//cudaThreadSynchronize();
+	//cudaDeviceSynchronize();
 	//compare_result_gpu(0,Mh_ chi,h_3D_SIZE[0]);
 	//compare_result_gpu(1,Mh_ chi,h_3D_SIZE[0]);
 	//compare_result_gpu(2,Mh_ fyz,h_3D_SIZE[0]);
 	compute_rhs_ss_part5<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	sub_fdderivs_shc(sst,Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
 	compute_rhs_ss_part6<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
 	sub_fderivs_shc(sst,Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
@@ -2423,7 +2425,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	}
 	if(co == 0){
 		compute_rhs_ss_part7<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 		sub_fderivs_shc(sst,Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
 		sub_fderivs_shc(sst,Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -2432,7 +2434,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 		sub_fderivs_shc(sst,Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
 		sub_fderivs_shc(sst,Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
 		compute_rhs_ss_part8<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 	}
 #if (ABV == 1)
@@ -2512,7 +2514,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	//test kodis
 	//sub_kodis_sh(sst,Msh_ drhodx,Mh_ fh2,Msh_ drhody,sss);
 #ifdef TIMING
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	gettimeofday(&tv2, NULL);
   	cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl; 
 #endif
@@ -2522,4 +2524,55 @@ int gpu_rhs_ss(RHS_SS_PARA)
 	return 0;//TODO return
 }
 #if (ABEtype == 2)
 // Z4C Shell GPU: calls BSSN gpu_rhs_ss with trKd=trK+2*TZ, then applies
 // TZ_rhs = alpn1*Hcon/2 and constraint damping on CPU.
 int gpu_rhs_z4c_ss(Z4C_SS_PARA)
 {
    int matrix_size = ex[0] * ex[1] * ex[2];
    double k1 = 0.02, k2 = 0.0;
    double *trKd_host = new double[matrix_size];
    for (int _i = 0; _i < matrix_size; _i++)
        trKd_host[_i] = trK[_i] + 2.0 * TZ[_i];
    int result = gpu_rhs_ss(calledby, mpi_rank,
        ex, T, crho, sigma, R, X, Y, Z,
        drhodx, drhody, drhodz, dsigmadx, dsigmady, dsigmadz,
        dRdx, dRdy, dRdz,
        drhodxx, drhodxy, drhodxz, drhodyy, drhodyz, drhodzz,
        dsigmadxx, dsigmadxy, dsigmadxz, dsigmadyy, dsigmadyz, dsigmadzz,
        dRdxx, dRdxy, dRdxz, dRdyy, dRdyz, dRdzz,
        chi, trKd_host, dxx, gxy, gxz, dyy, gyz, dzz,
        Axx, Axy, Axz, Ayy, Ayz, Azz,
        Gamx, Gamy, Gamz,
        Lap, betax, betay, betaz,
        dtSfx, dtSfy, dtSfz,
        chi_rhs, trK_rhs,
        gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
        Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
        Gamx_rhs, Gamy_rhs, Gamz_rhs,
        Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
        dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
        rho, Sx, Sy, Sz, Sxx, Sxy, Sxz, Syy, Syz, Szz,
        Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
        Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
        Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
        Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
        ham_Res, movx_Res, movy_Res, movz_Res,
        Gmx_Res, Gmy_Res, Gmz_Res,
        Symmetry, Lev, eps, sst, co);
    delete[] trKd_host;
    if (result != 0) return result;
    for (int _i = 0; _i < matrix_size; _i++) {
        double alp = Lap[_i] + 1.0;
        TZ_rhs[_i] = alp * ham_Res[_i] * 0.5;
        TZ_rhs[_i]  -= alp * (2.0 + k2) * k1 * TZ[_i];
        trK_rhs[_i] += alp * k1 * (1.0 - k2) * TZ[_i];
    }
    return 0;
 }
 #endif // ABEtype == 2
 #endif //WithShell
--- a/AMSS_NCKU_source/BSSN_GPU/bssn_macro.C
+++ b/AMSS_NCKU_source/BSSN_GPU/bssn_macro.C
--- a/AMSS_NCKU_source/BSSN_GPU/bssn_macro.h
+++ b/AMSS_NCKU_source/BSSN_GPU/bssn_macro.h
--- a/AMSS_NCKU_source/BSSN/bssn_rhs.f90
+++ b/AMSS_NCKU_source/BSSN/bssn_rhs.f90
--- a/AMSS_NCKU_source/BSSN/bssn_rhs.h
+++ b/AMSS_NCKU_source/BSSN/bssn_rhs.h
--- a/AMSS_NCKU_source/BSSN/bssn_rhs_c.C
+++ b/AMSS_NCKU_source/BSSN/bssn_rhs_c.C
@@ -1098,12 +1098,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
            betaz_rhs[i] = FF * dtSfz[i];
            reta[i] =
-                gupxx[i] * dtSfx_rhs[i] * dtSfx_rhs[i]
+                gupxx[i] * chix[i] * chix[i]
-                + gupyy[i] * dtSfy_rhs[i] * dtSfy_rhs[i]
+                + gupyy[i] * chiy[i] * chiy[i]
-                + gupzz[i] * dtSfz_rhs[i] * dtSfz_rhs[i]
+                + gupzz[i] * chiz[i] * chiz[i]
-                + TWO * ( gupxy[i] * dtSfx_rhs[i] * dtSfy_rhs[i]
+                + TWO * ( gupxy[i] * chix[i] * chiy[i]
-                        + gupxz[i] * dtSfx_rhs[i] * dtSfz_rhs[i]
+                        + gupxz[i] * chix[i] * chiz[i]
-                        + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
+                        + gupyz[i] * chiy[i] * chiz[i] );
            #if (GAUGE == 2)
            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
@@ -1116,12 +1116,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
            dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
            #elif (GAUGE == 4 || GAUGE == 5)
            reta[i] =
-                gupxx[i] * dtSfx_rhs[i] * dtSfx_rhs[i]
+                gupxx[i] * chix[i] * chix[i]
-                + gupyy[i] * dtSfy_rhs[i] * dtSfy_rhs[i]
+                + gupyy[i] * chiy[i] * chiy[i]
-                + gupzz[i] * dtSfz_rhs[i] * dtSfz_rhs[i]
+                + gupzz[i] * chiz[i] * chiz[i]
-                + TWO * ( gupxy[i] * dtSfx_rhs[i] * dtSfy_rhs[i]
+                + TWO * ( gupxy[i] * chix[i] * chiy[i]
-                        + gupxz[i] * dtSfx_rhs[i] * dtSfz_rhs[i]
+                        + gupxz[i] * chix[i] * chiz[i]
-                        + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
+                        + gupyz[i] * chiy[i] * chiz[i] );
            #if (GAUGE == 4)
            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -0,0 +1,413 @@
 #ifndef BSSN_RHS_CUDA_H
 #define BSSN_RHS_CUDA_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 enum {
    BSSN_CUDA_STATE_COUNT = 24,
    BSSN_ESCALAR_CUDA_STATE_COUNT = 26,
    BSSN_EM_CUDA_STATE_COUNT = 32,
    BSSN_EM_CUDA_SOURCE_COUNT = 4,
    BSSN_CUDA_MATTER_COUNT = 10
 };
 int f_compute_rhs_bssn(int *ex, double &T,
                       double *X, double *Y, double *Z,
                       double *chi, double *trK,
                       double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                       double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                       double *Gamx, double *Gamy, double *Gamz,
                       double *Lap, double *betax, double *betay, double *betaz,
                       double *dtSfx, double *dtSfy, double *dtSfz,
                       double *chi_rhs, double *trK_rhs,
                       double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                       double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                       double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                       double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                       double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                       double *rho, double *Sx, double *Sy, double *Sz,
                       double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                       double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                       double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                       double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                       double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                       double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
                       double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
                       int &Symmetry, int &Lev, double &eps, int &co);
 int bssn_cuda_rk4_substep(void *block_tag,
                          int *ex, double *X, double *Y, double *Z,
                          double **state_host_in,
                          double **state_host_out,
                          double **matter_host,
                          const double *propspeed,
                          const double *soa_flat,
                          const double *bbox,
                          double &dT,
                          double &T,
                          int &RK4,
                          int &apply_bam_bc,
                          int &Symmetry,
                          int &Lev,
                          double &eps,
                          int &co,
                          int &use_zero_matter,
                          int &keep_resident_state,
                          int &apply_enforce_ga,
                          double &chitiny);
 int bssn_escalar_cuda_rk4_substep(void *block_tag,
                                  int *ex, double *X, double *Y, double *Z,
                                  double **state_host_in,
                                  double **state_host_out,
                                  const double *propspeed,
                                  const double *soa_flat,
                                  const double *bbox,
                                  double &dT,
                                  double &T,
                                  int &RK4,
                                  int &apply_bam_bc,
                                  int &Symmetry,
 	                                  int &Lev,
 	                                  double &eps,
 	                                  int &co,
 	                                  int &keep_resident_state,
 	                                  int &apply_enforce_ga,
 	                                  double &chitiny);
 int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double *Z,
                                          double **state_host_in,
                                          double **constraint_host_out,
                                          int &Symmetry,
                                          int &Lev,
                                          double &eps);
 int bssn_em_cuda_rk4_substep(void *block_tag,
                             int *ex, double *X, double *Y, double *Z,
                             double **state_host_in,
                             double **state_host_out,
                             double **source_host,
                             const double *propspeed,
                             const double *soa_flat,
                             const double *bbox,
                             double &dT,
                             double &T,
                             int &RK4,
                             int &apply_bam_bc,
                             int &Symmetry,
                             int &Lev,
                             double &eps,
                             int &co,
                             int &keep_resident_state,
                             int &apply_enforce_ga,
                             double &chitiny);
 int bssn_em_cuda_resident_zero_fast_state(void *block_tag);
 int bssn_cuda_copy_state_region_to_host(void *block_tag,
                                        int state_index,
                                        double *host_state,
                                        int *ex,
                                        int i0, int j0, int k0,
                                        int sx, int sy, int sz);
 int bssn_cuda_copy_state_region_from_host(void *block_tag,
                                          int state_index,
                                          double *host_state,
                                          int *ex,
                                          int i0, int j0, int k0,
                                          int sx, int sy, int sz);
 int bssn_cuda_download_resident_state(void *block_tag,
                                      int *ex,
                                      double **state_host_out);
 int bssn_escalar_cuda_download_resident_state(void *block_tag,
                                              int *ex,
                                              double **state_host_out);
 int bssn_cuda_upload_resident_state_count(void *block_tag,
                                          int *ex,
                                          double **state_host_in,
                                          int state_count);
 int bssn_escalar_cuda_upload_resident_state(void *block_tag,
                                            int *ex,
                                            double **state_host_in);
 int bssn_cuda_keep_only_resident_state_count(void *block_tag,
                                             int *ex,
                                             double **state_host_key,
                                             int state_count);
 int bssn_escalar_cuda_keep_only_resident_state(void *block_tag,
                                               int *ex,
                                               double **state_host_key);
 int bssn_cuda_download_resident_state_count_if_present(void *block_tag,
                                                       int *ex,
                                                       double **state_host_out,
                                                       int state_count);
 int bssn_cuda_download_resident_state_if_present(void *block_tag,
                                                int *ex,
                                                double **state_host_out);
 int bssn_cuda_download_constraint_outputs(int *ex,
                                          double **constraint_host_out);
 int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
                                               int state_index,
                                               double *host_buffer,
                                               int *ex,
                                               int i0, int j0, int k0,
                                               int sx, int sy, int sz);
 int bssn_cuda_interp_state_point3(void *block_tag,
                                  int *ex,
                                  int state0,
                                  int state1,
                                  int state2,
                                  double x0,
                                  double y0,
                                  double z0,
                                  double dx,
                                  double dy,
                                  double dz,
                                  double px,
                                  double py,
                                  double pz,
                                  int ordn,
                                  int symmetry,
                                  double **state_host_key,
                                  const double *soa3,
                                  double *out3);
 int bssn_cuda_interp_host_two_fields(void *block_tag,
                                     int *ex,
                                     double *field0,
                                     double *field1,
                                     double x0,
                                     double y0,
                                     double z0,
                                     double dx,
                                     double dy,
                                     double dz,
                                     const double *px,
                                     const double *py,
                                     const double *pz,
                                     int npoints,
                                     int ordn,
                                     int symmetry,
                                     const double *soa6,
                                     double *out_interleaved);
 int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
                                                   int state_index,
                                                   double *host_buffer,
                                                   int *ex,
                                                   int i0, int j0, int k0,
                                                   int sx, int sy, int sz);
 int bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  int state_index,
                                                                  double *host_buffer,
                                                                  int *ex,
                                                                  int i0, int j0, int k0,
                                                                  int sx, int sy, int sz);
 int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
                                              int state_count,
                                              double *host_buffer,
                                              int *ex,
                                              int i0, int j0, int k0,
                                              int sx, int sy, int sz);
 int bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag,
                                                             double **state_host_key,
                                                             int state_count,
                                                             double *host_buffer,
                                                             int *ex,
                                                             int i0, int j0, int k0,
                                                             int sx, int sy, int sz);
 int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                  int state_count,
                                                  double *host_buffer,
                                                  int *ex,
                                                  int i0, int j0, int k0,
                                                  int sx, int sy, int sz);
 int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag,
                                                                 double **state_host_key,
                                                                 int state_count,
                                                                 double *host_buffer,
                                                                 int *ex,
                                                                 int i0, int j0, int k0,
                                                                 int sx, int sy, int sz);
 int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                                int state_count,
                                                double *device_buffer,
                                                int *ex,
                                                int i0, int j0, int k0,
                                                int sx, int sy, int sz);
 int bssn_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                               double **state_host_key,
                                                               int state_count,
                                                               double *device_buffer,
                                                               int *ex,
                                                               int i0, int j0, int k0,
                                                               int sx, int sy, int sz);
 int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                    int state_count,
                                                    double *device_buffer,
                                                    int *ex,
                                                    int i0, int j0, int k0,
                                                    int sx, int sy, int sz);
 int bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag,
                                                                   double **state_host_key,
                                                                   int state_count,
                                                                   double *device_buffer,
                                                                   int *ex,
                                                                   int i0, int j0, int k0,
                                                                   int sx, int sy, int sz);
 int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
                                                   int *ex,
                                                   int segment_count,
                                                   const int *segment_meta);
 int bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  double *device_buffer,
                                                                  int *ex,
                                                                  int segment_count,
                                                                  const int *segment_meta);
 int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
                                                       int state_count,
                                                       double *device_buffer,
                                                       int *ex,
                                                       int segment_count,
                                                       const int *segment_meta);
 int bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag,
                                                                      double **state_host_key,
                                                                      int state_count,
                                                                      double *device_buffer,
                                                                      int *ex,
                                                                      int segment_count,
                                                                      const int *segment_meta);
 int bssn_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
                                                       int state_count,
                                                       double *device_buffer,
                                                       int *ex,
                                                       int segment_count,
                                                       const int *segment_meta);
 int bssn_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                      double **state_host_key,
                                                                      int state_count,
                                                                      double *device_buffer,
                                                                      int *ex,
                                                                      int segment_count,
                                                                      const int *segment_meta,
                                                                      const double *state_soa);
 int bssn_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
                                                      int state_count,
                                                      double *device_buffer,
                                                      int *ex,
                                                      int segment_count,
                                                      const int *segment_meta);
 int bssn_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                     double **state_host_key,
                                                                     int state_count,
                                                                     double *device_buffer,
                                                                     int *ex,
                                                                     int segment_count,
                                                                     const int *segment_meta,
                                                                     const double *state_soa);
 int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                    int state_count,
                                                    double *device_buffer,
                                                    int *ex,
                                                    int sx, int sy, int sz,
                                                    int fi0, int fj0, int fk0);
 int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                   double **state_host_key,
                                                                   int state_count,
                                                                   double *device_buffer,
                                                                   int *ex,
                                                                   int sx, int sy, int sz,
                                                                   int fi0, int fj0, int fk0,
                                                                   const double *state_soa);
 int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
                                                   int *ex,
                                                   int sx, int sy, int sz,
                                                   int ii0, int jj0, int kk0,
                                                   int lbc_i, int lbc_j, int lbc_k);
 int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  double *device_buffer,
                                                                  int *ex,
                                                                  int sx, int sy, int sz,
                                                                  int ii0, int jj0, int kk0,
                                                                  int lbc_i, int lbc_j, int lbc_k,
                                                                  const double *state_soa);
 int bssn_cuda_download_state_subset(void *block_tag,
                                    int *ex,
                                    int subset_count,
                                    const int *state_indices,
                                    double **state_host_out);
 int bssn_cuda_upload_state_subset(void *block_tag,
                                  int *ex,
                                  int subset_count,
                                  const int *state_indices,
                                  double **state_host_in);
 int bssn_cuda_prepare_inter_time_level(void *block_tag,
                                       int *ex,
                                       int state_count,
                                       double **src1_host_key,
                                       double **src2_host_key,
                                       double **src3_host_key,
                                       double **dst_host_key,
                                       int source_count,
                                       int tindex);
 int bssn_cuda_has_resident_state(void *block_tag);
 void bssn_cuda_release_step_ctx(void *block_tag);
 #ifdef __cplusplus
 }
 // C++-only helpers declared for derived equation classes (Z4C, etc.)
 // Defined in bssn_class.C. Requires MyList, Patch, var from including TU.
 bool bssn_cuda_use_resident_sync(int lev);
 void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var> *vars, int myrank);
 #endif
 #endif
--- a/AMSS_NCKU_source/BSSN/bssn_rhs_ss.f90
+++ b/AMSS_NCKU_source/BSSN/bssn_rhs_ss.f90
--- a/AMSS_NCKU_source/AHF_Direct/cctk.h
+++ b/AMSS_NCKU_source/AHF_Direct/cctk.h
--- a/AMSS_NCKU_source/AHF_Direct/cctk_Config.h
+++ b/AMSS_NCKU_source/AHF_Direct/cctk_Config.h
--- a/AMSS_NCKU_source/AHF_Direct/cctk_Constants.h
+++ b/AMSS_NCKU_source/AHF_Direct/cctk_Constants.h
--- a/AMSS_NCKU_source/AHF_Direct/cctk_Types.h
+++ b/AMSS_NCKU_source/AHF_Direct/cctk_Types.h
--- a/AMSS_NCKU_source/cgh/cgh.C
+++ b/AMSS_NCKU_source/cgh/cgh.C
--- a/AMSS_NCKU_source/cgh/cgh.h
+++ b/AMSS_NCKU_source/cgh/cgh.h
--- a/AMSS_NCKU_source/Check_Point/checkpoint.C
+++ b/AMSS_NCKU_source/Check_Point/checkpoint.C
@@ -76,8 +76,11 @@ checkpoint::checkpoint(bool checked, const char fname[], int myrank) : filename(
 	I_Print = (myrank == 0);
-	int i = strlen(fname);
+	size_t filename_len = out_dir.size() + strlen(fname) + 32;
-	filename = new char[i+30];
+#ifdef CHECKDETAIL
 	filename_len += 32;
 #endif
 	filename = new char[filename_len];
 	// cout << filename << endl;
 	// cout << i << endl;
@@ -103,7 +106,7 @@ checkpoint::checkpoint(bool checked, const char fname[], int myrank) : filename(
 checkpoint::~checkpoint()
 {
 	CheckList->clearList();
-	if (I_Print)
+	if (filename)
 		delete[] filename;
 }
@@ -136,7 +139,7 @@ void checkpoint::writecheck_cgh(double time, cgh *GH)
 	if (I_Print)
 	{
 		// char fname[50];
-		char fname[50+50]; 
+		char fname[4096]; 
 		sprintf(fname, "%s_cgh.CHK", filename);
 		outfile.open(fname, ios::out | ios::trunc);
@@ -195,7 +198,7 @@ void checkpoint::readcheck_cgh(double &time, cgh *GH, int myrank, int nprocs, in
 	int DIM = dim;
 	ifstream infile;
 	// char fname[50];
-	char fname[50+50]; 
+	char fname[4096]; 
 	sprintf(fname, "%s_cgh.CHK", filename);
 	infile.open(fname);
@@ -297,7 +300,7 @@ void checkpoint::writecheck_sh(double time, ShellPatch *SH)
 	if (I_Print)
 	{
-		char fname[50];
+		char fname[4096];
 		sprintf(fname, "%s_sh.CHK", filename);
 		outfile.open(fname, ios::out | ios::trunc);
@@ -335,7 +338,7 @@ void checkpoint::readcheck_sh(ShellPatch *SH, int myrank)
 	int DIM = dim;
 	ifstream infile;
 	// char fname[50];
-	char fname[50+50]; 
+	char fname[4096]; 
 	sprintf(fname, "%s_sh.CHK", filename);
 	infile.open(fname);
@@ -390,7 +393,7 @@ void checkpoint::write_Black_Hole_position(int BH_num_input, int BH_num, double
 	if (I_Print)
 	{
-		char fname[50];
+		char fname[4096];
 		sprintf(fname, "%s_BHp.CHK", filename);
 		outfile.open(fname, ios::out | ios::trunc);
@@ -417,7 +420,7 @@ void checkpoint::read_Black_Hole_position(int &BH_num_input, int &BH_num, double
 {
 	ifstream infile;
 	// char fname[50];
-	char fname[50+50]; 
+	char fname[4096]; 
 	sprintf(fname, "%s_BHp.CHK", filename);
 	infile.open(fname);
@@ -461,7 +464,7 @@ void checkpoint::write_bssn(double LastDump, double Last2dDump, double LastAnas)
 	if (I_Print)
 	{
-		char fname[50];
+		char fname[4096];
 		sprintf(fname, "%s_bssn.CHK", filename);
 		outfile.open(fname, ios::out | ios::trunc);
@@ -481,7 +484,7 @@ void checkpoint::read_bssn(double &LastDump, double &Last2dDump, double &LastAna
 {
 	ifstream infile;
 	// char fname[50];
-	char fname[50+50]; 
+	char fname[4096]; 
 	sprintf(fname, "%s_bssn.CHK", filename);
 	infile.open(fname);
@@ -506,7 +509,7 @@ void checkpoint::write_bssn(double LastDump, double Last2dDump, double LastAnas)
 	ofstream outfile;
 	// char fname[50];
-	char fname[50+50]; 
+	char fname[4096]; 
 	sprintf(fname, "%s_bssn.CHK", filename);
 	outfile.open(fname, ios::out | ios::trunc);
@@ -527,7 +530,7 @@ void checkpoint::read_bssn(double &LastDump, double &Last2dDump, double &LastAna
 {
 	ifstream infile;
 	// char fname[50];
-	char fname[50+50]; 
+	char fname[4096]; 
 	sprintf(fname, "%s_bssn.CHK", filename);
 	infile.open(fname);
@@ -551,7 +554,7 @@ void checkpoint::write_Black_Hole_position(int BH_num_input, int BH_num, double
 	ofstream outfile;
 	// char fname[50];
-	char fname[50+50]; 
+	char fname[4096]; 
 	sprintf(fname, "%s_BHp.CHK", filename);
 	outfile.open(fname, ios::out | ios::trunc);
@@ -581,7 +584,7 @@ void checkpoint::read_Black_Hole_position(int &BH_num_input, int &BH_num, double
 {
 	ifstream infile;
 	// char fname[50];
-	char fname[50+50]; 
+	char fname[4096]; 
 	sprintf(fname, "%s_BHp.CHK", filename);
 	infile.open(fname);
@@ -628,7 +631,7 @@ void checkpoint::writecheck_cgh(double time, cgh *GH)
 	ofstream outfile;
 	// char fname[50];
-	char fname[50+50]; 
+	char fname[4096]; 
 	sprintf(fname, "%s_cgh.CHK", filename);
 	outfile.open(fname, ios::out | ios::trunc);
@@ -738,7 +741,7 @@ void checkpoint::readcheck_cgh(double &time, cgh *GH, int myrank, int nprocs, in
 	int DIM = dim;
 	ifstream infile;
 	// char fname[50];
-	char fname[50+50]; 
+	char fname[4096]; 
 	sprintf(fname, "%s_cgh.CHK", filename);
 	infile.open(fname);
--- a/AMSS_NCKU_source/Check_Point/checkpoint.h
+++ b/AMSS_NCKU_source/Check_Point/checkpoint.h
--- a/AMSS_NCKU_source/AHF_Direct/config.h
+++ b/AMSS_NCKU_source/AHF_Direct/config.h
--- a/AMSS_NCKU_source/AHF_Direct/coords.C
+++ b/AMSS_NCKU_source/AHF_Direct/coords.C
--- a/AMSS_NCKU_source/AHF_Direct/coords.h
+++ b/AMSS_NCKU_source/AHF_Direct/coords.h
--- a/AMSS_NCKU_source/Z4C/cpbc.f90
+++ b/AMSS_NCKU_source/Z4C/cpbc.f90
--- a/AMSS_NCKU_source/Z4C/cpbc.h
+++ b/AMSS_NCKU_source/Z4C/cpbc.h
--- a/AMSS_NCKU_source/Z4C/cpbc_util.C
+++ b/AMSS_NCKU_source/Z4C/cpbc_util.C
--- a/AMSS_NCKU_source/AHF_Direct/cpm_map.C
+++ b/AMSS_NCKU_source/AHF_Direct/cpm_map.C
--- a/AMSS_NCKU_source/AHF_Direct/cpm_map.h
+++ b/AMSS_NCKU_source/AHF_Direct/cpm_map.h
--- a/AMSS_NCKU_source/Derivative/derivatives.h
+++ b/AMSS_NCKU_source/Derivative/derivatives.h
--- a/AMSS_NCKU_source/Derivative/diff_new.f90
+++ b/AMSS_NCKU_source/Derivative/diff_new.f90
--- a/AMSS_NCKU_source/Derivative/diff_new_sh.f90
+++ b/AMSS_NCKU_source/Derivative/diff_new_sh.f90
--- a/AMSS_NCKU_source/Derivative/diff_newwb.f90
+++ b/AMSS_NCKU_source/Derivative/diff_newwb.f90
--- a/AMSS_NCKU_source/AHF_Direct/driver.h
+++ b/AMSS_NCKU_source/AHF_Direct/driver.h
--- a/AMSS_NCKU_source/BSSN/empart.f90
+++ b/AMSS_NCKU_source/BSSN/empart.f90
--- a/AMSS_NCKU_source/BSSN/empart.h
+++ b/AMSS_NCKU_source/BSSN/empart.h
--- a/AMSS_NCKU_source/BSSN/enforce_algebra.f90
+++ b/AMSS_NCKU_source/BSSN/enforce_algebra.f90
--- a/AMSS_NCKU_source/BSSN/enforce_algebra.h
+++ b/AMSS_NCKU_source/BSSN/enforce_algebra.h
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
CGH0S7	dd5b7561c1	Case closed.	2026-05-20 11:16:56 +08:00
CGH0S7	a99534d2f3	Refine GPU runtime controls and input checker	2026-05-18 01:02:55 +08:00
CGH0S7	f2264989d8	Fix CUDA AMR symmetry drift	2026-05-17 23:46:15 +08:00
CGH0S7	a0b43bae04	Restore default GPU BH interpolation	2026-05-17 12:05:09 +08:00
CGH0S7	c7a48ebe7e	Stabilize GPU BH trajectory defaults	2026-05-17 11:52:50 +08:00
CGH0S7	5d8dfaf679	Add plot-only restart script to skip recomputation when plotting is interrupted Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-12 15:01:25 +08:00
CGH0S7	24f4a45097	Fix macrodef.h include and clean up stale z4c_gpu_rhs_ss.cu Include macrodef.h (not macrodef.fh) in gpu_rhsSS_mem.h and bssn_gpu.h so that ABEtype is visible to #if guards in CUDA files. Remove the separate z4c_gpu_rhs_ss.cu (merged into bssn_gpu_rhs_ss.cu). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-10 20:02:35 +08:00
CGH0S7	f16469ea77	Simplify Z4C Shell GPU: CPU-side trKd+TZ_rhs wrapper Replace the duplicated z4c_gpu_rhs_ss.cu with a lightweight gpu_rhs_z4c_ss wrapper inside bssn_gpu_rhs_ss.cu (guarded by #if ABEtype==2). The wrapper: 1. Builds trKd = trK + 2TZ on host and passes it to gpu_rhs_ss 2. After BSSN GPU returns, computes TZ_rhs = alpn1Hcon/2 and applies kappa1/kappa2 constraint damping on CPU This avoids duplicate kernel definitions (linker errors) and keeps all shell GPU code in a single file. The CPU-side Z4C corrections are O(100K) operations — negligible vs GPU RHS time. Also remove the separate z4c_gpu_rhs_ss.cu and its build rule. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-10 16:05:56 +08:00
CGH0S7	f754aa1ec2	Add Z4C Shell-Patch GPU acceleration (Phase 3 complete) Create z4c_gpu_rhs_ss.cu (reusing BSSN shell FD/chain-rule kernels): - Uploads trKd = trK + 2TZ to GPU so existing BSSN algebraic kernels compute correct Z4C physical equations without modification - New kern_z4c_post applies TZ_rhs = alpn1 Hcon / 2, kappa1/kappa2 constraint damping, TZ advection (lopsided), and dissipation (kodis) - Adds TZ/TZ_rhs to Meta struct, alloc/upload/download/free lifecycle Add cuda_compute_rhs_z4c_ss() wrapper in Z4c_class.C matching the Fortran f_compute_rhs_Z4c_ss signature, with #define redirection for Step/SHStep call sites and #undef before analysis functions. Add z4c_gpu_rhs_ss.o to ABE_CUDA_CFILES and build rule in makefile. Add kappa1_c/kappa2_c constants to gpu_rhsSS_mem.h. Build verified with USE_CUDA_Z4C=1 + WithShell — compiles and links cleanly. All three Shell GPU files now coexist: bssn_gpu_rhs_ss.o (BSSN), z4c_gpu_rhs_ss.o (Z4C), both sharing FD/chain-rule kernels. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-10 13:52:48 +08:00
CGH0S7	c4194214c6	Enable Z4C + Shell-Patch GPU coexistence (Phase 3) Remove the compile-time #error that blocked USE_CUDA_Z4C + WithShell. Add GPU-to-CPU state sync at the start of both Z4C Step functions (non-CPBC and CPBC) so shell CPU consumers read valid field data after Cartesian GPU RHS with resident state. Move bssn_cuda_use_resident_sync and bssn_cuda_download_level_state _if_present from anonymous namespace to file scope in bssn_class.C so derived classes (Z4C) can call them. Declare both in bssn_rhs_cuda.h. Include bssn_rhs_cuda.h in Z4c_class.C. Z4C shell RHS remains on CPU (Fortran Z4c_rhs_ss.f90) pending future GPU kernel implementation. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-10 12:08:02 +08:00
CGH0S7	0ca86afd41	Use static OpenMP schedule in ShellPatch::setupintintstuff Static scheduling has lower overhead than guided for uniform workloads (grid points all have equal computational cost). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-10 02:23:07 +08:00
CGH0S7	f5bf3ab252	Add thread-safe ShellPatch::setupintintstuff with OpenMP Split prolongpointstru into search-only (prolongpointstru_search) and append-only (prolongpointstru_append) functions. The search is read-only and thread-safe; each thread builds private linked lists via prolongpointstru_append, merged after the parallel loop. This eliminates critical-section contention and delivers ~2.2x speedup: setupintintstuff: 511s -> 252s, total init: 592s -> 267s. Also add -qopenmp to ShellPatch.o compilation via makefile override rule and <omp.h> include with _OPENMP guards + fallback stubs. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-10 02:10:20 +08:00
CGH0S7	d0d3f965a6	Add diagnostic timing to Shell-Patch initialization Print MPI_Wtime breakdown of Initialize() shell setup steps and Read_Ansorg::Compute_Constraint duration. Reveals that ShellPatch::setupintintstuff() takes ~511s of the ~590s startup. The function builds interpolation tables by searching every shell grid point against all Cartesian patches — thread-safe OpenMP parallelization is blocked by shared linked-list mutations in prolongpointstru(), which would need a search/append split first. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-09 21:51:07 +08:00
CGH0S7	fbb2ed112d	Fix Compile_Constraint/analysis use CPU Fortran for shell RHS Limit GPU shell RHS redirection to Step and SHStep only via #define/#undef. Compute_Constraint, Interp_Constraint, and Constraint_Out continue using the CPU Fortran path to avoid GPU alloc-per-call overhead during initialization and analysis phases. Also: wrap compare_result_gpu in #ifdef RESULT_CHECK to avoid link error. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-09 19:25:45 +08:00
CGH0S7	bd4ce3fbf3	GPU-accelerate Shell-Patch BSSN evolution Phase 1: Enable GPU resident state for Cartesian patches in Shell mode. - Remove WithShell guard from bssn_cuda_use_resident_sync(). - Add GPU-to-CPU state sync before shell CPU consumers (SHStep, CS_Inter, inline shell RHS blocks). Phase 2: GPU-accelerate BSSN Shell Patch RHS. - Create bssn_gpu.h with RHS_SS_PARA macro and gpu_rhs_ss declaration. - Fix compilation bugs in legacy bssn_gpu_rhs_ss.cu (deprecated cudaThreadSynchronize, tmp_con2 redeclaration, ijkmin3_h typo, CUDA_SAFE_CALL, missing compare_result guard). - Add bssn_gpu_rhs_ss.o to CFILES_CUDA_BSSN with build rule. - Write cuda_compute_rhs_bssn_ss() wrapper bridging Fortran and GPU parameter conventions, redirect all shell RHS call sites via #define. Verified: 30-step Shell-Patch GPU run completes without errors/NaN. Step wall time ~4.4s (step_fn ~2.0s + RP ~0.68s + constraint ~0.70s). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-09 18:50:10 +08:00
CGH0S7	5eb49949d9	Fix AHF crash under CUDA resident-sync mode Download BSSN StateList from GPU to CPU before AHFinderDirect_find_horizons so that AH_Interp_Points reads valid field data instead of stale CPU arrays. The resident-sync path keeps canonical state on GPU; without this download the Newton iteration diverges and probes outside the computational domain. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-09 16:11:56 +08:00
CGH0S7	39450228f5	Accelerate Shell-Patch interpolation fast paths	2026-05-08 13:26:16 +08:00
CGH0S7	063f28b3b4	Add Shell-Patch GPU runtime fast paths	2026-05-08 09:26:36 +08:00
CGH0S7	1064a68d16	Optimize BSSN-EM 8th-order AMR transfers	2026-05-07 21:38:16 +08:00
CGH0S7	dcc83bafcb	Support 2nd and 8th order CUDA AMR paths	2026-05-07 20:31:26 +08:00
CGH0S7	c4d8d41b25	Cover Z4C CUDA AMR restrict prolong	2026-05-07 19:49:09 +08:00
CGH0S7	0076b3ca18	Optimize 6th-order CUDA AMR stencils	2026-05-07 19:22:37 +08:00
CGH0S7	9ff2f065be	Apply BSSN AMR sync default to EScalar	2026-05-07 17:12:33 +08:00
CGH0S7	2317e4abde	Fix BSSN GPU resident AMR sync default	2026-05-07 17:11:09 +08:00
CGH0S7	fea2dcc0d5	Fix BSSN-EM runtime crash	2026-05-07 16:47:55 +08:00
CGH0S7	5525465cad	Support CUDA finite-difference order selection	2026-05-07 16:28:02 +08:00
CGH0S7	96829d0441	Optimize Z4C GPU runtime defaults	2026-05-07 15:37:09 +08:00
CGH0S7	83afaf19ce	Skip zero EM resident downloads	2026-05-07 13:04:46 +08:00
CGH0S7	cb911dec06	Add EM GPU fast paths and defaults	2026-05-07 12:18:56 +08:00
CGH0S7	dd0e20d8c7	Fix BSSN-EScalar CUDA boundary and scalar KO	2026-05-06 15:44:35 +08:00
CGH0S7	ffa0d801ed	Default Python GPU runner to EScalar fast path	2026-05-06 00:12:46 +08:00
CGH0S7	ae64a22178	Complete BSSN-EScalar CUDA resident transfers	2026-05-05 23:57:42 +08:00
CGH0S7	85fe29cc2e	Optimize BSSN-EScalar CUDA path	2026-05-05 10:47:46 +08:00
ianchb	06f62dee36	Switch back to Intel toolchain as the default option Seems that Intel MPI also supports CUDA-aware by setting I_MPI_OFFLOAD to 1. Besides, I_MPI_OFFLOAD_IPC=0 is needed to avoid segfaults.	2026-05-01 21:59:13 +08:00
CGH0S7	35b6ceff02	Broaden cached CUDA sync paths	2026-05-01 18:03:04 +08:00
CGH0S7	51f3819892	Save generated source formatting state	2026-04-30 20:47:44 +08:00
CGH0S7	a9a3809148	Default Python launcher to fast GPU path	2026-04-30 20:15:34 +08:00
CGH0S7	b1974ef146	Stabilize device AMR restrict across regrid	2026-04-30 20:01:18 +08:00
CGH0S7	be9033f449	Add optional CUDA surface interpolation	2026-04-30 19:21:19 +08:00
CGH0S7	6835608f92	Add configurable analysis MAP cadence	2026-04-30 19:10:12 +08:00
CGH0S7	e0d0673c8e	Enable optimized GPU runs from Python launcher	2026-04-30 18:31:31 +08:00
CGH0S7	da4d56ccf7	Optimize BSSN surface interpolation fast path	2026-04-30 18:25:21 +08:00
CGH0S7	a6483d013d	Add CUDA AMR restrict diagnostics	2026-04-30 12:20:44 +08:00
CGH0S7	8486532920	Add resident BSSN GPU point interpolation	2026-04-30 11:39:15 +08:00
CGH0S7	18e9c9cc50	Optimize BSSN CUDA resident AMR prolong path	2026-04-30 10:58:15 +08:00
CGH0S7	1ee229a91f	Add keyed BSSN CUDA resident banks	2026-04-29 19:44:19 +08:00
CGH0S7	68eab03bac	Add opt-in BSSN CUDA resident AMR path	2026-04-29 19:15:37 +08:00
CGH0S7	090d8657ae	Optimize BSSN CUDA state transfers	2026-04-29 18:34:31 +08:00
CGH0S7	22c1e7168b	Optimize BSSN CUDA resident state and CUDA-aware MPI	2026-04-29 17:05:10 +08:00
ianchb	a0dab90bcb	Switch to NVIDIA HPC Toolchain	2026-04-29 08:31:49 +08:00
ianchb	c689cc8dc9	[WIP] Add CUDA support for Z4C Rewritten done by Codex. This still has errors, do not pick this one now.	2026-04-27 11:58:43 +08:00
ianchb	60fee8f1c1	Fix Z4C C++ gauge damping ordering	2026-04-26 15:38:13 +08:00
ianchb	843b116954	Add C++ Z4C RHS path and port some BSSN optimizations	2026-04-25 10:39:01 +08:00
ianchb	c768e1220b	Also disable cached sync for Z4C	2026-04-25 10:25:54 +08:00
CGH0S7	02f149e2e3	Disable cached sync for BSSN-EScalar	2026-04-25 10:17:47 +08:00
CGH0S7	422e8ec4dc	Fallback BSSN-EScalar restrict/prolong path	2026-04-25 10:10:34 +08:00
CGH0S7	c4909b9843	更新精度检查脚本加入图像比对检查 (cherry picked from commit `ac82ebd889`)	2026-04-25 09:40:12 +08:00
ianchb	f521a97563	Fix ABE CPU version build error	2026-04-25 09:39:49 +08:00
ianchb	53c55451b3	Update makefile and scripts for CUDA BSSN configuration and build commands	2026-04-25 09:19:50 +08:00
CGH0S7	768345954f	Add optional BSSN kernel profiling switches (cherry picked from commit `9c31384b2f`)	2026-04-25 08:39:43 +08:00
CGH0S7	9a6df6438b	Remove dead chi derivative setup in BSSN RHS (cherry picked from commit `e4e741caa1`)	2026-04-25 08:38:01 +08:00
CGH0S7	8e9463aa90	Localize chi Ricci intermediates in RHS (cherry picked from commit `65e0f95f40`)	2026-04-25 08:37:41 +08:00
CGH0S7	7c6f15002e	Elide dead stores in BSSN RHS hot path (cherry picked from commit `f9fbf97e64`)	2026-04-25 08:37:40 +08:00
CGH0S7	6410c62e3e	Add fine-grained step timing and trim BH RHS overhead (cherry picked from commit `968522995b`)	2026-04-25 08:37:19 +08:00
CGH0S7	11977eb82f	Merge wave and mass extraction interpolation (cherry picked from commit `f3988ac8ca`)	2026-04-25 08:25:34 +08:00
CGH0S7	cce8a44fc4	Cache wave extraction angular kernels (cherry picked from commit `e4c25eb21f`)	2026-04-25 08:24:36 +08:00
CGH0S7	c589097618	Reuse mass integrand across detector radii (cherry picked from commit `4b10519876`)	2026-04-25 08:24:11 +08:00
CGH0S7	b713e5a9be	Batch constraint norm reductions (cherry picked from commit `3a58273501`)	2026-04-25 08:22:00 +08:00
CGH0S7	0396701572	Optimize constraint refresh after regrid (cherry picked from commit `5c65cea2f0`)	2026-04-25 08:18:51 +08:00
ianchb	bb20c9a876	fix ADM Constrant Violation Analysis	2026-04-15 19:19:16 +08:00
ianchb	8fe60ea703	Add zero matter handling and interpolation for resident state in CUDA BSSN	2026-04-15 00:25:53 +08:00
ianchb	9ab7e7c7f9	Fuse phases 5 and 6 for Gamma_rhs computation and optimize phases 8 and 9 for efficiency	2026-04-14 23:23:04 +08:00
ianchb	f9119e8a2a	Add resident-GA mode switch and simplify sync logic	2026-04-14 21:09:27 +08:00
ianchb	726d743376	Fuse Ricci assembly and optimize trK/Aij gauge kernels	2026-04-14 19:20:12 +08:00
ianchb	af344bf1e5	Add Phase-10 Ricci kernels and batch launch flow	2026-04-14 19:00:22 +08:00
ianchb	7191fc0b96	Move resident sync comm buffers into StepAllocation pool	2026-04-13 21:04:44 +08:00
ianchb	b3ec244cf9	Add batched first/second derivative kernels for CUDA RHS	2026-04-13 20:51:08 +08:00
ianchb	e952ee8e91	Batch GA/BH subset sync with indexed GPU pack/unpack buffers	2026-04-13 20:40:09 +08:00
ianchb	c5d1268dd1	Batch patch-boundary copy and gate CPU BC in GPU substeps	2026-04-13 11:52:17 +08:00
ianchb	4bdfc90f22	Pass pointer tables as kernel args and skip redundant symbol uploads	2026-04-13 11:19:00 +08:00
ianchb	c49a4e00c9	Batch symbd_pack/lopsided/kodiss over all state variables	2026-04-13 11:02:55 +08:00
ianchb	1b3c0b80d2	Refactor CUDA step buffers to remove loop-time allocations	2026-04-13 10:33:03 +08:00
ianchb	636e35bfd8	Add direct CUDA resident-state sync path and profiling hooks	2026-04-13 00:57:05 +08:00
ianchb	7f2a391dd2	Cache matter fields in StepContext across RK4 substeps	2026-04-12 22:19:45 +08:00
ianchb	4fa12a2009	Integrate CUDA support into RK4 substep execution	2026-04-12 22:11:44 +08:00
ianchb	86a683de26	Replace legacy ABEGPU stack with ABE_CUDA backend	2026-04-12 21:19:14 +08:00
ianchb	aaf7bf0a26	Merge remote-tracking branch 'origin/main'	2026-04-12 20:55:42 +08:00
ianchb	9c44d1c885	fix(bssn_rhs)	2026-03-03 16:00:45 +08:00
ianchb	4b9de28feb	将 Restrict/Prolong 链路里的 coarse-level Sync_cached 改为可选（默认跳过） OutBdLow2Hi_cached 读的是 coarse owned 区域（非 coarse ghost/buffer）回退旧行为：编译时定义 RP_SYNC_COARSE_AFTER_RESTRICT=1	2026-03-03 14:25:27 +08:00
ianchb	4eb5dc4ddb	删除重复的一次 chi 一阶导计算	2026-03-03 14:23:56 +08:00