完善mps的vidal机制,多节点并行;补充tn搜索时dask集群搜索的方式

2026-05-12 15:44:19 +08:00
parent aa122964b4
commit 72f95599bb
32 changed files with 3529 additions and 320 deletions
@@ -5,11 +5,6 @@ __pycache__/
 data/
 # C extensions
 *.so
-bak/
-path/
-profiles/
-vtune_expval/
-perf*
 # Distribution / packaging
 .Python
 build/
@@ -164,3 +159,13 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 .devenv
+
+
+# yx
+bak/
+path/
+profiles/
+vtune_expval/
+perf*
+experiments/
+references/
@@ -12,7 +12,7 @@ Tensor Network Types:
 Tensor Network contractions to:

 - dense vectors
- expecation values of given Pauli string
+- expectation values of given Pauli strings or Pauli-sum observables

 The supported HPC configurations are:

@@ -26,6 +26,18 @@ Currently, the supported tensor network libraries are:
 - [cuQuantum](https://github.com/NVIDIA/cuQuantum), an NVIDIA SDK of optimized libraries and tools for accelerating quantum computing workflows.
 - [quimb](https://quimb.readthedocs.io/en/latest/), an easy but fast python library for ‘quantum information many-body’ calculations, focusing primarily on tensor networks.

+## CPU expectation benchmarks
+
+The current CPU expectation entrypoint is:
+
+```sh
+python -u benchmark_cpu_expectation.py --ansatz mps --nqubits 40 --nlayers 10 --bond 2048 --circuits brickwall_cnot --observables ring_xz
+```
+
+Use `--ansatz tn` for the generic TN path and `--mpi` under `mpiexec` for MPI runs.
+Reusable circuit and observable builders live in `src/qibotn/benchmark_cases.py`; execution logic lives in `src/qibotn/expectation_runner.py`.
+For Vidal/MPS 1D-chain scale tests, use `run_vidal_mps_cases.sh`.
+
 ## Installation

 To get started:
@@ -0,0 +1,145 @@
+"""CLI for CPU TN/MPS expectation benchmarks."""
+
+from __future__ import annotations
+
+import argparse
+
+from qibotn.benchmark_cases import (
+    CIRCUITS,
+    OBSERVABLES,
+    build_circuit,
+    observable_terms,
+    parse_names,
+    terms_to_dict,
+)
+from qibotn.expectation_runner import (
+    ExpectationConfig,
+    exact_for_observable,
+    run_cpu_expectation,
+)
+
+
+def build_parallel_opts(args):
+    slicing_opts = {}
+    if args.tn_target_slices is not None:
+        slicing_opts["target_slices"] = args.tn_target_slices
+    if args.tn_target_size is not None:
+        slicing_opts["target_size"] = args.tn_target_size
+
+    opts = {
+        "slicing_opts": slicing_opts or None,
+        "search_workers": args.tn_search_workers or args.torch_threads,
+        "max_repeats": args.tn_search_repeats,
+        "max_time": args.tn_search_time,
+    }
+    if args.tn_search_backend is not None:
+        opts["search_backend"] = args.tn_search_backend
+    if args.dask_address is not None:
+        opts["dask_address"] = args.dask_address
+    return opts
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=40)
+    parser.add_argument("--nlayers", type=int, default=30)
+    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=1024)
+    parser.add_argument("--cut-ratio", type=float, default=1e-12)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--torch-threads", type=int, default=8)
+    parser.add_argument("--ansatz", choices=("tn", "mps"), default=None)
+    parser.add_argument("--mps", action="store_true")
+    parser.add_argument("--mpi", action="store_true")
+    parser.add_argument("--exact", action="store_true")
+    parser.add_argument("--exact-max-qubits", type=int, default=24)
+    parser.add_argument("--circuits", nargs="+", default=["brickwall_cnot"])
+    parser.add_argument("--observables", nargs="+", default=["ring_xz"])
+    parser.add_argument("--pauli-pattern")
+    parser.add_argument("--tn-target-slices", type=int)
+    parser.add_argument("--tn-target-size", type=int)
+    parser.add_argument("--tn-search-workers", type=int)
+    parser.add_argument("--tn-search-repeats", type=int, default=128)
+    parser.add_argument("--tn-search-time", type=float, default=60.0)
+    parser.add_argument(
+        "--tn-search-backend",
+        choices=("processpool", "dask"),
+        help="Path-search backend. In MPI mode, dask search runs only on rank 0 and broadcasts the tree.",
+    )
+    parser.add_argument(
+        "--dask-address",
+        help="Dask scheduler address, for example tcp://host:8786. If omitted with dask search, a local cluster is created.",
+    )
+    args = parser.parse_args()
+
+    ansatz = "mps" if args.mps else (args.ansatz or "tn")
+    circuits = parse_names(args.circuits, CIRCUITS, "circuits")
+    observables = [] if args.pauli_pattern else parse_names(
+        args.observables, OBSERVABLES, "observables"
+    )
+
+    rank = 0
+    if args.mpi:
+        from mpi4py import MPI
+
+        rank = MPI.COMM_WORLD.Get_rank()
+
+    config = ExpectationConfig(
+        ansatz=ansatz,
+        mpi=args.mpi,
+        bond=args.bond,
+        cut_ratio=args.cut_ratio,
+        tensor_module="torch",
+        torch_threads=args.torch_threads,
+        parallel_opts=build_parallel_opts(args),
+    )
+
+    if rank == 0:
+        mode = "MPI" if args.mpi else "serial"
+        print(
+            f"backend=cpu ansatz={ansatz.upper()} mode={mode} "
+            f"nqubits={args.nqubits} nlayers={args.nlayers} "
+            f"bond={args.bond} cut_ratio={args.cut_ratio:g} seed={args.seed} "
+            f"torch_threads={args.torch_threads} "
+            f"tn_search_backend={args.tn_search_backend or 'processpool'}"
+        )
+        print("circuit observable exact value abs_error rel_error seconds")
+
+    for circuit_kind in circuits:
+        circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed)
+        named_observables = (
+            [(f"pattern:{args.pauli_pattern}", {"pauli_string_pattern": args.pauli_pattern})]
+            if args.pauli_pattern
+            else [
+                (obs_kind, terms_to_dict(observable_terms(obs_kind, args.nqubits)))
+                for obs_kind in observables
+            ]
+        )
+
+        for obs_name, observable in named_observables:
+            exact = None
+            if args.exact and rank == 0:
+                if args.nqubits > args.exact_max_qubits:
+                    raise ValueError(
+                        f"--exact is limited to {args.exact_max_qubits} qubits by default."
+                    )
+                exact = exact_for_observable(circuit, observable, args.nqubits)
+
+            result = run_cpu_expectation(circuit, observable, config)
+            if args.mpi and result.rank != 0:
+                continue
+
+            abs_error = float("nan") if exact is None else abs(result.value - exact)
+            rel_error = (
+                float("nan")
+                if exact is None
+                else abs_error / max(abs(exact), 1e-15)
+            )
+            exact_text = "nan" if exact is None else f"{exact:.16e}"
+            print(
+                f"{circuit_kind} {obs_name} {exact_text} {result.value:.16e} "
+                f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}"
+            )
+
+
+if __name__ == "__main__":
+    main()
@@ -31,11 +31,13 @@ cuquantum-python-cu12 = { version = "^25.9.1", optional = true }
 qmatchatea = { version = "^1.4.3", optional = true }
 qiskit = { version = "^1.4.0", optional = true }
 qtealeaves = { version = "^1.5.20", optional = true }
+distributed = { version = ">=2024", optional = true }


 [tool.poetry.extras]
 cuda = ["cupy-cuda12x", "cuda-toolkit", "nvidia-nccl-cu12", "cuquantum-python-cu12", "mpi4py"]
 qmatchatea = ["qmatchatea"]
+dask = ["distributed"]

 [tool.poetry.group.docs]
 optional = true
@@ -0,0 +1,134 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Focused Vidal/MPS expectation test cases for 1D chain circuits.
+#
+# These cases intentionally avoid qmatchatea and generic TN paths.  They target
+# the current supported scope: one-qubit gates, adjacent two-qubit gates, and
+# Pauli-sum expectation values on a 1D chain.
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$ROOT_DIR"
+
+PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
+MPIEXEC="${MPIEXEC:-mpiexec}"
+HOSTFILE="${HOSTFILE:-hostfile}"
+
+THREADS="${THREADS:-32}"
+MPI_RANKS="${MPI_RANKS:-16}"
+MPI_THREADS="${MPI_THREADS:-12}"
+
+export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
+export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
+
+run() {
+  echo
+  echo "--------------------------------------------------------------------------------"
+  echo "$*"
+  echo "--------------------------------------------------------------------------------"
+  "$@"
+}
+
+case "${1:-help}" in
+  smoke)
+    # Short correctness-oriented run.  Useful before starting long jobs.
+    run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
+      --mps \
+      --nqubits 40 \
+      --nlayers 10 \
+      --bond 2048 \
+      --torch-threads "$THREADS" \
+      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
+      --observables ring_xz open_zz range2_xx long_z_string
+    ;;
+
+  convergence)
+    # Same circuit/observable, increasing bond.  Check value convergence.
+    for bond in ${BONDS:-4096 16384 65536}; do
+      run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
+        --mps \
+        --nqubits "${NQ:-80}" \
+        --nlayers "${LAYERS:-16}" \
+        --bond "$bond" \
+        --torch-threads "$THREADS" \
+        --circuits "${CIRCUIT:-brickwall_cnot}" \
+        --observables "${OBSERVABLE:-ring_xz}"
+    done
+    ;;
+
+  single-long)
+    # Single long Vidal run.  On node-3, a similar n=40,l=30,bond=2048 case
+    # took about 9 minutes for one expectation.  This one is meant to be longer.
+    run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
+      --mps \
+      --nqubits "${NQ:-80}" \
+      --nlayers "${LAYERS:-16}" \
+      --bond "${BOND:-65536}" \
+      --torch-threads "$THREADS" \
+      --circuits "${CIRCUIT:-brickwall_cnot}" \
+      --observables "${OBSERVABLE:-ring_xz}"
+    ;;
+
+  suite-long)
+    # Application-style multi-circuit, multi-observable MPS run.
+    # This is intentionally multi-term and should run much longer than single-long.
+    run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
+      --mps \
+      --nqubits "${NQ:-80}" \
+      --nlayers "${LAYERS:-16}" \
+      --bond "${BOND:-65536}" \
+      --torch-threads "$THREADS" \
+      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
+      --observables ring_xz open_zz mixed_local range2_xx long_z_string
+    ;;
+
+  mpi-long)
+    # Multi-node Vidal segmented MPS run.  Uses HOSTFILE.
+    run "$MPIEXEC" -hostfile "$HOSTFILE" -n "$MPI_RANKS" "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
+      --mpi --mps \
+      --nqubits "${NQ:-80}" \
+      --nlayers "${LAYERS:-16}" \
+      --bond "${BOND:-65536}" \
+      --torch-threads "$MPI_THREADS" \
+      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
+      --observables ring_xz open_zz mixed_local range2_xx long_z_string
+    ;;
+
+  stress)
+    # Heavier entanglement.  Start only after single-long is stable.
+    run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
+      --mps \
+      --nqubits "${NQ:-80}" \
+      --nlayers "${LAYERS:-18}" \
+      --bond "${BOND:-262144}" \
+      --torch-threads "${THREADS:-48}" \
+      --circuits "${CIRCUIT:-rxx_rzz}" \
+      --observables ring_xz open_zz range2_xx
+    ;;
+
+  help|*)
+    cat <<'EOF'
+Usage: ./run_vidal_mps_cases.sh [smoke|convergence|single-long|suite-long|mpi-long|stress]
+
+Common overrides:
+  PYTHON_BIN=.venv/bin/python
+  THREADS=32
+  OMP_NUM_THREADS=1 MKL_NUM_THREADS=1
+
+Single-node scale overrides:
+  NQ=80 LAYERS=16 BOND=65536
+  CIRCUIT=brickwall_cnot
+  OBSERVABLE=ring_xz
+  BONDS="4096 16384 65536"   # for convergence mode
+
+Multi-node overrides:
+  HOSTFILE=hostfile
+  MPI_RANKS=16 MPI_THREADS=12
+
+Recommended first runs:
+  ./run_vidal_mps_cases.sh smoke
+  ./run_vidal_mps_cases.sh convergence
+  ./run_vidal_mps_cases.sh single-long
+EOF
+    ;;
+esac
@@ -3,9 +3,10 @@ from typing import Union
 from qibo.config import raise_error

 from qibotn.backends.abstract import QibotnBackend
+from qibotn.backends.cpu import CpuTensorNet
 from qibotn.backends.cutensornet import CuTensorNet  # pylint: disable=E0401

-PLATFORMS = ("cutensornet", "quimb", "qmatchatea")
+PLATFORMS = ("cutensornet", "cpu", "quimb", "qmatchatea", "vidal")


 class MetaBackend:
@@ -24,6 +25,8 @@ class MetaBackend:

        if platform == "cutensornet":  # pragma: no cover
            return CuTensorNet(runcard)
+        elif platform == "cpu":
+            return CpuTensorNet(runcard)
        elif platform == "quimb":  # pragma: no cover
            import qibotn.backends.quimb as qmb

@@ -36,6 +39,10 @@ class MetaBackend:
            from qibotn.backends.qmatchatea import QMatchaTeaBackend

            return QMatchaTeaBackend()
+        elif platform == "vidal":
+            from qibotn.backends.vidal import VidalBackend
+
+            return VidalBackend()
        else:
            raise_error(
                NotImplementedError,
@@ -0,0 +1,415 @@
+"""CPU tensor-network backend with cutensornet-style runcard support."""
+
+from __future__ import annotations
+
+import os
+
+import numpy as np
+from qibo import hamiltonians
+from qibo.backends import NumpyBackend
+from qibo.config import raise_error
+
+from qibotn.backends.abstract import QibotnBackend
+from qibotn.backends.vidal import _symbolic_hamiltonian_to_pauli_terms, _unsupported_reason
+from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
+from qibotn.backends.vidal_tebd import VidalTEBDExecutor
+from qibotn.observables import check_observable
+from qibotn.result import TensorNetworkResult
+
+
+def _as_bool_or_dict(value, name):
+    if isinstance(value, (bool, dict)):
+        return value
+    raise TypeError(f"{name} has an unexpected type")
+
+
+def _parse_cpu_list(text):
+    cpus = set()
+    for item in text.split(","):
+        item = item.strip()
+        if not item:
+            continue
+        if "-" in item:
+            start, stop = item.split("-", 1)
+            cpus.update(range(int(start), int(stop) + 1))
+        else:
+            cpus.add(int(item))
+    return cpus
+
+
+class CpuTensorNet(QibotnBackend, NumpyBackend):
+    """CPU replacement for the cutensornet runcard execution surface.
+
+    The backend preserves the high-level runcard knobs used by the GPU backend:
+    ``MPI_enabled``, ``MPS_enabled`` and ``expectation_enabled``.  Generic TN
+    work is delegated to quimb on CPU; MPS expectation uses the Vidal fast path
+    when the circuit is nearest-neighbor and falls back to quimb otherwise.
+    """
+
+    def __init__(self, runcard=None):
+        super().__init__()
+        self.name = "qibotn"
+        self.platform = "cpu"
+        self.precision = "double"
+        self.configure_tn_simulation(runcard)
+
+    def configure_tn_simulation(self, runcard=None):
+        runcard = {} if runcard is None else runcard
+        self.rank = 0
+        self.MPI_enabled = bool(runcard.get("MPI_enabled", False))
+        self.NCCL_enabled = bool(runcard.get("NCCL_enabled", False))
+        if self.NCCL_enabled:
+            raise_error(NotImplementedError, "NCCL is only available for GPU backends.")
+
+        expectation = runcard.get("expectation_enabled", False)
+        if expectation is True:
+            self.expectation_enabled = True
+            self.observable = None
+        elif expectation is False:
+            self.expectation_enabled = False
+            self.observable = None
+        elif isinstance(expectation, (dict, hamiltonians.SymbolicHamiltonian)):
+            self.expectation_enabled = True
+            self.observable = expectation
+        else:
+            raise TypeError("expectation_enabled has an unexpected type")
+
+        mps = _as_bool_or_dict(runcard.get("MPS_enabled", False), "MPS_enabled")
+        self.MPS_enabled = bool(mps)
+        self.mps_options = mps if isinstance(mps, dict) else {}
+
+        self.max_bond_dimension = runcard.get(
+            "max_bond_dimension",
+            self.mps_options.get("max_bond_dimension", 512),
+        )
+        self.cut_ratio = runcard.get(
+            "cut_ratio",
+            self.mps_options.get(
+                "cut_ratio",
+                self.mps_options.get("svd_method", {}).get("abs_cutoff", 1e-12),
+            ),
+        )
+        self.tensor_module = runcard.get("tensor_module", "torch")
+        self.torch_threads = runcard.get("torch_threads", None)
+        self.quimb_backend = runcard.get("quimb_backend", "numpy")
+        self.contraction_optimizer = runcard.get("contraction_optimizer", "auto-hq")
+        self.parallel_opts = runcard.get("parallel_opts", {})
+        self.parallel_stats = []
+
+    def execute_circuit(
+        self,
+        circuit,
+        initial_state=None,
+        nshots=None,
+        prob_type=None,
+        return_array=False,
+        **prob_kwargs,
+    ):
+        if initial_state is not None:
+            raise_error(NotImplementedError, "QiboTN CPU backend does not support initial state.")
+
+        if self.torch_threads is not None and self.tensor_module == "torch":
+            import torch
+
+            torch.set_num_threads(self.torch_threads)
+
+        if self.expectation_enabled:
+            value = self.expectation(circuit, self.observable)
+            if self.MPI_enabled and self.rank > 0:
+                return np.asarray([0], dtype=np.int64)
+            return np.asarray([value], dtype=np.float64)
+
+        backend = self._quimb_backend()
+        backend.configure_tn_simulation(
+            ansatz="mps" if self.MPS_enabled else None,
+            max_bond_dimension=self.max_bond_dimension if self.MPS_enabled else None,
+            svd_cutoff=self.cut_ratio,
+        )
+        return backend.execute_circuit(
+            circuit=circuit,
+            nshots=nshots,
+            return_array=return_array,
+        )
+
+    def expectation(self, circuit, observable=None, preprocess=True, compile_circuit=None):
+        observable = check_observable(observable, circuit.nqubits)
+
+        if self.MPS_enabled:
+            reason = _unsupported_reason(circuit)
+            if reason is None:
+                return self._vidal_expectation(circuit, observable)
+
+        backend = self._quimb_backend()
+        backend.configure_tn_simulation(
+            ansatz="mps" if self.MPS_enabled else None,
+            max_bond_dimension=self.max_bond_dimension if self.MPS_enabled else None,
+            svd_cutoff=self.cut_ratio,
+        )
+        if self.MPI_enabled:
+            return self._quimb_expectation_mpi(backend, circuit, observable)
+        return self._quimb_expectation_processpool(backend, circuit, observable)
+
+    def _vidal_expectation(self, circuit, observable):
+        terms = _symbolic_hamiltonian_to_pauli_terms(observable)
+        if self.MPI_enabled:
+            from mpi4py import MPI
+
+            comm = MPI.COMM_WORLD
+            self.rank = comm.Get_rank()
+            executor = SegmentVidalMPIExecutor(
+                nqubits=circuit.nqubits,
+                max_bond=self.max_bond_dimension,
+                cut_ratio=self.cut_ratio,
+                tensor_module=self.tensor_module,
+                comm=comm,
+            )
+            executor.run_circuit(circuit)
+            value = executor.expectation_pauli_sum_root(terms)
+            return np.nan if self.rank != 0 else float(np.real(value))
+
+        executor = VidalTEBDExecutor(
+            nqubits=circuit.nqubits,
+            max_bond=self.max_bond_dimension,
+            cut_ratio=self.cut_ratio,
+            tensor_module=self.tensor_module,
+        )
+        executor.run_circuit(circuit)
+        return float(np.real(executor.expectation_pauli_sum(terms)))
+
+    def _quimb_backend(self):
+        import qibotn.backends.quimb as qmb
+
+        return qmb.BACKENDS[self.quimb_backend](
+            quimb_backend=self.quimb_backend,
+            contraction_optimizer=self.contraction_optimizer,
+        )
+
+    def _bind_rank_to_numa_domain(self, rank):
+        domain = rank % 2
+        cpulist = f"/sys/devices/system/node/node{domain}/cpulist"
+        try:
+            cpus = _parse_cpu_list(open(cpulist, encoding="utf-8").read().strip())
+            if cpus:
+                os.sched_setaffinity(0, cpus)
+        except (FileNotFoundError, OSError):
+            pass
+
+        try:
+            import ctypes
+
+            libnuma = ctypes.CDLL("libnuma.so.1")
+            if libnuma.numa_available() >= 0:
+                libnuma.numa_run_on_node(ctypes.c_int(domain))
+                libnuma.numa_set_preferred(ctypes.c_int(domain))
+        except Exception:
+            pass
+
+        self.numa_domain = domain
+
+    def _default_search_workers(self, nranks=1):
+        if self.torch_threads:
+            return max(1, int(self.torch_threads))
+        return max(1, (os.cpu_count() or 1) // max(1, nranks))
+
+    def _quimb_expectation_processpool(self, backend, circuit, observable):
+        return self._quimb_expectation_search(
+            backend,
+            circuit,
+            observable,
+            method="processpool",
+            comm=None,
+        )
+
+    def _quimb_expectation_mpi(self, backend, circuit, observable):
+        from mpi4py import MPI
+
+        comm = MPI.COMM_WORLD
+        self.rank = comm.Get_rank()
+        self._bind_rank_to_numa_domain(self.rank)
+
+        return self._quimb_expectation_search(
+            backend,
+            circuit,
+            observable,
+            method="mpi",
+            comm=comm,
+        )
+
+    def _quimb_expectation_search(self, backend, circuit, observable, method, comm=None):
+        rank = comm.Get_rank() if comm is not None else 0
+        size = comm.Get_size() if comm is not None else 1
+        self.rank = rank
+
+        from qibotn.observables import extract_gates_and_qubits
+        from qibotn.parallel import (
+            contraction_tree_costs,
+            parallel_contract,
+            parallel_path_search,
+        )
+        from qibotn.backends.quimb import (
+            PAULI_DENSE_MAX_QUBITS,
+            _pauli_term_to_dense_operator,
+            pauli_product_expectation_tn,
+        )
+
+        opts = dict(self.parallel_opts)
+        user_slicing_opts = opts.get("slicing_opts")
+        search_workers = opts.get("search_workers", self._default_search_workers(size))
+        search_repeats = opts.get("max_repeats", 128)
+        search_time = opts.get("max_time", 60)
+        search_backend = opts.get("search_backend")
+        dask_address = opts.get("dask_address")
+
+        qc = backend._qibo_circuit_to_quimb(
+            circuit,
+            quimb_circuit_type=backend.circuit_ansatz,
+            gate_opts={
+                "max_bond": self.max_bond_dimension,
+                "cutoff": self.cut_ratio,
+            },
+        )
+
+        total_value = 0.0 + 0.0j
+        terms = extract_gates_and_qubits(observable)
+        for coeff, factors in terms:
+            if not factors:
+                if self.rank == 0:
+                    total_value += coeff
+                continue
+
+            if len(factors) > PAULI_DENSE_MAX_QUBITS:
+                tn = pauli_product_expectation_tn(
+                    qc,
+                    factors,
+                    simplify_sequence="ADCRS",
+                    simplify_atol=1e-12,
+                )
+            else:
+                op, where = _pauli_term_to_dense_operator(factors)
+                tn = qc.local_expectation(
+                    op,
+                    where,
+                    rehearse="tn",
+                    simplify_sequence="ADCRS",
+                    simplify_atol=1e-12,
+                )
+            slicing_opts = self._mpi_slicing_opts(
+                user_slicing_opts,
+            )
+
+            tree = parallel_path_search(
+                tn,
+                tn.outer_inds(),
+                method="dask" if method != "mpi" and search_backend == "dask" else method,
+                total_repeats=search_repeats,
+                max_time=search_time,
+                n_workers=search_workers,
+                slicing_opts=slicing_opts,
+                trial_timeout=opts.get("trial_timeout"),
+                search_backend=search_backend,
+                dask_address=dask_address,
+            )
+            if tree is None:
+                raise RuntimeError("Failed to find a contraction tree for CPU TN MPI.")
+
+            if int(getattr(tree, "multiplicity", 1)) <= 1:
+                if self.rank == 0:
+                    value = self._contract_term_unsliced(tn, tree, backend)
+                    self.parallel_stats.append(
+                        {
+                            "term_factors": tuple(factors),
+                            "path_cost": contraction_tree_costs(tree),
+                            "tree_slices": 1,
+                            "slice_assignment": "root",
+                            "rank_slices": [1] + [0] * (size - 1),
+                            "search_workers": search_workers,
+                            "search_repeats": search_repeats,
+                            "search_time": search_time,
+                            "search_backend": search_backend or method,
+                            "dask_address": dask_address,
+                            "numa_domain": getattr(self, "numa_domain", None),
+                        }
+                    )
+                    total_value += coeff * complex(value)
+                continue
+
+            if comm is None:
+                value = self._contract_term_unsliced(tn, tree, backend)
+                self.parallel_stats.append(
+                    {
+                        "term_factors": tuple(factors),
+                        "path_cost": contraction_tree_costs(tree),
+                        "tree_slices": int(getattr(tree, "multiplicity", 1)),
+                        "slice_assignment": "local",
+                        "rank_slices": [int(getattr(tree, "multiplicity", 1))],
+                        "search_workers": search_workers,
+                        "search_repeats": search_repeats,
+                        "search_time": search_time,
+                        "search_backend": search_backend or method,
+                        "dask_address": dask_address,
+                        "numa_domain": getattr(self, "numa_domain", None),
+                    }
+                )
+                total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
+                continue
+
+            arrays = self._term_arrays(tn, backend)
+            value, stats = parallel_contract(
+                tree,
+                arrays,
+                method="mpi",
+                comm=comm,
+                return_stats=True,
+            )
+            gathered_stats = comm.gather(stats, root=0)
+            if rank == 0:
+                self.parallel_stats.append(
+                    {
+                        "term_factors": tuple(factors),
+                        "path_cost": contraction_tree_costs(tree),
+                        "tree_slices": stats.nslices,
+                        "slice_assignment": stats.assignment,
+                        "rank_slices": [
+                            item.local_slices for item in gathered_stats
+                        ],
+                        "search_workers": search_workers,
+                        "search_repeats": search_repeats,
+                        "search_time": search_time,
+                        "search_backend": search_backend or method,
+                        "dask_address": dask_address,
+                        "numa_domain": getattr(self, "numa_domain", None),
+                    }
+                )
+                total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
+
+        return np.nan if rank != 0 else float(np.real(total_value))
+
+    def _contract_term_unsliced(self, tn, tree, backend):
+        if backend.backend == "torch":
+            import torch
+            from qibotn.backends.quimb import _torch_cpu_array
+
+            for tensor in tn.tensors:
+                tensor._data = _torch_cpu_array(tensor._data, dtype=torch.complex128)
+            return tn.contract(all, output_inds=(), optimize=tree, backend="torch")
+
+        return tn.contract(
+            all,
+            output_inds=(),
+            optimize=tree,
+            backend=backend.backend,
+        )
+
+    def _mpi_slicing_opts(self, user_slicing_opts):
+        return None if user_slicing_opts is None else dict(user_slicing_opts)
+
+    def _term_arrays(self, tn, backend):
+        if backend.backend == "torch":
+            import torch
+            from qibotn.backends.quimb import _torch_cpu_array
+
+            return [
+                _torch_cpu_array(array, dtype=torch.complex128)
+                for array in tn.arrays
+            ]
+        return [backend.engine.asarray(array) for array in tn.arrays]
@@ -37,6 +37,8 @@ GATE_MAP = {
    "measure": "measure",
 }

+PAULI_DENSE_MAX_QUBITS = 8
+

 def _torch_cpu_array(data, dtype=None):
    """Convert array-like data to a contiguous CPU torch tensor."""
@@ -68,6 +70,81 @@ def _arrays_to_backend(arrays, backend, engine):
    return [engine.asarray(array) for array in arrays]


+def _pauli_term_to_dense_operator(factors):
+    op = None
+    where = []
+    for qubit, gate_name in factors:
+        pauli = qu.pauli(gate_name.lower())
+        op = pauli if op is None else op & pauli
+        where.append(qubit)
+    return op, tuple(where)
+
+
+def pauli_product_expectation_tn(
+    quimb_circuit,
+    factors,
+    simplify_sequence="ADCRS",
+    simplify_atol=1e-12,
+    simplify_equalize_norms=True,
+):
+    """Build the scalar TN for ``<psi|P|psi>`` without dense Pauli strings."""
+    import numpy as np
+
+    op_by_site = {
+        int(qubit): qu.pauli(str(gate_name).lower())
+        for qubit, gate_name in factors
+        if str(gate_name).upper() != "I"
+    }
+    ket = quimb_circuit.get_psi_simplified(
+        seq=simplify_sequence,
+        atol=simplify_atol,
+        equalize_norms=simplify_equalize_norms,
+    )
+    bra = ket.conj().reindex(
+        {
+            quimb_circuit.ket_site_ind(qubit): quimb_circuit.bra_site_ind(qubit)
+            for qubit in range(quimb_circuit.N)
+        }
+    )
+
+    tn = bra | ket
+    identity = np.eye(2, dtype=complex)
+    for qubit in range(quimb_circuit.N):
+        data = op_by_site.get(qubit, identity)
+        tn |= qtn.Tensor(
+            data=data,
+            inds=(
+                quimb_circuit.bra_site_ind(qubit),
+                quimb_circuit.ket_site_ind(qubit),
+            ),
+        )
+
+    tn.full_simplify_(
+        output_inds=(),
+        seq=simplify_sequence,
+        atol=simplify_atol,
+        equalize_norms=simplify_equalize_norms,
+    )
+    return tn
+
+
+def pauli_product_expectation(
+    quimb_circuit,
+    factors,
+    backend,
+    optimize,
+    simplify_sequence="ADCRS",
+    simplify_atol=1e-12,
+):
+    tn = pauli_product_expectation_tn(
+        quimb_circuit,
+        factors,
+        simplify_sequence=simplify_sequence,
+        simplify_atol=simplify_atol,
+    )
+    return tn.contract(all, output_inds=(), optimize=optimize, backend=backend)
+
+
 def __init__(self, quimb_backend="numpy", contraction_optimizer="auto-hq"):
    super(self.__class__, self).__init__()

@@ -340,6 +417,9 @@ def _qibo_circuit_to_quimb(
            circ.apply_gate("CNOT", c, t)
            continue
        if quimb_gate_name is None:
+            if hasattr(gate, "matrix"):
+                circ.apply_gate_raw(gate.matrix(), getattr(gate, "qubits", ()))
+                continue
            raise_error(ValueError, f"Gate {gate_name} not supported in Quimb backend.")

        params = getattr(gate, "parameters", ())
@@ -426,20 +506,24 @@ def expectation(self, circuit, observable, parallel=None, parallel_opts=None):

        exp_val = 0.0
        for coeff, factors in all_terms:
-            op = None
-            where = []
-            for qubit, gate_name in factors:
-                p = qu.pauli(gate_name.lower())
-                op = p if op is None else op & p
-                where.append(qubit)
-
-            val = qc.local_expectation(
-                op, tuple(where),
-                backend=self.backend,
-                optimize=self.contractions_optimizer,
-                simplify_sequence="ADCRS",
-                simplify_atol=1e-12,
-            )
+            if len(factors) > PAULI_DENSE_MAX_QUBITS:
+                val = pauli_product_expectation(
+                    qc,
+                    factors,
+                    backend=self.backend,
+                    optimize=self.contractions_optimizer,
+                    simplify_sequence="ADCRS",
+                    simplify_atol=1e-12,
+                )
+            else:
+                op, where = _pauli_term_to_dense_operator(factors)
+                val = qc.local_expectation(
+                    op, where,
+                    backend=self.backend,
+                    optimize=self.contractions_optimizer,
+                    simplify_sequence="ADCRS",
+                    simplify_atol=1e-12,
+                )
            exp_val += coeff * val

        return self.real(exp_val)
@@ -487,14 +571,11 @@ def _expectation_parallel(self, circuit, observable, method, opts):

    my_exp = 0.0
    for coeff, factors in my_terms:
-        op = None
-        where = []
-        for qubit, gate_name in factors:
-            p = qu.pauli(gate_name.lower())
-            op = p if op is None else op & p
-            where.append(qubit)
-
-        tn = qc.local_expectation(op, tuple(where), rehearse='tn')
+        if len(factors) > PAULI_DENSE_MAX_QUBITS:
+            tn = pauli_product_expectation_tn(qc, factors)
+        else:
+            op, where = _pauli_term_to_dense_operator(factors)
+            tn = qc.local_expectation(op, where, rehearse='tn')

        tree = parallel_path_search(
            tn, tn.outer_inds(),
@@ -0,0 +1,250 @@
+"""Vidal/TEBD fast-path backend with qmatchatea fallback.
+
+This backend targets MPS-friendly one-dimensional circuits: one-qubit gates and
+adjacent two-qubit gates, measured with Pauli-sum expectation values. Unsupported
+features fall back to the qmatchatea backend so the public behavior remains
+usable while the fast path is expanded.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+
+import numpy as np
+from qibo.backends import NumpyBackend
+
+from qibotn.backends.abstract import QibotnBackend
+from qibotn.backends.qmatchatea import QMatchaTeaBackend
+from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
+from qibotn.backends.vidal_tebd import VidalTEBDExecutor, _gate_sites
+from qibotn.observables import check_observable
+
+
+def _symbolic_hamiltonian_to_pauli_terms(hamiltonian):
+    terms = []
+    factor_pattern = re.compile(r"([^\d]+)(\d+)")
+    for term in hamiltonian.terms:
+        ops = []
+        for factor in term.factors:
+            match = factor_pattern.match(str(factor))
+            if match is None:
+                raise ValueError(f"Unsupported observable factor {factor!r}.")
+            name = match.group(1).upper()
+            if name not in ("I", "X", "Y", "Z"):
+                raise ValueError(f"Unsupported observable operator {name!r}.")
+            if name != "I":
+                ops.append((name, int(match.group(2))))
+        terms.append((complex(term.coefficient), tuple(ops)))
+    return terms
+
+
+def _unsupported_reason(circuit):
+    for gate in circuit.queue:
+        name = getattr(gate, "name", gate.__class__.__name__)
+        sites = _gate_sites(gate)
+        if not sites:
+            return f"gate {name} has no target qubits"
+        if len(sites) > 2:
+            return f"gate {name} acts on {len(sites)} qubits"
+        if len(sites) == 2 and abs(sites[0] - sites[1]) != 1:
+            return f"gate {name} is non-adjacent on qubits {sites}"
+        if not hasattr(gate, "matrix"):
+            return f"gate {name} does not expose a matrix"
+    return None
+
+
+def _can_route_non_adjacent(circuit):
+    """True if the circuit's only unsupported feature is non-adjacent 2Q gates.
+
+    SWAP routing can fix non-adjacent gates at compile time.  Multi-qubit
+    gates and matrix-less gates are truly unsupported.
+    """
+    for gate in circuit.queue:
+        sites = _gate_sites(gate)
+        if not sites:
+            return False
+        if len(sites) > 2:
+            return False
+        if not hasattr(gate, "matrix"):
+            return False
+    return True
+
+
+@dataclass
+class VidalBackend(QibotnBackend, NumpyBackend):
+    """QiboTN backend using Vidal/TEBD when possible.
+
+    The fast path supports:
+    - one-qubit gates with ``gate.matrix()``;
+    - adjacent two-qubit gates with ``gate.matrix()``;
+    - Qibo ``SymbolicHamiltonian`` / qibotn dict Pauli-sum expectation values;
+    - MPI chain segmentation through ``mpi_approach="CT"``.
+
+    Unsupported operations are delegated to qmatchatea.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.name = "qibotn"
+        self.platform = "vidal"
+        self.precision = "double"
+        self.last_truncation_error = 0.0
+        self.configure_tn_simulation()
+
+    def configure_tn_simulation(
+        self,
+        ansatz: str = "MPS",
+        max_bond_dimension: int = 10,
+        cut_ratio: float = 1e-9,
+        trunc_tracking_mode: str = "C",
+        svd_control: str = "E!",
+        ini_bond_dimension: int = 1,
+        tensor_module: str = "torch",
+        compile_circuit: bool = False,
+        cache_gate_tensors: bool = True,
+        track_memory: bool = False,
+        mpi_approach: str = "SR",
+        mpi_num_procs: int = 1,
+        mpi_where_barriers: int = -1,
+        mpi_isometrization: int = -1,
+        fallback: bool = True,
+    ):
+        self.ansatz = ansatz
+        self.max_bond_dimension = max_bond_dimension
+        self.cut_ratio = cut_ratio
+        self.trunc_tracking_mode = trunc_tracking_mode
+        self.svd_control = svd_control
+        self.ini_bond_dimension = ini_bond_dimension
+        self.tensor_module = tensor_module
+        self.compile_circuit = compile_circuit
+        self.cache_gate_tensors = cache_gate_tensors
+        self.track_memory = track_memory
+        self.mpi_approach = mpi_approach.upper()
+        self.mpi_num_procs = mpi_num_procs
+        self.mpi_where_barriers = mpi_where_barriers
+        self.mpi_isometrization = mpi_isometrization
+        self.fallback = fallback
+        self._fallback_backend = None
+
+    def _setup_backend_specifics(self):
+        return None
+
+    def _qmatchatea_fallback(self):
+        if self._fallback_backend is None:
+            backend = QMatchaTeaBackend()
+            backend.configure_tn_simulation(
+                ansatz=self.ansatz,
+                max_bond_dimension=self.max_bond_dimension,
+                cut_ratio=self.cut_ratio,
+                trunc_tracking_mode=self.trunc_tracking_mode,
+                svd_control=self.svd_control,
+                ini_bond_dimension=self.ini_bond_dimension,
+                tensor_module=self.tensor_module,
+                compile_circuit=self.compile_circuit,
+                cache_gate_tensors=self.cache_gate_tensors,
+                track_memory=self.track_memory,
+                mpi_approach=self.mpi_approach,
+                mpi_num_procs=self.mpi_num_procs,
+                mpi_where_barriers=self.mpi_where_barriers,
+                mpi_isometrization=self.mpi_isometrization,
+            )
+            self._fallback_backend = backend
+        return self._fallback_backend
+
+    def _fallback_or_raise(self, reason):
+        if not self.fallback:
+            raise NotImplementedError(reason)
+        return self._qmatchatea_fallback()
+
+    def _run_fast_executor(self, circuit, compile_circuit=True):
+        if self.mpi_approach == "CT":
+            from mpi4py import MPI
+
+            executor = SegmentVidalMPIExecutor(
+                nqubits=circuit.nqubits,
+                max_bond=self.max_bond_dimension,
+                cut_ratio=self.cut_ratio,
+                tensor_module=self.tensor_module,
+                comm=MPI.COMM_WORLD,
+            )
+        else:
+            executor = VidalTEBDExecutor(
+                nqubits=circuit.nqubits,
+                max_bond=self.max_bond_dimension,
+                cut_ratio=self.cut_ratio,
+                tensor_module=self.tensor_module,
+            )
+        executor.run_circuit(circuit, compile_circuit=compile_circuit)
+        return executor
+
+    def expectation(self, circuit, observable, preprocess=True, compile_circuit=None):
+        if self.ansatz.upper() != "MPS":
+            backend = self._fallback_or_raise("VidalBackend supports only MPS.")
+            return backend.expectation(circuit, observable, preprocess, compile_circuit)
+
+        if compile_circuit is None:
+            compile_circuit = self.compile_circuit
+
+        reason = _unsupported_reason(circuit)
+        if reason is not None:
+            # Non-adjacent gates can be routed at compile time
+            if compile_circuit and _can_route_non_adjacent(circuit):
+                pass  # proceed with Vidal + SWAP routing
+            else:
+                backend = self._fallback_or_raise(reason)
+                return backend.expectation(circuit, observable, preprocess, compile_circuit)
+
+        # preprocess=True requests qmatchatea-style transpilation to MPS topology.
+        # The fast path validates MPS-compatibility via _unsupported_reason above;
+        # if the circuit is already MPS-compatible, preprocessing is a no-op.
+        # If it isn't, _unsupported_reason already routed to fallback.
+        # So the fast path skips preprocessing — just warn once.
+        if preprocess:
+            import warnings
+
+            warnings.warn(
+                "VidalBackend fast path does not preprocess circuits. "
+                "If the circuit passes _unsupported_reason (all gates are "
+                "MPS-compatible), fast-path execution proceeds directly.",
+                stacklevel=2,
+            )
+
+        hamiltonian = check_observable(observable, circuit.nqubits)
+        try:
+            terms = _symbolic_hamiltonian_to_pauli_terms(hamiltonian)
+        except ValueError as exc:
+            backend = self._fallback_or_raise(str(exc))
+            return backend.expectation(circuit, observable, preprocess, compile_circuit)
+
+        executor = self._run_fast_executor(circuit, compile_circuit=compile_circuit)
+        self.last_truncation_error = float(executor.truncation_error)
+        if self.mpi_approach == "CT":
+            value = executor.expectation_pauli_sum_root(terms)
+            from qtealeaves.tooling.mpisupport import MPI
+
+            if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
+                return np.nan
+            return np.real(value)
+        return np.real(executor.expectation_pauli_sum(terms))
+
+    def execute_circuit(
+        self,
+        circuit,
+        initial_state=None,
+        nshots=None,
+        prob_type=None,
+        return_array=False,
+        **prob_kwargs,
+    ):
+        backend = self._fallback_or_raise(
+            "VidalBackend.execute_circuit is delegated to qmatchatea."
+        )
+        return backend.execute_circuit(
+            circuit,
+            initial_state=initial_state,
+            nshots=nshots,
+            prob_type=prob_type,
+            return_array=return_array,
+            **prob_kwargs,
+        )
@@ -0,0 +1,338 @@
+"""Segmented MPI Vidal/TEBD executor.
+
+Each rank owns a contiguous interval of sites. Gates fully inside an interval
+are applied locally. Only two-site gates crossing a rank boundary communicate
+the neighboring edge tensor and the resulting boundary update.
+"""
+
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass
+
+import numpy as np
+from mpi4py import MPI
+
+from qibotn.backends.vidal_tebd import (
+    _asarray,
+    _backend_module,
+    _build_theta_svd_matrix,
+    _disjoint_batches,
+    _fuse_one_site_blocks,
+    _gate_sites,
+    _is_two_qubit_batch,
+    _make_two_site_update,
+    _ones,
+    _route_non_adjacent_gates,
+    _svd,
+    _tensor_update_from_numpy,
+    _tensor_update_to_numpy,
+    _to_float,
+    _to_numpy,
+    _transpose,
+    VidalTEBDExecutor,
+)
+
+_EDGE_TAG = 1701
+_UPDATE_TAG = 1702
+
+
+def _partition_sites(nsites, nranks):
+    base = nsites // nranks
+    rem = nsites % nranks
+    starts = [0]
+    for rank in range(nranks):
+        starts.append(starts[-1] + base + int(rank < rem))
+    return starts
+
+
+@dataclass
+class SegmentVidalMPIExecutor:
+    nqubits: int
+    max_bond: int
+    comm: object
+    cut_ratio: float = 1e-12
+    tensor_module: str = "torch"
+
+    def __post_init__(self):
+        self.rank = self.comm.Get_rank()
+        self.size = self.comm.Get_size()
+        self.starts = _partition_sites(self.nqubits, self.size)
+        self.start = self.starts[self.rank]
+        self.end = self.starts[self.rank + 1]
+        if self.start == self.end:
+            raise ValueError("SegmentVidalMPIExecutor requires at least one site per rank.")
+
+        self.xp = _backend_module(self.tensor_module)
+        if self.xp is np:
+            self.dtype = np.complex128
+            self.device = None
+        else:
+            self.dtype = self.xp.complex128
+            self.device = self.xp.device("cpu")
+
+        self.gammas = {}
+        for site in range(self.start, self.end):
+            self.gammas[site] = _asarray(
+                self.xp, [[[1.0 + 0.0j], [0.0 + 0.0j]]], self.dtype
+            )
+
+        self.lambdas = {
+            bond: _ones(self.xp, 1, self.dtype, self.device)
+            for bond in range(self.start, self.end + 1)
+        }
+        self._accumulated_truncation_error = 0.0
+
+    @property
+    def truncation_error(self):
+        return self._accumulated_truncation_error
+
+    def owns_site(self, site):
+        return self.start <= site < self.end
+
+    def owner_of(self, site):
+        return int(np.searchsorted(self.starts, site, side="right") - 1)
+
+    def run_circuit(self, circuit, compile_circuit=True):
+        timings = {
+            "local_compute": 0.0,
+            "edge_exchange": 0.0,
+            "boundary_compute": 0.0,
+            "boundary_update": 0.0,
+            "one_site": 0.0,
+            "gather": 0.0,
+        }
+
+        gates = circuit.queue
+        if compile_circuit:
+            gates = _route_non_adjacent_gates(gates, circuit.nqubits)
+            gates = _fuse_one_site_blocks(gates)
+        for batch in _disjoint_batches(gates):
+            if _is_two_qubit_batch(batch):
+                self._apply_two_site_batch(batch, timings)
+            else:
+                tic = time.perf_counter()
+                for gate in batch:
+                    sites = _gate_sites(gate)
+                    if len(sites) == 1 and self.owns_site(sites[0]):
+                        op = _asarray(self.xp, gate.matrix(), self.dtype)
+                        self.apply_one_site(op, sites[0])
+                    elif len(sites) == 2:
+                        self._apply_two_site_batch([gate], timings)
+                    elif len(sites) > 2:
+                        raise NotImplementedError("Only one- and two-qubit gates are supported.")
+                timings["one_site"] += time.perf_counter() - tic
+
+        return timings
+
+    def apply_one_site(self, op, pos):
+        self.gammas[pos] = self.xp.einsum("st,atb->asb", op, self.gammas[pos])
+
+    def _apply_two_site_batch(self, batch, timings):
+        local_gates = []
+        boundary_specs = []
+        recv_left_update = False
+        for gate in batch:
+            sites = _gate_sites(gate)
+            if abs(sites[0] - sites[1]) != 1:
+                raise NotImplementedError("Segment Vidal supports adjacent two-qubit gates only.")
+            left, right = sorted(sites)
+            left_owner = self.owner_of(left)
+            right_owner = self.owner_of(right)
+            if left_owner == self.rank and right_owner == self.rank:
+                local_gates.append(gate)
+            elif left_owner == self.rank:
+                boundary_specs.append((gate, left, right))
+            elif right_owner == self.rank:
+                recv_left_update = True
+
+        tic = time.perf_counter()
+        edge_send_req = None
+        if recv_left_update:
+            edge_send_req = self.comm.isend(
+                self._edge_payload(), dest=self.rank - 1, tag=_EDGE_TAG
+            )
+        right_edge = (
+            self.comm.recv(source=self.rank + 1, tag=_EDGE_TAG)
+            if boundary_specs
+            else None
+        )
+        timings["edge_exchange"] += time.perf_counter() - tic
+
+        boundary_update = None
+        tic = time.perf_counter()
+        for gate, left, right in boundary_specs:
+            boundary_update = self._compute_boundary_update(
+                gate, left, right, right_edge
+            )
+        timings["boundary_compute"] += time.perf_counter() - tic
+
+        tic = time.perf_counter()
+        update_send_req = None
+        if boundary_update is not None:
+            update_send_req = self.comm.isend(
+                boundary_update, dest=self.rank + 1, tag=_UPDATE_TAG
+            )
+        timings["boundary_update"] += time.perf_counter() - tic
+
+        tic = time.perf_counter()
+        local_items = [
+            self._compute_owned_two_site_update(gate)
+            for gate in local_gates
+        ]
+        timings["local_compute"] += time.perf_counter() - tic
+
+        tic = time.perf_counter()
+        left_boundary_update = (
+            self.comm.recv(source=self.rank - 1, tag=_UPDATE_TAG)
+            if recv_left_update
+            else None
+        )
+        if update_send_req is not None:
+            update_send_req.wait()
+        if edge_send_req is not None:
+            edge_send_req.wait()
+        timings["boundary_update"] += time.perf_counter() - tic
+
+        for update in local_items:
+            self._install_update(update)
+        if boundary_update is not None:
+            self._install_update(boundary_update)
+        if left_boundary_update is not None:
+            self._install_update(left_boundary_update)
+
+    def _edge_payload(self):
+        return {
+            "start": self.start,
+            "end": self.end,
+            "gamma_start": _to_numpy(self.gammas[self.start]),
+            "lambda_after_start": _to_numpy(self.lambdas[self.start + 1]),
+        }
+
+    def _compute_owned_two_site_update(self, gate):
+        sites = _gate_sites(gate)
+        op = _asarray(self.xp, gate.matrix(), self.dtype)
+        left, right = sites
+        if left > right:
+            left, right = right, left
+            op = _transpose(self.xp, op.reshape(2, 2, 2, 2), (1, 0, 3, 2)).reshape(4, 4)
+        item = self._build_item(
+            left,
+            op,
+            self.lambdas[left],
+            self.lambdas[left + 1],
+            self.lambdas[left + 2],
+            self.gammas[left],
+            self.gammas[right],
+        )
+        split = _svd(self.xp, item["matrix"])
+        return _make_two_site_update(
+            item, *split, self.max_bond, self.cut_ratio, self.xp
+        )
+
+    def _compute_boundary_update(self, gate, left, right, remote):
+        op = _asarray(self.xp, gate.matrix(), self.dtype)
+        sites = _gate_sites(gate)
+        if sites[0] > sites[1]:
+            op = _transpose(self.xp, op.reshape(2, 2, 2, 2), (1, 0, 3, 2)).reshape(4, 4)
+
+        gamma_right = _asarray(self.xp, remote["gamma_start"], self.dtype)
+        lam_right = _asarray(
+            self.xp,
+            remote["lambda_after_start"],
+            self.xp.float64 if self.xp is not np else np.float64,
+        )
+        item = self._build_item(
+            left,
+            op,
+            self.lambdas[left],
+            self.lambdas[left + 1],
+            lam_right,
+            self.gammas[left],
+            gamma_right,
+        )
+        split = _svd(self.xp, item["matrix"])
+        return _tensor_update_to_numpy(
+            _make_two_site_update(
+                item, *split, self.max_bond, self.cut_ratio, self.xp
+            )
+        )
+
+    def _build_item(self, site, op, lam_left, lam_mid, lam_right, gamma_left, gamma_right):
+        result = _build_theta_svd_matrix(
+            op, self.xp, lam_left, lam_mid, lam_right, gamma_left, gamma_right
+        )
+        result["site"] = site
+        result["lam_left"] = lam_left
+        result["lam_right"] = lam_right
+        return result
+
+    def _install_update(self, update):
+        if isinstance(update["left"], np.ndarray):
+            update = _tensor_update_from_numpy(self.xp, update, self.dtype)
+        self._accumulated_truncation_error += update.get("truncation_error", 0.0)
+        site = update["site"]
+        if self.owns_site(site):
+            self.gammas[site] = update["left"]
+        if self.owns_site(site + 1):
+            self.gammas[site + 1] = update["right"]
+        if self.start <= site + 1 <= self.end:
+            self.lambdas[site + 1] = update["lambda"]
+
+    def gather_full_state(self):
+        payload = {
+            "start": self.start,
+            "end": self.end,
+            "gammas": {site: _to_numpy(tensor) for site, tensor in self.gammas.items()},
+            "lambdas": {bond: _to_numpy(tensor) for bond, tensor in self.lambdas.items()},
+        }
+        return self.comm.gather(payload, root=0)
+
+    def expectation_pauli_sum_root(self, terms):
+        payloads = self.gather_full_state()
+        if self.rank != 0:
+            return None
+        full = VidalTEBDExecutor(
+            self.nqubits,
+            self.max_bond,
+            cut_ratio=self.cut_ratio,
+            tensor_module=self.tensor_module,
+        )
+        for payload in payloads:
+            for site, tensor in payload["gammas"].items():
+                full.gammas[int(site)] = _asarray(self.xp, tensor, self.dtype)
+            for bond, tensor in payload["lambdas"].items():
+                full.lambdas[int(bond)] = _asarray(
+                    self.xp,
+                    tensor,
+                    self.xp.float64 if self.xp is not np else np.float64,
+                )
+        return full.expectation_pauli_sum(terms)
+
+    def expectation_ring_xz_root(self):
+        terms = [
+            (0.5, (("X", site), ("Z", (site + 1) % self.nqubits)))
+            for site in range(self.nqubits)
+        ]
+        return self.expectation_pauli_sum_root(terms)
+
+
+def run_segment_vidal_mpi_ring_xz(
+    circuit,
+    max_bond,
+    comm,
+    cut_ratio=1e-12,
+    tensor_module="torch",
+):
+    executor = SegmentVidalMPIExecutor(
+        nqubits=circuit.nqubits,
+        max_bond=max_bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        comm=comm,
+    )
+    timings = executor.run_circuit(circuit)
+    tic = time.perf_counter()
+    value = executor.expectation_ring_xz_root()
+    timings["gather"] = time.perf_counter() - tic
+    return value, timings
@@ -62,6 +62,41 @@ def _to_float(x):
    return float(x)


+def _to_numpy(tensor):
+    if hasattr(tensor, "detach"):
+        return tensor.detach().cpu().numpy()
+    return np.asarray(tensor)
+
+
+def _tensor_update_to_numpy(update):
+    result = {
+        "site": int(update["site"]),
+        "left": _to_numpy(update["left"]),
+        "right": _to_numpy(update["right"]),
+        "lambda": _to_numpy(update["lambda"]),
+    }
+    if "truncation_error" in update:
+        result["truncation_error"] = float(update["truncation_error"])
+    return result
+
+
+def _tensor_update_from_numpy(xp, update, dtype):
+    if xp is np:
+        return update
+    result = {
+        "site": update["site"],
+        "left": _asarray(xp, update["left"], dtype),
+        "right": _asarray(xp, update["right"], dtype),
+        "lambda": xp.as_tensor(
+            update["lambda"],
+            dtype=xp.float64 if dtype == xp.complex128 else xp.float32,
+        ),
+    }
+    if "truncation_error" in update:
+        result["truncation_error"] = float(update["truncation_error"])
+    return result
+
+
 def _svd(xp, matrix):
    return _svd_eigh(xp, matrix)

@@ -92,32 +127,6 @@ def _svd_eigh(xp, matrix):
    return umat, singvals, _conj(xp, eigvecs).T


-def _batched_svd_eigh(xp, matrices):
-    """Batched EVD SVD for a stack of same-shape torch matrices."""
-
-    m_dim, n_dim = matrices.shape[-2:]
-    if m_dim <= n_dim:
-        grams = matrices @ _conj(xp, matrices).transpose(-1, -2)
-        eigvals, eigvecs = xp.linalg.eigh(grams)
-        idx = xp.argsort(eigvals, dim=-1, descending=True)
-        eigvals = xp.gather(eigvals, -1, idx)
-        eigvecs = xp.gather(eigvecs, -1, idx.unsqueeze(-2).expand_as(eigvecs))
-        singvals = _sqrt_clamped(xp, eigvals)
-        inv_s = _safe_inverse(xp, singvals)
-        vhs = (eigvecs.conj().transpose(-1, -2) @ matrices) * inv_s.unsqueeze(-1)
-        return eigvecs, singvals, vhs
-
-    grams = _conj(xp, matrices).transpose(-1, -2) @ matrices
-    eigvals, eigvecs = xp.linalg.eigh(grams)
-    idx = xp.argsort(eigvals, dim=-1, descending=True)
-    eigvals = xp.gather(eigvals, -1, idx)
-    eigvecs = xp.gather(eigvecs, -1, idx.unsqueeze(-2).expand_as(eigvecs))
-    singvals = _sqrt_clamped(xp, eigvals)
-    inv_s = _safe_inverse(xp, singvals)
-    umats = (matrices @ eigvecs) * inv_s.unsqueeze(-2)
-    return umats, singvals, eigvecs.conj().transpose(-1, -2)
-
-
 def _eigh(xp, matrix):
    if xp is np:
        return np.linalg.eigh(matrix)
@@ -126,10 +135,8 @@ def _eigh(xp, matrix):

 def _sort_eigh_desc(xp, eigvals, eigvecs):
    if xp is np:
-        idx = np.argsort(eigvals)[::-1].copy()
-        return eigvals[idx], eigvecs[:, idx]
-    idx = xp.argsort(eigvals, descending=True)
-    return eigvals[idx], eigvecs[:, idx]
+        return eigvals[::-1].copy(), eigvecs[:, ::-1].copy()
+    return xp.flip(eigvals, dims=(0,)), xp.flip(eigvecs, dims=(1,))


 def _sqrt_clamped(xp, eigvals):
@@ -150,8 +157,6 @@ class VidalTEBDExecutor:
    max_bond: int
    cut_ratio: float = 1e-12
    tensor_module: str = "torch"
-    workers: int = 1
-    use_batched: bool = False

    def __post_init__(self):
        self.xp = _backend_module(self.tensor_module)
@@ -169,19 +174,20 @@ class VidalTEBDExecutor:
        self.lambdas = [
            _ones(self.xp, 1, self.dtype, self.device) for _ in range(self.nqubits + 1)
        ]
+        self._accumulated_truncation_error = 0.0

-    def run_circuit(self, circuit):
-        for batch in _disjoint_batches(circuit.queue):
-            if (
-                self.use_batched
-                and _is_two_qubit_batch(batch)
-                and self.workers > 1
-                and len(batch) > 1
-            ):
-                self.apply_two_site_batch(batch)
-            else:
-                for gate in batch:
-                    self._apply_gate(gate)
+    def run_circuit(self, circuit, compile_circuit=True):
+        gates = circuit.queue
+        if compile_circuit:
+            gates = _route_non_adjacent_gates(gates, circuit.nqubits)
+            gates = _fuse_one_site_blocks(gates)
+        for batch in _disjoint_batches(gates):
+            for gate in batch:
+                self._apply_gate(gate)
+
+    @property
+    def truncation_error(self):
+        return self._accumulated_truncation_error

    def _apply_gate(self, gate):
        sites = _gate_sites(gate)
@@ -204,40 +210,6 @@ class VidalTEBDExecutor:
        umat, singvals, vh = _svd(self.xp, item["matrix"])
        self._install_two_site_split(item, umat, singvals, vh)

-    def apply_two_site_batch(self, batch):
-        items = []
-        for gate in batch:
-            sites = _gate_sites(gate)
-            op = _asarray(self.xp, gate.matrix(), self.dtype)
-            items.append(self._build_two_site_matrix(op, sites[0], sites[1]))
-
-        if self.xp is not np:
-            grouped = {}
-            for idx, item in enumerate(items):
-                grouped.setdefault(tuple(item["matrix"].shape), []).append(idx)
-
-            for indices in grouped.values():
-                if len(indices) == 1:
-                    item = items[indices[0]]
-                    umat, singvals, vh = _svd(self.xp, item["matrix"])
-                    item["split"] = (umat, singvals, vh)
-                    continue
-
-                matrices = self.xp.stack([items[idx]["matrix"] for idx in indices])
-                umats, singvals, vhs = _batched_svd_eigh(self.xp, matrices)
-                for out_idx, item_idx in enumerate(indices):
-                    items[item_idx]["split"] = (
-                        umats[out_idx],
-                        singvals[out_idx],
-                        vhs[out_idx],
-                    )
-        else:
-            for item in items:
-                item["split"] = _svd(self.xp, item["matrix"])
-
-        for item in items:
-            self._install_two_site_split(item, *item["split"])
-
    def _build_two_site_matrix(self, op, left_pos, right_pos):
        if left_pos > right_pos:
            left_pos, right_pos = right_pos, left_pos
@@ -246,101 +218,69 @@ class VidalTEBDExecutor:
            )

        i = left_pos
-        lam_left = self.lambdas[i]
-        lam_mid = self.lambdas[i + 1]
-        lam_right = self.lambdas[i + 2]
-        gamma_left = self.gammas[i]
-        gamma_right = self.gammas[i + 1]
-
-        theta = self.xp.einsum(
-            "a,asb,b,btc,c->astc",
-            lam_left,
-            gamma_left,
-            lam_mid,
-            gamma_right,
-            lam_right,
+        result = _build_theta_svd_matrix(
+            op, self.xp,
+            self.lambdas[i], self.lambdas[i + 1], self.lambdas[i + 2],
+            self.gammas[i], self.gammas[i + 1],
        )
-        gate = op.reshape(2, 2, 2, 2)
-        theta = self.xp.einsum("uvst,astc->auvc", gate, theta)
-
-        chi_left = theta.shape[0]
-        chi_right = theta.shape[3]
-        matrix = theta.reshape(chi_left * 2, 2 * chi_right)
-        return {
-            "site": i,
-            "chi_left": chi_left,
-            "chi_right": chi_right,
-            "lam_left": lam_left,
-            "lam_right": lam_right,
-            "matrix": matrix,
-        }
+        result["site"] = i
+        result["lam_left"] = self.lambdas[i]
+        result["lam_right"] = self.lambdas[i + 2]
+        return result

    def _install_two_site_split(self, item, umat, singvals, vh):
-        keep = self._choose_bond(singvals)
-        umat = umat[:, :keep]
-        kept = singvals[:keep]
-        cut = singvals[keep:]
-        vh = vh[:keep, :]
-
-        if cut.shape[0] > 0:
-            norm_kept = (kept * kept).sum()
-            norm_cut = (cut * cut).sum()
-            kept = kept / self.xp.sqrt(norm_kept / (norm_kept + norm_cut))
-
-        new_left = umat.reshape(item["chi_left"], 2, keep)
-        new_right = vh.reshape(keep, 2, item["chi_right"])
-        new_left = self._divide_left_lambda(new_left, item["lam_left"])
-        new_right = self._divide_right_lambda(new_right, item["lam_right"])
-
-        i = item["site"]
-        self.gammas[i] = new_left
-        self.gammas[i + 1] = new_right
-        self.lambdas[i + 1] = kept
-
-    def _choose_bond(self, singvals):
-        max_possible = int(singvals.shape[0])
-        keep = min(max_possible, self.max_bond)
-        if self.cut_ratio > 0 and max_possible > 0:
-            threshold = singvals[0] * self.cut_ratio
-            if self.xp is np:
-                ratio_keep = int(np.count_nonzero(singvals > threshold))
-            else:
-                ratio_keep = int((singvals > threshold).sum().detach().cpu().item())
-            keep = min(keep, max(1, ratio_keep))
-        return keep
-
-    def _divide_left_lambda(self, tensor, lambdas):
-        if self.xp is np:
-            safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
-        else:
-            safe = self.xp.where(
-                self.xp.abs(lambdas) > 1e-300,
-                lambdas,
-                self.xp.ones_like(lambdas),
-            )
-        return tensor / safe.reshape(-1, 1, 1)
-
-    def _divide_right_lambda(self, tensor, lambdas):
-        if self.xp is np:
-            safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
-        else:
-            safe = self.xp.where(
-                self.xp.abs(lambdas) > 1e-300,
-                lambdas,
-                self.xp.ones_like(lambdas),
-            )
-        return tensor / safe.reshape(1, 1, -1)
+        update = _make_two_site_update(item, umat, singvals, vh,
+                                       self.max_bond, self.cut_ratio, self.xp)
+        self._accumulated_truncation_error += update["truncation_error"]
+        i = update["site"]
+        self.gammas[i] = update["left"]
+        self.gammas[i + 1] = update["right"]
+        self.lambdas[i + 1] = update["lambda"]

    def expectation_ring_xz(self):
-        xmat = _asarray(self.xp, [[0, 1], [1, 0]], self.dtype)
-        zmat = _asarray(self.xp, [[1, 0], [0, -1]], self.dtype)
-        value = 0.0
-        for site in range(self.nqubits - 1):
-            value += 0.5 * _to_float(self._expect_adjacent(site, xmat, zmat))
-        value += 0.5 * _to_float(
-            self.expect_product_operators({self.nqubits - 1: xmat, 0: zmat})
+        return self.expectation_pauli_sum(
+            [
+                (0.5, (("X", site), ("Z", (site + 1) % self.nqubits)))
+                for site in range(self.nqubits)
+            ]
        )
-        return value / self.norm()
+
+    def expectation_pauli_sum(self, terms):
+        paulis = {
+            "I": _eye(self.xp, 2, self.dtype, self.device),
+            "X": _asarray(self.xp, [[0, 1], [1, 0]], self.dtype),
+            "Y": _asarray(self.xp, [[0, -1j], [1j, 0]], self.dtype),
+            "Z": _asarray(self.xp, [[1, 0], [0, -1]], self.dtype),
+        }
+        value = 0.0
+        norm = self.norm()
+        for coeff, ops in terms:
+            ops = tuple((name.upper(), site) for name, site in ops)
+            if len(ops) == 0:
+                term_value = norm
+            elif len(ops) == 1:
+                name, site = ops[0]
+                term_value = _to_float(self._expect_one_site(site, paulis[name]))
+            elif len(ops) == 2 and abs(ops[0][1] - ops[1][1]) == 1:
+                (name0, site0), (name1, site1) = sorted(ops, key=lambda item: item[1])
+                term_value = _to_float(
+                    self._expect_adjacent(site0, paulis[name0], paulis[name1])
+                )
+            else:
+                operators = {site: paulis[name] for name, site in ops}
+                term_value = _to_float(self.expect_product_operators(operators))
+            value += float(np.real(coeff)) * term_value
+        return value / norm
+
+    def _expect_one_site(self, site, op):
+        theta = self.xp.einsum(
+            "a,asb,b->asb",
+            self.lambdas[site],
+            self.gammas[site],
+            self.lambdas[site + 1],
+        )
+        op_theta = self.xp.einsum("us,asb->aub", op, theta)
+        return _vdot_real(self.xp, theta, op_theta)

    def _expect_adjacent(self, site, op_left, op_right):
        theta = self.xp.einsum(
@@ -369,6 +309,79 @@ class VidalTEBDExecutor:
        return _to_float(self.expect_product_operators({}))


+def _build_theta_svd_matrix(op, xp, lam_left, lam_mid, lam_right, gamma_left, gamma_right):
+    """Merge and apply a two-site gate, returning the SVD-ready matrix."""
+    theta = xp.einsum(
+        "a,asb,b,btc,c->astc",
+        lam_left, gamma_left, lam_mid, gamma_right, lam_right,
+    )
+    gate = op.reshape(2, 2, 2, 2)
+    theta = xp.einsum("uvst,astc->auvc", gate, theta)
+    chi_left = theta.shape[0]
+    chi_right = theta.shape[3]
+    return {
+        "chi_left": chi_left,
+        "chi_right": chi_right,
+        "matrix": theta.reshape(chi_left * 2, 2 * chi_right),
+    }
+
+
+def _choose_bond(singvals, max_bond, cut_ratio, xp):
+    max_possible = int(singvals.shape[0])
+    keep = min(max_possible, max_bond)
+    if cut_ratio > 0 and max_possible > 0:
+        threshold = singvals[0] * cut_ratio
+        if xp is np:
+            ratio_keep = int(np.count_nonzero(singvals > threshold))
+        else:
+            ratio_keep = int((singvals > threshold).sum().detach().cpu().item())
+        keep = min(keep, max(1, ratio_keep))
+    return keep
+
+
+def _divide_left_lambda(tensor, lambdas, xp):
+    if xp is np:
+        safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
+    else:
+        safe = xp.where(xp.abs(lambdas) > 1e-300, lambdas, xp.ones_like(lambdas))
+    return tensor / safe.reshape(-1, 1, 1)
+
+
+def _divide_right_lambda(tensor, lambdas, xp):
+    if xp is np:
+        safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
+    else:
+        safe = xp.where(xp.abs(lambdas) > 1e-300, lambdas, xp.ones_like(lambdas))
+    return tensor / safe.reshape(1, 1, -1)
+
+
+def _make_two_site_update(item, umat, singvals, vh, max_bond, cut_ratio, xp):
+    keep = _choose_bond(singvals, max_bond, cut_ratio, xp)
+    umat = umat[:, :keep]
+    kept = singvals[:keep]
+    cut = singvals[keep:]
+    vh = vh[:keep, :]
+
+    discarded_weight = 0.0
+    if cut.shape[0] > 0:
+        norm_kept = (kept * kept).sum()
+        norm_cut = (cut * cut).sum()
+        discarded_weight = float(_to_float(norm_cut))
+        kept = kept / xp.sqrt(norm_kept / (norm_kept + norm_cut))
+
+    new_left = umat.reshape(item["chi_left"], 2, keep)
+    new_right = vh.reshape(keep, 2, item["chi_right"])
+    new_left = _divide_left_lambda(new_left, item["lam_left"], xp)
+    new_right = _divide_right_lambda(new_right, item["lam_right"], xp)
+    return {
+        "site": item["site"],
+        "left": new_left,
+        "right": new_right,
+        "lambda": kept,
+        "truncation_error": discarded_weight,
+    }
+
+
 def _gate_sites(gate):
    controls = tuple(getattr(gate, "control_qubits", ()))
    targets = tuple(getattr(gate, "target_qubits", ()))
@@ -377,19 +390,90 @@ def _gate_sites(gate):
    return targets


+# ── SWAP routing for non-adjacent two-qubit gates ──────────────────────
+
+class _SWAPGate:
+    """Minimal SWAP gate wrapper for routing non-adjacent gates."""
+    name = "swap"
+    control_qubits = ()
+
+    def __init__(self, left, right):
+        self.target_qubits = (left, right)
+
+    def matrix(self):
+        return np.array(
+            [[1, 0, 0, 0],
+             [0, 0, 1, 0],
+             [0, 1, 0, 0],
+             [0, 0, 0, 1]],
+            dtype=complex,
+        )
+
+
+class _RoutedTwoQubitGate:
+    """Wraps a two-qubit gate with remapped physical sites after SWAP routing."""
+    name = "routed_two_qubit"
+    control_qubits = ()
+
+    def __init__(self, original_gate, left_phys, right_phys):
+        self.target_qubits = (left_phys, right_phys)
+        self._matrix = original_gate.matrix()
+
+    def matrix(self):
+        return self._matrix
+
+
+def _route_non_adjacent_gates(gates, nqubits):
+    """Insert SWAP networks to make all two-qubit gates adjacent.
+
+    For each non-adjacent two-qubit gate, inserts SWAP gates to bring the
+    farther qubit adjacent, applies the original gate, then inserts reverse
+    SWAPs to restore the qubit ordering.  The resulting gate sequence
+    contains only adjacent two-qubit gates and is safe for VidalTEBDExecutor.
+    """
+    routed = []
+    for gate in gates:
+        sites = _gate_sites(gate)
+        if len(sites) <= 1:
+            routed.append(gate)
+            continue
+
+        left, right = sorted(sites)
+        if right - left == 1:
+            routed.append(gate)
+            continue
+
+        # Move qubit 'right' leftwards to sit at left+1
+        for pos in range(right - 1, left, -1):
+            routed.append(_SWAPGate(pos, pos + 1))
+
+        # Apply the original gate at the adjacent physical positions
+        routed.append(_RoutedTwoQubitGate(gate, left, left + 1))
+
+        # Reverse SWAPs to restore original ordering
+        for pos in range(left + 1, right):
+            routed.append(_SWAPGate(pos, pos + 1))
+
+    return routed
+
+
 def _disjoint_batches(gates):
    batches = []
    current = []
    touched = set()
+    current_arity = None
    for gate in gates:
        sites = _gate_sites(gate)
+        arity = len(sites)
        site_set = set(sites)
-        if current and touched & site_set:
+        if current and (current_arity != arity or touched & site_set):
            batches.append(current)
            current = []
            touched = set()
+            current_arity = None
        current.append(gate)
        touched |= site_set
+        current_arity = arity
    if current:
        batches.append(current)
    return batches
@@ -399,21 +483,59 @@ def _is_two_qubit_batch(batch):
    return batch and all(len(_gate_sites(gate)) == 2 for gate in batch)


+class _FusedOneSiteGate:
+    name = "fused_one_site"
+
+    def __init__(self, site, matrix):
+        self.target_qubits = (site,)
+        self.control_qubits = ()
+        self._matrix = matrix
+
+    def matrix(self):
+        return self._matrix
+
+
+def _fuse_one_site_blocks(gates):
+    fused = []
+    block = []
+
+    def flush_block():
+        nonlocal block
+        if not block:
+            return
+        per_site = {}
+        for gate in block:
+            site = _gate_sites(gate)[0]
+            mat = gate.matrix()
+            if site in per_site:
+                per_site[site] = mat @ per_site[site]
+            else:
+                per_site[site] = mat
+        for site in sorted(per_site):
+            fused.append(_FusedOneSiteGate(site, per_site[site]))
+        block = []
+
+    for gate in gates:
+        if len(_gate_sites(gate)) == 1:
+            block.append(gate)
+            continue
+        flush_block()
+        fused.append(gate)
+    flush_block()
+    return fused
+
+
 def run_vidal_ring_xz(
    circuit,
    max_bond,
    cut_ratio=1e-12,
    tensor_module="torch",
-    workers=1,
-    use_batched=False,
 ):
    executor = VidalTEBDExecutor(
        nqubits=circuit.nqubits,
        max_bond=max_bond,
        cut_ratio=cut_ratio,
        tensor_module=tensor_module,
-        workers=workers,
-        use_batched=use_batched,
    )
    executor.run_circuit(circuit)
    return executor.expectation_ring_xz()
@@ -0,0 +1,151 @@
+"""Reusable benchmark circuits and observables for expectation runs."""
+
+from __future__ import annotations
+
+import math
+
+import numpy as np
+from qibo import Circuit, gates
+
+
+CIRCUITS = (
+    "brickwall_cnot",
+    "reversed_cnot",
+    "shifted_cz",
+    "rxx_rzz",
+    "swap_scramble",
+    "ghz_ladder",
+)
+
+OBSERVABLES = (
+    "ring_xz",
+    "open_zz",
+    "mixed_local",
+    "range2_xx",
+    "long_z_string",
+)
+
+
+def parse_names(raw, valid, label):
+    if raw == ["all"]:
+        return list(valid)
+    unknown = sorted(set(raw) - set(valid))
+    if unknown:
+        raise ValueError(f"Unknown {label}: {', '.join(unknown)}")
+    return raw
+
+
+def build_circuit(kind, nqubits, nlayers, seed):
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+
+    if kind == "ghz_ladder":
+        circuit.add(gates.H(0))
+        for qubit in range(nqubits - 1):
+            circuit.add(gates.CNOT(qubit, qubit + 1))
+        return circuit
+
+    for layer in range(nlayers):
+        for qubit in range(nqubits):
+            circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
+            circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
+            if kind in ("rxx_rzz", "swap_scramble"):
+                circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi)))
+
+        if kind == "brickwall_cnot":
+            add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=False)
+        elif kind == "reversed_cnot":
+            add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=True)
+        elif kind == "shifted_cz":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.CZ(qubit, qubit + 1))
+        elif kind == "rxx_rzz":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
+        elif kind == "swap_scramble":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.CZ(qubit, qubit + 1))
+                if layer % 4 == 3:
+                    circuit.add(gates.SWAP(qubit, qubit + 1))
+        else:
+            raise ValueError(f"Unknown circuit kind {kind!r}.")
+
+    return circuit
+
+
+def add_brickwall(circuit, nqubits, gate, layer, reverse):
+    for qubit in range(0, nqubits - 1, 2):
+        if reverse and layer % 2:
+            circuit.add(gate(qubit + 1, qubit))
+        else:
+            circuit.add(gate(qubit, qubit + 1))
+    for qubit in range(1, nqubits - 1, 2):
+        if reverse and not layer % 2:
+            circuit.add(gate(qubit + 1, qubit))
+        else:
+            circuit.add(gate(qubit, qubit + 1))
+
+
+def observable_terms(kind, nqubits):
+    if kind == "ring_xz":
+        return [
+            (0.5, (("X", site), ("Z", (site + 1) % nqubits)))
+            for site in range(nqubits)
+        ]
+    if kind == "open_zz":
+        return [
+            (1.0 / (nqubits - 1), (("Z", site), ("Z", site + 1)))
+            for site in range(nqubits - 1)
+        ]
+    if kind == "mixed_local":
+        terms = [(0.25, (("X", 0),)), (-0.5, (("Z", nqubits - 1),))]
+        terms += [
+            (0.125, (("Y", site), ("Y", site + 1)))
+            for site in range(0, nqubits - 1, 3)
+        ]
+        return terms
+    if kind == "range2_xx":
+        return [
+            (1.0 / max(1, nqubits - 2), (("X", site), ("X", site + 2)))
+            for site in range(nqubits - 2)
+        ]
+    if kind == "long_z_string":
+        stride = max(1, nqubits // 16)
+        return [(1.0, tuple(("Z", site) for site in range(0, nqubits, stride)))]
+    raise ValueError(f"Unknown observable kind {kind!r}.")
+
+
+def terms_to_dict(terms):
+    return {
+        "terms": [
+            {
+                "coefficient": float(np.real(coeff)),
+                "operators": [(name, int(site)) for name, site in ops],
+            }
+            for coeff, ops in terms
+        ]
+    }
+
+
+def exact_pauli_sum(circuit, terms, nqubits):
+    state = circuit().state(numpy=True).reshape(-1)
+    indices = np.arange(state.size, dtype=np.int64)
+    value = 0.0 + 0.0j
+    for coeff, ops in terms:
+        flipped = indices.copy()
+        phase = np.ones(state.size, dtype=np.complex128)
+        for name, site in ops:
+            shift = nqubits - 1 - site
+            bit = (indices >> shift) & 1
+            if name == "X":
+                flipped ^= 1 << shift
+            elif name == "Y":
+                flipped ^= 1 << shift
+                phase *= 1j * (1 - 2 * bit)
+            elif name == "Z":
+                phase *= 1 - 2 * bit
+            elif name != "I":
+                raise ValueError(f"Unsupported Pauli {name!r}.")
+        value += coeff * np.vdot(state[flipped], phase * state)
+    return float(value.real)
@@ -0,0 +1,71 @@
+"""High-level CPU expectation runner used by CLI scripts."""
+
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass
+
+import numpy as np
+from qibo.backends import construct_backend
+
+from qibotn.benchmark_cases import exact_pauli_sum
+from qibotn.observables import check_observable
+
+
+@dataclass
+class ExpectationConfig:
+    ansatz: str = "tn"
+    mpi: bool = False
+    bond: int = 1024
+    cut_ratio: float = 1e-12
+    tensor_module: str = "torch"
+    torch_threads: int = 8
+    parallel_opts: dict | None = None
+
+
+@dataclass
+class ExpectationResult:
+    value: float
+    seconds: float
+    rank: int = 0
+
+
+def exact_for_observable(circuit, observable, nqubits):
+    if isinstance(observable, dict) and "terms" in observable:
+        terms = [
+            (
+                term["coefficient"],
+                tuple((name, site) for name, site in term["operators"]),
+            )
+            for term in observable["terms"]
+        ]
+        return exact_pauli_sum(circuit, terms, nqubits)
+
+    hamiltonian = check_observable(observable, nqubits)
+    return float(hamiltonian.expectation_from_state(circuit().state(numpy=True)).real)
+
+
+def run_cpu_expectation(circuit, observable, config):
+    runcard = {
+        "MPI_enabled": config.mpi,
+        "MPS_enabled": config.ansatz.lower() == "mps",
+        "NCCL_enabled": False,
+        "expectation_enabled": observable,
+        "max_bond_dimension": config.bond,
+        "cut_ratio": config.cut_ratio,
+        "tensor_module": config.tensor_module,
+        "torch_threads": config.torch_threads,
+        "parallel_opts": config.parallel_opts or {},
+    }
+    backend = construct_backend(
+        backend="qibotn",
+        platform="cpu",
+        runcard=runcard,
+    )
+
+    start = time.perf_counter()
+    value = backend.execute_circuit(circuit)[0]
+    elapsed = time.perf_counter() - start
+
+    rank = getattr(backend, "rank", 0)
+    return ExpectationResult(float(np.real(value)), elapsed, rank=rank)
@@ -29,6 +29,11 @@ def build_observable(circuit_nqubit):

 def create_hamiltonian_from_dict(data, circuit_nqubit):
    """Create a Qibo SymbolicHamiltonian from the qibotn dict representation."""
+    if "pauli_string_pattern" in data:
+        return create_hamiltonian_from_pauli_pattern(
+            data["pauli_string_pattern"], circuit_nqubit
+        )
+
    pauli_gates = {"X": X, "Y": Y, "Z": Z}
    terms = []

@@ -54,6 +59,38 @@ def create_hamiltonian_from_dict(data, circuit_nqubit):
    return hamiltonians.SymbolicHamiltonian(sum(terms))


+def create_hamiltonian_from_pauli_pattern(pattern, circuit_nqubit):
+    """Create a single Pauli-string Hamiltonian by repeating ``pattern``.
+
+    Example: pattern ``"IXZ"`` on 5 qubits becomes ``I0 * X1 * Z2 * I3 * X4``.
+    Identity factors are omitted except for the all-identity case.
+    """
+    if not isinstance(pattern, str) or not pattern:
+        raise ValueError("pauli_string_pattern must be a non-empty string.")
+
+    pauli_gates = {"X": X, "Y": Y, "Z": Z}
+    pattern = pattern.upper()
+    invalid = sorted(set(pattern) - {"I", "X", "Y", "Z"})
+    if invalid:
+        raise ValueError(
+            "pauli_string_pattern characters must be one of I/X/Y/Z; "
+            f"got {''.join(invalid)!r}."
+        )
+
+    expr = None
+    for qubit in range(circuit_nqubit):
+        name = pattern[qubit % len(pattern)]
+        if name == "I":
+            continue
+        factor = pauli_gates[name](qubit)
+        expr = factor if expr is None else expr * factor
+
+    if expr is None:
+        expr = I(0)
+
+    return hamiltonians.SymbolicHamiltonian(form=expr)
+
+
 def build_random_circuit(nqubits, nlayers, seed=42):
    """Build a random circuit with RY+RZ+CNOT layers for benchmarks."""
    import numpy as np
@@ -2,8 +2,11 @@
 import os
 import pickle
 import signal
+import time
+from math import log2, log10
 import numpy as np
-from concurrent.futures import ProcessPoolExecutor, as_completed
+from dataclasses import dataclass
+from concurrent.futures import ProcessPoolExecutor, TimeoutError, as_completed

 try:
    from mpi4py import MPI
@@ -13,25 +16,59 @@ except ImportError:
    MPI = None


-def _run_single_trial(tn_bytes, output_inds, seed, slicing_opts):
+SEARCH_METHODS = ("greedy", "kahypar", "kahypar-agglom", "spinglass")
+
+
+def _search_chunk(
+    tn_bytes,
+    output_inds,
+    repeats,
+    seed,
+    max_time,
+    slicing_opts,
+    optlib=None,
+):
    import random, cotengra as ctg
+
    random.seed(seed)
    tn = pickle.loads(tn_bytes)
+    kwargs = {}
+    if optlib is not None:
+        kwargs["optlib"] = optlib
    opt = ctg.HyperOptimizer(
-        methods=["kahypar", "kahypar-agglom", "spinglass"],
-        max_repeats=1,
+        methods=SEARCH_METHODS,
+        max_repeats=repeats,
+        max_time=max_time,
        parallel=False,
        minimize="combo-256",
-        optlib="random",
        slicing_opts=slicing_opts,
        progbar=False,
+        **kwargs,
    )
    tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
    return tree.combo_cost(factor=256), tree


+def _run_single_trial(tn_bytes, output_inds, seed, slicing_opts):
+    return _search_chunk(
+        tn_bytes,
+        output_inds,
+        repeats=1,
+        seed=seed,
+        max_time=None,
+        slicing_opts=slicing_opts,
+        optlib="random",
+    )
+
+
 def _kill_pool(pool):
-    for pid in list(pool._processes.keys()):
+    processes = getattr(pool, "_processes", None)
+    if processes:
+        pids = list(processes.keys())
+    else:
+        pids = []
+
+    for pid in pids:
        try:
            os.kill(pid, signal.SIGKILL)
        except ProcessLookupError:
@@ -43,21 +80,14 @@ def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts=
    import time

    if trial_timeout is None:
-        import random, cotengra as ctg
-        random.seed(seed)
-        tn = pickle.loads(tn_bytes)
-        opt = ctg.HyperOptimizer(
-            methods=["kahypar", "kahypar-agglom", "spinglass"],
-            max_repeats=repeats,
-            parallel=False,
-            minimize="combo-256",
+        return _search_chunk(
+            tn_bytes,
+            output_inds,
+            repeats=repeats,
+            seed=seed,
            max_time=max_time,
-            optlib="random",
            slicing_opts=slicing_opts,
-            progbar=False,
        )
-        tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
-        return tree.combo_cost(factor=256), tree

    deadline = time.time() + max_time
    best_cost, best_tree = float("inf"), None
@@ -80,17 +110,36 @@ def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts=
    return best_cost, best_tree


+def _split_repeats(total_repeats, n_workers):
+    n_workers = max(1, int(n_workers))
+    total_repeats = max(1, int(total_repeats))
+    chunk, extra = divmod(total_repeats, n_workers)
+    return [chunk + (1 if i < extra else 0) for i in range(n_workers) if chunk + (1 if i < extra else 0) > 0]
+
+
 def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts=None, trial_timeout=None):
    tn_bytes = pickle.dumps(tn)
-    repeats_per = max(1, total_repeats // n_workers)
-    pool = ProcessPoolExecutor(max_workers=n_workers)
-    futures = [
-        pool.submit(_serial_search, tn_bytes, output_inds, repeats_per, seed, max_time, slicing_opts, trial_timeout)
-        for seed in range(n_workers)
-    ]
+    repeat_chunks = _split_repeats(total_repeats, n_workers)
+    pool = ProcessPoolExecutor(max_workers=len(repeat_chunks))
+    futures = []
+    for seed, repeats in enumerate(repeat_chunks):
+        futures.append(
+            pool.submit(
+                _serial_search,
+                tn_bytes,
+                output_inds,
+                repeats,
+                seed,
+                max_time,
+                slicing_opts,
+                trial_timeout,
+            )
+        )
    best_cost, best_tree = float("inf"), None
+    deadline = time.monotonic() + max_time if max_time is not None else None
    try:
-        for fut in as_completed(futures, timeout=max_time + 5):
+        timeout = None if deadline is None else max(0.0, deadline - time.monotonic())
+        for fut in as_completed(futures, timeout=timeout):
            try:
                cost, tree = fut.result()
                if cost < best_cost:
@@ -106,21 +155,133 @@ def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, sli
    return best_tree


-def _mpi_search(tn, output_inds, total_repeats, max_time, n_workers=None, slicing_opts=None, trial_timeout=None):
+def _dask_search(
+    tn,
+    output_inds,
+    total_repeats,
+    max_time,
+    slicing_opts=None,
+    dask_address=None,
+    n_workers=None,
+    optlib=None,
+):
+    """Run one centralized cotengra hyper-optimizer over a dask pool.
+
+    With ``dask_address`` this connects to an external distributed scheduler.
+    Without it, a local dask cluster is created for single-node smoke testing.
+    """
+    try:
+        from distributed import Client, LocalCluster, get_client
+    except ImportError as exc:
+        raise ImportError(
+            "Dask search requires `distributed`. Install it with "
+            "`pip install distributed` or the package extra that provides it."
+        ) from exc
+
+    import cotengra as ctg
+
+    close_client = False
+    close_cluster = False
+    cluster = None
+
+    if dask_address:
+        client = Client(dask_address)
+        close_client = True
+    else:
+        try:
+            client = get_client()
+        except ValueError:
+            cluster = LocalCluster(
+                n_workers=max(1, int(n_workers or os.cpu_count() or 1)),
+                threads_per_worker=1,
+                processes=True,
+                memory_limit=0,
+            )
+            client = Client(cluster)
+            close_client = True
+            close_cluster = True
+
+    kwargs = {}
+    if optlib is not None:
+        kwargs["optlib"] = optlib
+
+    try:
+        opt = ctg.HyperOptimizer(
+            methods=SEARCH_METHODS,
+            max_repeats=total_repeats,
+            max_time=max_time,
+            parallel=client,
+            minimize="combo-256",
+            slicing_opts=slicing_opts,
+            progbar=False,
+            **kwargs,
+        )
+        return tn.contraction_tree(optimize=opt, output_inds=output_inds)
+    finally:
+        if close_client:
+            client.close()
+        if close_cluster:
+            cluster.close()
+
+
+def _mpi_search(
+    tn,
+    output_inds,
+    total_repeats,
+    max_time,
+    n_workers=None,
+    slicing_opts=None,
+    trial_timeout=None,
+    search_backend="processpool",
+    dask_address=None,
+):
    comm = MPI.COMM_WORLD
    rank, size = comm.Get_rank(), comm.Get_size()
-    tn_bytes = pickle.dumps(tn)
+    search_backend = search_backend or "processpool"
+
+    if search_backend == "dask":
+        if not dask_address:
+            raise ValueError(
+                "MPI + dask search requires an external dask scheduler. Start "
+                "dask-scheduler/dask-worker outside mpiexec and pass "
+                "`--dask-address tcp://host:8786`."
+            )
+
+        payload = None
+        if rank == 0:
+            try:
+                tree = _dask_search(
+                    tn,
+                    output_inds,
+                    total_repeats,
+                    max_time,
+                    slicing_opts=slicing_opts,
+                    dask_address=dask_address,
+                    n_workers=n_workers,
+                )
+                payload = ("ok", tree)
+            except Exception as exc:
+                payload = ("error", repr(exc))
+
+        status, value = comm.bcast(payload, root=0)
+        if status == "error":
+            raise RuntimeError(f"Dask path search failed on rank 0: {value}")
+        return value
+
    repeats_per = max(1, total_repeats // size)

-    if n_workers and n_workers > 1:
-        local_tree = _processpool_search(
-            tn, output_inds, repeats_per, n_workers, max_time, slicing_opts, trial_timeout
-        )
-        local_cost = local_tree.combo_cost(factor=256) if local_tree else float("inf")
-    else:
-        local_cost, local_tree = _serial_search(
-            tn_bytes, output_inds, repeats_per, rank, max_time, slicing_opts, trial_timeout
-        )
+    # Run search work in child processes even when n_workers == 1, so the parent
+    # MPI rank can enforce the global timeout by killing active trials.
+    local_tree = _processpool_search(
+        tn,
+        output_inds,
+        repeats_per,
+        max(1, n_workers or 1),
+        max_time,
+        slicing_opts,
+        trial_timeout,
+    )
+    local_cost = local_tree.combo_cost(factor=256) if local_tree else float("inf")

    all_results = comm.gather((local_cost, local_tree), root=0)
    best_tree = None
@@ -133,11 +294,13 @@ def _mpi_search(tn, output_inds, total_repeats, max_time, n_workers=None, slicin


 def parallel_path_search(tn, output_inds, method='processpool', total_repeats=1024,
-                         max_time=300, n_workers=48, slicing_opts=None, trial_timeout=None):
+                         max_time=300, n_workers=48, slicing_opts=None,
+                         trial_timeout=None, search_backend=None,
+                         dask_address=None):
    """Parallel contraction path search.

    Args:
-        method: 'processpool' | 'mpi' | 'serial'
+        method: 'processpool' | 'dask' | 'mpi' | 'serial'
        total_repeats: Total optimization repeats across all workers
        max_time: Global timeout per worker (seconds)
        n_workers: Workers per MPI rank (or total for processpool)
@@ -151,45 +314,185 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
    elif method == 'mpi':
        if not _HAVE_MPI:
            raise ImportError("mpi4py not available")
-        return _mpi_search(tn, output_inds, total_repeats, max_time, n_workers, slicing_opts, trial_timeout)
+        return _mpi_search(
+            tn,
+            output_inds,
+            total_repeats,
+            max_time,
+            n_workers,
+            slicing_opts,
+            trial_timeout,
+            search_backend=search_backend,
+            dask_address=dask_address,
+        )
    elif method == 'processpool':
        return _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts, trial_timeout)
+    elif method == 'dask':
+        return _dask_search(
+            tn,
+            output_inds,
+            total_repeats,
+            max_time,
+            slicing_opts=slicing_opts,
+            dask_address=dask_address,
+            n_workers=n_workers,
+        )
    else:
        raise ValueError(f"Unknown method: {method}")


-def parallel_contract(tree, arrays, method='mpi', comm=None):
+def contraction_tree_costs(tree, dtype_bytes=16, combo_factor=256):
+    """Return comparable cost estimates for a cotengra contraction tree.
+
+    These values are estimates, not profiling results. They are the right first
+    signal for path quality: lower ``combo`` usually means lower CPU contraction
+    time, while ``peak_memory_gib`` estimates the largest intermediate tensor.
+    """
+    stats = tree.contract_stats()
+    flops = float(stats["flops"])
+    write = float(stats["write"])
+    size = float(stats["size"])
+    combo = float(tree.combo_cost(factor=combo_factor))
+    nslices = int(getattr(tree, "multiplicity", 1))
+    original_flops = float(stats.get("original_flops", flops))
+
+    return {
+        "flops": flops,
+        "write": write,
+        "size": size,
+        "combo": combo,
+        "log10_flops": log10(flops) if flops > 0 else float("-inf"),
+        "log10_write": log10(write) if write > 0 else float("-inf"),
+        "log2_size": log2(size) if size > 0 else float("-inf"),
+        "log10_combo": log10(combo) if combo > 0 else float("-inf"),
+        "nslices": nslices,
+        "slicing_overhead": flops / original_flops if original_flops > 0 else float("nan"),
+        "peak_memory_gib": size * dtype_bytes / 1024**3,
+    }
+
+
+@dataclass(frozen=True)
+class SlicePlan:
+    """Slice ownership for one MPI rank."""
+
+    rank: int
+    size: int
+    nslices: int
+    indices: tuple
+    assignment: str = "block"
+
+
+@dataclass(frozen=True)
+class SlicedContractStats:
+    """Diagnostics for one sliced contraction."""
+
+    rank: int
+    size: int
+    nslices: int
+    local_slices: int
+    assignment: str
+
+
+def mpi_slice_plan(nslices, rank, size, assignment="block"):
+    """Return the contraction slice ids assigned to one MPI rank.
+
+    ``block`` gives each rank a contiguous range, mirroring cutensornet's
+    slice-range style. ``cyclic`` gives rank ``r`` slices ``r, r + size, ...``,
+    which can balance better if individual slice costs vary.
+    """
+    if nslices < 0:
+        raise ValueError("nslices must be non-negative.")
+    if size <= 0:
+        raise ValueError("size must be positive.")
+    if not 0 <= rank < size:
+        raise ValueError("rank must satisfy 0 <= rank < size.")
+
+    if assignment == "block":
+        chunk, extra = divmod(nslices, size)
+        start = rank * chunk + min(rank, extra)
+        stop = start + chunk + (1 if rank < extra else 0)
+        indices = tuple(range(start, stop))
+    elif assignment == "cyclic":
+        indices = tuple(range(rank, nslices, size))
+    else:
+        raise ValueError("assignment must be 'block' or 'cyclic'.")
+
+    return SlicePlan(rank, size, nslices, indices, assignment)
+
+
+def _array_backend(arrays):
+    return "torch" if type(arrays[0]).__module__.startswith("torch") else "numpy"
+
+
+def _to_numpy_vector(value, is_torch):
+    if is_torch:
+        return value.detach().cpu().numpy().reshape(-1)
+    return np.asarray(value).reshape(-1)
+
+
+def _zero_vector_like(arrays):
+    return np.zeros(1, dtype=np.complex128)
+
+
+def contract_tree_slices(tree, arrays, slice_indices, backend=None):
+    """Contract a subset of cotengra slices and return their local sum."""
+    backend = backend or _array_backend(arrays)
+    is_torch = backend == "torch"
+    local = None
+
+    for slice_id in slice_indices:
+        value = tree.contract_slice(arrays, slice_id, backend=backend)
+        value = _to_numpy_vector(value, is_torch)
+        local = value if local is None else local + value
+
+    return _zero_vector_like(arrays) if local is None else local
+
+
+def parallel_contract(
+    tree,
+    arrays,
+    method='mpi',
+    comm=None,
+    assignment="block",
+    return_stats=False,
+):
    if method == 'mpi':
        if not _HAVE_MPI or comm is None:
            raise ValueError("MPI method requires mpi4py and comm")
-        return _contract_mpi(tree, arrays, comm)
+        return _contract_mpi(
+            tree,
+            arrays,
+            comm,
+            assignment=assignment,
+            return_stats=return_stats,
+        )
    raise ValueError(f"Unknown method: {method}")


-def _contract_mpi(tree, arrays, comm, root=0):
+def _contract_mpi(tree, arrays, comm, root=0, assignment="block", return_stats=False):
    rank, size = comm.Get_rank(), comm.Get_size()
-    is_torch = type(arrays[0]).__module__.startswith("torch")
+    backend = _array_backend(arrays)
+    is_torch = backend == "torch"
+    nslices = int(getattr(tree, "multiplicity", 1))
+    stats = SlicedContractStats(rank, size, nslices, 0, assignment)

-    if is_torch:
-        result_torch = None
-        for i in range(rank, tree.multiplicity, size):
-            x = tree.contract_slice(arrays, i, backend="torch").reshape(-1)
-            result_torch = x if result_torch is None else result_torch + x
+    if not set(getattr(tree, "sliced_inds", ())).isdisjoint(set(getattr(tree, "output", ()))):
+        raise NotImplementedError(
+            "MPI sliced contraction currently requires sliced indices not to "
+            "appear in the output."
+        )

-        if result_torch is None:
-            result_np = np.zeros(1, dtype=np.complex128)
-        else:
-            result_np = result_torch.detach().cpu().numpy()
-    else:
-        result_np = None
-        for i in range(rank, tree.multiplicity, size):
-            x = tree.contract_slice(arrays, i)
-            x_np = np.asarray(x).reshape(-1)
-            result_np = x_np if result_np is None else result_np + x_np
+    if nslices <= 1:
+        if rank != root:
+            return (None, stats) if return_stats else None
+        result = tree.contract(arrays, backend=backend)
+        result = _to_numpy_vector(result, is_torch)
+        return (result, stats) if return_stats else result

-        if result_np is None:
-            result_np = np.zeros(1, dtype=np.complex128)
+    plan = mpi_slice_plan(nslices, rank, size, assignment=assignment)
+    local = contract_tree_slices(tree, arrays, plan.indices, backend=backend)
+    stats = SlicedContractStats(rank, size, nslices, len(plan.indices), assignment)

-    result = np.zeros_like(result_np) if rank == root else None
-    comm.Reduce(result_np, result, root=root)
-    return result
+    result = np.zeros_like(local) if rank == root else None
+    comm.Reduce(local, result, root=root)
+    return (result, stats) if return_stats else result
@@ -0,0 +1,130 @@
+import math
+
+import numpy as np
+from qibo import Circuit, gates, hamiltonians
+from qibo.symbols import X, Z
+
+from qibotn.backends.cpu import CpuTensorNet
+from qibotn.benchmark_cases import build_circuit as build_benchmark_circuit
+
+
+def build_circuit(nqubits=6):
+    circuit = Circuit(nqubits)
+    for qubit in range(nqubits):
+        circuit.add(gates.RY(qubit, theta=0.1 * (qubit + 1)))
+        circuit.add(gates.RZ(qubit, theta=-0.05 * (qubit + 1)))
+    for qubit in range(nqubits - 1):
+        circuit.add(gates.CNOT(qubit, qubit + 1))
+    return circuit
+
+
+def build_observable(nqubits):
+    form = 0
+    for qubit in range(nqubits):
+        form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
+    return hamiltonians.SymbolicHamiltonian(form=form)
+
+
+def test_cpu_generic_tn_expectation_matches_statevector():
+    circuit = build_circuit()
+    observable = build_observable(circuit.nqubits)
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = CpuTensorNet(
+        {
+            "MPI_enabled": False,
+            "MPS_enabled": False,
+            "NCCL_enabled": False,
+            "expectation_enabled": observable,
+        }
+    )
+    value = backend.execute_circuit(circuit)[0]
+
+    assert math.isclose(value, exact, abs_tol=1e-12)
+
+
+def test_cpu_mps_expectation_matches_statevector():
+    circuit = build_circuit()
+    observable = build_observable(circuit.nqubits)
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = CpuTensorNet(
+        {
+            "MPI_enabled": False,
+            "MPS_enabled": True,
+            "NCCL_enabled": False,
+            "expectation_enabled": observable,
+            "max_bond_dimension": 64,
+            "tensor_module": "torch",
+            "torch_threads": 1,
+        }
+    )
+    value = backend.execute_circuit(circuit)[0]
+
+    assert math.isclose(value, exact, abs_tol=1e-12)
+
+
+def test_cpu_runcard_pauli_pattern_matches_statevector():
+    circuit = build_circuit()
+    observable = {"pauli_string_pattern": "IXZ"}
+    exact_hamiltonian = hamiltonians.SymbolicHamiltonian(
+        form=X(1) * Z(2) * X(4) * Z(5)
+    )
+    exact = exact_hamiltonian.expectation_from_state(circuit().state(numpy=True))
+
+    for mps_enabled in (False, True):
+        backend = CpuTensorNet(
+            {
+                "MPI_enabled": False,
+                "MPS_enabled": mps_enabled,
+                "NCCL_enabled": False,
+                "expectation_enabled": observable,
+                "max_bond_dimension": 64,
+                "tensor_module": "torch",
+                "torch_threads": 1,
+            }
+        )
+        value = backend.execute_circuit(circuit)[0]
+
+        assert math.isclose(value, exact, abs_tol=1e-12)
+
+
+def test_cpu_mps_sampling_uses_nshots():
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    for qubit in range(3):
+        circuit.add(gates.CNOT(qubit, qubit + 1))
+
+    backend = CpuTensorNet(
+        {
+            "MPI_enabled": False,
+            "MPS_enabled": True,
+            "NCCL_enabled": False,
+            "expectation_enabled": False,
+        }
+    )
+    result = backend.execute_circuit(circuit, nshots=100)
+
+    assert sum(result.frequencies().values()) == 100
+    assert set(result.frequencies()) <= {"0000", "1111"}
+
+
+def test_cpu_generic_tn_long_pauli_string_matches_statevector():
+    circuit = build_benchmark_circuit("rxx_rzz", 10, 2, 42)
+    observable = {"pauli_string_pattern": "XZ"}
+    exact_hamiltonian = hamiltonians.SymbolicHamiltonian(
+        form=X(0) * Z(1) * X(2) * Z(3) * X(4) * Z(5) * X(6) * Z(7) * X(8) * Z(9)
+    )
+    exact = exact_hamiltonian.expectation_from_state(circuit().state(numpy=True))
+
+    backend = CpuTensorNet(
+        {
+            "MPI_enabled": False,
+            "MPS_enabled": False,
+            "NCCL_enabled": False,
+            "expectation_enabled": observable,
+        }
+    )
+    value = backend.execute_circuit(circuit)[0]
+
+    assert math.isclose(value, exact, abs_tol=1e-12)
@@ -0,0 +1,46 @@
+import numpy as np
+
+from qibotn.parallel import _split_repeats, contract_tree_slices, mpi_slice_plan
+
+
+def test_mpi_slice_plan_block_balances_contiguous_ranges():
+    plans = [mpi_slice_plan(10, rank, 4, assignment="block") for rank in range(4)]
+
+    assert [plan.indices for plan in plans] == [
+        (0, 1, 2),
+        (3, 4, 5),
+        (6, 7),
+        (8, 9),
+    ]
+
+
+def test_mpi_slice_plan_cyclic_balances_round_robin():
+    plans = [mpi_slice_plan(10, rank, 4, assignment="cyclic") for rank in range(4)]
+
+    assert [plan.indices for plan in plans] == [
+        (0, 4, 8),
+        (1, 5, 9),
+        (2, 6),
+        (3, 7),
+    ]
+
+
+class DummyTree:
+    def contract_slice(self, arrays, i, backend=None):
+        return arrays[0] * (i + 1)
+
+
+def test_contract_tree_slices_sums_numpy_slices():
+    result = contract_tree_slices(
+        DummyTree(),
+        [np.asarray([2.0 + 0.0j])],
+        (0, 2, 3),
+        backend="numpy",
+    )
+
+    np.testing.assert_allclose(result, np.asarray([16.0 + 0.0j]))
+
+
+def test_split_repeats_balances_workers():
+    assert _split_repeats(10, 4) == [3, 3, 2, 2]
+    assert _split_repeats(2, 4) == [1, 1]
@@ -0,0 +1,167 @@
+import math
+
+import numpy as np
+from qibo import Circuit, gates, hamiltonians
+from qibo.symbols import X, Y, Z
+
+from qibotn.backends.vidal import VidalBackend, _can_route_non_adjacent, _unsupported_reason
+from qibotn.backends.vidal_tebd import (
+    _route_non_adjacent_gates,
+    _gate_sites,
+)
+
+
+def build_local_circuit(nqubits=8, nlayers=3, seed=42):
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+    for layer in range(nlayers):
+        for q in range(nqubits):
+            circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
+            circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
+        for q in range(layer % 2, nqubits - 1, 2):
+            circuit.add(gates.CNOT(q, q + 1))
+    return circuit
+
+
+def test_vidal_backend_expectation_matches_statevector():
+    circuit = build_local_circuit()
+    observable = hamiltonians.SymbolicHamiltonian(
+        form=0.5 * X(0) * Z(1) + 0.25 * Y(2) * Y(3) - 0.7 * Z(7)
+    )
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(max_bond_dimension=128, tensor_module="torch")
+    value = backend.expectation(circuit, observable)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_backend_fallback_for_non_adjacent_gate():
+    """compile_circuit=False (default) → falls back to qmatchatea for non-adjacent."""
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(0, 3))
+    observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(max_bond_dimension=32, tensor_module="torch")
+    value = backend.expectation(circuit, observable)
+
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_backend_routes_non_adjacent_with_compile():
+    """Non-adjacent gate with compile_circuit=True goes through Vidal SWAP routing."""
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(0, 3))
+
+    observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=32, tensor_module="torch", compile_circuit=True,
+    )
+    value = backend.expectation(circuit, observable)
+
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_can_route_non_adjacent():
+    """_can_route_non_adjacent correctly identifies routable circuits."""
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(0, 3))
+    assert _can_route_non_adjacent(circuit)
+
+    circuit.add(gates.CNOT(0, 1))
+    assert _can_route_non_adjacent(circuit)
+
+
+def test_cannot_route_multi_qubit():
+    """Circuits with 3+ qubit gates cannot be routed."""
+    circuit = Circuit(3)
+    circuit.add(gates.TOFFOLI(0, 1, 2))
+    assert not _can_route_non_adjacent(circuit)
+
+
+def test_routing_preserves_adjacent_gates():
+    """_route_non_adjacent_gates leaves adjacent gates unchanged."""
+    circuit = build_local_circuit(nqubits=4, nlayers=2)
+    original = list(circuit.queue)
+    routed = _route_non_adjacent_gates(original, 4)
+
+    # Count 2Q gates — should be more due to inserted SWAPs, so just
+    # check that all 2-site gates ARE adjacent.
+    for gate in routed:
+        sites = _gate_sites(gate)
+        if len(sites) == 2:
+            diff = abs(sites[0] - sites[1])
+            assert diff == 1, f"Non-adjacent gate after routing: {gate.name} on {sites}"
+
+
+def test_routing_non_adjacent_cnot():
+    """Manually verify SWAP+CNOT+unSWAP for CNOT(0,3)."""
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    circuit.add(gates.H(3))
+    circuit.add(gates.CNOT(0, 3))
+
+    routed = _route_non_adjacent_gates(list(circuit.queue), 4)
+
+    # Expected: H(0), H(3), SWAP(2,3), SWAP(1,2), routed CNOT on (0,1), SWAP(1,2), SWAP(2,3)
+    names = [getattr(g, "name", g.__class__.__name__) for g in routed]
+    assert names == ["h", "h", "swap", "swap", "routed_two_qubit", "swap", "swap"], f"Got {names}"
+
+    # Verify expectation through full pipeline
+    observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=32, tensor_module="torch", compile_circuit=True,
+    )
+    value = backend.expectation(circuit, observable)
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_truncation_error_no_truncation():
+    """With large bond, truncation error should be essentially zero."""
+    circuit = build_local_circuit(nqubits=6, nlayers=2)
+    observable = hamiltonians.SymbolicHamiltonian(form=0.5 * X(0) * Z(1))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(max_bond_dimension=256, tensor_module="torch")
+    value = backend.expectation(circuit, observable)
+    _ = value  # ensure computation runs
+
+    assert backend.last_truncation_error < 1e-14, (
+        f"Expected near-zero truncation error, got {backend.last_truncation_error}"
+    )
+
+
+def test_vidal_backend_matches_statevector_multiterm():
+    """Multi-term observable with non-adjacent gates, compile_circuit=True."""
+    circuit = Circuit(5)
+    for q in range(5):
+        circuit.add(gates.RY(q, theta=0.7))
+        circuit.add(gates.RZ(q, theta=0.3))
+    circuit.add(gates.CNOT(0, 2))
+    circuit.add(gates.CNOT(1, 4))
+
+    observable = hamiltonians.SymbolicHamiltonian(
+        form=(0.3 * X(0) * Z(2) + 0.7 * Y(1) * Y(4) - 0.5 * Z(0) * X(4))
+    )
+
+    exact_state = circuit().state(numpy=True)
+    exact = observable.expectation_from_state(exact_state)
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=64, tensor_module="torch", compile_circuit=True,
+    )
+    value = backend.expectation(circuit, observable)
+    np.testing.assert_allclose(value, exact, atol=1e-10)
@@ -0,0 +1,18 @@
+# Tools
+
+Auxiliary scripts for profiling, legacy comparisons, and scale probes.
+
+The main CPU expectation entrypoint is `../benchmark_cpu_expectation.py`.
+For the current Vidal/MPS 1D-chain tests, prefer `../run_vidal_mps_cases.sh`.
+
+Files here are intentionally secondary:
+
+- `compare_vidal_backend_qmatchatea.py`: diagnostic comparison against QMatchaTea.
+- `profile_vidal_chrome.py`: PyTorch CPU profiler for the Vidal path.
+- `run_cpu_single_cases.sh`: single-node scale probes.
+- `run_cpu_large_cases.sh`: two-node MPI scale probes.
+- `run_vidal_segment_mpi_scan.sh`: rank/thread scaling scan for Vidal segmented MPI.
+- `baseline_mps_expectation.py`: legacy MPS comparison CLI kept for old commands.
+- `benchmark_tn_mpi.py`, `benchmark_search.py`, `benchmark_slice.py`, `benchmark_contract_sliced.py`, `check_tree.py`: old TN path-search/slicing experiments.
+- `qibojit_reference_expectation.py`: state-vector reference helper.
+- `validate_vidal_mpi_correctness.py`: focused Vidal MPI correctness helper.
@@ -1,62 +1,34 @@
-"""Baseline MPS expectation scan with the qmatchatea backend."""
+"""MPS expectation benchmark for qmatchatea and Vidal backends."""

 import argparse
 import json
 import logging
-import math
+import os
+import socket
 import time

 import numpy as np
-from qibo import Circuit, gates, hamiltonians
-from qibo.symbols import X, Z

+from qibotn.benchmark_cases import (
+    build_circuit as build_benchmark_circuit,
+    exact_pauli_sum,
+    observable_terms,
+    terms_to_dict,
+)
 from qibotn.backends.qmatchatea import QMatchaTeaBackend
 from qibotn.backends.vidal_tebd import run_vidal_ring_xz


 def build_circuit(nqubits, nlayers, seed):
-    import numpy as np
-
-    rng = np.random.default_rng(seed)
-    circuit = Circuit(nqubits)
-    for _ in range(nlayers):
-        for qubit in range(nqubits):
-            circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
-            circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
-        for qubit in range(0, nqubits - 1, 2):
-            circuit.add(gates.CNOT(qubit, qubit + 1))
-        for qubit in range(1, nqubits - 1, 2):
-            circuit.add(gates.CNOT(qubit, qubit + 1))
-    return circuit
+    return build_benchmark_circuit("brickwall_cnot", nqubits, nlayers, seed)


 def build_observable(nqubits):
-    form = 0
-    for qubit in range(nqubits):
-        form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
-    return hamiltonians.SymbolicHamiltonian(form=form)
+    return terms_to_dict(observable_terms("ring_xz", nqubits))


 def exact_expectation(circuit, nqubits):
-    import numpy as np
-
-    state = circuit().state(numpy=True).reshape(-1)
-
-    value = 0.0
-    chunk_size = 1 << 20
-    for qubit in range(nqubits):
-        next_qubit = (qubit + 1) % nqubits
-        x_flip = 1 << (nqubits - 1 - qubit)
-        z_shift = nqubits - 1 - next_qubit
-        term = 0.0
-        for start in range(0, state.size, chunk_size):
-            stop = min(start + chunk_size, state.size)
-            indices = np.arange(start, stop, dtype=np.int64)
-            z_bit = (indices >> z_shift) & 1
-            z_phase = 1 - 2 * z_bit
-            term += np.vdot(state[indices ^ x_flip], z_phase * state[start:stop]).real
-        value += 0.5 * term
-    return float(value)
+    return exact_pauli_sum(circuit, observable_terms("ring_xz", nqubits), nqubits)


 def main():
@@ -68,16 +40,21 @@ def main():
    parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="torch")
    parser.add_argument("--torch-threads", type=int, default=32)
    parser.add_argument(
-        "--executor", choices=("qmatchatea", "vidal"), default="qmatchatea"
+        "--executor",
+        choices=("qmatchatea", "vidal", "vidal-mpi"),
+        default="qmatchatea",
    )
-    parser.add_argument("--vidal-workers", type=int, default=1)
-    parser.add_argument("--vidal-batched", action="store_true")
    parser.add_argument("--mpi-ct", action="store_true")
    parser.add_argument("--mpi-barriers", type=int, default=-1)
    parser.add_argument("--mpi-isometrization", type=int, default=-1)
    parser.add_argument("--exact", action="store_true")
    parser.add_argument("--exact-max-qubits", type=int, default=24)
    parser.add_argument("--reference-file")
+    parser.add_argument(
+        "--mpi-rank-map",
+        action="store_true",
+        help="Print MPI rank, host, pid, and torch thread placement metadata.",
+    )
    args = parser.parse_args()
    logging.getLogger("qibo.config").setLevel(logging.ERROR)
    logging.getLogger("qtealeaves").setLevel(logging.ERROR)
@@ -91,6 +68,26 @@ def main():

        rank = MPI.COMM_WORLD.Get_rank()
        size = MPI.COMM_WORLD.Get_size()
+        if args.mpi_rank_map:
+            rank_info = {
+                "rank": rank,
+                "size": size,
+                "host": socket.gethostname(),
+                "pid": os.getpid(),
+                "torch_threads": args.torch_threads,
+                "omp_num_threads": os.environ.get("OMP_NUM_THREADS", ""),
+                "mkl_num_threads": os.environ.get("MKL_NUM_THREADS", ""),
+            }
+            rank_infos = MPI.COMM_WORLD.gather(rank_info, root=0)
+            if rank == 0:
+                print("mpi_rank_map")
+                for item in sorted(rank_infos, key=lambda row: row["rank"]):
+                    print(
+                        "rank={rank} size={size} host={host} pid={pid} "
+                        "torch_threads={torch_threads} "
+                        "OMP_NUM_THREADS={omp_num_threads} "
+                        "MKL_NUM_THREADS={mkl_num_threads}".format(**item)
+                    )

    circuit = build_circuit(args.nqubits, args.nlayers, args.seed)
    observable = build_observable(args.nqubits)
@@ -106,30 +103,42 @@ def main():
        exact = exact_expectation(circuit, args.nqubits)

    if rank == 0:
-        mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR"
+        if args.mpi_ct and args.executor in ("vidal", "vidal-mpi"):
+            mpi_label = f"VidalSegment/{size}"
+        else:
+            mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR"
        print(
            f"nqubits={args.nqubits} nlayers={args.nlayers} "
            f"bond={args.bond} seed={args.seed} "
            f"tensor_module={args.tensor_module} svd_control=E! "
-            f"compile_circuit=True mpi={mpi_label} executor={args.executor} "
-            f"vidal_workers={args.vidal_workers} vidal_batched={args.vidal_batched}"
+            f"compile_circuit=True mpi={mpi_label} executor={args.executor}"
        )
        if exact is not None:
            print(f"exact={exact:.16e}")
        print("expval abs_error rel_error seconds")

    start = time.perf_counter()
-    if args.executor == "vidal":
+    timings = None
+    if args.executor in ("vidal", "vidal-mpi"):
+        if args.executor == "vidal-mpi" and not args.mpi_ct:
+            raise ValueError("--executor vidal-mpi requires --mpi-ct.")
        if args.mpi_ct:
-            raise ValueError("--executor vidal is a single-process executor.")
-        value = run_vidal_ring_xz(
-            circuit,
-            max_bond=args.bond,
-            cut_ratio=1e-12,
-            tensor_module=args.tensor_module,
-            workers=args.vidal_workers,
-            use_batched=args.vidal_batched,
-        )
+            from qibotn.backends.vidal_mpi_segment import run_segment_vidal_mpi_ring_xz
+
+            value, timings = run_segment_vidal_mpi_ring_xz(
+                circuit,
+                max_bond=args.bond,
+                cut_ratio=1e-12,
+                tensor_module=args.tensor_module,
+                comm=MPI.COMM_WORLD,
+            )
+        else:
+            value = run_vidal_ring_xz(
+                circuit,
+                max_bond=args.bond,
+                cut_ratio=1e-12,
+                tensor_module=args.tensor_module,
+            )
    else:
        backend = QMatchaTeaBackend()
        backend.configure_tn_simulation(
@@ -151,6 +160,12 @@ def main():
            preprocess=False,
            compile_circuit=True,
        )
+    max_timings = None
+    if timings:
+        max_timings = {
+            key: MPI.COMM_WORLD.reduce(local_value, op=MPI.MAX, root=0)
+            for key, local_value in timings.items()
+        }
    if rank != 0:
        return
    value = float(np.real(value))
@@ -158,6 +173,10 @@ def main():
    abs_error = float("nan") if exact is None else abs(value - exact)
    rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
    print(f"{value:.16e} {abs_error:.6e} {rel_error:.6e} {elapsed:.3f}")
+    if max_timings:
+        print("timing_section max_seconds")
+        for key, max_value in max_timings.items():
+            print(f"{key} {max_value:.6f}")


 if __name__ == "__main__":
@@ -0,0 +1,137 @@
+"""Compare QMatchaTeaBackend with the VidalBackend fast path."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import time
+
+import numpy as np
+import torch
+from qibo import Circuit, gates, hamiltonians
+from qibo.symbols import X, Y, Z
+
+from qibotn.backends.qmatchatea import QMatchaTeaBackend
+from qibotn.backends.vidal import VidalBackend
+
+
+def build_circuit(nqubits, nlayers, seed, kind):
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+    for layer in range(nlayers):
+        for q in range(nqubits):
+            circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
+            circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
+        if kind == "brickwall":
+            for q in range(0, nqubits - 1, 2):
+                circuit.add(gates.CNOT(q, q + 1))
+            for q in range(1, nqubits - 1, 2):
+                circuit.add(gates.CNOT(q, q + 1))
+        elif kind == "shifted-cz":
+            for q in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.CZ(q, q + 1))
+        elif kind == "reversed-cnot":
+            for q in range(0, nqubits - 1, 2):
+                circuit.add(gates.CNOT(q + 1, q))
+            for q in range(1, nqubits - 1, 2):
+                circuit.add(gates.CNOT(q, q + 1))
+        else:
+            raise ValueError(f"Unknown circuit kind {kind!r}.")
+    return circuit
+
+
+def build_observable(nqubits, kind):
+    form = 0
+    if kind == "ring-xz":
+        for q in range(nqubits):
+            form += 0.5 * X(q) * Z((q + 1) % nqubits)
+    elif kind == "open-zz":
+        for q in range(nqubits - 1):
+            form += Z(q) * Z(q + 1) / (nqubits - 1)
+    elif kind == "mixed":
+        form += 0.25 * X(0) - 0.5 * Z(nqubits - 1)
+        for q in range(0, nqubits - 1, 3):
+            form += 0.125 * Y(q) * Y(q + 1)
+    else:
+        raise ValueError(f"Unknown observable kind {kind!r}.")
+    return hamiltonians.SymbolicHamiltonian(form=form)
+
+
+def run_backend(backend, circuit, observable):
+    start = time.perf_counter()
+    value = backend.expectation(circuit, observable, preprocess=False, compile_circuit=True)
+    return float(np.real(value)), time.perf_counter() - start
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=34)
+    parser.add_argument("--nlayers", type=int, default=20)
+    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch")
+    parser.add_argument("--torch-threads", type=int, default=32)
+    parser.add_argument(
+        "--circuit-kind",
+        choices=("brickwall", "shifted-cz", "reversed-cnot"),
+        default="brickwall",
+    )
+    parser.add_argument(
+        "--observable-kind",
+        choices=("ring-xz", "open-zz", "mixed"),
+        default="ring-xz",
+    )
+    parser.add_argument("--reference-file")
+    parser.add_argument("--skip-qmatchatea", action="store_true")
+    args = parser.parse_args()
+
+    torch.set_num_threads(args.torch_threads)
+    circuit = build_circuit(args.nqubits, args.nlayers, args.seed, args.circuit_kind)
+    observable = build_observable(args.nqubits, args.observable_kind)
+
+    exact = None
+    if args.reference_file:
+        with open(args.reference_file, "r", encoding="utf-8") as f:
+            exact = float(json.load(f)["expectation"])
+
+    print(
+        f"nqubits={args.nqubits} nlayers={args.nlayers} bond={args.bond} "
+        f"circuit={args.circuit_kind} observable={args.observable_kind} "
+        f"tensor_module={args.tensor_module} torch_threads={args.torch_threads}"
+    )
+    if exact is not None:
+        print(f"exact={exact:.16e}")
+    print("backend value abs_error seconds")
+
+    if not args.skip_qmatchatea:
+        qmt = QMatchaTeaBackend()
+        qmt.configure_tn_simulation(
+            ansatz="MPS",
+            max_bond_dimension=args.bond,
+            cut_ratio=1e-12,
+            svd_control="E!",
+            tensor_module=args.tensor_module,
+            compile_circuit=True,
+            track_memory=False,
+        )
+        value, seconds = run_backend(qmt, circuit, observable)
+        error = float("nan") if exact is None else abs(value - exact)
+        print(f"qmatchatea {value:.16e} {error:.6e} {seconds:.3f}")
+
+    vidal = VidalBackend()
+    vidal.configure_tn_simulation(
+        ansatz="MPS",
+        max_bond_dimension=args.bond,
+        cut_ratio=1e-12,
+        tensor_module=args.tensor_module,
+        compile_circuit=True,
+        fallback=True,
+    )
+    value, seconds = run_backend(vidal, circuit, observable)
+    error = float("nan") if exact is None else abs(value - exact)
+    print(f"vidal {value:.16e} {error:.6e} {seconds:.3f}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,72 @@
+"""Chrome trace profiler for the VidalBackend fast path."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+import torch
+from torch.profiler import ProfilerActivity, profile
+
+from qibotn.benchmark_cases import build_circuit, terms_to_dict, observable_terms
+from qibotn.expectation_runner import ExpectationConfig, run_cpu_expectation
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=34)
+    parser.add_argument("--nlayers", type=int, default=20)
+    parser.add_argument("--bond", type=int, default=512)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--torch-threads", type=int, default=32)
+    parser.add_argument("--cut-ratio", type=float, default=1e-12)
+    parser.add_argument("--profile-memory", action="store_true")
+    parser.add_argument("--rows", type=int, default=60)
+    args = parser.parse_args()
+
+    torch.set_num_threads(args.torch_threads)
+
+    prefix = f"profiles/vidal_n{args.nqubits}_l{args.nlayers}_b{args.bond}_t{args.torch_threads}"
+    trace_path = Path(f"{prefix}.json")
+    table_path = Path(f"{prefix}.txt")
+    trace_path.parent.mkdir(parents=True, exist_ok=True)
+
+    circuit = build_circuit("brickwall_cnot", args.nqubits, args.nlayers, args.seed)
+    observable = terms_to_dict(observable_terms("ring_xz", args.nqubits))
+    config = ExpectationConfig(
+        ansatz="mps",
+        bond=args.bond,
+        cut_ratio=args.cut_ratio,
+        tensor_module="torch",
+        torch_threads=args.torch_threads,
+    )
+
+    print(
+        f"profile vidal nqubits={args.nqubits} nlayers={args.nlayers} "
+        f"bond={args.bond} threads={args.torch_threads}"
+    )
+
+    with profile(
+        activities=[ProfilerActivity.CPU],
+        record_shapes=args.profile_memory,
+        profile_memory=args.profile_memory,
+        with_stack=args.profile_memory,
+    ) as prof:
+        result = run_cpu_expectation(circuit, observable, config)
+
+    table = (
+        f"expval={result.value:.16e}\n\n"
+        f"# sorted by self_cpu_time_total\n"
+        f"{prof.key_averages().table(sort_by='self_cpu_time_total', row_limit=args.rows)}\n\n"
+        f"# sorted by cpu_time_total\n"
+        f"{prof.key_averages().table(sort_by='cpu_time_total', row_limit=args.rows)}\n"
+    )
+
+    print(table, end="")
+    table_path.write_text(table, encoding="utf-8")
+    prof.export_chrome_trace(str(trace_path))
+    print(f"trace={trace_path}\ntable={table_path}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,127 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Large CPU expectation benchmarks for two-server runs.
+#
+# Defaults assume two Intel Xeon Platinum 8558P servers with about 500 GiB RAM
+# each.  Override HOSTFILE, PYTHON_BIN, MPIEXEC, or the per-case knobs below as
+# needed.
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT_DIR"
+
+PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
+MPIEXEC="${MPIEXEC:-mpiexec}"
+HOSTFILE="${HOSTFILE:-hostfile}"
+
+MPS_RANKS="${MPS_RANKS:-8}"
+MPS_THREADS="${MPS_THREADS:-12}"
+TN_RANKS="${TN_RANKS:-12}"
+TN_THREADS="${TN_THREADS:-8}"
+
+export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
+export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
+
+run_mpi() {
+  local ranks="$1"
+  shift
+  "$MPIEXEC" -hostfile "$HOSTFILE" -n "$ranks" "$PYTHON_BIN" "$@"
+}
+
+run_case() {
+  local title="$1"
+  shift
+  echo
+  echo "================================================================================"
+  echo "$title"
+  echo "================================================================================"
+  echo "HOSTFILE=$HOSTFILE PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
+  echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
+  echo "$*"
+  "$@"
+}
+
+case "${1:-help}" in
+  smoke)
+    run_case "MPS MPI smoke: n=40 layers=30 bond=2048" \
+      run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
+        --mpi --mps \
+        --nqubits "${MPS_SMOKE_NQ:-40}" \
+        --nlayers "${MPS_SMOKE_LAYERS:-30}" \
+        --bond "${MPS_SMOKE_BOND:-2048}" \
+        --torch-threads "$MPS_THREADS" \
+        --circuits brickwall_cnot reversed_cnot shifted_cz \
+        --observables ring_xz open_zz range2_xx
+
+    run_case "TN MPI smoke: n=32 layers=16 target_slices=12" \
+      run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \
+        --mpi \
+        --nqubits "${TN_SMOKE_NQ:-32}" \
+        --nlayers "${TN_SMOKE_LAYERS:-16}" \
+        --torch-threads "$TN_THREADS" \
+        --circuits brickwall_cnot shifted_cz rxx_rzz \
+        --observables ring_xz open_zz range2_xx \
+        --tn-target-slices "${TN_SMOKE_SLICES:-12}"
+    ;;
+
+  mps-long)
+    run_case "MPS MPI long: n=64 layers=48 bond=4096" \
+      run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
+        --mpi --mps \
+        --nqubits "${MPS_LONG_NQ:-64}" \
+        --nlayers "${MPS_LONG_LAYERS:-48}" \
+        --bond "${MPS_LONG_BOND:-4096}" \
+        --torch-threads "$MPS_THREADS" \
+        --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
+        --observables ring_xz open_zz mixed_local range2_xx
+    ;;
+
+  mps-pressure)
+    run_case "MPS MPI pressure: n=80 layers=64 bond=4096" \
+      run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
+        --mpi --mps \
+        --nqubits "${MPS_PRESSURE_NQ:-80}" \
+        --nlayers "${MPS_PRESSURE_LAYERS:-64}" \
+        --bond "${MPS_PRESSURE_BOND:-4096}" \
+        --torch-threads "$MPS_THREADS" \
+        --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz swap_scramble \
+        --observables ring_xz open_zz mixed_local range2_xx long_z_string
+    ;;
+
+  tn-long)
+    run_case "TN MPI long: n=36 layers=20 target_slices=24" \
+      run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \
+        --mpi \
+        --nqubits "${TN_LONG_NQ:-36}" \
+        --nlayers "${TN_LONG_LAYERS:-20}" \
+        --torch-threads "$TN_THREADS" \
+        --circuits brickwall_cnot shifted_cz rxx_rzz \
+        --observables ring_xz open_zz range2_xx \
+        --tn-target-slices "${TN_LONG_SLICES:-24}"
+    ;;
+
+  all)
+    "$0" smoke
+    "$0" mps-long
+    "$0" tn-long
+    ;;
+
+  help|*)
+    cat >&2 <<'EOF'
+Usage: tools/run_cpu_large_cases.sh [smoke|mps-long|mps-pressure|tn-long|all]
+
+Common overrides:
+  HOSTFILE=hostfile
+  PYTHON_BIN=.venv/bin/python
+  MPIEXEC=mpiexec
+  MPS_RANKS=8 MPS_THREADS=12
+  TN_RANKS=12 TN_THREADS=8
+
+Scale overrides:
+  MPS_LONG_NQ=64 MPS_LONG_LAYERS=48 MPS_LONG_BOND=4096
+  MPS_PRESSURE_NQ=80 MPS_PRESSURE_LAYERS=64 MPS_PRESSURE_BOND=4096
+  TN_LONG_NQ=36 TN_LONG_LAYERS=20 TN_LONG_SLICES=24
+EOF
+    exit 2
+    ;;
+esac
@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Single-node CPU scale probes for expectation benchmarks.
+#
+# Intended for one 96-core / ~500 GiB RAM node.  The default "probe" mode runs
+# moderate MPS and TN cases first.  Larger modes are available after checking
+# runtime and memory from the probe output.
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT_DIR"
+
+PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
+PYTHON_FLAGS="${PYTHON_FLAGS:--u}"
+MPIEXEC="${MPIEXEC:-mpiexec}"
+TIME_BIN="${TIME_BIN:-/usr/bin/time}"
+
+MPS_RANKS="${MPS_RANKS:-8}"
+MPS_THREADS="${MPS_THREADS:-12}"
+TN_RANKS="${TN_RANKS:-8}"
+TN_THREADS="${TN_THREADS:-12}"
+
+export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
+export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
+
+estimate_mps_memory() {
+  local nqubits="$1"
+  local bond="$2"
+  "$PYTHON_BIN" - "$nqubits" "$bond" "$MPS_RANKS" <<'PY'
+import sys
+n = int(sys.argv[1])
+chi = int(sys.argv[2])
+ranks = int(sys.argv[3])
+resident = n * 2 * chi * chi * 16
+per_rank = resident / ranks
+print(
+    "MPS rough resident memory: "
+    f"total={resident / 1024**3:.1f} GiB "
+    f"per_rank={per_rank / 1024**3:.1f} GiB "
+    "(temporary eig/SVD workspaces are additional)"
+)
+PY
+}
+
+run_timed() {
+  echo
+  echo "--------------------------------------------------------------------------------"
+  echo "$*"
+  echo "--------------------------------------------------------------------------------"
+  "$TIME_BIN" -v "$@"
+}
+
+run_mps_case() {
+  local label="$1"
+  local nqubits="$2"
+  local nlayers="$3"
+  local bond="$4"
+  shift 4
+  echo
+  echo "================================================================================"
+  echo "$label"
+  echo "================================================================================"
+  echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
+  echo "MPS_RANKS=$MPS_RANKS MPS_THREADS=$MPS_THREADS"
+  echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
+  estimate_mps_memory "$nqubits" "$bond"
+  run_timed "$MPIEXEC" -n "$MPS_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \
+    --mpi --mps \
+    --nqubits "$nqubits" \
+    --nlayers "$nlayers" \
+    --bond "$bond" \
+    --torch-threads "$MPS_THREADS" \
+    "$@"
+}
+
+run_tn_case() {
+  local label="$1"
+  local nqubits="$2"
+  local nlayers="$3"
+  shift 3
+  echo
+  echo "================================================================================"
+  echo "$label"
+  echo "================================================================================"
+  echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
+  echo "TN_RANKS=$TN_RANKS TN_THREADS=$TN_THREADS"
+  echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
+  echo "TN memory is contraction-tree dependent; increase --tn-target-slices if RSS is high."
+  run_timed "$MPIEXEC" -n "$TN_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \
+    --mpi \
+    --nqubits "$nqubits" \
+    --nlayers "$nlayers" \
+    --torch-threads "$TN_THREADS" \
+    "$@"
+}
+
+case "${1:-help}" in
+  probe)
+    run_mps_case "MPS probe: n=40 layers=30 bond=2048" 40 30 2048 \
+      --circuits brickwall_cnot \
+      --observables ring_xz
+
+    run_tn_case "TN probe: n=28 layers=12 target_slices=8" 28 12 \
+      --circuits brickwall_cnot \
+      --observables ring_xz \
+      --tn-target-slices 8
+    ;;
+
+  mps-medium)
+    run_mps_case "MPS medium: n=56 layers=40 bond=3072" 56 40 3072 \
+      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
+      --observables ring_xz open_zz mixed_local range2_xx
+    ;;
+
+  mps-long)
+    run_mps_case "MPS long: n=64 layers=48 bond=4096" 64 48 4096 \
+      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
+      --observables ring_xz open_zz mixed_local range2_xx
+    ;;
+
+  tn-medium)
+    run_tn_case "TN medium: n=32 layers=16 target_slices=16" 32 16 \
+      --circuits brickwall_cnot shifted_cz rxx_rzz \
+      --observables ring_xz open_zz range2_xx \
+      --tn-target-slices 16
+    ;;
+
+  tn-long)
+    run_tn_case "TN long: n=36 layers=20 target_slices=32" 36 20 \
+      --circuits brickwall_cnot shifted_cz rxx_rzz \
+      --observables ring_xz open_zz range2_xx \
+      --tn-target-slices 32
+    ;;
+
+  help|*)
+    cat >&2 <<'EOF'
+Usage: tools/run_cpu_single_cases.sh [probe|mps-medium|mps-long|tn-medium|tn-long]
+
+Common overrides:
+  PYTHON_BIN=.venv/bin/python
+  MPIEXEC=mpiexec
+  MPS_RANKS=8 MPS_THREADS=12
+  TN_RANKS=8 TN_THREADS=12
+  OMP_NUM_THREADS=1 MKL_NUM_THREADS=1
+EOF
+    exit 2
+    ;;
+esac
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+NQ="${NQ:-34}"
+LAYERS="${LAYERS:-20}"
+BOND="${BOND:-512}"
+SEED="${SEED:-42}"
+RANKS="${RANKS:-1 2 4}"
+THREADS="${THREADS:-32 32 16}"
+PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
+MPIEXEC="${MPIEXEC:-mpiexec}"
+CIRCUIT="${CIRCUIT:-brickwall_cnot}"
+OBSERVABLE="${OBSERVABLE:-ring_xz}"
+EXACT="${EXACT:-0}"
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT_DIR"
+
+if [[ "${1:-help}" != "run" ]]; then
+  cat >&2 <<'EOF'
+Usage: tools/run_vidal_segment_mpi_scan.sh run
+
+Overrides:
+  NQ=34 LAYERS=20 BOND=512 SEED=42
+  RANKS="1 2 4" THREADS="32 32 16"
+  CIRCUIT=brickwall_cnot OBSERVABLE=ring_xz
+  EXACT=1
+  PYTHON_BIN=.venv/bin/python MPIEXEC=mpiexec
+EOF
+  if [[ "${1:-help}" == "help" ]]; then
+    exit 0
+  fi
+  exit 2
+fi
+
+read -r -a ranks <<< "$RANKS"
+read -r -a threads <<< "$THREADS"
+
+if [[ "${#ranks[@]}" != "${#threads[@]}" ]]; then
+  echo "RANKS and THREADS must have the same number of entries." >&2
+  exit 2
+fi
+
+common=(
+  --nqubits "$NQ"
+  --nlayers "$LAYERS"
+  --bond "$BOND"
+  --seed "$SEED"
+  --mps
+  --circuits "$CIRCUIT"
+  --observables "$OBSERVABLE"
+)
+
+if [[ "$EXACT" == "1" ]]; then
+  common+=(--exact)
+fi
+
+for idx in "${!ranks[@]}"; do
+  nrank="${ranks[$idx]}"
+  nthr="${threads[$idx]}"
+  if [[ "$nrank" == "1" ]]; then
+    echo "== Vidal serial ranks=1 torch_threads=$nthr =="
+    "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
+      "${common[@]}" --torch-threads "$nthr"
+  else
+    echo "== Vidal segmented MPI ranks=$nrank torch_threads=$nthr =="
+    "$MPIEXEC" -n "$nrank" "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
+      "${common[@]}" --torch-threads "$nthr" --mpi
+  fi
+done
@@ -0,0 +1,202 @@
+"""Correctness checks for the Vidal/TEBD MPS fast path.
+
+The cases here intentionally cover more than the benchmark ring-XZ observable:
+different nearest-neighbor gate orientations and several Pauli-sum observables.
+Run serially to compare qibojit/statevector vs Vidal, or under MPI to compare
+the segmented Vidal executor.
+"""
+
+from __future__ import annotations
+
+import argparse
+import math
+import time
+
+import numpy as np
+import torch
+from qibo import Circuit, gates
+
+from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
+from qibotn.backends.vidal_tebd import VidalTEBDExecutor
+
+
+def build_circuit(kind, nqubits, nlayers, seed):
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+    for layer in range(nlayers):
+        for q in range(nqubits):
+            circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
+            circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
+            if kind == "rx_ry_cz":
+                circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi)))
+
+        if kind in ("brickwall", "reversed_cnot"):
+            for q in range(0, nqubits - 1, 2):
+                if kind == "reversed_cnot" and (layer % 2):
+                    circuit.add(gates.CNOT(q + 1, q))
+                else:
+                    circuit.add(gates.CNOT(q, q + 1))
+            for q in range(1, nqubits - 1, 2):
+                if kind == "reversed_cnot" and not (layer % 2):
+                    circuit.add(gates.CNOT(q + 1, q))
+                else:
+                    circuit.add(gates.CNOT(q, q + 1))
+        elif kind == "rx_ry_cz":
+            for q in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.CZ(q, q + 1))
+        else:
+            raise ValueError(f"Unknown circuit kind {kind!r}.")
+    return circuit
+
+
+def observable_terms(kind, nqubits):
+    if kind == "ring_xz":
+        return [
+            (0.5, (("X", site), ("Z", (site + 1) % nqubits)))
+            for site in range(nqubits)
+        ]
+    if kind == "open_zz":
+        return [
+            (1.0 / (nqubits - 1), (("Z", site), ("Z", site + 1)))
+            for site in range(nqubits - 1)
+        ]
+    if kind == "mixed_local":
+        terms = [(0.25, (("X", 0),)), (-0.5, (("Z", nqubits - 1),))]
+        terms += [
+            (0.125, (("Y", site), ("Y", site + 1)))
+            for site in range(0, nqubits - 1, 3)
+        ]
+        return terms
+    raise ValueError(f"Unknown observable kind {kind!r}.")
+
+
+def exact_pauli_sum(circuit, terms, nqubits):
+    state = circuit().state(numpy=True).reshape(-1)
+    indices = np.arange(state.size, dtype=np.int64)
+    value = 0.0 + 0.0j
+    for coeff, ops in terms:
+        flipped = indices.copy()
+        phase = np.ones(state.size, dtype=np.complex128)
+        for name, site in ops:
+            shift = nqubits - 1 - site
+            bit = (indices >> shift) & 1
+            name = name.upper()
+            if name == "X":
+                flipped ^= 1 << shift
+            elif name == "Y":
+                flipped ^= 1 << shift
+                phase *= 1j * (1 - 2 * bit)
+            elif name == "Z":
+                phase *= 1 - 2 * bit
+            elif name != "I":
+                raise ValueError(f"Unsupported Pauli {name!r}.")
+        value += coeff * np.vdot(state[flipped], phase * state)
+    return float(value.real)
+
+
+def run_vidal(circuit, terms, nqubits, bond, tensor_module):
+    executor = VidalTEBDExecutor(
+        nqubits=nqubits,
+        max_bond=bond,
+        cut_ratio=1e-12,
+        tensor_module=tensor_module,
+    )
+    executor.run_circuit(circuit)
+    return float(executor.expectation_pauli_sum(terms))
+
+
+def run_segment_mpi(circuit, terms, nqubits, bond, tensor_module, comm):
+    executor = SegmentVidalMPIExecutor(
+        nqubits=nqubits,
+        max_bond=bond,
+        cut_ratio=1e-12,
+        tensor_module=tensor_module,
+        comm=comm,
+    )
+    executor.run_circuit(circuit)
+    return executor.expectation_pauli_sum_root(terms)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=16)
+    parser.add_argument("--nlayers", type=int, default=6)
+    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch")
+    parser.add_argument("--torch-threads", type=int, default=32)
+    parser.add_argument("--mpi", action="store_true")
+    parser.add_argument(
+        "--circuits",
+        nargs="+",
+        default=("brickwall", "reversed_cnot", "rx_ry_cz"),
+    )
+    parser.add_argument(
+        "--observables",
+        nargs="+",
+        default=("ring_xz", "open_zz", "mixed_local"),
+    )
+    args = parser.parse_args()
+
+    torch.set_num_threads(args.torch_threads)
+    comm = None
+    rank = 0
+    size = 1
+    if args.mpi:
+        from mpi4py import MPI
+
+        comm = MPI.COMM_WORLD
+        rank = comm.Get_rank()
+        size = comm.Get_size()
+
+    if rank == 0:
+        mode = f"vidal-segment-mpi/{size}" if args.mpi else "vidal"
+        print(
+            f"mode={mode} nqubits={args.nqubits} nlayers={args.nlayers} "
+            f"bond={args.bond} tensor_module={args.tensor_module}"
+        )
+        print("circuit observable exact value abs_error seconds")
+
+    for circuit_kind in args.circuits:
+        circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed)
+        exact = None
+        if rank == 0:
+            exact_values = {
+                obs: exact_pauli_sum(
+                    circuit, observable_terms(obs, args.nqubits), args.nqubits
+                )
+                for obs in args.observables
+            }
+        else:
+            exact_values = None
+        if comm is not None:
+            exact_values = comm.bcast(exact_values, root=0)
+
+        for obs_kind in args.observables:
+            terms = observable_terms(obs_kind, args.nqubits)
+            start = time.perf_counter()
+            if args.mpi:
+                value = run_segment_mpi(
+                    circuit,
+                    terms,
+                    args.nqubits,
+                    args.bond,
+                    args.tensor_module,
+                    comm,
+                )
+            else:
+                value = run_vidal(
+                    circuit, terms, args.nqubits, args.bond, args.tensor_module
+                )
+            if rank != 0:
+                continue
+            elapsed = time.perf_counter() - start
+            exact = exact_values[obs_kind]
+            print(
+                f"{circuit_kind} {obs_kind} {exact:.16e} {value:.16e} "
+                f"{abs(value - exact):.6e} {elapsed:.3f}"
+            )
+
+
+if __name__ == "__main__":
+    main()