完善mps的vidal机制,多节点并行;补充tn搜索时dask集群搜索的方式

2026-05-12 15:44:19 +08:00
parent aa122964b4
commit 72f95599bb
32 changed files with 3529 additions and 320 deletions
--- a/tools/README.md
+++ b/tools/README.md
@@ -0,0 +1,18 @@
+# Tools
+
+Auxiliary scripts for profiling, legacy comparisons, and scale probes.
+
+The main CPU expectation entrypoint is `../benchmark_cpu_expectation.py`.
+For the current Vidal/MPS 1D-chain tests, prefer `../run_vidal_mps_cases.sh`.
+
+Files here are intentionally secondary:
+
+- `compare_vidal_backend_qmatchatea.py`: diagnostic comparison against QMatchaTea.
+- `profile_vidal_chrome.py`: PyTorch CPU profiler for the Vidal path.
+- `run_cpu_single_cases.sh`: single-node scale probes.
+- `run_cpu_large_cases.sh`: two-node MPI scale probes.
+- `run_vidal_segment_mpi_scan.sh`: rank/thread scaling scan for Vidal segmented MPI.
+- `baseline_mps_expectation.py`: legacy MPS comparison CLI kept for old commands.
+- `benchmark_tn_mpi.py`, `benchmark_search.py`, `benchmark_slice.py`, `benchmark_contract_sliced.py`, `check_tree.py`: old TN path-search/slicing experiments.
+- `qibojit_reference_expectation.py`: state-vector reference helper.
+- `validate_vidal_mpi_correctness.py`: focused Vidal MPI correctness helper.
--- a/tools/baseline_mps_expectation.py
+++ b/tools/baseline_mps_expectation.py
@@ -0,0 +1,183 @@
+"""MPS expectation benchmark for qmatchatea and Vidal backends."""
+
+import argparse
+import json
+import logging
+import os
+import socket
+import time
+
+import numpy as np
+
+from qibotn.benchmark_cases import (
+    build_circuit as build_benchmark_circuit,
+    exact_pauli_sum,
+    observable_terms,
+    terms_to_dict,
+)
+from qibotn.backends.qmatchatea import QMatchaTeaBackend
+from qibotn.backends.vidal_tebd import run_vidal_ring_xz
+
+
+def build_circuit(nqubits, nlayers, seed):
+    return build_benchmark_circuit("brickwall_cnot", nqubits, nlayers, seed)
+
+
+def build_observable(nqubits):
+    return terms_to_dict(observable_terms("ring_xz", nqubits))
+
+
+def exact_expectation(circuit, nqubits):
+    return exact_pauli_sum(circuit, observable_terms("ring_xz", nqubits), nqubits)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=40)
+    parser.add_argument("--nlayers", type=int, default=30)
+    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="torch")
+    parser.add_argument("--torch-threads", type=int, default=32)
+    parser.add_argument(
+        "--executor",
+        choices=("qmatchatea", "vidal", "vidal-mpi"),
+        default="qmatchatea",
+    )
+    parser.add_argument("--mpi-ct", action="store_true")
+    parser.add_argument("--mpi-barriers", type=int, default=-1)
+    parser.add_argument("--mpi-isometrization", type=int, default=-1)
+    parser.add_argument("--exact", action="store_true")
+    parser.add_argument("--exact-max-qubits", type=int, default=24)
+    parser.add_argument("--reference-file")
+    parser.add_argument(
+        "--mpi-rank-map",
+        action="store_true",
+        help="Print MPI rank, host, pid, and torch thread placement metadata.",
+    )
+    args = parser.parse_args()
+    logging.getLogger("qibo.config").setLevel(logging.ERROR)
+    logging.getLogger("qtealeaves").setLevel(logging.ERROR)
+    import torch
+
+    torch.set_num_threads(args.torch_threads)
+    rank = 0
+    size = 1
+    if args.mpi_ct:
+        from mpi4py import MPI
+
+        rank = MPI.COMM_WORLD.Get_rank()
+        size = MPI.COMM_WORLD.Get_size()
+        if args.mpi_rank_map:
+            rank_info = {
+                "rank": rank,
+                "size": size,
+                "host": socket.gethostname(),
+                "pid": os.getpid(),
+                "torch_threads": args.torch_threads,
+                "omp_num_threads": os.environ.get("OMP_NUM_THREADS", ""),
+                "mkl_num_threads": os.environ.get("MKL_NUM_THREADS", ""),
+            }
+            rank_infos = MPI.COMM_WORLD.gather(rank_info, root=0)
+            if rank == 0:
+                print("mpi_rank_map")
+                for item in sorted(rank_infos, key=lambda row: row["rank"]):
+                    print(
+                        "rank={rank} size={size} host={host} pid={pid} "
+                        "torch_threads={torch_threads} "
+                        "OMP_NUM_THREADS={omp_num_threads} "
+                        "MKL_NUM_THREADS={mkl_num_threads}".format(**item)
+                    )
+
+    circuit = build_circuit(args.nqubits, args.nlayers, args.seed)
+    observable = build_observable(args.nqubits)
+    exact = None
+    if args.reference_file:
+        with open(args.reference_file, "r", encoding="utf-8") as f:
+            exact = float(json.load(f)["expectation"])
+    elif args.exact:
+        if args.nqubits > args.exact_max_qubits:
+            raise ValueError(
+                f"--exact is limited to {args.exact_max_qubits} qubits by default."
+            )
+        exact = exact_expectation(circuit, args.nqubits)
+
+    if rank == 0:
+        if args.mpi_ct and args.executor in ("vidal", "vidal-mpi"):
+            mpi_label = f"VidalSegment/{size}"
+        else:
+            mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR"
+        print(
+            f"nqubits={args.nqubits} nlayers={args.nlayers} "
+            f"bond={args.bond} seed={args.seed} "
+            f"tensor_module={args.tensor_module} svd_control=E! "
+            f"compile_circuit=True mpi={mpi_label} executor={args.executor}"
+        )
+        if exact is not None:
+            print(f"exact={exact:.16e}")
+        print("expval abs_error rel_error seconds")
+
+    start = time.perf_counter()
+    timings = None
+    if args.executor in ("vidal", "vidal-mpi"):
+        if args.executor == "vidal-mpi" and not args.mpi_ct:
+            raise ValueError("--executor vidal-mpi requires --mpi-ct.")
+        if args.mpi_ct:
+            from qibotn.backends.vidal_mpi_segment import run_segment_vidal_mpi_ring_xz
+
+            value, timings = run_segment_vidal_mpi_ring_xz(
+                circuit,
+                max_bond=args.bond,
+                cut_ratio=1e-12,
+                tensor_module=args.tensor_module,
+                comm=MPI.COMM_WORLD,
+            )
+        else:
+            value = run_vidal_ring_xz(
+                circuit,
+                max_bond=args.bond,
+                cut_ratio=1e-12,
+                tensor_module=args.tensor_module,
+            )
+    else:
+        backend = QMatchaTeaBackend()
+        backend.configure_tn_simulation(
+            ansatz="MPS",
+            max_bond_dimension=args.bond,
+            cut_ratio=1e-12,
+            svd_control="E!",
+            tensor_module=args.tensor_module,
+            compile_circuit=True,
+            track_memory=False,
+            mpi_approach="CT" if args.mpi_ct else "SR",
+            mpi_num_procs=size,
+            mpi_where_barriers=args.mpi_barriers if args.mpi_ct else -1,
+            mpi_isometrization=args.mpi_isometrization,
+        )
+        value = backend.expectation(
+            circuit,
+            observable,
+            preprocess=False,
+            compile_circuit=True,
+        )
+    max_timings = None
+    if timings:
+        max_timings = {
+            key: MPI.COMM_WORLD.reduce(local_value, op=MPI.MAX, root=0)
+            for key, local_value in timings.items()
+        }
+    if rank != 0:
+        return
+    value = float(np.real(value))
+    elapsed = time.perf_counter() - start
+    abs_error = float("nan") if exact is None else abs(value - exact)
+    rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
+    print(f"{value:.16e} {abs_error:.6e} {rel_error:.6e} {elapsed:.3f}")
+    if max_timings:
+        print("timing_section max_seconds")
+        for key, max_value in max_timings.items():
+            print(f"{key} {max_value:.6f}")
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/benchmark_contract_sliced.py
+++ b/tools/benchmark_contract_sliced.py
@@ -0,0 +1,56 @@
+"""MPI parallel sliced contraction using pre-sliced tree."""
+import time, pickle, os
+import numpy as np
+from mpi4py import MPI
+
+NQUBITS, NLAYERS, NCORES = 25, 10, 48
+
+comm = MPI.COMM_WORLD
+rank, size = comm.Get_rank(), comm.Get_size()
+
+os.environ['OMP_NUM_THREADS'] = str(NCORES)
+os.environ['MKL_NUM_THREADS'] = str(NCORES)
+
+import torch
+import qibo, quimb as qu
+from qibotn.observables import build_random_circuit
+
+torch.set_num_threads(NCORES)
+
+circuit = build_random_circuit(NQUBITS, NLAYERS)
+qibo.set_backend("qibotn", platform="quimb")
+backend = qibo.get_backend()
+backend.configure_tn_simulation(ansatz="tn")
+qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
+tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn')
+
+if rank == 0:
+    with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'rb') as f:
+        tree = pickle.load(f)
+else:
+    tree = None
+tree = comm.bcast(tree, root=0)
+
+arrays = [torch.from_numpy(np.asarray(t._data)) for t in tn.tensors]
+n_slices = tree.multiplicity
+
+if rank == 0:
+    print(f"Slices: {n_slices}, Ranks: {size}, "
+          f"Peak: {tree.max_size() * 16 / 1e9:.2f} GB, "
+          f"Threads/rank: {NCORES}, Backend: torch")
+
+t0 = time.time()
+result = None
+for i in range(rank, n_slices, size):
+    val = tree.contract_slice(arrays, i, backend='torch')
+    val_np = val.cpu().numpy().reshape(-1)
+    result = val_np if result is None else result + val_np
+
+if result is None:
+    result = np.zeros(1, dtype=np.complex128)
+
+total = np.zeros_like(result) if rank == 0 else None
+comm.Reduce(result, total, root=0)
+
+if rank == 0:
+    print(f"Contract: {time.time() - t0:.4f}s  Expectation: {0.5 * total[0].real:.10f}")
--- a/tools/benchmark_search.py
+++ b/tools/benchmark_search.py
@@ -0,0 +1,34 @@
+"""Search contraction path and save."""
+import time, os, pickle
+from qibotn.parallel import parallel_path_search
+from qibotn.observables import build_random_circuit
+import qibo, quimb as qu
+
+from mpi4py import MPI
+
+NQUBITS, NLAYERS, WORKERS = 20, 10, 96
+
+comm = MPI.COMM_WORLD
+rank, size = comm.Get_rank(), comm.Get_size()
+method = 'mpi' if size > 1 else 'processpool'
+
+circuit = build_random_circuit(NQUBITS, NLAYERS)
+qibo.set_backend("qibotn", platform="quimb")
+backend = qibo.get_backend()
+backend.configure_tn_simulation(ansatz="tn")
+qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
+tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn')
+
+if rank == 0:
+    print(f"Searching {NQUBITS}q {NLAYERS}l, method={method}, ranks={size}, workers/rank={WORKERS}...")
+t0 = time.time()
+tree = parallel_path_search(tn, tn.outer_inds(), method=method,
+    total_repeats=1024, max_time=300, n_workers=WORKERS,trial_timeout=60)
+t_search = time.time() - t0
+
+if rank == 0:
+    os.makedirs('data', exist_ok=True)
+    path = f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl"
+    with open(path, 'wb') as f:
+        pickle.dump(tree, f)
+    print(f"Search: {t_search:.2f}s  Peak: {tree.max_size() * 16 / 1e9:.2f} GB  Saved: {path}")
--- a/tools/benchmark_slice.py
+++ b/tools/benchmark_slice.py
@@ -0,0 +1,16 @@
+"""Slice saved tree and save."""
+import pickle
+
+NQUBITS, NLAYERS = 25, 10
+
+with open(f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl", 'rb') as f:
+    tree = pickle.load(f)
+
+print(f"Original peak: {tree.max_size() * 16 / 1e9:.2f} GB")
+
+tree_sliced = tree.slice_and_reconfigure(target_size=2**28)
+
+with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'wb') as f:
+    pickle.dump(tree_sliced, f)
+
+print(f"Sliced peak: {tree_sliced.max_size() * 16 / 1e9:.2f} GB  Slices: {tree_sliced.multiplicity}")
--- a/tools/benchmark_tn_mpi.py
+++ b/tools/benchmark_tn_mpi.py
@@ -0,0 +1,378 @@
+"""MPI-parallel TN benchmark: path search + contraction via MPI."""
+import json
+import pickle
+import time
+import argparse
+import numpy as np
+import cotengra as ctg
+import qibo
+from qibo import Circuit, gates
+from mpi4py import MPI
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from qibotn.observables import check_observable, extract_gates_and_qubits
+
+
+def _load_observable(observable_file=None, observable_json=None):
+    if observable_file:
+        with open(observable_file, "r", encoding="utf8") as f:
+            return json.load(f)
+    if observable_json:
+        return json.loads(observable_json)
+    return None
+
+
+def _term_to_quimb_operator(term):
+    """Convert one extracted Hamiltonian term to a quimb operator."""
+    import quimb as qu
+
+    coeff = complex(term[0][2]) if term else 1.0
+    op = None
+    where = []
+
+    for qubit, gate_name, _ in term:
+        qubit = int(qubit)
+        gate_name = str(gate_name).upper()
+        if gate_name == "I":
+            continue
+        where.append(qubit)
+        op = qu.pauli(gate_name.lower()) if op is None else op & qu.pauli(gate_name.lower())
+
+    return complex(coeff), op, tuple(where)
+
+
+def _run_serial_search(tn_bytes, output_inds, repeats, seed, num_slices, n_ranks, max_time):
+    import pickle, cotengra as ctg, random
+    random.seed(seed)
+    tn = pickle.loads(tn_bytes)
+    opt = ctg.HyperOptimizer(
+        methods=['kahypar', 'kahypar-agglom', 'spinglass'],
+        max_repeats=repeats,
+        parallel=False,
+        minimize='combo-256',
+        max_time=max_time,
+        optlib="random",
+        slicing_opts={'target_size': 2**29, 'allow_outer': True},
+        progbar=False,
+    )
+    tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
+    return tree.combo_cost(factor=256), tree
+
+
+def parallel_search(tn, output_inds, total_repeats, n_workers, num_slices, n_ranks,
+                    timeout):
+    import pickle, os, signal
+    from concurrent.futures import ProcessPoolExecutor, as_completed
+    tn_bytes = pickle.dumps(tn)
+    if n_workers <= 1:
+        return _run_serial_search(
+            tn_bytes, output_inds, total_repeats, 0, num_slices, n_ranks, timeout
+        )[1]
+    repeats_per = max(1, total_repeats // n_workers)
+    best_cost, best_tree = float('inf'), None
+
+    pool = ProcessPoolExecutor(max_workers=n_workers)
+    futures = [
+        pool.submit(_run_serial_search, tn_bytes, output_inds,
+                    repeats_per, seed, num_slices, n_ranks, timeout)
+        for seed in range(n_workers)
+    ]
+    try:
+        for fut in as_completed(futures, timeout=timeout + 5):
+            try:
+                cost, tree = fut.result()
+                if cost < best_cost:
+                    best_cost, best_tree = cost, tree
+            except Exception as e:
+                print(f"  [worker failed] {e}")
+    except TimeoutError:
+        pass
+    finally:
+        for fut in futures:
+            fut.cancel()
+        for pid in list(pool._processes.keys()):
+            try:
+                os.kill(pid, signal.SIGKILL)
+            except ProcessLookupError:
+                pass
+        pool.shutdown(wait=False)
+
+    return best_tree
+
+
+def make_circuit(circuit_type, nqubits, nlayers=1):
+    c = Circuit(nqubits)
+    if circuit_type == "qft":
+        from qibo.models import QFT
+        return QFT(nqubits)
+    elif circuit_type == "variational":
+        for layer in range(nlayers):
+            for q in range(nqubits):
+                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CZ(q, q + 1))
+    elif circuit_type == "ghz":
+        c.add(gates.H(0))
+        for q in range(nqubits - 1):
+            c.add(gates.CNOT(q, q + 1))
+    elif circuit_type == "brickwork":
+        for q in range(nqubits):
+            c.add(gates.H(q))
+        for layer in range(nlayers):
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CNOT(q, q + 1))
+                c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi)))
+                c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi)))
+    else:
+        raise ValueError(f"Unknown circuit: {circuit_type}")
+    return c
+
+
+def _contract_mpi(tree, arrays, comm, root=0):
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    is_torch = type(arrays[0]).__module__.startswith("torch")
+
+    result_np = None
+    for i in range(rank, tree.multiplicity, size):
+        x = tree.contract_slice(arrays, i)
+        x_np = np.asfortranarray(x.detach().cpu().numpy() if is_torch else np.asarray(x))
+        result_np = x_np if result_np is None else result_np + x_np
+
+    if result_np is None:
+        result_np = np.zeros(1, dtype=np.complex128)
+
+    result = np.zeros_like(result_np) if rank == root else None
+    comm.Reduce(result_np, result, root=root)
+
+    if rank == root:
+        import torch
+        return torch.from_numpy(np.asarray(result)) if is_torch else result
+    return None
+
+
+def run_mpi(circuit, nqubits, num_slices, total_repeats=1024,
+            load_path=None, save_path=None):
+    """Each MPI rank runs serial path search over total_repeats/size trials,
+    rank 0 picks the global best, then all ranks contract in parallel."""
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    import torch
+    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
+                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
+    qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex128)
+
+    # --- path search: each rank serial, gather best to rank 0 ---
+    if load_path:
+        if rank == 0:
+            with open(load_path, "rb") as f:
+                saved = pickle.load(f)
+            tree, psi, t_search = saved["tree"], saved["psi"], 0.0
+            print(f"  [path loaded]  {load_path}")
+        else:
+            tree = psi = None
+            t_search = 0.0
+    else:
+        rank_repeats = max(1, total_repeats // size)
+        t0 = time.time()
+        # get TN object first (no contraction), then run parallel search
+        psi_tn = qc.to_dense(rehearse="tn")
+        local_tree = parallel_search(
+            psi_tn, psi_tn.outer_inds(), rank_repeats, n_workers=48,
+            num_slices=num_slices, n_ranks=size, timeout=600,
+        )
+        t_search = time.time() - t0
+        local_psi = psi_tn
+
+        all_results = comm.gather((local_tree.combo_cost(factor=256), local_tree, local_psi), root=0)
+        if rank == 0:
+            _, tree, psi = min(all_results, key=lambda x: x[0])
+            print(f"  [path search]  {t_search:.3f}s  "
+                  f"flops~2^{tree.contraction_cost(log=2):.2f}  "
+                  f"size~2^{tree.contraction_width():.2f}  "
+                  f"slices={tree.multiplicity}")
+            if save_path:
+                with open(save_path, "wb") as f:
+                    pickle.dump({"tree": tree, "psi": psi}, f)
+                print(f"  [path saved]   {save_path}")
+        else:
+            tree = psi = None
+
+        if save_path:
+            t_search = comm.bcast(t_search, root=0)
+            return None, t_search
+
+    tree = comm.bcast(tree, root=0)
+    psi = comm.bcast(psi, root=0)
+    t_search = comm.bcast(t_search, root=0)
+
+    # --- contraction: all ranks work in parallel ---
+    import torch
+    torch.set_num_threads(max(1, 96 // size))
+    arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in psi.arrays]
+    t0 = time.time()
+    sv = _contract_mpi(tree, arrays, comm, root=0)
+    t_contract = time.time() - t0
+
+    if rank == 0:
+        print(f"  [contraction]  {t_contract:.3f}s")
+        return np.array(sv).reshape(-1), t_search + t_contract
+    return None, t_search + t_contract
+
+
+def run_mpi_expval(
+    circuit,
+    nqubits,
+    observable=None,
+    total_repeats=1024,
+    search_workers=1,
+    search_timeout=300,
+):
+    """Compute a Hamiltonian expectation value directly from TN via MPI.
+    MPI parallelizes over Hamiltonian terms; ProcessPool optionally helps
+    path search for each term."""
+    import torch
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    observable = check_observable(observable, nqubits)
+    ham_gate_map = extract_gates_and_qubits(observable)
+
+    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
+                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
+
+    my_terms = ham_gate_map[rank::size]
+    torch.set_num_threads(max(1, 96 // size))
+    t0 = time.time()
+
+    my_exp = 0.0 + 0.0j
+    for term in my_terms:
+        coeff, op, where = _term_to_quimb_operator(term)
+        if op is None:
+            my_exp += coeff
+            continue
+        tn = qc.local_expectation_tn(op, where=where)
+        if len(tn.outer_inds()) == 0:
+            val = complex(tn.contract())
+        else:
+            tree = parallel_search(
+                tn,
+                tn.outer_inds(),
+                total_repeats,
+                n_workers=search_workers,
+                num_slices=1,
+                n_ranks=size,
+                timeout=search_timeout,
+            )
+            if tree is None:
+                raise RuntimeError("Failed to find a contraction tree for expectation TN.")
+            arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in tn.arrays]
+            acc = sum(tree.contract_slice(arrays, i) for i in range(tree.multiplicity))
+            val = complex(acc.item() if hasattr(acc, 'item') else acc)
+        my_exp += coeff * val
+
+    t_total = time.time() - t0
+
+    all_results = comm.gather(my_exp, root=0)
+    if rank == 0:
+        total_exp = sum(all_results)
+        print(f"\n[TN expval]  time={t_total:.4f}s  expval={total_exp.real:.12f}")
+        return np.real_if_close(total_exp), t_total
+    return None, t_total
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=30)
+    parser.add_argument("--circuit", type=str, default="qft",
+                        choices=["qft", "variational", "ghz", "brickwork"])
+    parser.add_argument("--nlayers", type=int, default=3)
+    parser.add_argument("--num-slices", type=int, default=1)
+    parser.add_argument("--total-repeats", type=int, default=1024)
+    parser.add_argument("--search-workers", type=int, default=1)
+    parser.add_argument("--search-timeout", type=int, default=300)
+    parser.add_argument("--observable-file", type=str, default=None)
+    parser.add_argument("--observable-json", type=str, default=None)
+    parser.add_argument("--save-path", type=str, default=None)
+    parser.add_argument("--load-path", type=str, default=None)
+    parser.add_argument("--no-compare", action="store_true")
+    parser.add_argument("--mode", type=str, default="sv", choices=["sv", "expval"])
+    args = parser.parse_args()
+
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+
+    if rank == 0:
+        print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, "
+              f"nlayers={args.nlayers}, ranks={comm.Get_size()}")
+
+    np.random.seed(42)
+    circuit = make_circuit(args.circuit, args.nqubits, args.nlayers)
+    observable = _load_observable(args.observable_file, args.observable_json)
+
+    if args.mode == "expval":
+        try:
+            expval, t_total = run_mpi_expval(
+                circuit,
+                args.nqubits,
+                observable=observable,
+                total_repeats=args.total_repeats,
+                search_workers=args.search_workers,
+                search_timeout=args.search_timeout,
+            )
+        except Exception as e:
+            if rank == 0:
+                print(f"[FAILED] {e}")
+            raise
+        if rank == 0:
+            np.save(f"data/expval_tn_{args.circuit}{args.nqubits}.npy", np.asarray(expval))
+            if not args.no_compare:
+                print("No built-in reference comparison for arbitrary observables.")
+        return
+
+    try:
+        sv, t_total = run_mpi(circuit, args.nqubits, args.num_slices,
+                              total_repeats=args.total_repeats,
+                              load_path=args.load_path, save_path=args.save_path)
+    except Exception as e:
+        if rank == 0:
+            print(f"[FAILED] {e}")
+        raise
+
+    if rank == 0 and sv is not None:
+        print(f"\n[quimb TN MPI]  time={t_total:.4f}s  shape={sv.shape}")
+        np.save(f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy", sv)
+
+        if not args.no_compare:
+            from qibotn.bak.benchmark_tn import run_qibojit
+            import gc
+            np.random.seed(42)
+            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+            sv_ref, t_ref = run_qibojit(circuit_ref)
+            np.save(f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy", sv_ref)
+            print(f"[qibojit]       time={t_ref:.4f}s")
+            # free memory before loading via mmap for expval comparison
+            del sv, sv_ref
+            gc.collect()
+            from compare_jit_tn_quimb import check_results
+            ref_path = f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy"
+            tn_path  = f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy"
+            check_results(ref_path, tn_path, args.nqubits)
+            if t_total > 0:
+                print(f"Speedup  : {t_ref/t_total:.2f}x")
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/check_tree.py
+++ b/tools/check_tree.py
@@ -0,0 +1,25 @@
+"""Check contraction tree statistics."""
+import pickle, sys
+
+path = sys.argv[1] if len(sys.argv) > 1 else "data/tree_q25_l10.pkl"
+with open(path, 'rb') as f:
+    tree = pickle.load(f)
+
+# Intel 8558P: 96 cores, 2.1GHz, AVX-512 (16 FP64/cycle), FMA x2
+# complex128 multiply-add = 6 real FLOPs
+CORES = 96
+FREQ = 2.1e9
+AVX512_FP64 = 16
+TFLOPS = CORES * FREQ * AVX512_FP64 * 2 / 1e12  # ~6.45 TFLOPS real FP64
+COMPLEX_FLOPS = TFLOPS / 6  # complex128 effective
+
+flops = tree.total_flops()
+slices = tree.multiplicity
+est_seconds = flops * slices / (COMPLEX_FLOPS * 1e12) 
+
+print(f"File: {path}")
+print(f"Peak memory (GB): {tree.max_size() * 16 / 1e9:.2f}")
+print(f"Total FLOPs: {flops:.2e}  x{slices} slices = {flops*slices:.2e}")
+print(f"Contraction width: {tree.contraction_width()}")
+print(f"Multiplicity (slices): {slices}")
+print(f"Estimated time (96 cores): {est_seconds:.1f}s  ({est_seconds/3600:.2f}h)")
--- a/tools/compare_vidal_backend_qmatchatea.py
+++ b/tools/compare_vidal_backend_qmatchatea.py
@@ -0,0 +1,137 @@
+"""Compare QMatchaTeaBackend with the VidalBackend fast path."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import time
+
+import numpy as np
+import torch
+from qibo import Circuit, gates, hamiltonians
+from qibo.symbols import X, Y, Z
+
+from qibotn.backends.qmatchatea import QMatchaTeaBackend
+from qibotn.backends.vidal import VidalBackend
+
+
+def build_circuit(nqubits, nlayers, seed, kind):
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+    for layer in range(nlayers):
+        for q in range(nqubits):
+            circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
+            circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
+        if kind == "brickwall":
+            for q in range(0, nqubits - 1, 2):
+                circuit.add(gates.CNOT(q, q + 1))
+            for q in range(1, nqubits - 1, 2):
+                circuit.add(gates.CNOT(q, q + 1))
+        elif kind == "shifted-cz":
+            for q in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.CZ(q, q + 1))
+        elif kind == "reversed-cnot":
+            for q in range(0, nqubits - 1, 2):
+                circuit.add(gates.CNOT(q + 1, q))
+            for q in range(1, nqubits - 1, 2):
+                circuit.add(gates.CNOT(q, q + 1))
+        else:
+            raise ValueError(f"Unknown circuit kind {kind!r}.")
+    return circuit
+
+
+def build_observable(nqubits, kind):
+    form = 0
+    if kind == "ring-xz":
+        for q in range(nqubits):
+            form += 0.5 * X(q) * Z((q + 1) % nqubits)
+    elif kind == "open-zz":
+        for q in range(nqubits - 1):
+            form += Z(q) * Z(q + 1) / (nqubits - 1)
+    elif kind == "mixed":
+        form += 0.25 * X(0) - 0.5 * Z(nqubits - 1)
+        for q in range(0, nqubits - 1, 3):
+            form += 0.125 * Y(q) * Y(q + 1)
+    else:
+        raise ValueError(f"Unknown observable kind {kind!r}.")
+    return hamiltonians.SymbolicHamiltonian(form=form)
+
+
+def run_backend(backend, circuit, observable):
+    start = time.perf_counter()
+    value = backend.expectation(circuit, observable, preprocess=False, compile_circuit=True)
+    return float(np.real(value)), time.perf_counter() - start
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=34)
+    parser.add_argument("--nlayers", type=int, default=20)
+    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch")
+    parser.add_argument("--torch-threads", type=int, default=32)
+    parser.add_argument(
+        "--circuit-kind",
+        choices=("brickwall", "shifted-cz", "reversed-cnot"),
+        default="brickwall",
+    )
+    parser.add_argument(
+        "--observable-kind",
+        choices=("ring-xz", "open-zz", "mixed"),
+        default="ring-xz",
+    )
+    parser.add_argument("--reference-file")
+    parser.add_argument("--skip-qmatchatea", action="store_true")
+    args = parser.parse_args()
+
+    torch.set_num_threads(args.torch_threads)
+    circuit = build_circuit(args.nqubits, args.nlayers, args.seed, args.circuit_kind)
+    observable = build_observable(args.nqubits, args.observable_kind)
+
+    exact = None
+    if args.reference_file:
+        with open(args.reference_file, "r", encoding="utf-8") as f:
+            exact = float(json.load(f)["expectation"])
+
+    print(
+        f"nqubits={args.nqubits} nlayers={args.nlayers} bond={args.bond} "
+        f"circuit={args.circuit_kind} observable={args.observable_kind} "
+        f"tensor_module={args.tensor_module} torch_threads={args.torch_threads}"
+    )
+    if exact is not None:
+        print(f"exact={exact:.16e}")
+    print("backend value abs_error seconds")
+
+    if not args.skip_qmatchatea:
+        qmt = QMatchaTeaBackend()
+        qmt.configure_tn_simulation(
+            ansatz="MPS",
+            max_bond_dimension=args.bond,
+            cut_ratio=1e-12,
+            svd_control="E!",
+            tensor_module=args.tensor_module,
+            compile_circuit=True,
+            track_memory=False,
+        )
+        value, seconds = run_backend(qmt, circuit, observable)
+        error = float("nan") if exact is None else abs(value - exact)
+        print(f"qmatchatea {value:.16e} {error:.6e} {seconds:.3f}")
+
+    vidal = VidalBackend()
+    vidal.configure_tn_simulation(
+        ansatz="MPS",
+        max_bond_dimension=args.bond,
+        cut_ratio=1e-12,
+        tensor_module=args.tensor_module,
+        compile_circuit=True,
+        fallback=True,
+    )
+    value, seconds = run_backend(vidal, circuit, observable)
+    error = float("nan") if exact is None else abs(value - exact)
+    print(f"vidal {value:.16e} {error:.6e} {seconds:.3f}")
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/profile_vidal_chrome.py
+++ b/tools/profile_vidal_chrome.py
@@ -0,0 +1,72 @@
+"""Chrome trace profiler for the VidalBackend fast path."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+import torch
+from torch.profiler import ProfilerActivity, profile
+
+from qibotn.benchmark_cases import build_circuit, terms_to_dict, observable_terms
+from qibotn.expectation_runner import ExpectationConfig, run_cpu_expectation
+
+
+def main():  # profile one CPU expectation run; save a Chrome trace plus summary tables
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=34)
+    parser.add_argument("--nlayers", type=int, default=20)
+    parser.add_argument("--bond", type=int, default=512)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--torch-threads", type=int, default=32)
+    parser.add_argument("--cut-ratio", type=float, default=1e-12)
+    parser.add_argument("--profile-memory", action="store_true")
+    parser.add_argument("--rows", type=int, default=60)  # row limit of each summary table
+    args = parser.parse_args()
+
+    torch.set_num_threads(args.torch_threads)
+
+    prefix = f"profiles/vidal_n{args.nqubits}_l{args.nlayers}_b{args.bond}_t{args.torch_threads}"
+    trace_path = Path(f"{prefix}.json")  # Chrome trace written by export_chrome_trace below
+    table_path = Path(f"{prefix}.txt")  # text copy of the printed summary
+    trace_path.parent.mkdir(parents=True, exist_ok=True)  # creates profiles/ relative to the cwd
+
+    circuit = build_circuit("brickwall_cnot", args.nqubits, args.nlayers, args.seed)
+    observable = terms_to_dict(observable_terms("ring_xz", args.nqubits))
+    config = ExpectationConfig(
+        ansatz="mps",
+        bond=args.bond,
+        cut_ratio=args.cut_ratio,
+        tensor_module="torch",
+        torch_threads=args.torch_threads,
+    )
+
+    print(
+        f"profile vidal nqubits={args.nqubits} nlayers={args.nlayers} "
+        f"bond={args.bond} threads={args.torch_threads}"
+    )
+
+    with profile(
+        activities=[ProfilerActivity.CPU],
+        record_shapes=args.profile_memory,  # --profile-memory also enables shapes and stacks
+        profile_memory=args.profile_memory,
+        with_stack=args.profile_memory,
+    ) as prof:
+        result = run_cpu_expectation(circuit, observable, config)  # the only profiled call
+
+    table = (
+        f"expval={result.value:.16e}\n\n"
+        f"# sorted by self_cpu_time_total\n"
+        f"{prof.key_averages().table(sort_by='self_cpu_time_total', row_limit=args.rows)}\n\n"
+        f"# sorted by cpu_time_total\n"
+        f"{prof.key_averages().table(sort_by='cpu_time_total', row_limit=args.rows)}\n"
+    )
+
+    print(table, end="")  # the same text is echoed to stdout and saved to table_path
+    table_path.write_text(table, encoding="utf-8")
+    prof.export_chrome_trace(str(trace_path))
+    print(f"trace={trace_path}\ntable={table_path}")
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/qibojit_reference_expectation.py
+++ b/tools/qibojit_reference_expectation.py
@@ -0,0 +1,109 @@
+"""Compute and cache a qibojit state-vector reference for the ring-XZ observable."""
+
+import argparse
+import json
+import math
+import time
+from pathlib import Path
+
+import numpy as np
+import qibo
+from qibo import Circuit, gates
+
+
+def build_circuit(nqubits, nlayers, seed):  # layered RY/RZ rotations plus brickwall CNOTs
+    rng = np.random.default_rng(seed)  # same seed -> same rotation angles
+    circuit = Circuit(nqubits)
+    for _ in range(nlayers):
+        for qubit in range(nqubits):  # one RY and one RZ per qubit, angles drawn from [-pi, pi)
+            circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
+            circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
+        for qubit in range(0, nqubits - 1, 2):  # CNOTs on pairs (0,1), (2,3), ...
+            circuit.add(gates.CNOT(qubit, qubit + 1))
+        for qubit in range(1, nqubits - 1, 2):  # CNOTs on pairs (1,2), (3,4), ...
+            circuit.add(gates.CNOT(qubit, qubit + 1))
+    return circuit
+
+
+def ring_xz_expectation(state, nqubits, chunk_size):
+    """Return 0.5 * sum_i <X_i Z_((i+1) mod n)>, scanning `state` in chunks."""
+    if chunk_size < 1:  # range() raises on a 0 step and is silently empty on a negative one
+        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}.")
+    value = 0.0
+    for qubit in range(nqubits):
+        x_flip = 1 << (nqubits - 1 - qubit)  # qubit 0 is the most significant index bit
+        z_shift = nqubits - 1 - (qubit + 1) % nqubits  # bit position of the ring neighbour
+        term = 0.0
+        for start in range(0, state.size, chunk_size):
+            idx = np.arange(start, min(start + chunk_size, state.size), dtype=np.int64)
+            sign = 1 - 2 * ((idx >> z_shift) & 1)  # Z eigenvalue (+1 / -1); slice below is clipped
+            term += np.vdot(state[idx ^ x_flip], sign * state[start : start + chunk_size]).real
+        value += 0.5 * term
+    return float(value)
+
+
+def default_output_path(nqubits, nlayers, seed) -> Path:  # cache file keyed by all three inputs
+    return Path("references") / (
+        f"qibojit_ring_xz_n{nqubits}_l{nlayers}_seed{seed}.json"
+    )
+
+
+def main():  # compute the ring-XZ reference (or load the cached one) and print it
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=32)
+    parser.add_argument("--nlayers", type=int, default=3)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--output")  # defaults to default_output_path(...) when omitted
+    parser.add_argument("--force", action="store_true")  # recompute even if the file exists
+    parser.add_argument("--allow-large", action="store_true")  # bypass the --max-state-gb guard
+    parser.add_argument("--max-state-gb", type=float, default=32.0)
+    parser.add_argument("--chunk-size", type=int, default=1 << 20)
+    args = parser.parse_args()
+
+    output = Path(args.output) if args.output else default_output_path(
+        args.nqubits, args.nlayers, args.seed
+    )
+    if output.exists() and not args.force:  # cached result: print it and skip the simulation
+        with open(output, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        print(f"loaded {output}")
+        print(f"expectation={float(data['expectation']):.16e}")
+        return
+
+    state_gb = (2**args.nqubits) * np.dtype(np.complex128).itemsize / (1024**3)  # in GiB
+    if state_gb > args.max_state_gb and not args.allow_large:
+        raise MemoryError(
+            f"Estimated state vector alone is {state_gb:.1f} GiB. "
+            "Pass --allow-large after confirming the node has enough memory."
+        )
+
+    qibo.set_backend("qibojit")
+    circuit = build_circuit(args.nqubits, args.nlayers, args.seed)
+
+    start = time.perf_counter()  # timing covers the simulation and the expectation evaluation
+    state = circuit().state(numpy=True).reshape(-1)
+    expectation = ring_xz_expectation(state, args.nqubits, args.chunk_size)
+    elapsed = time.perf_counter() - start
+
+    data = {
+        "backend": "qibojit",
+        "observable": "0.5 * sum_i X_i Z_((i+1) mod n)",
+        "nqubits": args.nqubits,
+        "nlayers": args.nlayers,
+        "seed": args.seed,
+        "expectation": expectation,
+        "seconds": elapsed,
+        "state_vector_gib_estimate": state_gb,
+    }
+    output.parent.mkdir(parents=True, exist_ok=True)
+    with open(output, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2, sort_keys=True)
+        f.write("\n")  # json.dump does not end the file with a newline
+
+    print(f"saved {output}")
+    print(f"expectation={expectation:.16e}")
+    print(f"seconds={elapsed:.3f}")
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/run_cpu_large_cases.sh
+++ b/tools/run_cpu_large_cases.sh
@@ -0,0 +1,127 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Large CPU expectation benchmarks for two-server runs.
+#
+# Defaults assume two Intel Xeon Platinum 8558P servers with about 500 GiB RAM
+# each.  Override HOSTFILE, PYTHON_BIN, MPIEXEC, or the per-case knobs below as
+# needed.
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"  # parent of this script's directory
+cd "$ROOT_DIR"  # relative paths below (benchmark script, .venv, hostfile) resolve from here
+
+PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
+MPIEXEC="${MPIEXEC:-mpiexec}"
+HOSTFILE="${HOSTFILE:-hostfile}"
+
+MPS_RANKS="${MPS_RANKS:-8}"  # MPI rank count for the MPS cases
+MPS_THREADS="${MPS_THREADS:-12}"  # passed as --torch-threads for the MPS cases
+TN_RANKS="${TN_RANKS:-12}"  # MPI rank count for the TN cases
+TN_THREADS="${TN_THREADS:-8}"  # passed as --torch-threads for the TN cases
+
+export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"  # default to 1 unless the caller already set it
+export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"  # default to 1 unless the caller already set it
+
+run_mpi() {  # usage: run_mpi RANKS SCRIPT [ARGS...]
+  local ranks="$1"
+  shift  # everything after RANKS is handed to $PYTHON_BIN
+  "$MPIEXEC" -hostfile "$HOSTFILE" -n "$ranks" "$PYTHON_BIN" "$@"
+}
+
+run_case() {  # usage: run_case TITLE COMMAND [ARGS...]
+  local title="$1"
+  shift  # the remaining arguments are the command to run
+  echo
+  echo "================================================================================"
+  echo "$title"
+  echo "================================================================================"
+  echo "HOSTFILE=$HOSTFILE PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
+  echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
+  echo "$*"  # show the command line before executing it
+  "$@"
+}
+
+case "${1:-help}" in  # no argument behaves like "help"
+  smoke)  # one short MPS run followed by one short TN run
+    run_case "MPS MPI smoke: n=40 layers=30 bond=2048" \
+      run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
+        --mpi --mps \
+        --nqubits "${MPS_SMOKE_NQ:-40}" \
+        --nlayers "${MPS_SMOKE_LAYERS:-30}" \
+        --bond "${MPS_SMOKE_BOND:-2048}" \
+        --torch-threads "$MPS_THREADS" \
+        --circuits brickwall_cnot reversed_cnot shifted_cz \
+        --observables ring_xz open_zz range2_xx
+
+    run_case "TN MPI smoke: n=32 layers=16 target_slices=12" \
+      run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \
+        --mpi \
+        --nqubits "${TN_SMOKE_NQ:-32}" \
+        --nlayers "${TN_SMOKE_LAYERS:-16}" \
+        --torch-threads "$TN_THREADS" \
+        --circuits brickwall_cnot shifted_cz rxx_rzz \
+        --observables ring_xz open_zz range2_xx \
+        --tn-target-slices "${TN_SMOKE_SLICES:-12}"
+    ;;
+
+  mps-long)
+    run_case "MPS MPI long: n=64 layers=48 bond=4096" \
+      run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
+        --mpi --mps \
+        --nqubits "${MPS_LONG_NQ:-64}" \
+        --nlayers "${MPS_LONG_LAYERS:-48}" \
+        --bond "${MPS_LONG_BOND:-4096}" \
+        --torch-threads "$MPS_THREADS" \
+        --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
+        --observables ring_xz open_zz mixed_local range2_xx
+    ;;
+
+  mps-pressure)
+    run_case "MPS MPI pressure: n=80 layers=64 bond=4096" \
+      run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
+        --mpi --mps \
+        --nqubits "${MPS_PRESSURE_NQ:-80}" \
+        --nlayers "${MPS_PRESSURE_LAYERS:-64}" \
+        --bond "${MPS_PRESSURE_BOND:-4096}" \
+        --torch-threads "$MPS_THREADS" \
+        --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz swap_scramble \
+        --observables ring_xz open_zz mixed_local range2_xx long_z_string
+    ;;
+
+  tn-long)
+    run_case "TN MPI long: n=36 layers=20 target_slices=24" \
+      run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \
+        --mpi \
+        --nqubits "${TN_LONG_NQ:-36}" \
+        --nlayers "${TN_LONG_LAYERS:-20}" \
+        --torch-threads "$TN_THREADS" \
+        --circuits brickwall_cnot shifted_cz rxx_rzz \
+        --observables ring_xz open_zz range2_xx \
+        --tn-target-slices "${TN_LONG_SLICES:-24}"
+    ;;
+
+  all)  # absolute path via bash: a relative $0 breaks after the cd above; skips mps-pressure
+    bash "$ROOT_DIR/tools/$(basename "${BASH_SOURCE[0]}")" smoke
+    bash "$ROOT_DIR/tools/$(basename "${BASH_SOURCE[0]}")" mps-long
+    bash "$ROOT_DIR/tools/$(basename "${BASH_SOURCE[0]}")" tn-long
+    ;;
+
+  help|*)  # unknown modes also land here; usage goes to stderr and the status is 2
+    cat >&2 <<'EOF'
+Usage: tools/run_cpu_large_cases.sh [smoke|mps-long|mps-pressure|tn-long|all]
+
+Common overrides:
+  HOSTFILE=hostfile
+  PYTHON_BIN=.venv/bin/python
+  MPIEXEC=mpiexec
+  MPS_RANKS=8 MPS_THREADS=12
+  TN_RANKS=12 TN_THREADS=8
+
+Scale overrides:
+  MPS_LONG_NQ=64 MPS_LONG_LAYERS=48 MPS_LONG_BOND=4096
+  MPS_PRESSURE_NQ=80 MPS_PRESSURE_LAYERS=64 MPS_PRESSURE_BOND=4096
+  TN_LONG_NQ=36 TN_LONG_LAYERS=20 TN_LONG_SLICES=24
+EOF
+    exit 2
+    ;;
+esac
--- a/tools/run_cpu_single_cases.sh
+++ b/tools/run_cpu_single_cases.sh
@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Single-node CPU scale probes for expectation benchmarks.
+#
+# Intended for one 96-core / ~500 GiB RAM node.  The default "probe" mode runs
+# moderate MPS and TN cases first.  Larger modes are available after checking
+# runtime and memory from the probe output.
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"  # parent of this script's directory
+cd "$ROOT_DIR"  # relative paths below (benchmark script, .venv) resolve from here
+
+PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
+PYTHON_FLAGS="${PYTHON_FLAGS:--u}"  # default is the single flag -u; expanded unquoted by callers
+MPIEXEC="${MPIEXEC:-mpiexec}"
+TIME_BIN="${TIME_BIN:-/usr/bin/time}"  # invoked with -v by run_timed
+
+MPS_RANKS="${MPS_RANKS:-8}"  # MPI rank count for the MPS cases
+MPS_THREADS="${MPS_THREADS:-12}"  # passed as --torch-threads for the MPS cases
+TN_RANKS="${TN_RANKS:-8}"  # MPI rank count for the TN cases
+TN_THREADS="${TN_THREADS:-12}"  # passed as --torch-threads for the TN cases
+
+export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"  # default to 1 unless the caller already set it
+export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"  # default to 1 unless the caller already set it
+
+estimate_mps_memory() {  # usage: estimate_mps_memory NQUBITS BOND; prints a rough size estimate
+  local nqubits="$1"
+  local bond="$2"
+  "$PYTHON_BIN" - "$nqubits" "$bond" "$MPS_RANKS" <<'PY'
+import sys
+n = int(sys.argv[1])
+chi = int(sys.argv[2])
+ranks = int(sys.argv[3])
+resident = n * 2 * chi * chi * 16
+per_rank = resident / ranks
+print(
+    "MPS rough resident memory: "
+    f"total={resident / 1024**3:.1f} GiB "
+    f"per_rank={per_rank / 1024**3:.1f} GiB "
+    "(temporary eig/SVD workspaces are additional)"
+)
+PY
+}
+
+run_timed() {  # usage: run_timed COMMAND [ARGS...]
+  echo
+  echo "--------------------------------------------------------------------------------"
+  echo "$*"  # show the command line before executing it
+  echo "--------------------------------------------------------------------------------"
+  "$TIME_BIN" -v "$@"  # run the command under $TIME_BIN with its -v flag
+}
+
+run_mps_case() {  # usage: run_mps_case LABEL NQUBITS NLAYERS BOND [EXTRA BENCHMARK ARGS...]
+  local label="$1"
+  local nqubits="$2"
+  local nlayers="$3"
+  local bond="$4"
+  shift 4  # anything left is appended to the benchmark command line
+  echo
+  echo "================================================================================"
+  echo "$label"
+  echo "================================================================================"
+  echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
+  echo "MPS_RANKS=$MPS_RANKS MPS_THREADS=$MPS_THREADS"
+  echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
+  estimate_mps_memory "$nqubits" "$bond"  # print the rough memory estimate before launching
+  run_timed "$MPIEXEC" -n "$MPS_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \
+    --mpi --mps \
+    --nqubits "$nqubits" \
+    --nlayers "$nlayers" \
+    --bond "$bond" \
+    --torch-threads "$MPS_THREADS" \
+    "$@"
+}
+
+run_tn_case() {  # usage: run_tn_case LABEL NQUBITS NLAYERS [EXTRA BENCHMARK ARGS...]
+  local label="$1"
+  local nqubits="$2"
+  local nlayers="$3"
+  shift 3  # anything left is appended to the benchmark command line
+  echo
+  echo "================================================================================"
+  echo "$label"
+  echo "================================================================================"
+  echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
+  echo "TN_RANKS=$TN_RANKS TN_THREADS=$TN_THREADS"
+  echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
+  echo "TN memory is contraction-tree dependent; increase --tn-target-slices if RSS is high."
+  run_timed "$MPIEXEC" -n "$TN_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \
+    --mpi \
+    --nqubits "$nqubits" \
+    --nlayers "$nlayers" \
+    --torch-threads "$TN_THREADS" \
+    "$@"
+}
+
+case "${1:-help}" in  # no argument behaves like "help"
+  probe)  # one moderate MPS case followed by one moderate TN case
+    run_mps_case "MPS probe: n=40 layers=30 bond=2048" 40 30 2048 \
+      --circuits brickwall_cnot \
+      --observables ring_xz
+
+    run_tn_case "TN probe: n=28 layers=12 target_slices=8" 28 12 \
+      --circuits brickwall_cnot \
+      --observables ring_xz \
+      --tn-target-slices 8
+    ;;
+
+  mps-medium)
+    run_mps_case "MPS medium: n=56 layers=40 bond=3072" 56 40 3072 \
+      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
+      --observables ring_xz open_zz mixed_local range2_xx
+    ;;
+
+  mps-long)
+    run_mps_case "MPS long: n=64 layers=48 bond=4096" 64 48 4096 \
+      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
+      --observables ring_xz open_zz mixed_local range2_xx
+    ;;
+
+  tn-medium)
+    run_tn_case "TN medium: n=32 layers=16 target_slices=16" 32 16 \
+      --circuits brickwall_cnot shifted_cz rxx_rzz \
+      --observables ring_xz open_zz range2_xx \
+      --tn-target-slices 16
+    ;;
+
+  tn-long)
+    run_tn_case "TN long: n=36 layers=20 target_slices=32" 36 20 \
+      --circuits brickwall_cnot shifted_cz rxx_rzz \
+      --observables ring_xz open_zz range2_xx \
+      --tn-target-slices 32
+    ;;
+
+  help|*)  # unknown modes also land here; usage goes to stderr and the status is 2
+    cat >&2 <<'EOF'
+Usage: tools/run_cpu_single_cases.sh [probe|mps-medium|mps-long|tn-medium|tn-long]
+
+Common overrides:
+  PYTHON_BIN=.venv/bin/python
+  MPIEXEC=mpiexec
+  MPS_RANKS=8 MPS_THREADS=12
+  TN_RANKS=8 TN_THREADS=12
+  OMP_NUM_THREADS=1 MKL_NUM_THREADS=1
+EOF
+    exit 2
+    ;;
+esac
--- a/tools/run_vidal_segment_mpi_scan.sh
+++ b/tools/run_vidal_segment_mpi_scan.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+NQ="${NQ:-34}"
+LAYERS="${LAYERS:-20}"
+BOND="${BOND:-512}"
+SEED="${SEED:-42}"
+RANKS="${RANKS:-1 2 4}"  # whitespace-separated rank counts, one per scan step
+THREADS="${THREADS:-32 32 16}"  # torch thread count paired with each entry of RANKS
+PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
+MPIEXEC="${MPIEXEC:-mpiexec}"
+CIRCUIT="${CIRCUIT:-brickwall_cnot}"
+OBSERVABLE="${OBSERVABLE:-ring_xz}"
+EXACT="${EXACT:-0}"  # set to 1 to append --exact to every run
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"  # parent of this script's directory
+cd "$ROOT_DIR"
+
+if [[ "${1:-help}" != "run" ]]; then  # anything but "run" prints the usage text
+  cat >&2 <<'EOF'
+Usage: tools/run_vidal_segment_mpi_scan.sh run
+
+Overrides:
+  NQ=34 LAYERS=20 BOND=512 SEED=42
+  RANKS="1 2 4" THREADS="32 32 16"
+  CIRCUIT=brickwall_cnot OBSERVABLE=ring_xz
+  EXACT=1
+  PYTHON_BIN=.venv/bin/python MPIEXEC=mpiexec
+EOF
+  if [[ "${1:-help}" == "help" ]]; then  # explicit or implicit "help" is not an error
+    exit 0
+  fi
+  exit 2
+fi
+
+read -r -a ranks <<< "$RANKS"  # split the lists on whitespace into arrays
+read -r -a threads <<< "$THREADS"
+
+if [[ "${#ranks[@]}" != "${#threads[@]}" ]]; then
+  echo "RANKS and THREADS must have the same number of entries." >&2
+  exit 2
+fi
+
+common=(
+  --nqubits "$NQ"
+  --nlayers "$LAYERS"
+  --bond "$BOND"
+  --seed "$SEED"
+  --mps
+  --circuits "$CIRCUIT"
+  --observables "$OBSERVABLE"
+)
+
+if [[ "$EXACT" == "1" ]]; then
+  common+=(--exact)
+fi
+
+for idx in "${!ranks[@]}"; do  # iterate over array indices so ranks and threads stay paired
+  nrank="${ranks[$idx]}"
+  nthr="${threads[$idx]}"
+  if [[ "$nrank" == "1" ]]; then  # a single rank runs without mpiexec and without --mpi
+    echo "== Vidal serial ranks=1 torch_threads=$nthr =="
+    "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
+      "${common[@]}" --torch-threads "$nthr"
+  else
+    echo "== Vidal segmented MPI ranks=$nrank torch_threads=$nthr =="
+    "$MPIEXEC" -n "$nrank" "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
+      "${common[@]}" --torch-threads "$nthr" --mpi
+  fi
+done
--- a/tools/validate_vidal_mpi_correctness.py
+++ b/tools/validate_vidal_mpi_correctness.py
@@ -0,0 +1,202 @@
+"""Correctness checks for the Vidal/TEBD MPS fast path.
+
+The cases here intentionally cover more than the benchmark ring-XZ observable:
+different nearest-neighbor gate orientations and several Pauli-sum observables.
+Run serially to compare qibojit/statevector vs Vidal, or under MPI to compare
+the segmented Vidal executor.
+"""
+
+from __future__ import annotations
+
+import argparse
+import math
+import time
+
+import numpy as np
+import torch
+from qibo import Circuit, gates
+
+from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
+from qibotn.backends.vidal_tebd import VidalTEBDExecutor
+
+
+def build_circuit(kind, nqubits, nlayers, seed):  # kind: "brickwall", "reversed_cnot" or "rx_ry_cz"
+    rng = np.random.default_rng(seed)  # same seed -> same rotation angles
+    circuit = Circuit(nqubits)
+    for layer in range(nlayers):
+        for q in range(nqubits):  # single-qubit rotations, angles drawn from [-pi, pi)
+            circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
+            circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
+            if kind == "rx_ry_cz":  # this kind gets an extra RX after RY and RZ
+                circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi)))
+
+        if kind in ("brickwall", "reversed_cnot"):
+            for q in range(0, nqubits - 1, 2):  # pairs (0,1), (2,3), ...
+                if kind == "reversed_cnot" and (layer % 2):  # odd layers: control on q + 1
+                    circuit.add(gates.CNOT(q + 1, q))
+                else:
+                    circuit.add(gates.CNOT(q, q + 1))
+            for q in range(1, nqubits - 1, 2):  # pairs (1,2), (3,4), ...
+                if kind == "reversed_cnot" and not (layer % 2):  # even layers: control on q + 1
+                    circuit.add(gates.CNOT(q + 1, q))
+                else:
+                    circuit.add(gates.CNOT(q, q + 1))
+        elif kind == "rx_ry_cz":
+            for q in range(layer % 2, nqubits - 1, 2):  # pair offset alternates with the layer
+                circuit.add(gates.CZ(q, q + 1))
+        else:  # only reached when nlayers > 0, after the rotations of the first layer were added
+            raise ValueError(f"Unknown circuit kind {kind!r}.")
+    return circuit
+
+
+def observable_terms(kind, nqubits):  # returns a list of (coefficient, ((pauli, site), ...))
+    if kind == "ring_xz":  # periodic: the last term couples site n-1 with site 0
+        return [
+            (0.5, (("X", site), ("Z", (site + 1) % nqubits)))
+            for site in range(nqubits)
+        ]
+    if kind == "open_zz":  # open chain: n-1 bonds, each weighted 1/(n-1)
+        return [
+            (1.0 / (nqubits - 1), (("Z", site), ("Z", site + 1)))
+            for site in range(nqubits - 1)
+        ]
+    if kind == "mixed_local":  # two single-site terms plus YY on every third bond
+        terms = [(0.25, (("X", 0),)), (-0.5, (("Z", nqubits - 1),))]
+        terms += [
+            (0.125, (("Y", site), ("Y", site + 1)))
+            for site in range(0, nqubits - 1, 3)
+        ]
+        return terms
+    raise ValueError(f"Unknown observable kind {kind!r}.")
+
+
+def exact_pauli_sum(circuit, terms, nqubits):  # state-vector value of sum_k coeff_k <P_k>
+    state = circuit().state(numpy=True).reshape(-1)  # executes the circuit on every call
+    indices = np.arange(state.size, dtype=np.int64)
+    value = 0.0 + 0.0j
+    for coeff, ops in terms:
+        flipped = indices.copy()  # basis index each entry is mapped to by the X/Y factors
+        phase = np.ones(state.size, dtype=np.complex128)  # accumulated Y/Z phase per index
+        for name, site in ops:
+            shift = nqubits - 1 - site  # site 0 is the most significant index bit
+            bit = (indices >> shift) & 1
+            name = name.upper()  # Pauli names are matched case-insensitively
+            if name == "X":
+                flipped ^= 1 << shift
+            elif name == "Y":
+                flipped ^= 1 << shift
+                phase *= 1j * (1 - 2 * bit)  # Y|b> = i (-1)^b |1-b>
+            elif name == "Z":
+                phase *= 1 - 2 * bit  # Z|b> = (-1)^b |b>
+            elif name != "I":  # "I" contributes nothing; anything else is rejected
+                raise ValueError(f"Unsupported Pauli {name!r}.")
+        value += coeff * np.vdot(state[flipped], phase * state)
+    return float(value.real)  # the imaginary part is discarded
+
+
+def run_vidal(circuit, terms, nqubits, bond, tensor_module):  # serial Vidal/TEBD evaluation
+    executor = VidalTEBDExecutor(  # a fresh executor is built on every call
+        nqubits=nqubits,
+        max_bond=bond,
+        cut_ratio=1e-12,
+        tensor_module=tensor_module,
+    )
+    executor.run_circuit(circuit)
+    return float(executor.expectation_pauli_sum(terms))
+
+
+def run_segment_mpi(circuit, terms, nqubits, bond, tensor_module, comm):  # segmented MPI path
+    executor = SegmentVidalMPIExecutor(  # a fresh executor is built on every call
+        nqubits=nqubits,
+        max_bond=bond,
+        cut_ratio=1e-12,
+        tensor_module=tensor_module,
+        comm=comm,
+    )
+    executor.run_circuit(circuit)
+    return executor.expectation_pauli_sum_root(terms)  # presumably valid on rank 0 only - confirm
+
+
+def main():
+    """Print Vidal expectation values next to exact state-vector references."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=16)
+    parser.add_argument("--nlayers", type=int, default=6)
+    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch")
+    parser.add_argument("--torch-threads", type=int, default=32)
+    parser.add_argument("--mpi", action="store_true")
+    parser.add_argument(
+        "--circuits", nargs="+", default=("brickwall", "reversed_cnot", "rx_ry_cz")
+    )
+    parser.add_argument(
+        "--observables", nargs="+", default=("ring_xz", "open_zz", "mixed_local")
+    )
+    args = parser.parse_args()
+
+    torch.set_num_threads(args.torch_threads)
+    # Serial defaults; replaced by the MPI.COMM_WORLD values when --mpi is given.
+    comm = None
+    rank = 0
+    size = 1
+    if args.mpi:
+        # Imported only on this branch, so serial runs never import mpi4py.
+        from mpi4py import MPI
+
+        comm = MPI.COMM_WORLD
+        rank = comm.Get_rank()
+        size = comm.Get_size()
+
+    if rank == 0:
+        mode = f"vidal-segment-mpi/{size}" if args.mpi else "vidal"
+        print(
+            f"mode={mode} nqubits={args.nqubits} nlayers={args.nlayers} "
+            f"bond={args.bond} tensor_module={args.tensor_module}"
+        )
+        print("circuit observable exact value abs_error seconds")
+
+    for circuit_kind in args.circuits:
+        circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed)
+        # Only rank 0 evaluates the exact references; under MPI they are broadcast below.
+        if rank == 0:
+            exact_values = {
+                obs: exact_pauli_sum(
+                    circuit, observable_terms(obs, args.nqubits), args.nqubits
+                )
+                for obs in args.observables
+            }
+        else:
+            exact_values = None
+        if comm is not None:
+            exact_values = comm.bcast(exact_values, root=0)
+
+        for obs_kind in args.observables:
+            terms = observable_terms(obs_kind, args.nqubits)
+            # Timed region: executor construction, the circuit run and the expectation value.
+            start = time.perf_counter()
+            if args.mpi:
+                value = run_segment_mpi(
+                    circuit,
+                    terms,
+                    args.nqubits,
+                    args.bond,
+                    args.tensor_module,
+                    comm,
+                )
+            else:
+                value = run_vidal(
+                    circuit, terms, args.nqubits, args.bond, args.tensor_module
+                )
+            if rank != 0:
+                continue  # only rank 0 prints result rows
+            elapsed = time.perf_counter() - start
+            exact = exact_values[obs_kind]
+            print(
+                f"{circuit_kind} {obs_kind} {exact:.16e} {value:.16e} "
+                f"{abs(value - exact):.6e} {elapsed:.3f}"
+            )
+
+
+if __name__ == "__main__":
+    main()