diff --git a/benchmark_cpu_expectation.py b/benchmark_cpu_expectation.py
index 9141849..3d5897d 100644
--- a/benchmark_cpu_expectation.py
+++ b/benchmark_cpu_expectation.py
@@ -3,6 +3,10 @@
 from __future__ import annotations
 
 import argparse
+import os
+import subprocess
+from pathlib import Path
+from urllib.parse import urlparse
 
 from qibotn.benchmark_cases import (
     CIRCUITS,
@@ -19,6 +23,51 @@ from qibotn.expectation_runner import (
 )
 
 
+def optional_int(text):
+    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
+        return None
+    return int(text)
+
+
+def optional_float(text):
+    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
+        return None
+    return float(text)
+
+
+def format_optional(value, fmt="g"):
+    return "None" if value is None else format(value, fmt)
+
+
+def should_stop_dask(args):
+    return (
+        not args.keep_dask
+        and args.tn_search_backend == "dask"
+        and args.dask_address is not None
+        and args.tn_load_tree is None
+    )
+
+
+def stop_dask_cluster(args, rank):
+    if rank != 0 or not should_stop_dask(args):
+        return
+    script = Path(__file__).resolve().parent / "tools" / "manage_tn_dask_cluster.sh"
+    if not script.exists():
+        print(f"dask_stop_skipped reason=missing_script path={script}", flush=True)
+        return
+
+    env = os.environ.copy()
+    parsed = urlparse(args.dask_address)
+    if parsed.hostname:
+        env.setdefault("SCHEDULER_HOST", parsed.hostname)
+    if parsed.port:
+        env.setdefault("SCHEDULER_PORT", str(parsed.port))
+
+    print("dask_stop_after_search start", flush=True)
+    subprocess.run([str(script), "stop"], cwd=str(script.parent.parent), env=env, check=False)
+    print("dask_stop_after_search done", flush=True)
+
+
 def build_parallel_opts(args):
     slicing_opts = {}
     if args.tn_target_slices is not None:
@@ -31,11 +80,24 @@ def build_parallel_opts(args):
         "search_workers": args.tn_search_workers or args.torch_threads,
         "max_repeats": args.tn_search_repeats,
         "max_time": args.tn_search_time,
+        "print_stats": not args.no_tn_stats,
     }
     if args.tn_search_backend is not None:
         opts["search_backend"] = args.tn_search_backend
     if args.dask_address is not None:
         opts["dask_address"] = args.dask_address
+    if args.tn_save_tree is not None:
+        opts["save_tree_path"] = args.tn_save_tree
+    if args.tn_load_tree is not None:
+        opts["load_tree_path"] = args.tn_load_tree
+    if args.tn_search_only:
+        opts["search_only"] = True
+    if args.tn_debug_trials:
+        opts["debug_trials"] = True
+    if args.tn_contract_implementation is not None:
+        opts["contract_implementation"] = args.tn_contract_implementation
+    if args.dask_close_workers:
+        opts["dask_close_workers"] = True
     return opts
 
 
@@ -43,10 +105,16 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--nqubits", type=int, default=40)
     parser.add_argument("--nlayers", type=int, default=30)
-    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=1024)
-    parser.add_argument("--cut-ratio", type=float, default=1e-12)
+    parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024)
+    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
     parser.add_argument("--seed", type=int, default=42)
     parser.add_argument("--torch-threads", type=int, default=8)
+    parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch")
+    parser.add_argument(
+        "--dtype",
+        choices=("complex128", "complex64"),
+        default="complex128",
+    )
     parser.add_argument("--ansatz", choices=("tn", "mps"), default=None)
     parser.add_argument("--mps", action="store_true")
     parser.add_argument("--mpi", action="store_true")
@@ -56,19 +124,62 @@ def main():
     parser.add_argument("--observables", nargs="+", default=["ring_xz"])
     parser.add_argument("--pauli-pattern")
     parser.add_argument("--tn-target-slices", type=int)
-    parser.add_argument("--tn-target-size", type=int)
+    parser.add_argument("--tn-target-size", type=int,default=2**32)
     parser.add_argument("--tn-search-workers", type=int)
     parser.add_argument("--tn-search-repeats", type=int, default=128)
     parser.add_argument("--tn-search-time", type=float, default=60.0)
+    parser.add_argument(
+        "--no-tn-stats",
+        action="store_true",
+        help="Do not print per-term TN search/contraction diagnostics.",
+    )
     parser.add_argument(
         "--tn-search-backend",
         choices=("processpool", "dask"),
+        default="dask",
         help="Path-search backend. In MPI mode, dask search runs only on rank 0 and broadcasts the tree.",
     )
     parser.add_argument(
         "--dask-address",
         help="Dask scheduler address, for example tcp://host:8786. If omitted with dask search, a local cluster is created.",
     )
+    parser.add_argument(
+        "--dask-close-workers",
+        action="store_true",
+        help="After dask path search, ask the scheduler to close all currently connected workers.",
+    )
+    parser.add_argument(
+        "--keep-dask",
+        action="store_true",
+        help=(
+            "Keep an external dask cluster running after search. By default, "
+            "tools/manage_tn_dask_cluster.sh stop is called after search when "
+            "--dask-address is used."
+        ),
+    )
+    parser.add_argument(
+        "--tn-save-tree",
+        help="Save searched cotengra contraction tree(s) to this pickle file.",
+    )
+    parser.add_argument(
+        "--tn-load-tree",
+        help="Load cotengra contraction tree(s) from this pickle file and skip path search.",
+    )
+    parser.add_argument(
+        "--tn-search-only",
+        action="store_true",
+        help="Only run path search and optional --tn-save-tree; skip contraction.",
+    )
+    parser.add_argument(
+        "--tn-debug-trials",
+        action="store_true",
+        help="Print dask worker summary and per-trial worker start/done logs.",
+    )
+    parser.add_argument(
+        "--tn-contract-implementation",
+        choices=("auto", "cotengra", "autoray", "cpp"),
+        help="cotengra contraction implementation for TN contraction.",
+    )
     args = parser.parse_args()
 
     ansatz = "mps" if args.mps else (args.ansatz or "tn")
@@ -89,6 +200,8 @@ def main():
         bond=args.bond,
         cut_ratio=args.cut_ratio,
         tensor_module="torch",
+        quimb_backend=args.quimb_backend,
+        dtype=args.dtype,
         torch_threads=args.torch_threads,
         parallel_opts=build_parallel_opts(args),
     )
@@ -98,47 +211,74 @@ def main():
         print(
             f"backend=cpu ansatz={ansatz.upper()} mode={mode} "
             f"nqubits={args.nqubits} nlayers={args.nlayers} "
-            f"bond={args.bond} cut_ratio={args.cut_ratio:g} seed={args.seed} "
+            f"bond={format_optional(args.bond)} "
+            f"cut_ratio={format_optional(args.cut_ratio)} seed={args.seed} "
+            f"quimb_backend={args.quimb_backend} dtype={args.dtype} "
             f"torch_threads={args.torch_threads} "
-            f"tn_search_backend={args.tn_search_backend or 'processpool'}"
+            f"tn_search_backend={args.tn_search_backend}"
         )
         print("circuit observable exact value abs_error rel_error seconds")
 
-    for circuit_kind in circuits:
-        circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed)
-        named_observables = (
-            [(f"pattern:{args.pauli_pattern}", {"pauli_string_pattern": args.pauli_pattern})]
-            if args.pauli_pattern
-            else [
-                (obs_kind, terms_to_dict(observable_terms(obs_kind, args.nqubits)))
-                for obs_kind in observables
-            ]
-        )
+    try:
+        for circuit_kind in circuits:
+            circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed)
+            named_observables = (
+                [(f"pattern:{args.pauli_pattern}", {"pauli_string_pattern": args.pauli_pattern})]
+                if args.pauli_pattern
+                else [
+                    (obs_kind, terms_to_dict(observable_terms(obs_kind, args.nqubits)))
+                    for obs_kind in observables
+                ]
+            )
 
-        for obs_name, observable in named_observables:
-            exact = None
-            if args.exact and rank == 0:
-                if args.nqubits > args.exact_max_qubits:
-                    raise ValueError(
-                        f"--exact is limited to {args.exact_max_qubits} qubits by default."
+            for obs_name, observable in named_observables:
+                exact = None
+                if args.exact and rank == 0:
+                    if args.nqubits > args.exact_max_qubits:
+                        raise ValueError(
+                            f"--exact is limited to {args.exact_max_qubits} qubits by default."
+                        )
+                    exact = exact_for_observable(circuit, observable, args.nqubits)
+
+                result = run_cpu_expectation(circuit, observable, config)
+                if args.mpi and result.rank != 0:
+                    continue
+
+                abs_error = float("nan") if exact is None else abs(result.value - exact)
+                rel_error = (
+                    float("nan")
+                    if exact is None
+                    else abs_error / max(abs(exact), 1e-15)
+                )
+                exact_text = "nan" if exact is None else f"{exact:.16e}"
+                print(
+                    f"{circuit_kind} {obs_name} {exact_text} {result.value:.16e} "
+                    f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}"
+                )
+                for stat in result.parallel_stats or ():
+                    cost = stat["path_cost"]
+                    search_stats = stat.get("search_stats", {})
+                    print(
+                        "tn_term_summary "
+                        f"term={stat.get('term_index', 0)} "
+                        f"search_seconds={stat.get('search_seconds', float('nan')):.3f} "
+                        f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} "
+                        f"completed_trials={search_stats.get('completed_trials', 'na')} "
+                        f"finite_trials={search_stats.get('finite_trials', 'na')} "
+                        f"failed_trials={search_stats.get('failed_trials', 'na')} "
+                        f"requested_trials={search_stats.get('requested_trials', 'na')} "
+                        f"best_score={search_stats.get('best_score', float('nan')):.6g} "
+                        f"slices={cost['nslices']} "
+                        f"log10_flops={cost['log10_flops']:.3f} "
+                        f"log10_write={cost['log10_write']:.3f} "
+                        f"log2_size={cost['log2_size']:.3f} "
+                        f"log10_combo={cost['log10_combo']:.3f} "
+                        f"peak_memory_gib={cost['peak_memory_gib']:.6g} "
+                        f"slicing_overhead={cost['slicing_overhead']:.6g} "
+                        f"rank_slices={stat.get('rank_slices', 'na')}"
                     )
-                exact = exact_for_observable(circuit, observable, args.nqubits)
-
-            result = run_cpu_expectation(circuit, observable, config)
-            if args.mpi and result.rank != 0:
-                continue
-
-            abs_error = float("nan") if exact is None else abs(result.value - exact)
-            rel_error = (
-                float("nan")
-                if exact is None
-                else abs_error / max(abs(exact), 1e-15)
-            )
-            exact_text = "nan" if exact is None else f"{exact:.16e}"
-            print(
-                f"{circuit_kind} {obs_name} {exact_text} {result.value:.16e} "
-                f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}"
-            )
+    finally:
+        stop_dask_cluster(args, rank)
 
 
 if __name__ == "__main__":
diff --git a/docs/contest_runners.md b/docs/contest_runners.md
new file mode 100644
index 0000000..28c0d33
--- /dev/null
+++ b/docs/contest_runners.md
@@ -0,0 +1,254 @@
+# Contest Runners
+
+This directory contains two self-contained contest entrypoints:
+
+- `tools/tn_contest_runner.py`: general tensor-network path search and contraction.
+- `tools/mps_contest_runner.py`: Vidal/MPS multi-node expectation runner.
+
+Both scripts keep circuit and observable definitions inside the script so a
+contest case can be edited in one place.
+
+## Environment
+
+Run commands from the repository root:
+
+```bash
+cd /home/yx/qibotn
+```
+
+For Intel MPI on two nodes, use the known working style:
+
+```bash
+mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 ...
+```
+
+Set `TCM_ENABLE=1` for CPU runs:
+
+```bash
+export TCM_ENABLE=1
+```
+
+## TN Workflow
+
+List built-in TN contest cases:
+
+```bash
+python -u tools/tn_contest_runner.py list
+```
+
+TN path search uses dask by default. Without `--dask-address`, the script starts
+a local dask cluster. For multiple servers, start one scheduler and workers
+with the helper script, then pass the scheduler address to the search command.
+
+Start the default two-node dask cluster:
+
+```bash
+cd /home/yx/qibotn
+tools/manage_tn_dask_cluster.sh start
+```
+
+Check status:
+
+```bash
+cd /home/yx/qibotn
+tools/manage_tn_dask_cluster.sh status
+```
+
+Stop the cluster:
+
+```bash
+cd /home/yx/qibotn
+tools/manage_tn_dask_cluster.sh stop
+```
+
+The helper defaults are:
+
+```bash
+SCHEDULER_HOST=10.20.1.103
+WORKER_HOSTS="10.20.1.103 10.20.1.102"
+NWORKERS=48
+NTHREADS=1
+ROOT_DIR=/home/yx/qibotn
+PYTHON_BIN=.venv/bin/python
+DASK_WORKER_TTL="24 hours"
+DASK_TICK_LIMIT="30 minutes"
+DASK_LOST_WORKER_TIMEOUT="30 minutes"
+```
+
+Override them inline if needed:
+
+```bash
+WORKER_HOSTS="10.20.1.103 10.20.1.102" NWORKERS=48 \
+  tools/manage_tn_dask_cluster.sh restart
+```
+
+Check that both nodes are connected by adding `--tn-debug-trials` to a small
+search. The output should include `qibotn_dask_workers` with both hosts.
+
+`tools/tn_contest_runner.py search` stops the external dask cluster after the
+search phase by default. Pass `--keep-dask` if you want to reuse the same dask
+cluster for several searches.
+
+Use enough trials to fill the cluster. With the default two-node setup there are
+96 worker slots, so `--tn-search-repeats` should be at least 96. The contest
+runner default is 2048.
+
+Cotengra trials are CPU-bound and can hold the Python GIL long enough for dask
+to report `Event loop was unresponsive`. Dask defaults are much more aggressive:
+`scheduler.worker-ttl=5 minutes`, `admin.tick.limit=3s`, and
+`deploy.lost-worker-timeout=15s`. The helper script raises these limits so
+workers are not killed by dask during search. The intended timeout is
+`--tn-search-time`; after that, the runner stops the external dask cluster.
+
+Small correctness check against statevector:
+
+```bash
+python -u tools/tn_contest_runner.py validate \
+  --case main1 \
+  --nqubits 8 \
+  --nlayers 2 \
+  --torch-threads 4 \
+  --tn-search-repeats 8 \
+  --tn-search-time 5
+```
+
+Search and save contraction trees:
+
+```bash
+TCM_ENABLE=1 python -u tools/tn_contest_runner.py search \
+  --case main1 \
+  --torch-threads 48 \
+  --dtype complex64 \
+  --dask-address tcp://10.20.1.103:8786 \
+  --tn-search-repeats 2048 \
+  --tn-search-time 300
+```
+
+Contract using the saved tree on one node:
+
+```bash
+TCM_ENABLE=1 mpirun -np 2 python -u tools/tn_contest_runner.py contract \
+  --mpi \
+  --case main1 \
+  --torch-threads 48 \
+  --dtype complex64
+```
+
+Contract using the saved tree on two nodes:
+
+```bash
+TCM_ENABLE=1 mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \
+  python -u tools/tn_contest_runner.py contract \
+  --mpi \
+  --case main1 \
+  --torch-threads 48 \
+  --dtype complex64
+```
+
+Run search and contract in one command:
+
+```bash
+TCM_ENABLE=1 python -u tools/tn_contest_runner.py all \
+  --case main1 \
+  --torch-threads 48 \
+  --dtype complex64 \
+  --dask-address tcp://10.20.1.103:8786 \
+  --tn-search-repeats 2048 \
+  --tn-search-time 300
+```
+
+Run only selected observables:
+
+```bash
+python -u tools/tn_contest_runner.py search \
+  --case main2 \
+  --observables open_zz
+```
+
+Tree files are written to `trees/contest_tn/` by default. The tree filename
+contains case, observable, qubit count, layer count, and target slice count.
+If any of these change, search again.
+
+Edit TN contest cases in `tools/tn_contest_runner.py`:
+
+- `CASES`: case name, circuit kind, observable list, default scale.
+- `build_circuit`: circuit definitions.
+- `pauli_sum_observable`: observable definitions.
+
+## MPS Workflow
+
+List built-in Vidal/MPS contest cases:
+
+```bash
+python -u tools/mps_contest_runner.py list
+```
+
+Small correctness check against statevector:
+
+```bash
+mpirun -np 2 python -u tools/mps_contest_runner.py validate \
+  --case main1 \
+  --nqubits 8 \
+  --nlayers 2 \
+  --bond 64 \
+  --torch-threads 4
+```
+
+Run one MPS case on one node:
+
+```bash
+TCM_ENABLE=1 mpirun -np 2 python -u tools/mps_contest_runner.py run \
+  --case main1 \
+  --torch-threads 48
+```
+
+Run one MPS case on two nodes:
+
+```bash
+TCM_ENABLE=1 mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \
+  python -u tools/mps_contest_runner.py run \
+  --case main1 \
+  --torch-threads 48
+```
+
+Run only one observable:
+
+```bash
+TCM_ENABLE=1 mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \
+  python -u tools/mps_contest_runner.py run \
+  --case main1 \
+  --observables ring_xz \
+  --torch-threads 48
+```
+
+Override scale:
+
+```bash
+TCM_ENABLE=1 mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \
+  python -u tools/mps_contest_runner.py run \
+  --case main1 \
+  --nqubits 128 \
+  --nlayers 24 \
+  --bond 1024 \
+  --torch-threads 48
+```
+
+Edit MPS contest cases in `tools/mps_contest_runner.py`:
+
+- `CASES`: case name, circuit kind, observable list, default scale and bond.
+- `build_circuit`: circuit definitions.
+- `observable`: observable definitions, including dense local terms.
+
+## Notes
+
+- TN uses path search plus contraction. Reuse tree files only for the exact same
+  circuit, observable, qubit count, layer count, seed, and slicing setup.
+- TN path search defaults to dask. Use `--tn-search-backend processpool` only
+  for fallback/debugging.
+- Prefer the default `--tn-target-size 4294967296` memory target. Do not force
+  `--tn-target-slices` unless you have already verified that cotengra can find
+  valid trees for that exact setting.
+- MPS/Vidal does not use contraction-tree search. It runs the circuit directly
+  and reports `trunc_sum` and `trunc_max`.
+- Default TN contraction is the stable torch/quimb path. Do not pass
+  `--tn-contract-implementation cpp` for contest runs.
diff --git a/hostfile b/hostfile
index 338d16a..e596b93 100644
--- a/hostfile
+++ b/hostfile
@@ -1,2 +1,2 @@
-10.20.6.74
-#10.20.6.102
\ No newline at end of file
+10.20.1.103:2
+10.20.1.102:2
diff --git a/src/qibotn/backends/__init__.py b/src/qibotn/backends/__init__.py
index f2c1189..954f793 100644
--- a/src/qibotn/backends/__init__.py
+++ b/src/qibotn/backends/__init__.py
@@ -30,7 +30,7 @@ class MetaBackend:
         elif platform == "quimb":  # pragma: no cover
             import qibotn.backends.quimb as qmb
 
-            quimb_backend = kwargs.get("quimb_backend", "numpy")
+            quimb_backend = kwargs.get("quimb_backend", "torch")
             contraction_optimizer = kwargs.get("contraction_optimizer", "auto-hq")
             return qmb.BACKENDS[quimb_backend](
                 quimb_backend=quimb_backend, contraction_optimizer=contraction_optimizer
diff --git a/src/qibotn/backends/cpu.py b/src/qibotn/backends/cpu.py
index 0abe8f7..5259fe0 100644
--- a/src/qibotn/backends/cpu.py
+++ b/src/qibotn/backends/cpu.py
@@ -3,6 +3,9 @@
 from __future__ import annotations
 
 import os
+import pickle
+import time
+from pathlib import Path
 
 import numpy as np
 from qibo import hamiltonians
@@ -10,7 +13,12 @@ from qibo.backends import NumpyBackend
 from qibo.config import raise_error
 
 from qibotn.backends.abstract import QibotnBackend
-from qibotn.backends.vidal import _symbolic_hamiltonian_to_pauli_terms, _unsupported_reason
+from qibotn.backends.vidal import (
+    _observable_mpo_tensors,
+    _operator_terms_to_mpo,
+    _symbolic_hamiltonian_to_operator_terms,
+    _unsupported_reason,
+)
 from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
 from qibotn.backends.vidal_tebd import VidalTEBDExecutor
 from qibotn.observables import check_observable
@@ -23,6 +31,51 @@ def _as_bool_or_dict(value, name):
     raise TypeError(f"{name} has an unexpected type")
 
 
+def _bind_numa_node(rank):
+    """Bind the calling process (or thread) to the NUMA node for *rank*.
+
+    The MPI rank is converted to a local (per-node) rank through the
+    environment variables commonly set by Open MPI, MVAPICH, and Slurm.
+    The process CPU affinity and NUMA memory policy are set accordingly.
+
+    Returns the NUMA domain that was selected, or ``None`` if the binding
+    could not be determined.
+    """
+    local_rank = rank
+    for name in (
+        "OMPI_COMM_WORLD_LOCAL_RANK",
+        "MV2_COMM_WORLD_LOCAL_RANK",
+        "MPI_LOCALRANKID",
+        "SLURM_LOCALID",
+    ):
+        try:
+            local_rank = int(os.environ[name])
+            break
+        except (KeyError, ValueError):
+            pass
+
+    domain = local_rank % 2
+    cpulist = f"/sys/devices/system/node/node{domain}/cpulist"
+    try:
+        cpus = _parse_cpu_list(open(cpulist, encoding="utf-8").read().strip())
+        if cpus:
+            os.sched_setaffinity(0, cpus)
+    except (FileNotFoundError, OSError):
+        pass
+
+    try:
+        import ctypes
+
+        libnuma = ctypes.CDLL("libnuma.so.1")
+        if libnuma.numa_available() >= 0:
+            libnuma.numa_run_on_node(ctypes.c_int(domain))
+            libnuma.numa_set_preferred(ctypes.c_int(domain))
+    except Exception:
+        pass
+
+    return domain
+
+
 def _parse_cpu_list(text):
     cpus = set()
     for item in text.split(","):
@@ -90,8 +143,15 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
             ),
         )
         self.tensor_module = runcard.get("tensor_module", "torch")
+        self.dtype = runcard.get("dtype", "complex128")
+        self.compile_circuit = bool(runcard.get("compile_circuit", False))
+        self.preprocess = bool(runcard.get("preprocess", False))
+        self.mpi_term_batch_size = runcard.get(
+            "mpi_term_batch_size",
+            runcard.get("term_batch_size", None),
+        )
         self.torch_threads = runcard.get("torch_threads", None)
-        self.quimb_backend = runcard.get("quimb_backend", "numpy")
+        self.quimb_backend = runcard.get("quimb_backend", "torch")
         self.contraction_optimizer = runcard.get("contraction_optimizer", "auto-hq")
         self.parallel_opts = runcard.get("parallel_opts", {})
         self.parallel_stats = []
@@ -117,7 +177,8 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
             value = self.expectation(circuit, self.observable)
             if self.MPI_enabled and self.rank > 0:
                 return np.asarray([0], dtype=np.int64)
-            return np.asarray([value], dtype=np.float64)
+            dtype = np.complex128 if np.iscomplexobj(value) else np.float64
+            return np.asarray([value], dtype=dtype)
 
         backend = self._quimb_backend()
         backend.configure_tn_simulation(
@@ -131,13 +192,26 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
             return_array=return_array,
         )
 
-    def expectation(self, circuit, observable=None, preprocess=True, compile_circuit=None):
-        observable = check_observable(observable, circuit.nqubits)
+    def expectation(self, circuit, observable=None, preprocess=None, compile_circuit=None):
+        mpo_tensors = _observable_mpo_tensors(observable, circuit.nqubits)
+        if mpo_tensors is None:
+            observable = check_observable(observable, circuit.nqubits)
+        use_preprocess = self.preprocess if preprocess is None else preprocess
+        if mpo_tensors is not None and not self.MPS_enabled:
+            raise_error(
+                NotImplementedError,
+                "MPO expectation is currently supported only by the Vidal MPS path.",
+            )
 
         if self.MPS_enabled:
             reason = _unsupported_reason(circuit)
-            if reason is None:
-                return self._vidal_expectation(circuit, observable)
+            if reason is None or self.compile_circuit or use_preprocess:
+                return self._vidal_expectation(
+                    circuit,
+                    observable,
+                    preprocess=use_preprocess,
+                    compile_circuit=compile_circuit,
+                )
 
         backend = self._quimb_backend()
         backend.configure_tn_simulation(
@@ -149,8 +223,45 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
             return self._quimb_expectation_mpi(backend, circuit, observable)
         return self._quimb_expectation_processpool(backend, circuit, observable)
 
-    def _vidal_expectation(self, circuit, observable):
-        terms = _symbolic_hamiltonian_to_pauli_terms(observable)
+    def _vidal_expectation(
+        self, circuit, observable, preprocess=False, compile_circuit=None
+    ):
+        if compile_circuit is None:
+            compile_circuit = self.compile_circuit
+        if preprocess:
+            if self.MPI_enabled:
+                from mpi4py import MPI
+
+                self.rank = MPI.COMM_WORLD.Get_rank()
+
+            from qibotn.backends.vidal import VidalBackend
+
+            backend = VidalBackend()
+            backend.configure_tn_simulation(
+                max_bond_dimension=self.max_bond_dimension,
+                cut_ratio=self.cut_ratio,
+                tensor_module=self.tensor_module,
+                compile_circuit=compile_circuit,
+                mpi_approach="CT" if self.MPI_enabled else "SR",
+                mpi_term_batch_size=self.mpi_term_batch_size,
+                fallback=False,
+            )
+            value = backend.expectation(
+                circuit,
+                observable,
+                preprocess=True,
+                compile_circuit=compile_circuit,
+            )
+            self.rank = getattr(backend, "rank", self.rank)
+            self.last_truncation_error = getattr(
+                backend, "last_truncation_error", np.nan
+            )
+            self.last_max_truncation_error = getattr(
+                backend, "last_max_truncation_error", np.nan
+            )
+            return value
+
+        mpo_tensors = _observable_mpo_tensors(observable, circuit.nqubits)
         if self.MPI_enabled:
             from mpi4py import MPI
 
@@ -164,8 +275,18 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                 comm=comm,
             )
             executor.run_circuit(circuit)
-            value = executor.expectation_pauli_sum_root(terms)
-            return np.nan if self.rank != 0 else float(np.real(value))
+            self.last_truncation_error = float(executor.global_truncation_error())
+            self.last_max_truncation_error = float(
+                executor.global_max_truncation_error()
+            )
+            if mpo_tensors is not None:
+                value = executor.expectation_mpo_root(mpo_tensors)
+            else:
+                terms = _symbolic_hamiltonian_to_operator_terms(observable)
+                value = executor.expectation_mpo_root(
+                    _operator_terms_to_mpo(terms, circuit.nqubits)
+                )
+            return np.nan if self.rank != 0 else value
 
         executor = VidalTEBDExecutor(
             nqubits=circuit.nqubits,
@@ -174,7 +295,12 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
             tensor_module=self.tensor_module,
         )
         executor.run_circuit(circuit)
-        return float(np.real(executor.expectation_pauli_sum(terms)))
+        self.last_truncation_error = float(executor.truncation_error)
+        self.last_max_truncation_error = float(executor.max_truncation_error)
+        if mpo_tensors is not None:
+            return executor.expectation_mpo(mpo_tensors)
+        terms = _symbolic_hamiltonian_to_operator_terms(observable)
+        return executor.expectation_mpo(_operator_terms_to_mpo(terms, circuit.nqubits))
 
     def _quimb_backend(self):
         import qibotn.backends.quimb as qmb
@@ -185,26 +311,7 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
         )
 
     def _bind_rank_to_numa_domain(self, rank):
-        domain = rank % 2
-        cpulist = f"/sys/devices/system/node/node{domain}/cpulist"
-        try:
-            cpus = _parse_cpu_list(open(cpulist, encoding="utf-8").read().strip())
-            if cpus:
-                os.sched_setaffinity(0, cpus)
-        except (FileNotFoundError, OSError):
-            pass
-
-        try:
-            import ctypes
-
-            libnuma = ctypes.CDLL("libnuma.so.1")
-            if libnuma.numa_available() >= 0:
-                libnuma.numa_run_on_node(ctypes.c_int(domain))
-                libnuma.numa_set_preferred(ctypes.c_int(domain))
-        except Exception:
-            pass
-
-        self.numa_domain = domain
+        self.numa_domain = _bind_numa_node(rank)
 
     def _default_search_workers(self, nranks=1):
         if self.torch_threads:
@@ -259,6 +366,22 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
         search_time = opts.get("max_time", 60)
         search_backend = opts.get("search_backend")
         dask_address = opts.get("dask_address")
+        dask_close_workers = bool(opts.get("dask_close_workers", False))
+        print_stats = bool(opts.get("print_stats", False))
+        debug_trials = bool(opts.get("debug_trials", False))
+        search_only = bool(opts.get("search_only", False))
+        save_tree_path = opts.get("save_tree_path")
+        load_tree_path = opts.get("load_tree_path")
+        loaded_trees = None
+        saved_trees = []
+        saved_costs = []
+
+        if load_tree_path:
+            with Path(load_tree_path).open("rb") as f:
+                payload = pickle.load(f)
+            loaded_trees = payload["trees"] if isinstance(payload, dict) else payload
+            if not isinstance(loaded_trees, (list, tuple)):
+                loaded_trees = [loaded_trees]
 
         qc = backend._qibo_circuit_to_quimb(
             circuit,
@@ -271,7 +394,7 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
 
         total_value = 0.0 + 0.0j
         terms = extract_gates_and_qubits(observable)
-        for coeff, factors in terms:
+        for term_index, (coeff, factors) in enumerate(terms):
             if not factors:
                 if self.rank == 0:
                     total_value += coeff
@@ -297,31 +420,112 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                 user_slicing_opts,
             )
 
-            tree = parallel_path_search(
-                tn,
-                tn.outer_inds(),
-                method="dask" if method != "mpi" and search_backend == "dask" else method,
-                total_repeats=search_repeats,
-                max_time=search_time,
-                n_workers=search_workers,
-                slicing_opts=slicing_opts,
-                trial_timeout=opts.get("trial_timeout"),
-                search_backend=search_backend,
-                dask_address=dask_address,
-            )
+            if loaded_trees is not None:
+                if term_index >= len(loaded_trees):
+                    raise ValueError(
+                        f"Loaded tree file has {len(loaded_trees)} tree(s), "
+                        f"but term {term_index} was requested."
+                    )
+                tree = loaded_trees[term_index]
+                search_seconds = 0.0
+                if self.rank == 0 and print_stats:
+                    print(
+                        f"tn_tree_loaded term={term_index} path={load_tree_path}",
+                        flush=True,
+                    )
+            else:
+                search_start = time.perf_counter()
+                tree = parallel_path_search(
+                    tn,
+                    tn.outer_inds(),
+                    method="dask" if method != "mpi" and search_backend == "dask" else method,
+                    total_repeats=search_repeats,
+                    max_time=search_time,
+                    n_workers=search_workers,
+                    slicing_opts=slicing_opts,
+                    trial_timeout=opts.get("trial_timeout"),
+                    search_backend=search_backend,
+                    dask_address=dask_address,
+                    debug_trials=debug_trials,
+                    dask_close_workers=dask_close_workers,
+                )
+                search_seconds = time.perf_counter() - search_start
             if tree is None:
                 raise RuntimeError("Failed to find a contraction tree for CPU TN MPI.")
+            if self.parallel_opts.get("contract_implementation") == "cpp":
+                from qibotn.torch_contractor import prepare_torch_cpp_contractor
 
-            if int(getattr(tree, "multiplicity", 1)) <= 1:
+                prepare_torch_cpp_contractor(tree)
+
+            path_cost = contraction_tree_costs(tree)
+            search_stats = getattr(tree, "qibotn_search_stats", {})
+            if save_tree_path and loaded_trees is None:
+                saved_trees.append(tree)
+                saved_costs.append(path_cost)
+            if self.rank == 0 and print_stats:
+                print(
+                    "tn_search_done "
+                    f"term={term_index} "
+                    f"search_seconds={search_seconds:.3f} "
+                    f"completed_trials={search_stats.get('completed_trials', 'na')} "
+                    f"finite_trials={search_stats.get('finite_trials', 'na')} "
+                    f"failed_trials={search_stats.get('failed_trials', 'na')} "
+                    f"requested_trials={search_stats.get('requested_trials', search_repeats)} "
+                    f"best_score={search_stats.get('best_score', float('nan')):.6g} "
+                    f"slices={path_cost['nslices']} "
+                    f"log10_flops={path_cost['log10_flops']:.3f} "
+                    f"log10_write={path_cost['log10_write']:.3f} "
+                    f"log2_size={path_cost['log2_size']:.3f} "
+                    f"log10_combo={path_cost['log10_combo']:.3f} "
+                    f"peak_memory_gib={path_cost['peak_memory_gib']:.6g}",
+                    flush=True,
+                )
+
+            if search_only:
+                self.parallel_stats.append(
+                    {
+                        "term_index": term_index,
+                        "term_factors": tuple(factors),
+                        "path_cost": path_cost,
+                        "search_stats": search_stats,
+                        "tree_slices": int(getattr(tree, "multiplicity", 1)),
+                        "slice_assignment": "search_only",
+                        "rank_slices": [],
+                        "search_seconds": search_seconds,
+                        "contract_seconds": 0.0,
+                        "search_workers": search_workers,
+                        "search_repeats": search_repeats,
+                        "search_time": search_time,
+                        "search_backend": search_backend or method,
+                        "dask_address": dask_address,
+                        "numa_domain": getattr(self, "numa_domain", None),
+                    }
+                )
+                continue
+
+            if comm is None and int(getattr(tree, "multiplicity", 1)) <= 1:
                 if self.rank == 0:
+                    contract_start = time.perf_counter()
                     value = self._contract_term_unsliced(tn, tree, backend)
+                    contract_seconds = time.perf_counter() - contract_start
+                    if print_stats:
+                        print(
+                            "tn_contract_done "
+                            f"term={term_index} "
+                            f"contract_seconds={contract_seconds:.3f}",
+                            flush=True,
+                        )
                     self.parallel_stats.append(
                         {
+                            "term_index": term_index,
                             "term_factors": tuple(factors),
-                            "path_cost": contraction_tree_costs(tree),
+                            "path_cost": path_cost,
+                            "search_stats": search_stats,
                             "tree_slices": 1,
                             "slice_assignment": "root",
                             "rank_slices": [1] + [0] * (size - 1),
+                            "search_seconds": search_seconds,
+                            "contract_seconds": contract_seconds,
                             "search_workers": search_workers,
                             "search_repeats": search_repeats,
                             "search_time": search_time,
@@ -334,14 +538,27 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                 continue
 
             if comm is None:
+                contract_start = time.perf_counter()
                 value = self._contract_term_unsliced(tn, tree, backend)
+                contract_seconds = time.perf_counter() - contract_start
+                if print_stats:
+                    print(
+                        "tn_contract_done "
+                        f"term={term_index} "
+                        f"contract_seconds={contract_seconds:.3f}",
+                        flush=True,
+                    )
                 self.parallel_stats.append(
                     {
+                        "term_index": term_index,
                         "term_factors": tuple(factors),
-                        "path_cost": contraction_tree_costs(tree),
+                        "path_cost": path_cost,
+                        "search_stats": search_stats,
                         "tree_slices": int(getattr(tree, "multiplicity", 1)),
                         "slice_assignment": "local",
                         "rank_slices": [int(getattr(tree, "multiplicity", 1))],
+                        "search_seconds": search_seconds,
+                        "contract_seconds": contract_seconds,
                         "search_workers": search_workers,
                         "search_repeats": search_repeats,
                         "search_time": search_time,
@@ -353,6 +570,7 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                 total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
                 continue
 
+            contract_start = time.perf_counter()
             arrays = self._term_arrays(tn, backend)
             value, stats = parallel_contract(
                 tree,
@@ -360,18 +578,31 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                 method="mpi",
                 comm=comm,
                 return_stats=True,
+                implementation=self.parallel_opts.get("contract_implementation"),
             )
+            contract_seconds = time.perf_counter() - contract_start
             gathered_stats = comm.gather(stats, root=0)
             if rank == 0:
+                if print_stats:
+                    print(
+                        "tn_contract_done "
+                        f"term={term_index} "
+                        f"contract_seconds={contract_seconds:.3f}",
+                        flush=True,
+                    )
                 self.parallel_stats.append(
                     {
+                        "term_index": term_index,
                         "term_factors": tuple(factors),
-                        "path_cost": contraction_tree_costs(tree),
+                        "path_cost": path_cost,
+                        "search_stats": search_stats,
                         "tree_slices": stats.nslices,
                         "slice_assignment": stats.assignment,
                         "rank_slices": [
                             item.local_slices for item in gathered_stats
                         ],
+                        "search_seconds": search_seconds,
+                        "contract_seconds": contract_seconds,
                         "search_workers": search_workers,
                         "search_repeats": search_repeats,
                         "search_time": search_time,
@@ -382,22 +613,73 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                 )
                 total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
 
+        if self.rank == 0 and save_tree_path and loaded_trees is None:
+            path = Path(save_tree_path)
+            path.parent.mkdir(parents=True, exist_ok=True)
+            with path.open("wb") as f:
+                pickle.dump(
+                    {
+                        "trees": saved_trees,
+                        "costs": saved_costs,
+                        "nterms": len(saved_trees),
+                    },
+                    f,
+                    protocol=pickle.HIGHEST_PROTOCOL,
+                )
+            if print_stats:
+                print(
+                    f"tn_tree_saved path={save_tree_path} nterms={len(saved_trees)}",
+                    flush=True,
+                )
+
+        if search_only:
+            return np.nan
+
         return np.nan if rank != 0 else float(np.real(total_value))
 
     def _contract_term_unsliced(self, tn, tree, backend):
+        contract_implementation = self.parallel_opts.get("contract_implementation")
+        if contract_implementation == "cpp":
+            if backend.backend != "torch":
+                raise ValueError("contract_implementation='cpp' requires torch backend.")
+            from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype
+            from qibotn.torch_contractor import contract_tree_cpp
+
+            arrays = [
+                _torch_cpu_array(array, dtype=_torch_dtype(self.dtype))
+                for array in tn.arrays
+            ]
+            nslices = int(getattr(tree, "multiplicity", 1))
+            if nslices > 1:
+                total = None
+                for slice_id in range(nslices):
+                    value = contract_tree_cpp(tree, tree.slice_arrays(arrays, slice_id))
+                    total = value if total is None else total + value
+                return total
+            return contract_tree_cpp(tree, arrays)
+
         if backend.backend == "torch":
-            import torch
-            from qibotn.backends.quimb import _torch_cpu_array
+            from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype
 
             for tensor in tn.tensors:
-                tensor._data = _torch_cpu_array(tensor._data, dtype=torch.complex128)
-            return tn.contract(all, output_inds=(), optimize=tree, backend="torch")
+                tensor._data = _torch_cpu_array(
+                    tensor._data,
+                    dtype=_torch_dtype(self.dtype),
+                )
+            return tn.contract(
+                all,
+                output_inds=(),
+                optimize=tree,
+                backend="torch",
+                implementation=contract_implementation,
+            )
 
         return tn.contract(
             all,
             output_inds=(),
             optimize=tree,
             backend=backend.backend,
+            implementation=contract_implementation,
         )
 
     def _mpi_slicing_opts(self, user_slicing_opts):
@@ -405,11 +687,12 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
 
     def _term_arrays(self, tn, backend):
         if backend.backend == "torch":
-            import torch
-            from qibotn.backends.quimb import _torch_cpu_array
+            from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype
 
             return [
-                _torch_cpu_array(array, dtype=torch.complex128)
+                _torch_cpu_array(array, dtype=_torch_dtype(self.dtype))
                 for array in tn.arrays
             ]
-        return [backend.engine.asarray(array) for array in tn.arrays]
+        from qibotn.backends.quimb import _numpy_dtype
+
+        return [backend.engine.asarray(array, dtype=_numpy_dtype(self.dtype)) for array in tn.arrays]
diff --git a/src/qibotn/backends/quimb.py b/src/qibotn/backends/quimb.py
index ea894d9..3d49b00 100644
--- a/src/qibotn/backends/quimb.py
+++ b/src/qibotn/backends/quimb.py
@@ -62,12 +62,26 @@ def _torch_cpu_array(data, dtype=None):
     return x
 
 
-def _arrays_to_backend(arrays, backend, engine):
-    if backend == "torch":
-        import torch
+def _torch_dtype(dtype):
+    import torch
 
-        return [_torch_cpu_array(array, dtype=torch.complex128) for array in arrays]
-    return [engine.asarray(array) for array in arrays]
+    if dtype in ("complex64", "single"):
+        return torch.complex64
+    return torch.complex128
+
+
+def _numpy_dtype(dtype):
+    import numpy as np
+
+    if dtype in ("complex64", "single"):
+        return np.complex64
+    return np.complex128
+
+
+def _arrays_to_backend(arrays, backend, engine, dtype="complex128"):
+    if backend == "torch":
+        return [_torch_cpu_array(array, dtype=_torch_dtype(dtype)) for array in arrays]
+    return [engine.asarray(array, dtype=_numpy_dtype(dtype)) for array in arrays]
 
 
 def _pauli_term_to_dense_operator(factors):
@@ -145,7 +159,7 @@ def pauli_product_expectation(
     return tn.contract(all, output_inds=(), optimize=optimize, backend=backend)
 
 
-def __init__(self, quimb_backend="numpy", contraction_optimizer="auto-hq"):
+def __init__(self, quimb_backend="torch", contraction_optimizer="auto-hq"):
     super(self.__class__, self).__init__()
 
     self.name = "qibotn"
@@ -198,7 +212,7 @@ def circuit_ansatz(self):
 
 
 def setup_backend_specifics(
-    self, quimb_backend="numpy", contractions_optimizer="auto-hq"
+    self, quimb_backend="torch", contractions_optimizer="auto-hq"
 ):
     """Setup backend specifics.
     Args:
@@ -645,7 +659,7 @@ METHODS = {
 }
 
 
-def _generate_backend(quimb_backend: str = "numpy"):
+def _generate_backend(quimb_backend: str = "torch"):
     bases = (QibotnBackend,)
 
     if quimb_backend == "numpy":
@@ -653,9 +667,14 @@ def _generate_backend(quimb_backend: str = "numpy"):
 
         bases += (NumpyBackend,)
     elif quimb_backend == "torch":
-        from qiboml.backends import PyTorchBackend
+        try:
+            from qiboml.backends import PyTorchBackend
+        except ImportError:
+            from qibo.backends import NumpyBackend
 
-        bases += (PyTorchBackend,)
+            bases += (NumpyBackend,)
+        else:
+            bases += (PyTorchBackend,)
     elif quimb_backend == "jax":
         from qiboml.backends import JaxBackend
 
diff --git a/src/qibotn/backends/vidal.py b/src/qibotn/backends/vidal.py
index 02c8a93..8fbd4d5 100644
--- a/src/qibotn/backends/vidal.py
+++ b/src/qibotn/backends/vidal.py
@@ -39,6 +39,162 @@ def _symbolic_hamiltonian_to_pauli_terms(hamiltonian):
     return terms
 
 
+def _symbolic_hamiltonian_to_operator_terms(hamiltonian):
+    terms = []
+    factor_pattern = re.compile(r"([^\d]+)(\d+)")
+    paulis = {
+        "I": np.eye(2, dtype=np.complex128),
+        "X": np.array([[0, 1], [1, 0]], dtype=np.complex128),
+        "Y": np.array([[0, -1j], [1j, 0]], dtype=np.complex128),
+        "Z": np.array([[1, 0], [0, -1]], dtype=np.complex128),
+    }
+    for term in hamiltonian.terms:
+        ops_by_site = {}
+        for factor in term.factors:
+            site = getattr(factor, "target_qubit", None)
+            matrix = getattr(factor, "matrix", None)
+            if site is None or matrix is None:
+                match = factor_pattern.match(str(factor))
+                if match is None:
+                    raise ValueError(f"Unsupported observable factor {factor!r}.")
+                name = match.group(1).upper()
+                if name not in paulis:
+                    raise ValueError(f"Unsupported observable operator {name!r}.")
+                site = int(match.group(2))
+                matrix = paulis[name]
+            matrix = np.asarray(matrix, dtype=np.complex128)
+            site = int(site)
+            if site in ops_by_site:
+                ops_by_site[site] = ops_by_site[site] @ matrix
+            else:
+                ops_by_site[site] = matrix
+        terms.append((complex(term.coefficient), tuple(ops_by_site.items())))
+    return terms
+
+
+def _dense_operator_to_product_terms(coeff, qubits, matrix):
+    """Expand a dense k-local operator into product-matrix terms.
+
+    The dense matrix basis is ordered by the provided ``qubits`` sequence.  For
+    example, ``qubits=[2, 5]`` means matrix rows/columns are ordered as
+    ``|q2 q5>``.
+    """
+    qubits = tuple(int(qubit) for qubit in qubits)
+    if len(set(qubits)) != len(qubits):
+        raise ValueError("Dense observable qubits must be unique.")
+    matrix = np.asarray(matrix, dtype=np.complex128)
+    dim = 2 ** len(qubits)
+    if matrix.shape != (dim, dim):
+        raise ValueError(
+            "Dense observable matrix shape must be "
+            f"({dim}, {dim}) for {len(qubits)} qubits."
+        )
+
+    units = [
+        np.array([[1, 0], [0, 0]], dtype=np.complex128),
+        np.array([[0, 1], [0, 0]], dtype=np.complex128),
+        np.array([[0, 0], [1, 0]], dtype=np.complex128),
+        np.array([[0, 0], [0, 1]], dtype=np.complex128),
+    ]
+    terms = []
+    for row in range(dim):
+        for col in range(dim):
+            value = complex(coeff) * complex(matrix[row, col])
+            if value == 0:
+                continue
+            ops = []
+            for offset, site in enumerate(qubits):
+                shift = len(qubits) - offset - 1
+                out_bit = (row >> shift) & 1
+                in_bit = (col >> shift) & 1
+                ops.append((site, units[2 * out_bit + in_bit]))
+            terms.append((value, tuple(ops)))
+    return terms
+
+
+def _dense_observable_to_operator_terms(observable):
+    if not isinstance(observable, dict):
+        return None
+
+    if "matrix" in observable:
+        terms = [observable]
+    else:
+        terms = observable.get("dense_terms")
+        if terms is None:
+            raw_terms = observable.get("terms")
+            if not raw_terms or not any("matrix" in term for term in raw_terms):
+                return None
+            terms = raw_terms
+
+    operator_terms = []
+    for term in terms:
+        if "matrix" not in term:
+            raise ValueError("Dense observable terms must include a matrix.")
+        qubits = term.get("qubits", term.get("sites"))
+        if qubits is None:
+            raise ValueError("Dense observable terms must include qubits or sites.")
+        operator_terms.extend(
+            _dense_operator_to_product_terms(
+                term.get("coefficient", 1.0),
+                qubits,
+                term["matrix"],
+            )
+        )
+    return operator_terms
+
+
+def _operator_terms_to_mpo(terms, nqubits):
+    """Build an exact direct-sum MPO for product-operator terms.
+
+    This intentionally favors correctness and generality over compression: an
+    ``m``-term sum becomes an MPO with bond dimension ``m``.  Local Hamiltonians
+    can be compressed later without changing the public expectation path.
+    """
+    identity = np.eye(2, dtype=np.complex128)
+    expanded_terms = []
+    for coeff, ops in terms:
+        local_ops = [identity for _ in range(nqubits)]
+        for site, matrix in ops:
+            site = int(site)
+            if site < 0 or site >= nqubits:
+                raise ValueError(f"Observable site {site} is outside the circuit.")
+            matrix = np.asarray(matrix, dtype=np.complex128)
+            if matrix.shape != (2, 2):
+                raise ValueError("Only qubit local operators with shape (2, 2) are supported.")
+            local_ops[site] = matrix
+        expanded_terms.append((complex(coeff), local_ops))
+
+    if not expanded_terms:
+        raise ValueError("Cannot build an MPO from an empty observable.")
+
+    bond_dim = len(expanded_terms)
+    mpo = []
+    for site in range(nqubits):
+        left_dim = 1 if site == 0 else bond_dim
+        right_dim = 1 if site == nqubits - 1 else bond_dim
+        tensor = np.zeros((left_dim, 2, 2, right_dim), dtype=np.complex128)
+        for term_index, (coeff, local_ops) in enumerate(expanded_terms):
+            left = 0 if site == 0 else term_index
+            right = 0 if site == nqubits - 1 else term_index
+            op = coeff * local_ops[site] if site == 0 else local_ops[site]
+            tensor[left, :, :, right] += op
+        mpo.append(tensor)
+    return mpo
+
+
+def _observable_mpo_tensors(observable, nqubits=None):
+    if isinstance(observable, dict):
+        if "mpo_tensors" in observable:
+            return observable["mpo_tensors"]
+        if "mpo" in observable:
+            return observable["mpo"]
+        if nqubits is not None:
+            terms = _dense_observable_to_operator_terms(observable)
+            if terms is not None:
+                return _operator_terms_to_mpo(terms, nqubits)
+    return None
+
+
 def _unsupported_reason(circuit):
     for gate in circuit.queue:
         name = getattr(gate, "name", gate.__class__.__name__)
@@ -71,6 +227,44 @@ def _can_route_non_adjacent(circuit):
     return True
 
 
+@dataclass
+class _PreparedCircuit:
+    nqubits: int
+    queue: list
+
+
+def _decompose_gate_for_mps(gate, nqubits, stack=()):
+    sites = _gate_sites(gate)
+    if len(sites) <= 2:
+        return [gate]
+    if gate in stack or not hasattr(gate, "decompose"):
+        name = getattr(gate, "name", gate.__class__.__name__)
+        raise ValueError(f"gate {name} acts on {len(sites)} qubits")
+
+    free = [qubit for qubit in range(nqubits) if qubit not in sites]
+    try:
+        decomposed = gate.decompose(*free, use_toffolis=False, method="standard")
+    except TypeError:
+        decomposed = gate.decompose(*free)
+    if not decomposed or decomposed == [gate]:
+        name = getattr(gate, "name", gate.__class__.__name__)
+        raise ValueError(f"gate {name} could not be decomposed for Vidal MPS")
+
+    result = []
+    for item in decomposed:
+        result.extend(_decompose_gate_for_mps(item, nqubits, stack + (gate,)))
+    return result
+
+
+def _prepare_circuit_for_mps(circuit, decompose=True):
+    if not decompose:
+        return circuit
+    queue = []
+    for gate in circuit.queue:
+        queue.extend(_decompose_gate_for_mps(gate, circuit.nqubits))
+    return _PreparedCircuit(nqubits=circuit.nqubits, queue=queue)
+
+
 @dataclass
 class VidalBackend(QibotnBackend, NumpyBackend):
     """QiboTN backend using Vidal/TEBD when possible.
@@ -89,14 +283,16 @@ class VidalBackend(QibotnBackend, NumpyBackend):
         self.name = "qibotn"
         self.platform = "vidal"
         self.precision = "double"
+        self.rank = 0
         self.last_truncation_error = 0.0
+        self.last_max_truncation_error = 0.0
         self.configure_tn_simulation()
 
     def configure_tn_simulation(
         self,
         ansatz: str = "MPS",
-        max_bond_dimension: int = 10,
-        cut_ratio: float = 1e-9,
+        max_bond_dimension: int | None = 10,
+        cut_ratio: float | None = 1e-9,
         trunc_tracking_mode: str = "C",
         svd_control: str = "E!",
         ini_bond_dimension: int = 1,
@@ -108,6 +304,7 @@ class VidalBackend(QibotnBackend, NumpyBackend):
         mpi_num_procs: int = 1,
         mpi_where_barriers: int = -1,
         mpi_isometrization: int = -1,
+        mpi_term_batch_size: int | None = None,
         fallback: bool = True,
     ):
         self.ansatz = ansatz
@@ -124,6 +321,7 @@ class VidalBackend(QibotnBackend, NumpyBackend):
         self.mpi_num_procs = mpi_num_procs
         self.mpi_where_barriers = mpi_where_barriers
         self.mpi_isometrization = mpi_isometrization
+        self.mpi_term_batch_size = mpi_term_batch_size
         self.fallback = fallback
         self._fallback_backend = None
 
@@ -157,10 +355,15 @@ class VidalBackend(QibotnBackend, NumpyBackend):
             raise NotImplementedError(reason)
         return self._qmatchatea_fallback()
 
+    def _preprocess_circuit(self, circuit, compile_circuit):
+        """Decompose unsupported multi-qubit gates for the local Vidal path."""
+        return _prepare_circuit_for_mps(circuit, decompose=True)
+
     def _run_fast_executor(self, circuit, compile_circuit=True):
         if self.mpi_approach == "CT":
             from mpi4py import MPI
 
+            self.rank = MPI.COMM_WORLD.Get_rank()
             executor = SegmentVidalMPIExecutor(
                 nqubits=circuit.nqubits,
                 max_bond=self.max_bond_dimension,
@@ -169,6 +372,7 @@ class VidalBackend(QibotnBackend, NumpyBackend):
                 comm=MPI.COMM_WORLD,
             )
         else:
+            self.rank = 0
             executor = VidalTEBDExecutor(
                 nqubits=circuit.nqubits,
                 max_bond=self.max_bond_dimension,
@@ -183,9 +387,21 @@ class VidalBackend(QibotnBackend, NumpyBackend):
             backend = self._fallback_or_raise("VidalBackend supports only MPS.")
             return backend.expectation(circuit, observable, preprocess, compile_circuit)
 
+        original_circuit = circuit
         if compile_circuit is None:
             compile_circuit = self.compile_circuit
 
+        if preprocess:
+            try:
+                circuit = self._preprocess_circuit(circuit, compile_circuit)
+            except Exception as exc:
+                backend = self._fallback_or_raise(
+                    f"VidalBackend preprocessing failed: {exc}"
+                )
+                return backend.expectation(
+                    original_circuit, observable, preprocess, compile_circuit
+                )
+
         reason = _unsupported_reason(circuit)
         if reason is not None:
             # Non-adjacent gates can be routed at compile time
@@ -193,40 +409,51 @@ class VidalBackend(QibotnBackend, NumpyBackend):
                 pass  # proceed with Vidal + SWAP routing
             else:
                 backend = self._fallback_or_raise(reason)
-                return backend.expectation(circuit, observable, preprocess, compile_circuit)
+                return backend.expectation(
+                    original_circuit, observable, preprocess, compile_circuit
+                )
 
-        # preprocess=True requests qmatchatea-style transpilation to MPS topology.
-        # The fast path validates MPS-compatibility via _unsupported_reason above;
-        # if the circuit is already MPS-compatible, preprocessing is a no-op.
-        # If it isn't, _unsupported_reason already routed to fallback.
-        # So the fast path skips preprocessing — just warn once.
-        if preprocess:
-            import warnings
+        executor = self._run_fast_executor(circuit, compile_circuit=compile_circuit)
+        self.last_truncation_error = float(
+            executor.global_truncation_error()
+            if hasattr(executor, "global_truncation_error")
+            else executor.truncation_error
+        )
+        self.last_max_truncation_error = float(
+            executor.global_max_truncation_error()
+            if hasattr(executor, "global_max_truncation_error")
+            else executor.max_truncation_error
+        )
 
-            warnings.warn(
-                "VidalBackend fast path does not preprocess circuits. "
-                "If the circuit passes _unsupported_reason (all gates are "
-                "MPS-compatible), fast-path execution proceeds directly.",
-                stacklevel=2,
-            )
+        mpo_tensors = _observable_mpo_tensors(observable, circuit.nqubits)
+        if mpo_tensors is not None:
+            if self.mpi_approach == "CT":
+                value = executor.expectation_mpo_root(mpo_tensors)
+                from qtealeaves.tooling.mpisupport import MPI
+
+                if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
+                    return np.nan
+                return value
+            return executor.expectation_mpo(mpo_tensors)
 
         hamiltonian = check_observable(observable, circuit.nqubits)
         try:
-            terms = _symbolic_hamiltonian_to_pauli_terms(hamiltonian)
+            terms = _symbolic_hamiltonian_to_operator_terms(hamiltonian)
         except ValueError as exc:
             backend = self._fallback_or_raise(str(exc))
-            return backend.expectation(circuit, observable, preprocess, compile_circuit)
+            return backend.expectation(
+                original_circuit, observable, preprocess, compile_circuit
+            )
 
-        executor = self._run_fast_executor(circuit, compile_circuit=compile_circuit)
-        self.last_truncation_error = float(executor.truncation_error)
+        mpo_tensors = _operator_terms_to_mpo(terms, circuit.nqubits)
         if self.mpi_approach == "CT":
-            value = executor.expectation_pauli_sum_root(terms)
+            value = executor.expectation_mpo_root(mpo_tensors)
             from qtealeaves.tooling.mpisupport import MPI
 
             if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
                 return np.nan
-            return np.real(value)
-        return np.real(executor.expectation_pauli_sum(terms))
+            return value
+        return executor.expectation_mpo(mpo_tensors)
 
     def execute_circuit(
         self,
diff --git a/src/qibotn/backends/vidal_mpi_segment.py b/src/qibotn/backends/vidal_mpi_segment.py
index c0742c6..1014588 100644
--- a/src/qibotn/backends/vidal_mpi_segment.py
+++ b/src/qibotn/backends/vidal_mpi_segment.py
@@ -23,6 +23,7 @@ from qibotn.backends.vidal_tebd import (
     _is_two_qubit_batch,
     _make_two_site_update,
     _ones,
+    _real_if_close,
     _route_non_adjacent_gates,
     _svd,
     _tensor_update_from_numpy,
@@ -35,6 +36,8 @@ from qibotn.backends.vidal_tebd import (
 
 _EDGE_TAG = 1701
 _UPDATE_TAG = 1702
+_EXPECT_ENV_TAG = 1703
+_EXPECT_RESULT_TAG = 1704
 
 
 def _partition_sites(nsites, nranks):
@@ -49,9 +52,9 @@ def _partition_sites(nsites, nranks):
 @dataclass
 class SegmentVidalMPIExecutor:
     nqubits: int
-    max_bond: int
+    max_bond: int | None
     comm: object
-    cut_ratio: float = 1e-12
+    cut_ratio: float | None = 1e-12
     tensor_module: str = "torch"
 
     def __post_init__(self):
@@ -63,6 +66,10 @@ class SegmentVidalMPIExecutor:
         if self.start == self.end:
             raise ValueError("SegmentVidalMPIExecutor requires at least one site per rank.")
 
+        from qibotn.backends.cpu import _bind_numa_node
+
+        self.numa_domain = _bind_numa_node(self.rank)
+
         self.xp = _backend_module(self.tensor_module)
         if self.xp is np:
             self.dtype = np.complex128
@@ -82,11 +89,22 @@ class SegmentVidalMPIExecutor:
             for bond in range(self.start, self.end + 1)
         }
         self._accumulated_truncation_error = 0.0
+        self._max_truncation_error = 0.0
 
     @property
     def truncation_error(self):
         return self._accumulated_truncation_error
 
+    def global_truncation_error(self):
+        return self.comm.allreduce(self._accumulated_truncation_error, op=MPI.SUM)
+
+    @property
+    def max_truncation_error(self):
+        return self._max_truncation_error
+
+    def global_max_truncation_error(self):
+        return self.comm.allreduce(self._max_truncation_error, op=MPI.MAX)
+
     def owns_site(self, site):
         return self.start <= site < self.end
 
@@ -270,7 +288,12 @@ class SegmentVidalMPIExecutor:
     def _install_update(self, update):
         if isinstance(update["left"], np.ndarray):
             update = _tensor_update_from_numpy(self.xp, update, self.dtype)
-        self._accumulated_truncation_error += update.get("truncation_error", 0.0)
+        truncation_error = update.get("truncation_error", 0.0)
+        self._accumulated_truncation_error += truncation_error
+        self._max_truncation_error = max(
+            self._max_truncation_error,
+            truncation_error,
+        )
         site = update["site"]
         if self.owns_site(site):
             self.gammas[site] = update["left"]
@@ -288,26 +311,189 @@ class SegmentVidalMPIExecutor:
         }
         return self.comm.gather(payload, root=0)
 
-    def expectation_pauli_sum_root(self, terms):
-        payloads = self.gather_full_state()
-        if self.rank != 0:
-            return None
-        full = VidalTEBDExecutor(
-            self.nqubits,
-            self.max_bond,
-            cut_ratio=self.cut_ratio,
-            tensor_module=self.tensor_module,
+    def expectation_pauli_sum_root(self, terms, term_batch_size=None):
+        paulis = {
+            "I": self._eye(2),
+            "X": _asarray(self.xp, [[0, 1], [1, 0]], self.dtype),
+            "Y": _asarray(self.xp, [[0, -1j], [1j, 0]], self.dtype),
+            "Z": _asarray(self.xp, [[1, 0], [0, -1]], self.dtype),
+        }
+        operator_terms = [
+            (
+                coeff,
+                tuple((site, paulis[name.upper()]) for name, site in ops),
+            )
+            for coeff, ops in terms
+        ]
+        return self.expectation_operator_sum_root(
+            operator_terms,
+            term_batch_size=term_batch_size,
         )
-        for payload in payloads:
-            for site, tensor in payload["gammas"].items():
-                full.gammas[int(site)] = _asarray(self.xp, tensor, self.dtype)
-            for bond, tensor in payload["lambdas"].items():
-                full.lambdas[int(bond)] = _asarray(
-                    self.xp,
-                    tensor,
-                    self.xp.float64 if self.xp is not np else np.float64,
+
+    def expectation_operator_sum_root(self, terms, term_batch_size=None):
+        if term_batch_size is None:
+            term_batch_size = max(1, len(terms))
+        norm = self._distributed_product_expectation({})
+        total = 0.0 + 0.0j
+        for start in range(0, len(terms), int(term_batch_size)):
+            batch = terms[start : start + int(term_batch_size)]
+            values = self._distributed_operator_batch_expectation(batch, norm)
+            if self.rank == 0:
+                for (coeff, _), term_value in zip(batch, values):
+                    total += complex(coeff) * complex(term_value)
+        return None if self.rank != 0 else _real_if_close(total / norm)
+
+    def _eye(self, size):
+        if self.xp is np:
+            return np.eye(size, dtype=self.dtype)
+        return self.xp.eye(size, dtype=self.dtype, device=self.device)
+
+    def _distributed_product_expectation(self, operators):
+        if self.rank == 0:
+            env = self._segment_product_environment(operators)
+            if self.size == 1:
+                return env.reshape(-1)[0]
+            self.comm.send(_to_numpy(env), dest=1, tag=_EXPECT_ENV_TAG)
+            return self.comm.recv(source=self.size - 1, tag=_EXPECT_RESULT_TAG)
+
+        incoming = self.comm.recv(source=self.rank - 1, tag=_EXPECT_ENV_TAG)
+        env = self._segment_product_environment(operators, incoming)
+        if self.rank == self.size - 1:
+            self.comm.send(_to_numpy(env).reshape(-1)[0], dest=0, tag=_EXPECT_RESULT_TAG)
+        else:
+            self.comm.send(_to_numpy(env), dest=self.rank + 1, tag=_EXPECT_ENV_TAG)
+        return None
+
+    def _segment_product_environment(self, operators, incoming=None):
+        if incoming is None:
+            env = _asarray(
+                self.xp,
+                np.eye(len(self.lambdas[self.start]), dtype=np.complex128),
+                self.dtype,
+            )
+        else:
+            env = _asarray(self.xp, incoming, self.dtype)
+
+        identity = self._eye(2)
+        for site in range(self.start, self.end):
+            tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
+            op = operators.get(site, identity)
+            env = self.xp.einsum(
+                "xy,xsb,st,ytd->bd", env, self._conj(tensor), op, tensor
+            )
+        return env
+
+    def _distributed_operator_batch_expectation(self, terms, norm):
+        if not terms:
+            return []
+        if all(not ops for _, ops in terms):
+            return [norm] * len(terms) if self.rank == 0 else None
+
+        batch_ops = [
+            {int(site): _asarray(self.xp, matrix, self.dtype) for site, matrix in ops}
+            for _, ops in terms
+        ]
+        if self.rank == 0:
+            env = self._segment_operator_batch_environment(batch_ops)
+            if self.size == 1:
+                return list(env.reshape(len(terms), -1)[:, 0])
+            self.comm.send(_to_numpy(env), dest=1, tag=_EXPECT_ENV_TAG)
+            return self.comm.recv(source=self.size - 1, tag=_EXPECT_RESULT_TAG)
+
+        incoming = self.comm.recv(source=self.rank - 1, tag=_EXPECT_ENV_TAG)
+        env = self._segment_operator_batch_environment(batch_ops, incoming)
+        if self.rank == self.size - 1:
+            values = list(_to_numpy(env).reshape(len(terms), -1)[:, 0])
+            self.comm.send(values, dest=0, tag=_EXPECT_RESULT_TAG)
+        else:
+            self.comm.send(_to_numpy(env), dest=self.rank + 1, tag=_EXPECT_ENV_TAG)
+        return None
+
+    def _segment_operator_batch_environment(self, batch_ops, incoming=None):
+        batch_size = len(batch_ops)
+        if incoming is None:
+            dim = len(self.lambdas[self.start])
+            env = _asarray(
+                self.xp,
+                np.tile(np.eye(dim, dtype=np.complex128), (batch_size, 1, 1)),
+                self.dtype,
+            )
+        else:
+            env = _asarray(self.xp, incoming, self.dtype)
+
+        identity = self._eye(2)
+        for site in range(self.start, self.end):
+            tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
+            ops = self.xp.stack(
+                [operators.get(site, identity) for operators in batch_ops],
+                axis=0,
+            )
+            env = self.xp.einsum(
+                "nxy,xsb,nst,ytd->nbd",
+                env,
+                self._conj(tensor),
+                ops,
+                tensor,
+            )
+        return env
+
+    def _conj(self, tensor):
+        return np.conjugate(tensor) if self.xp is np else tensor.conj()
+
+    def expectation_mpo_root(self, mpo_tensors):
+        if len(mpo_tensors) != self.nqubits:
+            raise ValueError(
+                f"Expected {self.nqubits} MPO tensors, got {len(mpo_tensors)}."
+            )
+        norm = self._distributed_product_expectation({})
+        if self.rank == 0:
+            env = self._segment_mpo_environment(mpo_tensors)
+            if self.size == 1:
+                return _real_if_close(env.reshape(-1)[0] / norm)
+            self.comm.send(_to_numpy(env), dest=1, tag=_EXPECT_ENV_TAG)
+            value = self.comm.recv(source=self.size - 1, tag=_EXPECT_RESULT_TAG)
+            return _real_if_close(value / norm)
+
+        incoming = self.comm.recv(source=self.rank - 1, tag=_EXPECT_ENV_TAG)
+        env = self._segment_mpo_environment(mpo_tensors, incoming)
+        if self.rank == self.size - 1:
+            self.comm.send(
+                _to_numpy(env).reshape(-1)[0],
+                dest=0,
+                tag=_EXPECT_RESULT_TAG,
+            )
+        else:
+            self.comm.send(_to_numpy(env), dest=self.rank + 1, tag=_EXPECT_ENV_TAG)
+        return None
+
+    def _segment_mpo_environment(self, mpo_tensors, incoming=None):
+        if incoming is None:
+            left_dim = len(self.lambdas[self.start])
+            env = _asarray(
+                self.xp,
+                np.zeros((left_dim, 1, left_dim), dtype=np.complex128),
+                self.dtype,
+            )
+            env[:, 0, :] = self._eye(left_dim)
+        else:
+            env = _asarray(self.xp, incoming, self.dtype)
+
+        for site in range(self.start, self.end):
+            mpo = _asarray(self.xp, mpo_tensors[site], self.dtype)
+            if mpo.ndim != 4 or mpo.shape[1:3] != (2, 2):
+                raise ValueError(
+                    "Each MPO tensor must have shape "
+                    "(left_bond, 2, 2, right_bond)."
                 )
-        return full.expectation_pauli_sum(terms)
+            tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
+            env = self.xp.einsum(
+                "xlc,xub,lutr,ctd->brd",
+                env,
+                self._conj(tensor),
+                mpo,
+                tensor,
+            )
+        return env
 
     def expectation_ring_xz_root(self):
         terms = [
diff --git a/src/qibotn/backends/vidal_tebd.py b/src/qibotn/backends/vidal_tebd.py
index e2b2640..25f001e 100644
--- a/src/qibotn/backends/vidal_tebd.py
+++ b/src/qibotn/backends/vidal_tebd.py
@@ -50,10 +50,10 @@ def _transpose(xp, tensor, axes):
     return np.transpose(tensor, axes) if xp is np else tensor.permute(*axes)
 
 
-def _vdot_real(xp, left, right):
+def _vdot(xp, left, right):
     if xp is np:
-        return np.vdot(left.reshape(-1), right.reshape(-1)).real
-    return xp.vdot(left.reshape(-1), right.reshape(-1)).real
+        return np.vdot(left.reshape(-1), right.reshape(-1))
+    return xp.vdot(left.reshape(-1), right.reshape(-1))
 
 
 def _to_float(x):
@@ -62,6 +62,19 @@ def _to_float(x):
     return float(x)
 
 
+def _to_scalar(x):
+    if hasattr(x, "detach"):
+        return x.detach().cpu().item()
+    if isinstance(x, np.ndarray):
+        return x.item()
+    return x
+
+
+def _real_if_close(x, tol=1000):
+    value = np.real_if_close(x, tol=tol)
+    return value.item() if isinstance(value, np.ndarray) else value
+
+
 def _to_numpy(tensor):
     if hasattr(tensor, "detach"):
         return tensor.detach().cpu().numpy()
@@ -154,8 +167,8 @@ def _safe_inverse(xp, values):
 @dataclass
 class VidalTEBDExecutor:
     nqubits: int
-    max_bond: int
-    cut_ratio: float = 1e-12
+    max_bond: int | None
+    cut_ratio: float | None = 1e-12
     tensor_module: str = "torch"
 
     def __post_init__(self):
@@ -175,6 +188,7 @@ class VidalTEBDExecutor:
             _ones(self.xp, 1, self.dtype, self.device) for _ in range(self.nqubits + 1)
         ]
         self._accumulated_truncation_error = 0.0
+        self._max_truncation_error = 0.0
 
     def run_circuit(self, circuit, compile_circuit=True):
         gates = circuit.queue
@@ -189,6 +203,10 @@ class VidalTEBDExecutor:
     def truncation_error(self):
         return self._accumulated_truncation_error
 
+    @property
+    def max_truncation_error(self):
+        return self._max_truncation_error
+
     def _apply_gate(self, gate):
         sites = _gate_sites(gate)
         matrix = _asarray(self.xp, gate.matrix(), self.dtype)
@@ -232,6 +250,10 @@ class VidalTEBDExecutor:
         update = _make_two_site_update(item, umat, singvals, vh,
                                        self.max_bond, self.cut_ratio, self.xp)
         self._accumulated_truncation_error += update["truncation_error"]
+        self._max_truncation_error = max(
+            self._max_truncation_error,
+            update["truncation_error"],
+        )
         i = update["site"]
         self.gammas[i] = update["left"]
         self.gammas[i + 1] = update["right"]
@@ -252,25 +274,37 @@ class VidalTEBDExecutor:
             "Y": _asarray(self.xp, [[0, -1j], [1j, 0]], self.dtype),
             "Z": _asarray(self.xp, [[1, 0], [0, -1]], self.dtype),
         }
-        value = 0.0
+        operator_terms = [
+            (
+                coeff,
+                tuple((site, paulis[name.upper()]) for name, site in ops),
+            )
+            for coeff, ops in terms
+        ]
+        return self.expectation_operator_sum(operator_terms)
+
+    def expectation_operator_sum(self, terms):
+        value = 0.0 + 0.0j
         norm = self.norm()
         for coeff, ops in terms:
-            ops = tuple((name.upper(), site) for name, site in ops)
+            operators = {
+                int(site): _asarray(self.xp, matrix, self.dtype)
+                for site, matrix in ops
+            }
             if len(ops) == 0:
                 term_value = norm
-            elif len(ops) == 1:
-                name, site = ops[0]
-                term_value = _to_float(self._expect_one_site(site, paulis[name]))
-            elif len(ops) == 2 and abs(ops[0][1] - ops[1][1]) == 1:
-                (name0, site0), (name1, site1) = sorted(ops, key=lambda item: item[1])
-                term_value = _to_float(
-                    self._expect_adjacent(site0, paulis[name0], paulis[name1])
+            elif len(operators) == 1:
+                site, matrix = next(iter(operators.items()))
+                term_value = _to_scalar(self._expect_one_site(site, matrix))
+            elif len(operators) == 2 and abs(max(operators) - min(operators)) == 1:
+                site0, site1 = sorted(operators)
+                term_value = _to_scalar(
+                    self._expect_adjacent(site0, operators[site0], operators[site1])
                 )
             else:
-                operators = {site: paulis[name] for name, site in ops}
-                term_value = _to_float(self.expect_product_operators(operators))
-            value += float(np.real(coeff)) * term_value
-        return value / norm
+                term_value = _to_scalar(self.expect_product_operators(operators))
+            value += complex(coeff) * complex(term_value)
+        return _real_if_close(value / norm)
 
     def _expect_one_site(self, site, op):
         theta = self.xp.einsum(
@@ -280,7 +314,7 @@ class VidalTEBDExecutor:
             self.lambdas[site + 1],
         )
         op_theta = self.xp.einsum("us,asb->aub", op, theta)
-        return _vdot_real(self.xp, theta, op_theta)
+        return _vdot(self.xp, theta, op_theta)
 
     def _expect_adjacent(self, site, op_left, op_right):
         theta = self.xp.einsum(
@@ -292,7 +326,7 @@ class VidalTEBDExecutor:
             self.lambdas[site + 2],
         )
         op_theta = self.xp.einsum("us,vt,astc->auvc", op_left, op_right, theta)
-        return _vdot_real(self.xp, theta, op_theta)
+        return _vdot(self.xp, theta, op_theta)
 
     def expect_product_operators(self, operators):
         env = _asarray(self.xp, [[1.0 + 0.0j]], self.dtype)
@@ -303,10 +337,38 @@ class VidalTEBDExecutor:
             env = self.xp.einsum(
                 "xy,xsb,st,ytd->bd", env, _conj(self.xp, tensor), op, tensor
             )
-        return env.reshape(-1)[0].real
+        return env.reshape(-1)[0]
 
     def norm(self):
-        return _to_float(self.expect_product_operators({}))
+        return float(np.real(_to_scalar(self.expect_product_operators({}))))
+
+    def expectation_mpo(self, mpo_tensors):
+        """Compute ``<psi|MPO|psi> / <psi|psi>``.
+
+        MPO tensors are expected in ``(left_bond, phys_out, phys_in, right_bond)``
+        order, with physical dimension 2 on every site.
+        """
+        if len(mpo_tensors) != self.nqubits:
+            raise ValueError(
+                f"Expected {self.nqubits} MPO tensors, got {len(mpo_tensors)}."
+            )
+        env = _asarray(self.xp, [[[1.0 + 0.0j]]], self.dtype)
+        for site, raw_mpo in enumerate(mpo_tensors):
+            mpo = _asarray(self.xp, raw_mpo, self.dtype)
+            if mpo.ndim != 4 or mpo.shape[1:3] != (2, 2):
+                raise ValueError(
+                    "Each MPO tensor must have shape "
+                    "(left_bond, 2, 2, right_bond)."
+                )
+            tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
+            env = self.xp.einsum(
+                "xlc,xub,lutr,ctd->brd",
+                env,
+                _conj(self.xp, tensor),
+                mpo,
+                tensor,
+            )
+        return _real_if_close(_to_scalar(env.reshape(-1)[0]) / self.norm())
 
 
 def _build_theta_svd_matrix(op, xp, lam_left, lam_mid, lam_right, gamma_left, gamma_right):
@@ -328,8 +390,8 @@ def _build_theta_svd_matrix(op, xp, lam_left, lam_mid, lam_right, gamma_left, ga
 
 def _choose_bond(singvals, max_bond, cut_ratio, xp):
     max_possible = int(singvals.shape[0])
-    keep = min(max_possible, max_bond)
-    if cut_ratio > 0 and max_possible > 0:
+    keep = max_possible if max_bond is None else min(max_possible, int(max_bond))
+    if cut_ratio is not None and cut_ratio > 0 and max_possible > 0:
         threshold = singvals[0] * cut_ratio
         if xp is np:
             ratio_keep = int(np.count_nonzero(singvals > threshold))
@@ -415,8 +477,8 @@ class _RoutedTwoQubitGate:
     name = "routed_two_qubit"
     control_qubits = ()
 
-    def __init__(self, original_gate, left_phys, right_phys):
-        self.target_qubits = (left_phys, right_phys)
+    def __init__(self, original_gate, physical_sites):
+        self.target_qubits = tuple(physical_sites)
         self._matrix = original_gate.matrix()
 
     def matrix(self):
@@ -447,8 +509,10 @@ def _route_non_adjacent_gates(gates, nqubits):
         for pos in range(right - 1, left, -1):
             routed.append(_SWAPGate(pos, pos + 1))
 
-        # Apply the original gate at the adjacent physical positions
-        routed.append(_RoutedTwoQubitGate(gate, left, left + 1))
+        # Apply the original gate in its original qubit order.  For gates like
+        # CNOT(5, 0), sorting the routed sites would swap control and target.
+        physical_map = {left: left, right: left + 1}
+        routed.append(_RoutedTwoQubitGate(gate, [physical_map[site] for site in sites]))
 
         # Reverse SWAPs to restore original ordering
         for pos in range(left + 1, right):
diff --git a/src/qibotn/circuit_convertor.py b/src/qibotn/circuit_convertor.py
index 1c8b3ee..900cdf7 100644
--- a/src/qibotn/circuit_convertor.py
+++ b/src/qibotn/circuit_convertor.py
@@ -1,6 +1,19 @@
-import cupy as cp
 import numpy as np
 
+try:
+    import cupy as cp
+except ImportError:  # pragma: no cover - exercised on CPU-only installations
+    cp = None
+
+
+def _require_cupy():
+    if cp is None:
+        raise ImportError(
+            "The cuQuantum circuit converter requires cupy. "
+            "Install the GPU dependencies or use the CPU backend."
+        )
+    return cp
+
 # Reference: https://github.com/NVIDIA/cuQuantum/tree/main/python/samples/cutensornet/circuit_converter
 
 
@@ -19,7 +32,7 @@ class QiboCircuitToEinsum:
     """
 
     def __init__(self, circuit, dtype="complex128"):
-        self.backend = cp
+        self.backend = _require_cupy()
         self.dtype = getattr(self.backend, dtype)
         self.init_basis_map(self.backend, dtype)
         self.init_intermediate_circuit(circuit)
@@ -116,7 +129,9 @@ class QiboCircuitToEinsum:
             required_shape = self.op_shape_from_qubits(len(gate_qubits))
             self.gate_tensors.append(
                 (
-                    cp.asarray(gate.matrix(), dtype=self.dtype).reshape(required_shape),
+                    self.backend.asarray(gate.matrix(), dtype=self.dtype).reshape(
+                        required_shape
+                    ),
                     gate_qubits,
                 )
             )
@@ -161,7 +176,7 @@ class QiboCircuitToEinsum:
             required_shape = self.op_shape_from_qubits(len(gate_qubits))
             self.gate_tensors_inverse.append(
                 (
-                    cp.asarray(gate.matrix()).reshape(required_shape),
+                    self.backend.asarray(gate.matrix()).reshape(required_shape),
                     gate_qubits,
                 )
             )
@@ -169,7 +184,7 @@ class QiboCircuitToEinsum:
         # self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit.
         self.active_qubits_inverse = np.unique(gates_qubits_inverse)
 
-    def get_pauli_gates(self, pauli_map, dtype="complex128", backend=cp):
+    def get_pauli_gates(self, pauli_map, dtype="complex128", backend=None):
         """Populate the gates for all pauli operators.
 
         Parameters:
@@ -180,6 +195,8 @@ class QiboCircuitToEinsum:
         Returns:
             A sequence of pauli gates.
         """
+        if backend is None:
+            backend = _require_cupy()
         asarray = backend.asarray
         pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype)
         pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype)
diff --git a/src/qibotn/circuit_to_mps.py b/src/qibotn/circuit_to_mps.py
index 680ef9b..48cf55d 100644
--- a/src/qibotn/circuit_to_mps.py
+++ b/src/qibotn/circuit_to_mps.py
@@ -1,10 +1,23 @@
-import cupy as cp
-import cuquantum.bindings.cutensornet as cutn
 import numpy as np
 
 from qibotn.circuit_convertor import QiboCircuitToEinsum
 from qibotn.mps_utils import apply_gate, initial
 
+try:
+    import cupy as cp
+    import cuquantum.bindings.cutensornet as cutn
+except ImportError:  # pragma: no cover - exercised on CPU-only installations
+    cp = None
+    cutn = None
+
+
+def _require_cuquantum():
+    if cp is None or cutn is None:
+        raise ImportError(
+            "The cuQuantum MPS converter requires cupy and cuquantum. "
+            "Install the GPU dependencies or use the CPU backend."
+        )
+
 
 class QiboCircuitToMPS:
     """A helper class to convert Qibo circuit to MPS.
@@ -23,6 +36,7 @@ class QiboCircuitToMPS:
         dtype="complex128",
         rand_seed=0,
     ):
+        _require_cuquantum()
         np.random.seed(rand_seed)
         cp.random.seed(rand_seed)
 
@@ -44,4 +58,6 @@ class QiboCircuitToMPS:
             )
 
     def __del__(self):
-        cutn.destroy(self.handle)
+        handle = getattr(self, "handle", None)
+        if cutn is not None and handle is not None:
+            cutn.destroy(handle)
diff --git a/src/qibotn/csrc/torch_contractor.cpp b/src/qibotn/csrc/torch_contractor.cpp
new file mode 100644
index 0000000..9551460
--- /dev/null
+++ b/src/qibotn/csrc/torch_contractor.cpp
@@ -0,0 +1,1024 @@
+#include <torch/extension.h>
+
+#include <ATen/ops/bmm_ops.h>
+#include <ATen/ops/mm_ops.h>
+#include <ATen/ops/mul_ops.h>
+
+#ifdef QIBOTN_USE_MKL
+#include <mkl_cblas.h>
+#include <mkl_service.h>
+#endif
+
+#include <cstdint>
+#include <cstdlib>
+#include <complex>
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+namespace {
+
+using Tensor = torch::Tensor;
+
+struct ContractOp {
+  int64_t out_id;
+  int64_t left_id;
+  int64_t right_id;
+  int64_t kind;
+  std::vector<int64_t> left_perm;
+  std::vector<int64_t> left_shape;
+  std::vector<int64_t> right_perm;
+  std::vector<int64_t> right_shape;
+  std::vector<int64_t> out_shape;
+  std::vector<int64_t> out_perm;
+};
+
+bool env_enabled(const char* name) {
+  const char* raw = std::getenv(name);
+  if (raw == nullptr) {
+    return false;
+  }
+  std::string value(raw);
+  return value == "1" || value == "true" || value == "TRUE" ||
+      value == "yes" || value == "on";
+}
+
+int env_int(const char* name, int default_value) {
+  const char* raw = std::getenv(name);
+  if (raw == nullptr || raw[0] == '\0') {
+    return default_value;
+  }
+  char* end = nullptr;
+  long value = std::strtol(raw, &end, 10);
+  if (end == raw || value <= 0) {
+    return default_value;
+  }
+  return static_cast<int>(value);
+}
+
+int64_t env_i64(const char* name, int64_t default_value) {
+  const char* raw = std::getenv(name);
+  if (raw == nullptr || raw[0] == '\0') {
+    return default_value;
+  }
+  char* end = nullptr;
+  long long value = std::strtoll(raw, &end, 10);
+  if (end == raw || value <= 0) {
+    return default_value;
+  }
+  return static_cast<int64_t>(value);
+}
+
+struct WorkspaceKey {
+  c10::ScalarType dtype;
+  c10::DeviceType device_type;
+  c10::DeviceIndex device_index;
+  int64_t numel;
+
+  bool operator==(const WorkspaceKey& other) const {
+    return dtype == other.dtype && device_type == other.device_type &&
+        device_index == other.device_index && numel == other.numel;
+  }
+};
+
+struct WorkspaceKeyHash {
+  size_t operator()(const WorkspaceKey& key) const {
+    size_t h = static_cast<size_t>(key.numel);
+    h ^= static_cast<size_t>(key.dtype) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+    h ^= static_cast<size_t>(key.device_type) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+    h ^= static_cast<size_t>(key.device_index) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+    return h;
+  }
+};
+
+class Workspace {
+ public:
+  Workspace()
+      : max_cached_bytes_(read_size_env(
+            "QIBOTN_CPP_WORKSPACE_MAX_BYTES",
+            1LL * 1024 * 1024 * 1024)),
+        max_buffer_bytes_(read_size_env(
+            "QIBOTN_CPP_WORKSPACE_MAX_BUFFER_BYTES",
+            256LL * 1024 * 1024)) {}
+
+  Tensor acquire(
+      const std::vector<int64_t>& shape,
+      c10::ScalarType dtype,
+      c10::Device device) {
+    const auto numel = product(shape);
+    WorkspaceKey key{dtype, device.type(), device.index(), numel};
+    auto& bucket = pool_[key];
+    Tensor out;
+    if (!bucket.empty()) {
+      out = bucket.back();
+      bucket.pop_back();
+      cached_bytes_ -= tensor_bytes(out);
+      out.resize_(shape);
+    } else {
+      out = torch::empty(shape, torch::TensorOptions().dtype(dtype).device(device));
+    }
+    return out;
+  }
+
+  void release(const Tensor& x) {
+    if (!x.defined() || !x.is_contiguous() || x.storage_offset() != 0) {
+      return;
+    }
+    const auto bytes = tensor_bytes(x);
+    if (bytes > max_buffer_bytes_) {
+      return;
+    }
+    if (cached_bytes_ + bytes > max_cached_bytes_) {
+      return;
+    }
+    WorkspaceKey key{
+        x.scalar_type(),
+        x.device().type(),
+        x.device().index(),
+        x.numel(),
+    };
+    pool_[key].push_back(x);
+    cached_bytes_ += bytes;
+  }
+
+ private:
+  static int64_t read_size_env(const char* name, int64_t default_value) {
+    const char* raw = std::getenv(name);
+    if (raw == nullptr || raw[0] == '\0') {
+      return default_value;
+    }
+    char* end = nullptr;
+    const auto value = std::strtod(raw, &end);
+    if (end == raw || value < 0) {
+      return default_value;
+    }
+    std::string suffix(end);
+    int64_t multiplier = 1;
+    if (suffix == "g" || suffix == "G" || suffix == "gb" || suffix == "GB") {
+      multiplier = 1024LL * 1024 * 1024;
+    } else if (
+        suffix == "m" || suffix == "M" || suffix == "mb" || suffix == "MB") {
+      multiplier = 1024LL * 1024;
+    } else if (
+        suffix == "k" || suffix == "K" || suffix == "kb" || suffix == "KB") {
+      multiplier = 1024LL;
+    }
+    return static_cast<int64_t>(value * multiplier);
+  }
+
+  static int64_t product(const std::vector<int64_t>& shape) {
+    int64_t out = 1;
+    for (const auto value : shape) {
+      out *= value;
+    }
+    return out;
+  }
+
+  static int64_t tensor_bytes(const Tensor& x) {
+    return x.numel() * x.element_size();
+  }
+
+  std::unordered_map<WorkspaceKey, std::vector<Tensor>, WorkspaceKeyHash> pool_;
+  int64_t cached_bytes_ = 0;
+  int64_t max_cached_bytes_;
+  int64_t max_buffer_bytes_;
+};
+
+struct PreparedTensor {
+  Tensor tensor;
+  bool workspace_owned;
+};
+
+std::vector<int64_t> tensor_sizes(const Tensor& x) {
+  return std::vector<int64_t>(x.sizes().begin(), x.sizes().end());
+}
+
+PreparedTensor apply_preparation(
+    Workspace& workspace,
+    Tensor x,
+    const std::vector<int64_t>& perm,
+    const std::vector<int64_t>& shape) {
+  bool workspace_owned = false;
+  if (!perm.empty()) {
+    x = x.permute(perm);
+  }
+  if (!shape.empty()) {
+    try {
+      x = x.view(shape);
+    } catch (const c10::Error&) {
+      Tensor out = workspace.acquire(shape, x.scalar_type(), x.device());
+      out.view(tensor_sizes(x)).copy_(x);
+      x = out;
+      workspace_owned = true;
+    }
+  }
+  return PreparedTensor{x, workspace_owned};
+}
+
+std::vector<int64_t> broadcast_shape(
+    c10::IntArrayRef left,
+    c10::IntArrayRef right) {
+  const auto ndim = std::max(left.size(), right.size());
+  std::vector<int64_t> out(ndim, 1);
+  for (size_t i = 0; i < ndim; ++i) {
+    const auto li = left.size() < ndim - i ? 1 : left[i - (ndim - left.size())];
+    const auto ri = right.size() < ndim - i ? 1 : right[i - (ndim - right.size())];
+    if (li != ri && li != 1 && ri != 1) {
+      throw std::runtime_error("Shapes are not broadcastable.");
+    }
+    out[i] = std::max<int64_t>(li, ri);
+  }
+  return out;
+}
+
+std::vector<int64_t> matmul_output_shape(const Tensor& left, const Tensor& right) {
+  if (left.dim() == 2 && right.dim() == 2) {
+    return {left.size(-2), right.size(-1)};
+  }
+  auto batch = broadcast_shape(
+      left.sizes().slice(0, left.dim() - 2),
+      right.sizes().slice(0, right.dim() - 2));
+  batch.push_back(left.size(-2));
+  batch.push_back(right.size(-1));
+  return batch;
+}
+
+std::vector<int64_t> multiply_output_shape(const Tensor& left, const Tensor& right) {
+  return broadcast_shape(left.sizes(), right.sizes());
+}
+
+bool direct_cblas_mm(const Tensor& left, const Tensor& right, Tensor& out) {
+#ifndef QIBOTN_USE_MKL
+  return false;
+#else
+  static const bool direct_mkl_enabled = env_enabled("QIBOTN_CPP_DIRECT_MKL");
+  if (!direct_mkl_enabled) {
+    return false;
+  }
+  if (left.device().type() != c10::DeviceType::CPU ||
+      right.device().type() != c10::DeviceType::CPU ||
+      out.device().type() != c10::DeviceType::CPU) {
+    return false;
+  }
+  if (left.dim() != 2 || right.dim() != 2 || out.dim() != 2) {
+    return false;
+  }
+  if (!left.is_contiguous() || !right.is_contiguous() || !out.is_contiguous()) {
+    return false;
+  }
+  if (left.scalar_type() != right.scalar_type() ||
+      left.scalar_type() != out.scalar_type()) {
+    return false;
+  }
+  if (left.size(1) != right.size(0) ||
+      out.size(0) != left.size(0) ||
+      out.size(1) != right.size(1)) {
+    return false;
+  }
+
+  const int m = static_cast<int>(left.size(0));
+  const int k = static_cast<int>(left.size(1));
+  const int n = static_cast<int>(right.size(1));
+  const int lda = static_cast<int>(left.stride(0));
+  const int ldb = static_cast<int>(right.stride(0));
+  const int ldc = static_cast<int>(out.stride(0));
+  const int mkl_threads = env_int("QIBOTN_CPP_MKL_THREADS", 0);
+  const int previous_threads =
+      mkl_threads > 0 ? mkl_set_num_threads_local(mkl_threads) : 0;
+
+  if (left.scalar_type() == c10::ScalarType::ComplexFloat) {
+    const std::complex<float> alpha(1.0f, 0.0f);
+    const std::complex<float> beta(0.0f, 0.0f);
+    cblas_cgemm(
+        CblasRowMajor,
+        CblasNoTrans,
+        CblasNoTrans,
+        m,
+        n,
+        k,
+        &alpha,
+        left.data_ptr(),
+        lda,
+        right.data_ptr(),
+        ldb,
+        &beta,
+        out.data_ptr(),
+        ldc);
+    if (mkl_threads > 0) {
+      mkl_set_num_threads_local(previous_threads);
+    }
+    return true;
+  }
+  if (left.scalar_type() == c10::ScalarType::ComplexDouble) {
+    const std::complex<double> alpha(1.0, 0.0);
+    const std::complex<double> beta(0.0, 0.0);
+    cblas_zgemm(
+        CblasRowMajor,
+        CblasNoTrans,
+        CblasNoTrans,
+        m,
+        n,
+        k,
+        &alpha,
+        left.data_ptr(),
+        lda,
+        right.data_ptr(),
+        ldb,
+        &beta,
+        out.data_ptr(),
+        ldc);
+    if (mkl_threads > 0) {
+      mkl_set_num_threads_local(previous_threads);
+    }
+    return true;
+  }
+  if (mkl_threads > 0) {
+    mkl_set_num_threads_local(previous_threads);
+  }
+  return false;
+#endif
+}
+
+Tensor apply_output_preparation(
+    Tensor x,
+    const std::vector<int64_t>& shape,
+    const std::vector<int64_t>& perm) {
+  if (!shape.empty()) {
+    x = x.reshape(shape);
+  }
+  if (!perm.empty()) {
+    x = x.permute(perm);
+  }
+  return x;
+}
+
+struct ExecutedOp {
+  Tensor left;
+  Tensor right;
+  bool left_workspace_owned = false;
+  bool right_workspace_owned = false;
+};
+
+void release_inputs(
+    const ContractOp& op,
+    const ExecutedOp& executed,
+    size_t ninputs,
+    std::vector<Tensor>& temps,
+    Workspace& workspace) {
+  if (executed.left_workspace_owned) {
+    workspace.release(executed.left);
+  }
+  if (executed.right_workspace_owned) {
+    workspace.release(executed.right);
+  }
+  if (op.left_id >= static_cast<int64_t>(ninputs)) {
+    workspace.release(temps.at(static_cast<size_t>(op.left_id)));
+    temps.at(static_cast<size_t>(op.left_id)) = Tensor();
+  }
+  if (op.right_id >= static_cast<int64_t>(ninputs)) {
+    workspace.release(temps.at(static_cast<size_t>(op.right_id)));
+    temps.at(static_cast<size_t>(op.right_id)) = Tensor();
+  }
+}
+
+ExecutedOp execute_single_op(
+    const ContractOp& op,
+    std::vector<Tensor>& temps,
+    Workspace& workspace) {
+  Tensor left = temps.at(static_cast<size_t>(op.left_id));
+  Tensor right = temps.at(static_cast<size_t>(op.right_id));
+  if (!left.defined() || !right.defined()) {
+    throw std::runtime_error("Contraction plan referenced a released tensor.");
+  }
+
+  auto prepared_left = apply_preparation(workspace, left, op.left_perm, op.left_shape);
+  auto prepared_right = apply_preparation(workspace, right, op.right_perm, op.right_shape);
+  left = prepared_left.tensor;
+  right = prepared_right.tensor;
+
+  Tensor out;
+  if (op.kind == 0) {
+    auto out_shape = matmul_output_shape(left, right);
+    out = workspace.acquire(out_shape, left.scalar_type(), left.device());
+    if (left.dim() == 2 && right.dim() == 2) {
+      if (!direct_cblas_mm(left, right, out)) {
+        at::_ops::mm_out::call(left, right, out);
+      }
+    } else if (left.dim() == 3 && right.dim() == 3) {
+      at::_ops::bmm_out::call(left, right, out);
+    } else {
+      out = torch::matmul(left, right);
+    }
+  } else if (op.kind == 1) {
+    auto out_shape = multiply_output_shape(left, right);
+    out = workspace.acquire(out_shape, left.scalar_type(), left.device());
+    at::_ops::mul_out::call(left, right, out);
+  } else {
+    throw std::runtime_error("Unknown contraction op kind.");
+  }
+
+  out = apply_output_preparation(out, op.out_shape, op.out_perm);
+  temps.at(static_cast<size_t>(op.out_id)) = out;
+  return ExecutedOp{left, right, prepared_left.workspace_owned, prepared_right.workspace_owned};
+}
+
+struct GemmBatchKey {
+  c10::ScalarType dtype;
+  int64_t m;
+  int64_t k;
+  int64_t n;
+
+  bool operator==(const GemmBatchKey& other) const {
+    return dtype == other.dtype && m == other.m && k == other.k && n == other.n;
+  }
+};
+
+struct GemmBatchKeyHash {
+  size_t operator()(const GemmBatchKey& key) const {
+    size_t h = static_cast<size_t>(key.m);
+    h ^= static_cast<size_t>(key.k) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+    h ^= static_cast<size_t>(key.n) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+    h ^= static_cast<size_t>(key.dtype) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+    return h;
+  }
+};
+
+bool static_small_2d_gemm_key(
+    const ContractOp& op,
+    int64_t max_flops,
+    GemmBatchKey& key) {
+  if (op.kind != 0 || op.left_shape.size() != 2 || op.right_shape.size() != 2) {
+    return false;
+  }
+  const int64_t m = op.left_shape[0];
+  const int64_t k = op.left_shape[1];
+  const int64_t n = op.right_shape[1];
+  if (op.right_shape[0] != k || m * k * n > max_flops) {
+    return false;
+  }
+  key = GemmBatchKey{c10::ScalarType::Undefined, m, k, n};
+  return true;
+}
+
+#ifdef QIBOTN_USE_MKL
+bool execute_mkl_batch(
+    const std::vector<int>& op_indices,
+    const std::vector<ContractOp>& ops,
+    std::vector<Tensor>& temps,
+    Workspace& workspace,
+    std::vector<ExecutedOp>& executed) {
+  if (op_indices.empty()) {
+    return false;
+  }
+
+  const auto& first_op = ops[op_indices[0]];
+  std::vector<Tensor> lefts;
+  std::vector<Tensor> rights;
+  std::vector<Tensor> outs;
+  lefts.reserve(op_indices.size());
+  rights.reserve(op_indices.size());
+  outs.reserve(op_indices.size());
+  executed.clear();
+  executed.reserve(op_indices.size());
+
+  c10::ScalarType dtype = c10::ScalarType::Undefined;
+  int m = 0;
+  int k = 0;
+  int n = 0;
+  for (const int op_index : op_indices) {
+    const auto& op = ops[op_index];
+    Tensor left = temps.at(static_cast<size_t>(op.left_id));
+    Tensor right = temps.at(static_cast<size_t>(op.right_id));
+    if (!left.defined() || !right.defined()) {
+      throw std::runtime_error("Batch contraction referenced a released tensor.");
+    }
+    auto prepared_left = apply_preparation(workspace, left, op.left_perm, op.left_shape);
+    auto prepared_right = apply_preparation(workspace, right, op.right_perm, op.right_shape);
+    left = prepared_left.tensor;
+    right = prepared_right.tensor;
+    if (left.dim() != 2 || right.dim() != 2 || !left.is_contiguous() ||
+        !right.is_contiguous() || left.device().type() != c10::DeviceType::CPU ||
+        right.device().type() != c10::DeviceType::CPU ||
+        left.scalar_type() != right.scalar_type()) {
+      return false;
+    }
+    if (dtype == c10::ScalarType::Undefined) {
+      dtype = left.scalar_type();
+      m = static_cast<int>(left.size(0));
+      k = static_cast<int>(left.size(1));
+      n = static_cast<int>(right.size(1));
+    }
+    if (left.scalar_type() != dtype || left.size(0) != m || left.size(1) != k ||
+        right.size(0) != k || right.size(1) != n) {
+      return false;
+    }
+    Tensor out = workspace.acquire({m, n}, dtype, left.device());
+    if (!out.is_contiguous()) {
+      return false;
+    }
+    lefts.push_back(left);
+    rights.push_back(right);
+    outs.push_back(out);
+    executed.push_back(
+        ExecutedOp{left, right, prepared_left.workspace_owned, prepared_right.workspace_owned});
+  }
+
+  if (dtype != c10::ScalarType::ComplexFloat && dtype != c10::ScalarType::ComplexDouble) {
+    return false;
+  }
+
+  const int group_count = 1;
+  const int group_size[] = {static_cast<int>(op_indices.size())};
+  const CBLAS_TRANSPOSE transa[] = {CblasNoTrans};
+  const CBLAS_TRANSPOSE transb[] = {CblasNoTrans};
+  const int m_array[] = {m};
+  const int n_array[] = {n};
+  const int k_array[] = {k};
+  const int lda[] = {k};
+  const int ldb[] = {n};
+  const int ldc[] = {n};
+  std::vector<const void*> a_array(op_indices.size());
+  std::vector<const void*> b_array(op_indices.size());
+  std::vector<void*> c_array(op_indices.size());
+  for (size_t i = 0; i < op_indices.size(); ++i) {
+    a_array[i] = lefts[i].data_ptr();
+    b_array[i] = rights[i].data_ptr();
+    c_array[i] = outs[i].data_ptr();
+  }
+
+  const int mkl_threads = env_int("QIBOTN_CPP_BATCH_MKL_THREADS", 0);
+  const int previous_threads =
+      mkl_threads > 0 ? mkl_set_num_threads_local(mkl_threads) : 0;
+  if (dtype == c10::ScalarType::ComplexFloat) {
+    const std::complex<float> alpha(1.0f, 0.0f);
+    const std::complex<float> beta(0.0f, 0.0f);
+    const void* alpha_array[] = {&alpha};
+    const void* beta_array[] = {&beta};
+    cblas_cgemm_batch(
+        CblasRowMajor,
+        transa,
+        transb,
+        m_array,
+        n_array,
+        k_array,
+        alpha_array,
+        a_array.data(),
+        lda,
+        b_array.data(),
+        ldb,
+        beta_array,
+        c_array.data(),
+        ldc,
+        group_count,
+        group_size);
+  } else {
+    const std::complex<double> alpha(1.0, 0.0);
+    const std::complex<double> beta(0.0, 0.0);
+    const void* alpha_array[] = {&alpha};
+    const void* beta_array[] = {&beta};
+    cblas_zgemm_batch(
+        CblasRowMajor,
+        transa,
+        transb,
+        m_array,
+        n_array,
+        k_array,
+        alpha_array,
+        a_array.data(),
+        lda,
+        b_array.data(),
+        ldb,
+        beta_array,
+        c_array.data(),
+        ldc,
+        group_count,
+        group_size);
+  }
+  if (mkl_threads > 0) {
+    mkl_set_num_threads_local(previous_threads);
+  }
+
+  for (size_t i = 0; i < op_indices.size(); ++i) {
+    const auto& op = ops[op_indices[i]];
+    Tensor out = apply_output_preparation(outs[i], op.out_shape, op.out_perm);
+    temps.at(static_cast<size_t>(op.out_id)) = out;
+  }
+  return true;
+}
+#endif
+
+std::vector<int64_t> list_to_i64_vector(const pybind11::handle& object) {
+  std::vector<int64_t> values;
+  for (const auto item : object) {
+    values.push_back(pybind11::cast<int64_t>(item));
+  }
+  return values;
+}
+
+ContractOp parse_op(const pybind11::handle& object) {
+  auto tuple = pybind11::cast<pybind11::tuple>(object);
+  if (tuple.size() != 10) {
+    throw std::runtime_error("Compiled contraction op must have 10 fields.");
+  }
+
+  ContractOp op;
+  op.out_id = pybind11::cast<int64_t>(tuple[0]);
+  op.left_id = pybind11::cast<int64_t>(tuple[1]);
+  op.right_id = pybind11::cast<int64_t>(tuple[2]);
+  op.kind = pybind11::cast<int64_t>(tuple[3]);
+  op.left_perm = list_to_i64_vector(tuple[4]);
+  op.left_shape = list_to_i64_vector(tuple[5]);
+  op.right_perm = list_to_i64_vector(tuple[6]);
+  op.right_shape = list_to_i64_vector(tuple[7]);
+  op.out_shape = list_to_i64_vector(tuple[8]);
+  op.out_perm = list_to_i64_vector(tuple[9]);
+  return op;
+}
+
+std::vector<ContractOp> parse_plan(const pybind11::list& plan) {
+  std::vector<ContractOp> ops;
+  ops.reserve(plan.size());
+  for (const auto object : plan) {
+    ops.push_back(parse_op(object));
+  }
+  return ops;
+}
+
+Tensor run_contract(
+    const std::vector<Tensor>& arrays,
+    const std::vector<ContractOp>& ops,
+    int64_t ntemps,
+    int64_t root_id,
+    Workspace& workspace) {
+  c10::InferenceMode inference_guard(true);
+  std::vector<Tensor> temps(static_cast<size_t>(ntemps));
+
+  if (arrays.size() > temps.size()) {
+    throw std::runtime_error("Temporary tensor table is smaller than input arrays.");
+  }
+  for (size_t i = 0; i < arrays.size(); ++i) {
+    temps[i] = arrays[i];
+  }
+
+  for (const auto& op : ops) {
+    Tensor left = temps.at(static_cast<size_t>(op.left_id));
+    Tensor right = temps.at(static_cast<size_t>(op.right_id));
+    if (!left.defined() || !right.defined()) {
+      throw std::runtime_error("Contraction plan referenced a released tensor.");
+    }
+
+    auto prepared_left = apply_preparation(workspace, left, op.left_perm, op.left_shape);
+    auto prepared_right = apply_preparation(workspace, right, op.right_perm, op.right_shape);
+    left = prepared_left.tensor;
+    right = prepared_right.tensor;
+
+    Tensor out;
+    if (op.kind == 0) {
+      auto out_shape = matmul_output_shape(left, right);
+      out = workspace.acquire(out_shape, left.scalar_type(), left.device());
+      if (left.dim() == 2 && right.dim() == 2) {
+        if (!direct_cblas_mm(left, right, out)) {
+          at::_ops::mm_out::call(left, right, out);
+        }
+      } else if (left.dim() == 3 && right.dim() == 3) {
+        at::_ops::bmm_out::call(left, right, out);
+      } else {
+        out = torch::matmul(left, right);
+      }
+    } else if (op.kind == 1) {
+      auto out_shape = multiply_output_shape(left, right);
+      out = workspace.acquire(out_shape, left.scalar_type(), left.device());
+      at::_ops::mul_out::call(left, right, out);
+    } else {
+      throw std::runtime_error("Unknown contraction op kind.");
+    }
+
+    out = apply_output_preparation(out, op.out_shape, op.out_perm);
+    temps.at(static_cast<size_t>(op.out_id)) = out;
+
+    if (prepared_left.workspace_owned) {
+      workspace.release(left);
+    }
+    if (prepared_right.workspace_owned) {
+      workspace.release(right);
+    }
+    if (op.left_id >= static_cast<int64_t>(arrays.size())) {
+      workspace.release(temps.at(static_cast<size_t>(op.left_id)));
+      temps.at(static_cast<size_t>(op.left_id)) = Tensor();
+    }
+    if (op.right_id >= static_cast<int64_t>(arrays.size())) {
+      workspace.release(temps.at(static_cast<size_t>(op.right_id)));
+      temps.at(static_cast<size_t>(op.right_id)) = Tensor();
+    }
+  }
+
+  Tensor result = temps.at(static_cast<size_t>(root_id));
+  if (!result.defined()) {
+    throw std::runtime_error("Contraction plan did not produce a result tensor.");
+  }
+  return result;
+}
+
+Tensor run_contract_dag(
+    const std::vector<Tensor>& arrays,
+    const std::vector<ContractOp>& ops,
+    int64_t ntemps,
+    int64_t root_id,
+    Workspace& workspace) {
+  c10::InferenceMode inference_guard(true);
+  std::vector<Tensor> temps(static_cast<size_t>(ntemps));
+  if (arrays.size() > temps.size()) {
+    throw std::runtime_error("Temporary tensor table is smaller than input arrays.");
+  }
+  for (size_t i = 0; i < arrays.size(); ++i) {
+    temps[i] = arrays[i];
+  }
+
+  std::unordered_map<int64_t, int> producer;
+  producer.reserve(ops.size());
+  for (size_t i = 0; i < ops.size(); ++i) {
+    producer[ops[i].out_id] = static_cast<int>(i);
+  }
+
+  std::vector<std::vector<int>> dependents(ops.size());
+  std::vector<int> pending_inputs(ops.size(), 0);
+  std::vector<int> remaining_consumers(static_cast<size_t>(ntemps), 0);
+  for (size_t i = 0; i < ops.size(); ++i) {
+    for (const auto tensor_id : {ops[i].left_id, ops[i].right_id}) {
+      if (tensor_id >= 0 && tensor_id < ntemps) {
+        remaining_consumers[static_cast<size_t>(tensor_id)] += 1;
+      }
+      auto it = producer.find(tensor_id);
+      if (it != producer.end()) {
+        pending_inputs[i] += 1;
+        dependents[it->second].push_back(static_cast<int>(i));
+      }
+    }
+  }
+
+  std::vector<int> ready;
+  ready.reserve(ops.size());
+  for (size_t i = 0; i < ops.size(); ++i) {
+    if (pending_inputs[i] == 0) {
+      ready.push_back(static_cast<int>(i));
+    }
+  }
+
+  const bool batch_enabled = env_enabled("QIBOTN_CPP_BATCH_SMALL_GEMM");
+  const int min_batch = env_int("QIBOTN_CPP_BATCH_MIN_SIZE", 4);
+  const int64_t max_flops = env_i64("QIBOTN_CPP_BATCH_MAX_FLOPS", 1000000);
+  std::vector<char> done(ops.size(), 0);
+  size_t completed = 0;
+
+  auto mark_consumed_and_release = [&](const ContractOp& op, const ExecutedOp& executed) {
+    if (executed.left_workspace_owned) {
+      workspace.release(executed.left);
+    }
+    if (executed.right_workspace_owned) {
+      workspace.release(executed.right);
+    }
+    for (const auto tensor_id : {op.left_id, op.right_id}) {
+      if (tensor_id < static_cast<int64_t>(arrays.size()) || tensor_id < 0) {
+        continue;
+      }
+      auto& count = remaining_consumers[static_cast<size_t>(tensor_id)];
+      count -= 1;
+      if (count == 0) {
+        workspace.release(temps.at(static_cast<size_t>(tensor_id)));
+        temps.at(static_cast<size_t>(tensor_id)) = Tensor();
+      }
+    }
+  };
+
+  while (completed < ops.size()) {
+    if (ready.empty()) {
+      throw std::runtime_error("DAG contractor has no ready ops before completion.");
+    }
+
+    std::vector<int> to_execute;
+#ifdef QIBOTN_USE_MKL
+    if (batch_enabled) {
+      std::unordered_map<GemmBatchKey, std::vector<int>, GemmBatchKeyHash> groups;
+      for (const int op_index : ready) {
+        if (done[op_index]) {
+          continue;
+        }
+        GemmBatchKey key;
+        if (!static_small_2d_gemm_key(ops[op_index], max_flops, key)) {
+          continue;
+        }
+        Tensor left = temps.at(static_cast<size_t>(ops[op_index].left_id));
+        if (!left.defined()) {
+          continue;
+        }
+        key.dtype = left.scalar_type();
+        groups[key].push_back(op_index);
+      }
+      for (const auto& item : groups) {
+        if (static_cast<int>(item.second.size()) >= min_batch) {
+          to_execute = item.second;
+          break;
+        }
+      }
+    }
+#endif
+
+    if (to_execute.empty()) {
+      to_execute.push_back(ready.back());
+    }
+
+    std::vector<ExecutedOp> executed_batch;
+    bool batch_ok = false;
+#ifdef QIBOTN_USE_MKL
+    if (to_execute.size() > 1) {
+      batch_ok = execute_mkl_batch(to_execute, ops, temps, workspace, executed_batch);
+    }
+#endif
+    if (!batch_ok) {
+      if (to_execute.size() > 1) {
+        to_execute.resize(1);
+      }
+      executed_batch.clear();
+      executed_batch.push_back(execute_single_op(ops[to_execute[0]], temps, workspace));
+    }
+
+    for (size_t j = 0; j < to_execute.size(); ++j) {
+      const int op_index = to_execute[j];
+      if (done[op_index]) {
+        continue;
+      }
+      done[op_index] = 1;
+      completed += 1;
+      mark_consumed_and_release(ops[op_index], executed_batch[j]);
+      for (const int dep : dependents[op_index]) {
+        pending_inputs[dep] -= 1;
+        if (pending_inputs[dep] == 0) {
+          ready.push_back(dep);
+        }
+      }
+    }
+
+    ready.erase(
+        std::remove_if(
+            ready.begin(),
+            ready.end(),
+            [&](int op_index) { return done[op_index] != 0; }),
+        ready.end());
+  }
+
+  Tensor result = temps.at(static_cast<size_t>(root_id));
+  if (!result.defined()) {
+    throw std::runtime_error("DAG contractor did not produce a result tensor.");
+  }
+  return result;
+}
+
+Tensor contract_impl(
+    const std::vector<Tensor>& arrays,
+    const pybind11::list& plan,
+    int64_t ntemps,
+    int64_t root_id) {
+  auto ops = parse_plan(plan);
+  Workspace workspace;
+  return run_contract(arrays, ops, ntemps, root_id, workspace);
+}
+
+class Contractor {
+ public:
+  Contractor(const pybind11::list& plan, int64_t ntemps, int64_t root_id)
+      : ops_(parse_plan(plan)), ntemps_(ntemps), root_id_(root_id) {
+    compute_batch_summary();
+  }
+
+  Tensor contract(const std::vector<Tensor>& arrays) {
+    if (env_enabled("QIBOTN_CPP_DAG_SCHEDULER")) {
+      return run_contract_dag(arrays, ops_, ntemps_, root_id_, workspace_);
+    }
+    return run_contract(arrays, ops_, ntemps_, root_id_, workspace_);
+  }
+
+  int64_t nsteps() const {
+    return static_cast<int64_t>(ops_.size());
+  }
+
+  pybind11::dict batch_summary() const {
+    pybind11::dict out;
+    out["groups"] = batch_groups_;
+    out["covered_ops"] = batch_covered_ops_;
+    out["calls_saved"] = batch_calls_saved_;
+    return out;
+  }
+
+ private:
+  void compute_batch_summary() {
+    const int min_batch = env_int("QIBOTN_CPP_BATCH_MIN_SIZE", 4);
+    const int64_t max_flops = env_i64("QIBOTN_CPP_BATCH_MAX_FLOPS", 1000000);
+    std::unordered_map<int64_t, int> producer;
+    for (size_t i = 0; i < ops_.size(); ++i) {
+      producer[ops_[i].out_id] = static_cast<int>(i);
+    }
+    std::vector<int> pending_inputs(ops_.size(), 0);
+    std::vector<std::vector<int>> dependents(ops_.size());
+    for (size_t i = 0; i < ops_.size(); ++i) {
+      for (const auto tensor_id : {ops_[i].left_id, ops_[i].right_id}) {
+        auto it = producer.find(tensor_id);
+        if (it != producer.end()) {
+          pending_inputs[i] += 1;
+          dependents[it->second].push_back(static_cast<int>(i));
+        }
+      }
+    }
+    std::vector<int> ready;
+    for (size_t i = 0; i < ops_.size(); ++i) {
+      if (pending_inputs[i] == 0) {
+        ready.push_back(static_cast<int>(i));
+      }
+    }
+    std::vector<char> done(ops_.size(), 0);
+    size_t completed = 0;
+    while (completed < ops_.size() && !ready.empty()) {
+      std::unordered_map<GemmBatchKey, int, GemmBatchKeyHash> counts;
+      int best_count = 0;
+      for (const int op_index : ready) {
+        GemmBatchKey key;
+        if (!static_small_2d_gemm_key(ops_[op_index], max_flops, key)) {
+          continue;
+        }
+        key.dtype = c10::ScalarType::ComplexFloat;
+        int count = ++counts[key];
+        best_count = std::max(best_count, count);
+      }
+      if (best_count >= min_batch) {
+        batch_groups_ += 1;
+        batch_covered_ops_ += best_count;
+        batch_calls_saved_ += best_count - 1;
+      }
+      // Simulate one scheduler step: batch best group if present, otherwise one op.
+      std::vector<int> executed;
+      if (best_count >= min_batch) {
+        GemmBatchKey best_key;
+        for (const auto& item : counts) {
+          if (item.second == best_count) {
+            best_key = item.first;
+            break;
+          }
+        }
+        for (const int op_index : ready) {
+          GemmBatchKey key;
+          if (static_small_2d_gemm_key(ops_[op_index], max_flops, key)) {
+            key.dtype = c10::ScalarType::ComplexFloat;
+            if (key == best_key) {
+              executed.push_back(op_index);
+            }
+          }
+        }
+      } else {
+        executed.push_back(ready.back());
+      }
+      for (const int op_index : executed) {
+        if (done[op_index]) {
+          continue;
+        }
+        done[op_index] = 1;
+        completed += 1;
+        for (const int dep : dependents[op_index]) {
+          pending_inputs[dep] -= 1;
+          if (pending_inputs[dep] == 0) {
+            ready.push_back(dep);
+          }
+        }
+      }
+      ready.erase(
+          std::remove_if(
+              ready.begin(),
+              ready.end(),
+              [&](int op_index) { return done[op_index] != 0; }),
+          ready.end());
+    }
+  }
+
+  std::vector<ContractOp> ops_;
+  int64_t ntemps_;
+  int64_t root_id_;
+  Workspace workspace_;
+  int64_t batch_groups_ = 0;
+  int64_t batch_covered_ops_ = 0;
+  int64_t batch_calls_saved_ = 0;
+};
+
+}  // namespace
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def(
+      "contract",
+      &contract_impl,
+      "Run a compiled cotengra contraction plan with torch CPU tensors.");
+  pybind11::class_<Contractor>(m, "Contractor")
+      .def(pybind11::init<const pybind11::list&, int64_t, int64_t>())
+      .def("contract", &Contractor::contract)
+      .def("batch_summary", &Contractor::batch_summary)
+      .def_property_readonly("nsteps", &Contractor::nsteps);
+}
diff --git a/src/qibotn/eval.py b/src/qibotn/eval.py
index ae36893..f2fbf71 100644
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -1,8 +1,3 @@
-import cupy as cp
-import cuquantum.bindings.cutensornet as cutn
-from cupy.cuda import nccl
-from cupy.cuda.runtime import getDeviceCount
-from cuquantum.tensornet import Network, contract
 from mpi4py import MPI
 
 from qibotn.circuit_convertor import QiboCircuitToEinsum
@@ -15,8 +10,37 @@ from qibotn.observables import (
     extract_gates_and_qubits,
 )
 
+try:
+    import cupy as cp
+    import cuquantum.bindings.cutensornet as cutn
+    from cupy.cuda import nccl
+    from cupy.cuda.runtime import getDeviceCount
+    from cuquantum.tensornet import Network, contract
+except ImportError:  # pragma: no cover - exercised on CPU-only installations
+    cp = None
+    cutn = None
+    nccl = None
+    getDeviceCount = None
+    Network = None
+    contract = None
 
-def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
+
+def _require_cuquantum():
+    if (
+        cp is None
+        or cutn is None
+        or nccl is None
+        or getDeviceCount is None
+        or Network is None
+        or contract is None
+    ):
+        raise ImportError(
+            "The legacy GPU evaluation helpers require cupy and cuquantum. "
+            "Install the GPU dependencies or use the CPU backend."
+        )
+
+
+def get_ham_gates(pauli_map, dtype="complex128", backend=None):
     """Populate the gates for all pauli operators.
 
     Parameters:
@@ -27,6 +51,13 @@ def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
     Returns:
         A sequence of pauli gates.
     """
+    if backend is None:
+        backend = cp
+    if backend is None:
+        raise ImportError(
+            "get_ham_gates requires an array backend; cupy is unavailable "
+            "in this CPU-only environment."
+        )
     asarray = backend.asarray
     pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype)
     pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype)
@@ -46,6 +77,7 @@ def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
 
 def initialize_mpi():
     """Initialize MPI communication and device selection."""
+    _require_cuquantum()
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     size = comm.Get_size()
@@ -56,6 +88,7 @@ def initialize_mpi():
 
 def initialize_nccl(comm_mpi, rank, size):
     """Initialize NCCL communication."""
+    _require_cuquantum()
     nccl_id = nccl.get_unique_id() if rank == 0 else None
     nccl_id = comm_mpi.bcast(nccl_id, root=0)
     return nccl.NcclCommunicator(size, nccl_id, rank)
@@ -73,6 +106,7 @@ def get_operands(qibo_circ, datatype, rank, comm):
 
 def compute_optimal_path(network, n_samples, size, comm):
     """Compute contraction path and broadcast optimal selection."""
+    _require_cuquantum()
     path, info = network.contract_path(
         optimize={
             "samples": n_samples,
@@ -101,6 +135,8 @@ def compute_slices(info, rank, size):
 
 def reduce_result(result, comm, method="MPI", root=0):
     """Reduce results across processes."""
+    if method == "NCCL":
+        _require_cuquantum()
     if method == "MPI":
         return comm.reduce(sendobj=result, op=MPI.SUM, root=root)
 
@@ -148,6 +184,7 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
     Returns:
         Dense vector of quantum circuit.
     """
+    _require_cuquantum()
     comm, rank, size, device_id = initialize_mpi()
     operands = get_operands(qibo_circ, datatype, rank, comm)
     network = Network(*operands, options={"device_id": device_id})
@@ -179,6 +216,7 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
     Returns:
         Dense vector of quantum circuit.
     """
+    _require_cuquantum()
     comm_mpi, rank, size, device_id = initialize_mpi()
     comm_nccl = initialize_nccl(comm_mpi, rank, size)
     operands = get_operands(qibo_circ, datatype, rank, comm_mpi)
@@ -203,6 +241,7 @@ def dense_vector_tn(qibo_circ, datatype):
     Returns:
         Dense vector of quantum circuit.
     """
+    _require_cuquantum()
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
     return contract(*myconvertor.state_vector_operands())
 
@@ -231,6 +270,7 @@ def expectation_tn_nccl(qibo_circ, datatype, observable, n_samples=8):
         Expectation of quantum circuit due to pauli string.
     """
 
+    _require_cuquantum()
     comm_mpi, rank, size, device_id = initialize_mpi()
 
     comm_nccl = initialize_nccl(comm_mpi, rank, size)
@@ -299,6 +339,7 @@ def expectation_tn_MPI(qibo_circ, datatype, observable, n_samples=8):
     Returns:
         Expectation of quantum circuit due to pauli string.
     """
+    _require_cuquantum()
     # Initialize MPI and device
     comm, rank, size, device_id = initialize_mpi()
 
@@ -358,6 +399,7 @@ def expectation_tn(qibo_circ, datatype, observable):
     Returns:
         Expectation of quantum circuit due to pauli string.
     """
+    _require_cuquantum()
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
 
     observable = check_observable(observable, qibo_circ.nqubits)
@@ -383,6 +425,7 @@ def dense_vector_mps(qibo_circ, gate_algo, datatype):
     Returns:
         Dense vector of quantum circuit.
     """
+    _require_cuquantum()
     myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype)
     mps_helper = MPSContractionHelper(myconvertor.num_qubits)
 
diff --git a/src/qibotn/expectation_runner.py b/src/qibotn/expectation_runner.py
index 22c2f55..6af6de8 100644
--- a/src/qibotn/expectation_runner.py
+++ b/src/qibotn/expectation_runner.py
@@ -16,9 +16,11 @@ from qibotn.observables import check_observable
 class ExpectationConfig:
     ansatz: str = "tn"
     mpi: bool = False
-    bond: int = 1024
-    cut_ratio: float = 1e-12
+    bond: int | None = 1024
+    cut_ratio: float | None = 1e-12
     tensor_module: str = "torch"
+    quimb_backend: str = "torch"
+    dtype: str = "complex128"
     torch_threads: int = 8
     parallel_opts: dict | None = None
 
@@ -28,6 +30,7 @@ class ExpectationResult:
     value: float
     seconds: float
     rank: int = 0
+    parallel_stats: list | None = None
 
 
 def exact_for_observable(circuit, observable, nqubits):
@@ -54,6 +57,8 @@ def run_cpu_expectation(circuit, observable, config):
         "max_bond_dimension": config.bond,
         "cut_ratio": config.cut_ratio,
         "tensor_module": config.tensor_module,
+        "quimb_backend": config.quimb_backend,
+        "dtype": config.dtype,
         "torch_threads": config.torch_threads,
         "parallel_opts": config.parallel_opts or {},
     }
@@ -68,4 +73,10 @@ def run_cpu_expectation(circuit, observable, config):
     elapsed = time.perf_counter() - start
 
     rank = getattr(backend, "rank", 0)
-    return ExpectationResult(float(np.real(value)), elapsed, rank=rank)
+    stats = getattr(backend, "parallel_stats", None)
+    return ExpectationResult(
+        float(np.real(value)),
+        elapsed,
+        rank=rank,
+        parallel_stats=list(stats) if stats is not None else None,
+    )
diff --git a/src/qibotn/mps_contraction_helper.py b/src/qibotn/mps_contraction_helper.py
index fcb4441..b44cfb2 100644
--- a/src/qibotn/mps_contraction_helper.py
+++ b/src/qibotn/mps_contraction_helper.py
@@ -1,4 +1,16 @@
-from cuquantum.tensornet import contract, contract_path
+try:
+    from cuquantum.tensornet import contract, contract_path
+except ImportError:  # pragma: no cover - exercised on CPU-only installations
+    contract = None
+    contract_path = None
+
+
+def _require_cuquantum():
+    if contract is None or contract_path is None:
+        raise ImportError(
+            "The cuQuantum MPS contraction helper requires cuquantum. "
+            "Install the GPU dependencies or use the CPU backend."
+        )
 
 # Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
 
@@ -113,6 +125,7 @@ class MPSContractionHelper:
         return self._contract(interleaved_inputs, options=options) / norm
 
     def _contract(self, interleaved_inputs, options=None):
+        _require_cuquantum()
         path = contract_path(*interleaved_inputs, options=options)[0]
 
         return contract(*interleaved_inputs, options=options, optimize={"path": path})
diff --git a/src/qibotn/mps_utils.py b/src/qibotn/mps_utils.py
index 1450837..ff3d010 100644
--- a/src/qibotn/mps_utils.py
+++ b/src/qibotn/mps_utils.py
@@ -1,6 +1,19 @@
-import cupy as cp
-from cuquantum.tensornet import contract
-from cuquantum.tensornet.experimental import contract_decompose
+try:
+    import cupy as cp
+    from cuquantum.tensornet import contract
+    from cuquantum.tensornet.experimental import contract_decompose
+except ImportError:  # pragma: no cover - exercised on CPU-only installations
+    cp = None
+    contract = None
+    contract_decompose = None
+
+
+def _require_cuquantum():
+    if cp is None or contract is None or contract_decompose is None:
+        raise ImportError(
+            "The cuQuantum MPS helpers require cupy and cuquantum. "
+            "Install the GPU dependencies or use the CPU backend."
+        )
 
 
 def initial(num_qubits, dtype):
@@ -13,6 +26,7 @@ def initial(num_qubits, dtype):
     Returns:
         The initial MPS tensors.
     """
+    _require_cuquantum()
     state_tensor = cp.asarray([1, 0], dtype=dtype).reshape(1, 2, 1)
     mps_tensors = [state_tensor] * num_qubits
     return mps_tensors
@@ -28,6 +42,7 @@ def mps_site_right_swap(mps_tensors, i, **kwargs):
     Returns:
         The updated MPS tensors.
     """
+    _require_cuquantum()
     # contraction followed by QR decomposition
     a, _, b = contract_decompose(
         "ipj,jqk->iqj,jpk",
@@ -60,6 +75,7 @@ def apply_gate(mps_tensors, gate, qubits, **kwargs):
         The updated MPS tensors.
     """
 
+    _require_cuquantum()
     n_qubits = len(qubits)
     if n_qubits == 1:
         # single-qubit gate
diff --git a/src/qibotn/parallel.py b/src/qibotn/parallel.py
index 9445c97..2603b4f 100644
--- a/src/qibotn/parallel.py
+++ b/src/qibotn/parallel.py
@@ -17,6 +17,196 @@ except ImportError:
 
 
 SEARCH_METHODS = ("greedy", "kahypar", "kahypar-agglom", "spinglass")
+_COTENGRA_DASK_PATCHED = False
+_COTENGRA_DASK_SUBMIT_PATCHED = False
+_DASK_TRIAL_DEBUG = False
+
+
+def _optimizer_search_stats(opt):
+    scores = list(getattr(opt, "scores", ()))
+    finite_scores = [score for score in scores if np.isfinite(score)]
+    times = list(getattr(opt, "times", ()))
+    best = getattr(opt, "best", {}) or {}
+    return {
+        "completed_trials": len(scores),
+        "finite_trials": len(finite_scores),
+        "failed_trials": len(scores) - len(finite_scores),
+        "requested_trials": int(getattr(opt, "max_repeats", 0) or 0),
+        "trial_seconds_sum": float(sum(times)),
+        "best_score": float(best.get("score", float("inf"))),
+        "best_flops": float(best.get("flops", float("inf"))),
+        "best_write": float(best.get("write", float("inf"))),
+        "best_size": float(best.get("size", float("inf"))),
+    }
+
+
+def _attach_search_stats(tree, opt):
+    try:
+        tree.qibotn_search_stats = _optimizer_search_stats(opt)
+    except Exception:
+        pass
+    return tree
+
+
+def _dask_worker_slots(client):
+    info = client.scheduler_info(n_workers=-1)
+    workers = info.get("workers", {})
+    return workers, sum(int(w.get("nthreads", 1) or 1) for w in workers.values())
+
+
+def _print_dask_worker_summary(client):
+    workers, slots = _dask_worker_slots(client)
+    by_host = {}
+    for worker in workers.values():
+        host = worker.get("host", "unknown")
+        by_host.setdefault(host, {"workers": 0, "threads": 0})
+        by_host[host]["workers"] += 1
+        by_host[host]["threads"] += int(worker.get("nthreads", 1) or 1)
+    print(
+        "qibotn_dask_workers "
+        f"workers={len(workers)} threads={slots} by_host={by_host}",
+        flush=True,
+    )
+
+
+def _run_trial_with_debug(fn, *args, **kwargs):
+    import os
+    import socket
+
+    try:
+        from distributed import get_worker
+
+        worker = get_worker()
+        worker_address = worker.address
+    except Exception:
+        worker_address = "unknown"
+
+    method = kwargs.get("method", "unknown")
+    pid = os.getpid()
+    host = socket.gethostname()
+    print(
+        "qibotn_trial_start "
+        f"worker={worker_address} host={host} pid={pid} method={method}",
+        flush=True,
+    )
+    start = time.perf_counter()
+    try:
+        trial = fn(*args, **kwargs)
+    except Exception as exc:
+        elapsed = time.perf_counter() - start
+        print(
+            "qibotn_trial_error "
+            f"worker={worker_address} host={host} pid={pid} "
+            f"method={method} seconds={elapsed:.3f} error={exc!r}",
+            flush=True,
+        )
+        raise
+    elapsed = time.perf_counter() - start
+    print(
+        "qibotn_trial_done "
+        f"worker={worker_address} host={host} pid={pid} method={method} "
+        f"seconds={elapsed:.3f} score={trial.get('score', float('nan')):.6g} "
+        f"flops={trial.get('flops', float('nan')):.6g} "
+        f"size={trial.get('size', float('nan')):.6g}",
+        flush=True,
+    )
+    return trial
+
+
+def _patch_cotengra_dask_submit(debug_trials=False):
+    global _COTENGRA_DASK_SUBMIT_PATCHED, _DASK_TRIAL_DEBUG
+    _DASK_TRIAL_DEBUG = bool(debug_trials)
+    if _COTENGRA_DASK_SUBMIT_PATCHED:
+        return
+
+    import cotengra.parallel as ctg_parallel
+    import cotengra.hyperoptimizers.hyper as hyper
+
+    original_submit = ctg_parallel.submit
+
+    def submit(pool, fn, *args, **kwargs):
+        backend = pool.__class__.__module__.split(".", 1)[0]
+        if _DASK_TRIAL_DEBUG and backend == "distributed":
+            return original_submit(
+                pool,
+                _run_trial_with_debug,
+                fn,
+                *args,
+                **kwargs,
+            )
+        return original_submit(pool, fn, *args, **kwargs)
+
+    ctg_parallel.submit = submit
+    hyper.submit = submit
+    _COTENGRA_DASK_SUBMIT_PATCHED = True
+
+
+def _patch_cotengra_dask_as_completed():
+    """Make cotengra 0.7.5 handle distributed.Future objects.
+
+    This cotengra release routes all parallel futures through
+    ``concurrent.futures.as_completed()``, which does not accept dask
+    ``distributed.Future`` instances. Keep cotengra's optimizer/reporting logic
+    intact and only swap the wait primitive when the futures are from dask.
+    """
+    global _COTENGRA_DASK_PATCHED
+    if _COTENGRA_DASK_PATCHED:
+        return
+
+    from cotengra.hyperoptimizers.hyper import HyperOptimizer
+
+    def _get_and_report_next_future(self):
+        futures_map = {future: setting for setting, future in self._futures}
+        if not futures_map:
+            return {
+                "score": float("inf"),
+                "flops": float("inf"),
+                "write": float("inf"),
+                "size": float("inf"),
+                "time": 0.0,
+            }
+
+        future0 = next(iter(futures_map))
+        if future0.__class__.__module__.split(".", 1)[0] == "distributed":
+            from distributed import as_completed
+
+            deadline = getattr(self, "_qibotn_deadline", None)
+            timeout = None if deadline is None else max(0.0, deadline - time.time())
+            try:
+                future = next(iter(as_completed(futures_map, timeout=timeout)))
+            except TimeoutError:
+                for future in futures_map:
+                    future.cancel()
+                self._futures = []
+                return {
+                    "score": float("inf"),
+                    "flops": float("inf"),
+                    "write": float("inf"),
+                    "size": float("inf"),
+                    "time": 0.0,
+                }
+        else:
+            import concurrent.futures as _cf
+
+            future = next(_cf.as_completed(futures_map))
+
+        setting = futures_map[future]
+        self._futures = [(s, f) for s, f in self._futures if f is not future]
+        try:
+            trial = future.result()
+        except Exception:
+            trial = {
+                "score": float("inf"),
+                "flops": float("inf"),
+                "write": float("inf"),
+                "size": float("inf"),
+                "time": 0.0,
+            }
+        self._maybe_report_result(setting, trial)
+        return trial
+
+    HyperOptimizer._get_and_report_next_future = _get_and_report_next_future
+    _COTENGRA_DASK_PATCHED = True
 
 
 def _search_chunk(
@@ -46,7 +236,7 @@ def _search_chunk(
         **kwargs,
     )
     tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
-    return tree.combo_cost(factor=256), tree
+    return tree.combo_cost(factor=256), _attach_search_stats(tree, opt)
 
 
 def _run_single_trial(tn_bytes, output_inds, seed, slicing_opts):
@@ -164,6 +354,8 @@ def _dask_search(
     dask_address=None,
     n_workers=None,
     optlib=None,
+    debug_trials=False,
+    close_workers=False,
 ):
     """Run one centralized cotengra hyper-optimizer over a dask pool.
 
@@ -180,6 +372,9 @@ def _dask_search(
 
     import cotengra as ctg
 
+    _patch_cotengra_dask_as_completed()
+    _patch_cotengra_dask_submit(debug_trials=debug_trials)
+
     close_client = False
     close_cluster = False
     cluster = None
@@ -205,7 +400,20 @@ def _dask_search(
     if optlib is not None:
         kwargs["optlib"] = optlib
 
+    retire_workers = []
     try:
+        workers, worker_slots = _dask_worker_slots(client)
+        if close_workers:
+            retire_workers = list(workers)
+        if debug_trials:
+            _print_dask_worker_summary(client)
+        if total_repeats < worker_slots:
+            print(
+                "qibotn_dask_underutilized "
+                f"requested_trials={total_repeats} worker_slots={worker_slots} "
+                "hint='increase --tn-search-repeats to at least worker_slots'",
+                flush=True,
+            )
         opt = ctg.HyperOptimizer(
             methods=SEARCH_METHODS,
             max_repeats=total_repeats,
@@ -216,8 +424,31 @@ def _dask_search(
             progbar=False,
             **kwargs,
         )
-        return tn.contraction_tree(optimize=opt, output_inds=output_inds)
+        opt._num_workers = max(1, worker_slots)
+        opt.pre_dispatch = max(1, min(int(total_repeats), worker_slots))
+        if max_time is not None:
+            opt._qibotn_deadline = time.time() + max_time
+        tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
+        return _attach_search_stats(tree, opt)
     finally:
+        if close_workers and retire_workers:
+            try:
+                retired = client.retire_workers(
+                    workers=retire_workers,
+                    close_workers=True,
+                    remove=True,
+                )
+                print(
+                    "qibotn_dask_workers_closed "
+                    f"requested={len(retire_workers)} retired={len(retired)}",
+                    flush=True,
+                )
+            except Exception as exc:
+                print(
+                    "qibotn_dask_workers_close_failed "
+                    f"requested={len(retire_workers)} error={exc!r}",
+                    flush=True,
+                )
         if close_client:
             client.close()
         if close_cluster:
@@ -234,6 +465,8 @@ def _mpi_search(
     trial_timeout=None,
     search_backend="processpool",
     dask_address=None,
+    debug_trials=False,
+    dask_close_workers=False,
 ):
     comm = MPI.COMM_WORLD
     rank, size = comm.Get_rank(), comm.Get_size()
@@ -258,6 +491,8 @@ def _mpi_search(
                     slicing_opts=slicing_opts,
                     dask_address=dask_address,
                     n_workers=n_workers,
+                    debug_trials=debug_trials,
+                    close_workers=dask_close_workers,
                 )
                 payload = ("ok", tree)
             except Exception as exc:
@@ -296,7 +531,8 @@ def _mpi_search(
 def parallel_path_search(tn, output_inds, method='processpool', total_repeats=1024,
                          max_time=300, n_workers=48, slicing_opts=None,
                          trial_timeout=None, search_backend=None,
-                         dask_address=None):
+                         dask_address=None, debug_trials=False,
+                         dask_close_workers=False):
     """Parallel contraction path search.
 
     Args:
@@ -324,6 +560,8 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
             trial_timeout,
             search_backend=search_backend,
             dask_address=dask_address,
+            debug_trials=debug_trials,
+            dask_close_workers=dask_close_workers,
         )
     elif method == 'processpool':
         return _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts, trial_timeout)
@@ -336,6 +574,8 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
             slicing_opts=slicing_opts,
             dask_address=dask_address,
             n_workers=n_workers,
+            debug_trials=debug_trials,
+            close_workers=dask_close_workers,
         )
     else:
         raise ValueError(f"Unknown method: {method}")
@@ -431,17 +671,37 @@ def _to_numpy_vector(value, is_torch):
 
 
 def _zero_vector_like(arrays):
-    return np.zeros(1, dtype=np.complex128)
+    array = arrays[0]
+    if type(array).__module__.startswith("torch"):
+        return np.zeros(1, dtype=np.complex64 if "64" in str(array.dtype) else np.complex128)
+    return np.zeros(1, dtype=np.asarray(array).dtype)
 
 
-def contract_tree_slices(tree, arrays, slice_indices, backend=None):
+def contract_tree_slices(tree, arrays, slice_indices, backend=None, implementation=None):
     """Contract a subset of cotengra slices and return their local sum."""
     backend = backend or _array_backend(arrays)
     is_torch = backend == "torch"
     local = None
+    cpp_contract = None
+    if implementation == "cpp":
+        if backend != "torch":
+            raise ValueError("implementation='cpp' requires torch arrays.")
+        from qibotn.torch_contractor import contract_tree_cpp
+
+        cpp_contract = contract_tree_cpp
 
     for slice_id in slice_indices:
-        value = tree.contract_slice(arrays, slice_id, backend=backend)
+        if cpp_contract is not None:
+            value = cpp_contract(tree, tree.slice_arrays(arrays, slice_id))
+        elif implementation is None:
+            value = tree.contract_slice(arrays, slice_id, backend=backend)
+        else:
+            value = tree.contract_slice(
+                arrays,
+                slice_id,
+                backend=backend,
+                implementation=implementation,
+            )
         value = _to_numpy_vector(value, is_torch)
         local = value if local is None else local + value
 
@@ -455,6 +715,7 @@ def parallel_contract(
     comm=None,
     assignment="block",
     return_stats=False,
+    implementation=None,
 ):
     if method == 'mpi':
         if not _HAVE_MPI or comm is None:
@@ -465,11 +726,20 @@ def parallel_contract(
             comm,
             assignment=assignment,
             return_stats=return_stats,
+            implementation=implementation,
         )
     raise ValueError(f"Unknown method: {method}")
 
 
-def _contract_mpi(tree, arrays, comm, root=0, assignment="block", return_stats=False):
+def _contract_mpi(
+    tree,
+    arrays,
+    comm,
+    root=0,
+    assignment="block",
+    return_stats=False,
+    implementation=None,
+):
     rank, size = comm.Get_rank(), comm.Get_size()
     backend = _array_backend(arrays)
     is_torch = backend == "torch"
@@ -482,15 +752,14 @@ def _contract_mpi(tree, arrays, comm, root=0, assignment="block", return_stats=F
             "appear in the output."
         )
 
-    if nslices <= 1:
-        if rank != root:
-            return (None, stats) if return_stats else None
-        result = tree.contract(arrays, backend=backend)
-        result = _to_numpy_vector(result, is_torch)
-        return (result, stats) if return_stats else result
-
     plan = mpi_slice_plan(nslices, rank, size, assignment=assignment)
-    local = contract_tree_slices(tree, arrays, plan.indices, backend=backend)
+    local = contract_tree_slices(
+        tree,
+        arrays,
+        plan.indices,
+        backend=backend,
+        implementation=implementation,
+    )
     stats = SlicedContractStats(rank, size, nslices, len(plan.indices), assignment)
 
     result = np.zeros_like(local) if rank == root else None
diff --git a/src/qibotn/torch_contractor.py b/src/qibotn/torch_contractor.py
new file mode 100644
index 0000000..f2e7a67
--- /dev/null
+++ b/src/qibotn/torch_contractor.py
@@ -0,0 +1,252 @@
+"""Torch C++ contraction backend for cotengra trees.
+
+This module compiles a restricted cotengra contraction tree into a compact
+execution plan, then executes that plan in a C++ torch extension. It is an
+experimental CPU path for reducing Python-level overhead between many
+pairwise contractions.
+"""
+
+from __future__ import annotations
+
+import importlib
+import os
+from functools import lru_cache
+from pathlib import Path
+from collections import defaultdict
+
+
+_EXTENSION = None
+_CONTRACTORS = {}
+SMALL_GEMM_BATCH_FLOPS = 1_000_000
+
+
+def _load_extension():
+    global _EXTENSION
+    if _EXTENSION is not None:
+        return _EXTENSION
+
+    from torch.utils.cpp_extension import load
+
+    source = Path(__file__).resolve().parent / "csrc" / "torch_contractor.cpp"
+    mklroot = os.environ.get("MKLROOT")
+    extra_cflags = ["-O3"]
+    extra_ldflags = []
+    extra_include_paths = []
+    if mklroot:
+        mklroot_path = Path(mklroot)
+        mkl_include = mklroot_path / "include"
+        mkl_lib = mklroot_path / "lib"
+        if (mkl_include / "mkl_cblas.h").exists() and (
+            (mkl_lib / "libmkl_rt.so").exists()
+            or (mkl_lib / "libmkl_rt.so.2").exists()
+        ):
+            extra_cflags.append("-DQIBOTN_USE_MKL")
+            extra_include_paths.append(str(mkl_include))
+            extra_ldflags.extend([f"-L{mkl_lib}", "-lmkl_rt"])
+
+    _EXTENSION = load(
+        name="qibotn_torch_contractor",
+        sources=[str(source)],
+        extra_cflags=extra_cflags,
+        extra_ldflags=extra_ldflags,
+        extra_include_paths=extra_include_paths,
+        verbose=False,
+    )
+    return _EXTENSION
+
+
+def _is_plain_permutation(expr):
+    if expr is None:
+        return None
+    if isinstance(expr, tuple):
+        return tuple(int(i) for i in expr)
+    if not isinstance(expr, str):
+        return None
+    if "," in expr or "->" not in expr:
+        return None
+    source, target = expr.split("->", 1)
+    if len(source) != len(target):
+        return None
+    if len(set(source)) != len(source) or set(source) != set(target):
+        return None
+    return tuple(source.index(ix) for ix in target)
+
+
+def _maybe_tuple(values):
+    return () if values is None else tuple(int(x) for x in values)
+
+
+def _shape_from_inds(tree, node):
+    return tuple(int(tree.size_dict[ix]) for ix in tree.get_inds(node))
+
+
+def _matmul_signature(op):
+    kind = op[3]
+    if kind != 0:
+        return None
+    left_shape = op[5]
+    right_shape = op[7]
+    if len(left_shape) == 2 and len(right_shape) == 2:
+        m, k, n = left_shape[-2], left_shape[-1], right_shape[-1]
+        return ("mm", int(m), int(k), int(n), int(m * k * n))
+    return None
+
+
+def _normalize_node_ids(tree, contractions):
+    leaf_to_id = {
+        frozenset((i,)): i
+        for i in range(tree.N)
+    }
+    next_id = len(leaf_to_id)
+    node_to_id = dict(leaf_to_id)
+    for parent, _left, _right, _tdot, _arg, _perm in contractions:
+        if parent not in node_to_id:
+            node_to_id[parent] = next_id
+            next_id += 1
+
+    return node_to_id, next_id
+
+
+@lru_cache(maxsize=32)
+def compile_torch_plan(tree):
+    """Compile ``tree`` into C++ contractor plan fields.
+
+    The supported subset is the same pairwise matmul lowering used by
+    cotengra for torch CPU. Single-tensor diagonal/sum preprocessing is not
+    supported yet because it appears only in less common trees; callers should
+    fall back to cotengra for those cases.
+    """
+
+    contract_mod = importlib.import_module("cotengra.contract")
+    contractions = contract_mod.extract_contractions(tree)
+    node_to_id, ntemps = _normalize_node_ids(tree, contractions)
+    plan = []
+
+    for parent, left, right, tdot, arg, perm in contractions:
+        if left is None or right is None:
+            raise NotImplementedError(
+                "C++ torch contractor does not support cotengra preprocessing."
+            )
+
+        left_shape = _shape_from_inds(tree, left)
+        right_shape = _shape_from_inds(tree, right)
+        if tdot:
+            parsed = contract_mod._parse_tensordot_axes_to_matmul(
+                arg,
+                left_shape,
+                right_shape,
+            )
+        else:
+            parsed = contract_mod._parse_eq_to_batch_matmul(
+                arg,
+                left_shape,
+                right_shape,
+            )
+
+        (
+            eq_a,
+            eq_b,
+            new_shape_a,
+            new_shape_b,
+            new_shape_ab,
+            perm_ab,
+            pure_multiplication,
+        ) = parsed
+
+        left_perm = _is_plain_permutation(eq_a)
+        right_perm = _is_plain_permutation(eq_b)
+        if left_perm is None and eq_a is not None:
+            raise NotImplementedError(f"Unsupported left preparation: {eq_a!r}")
+        if right_perm is None and eq_b is not None:
+            raise NotImplementedError(f"Unsupported right preparation: {eq_b!r}")
+
+        plan.append(
+            (
+                node_to_id[parent],
+                node_to_id[left],
+                node_to_id[right],
+                1 if pure_multiplication else 0,
+                left_perm or (),
+                _maybe_tuple(new_shape_a),
+                right_perm or (),
+                _maybe_tuple(new_shape_b),
+                _maybe_tuple(new_shape_ab),
+                _maybe_tuple(perm_ab),
+            )
+        )
+
+        if perm is not None:
+            raise NotImplementedError(
+                "C++ torch contractor does not support cotengra tensordot perm."
+            )
+
+    root_id = node_to_id[tree.root]
+    return tuple(plan), int(ntemps), int(root_id)
+
+
+@lru_cache(maxsize=32)
+def compile_batch_groups(tree, max_flops=SMALL_GEMM_BATCH_FLOPS):
+    plan, _ntemps, _root_id = compile_torch_plan(tree)
+    contractions = importlib.import_module("cotengra.contract").extract_contractions(tree)
+    node_to_id, _ntemps = _normalize_node_ids(tree, contractions)
+    depth = {frozenset((i,)): 0 for i in range(tree.N)}
+    tensor_depth = {i: 0 for i in range(tree.N)}
+    groups = defaultdict(list)
+
+    for op_index, (contract_op, contraction) in enumerate(zip(plan, contractions)):
+        parent, left, right, _tdot, _arg, _perm = contraction
+        d = max(depth[left], depth[right]) + 1
+        depth[parent] = d
+        tensor_depth[contract_op[0]] = d
+        sig = _matmul_signature(contract_op)
+        if sig is None:
+            continue
+        kind, m, k, n, flops = sig
+        if flops > max_flops:
+            continue
+        groups[(d, kind, m, k, n)].append(op_index)
+
+    batch_groups = tuple(
+        tuple(items)
+        for _key, items in sorted(groups.items(), key=lambda item: (item[0], item[1][0]))
+        if len(items) >= 2
+    )
+    return batch_groups
+
+
+def batch_group_summary(tree, max_flops=SMALL_GEMM_BATCH_FLOPS):
+    plan, _ntemps, _root_id = compile_torch_plan(tree)
+    groups = compile_batch_groups(tree, max_flops=max_flops)
+    covered = sum(len(group) for group in groups)
+    calls_saved = sum(len(group) - 1 for group in groups)
+    by_shape = []
+    for group in groups:
+        op = plan[group[0]]
+        sig = _matmul_signature(op)
+        by_shape.append((sig[1:4], len(group), group[:8]))
+    return {
+        "groups": len(groups),
+        "covered_ops": covered,
+        "calls_saved": calls_saved,
+        "by_shape": by_shape,
+    }
+
+
+def contract_tree_cpp(tree, arrays):
+    """Contract a cotengra tree using the experimental C++ torch contractor."""
+
+    contractor = prepare_torch_cpp_contractor(tree)
+    return contractor.contract(list(arrays))
+
+
+def prepare_torch_cpp_contractor(tree):
+    """Load the extension and compile ``tree`` without running contraction."""
+
+    ext = _load_extension()
+    key = id(tree)
+    contractor = _CONTRACTORS.get(key)
+    if contractor is None:
+        plan, ntemps, root_id = compile_torch_plan(tree)
+        contractor = ext.Contractor(list(plan), ntemps, root_id)
+        _CONTRACTORS[key] = contractor
+    return contractor
diff --git a/tests/test_cpu_backend.py b/tests/test_cpu_backend.py
index 300c6fe..877ddd8 100644
--- a/tests/test_cpu_backend.py
+++ b/tests/test_cpu_backend.py
@@ -5,7 +5,10 @@ from qibo import Circuit, gates, hamiltonians
 from qibo.symbols import X, Z
 
 from qibotn.backends.cpu import CpuTensorNet
-from qibotn.benchmark_cases import build_circuit as build_benchmark_circuit
+from qibotn.benchmark_cases import (
+    build_circuit as build_benchmark_circuit,
+    exact_pauli_sum,
+)
 
 
 def build_circuit(nqubits=6):
@@ -109,6 +112,59 @@ def test_cpu_mps_sampling_uses_nshots():
     assert set(result.frequencies()) <= {"0000", "1111"}
 
 
+def test_cpu_mps_mpo_expectation_matches_statevector():
+    circuit = build_circuit(nqubits=4)
+    x = np.array([[0, 1], [1, 0]], dtype=complex)
+    z = np.array([[1, 0], [0, -1]], dtype=complex)
+    i2 = np.eye(2, dtype=complex)
+    mpo = [
+        x.reshape(1, 2, 2, 1),
+        z.reshape(1, 2, 2, 1),
+        i2.reshape(1, 2, 2, 1),
+        i2.reshape(1, 2, 2, 1),
+    ]
+    exact = exact_pauli_sum(circuit, [(1.0, (("X", 0), ("Z", 1)))], 4)
+
+    backend = CpuTensorNet(
+        {
+            "MPI_enabled": False,
+            "MPS_enabled": True,
+            "NCCL_enabled": False,
+            "expectation_enabled": {"mpo_tensors": mpo},
+            "max_bond_dimension": 64,
+            "tensor_module": "torch",
+            "torch_threads": 1,
+        }
+    )
+    value = backend.execute_circuit(circuit)[0]
+
+    assert math.isclose(value, exact, abs_tol=1e-12)
+
+
+def test_cpu_mps_dense_observable_dict_matches_known_value():
+    circuit = Circuit(2)
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(0, 1))
+
+    bell = np.zeros((4, 4), dtype=complex)
+    bell[0, 0] = bell[0, 3] = bell[3, 0] = bell[3, 3] = 0.5
+
+    backend = CpuTensorNet(
+        {
+            "MPI_enabled": False,
+            "MPS_enabled": True,
+            "NCCL_enabled": False,
+            "expectation_enabled": {"matrix": bell, "qubits": [0, 1]},
+            "max_bond_dimension": 16,
+            "tensor_module": "torch",
+            "torch_threads": 1,
+        }
+    )
+    value = backend.execute_circuit(circuit)[0]
+
+    assert math.isclose(value, 1.0, abs_tol=1e-12)
+
+
 def test_cpu_generic_tn_long_pauli_string_matches_statevector():
     circuit = build_benchmark_circuit("rxx_rzz", 10, 2, 42)
     observable = {"pauli_string_pattern": "XZ"}
diff --git a/tests/test_vidal_backend.py b/tests/test_vidal_backend.py
index 149cd0c..0af5736 100644
--- a/tests/test_vidal_backend.py
+++ b/tests/test_vidal_backend.py
@@ -2,10 +2,18 @@ import math
 
 import numpy as np
 from qibo import Circuit, gates, hamiltonians
-from qibo.symbols import X, Y, Z
+from qibo.symbols import Symbol, X, Y, Z
 
-from qibotn.backends.vidal import VidalBackend, _can_route_non_adjacent, _unsupported_reason
+from qibotn.benchmark_cases import exact_pauli_sum
+from qibotn.backends.vidal import (
+    VidalBackend,
+    _can_route_non_adjacent,
+    _unsupported_reason,
+    _operator_terms_to_mpo,
+    _symbolic_hamiltonian_to_operator_terms,
+)
 from qibotn.backends.vidal_tebd import (
+    VidalTEBDExecutor,
     _route_non_adjacent_gates,
     _gate_sites,
 )
@@ -37,6 +45,25 @@ def test_vidal_backend_expectation_matches_statevector():
     np.testing.assert_allclose(value, exact, atol=1e-12)
 
 
+def test_vidal_backend_accepts_unlimited_bond_and_no_cutoff():
+    circuit = build_local_circuit(nqubits=6, nlayers=2)
+    observable = hamiltonians.SymbolicHamiltonian(
+        form=0.5 * X(0) * Z(1) - 0.7 * Z(5)
+    )
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=None,
+        cut_ratio=None,
+        tensor_module="torch",
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=False)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
 def test_vidal_backend_fallback_for_non_adjacent_gate():
     """compile_circuit=False (default) → falls back to qmatchatea for non-adjacent."""
     circuit = Circuit(4)
@@ -128,6 +155,208 @@ def test_routing_non_adjacent_cnot():
     np.testing.assert_allclose(value, exact, atol=1e-12)
 
 
+def test_routing_preserves_reversed_non_adjacent_gate_order():
+    circuit = Circuit(6)
+    circuit.add(gates.X(5))
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(5, 0))
+
+    observable = hamiltonians.SymbolicHamiltonian(form=X(0) + Z(5) + Z(0) * Z(5))
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=64,
+        tensor_module="torch",
+        compile_circuit=True,
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=False)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_backend_preprocesses_non_adjacent_circuit():
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(0, 3))
+    observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=64,
+        tensor_module="torch",
+        compile_circuit=True,
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=True)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_backend_preprocesses_toffoli_locally():
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    circuit.add(gates.H(1))
+    circuit.add(gates.TOFFOLI(0, 1, 3))
+    observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=128,
+        tensor_module="torch",
+        compile_circuit=True,
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=True)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_expectation_preserves_complex_coefficients():
+    circuit = Circuit(1)
+    observable = hamiltonians.SymbolicHamiltonian(form=(1.0 + 2.0j) * Z(0))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=8,
+        tensor_module="torch",
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=False)
+
+    np.testing.assert_allclose(value, 1.0 + 2.0j, atol=1e-12)
+
+
+def test_vidal_expectation_supports_custom_local_symbols():
+    circuit = build_local_circuit(nqubits=4, nlayers=2)
+    a0 = Symbol(0, np.array([[0.2, 1.0], [1.0, -0.3]], dtype=complex), name="A")
+    b2 = Symbol(2, np.array([[0.7, -0.4j], [0.4j, 0.1]], dtype=complex), name="B")
+    a3 = Symbol(3, np.array([[0.5, 0.2], [0.2, -0.8]], dtype=complex), name="A")
+    observable = hamiltonians.SymbolicHamiltonian(form=0.7 * a0 * b2 - 0.4 * a3)
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=64,
+        tensor_module="torch",
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=False)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_executor_mpo_expectation_matches_pauli_sum():
+    circuit = build_local_circuit(nqubits=4, nlayers=2)
+    executor = VidalTEBDExecutor(
+        nqubits=circuit.nqubits,
+        max_bond=64,
+        tensor_module="torch",
+    )
+    executor.run_circuit(circuit)
+
+    x = np.array([[0, 1], [1, 0]], dtype=complex)
+    z = np.array([[1, 0], [0, -1]], dtype=complex)
+    i2 = np.eye(2, dtype=complex)
+    mpo = [
+        x.reshape(1, 2, 2, 1),
+        z.reshape(1, 2, 2, 1),
+        i2.reshape(1, 2, 2, 1),
+        i2.reshape(1, 2, 2, 1),
+    ]
+    mpo_value = executor.expectation_mpo(mpo)
+    pauli_value = executor.expectation_pauli_sum([(1.0, (("X", 0), ("Z", 1)))])
+
+    np.testing.assert_allclose(mpo_value, pauli_value, atol=1e-12)
+
+
+def test_vidal_backend_accepts_mpo_observable_dict():
+    circuit = build_local_circuit(nqubits=4, nlayers=2)
+    x = np.array([[0, 1], [1, 0]], dtype=complex)
+    z = np.array([[1, 0], [0, -1]], dtype=complex)
+    i2 = np.eye(2, dtype=complex)
+    mpo = [
+        x.reshape(1, 2, 2, 1),
+        z.reshape(1, 2, 2, 1),
+        i2.reshape(1, 2, 2, 1),
+        i2.reshape(1, 2, 2, 1),
+    ]
+    exact = exact_pauli_sum(circuit, [(1.0, (("X", 0), ("Z", 1)))], 4)
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=64,
+        tensor_module="torch",
+        fallback=False,
+    )
+    value = backend.expectation(circuit, {"mpo_tensors": mpo}, preprocess=False)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_symbolic_hamiltonian_auto_mpo_matches_operator_sum():
+    circuit = build_local_circuit(nqubits=5, nlayers=2)
+    observable = hamiltonians.SymbolicHamiltonian(
+        form=0.3 * X(0) * Z(1) - 0.2j * Y(2) + 0.7 * Z(3) * X(4)
+    )
+
+    executor = VidalTEBDExecutor(
+        nqubits=circuit.nqubits,
+        max_bond=64,
+        tensor_module="torch",
+    )
+    executor.run_circuit(circuit)
+    terms = _symbolic_hamiltonian_to_operator_terms(observable)
+
+    term_value = executor.expectation_operator_sum(terms)
+    mpo_value = executor.expectation_mpo(_operator_terms_to_mpo(terms, circuit.nqubits))
+
+    np.testing.assert_allclose(mpo_value, term_value, atol=1e-12)
+
+
+def test_vidal_backend_accepts_dense_two_qubit_observable():
+    circuit = Circuit(2)
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(0, 1))
+
+    bell = np.zeros((4, 4), dtype=complex)
+    bell[0, 0] = bell[0, 3] = bell[3, 0] = bell[3, 3] = 0.5
+    observable = {"matrix": bell, "qubits": [0, 1]}
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=16,
+        tensor_module="torch",
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=False)
+
+    np.testing.assert_allclose(value, 1.0, atol=1e-12)
+
+
+def test_vidal_backend_dense_observable_preserves_complex_value():
+    circuit = Circuit(2)
+    circuit.add(gates.H(0))
+    circuit.add(gates.H(1))
+
+    op = np.zeros((4, 4), dtype=complex)
+    op[0, 3] = 1.0
+    observable = {"coefficient": 1.0j, "matrix": op, "qubits": [0, 1]}
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=16,
+        tensor_module="torch",
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=False)
+
+    np.testing.assert_allclose(value, 0.25j, atol=1e-12)
+
+
 def test_truncation_error_no_truncation():
     """With large bond, truncation error should be essentially zero."""
     circuit = build_local_circuit(nqubits=6, nlayers=2)
@@ -141,6 +370,10 @@ def test_truncation_error_no_truncation():
     assert backend.last_truncation_error < 1e-14, (
         f"Expected near-zero truncation error, got {backend.last_truncation_error}"
     )
+    assert backend.last_max_truncation_error < 1e-14, (
+        "Expected near-zero max truncation error, got "
+        f"{backend.last_max_truncation_error}"
+    )
 
 
 def test_vidal_backend_matches_statevector_multiterm():
diff --git a/tools/baseline_mps_expectation.py b/tools/baseline_mps_expectation.py
index 311cef7..ef12ae3 100644
--- a/tools/baseline_mps_expectation.py
+++ b/tools/baseline_mps_expectation.py
@@ -19,6 +19,22 @@ from qibotn.backends.qmatchatea import QMatchaTeaBackend
 from qibotn.backends.vidal_tebd import run_vidal_ring_xz
 
 
+def optional_int(text):
+    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
+        return None
+    return int(text)
+
+
+def optional_float(text):
+    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
+        return None
+    return float(text)
+
+
+def format_optional(value, fmt="g"):
+    return "None" if value is None else format(value, fmt)
+
+
 def build_circuit(nqubits, nlayers, seed):
     return build_benchmark_circuit("brickwall_cnot", nqubits, nlayers, seed)
 
@@ -35,7 +51,8 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--nqubits", type=int, default=40)
     parser.add_argument("--nlayers", type=int, default=30)
-    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
+    parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=512)
+    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
     parser.add_argument("--seed", type=int, default=42)
     parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="torch")
     parser.add_argument("--torch-threads", type=int, default=32)
@@ -109,7 +126,8 @@ def main():
             mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR"
         print(
             f"nqubits={args.nqubits} nlayers={args.nlayers} "
-            f"bond={args.bond} seed={args.seed} "
+            f"bond={format_optional(args.bond)} "
+            f"cut_ratio={format_optional(args.cut_ratio)} seed={args.seed} "
             f"tensor_module={args.tensor_module} svd_control=E! "
             f"compile_circuit=True mpi={mpi_label} executor={args.executor}"
         )
@@ -128,7 +146,7 @@ def main():
             value, timings = run_segment_vidal_mpi_ring_xz(
                 circuit,
                 max_bond=args.bond,
-                cut_ratio=1e-12,
+                cut_ratio=args.cut_ratio,
                 tensor_module=args.tensor_module,
                 comm=MPI.COMM_WORLD,
             )
@@ -136,7 +154,7 @@ def main():
             value = run_vidal_ring_xz(
                 circuit,
                 max_bond=args.bond,
-                cut_ratio=1e-12,
+                cut_ratio=args.cut_ratio,
                 tensor_module=args.tensor_module,
             )
     else:
@@ -144,7 +162,7 @@ def main():
         backend.configure_tn_simulation(
             ansatz="MPS",
             max_bond_dimension=args.bond,
-            cut_ratio=1e-12,
+            cut_ratio=args.cut_ratio,
             svd_control="E!",
             tensor_module=args.tensor_module,
             compile_circuit=True,
diff --git a/tools/example_tn_case.py b/tools/example_tn_case.py
new file mode 100644
index 0000000..c35f057
--- /dev/null
+++ b/tools/example_tn_case.py
@@ -0,0 +1,33 @@
+"""Example custom case for tools/run_tn_custom.py."""
+
+from __future__ import annotations
+
+import math
+
+import numpy as np
+from qibo import Circuit, gates
+
+
+def build_circuit(nqubits, nlayers, seed):
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+    for layer in range(nlayers):
+        for qubit in range(nqubits):
+            circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
+            circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
+        for qubit in range(layer % 2, nqubits - 1, 2):
+            circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
+            circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
+    return circuit
+
+
+def build_observable(nqubits, seed):
+    return {
+        "terms": [
+            {
+                "coefficient": 1.0 / max(1, nqubits - 1),
+                "operators": [("Z", site), ("Z", site + 1)],
+            }
+            for site in range(nqubits - 1)
+        ]
+    }
diff --git a/tools/inspect_contraction_tree.py b/tools/inspect_contraction_tree.py
new file mode 100644
index 0000000..a6422ba
--- /dev/null
+++ b/tools/inspect_contraction_tree.py
@@ -0,0 +1,208 @@
+"""Inspect cotengra contraction trees for dominant torch matmul shapes."""
+
+from __future__ import annotations
+
+import argparse
+import importlib
+import math
+import pickle
+from collections import Counter, defaultdict
+from pathlib import Path
+
+
+def _prod(values):
+    out = 1
+    for value in values:
+        out *= int(value)
+    return out
+
+
+def _broadcast_batch(a_batch, b_batch):
+    if a_batch == b_batch:
+        return _prod(a_batch)
+    if not a_batch:
+        return _prod(b_batch)
+    if not b_batch:
+        return _prod(a_batch)
+
+    ndim = max(len(a_batch), len(b_batch))
+    a_batch = (1,) * (ndim - len(a_batch)) + tuple(a_batch)
+    b_batch = (1,) * (ndim - len(b_batch)) + tuple(b_batch)
+    return _prod(max(a, b) for a, b in zip(a_batch, b_batch))
+
+
+def _load_tree(path, index):
+    with Path(path).open("rb") as f:
+        payload = pickle.load(f)
+    trees = payload["trees"] if isinstance(payload, dict) else payload
+    if not isinstance(trees, (list, tuple)):
+        trees = [trees]
+    return trees[index]
+
+
+def _analyze_tree(tree):
+    contract_mod = importlib.import_module("cotengra.contract")
+    contractions = contract_mod.extract_contractions(tree)
+    size_dict = tree.size_dict
+    ops = []
+    counts = Counter()
+
+    for op_index, (parent, left, right, tdot, arg, perm) in enumerate(contractions):
+        if left is None and right is None:
+            counts["preprocess"] += 1
+            continue
+
+        left_inds = tree.get_inds(left)
+        right_inds = tree.get_inds(right)
+        parent_inds = tree.get_inds(parent)
+        left_shape = tuple(size_dict[ix] for ix in left_inds)
+        right_shape = tuple(size_dict[ix] for ix in right_inds)
+
+        if tdot:
+            parsed = contract_mod._parse_tensordot_axes_to_matmul(
+                arg,
+                left_shape,
+                right_shape,
+            )
+        else:
+            parsed = contract_mod._parse_eq_to_batch_matmul(
+                arg,
+                left_shape,
+                right_shape,
+            )
+
+        (
+            _eq_a,
+            _eq_b,
+            new_shape_a,
+            new_shape_b,
+            _new_shape_ab,
+            _perm_ab,
+            pure_multiplication,
+        ) = parsed
+
+        matmul_shape = None
+        matmul_flops = 0
+        if pure_multiplication:
+            kind = "mul"
+        else:
+            a_shape = tuple(new_shape_a or left_shape)
+            b_shape = tuple(new_shape_b or right_shape)
+            batch = _broadcast_batch(a_shape[:-2], b_shape[:-2])
+            m, k, n = int(a_shape[-2]), int(a_shape[-1]), int(b_shape[-1])
+            kind = "mm" if batch == 1 else "bmm"
+            matmul_shape = (batch, m, k, n)
+            matmul_flops = batch * m * k * n
+
+        tree_flops = int(tree.get_flops(parent))
+        out_size = int(tree.get_size(parent))
+        ops.append(
+            {
+                "index": op_index,
+                "kind": kind,
+                "matmul_shape": matmul_shape,
+                "matmul_flops": matmul_flops,
+                "tree_flops": tree_flops,
+                "out_size": out_size,
+                "left_shape": left_shape,
+                "right_shape": right_shape,
+                "left_rank": len(left_inds),
+                "right_rank": len(right_inds),
+                "out_rank": len(parent_inds),
+                "perm": perm,
+            }
+        )
+        counts[kind] += 1
+
+    return contractions, ops, counts
+
+
+def _format_log(value, base):
+    return "-inf" if value <= 0 else f"{math.log(value, base):.3f}"
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("tree", help="Pickle file containing one tree or {'trees': [...]}.")
+    parser.add_argument("--index", type=int, default=0, help="Tree index in the file.")
+    parser.add_argument("--top", type=int, default=20, help="Number of top ops to print.")
+    parser.add_argument(
+        "--dtype-bytes",
+        type=int,
+        default=8,
+        help="Bytes per element for memory estimates, for example 8 for complex64.",
+    )
+    args = parser.parse_args()
+
+    tree = _load_tree(args.tree, args.index)
+    contractions, ops, counts = _analyze_tree(tree)
+    nslices = int(getattr(tree, "multiplicity", 1))
+    per_slice_flops = sum(op["tree_flops"] for op in ops)
+    per_slice_write = sum(op["out_size"] for op in ops)
+    max_out = max((op["out_size"] for op in ops), default=0)
+    all_flops = per_slice_flops * nslices
+    all_write = per_slice_write * nslices
+
+    print(f"tree={args.tree} index={args.index}")
+    print(
+        "summary "
+        f"slices={nslices} contractions={len(contractions)} "
+        f"counts={dict(counts)}"
+    )
+    print(
+        "per_slice "
+        f"log10_flops={_format_log(per_slice_flops, 10)} "
+        f"log10_write={_format_log(per_slice_write, 10)} "
+        f"log2_max_output={_format_log(max_out, 2)} "
+        f"max_output_gib={max_out * args.dtype_bytes / 1024**3:.6g}"
+    )
+    print(
+        "all_slices "
+        f"log10_flops={_format_log(all_flops, 10)} "
+        f"log10_write={_format_log(all_write, 10)}"
+    )
+
+    print(f"\ntop_{args.top}_ops_by_flops")
+    for op in sorted(ops, key=lambda item: item["tree_flops"], reverse=True)[: args.top]:
+        print(
+            f"op={op['index']} kind={op['kind']} "
+            f"flops={op['tree_flops']:.6e} out={op['out_size']:.6e} "
+            f"matmul={op['matmul_shape']} "
+            f"ranks=({op['left_rank']},{op['right_rank']}->{op['out_rank']}) "
+            f"lhs={op['left_shape']} rhs={op['right_shape']}"
+        )
+
+    by_shape = defaultdict(lambda: [0, 0, 0])
+    for op in ops:
+        shape = op["matmul_shape"]
+        if shape is None:
+            continue
+        by_shape[shape][0] += 1
+        by_shape[shape][1] += op["tree_flops"]
+        by_shape[shape][2] += op["out_size"]
+
+    print(f"\ntop_{args.top}_matmul_shapes_by_flops")
+    for shape, (count, flops, out_size) in sorted(
+        by_shape.items(),
+        key=lambda item: item[1][1],
+        reverse=True,
+    )[: args.top]:
+        print(
+            f"shape={shape} count={count} "
+            f"flops={flops:.6e} output={out_size:.6e}"
+        )
+
+    print(f"\ntop_{args.top}_matmul_shapes_by_count")
+    for shape, (count, flops, out_size) in sorted(
+        by_shape.items(),
+        key=lambda item: item[1][0],
+        reverse=True,
+    )[: args.top]:
+        print(
+            f"shape={shape} count={count} "
+            f"flops={flops:.6e} output={out_size:.6e}"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/manage_tn_dask_cluster.sh b/tools/manage_tn_dask_cluster.sh
new file mode 100755
index 0000000..2fb7446
--- /dev/null
+++ b/tools/manage_tn_dask_cluster.sh
@@ -0,0 +1,223 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Manage the dask cluster used by TN path search.
+#
+# Defaults target two servers:
+#   scheduler: 10.20.1.103:8786
+#   workers:   10.20.1.103, 10.20.1.102
+#
+# Usage:
+#   tools/manage_tn_dask_cluster.sh start
+#   tools/manage_tn_dask_cluster.sh status
+#   tools/manage_tn_dask_cluster.sh stop
+#
+# Common overrides:
+#   SCHEDULER_HOST=10.20.1.103
+#   WORKER_HOSTS="10.20.1.103 10.20.1.102"
+#   NWORKERS=48
+#   NTHREADS=1
+#   ROOT_DIR=/home/yx/qibotn
+#   PYTHON_BIN=.venv/bin/python
+
+ROOT_DIR="${ROOT_DIR:-/home/yx/qibotn}"
+PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
+SCHEDULER_HOST="${SCHEDULER_HOST:-10.20.1.103}"
+SCHEDULER_PORT="${SCHEDULER_PORT:-8786}"
+DASHBOARD_ADDRESS="${DASHBOARD_ADDRESS:-:8787}"
+WORKER_HOSTS="${WORKER_HOSTS:-10.20.1.103 10.20.1.102}"
+NWORKERS="${NWORKERS:-48}"
+NTHREADS="${NTHREADS:-1}"
+MEMORY_LIMIT="${MEMORY_LIMIT:-0}"
+LOCAL_DIRECTORY="${LOCAL_DIRECTORY:-/tmp/qibotn-dask}"
+LOG_DIR="${LOG_DIR:-$ROOT_DIR/logs/dask}"
+SSH_BIN="${SSH_BIN:-ssh}"
+DASK_WORKER_TTL="${DASK_WORKER_TTL:-24 hours}"
+DASK_TICK_LIMIT="${DASK_TICK_LIMIT:-30 minutes}"
+DASK_LOST_WORKER_TIMEOUT="${DASK_LOST_WORKER_TIMEOUT:-30 minutes}"
+
+SCHEDULER_ADDR="tcp://${SCHEDULER_HOST}:${SCHEDULER_PORT}"
+
+is_local_host() {
+  local host="$1"
+  [[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0
+  [[ "$host" == "$(hostname)" ]] && return 0
+  [[ "$host" == "$(hostname -f 2>/dev/null || true)" ]] && return 0
+  hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx "$host"
+}
+
+run_on_host() {
+  local host="$1"
+  shift
+  local cmd="$*"
+  if is_local_host "$host"; then
+    bash -lc "$cmd"
+  else
+    "$SSH_BIN" "$host" "bash -lc $(printf '%q' "$cmd")"
+  fi
+}
+
+start_scheduler() {
+  local host="$SCHEDULER_HOST"
+  local log="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.log"
+  local pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
+  run_on_host "$host" "
+    set -euo pipefail
+    cd '$ROOT_DIR'
+    mkdir -p '$LOG_DIR'
+    if [[ -s '$pid_file' ]]; then
+      pid=\$(cat '$pid_file')
+      if kill -0 \"\$pid\" 2>/dev/null; then
+        echo \"scheduler already running on $host pid=\$pid\"
+        exit 0
+      fi
+    fi
+    DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \
+    DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \
+    DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \
+    setsid '$PYTHON_BIN' -m distributed.cli.dask_scheduler \
+      --host '$SCHEDULER_HOST' \
+      --port '$SCHEDULER_PORT' \
+      --dashboard-address '$DASHBOARD_ADDRESS' \
+      > '$log' 2>&1 < /dev/null &
+    pid=\$!
+    echo \"\$pid\" > '$pid_file'
+    echo \"scheduler host=$host pid=\$pid addr=$SCHEDULER_ADDR log=$log\"
+  "
+}
+
+start_worker() {
+  local host="$1"
+  local log="$LOG_DIR/worker_${host}.log"
+  local pid_file="$LOG_DIR/worker_${host}.pid"
+  run_on_host "$host" "
+    set -euo pipefail
+    cd '$ROOT_DIR'
+    mkdir -p '$LOG_DIR' '$LOCAL_DIRECTORY'
+    if [[ -s '$pid_file' ]]; then
+      pid=\$(cat '$pid_file')
+      if kill -0 \"\$pid\" 2>/dev/null; then
+        echo \"worker already running on $host pid=\$pid\"
+        exit 0
+      fi
+    fi
+    TCM_ENABLE=1 \
+    DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \
+    DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \
+    DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \
+    setsid '$PYTHON_BIN' -m distributed.cli.dask_worker \
+      '$SCHEDULER_ADDR' \
+      --host '$host' \
+      --nworkers '$NWORKERS' \
+      --nthreads '$NTHREADS' \
+      --memory-limit '$MEMORY_LIMIT' \
+      --local-directory '$LOCAL_DIRECTORY' \
+      > '$log' 2>&1 < /dev/null &
+    pid=\$!
+    echo \"\$pid\" > '$pid_file'
+    echo \"worker host=$host pid=\$pid scheduler=$SCHEDULER_ADDR log=$log\"
+  "
+}
+
+stop_host() {
+  local host="$1"
+  local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
+  local worker_pid_file="$LOG_DIR/worker_${host}.pid"
+  run_on_host "$host" "
+    set +e
+    for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do
+      [[ -f \"\$pid_file\" ]] || continue
+      if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then
+        continue
+      fi
+      pid=\$(cat \"\$pid_file\")
+      kill \"\$pid\" 2>/dev/null || true
+      rm -f \"\$pid_file\"
+    done
+    pkill -f '[d]istributed.cli.dask_worker.*$SCHEDULER_ADDR'
+    pkill -f '[d]istributed.cli.dask_scheduler.*--port $SCHEDULER_PORT'
+    true
+  "
+}
+
+status_host() {
+  local host="$1"
+  local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
+  local worker_pid_file="$LOG_DIR/worker_${host}.pid"
+  echo "--------------------------------------------------------------------------------"
+  echo "host=$host"
+  run_on_host "$host" "
+    set +e
+    for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do
+      [[ -f \"\$pid_file\" ]] || continue
+      if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then
+        continue
+      fi
+      pid=\$(cat \"\$pid_file\")
+      if kill -0 \"\$pid\" 2>/dev/null; then
+        ps -p \"\$pid\" -o pid,ppid,stat,etime,cmd --no-headers
+      else
+        echo \"stale pid_file=\$pid_file pid=\$pid\"
+      fi
+    done
+    pgrep -af '[d]istributed.cli.dask' || true
+  "
+}
+
+case "${1:-help}" in
+  start)
+    start_scheduler
+    sleep 2
+    for host in $WORKER_HOSTS; do
+      start_worker "$host"
+    done
+    echo
+    echo "Dask scheduler: $SCHEDULER_ADDR"
+    echo "Dashboard: http://$SCHEDULER_HOST$DASHBOARD_ADDRESS"
+    ;;
+  stop)
+    for host in $WORKER_HOSTS; do
+      stop_host "$host"
+    done
+    stop_host "$SCHEDULER_HOST"
+    ;;
+  status)
+    status_host "$SCHEDULER_HOST"
+    for host in $WORKER_HOSTS; do
+      [[ "$host" == "$SCHEDULER_HOST" ]] && continue
+      status_host "$host"
+    done
+    ;;
+  restart)
+    "$0" stop
+    sleep 2
+    "$0" start
+    ;;
+  help|*)
+    cat <<EOF
+Usage: tools/manage_tn_dask_cluster.sh [start|stop|restart|status]
+
+Defaults:
+  SCHEDULER_HOST=$SCHEDULER_HOST
+  SCHEDULER_PORT=$SCHEDULER_PORT
+  WORKER_HOSTS="$WORKER_HOSTS"
+  NWORKERS=$NWORKERS
+  NTHREADS=$NTHREADS
+  ROOT_DIR=$ROOT_DIR
+  PYTHON_BIN=$PYTHON_BIN
+  DASK_WORKER_TTL="$DASK_WORKER_TTL"
+  DASK_TICK_LIMIT=$DASK_TICK_LIMIT
+  DASK_LOST_WORKER_TIMEOUT=$DASK_LOST_WORKER_TIMEOUT
+
+Search command after start:
+  TCM_ENABLE=1 python -u tools/tn_contest_runner.py search \\
+    --case main1 \\
+    --dask-address $SCHEDULER_ADDR \\
+    --torch-threads 48 \\
+    --dtype complex64 \\
+    --tn-search-repeats 2048 \\
+    --tn-search-time 300
+EOF
+    exit 2
+    ;;
+esac
diff --git a/tools/mps_contest_runner.py b/tools/mps_contest_runner.py
new file mode 100644
index 0000000..353cc3e
--- /dev/null
+++ b/tools/mps_contest_runner.py
@@ -0,0 +1,313 @@
+#!/usr/bin/env python
+"""Contest-style multi-node Vidal/MPS expectation runner."""
+
+from __future__ import annotations
+
+import argparse
+import math
+import sys
+import time
+from dataclasses import dataclass
+from pathlib import Path
+
+import numpy as np
+from mpi4py import MPI
+from qibo import Circuit, gates, hamiltonians
+from qibo.symbols import X, Y, Z
+
+ROOT = Path(__file__).resolve().parents[1]
+SRC = ROOT / "src"
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+from qibotn.backends.vidal import VidalBackend  # noqa: E402
+from qibotn.expectation_runner import exact_for_observable  # noqa: E402
+
+
+@dataclass(frozen=True)
+class CaseSpec:
+    circuit_kind: str
+    observables: tuple[str, ...]
+    nqubits: int
+    nlayers: int
+    bond: int | None
+    seed: int
+
+
+CASES = {
+    "main1": CaseSpec(
+        circuit_kind="reversed_cnot",
+        observables=("ring_xz",),
+        nqubits=128,
+        nlayers=24,
+        bond=512,
+        seed=31001,
+    ),
+    "main2": CaseSpec(
+        circuit_kind="rxx_rzz",
+        observables=("open_zz", "range2_xx", "mixed_local"),
+        nqubits=128,
+        nlayers=32,
+        bond=1024,
+        seed=31002,
+    ),
+    "strong": CaseSpec(
+        circuit_kind="scramble",
+        observables=("ring_xz", "long_z_string", "dense3_spread"),
+        nqubits=256,
+        nlayers=48,
+        bond=2048,
+        seed=41001,
+    ),
+}
+
+
+def optional_int(text):
+    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
+        return None
+    return int(text)
+
+
+def optional_float(text):
+    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
+        return None
+    return float(text)
+
+
+def format_optional(value, fmt="g"):
+    return "None" if value is None else format(value, fmt)
+
+
+def set_torch_threads(nthreads):
+    try:
+        import torch
+
+        torch.set_num_threads(nthreads)
+    except Exception:
+        pass
+
+
+def add_single_qubit_layer(circuit, nqubits, rng, include_rx=False):
+    for qubit in range(nqubits):
+        circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
+        circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
+        if include_rx:
+            circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi)))
+
+
+def build_circuit(kind, nqubits, nlayers, seed):
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+
+    for layer in range(nlayers):
+        if kind == "reversed_cnot":
+            add_single_qubit_layer(circuit, nqubits, rng)
+            for qubit in range(0, nqubits - 1, 2):
+                gate = gates.CNOT(qubit + 1, qubit) if layer % 2 else gates.CNOT(qubit, qubit + 1)
+                circuit.add(gate)
+            for qubit in range(1, nqubits - 1, 2):
+                gate = gates.CNOT(qubit + 1, qubit) if layer % 2 == 0 else gates.CNOT(qubit, qubit + 1)
+                circuit.add(gate)
+
+        elif kind == "rxx_rzz":
+            add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
+
+        elif kind == "scramble":
+            add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
+                if layer % 5 == 4:
+                    circuit.add(gates.SWAP(qubit, qubit + 1))
+
+        else:
+            raise ValueError(f"Unknown circuit kind {kind!r}.")
+
+    return circuit
+
+
+def dense_observable(nqubits, qubits, seed, dim):
+    del nqubits
+    rng = np.random.default_rng(seed)
+    raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim))
+    matrix = (raw + raw.conj().T) / 2.0
+    matrix = matrix / np.linalg.norm(matrix)
+    return {"matrix": matrix, "qubits": list(qubits)}
+
+
+def observable(kind, nqubits, seed):
+    q1 = nqubits // 4
+    q2 = nqubits // 2
+    q3 = (3 * nqubits) // 4
+    last = nqubits - 1
+
+    if kind == "boundary_ZZ_q1":
+        return hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))
+    if kind == "boundary_ZZ_q2":
+        return hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))
+    if kind == "boundary_ZZ_q3":
+        return hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))
+    if kind == "long_Z_5_sites":
+        return hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last))
+    if kind == "mixed_XZYZX":
+        return hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last))
+    if kind == "ring_xz":
+        form = 0
+        for qubit in range(nqubits):
+            form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "open_zz":
+        form = 0
+        for qubit in range(nqubits - 1):
+            form += (1.0 / max(1, nqubits - 1)) * Z(qubit) * Z(qubit + 1)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "range2_xx":
+        form = 0
+        for qubit in range(nqubits - 2):
+            form += (1.0 / max(1, nqubits - 2)) * X(qubit) * X(qubit + 2)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "mixed_local":
+        form = 0.25 * X(0) - 0.5 * Z(last) + 0.125 * X(q1) * Z(q2) * Y(q3)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "complex_iZ0":
+        return hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))
+    if kind == "dense2_mid":
+        return dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)
+    if kind == "dense3_spread":
+        return dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)
+    raise ValueError(f"Unknown observable kind {kind!r}.")
+
+
+def selected_observables(args, case):
+    if args.observables:
+        return tuple(args.observables)
+    if args.obs_filter:
+        return tuple(x.strip() for x in args.obs_filter.split(",") if x.strip())
+    return case.observables
+
+
+def apply_case_defaults(args):
+    case = CASES[args.case]
+    if args.nqubits is None:
+        args.nqubits = case.nqubits
+    if args.nlayers is None:
+        args.nlayers = case.nlayers
+    if args.bond == "case-default":
+        args.bond = case.bond
+    if args.seed is None:
+        args.seed = case.seed
+    args.observables = selected_observables(args, case)
+
+
+def run_case(args):
+    set_torch_threads(args.torch_threads)
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    case = CASES[args.case]
+    circuit = build_circuit(case.circuit_kind, args.nqubits, args.nlayers, args.seed)
+
+    if rank == 0:
+        print("=" * 88, flush=True)
+        print(
+            "backend=vidal_mps "
+            f"case={args.case} circuit={case.circuit_kind} ranks={size} "
+            f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} "
+            f"bond={format_optional(args.bond)} cut_ratio={format_optional(args.cut_ratio)} "
+            f"torch_threads={args.torch_threads} seed={args.seed} "
+            f"observables={','.join(args.observables)}",
+            flush=True,
+        )
+        print("observable exact value abs_error rel_error seconds trunc_sum trunc_max status", flush=True)
+
+    for obs_name in args.observables:
+        obs = observable(obs_name, args.nqubits, args.seed)
+        exact = None
+        if args.exact and rank == 0:
+            if args.nqubits > args.exact_max_qubits:
+                raise ValueError(
+                    f"--exact is limited to {args.exact_max_qubits} qubits by default."
+                )
+            exact = exact_for_observable(circuit, obs, args.nqubits)
+
+        backend = VidalBackend()
+        backend.configure_tn_simulation(
+            max_bond_dimension=args.bond,
+            cut_ratio=args.cut_ratio,
+            tensor_module="torch",
+            mpi_approach="CT",
+            mpi_num_procs=size,
+            fallback=False,
+        )
+
+        comm.Barrier()
+        start = time.perf_counter()
+        try:
+            value = backend.expectation(
+                circuit,
+                obs,
+                preprocess=True,
+                compile_circuit=False,
+            )
+            status = "ok"
+        except Exception as exc:
+            value = np.nan
+            status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0]
+        seconds = time.perf_counter() - start
+
+        if rank == 0:
+            abs_error = float("nan") if exact is None else abs(value - exact)
+            rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
+            exact_text = "nan" if exact is None else f"{exact:.16e}"
+            print(
+                f"{obs_name} {exact_text} {value!r} "
+                f"{abs_error:.6e} {rel_error:.6e} {seconds:.3f} "
+                f"{backend.last_truncation_error:.6e} "
+                f"{backend.last_max_truncation_error:.6e} {status}",
+                flush=True,
+            )
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("mode", choices=("run", "validate", "list"))
+    parser.add_argument("--case", choices=sorted(CASES), default="main1")
+    parser.add_argument("--observables", nargs="+")
+    parser.add_argument("--obs-filter", default="")
+    parser.add_argument("--nqubits", type=int)
+    parser.add_argument("--nlayers", type=int)
+    parser.add_argument("--bond", "--bonds", dest="bond", default="case-default")
+    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
+    parser.add_argument("--seed", type=int)
+    parser.add_argument("--torch-threads", type=int, default=8)
+    parser.add_argument("--exact", action="store_true")
+    parser.add_argument("--exact-max-qubits", type=int, default=24)
+    args = parser.parse_args()
+
+    if args.mode == "list":
+        for name, case in CASES.items():
+            print(
+                f"{name}: circuit={case.circuit_kind} "
+                f"observables={','.join(case.observables)} "
+                f"nqubits={case.nqubits} nlayers={case.nlayers} "
+                f"bond={case.bond} seed={case.seed}"
+            )
+        return
+
+    apply_case_defaults(args)
+    if isinstance(args.bond, str):
+        args.bond = optional_int(args.bond)
+
+    if args.mode == "validate":
+        args.exact = True
+        args.nqubits = min(args.nqubits, args.exact_max_qubits)
+
+    run_case(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/run_tn_custom.py b/tools/run_tn_custom.py
new file mode 100644
index 0000000..049ebed
--- /dev/null
+++ b/tools/run_tn_custom.py
@@ -0,0 +1,243 @@
+#!/usr/bin/env python
+"""Run TN expectation for a user-provided circuit and observable.
+
+The case module should define:
+
+    def build_circuit(nqubits, nlayers, seed): ...
+    def build_observable(nqubits, seed): ...
+
+``build_observable`` may return a Qibo SymbolicHamiltonian/form or the qibotn
+dict form:
+
+    {"terms": [
+        {"coefficient": 1.0, "operators": [("X", 0), ("Z", 1)]},
+    ]}
+
+For a single repeated Pauli string, pass ``--pauli-pattern`` instead of
+defining ``build_observable``.
+"""
+
+from __future__ import annotations
+
+import argparse
+import importlib.util
+import inspect
+import json
+import sys
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+SRC = ROOT / "src"
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+from qibotn.expectation_runner import (  # noqa: E402
+    ExpectationConfig,
+    exact_for_observable,
+    run_cpu_expectation,
+)
+
+
+def optional_int(text):
+    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
+        return None
+    return int(text)
+
+
+def optional_float(text):
+    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
+        return None
+    return float(text)
+
+
+def load_module(path):
+    path = Path(path).resolve()
+    spec = importlib.util.spec_from_file_location(path.stem, path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"Cannot import case module from {path}.")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def call_builder(fn, **kwargs):
+    sig = inspect.signature(fn)
+    if any(p.kind == p.VAR_KEYWORD for p in sig.parameters.values()):
+        return fn(**kwargs)
+    accepted = {
+        name: value
+        for name, value in kwargs.items()
+        if name in sig.parameters
+    }
+    return fn(**accepted)
+
+
+def load_observable(args, module):
+    if args.pauli_pattern:
+        return {"pauli_string_pattern": args.pauli_pattern}
+    if args.observable_json:
+        with Path(args.observable_json).open() as f:
+            return json.load(f)
+    if hasattr(module, "build_observable"):
+        return call_builder(
+            module.build_observable,
+            nqubits=args.nqubits,
+            nlayers=args.nlayers,
+            seed=args.seed,
+        )
+    if hasattr(module, "OBSERVABLE"):
+        return module.OBSERVABLE
+    raise ValueError(
+        "No observable supplied. Define build_observable/OBSERVABLE in the case "
+        "module, or pass --pauli-pattern / --observable-json."
+    )
+
+
+def build_parallel_opts(args):
+    slicing_opts = {}
+    if args.tn_target_slices is not None:
+        slicing_opts["target_slices"] = args.tn_target_slices
+    if args.tn_target_size is not None:
+        slicing_opts["target_size"] = args.tn_target_size
+
+    opts = {
+        "slicing_opts": slicing_opts or None,
+        "search_workers": args.tn_search_workers or args.torch_threads,
+        "max_repeats": args.tn_search_repeats,
+        "max_time": args.tn_search_time,
+        "print_stats": not args.no_tn_stats,
+    }
+    if args.tn_search_backend is not None:
+        opts["search_backend"] = args.tn_search_backend
+    if args.dask_address is not None:
+        opts["dask_address"] = args.dask_address
+    if args.dask_close_workers:
+        opts["dask_close_workers"] = True
+    if args.tn_save_tree is not None:
+        opts["save_tree_path"] = args.tn_save_tree
+    if args.tn_load_tree is not None:
+        opts["load_tree_path"] = args.tn_load_tree
+    if args.tn_search_only:
+        opts["search_only"] = True
+    return opts
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Run CPU TN expectation for a custom qibo circuit module."
+    )
+    parser.add_argument("case_module", help="Python file defining build_circuit.")
+    parser.add_argument("--nqubits", type=int, required=True)
+    parser.add_argument("--nlayers", type=int, default=0)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--mpi", action="store_true")
+    parser.add_argument("--exact", action="store_true")
+    parser.add_argument("--exact-max-qubits", type=int, default=24)
+    parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024)
+    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
+    parser.add_argument("--torch-threads", type=int, default=8)
+    parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch")
+    parser.add_argument("--dtype", choices=("complex128", "complex64"), default="complex128")
+    parser.add_argument("--pauli-pattern")
+    parser.add_argument("--observable-json")
+    parser.add_argument("--tn-target-slices", type=int)
+    parser.add_argument("--tn-target-size", type=int, default=2**32)
+    parser.add_argument("--tn-search-workers", type=int)
+    parser.add_argument("--tn-search-repeats", type=int, default=128)
+    parser.add_argument("--tn-search-time", type=float, default=60.0)
+    parser.add_argument("--tn-search-backend", choices=("processpool", "dask"))
+    parser.add_argument("--dask-address")
+    parser.add_argument("--dask-close-workers", action="store_true")
+    parser.add_argument("--tn-save-tree")
+    parser.add_argument("--tn-load-tree")
+    parser.add_argument("--tn-search-only", action="store_true")
+    parser.add_argument("--no-tn-stats", action="store_true")
+    args = parser.parse_args()
+
+    rank = 0
+    if args.mpi:
+        from mpi4py import MPI
+
+        rank = MPI.COMM_WORLD.Get_rank()
+
+    module = load_module(args.case_module)
+    if not hasattr(module, "build_circuit"):
+        raise ValueError("case_module must define build_circuit.")
+
+    circuit = call_builder(
+        module.build_circuit,
+        nqubits=args.nqubits,
+        nlayers=args.nlayers,
+        seed=args.seed,
+    )
+    observable = load_observable(args, module)
+
+    config = ExpectationConfig(
+        ansatz="tn",
+        mpi=args.mpi,
+        bond=args.bond,
+        cut_ratio=args.cut_ratio,
+        tensor_module="torch",
+        quimb_backend=args.quimb_backend,
+        dtype=args.dtype,
+        torch_threads=args.torch_threads,
+        parallel_opts=build_parallel_opts(args),
+    )
+
+    if rank == 0:
+        mode = "MPI" if args.mpi else "serial"
+        print(
+            f"backend=cpu ansatz=TN mode={mode} case={Path(args.case_module).name} "
+            f"nqubits={args.nqubits} nlayers={args.nlayers} seed={args.seed} "
+            f"quimb_backend={args.quimb_backend} dtype={args.dtype} "
+            f"torch_threads={args.torch_threads}",
+            flush=True,
+        )
+        print("observable exact value abs_error rel_error seconds", flush=True)
+
+    exact = None
+    if args.exact and rank == 0:
+        if args.nqubits > args.exact_max_qubits:
+            raise ValueError(
+                f"--exact is limited to {args.exact_max_qubits} qubits by default."
+            )
+        exact = exact_for_observable(circuit, observable, args.nqubits)
+
+    result = run_cpu_expectation(circuit, observable, config)
+    if args.mpi and result.rank != 0:
+        return
+
+    abs_error = float("nan") if exact is None else abs(result.value - exact)
+    rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
+    exact_text = "nan" if exact is None else f"{exact:.16e}"
+    print(
+        f"custom {exact_text} {result.value:.16e} "
+        f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}",
+        flush=True,
+    )
+
+    for stat in result.parallel_stats or ():
+        cost = stat["path_cost"]
+        search_stats = stat.get("search_stats", {})
+        print(
+            "tn_term_summary "
+            f"term={stat.get('term_index', 0)} "
+            f"search_seconds={stat.get('search_seconds', float('nan')):.3f} "
+            f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} "
+            f"completed_trials={search_stats.get('completed_trials', 'na')} "
+            f"finite_trials={search_stats.get('finite_trials', 'na')} "
+            f"failed_trials={search_stats.get('failed_trials', 'na')} "
+            f"requested_trials={search_stats.get('requested_trials', 'na')} "
+            f"best_score={search_stats.get('best_score', float('nan')):.6g} "
+            f"slices={cost.get('slices')} "
+            f"log10_flops={cost.get('log10_flops', float('nan')):.3f} "
+            f"log10_write={cost.get('log10_write', float('nan')):.3f} "
+            f"log2_size={cost.get('log2_size', float('nan')):.3f} "
+            f"peak_memory_gib={cost.get('peak_memory_gib', float('nan')):.3g} "
+            f"rank_slices={stat.get('rank_slices')}",
+            flush=True,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/run_vidal_mpi_contest_cases.sh b/tools/run_vidal_mpi_contest_cases.sh
new file mode 100755
index 0000000..f2524e7
--- /dev/null
+++ b/tools/run_vidal_mpi_contest_cases.sh
@@ -0,0 +1,340 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Contest-style Vidal/MPI MPS cases.
+#
+# Usage:
+#   tools/run_vidal_mpi_contest_cases.sh main1
+#   tools/run_vidal_mpi_contest_cases.sh main2
+#   tools/run_vidal_mpi_contest_cases.sh strong
+#   tools/run_vidal_mpi_contest_cases.sh all
+#
+# Common overrides:
+#   PYTHON_BIN=.venv/bin/python
+#   MPIEXEC=mpiexec
+#   MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2"
+#   HOSTFILE=hostfile                 # optional; used only if the file exists
+#   RANKS=8
+#   TORCH_THREADS=8
+#   CUT_RATIO=1e-12
+#   OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0"
+#
+# Per-case overrides:
+#   MAIN1_NQ=128 MAIN1_LAYERS=50 MAIN1_BOND=1024 MAIN1_SEED=31001
+#   MAIN2_NQ=128 MAIN2_LAYERS=64 MAIN2_BOND=2048 MAIN2_SEED=31002
+#   STRONG_NQ=256 STRONG_LAYERS=64 STRONG_BOND=2048 STRONG_SEED=41001
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT_DIR"
+
+PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
+MPIEXEC="${MPIEXEC:-mpiexec}"
+HOSTFILE="${HOSTFILE:-}"
+RANKS="${RANKS:-4}"
+TORCH_THREADS="${TORCH_THREADS:-1}"
+CUT_RATIO="${CUT_RATIO:-1e-12}"
+OBS_FILTER="${OBS_FILTER:-}"
+
+RUNNER_DIR="$ROOT_DIR/.tmp"
+mkdir -p "$RUNNER_DIR"
+RUNNER="$(mktemp "$RUNNER_DIR/qibotn_vidal_contest.XXXXXX.py")"
+cleanup() {
+  rm -f "$RUNNER"
+}
+trap cleanup EXIT
+
+cat > "$RUNNER" <<'PY'
+from __future__ import annotations
+
+import argparse
+import math
+import time
+
+import numpy as np
+from mpi4py import MPI
+from qibo import Circuit, gates, hamiltonians
+from qibo.symbols import X, Y, Z
+
+from qibotn.backends.vidal import VidalBackend
+
+
+def set_torch_threads(nthreads):
+    try:
+        import torch
+
+        torch.set_num_threads(nthreads)
+    except Exception:
+        pass
+
+
+def build_circuit(kind, nqubits, nlayers, seed):
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+
+    for layer in range(nlayers):
+        for q in range(nqubits):
+            circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
+            circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
+            if kind in ("rxx_rzz", "scramble"):
+                circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi)))
+
+        if kind == "reversed_cnot":
+            for q in range(0, nqubits - 1, 2):
+                circuit.add(gates.CNOT(q + 1, q) if layer % 2 else gates.CNOT(q, q + 1))
+            for q in range(1, nqubits - 1, 2):
+                circuit.add(gates.CNOT(q + 1, q) if layer % 2 == 0 else gates.CNOT(q, q + 1))
+        elif kind == "rxx_rzz":
+            for q in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.9, 0.9)))
+                circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.9, 0.9)))
+        elif kind == "scramble":
+            for q in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.8, 0.8)))
+                circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.8, 0.8)))
+                if layer % 5 == 4:
+                    circuit.add(gates.SWAP(q, q + 1))
+        else:
+            raise ValueError(f"Unknown circuit kind {kind!r}.")
+
+    return circuit
+
+
+def ring_xz(nqubits):
+    form = 0
+    for q in range(nqubits):
+        form += 0.5 * X(q) * Z((q + 1) % nqubits)
+    return hamiltonians.SymbolicHamiltonian(form=form)
+
+
+def open_zz(nqubits):
+    form = 0
+    for q in range(nqubits - 1):
+        form += (1.0 / (nqubits - 1)) * Z(q) * Z(q + 1)
+    return hamiltonians.SymbolicHamiltonian(form=form)
+
+
+def range2_xx(nqubits):
+    form = 0
+    for q in range(nqubits - 2):
+        form += (1.0 / (nqubits - 2)) * X(q) * X(q + 2)
+    return hamiltonians.SymbolicHamiltonian(form=form)
+
+
+def dense_observable(nqubits, qubits, seed, dim):
+    rng = np.random.default_rng(seed)
+    raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim))
+    matrix = (raw + raw.conj().T) / 2.0
+    matrix = matrix / np.linalg.norm(matrix)
+    return {"matrix": matrix, "qubits": list(qubits)}
+
+
+def observables_for_case(nqubits, seed):
+    q1 = nqubits // 4
+    q2 = nqubits // 2
+    q3 = (3 * nqubits) // 4
+    last = nqubits - 1
+
+    return [
+        ("boundary_ZZ_q1", hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))),
+        ("boundary_ZZ_q2", hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))),
+        ("boundary_ZZ_q3", hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))),
+        (
+            "long_Z_5_sites",
+            hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last)),
+        ),
+        (
+            "mixed_XZYZX",
+            hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last)),
+        ),
+        ("ring_xz", ring_xz(nqubits)),
+        ("open_zz", open_zz(nqubits)),
+        ("range2_xx", range2_xx(nqubits)),
+        ("complex_iZ0", hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))),
+        ("dense2_mid", dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)),
+        ("dense3_spread", dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)),
+    ]
+
+
+def run_case(args):
+    set_torch_threads(args.torch_threads)
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    circuit = build_circuit(args.kind, args.nqubits, args.nlayers, args.seed)
+    observables = observables_for_case(args.nqubits, args.seed)
+    if args.obs_filter:
+        wanted = set(args.obs_filter.split(","))
+        observables = [(name, obs) for name, obs in observables if name in wanted]
+        if not observables:
+            raise ValueError(f"OBS_FILTER matched no observables: {args.obs_filter!r}")
+
+    if rank == 0:
+        print("=" * 88, flush=True)
+        print(
+            "case "
+            f"label={args.label} kind={args.kind} ranks={size} "
+            f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} "
+            f"bond={args.bond} cut_ratio={args.cut_ratio:g} "
+            f"torch_threads={args.torch_threads} seed={args.seed} "
+            f"obs_filter={args.obs_filter or 'all'}",
+            flush=True,
+        )
+        print(
+            "observable value seconds trunc_sum trunc_max status",
+            flush=True,
+        )
+
+    for obs_name, observable in observables:
+        backend = VidalBackend()
+        backend.configure_tn_simulation(
+            max_bond_dimension=args.bond,
+            cut_ratio=args.cut_ratio,
+            tensor_module="torch",
+            mpi_approach="CT",
+            mpi_num_procs=size,
+            fallback=False,
+        )
+
+        comm.Barrier()
+        start = time.perf_counter()
+        try:
+            value = backend.expectation(
+                circuit,
+                observable,
+                preprocess=True,
+                compile_circuit=False,
+            )
+            status = "ok"
+        except Exception as exc:  # pragma: no cover - printed for manual runs
+            value = np.nan
+            status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0]
+        seconds = time.perf_counter() - start
+
+        if rank == 0:
+            print(
+                f"{obs_name} {value!r} {seconds:.3f} "
+                f"{backend.last_truncation_error:.6e} "
+                f"{backend.last_max_truncation_error:.6e} {status}",
+                flush=True,
+            )
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--label", required=True)
+    parser.add_argument("--kind", choices=("reversed_cnot", "rxx_rzz", "scramble"), required=True)
+    parser.add_argument("--nqubits", type=int, required=True)
+    parser.add_argument("--nlayers", type=int, required=True)
+    parser.add_argument("--bond", type=int, required=True)
+    parser.add_argument("--cut-ratio", type=float, required=True)
+    parser.add_argument("--seed", type=int, required=True)
+    parser.add_argument("--torch-threads", type=int, required=True)
+    parser.add_argument("--obs-filter", default="")
+    run_case(parser.parse_args())
+
+
+if __name__ == "__main__":
+    main()
+PY
+
+if [[ -n "${MPIEXEC_FULL:-}" ]]; then
+  read -r -a mpi_prefix <<< "$MPIEXEC_FULL"
+else
+  mpi_prefix=("$MPIEXEC")
+  if [[ -n "$HOSTFILE" && -f "$HOSTFILE" ]]; then
+    mpi_prefix+=("-hostfile" "$HOSTFILE")
+  fi
+  mpi_prefix+=("-n" "$RANKS")
+fi
+
+run_case() {
+  local label="$1"
+  local kind="$2"
+  local nq="$3"
+  local layers="$4"
+  local bond="$5"
+  local seed="$6"
+
+  echo
+  echo "Running $label: kind=$kind nqubits=$nq layers=$layers bond=$bond seed=$seed"
+  echo "MPI: ${mpi_prefix[*]}"
+  "${mpi_prefix[@]}" "$PYTHON_BIN" -u "$ROOT_DIR/tools/vidal_mpi_contest_runner.py" \
+    --label "$label" \
+    --kind "$kind" \
+    --nqubits "$nq" \
+    --nlayers "$layers" \
+    --bond "$bond" \
+    --cut-ratio "$CUT_RATIO" \
+    --seed "$seed" \
+    --torch-threads "$TORCH_THREADS" \
+    --obs-filter "$(tr ' ' ',' <<< "$OBS_FILTER")"
+}
+
+case "${1:-help}" in
+  main1)
+    run_case \
+      "main1-reversed-cnot" \
+      "reversed_cnot" \
+      "${MAIN1_NQ:-128}" \
+      "${MAIN1_LAYERS:-50}" \
+      "${MAIN1_BOND:-1024}" \
+      "${MAIN1_SEED:-31001}"
+    ;;
+  main2)
+    run_case \
+      "main2-rxx-rzz" \
+      "rxx_rzz" \
+      "${MAIN2_NQ:-128}" \
+      "${MAIN2_LAYERS:-64}" \
+      "${MAIN2_BOND:-2048}" \
+      "${MAIN2_SEED:-31002}"
+    ;;
+  strong)
+    run_case \
+      "strong-scramble" \
+      "scramble" \
+      "${STRONG_NQ:-256}" \
+      "${STRONG_LAYERS:-64}" \
+      "${STRONG_BOND:-2048}" \
+      "${STRONG_SEED:-41001}"
+    ;;
+  all)
+    "$0" main1
+    "$0" main2
+    "$0" strong
+    ;;
+  smoke)
+    MAIN1_NQ="${MAIN1_NQ:-32}" \
+    MAIN1_LAYERS="${MAIN1_LAYERS:-6}" \
+    MAIN1_BOND="${MAIN1_BOND:-128}" \
+    "$0" main1
+    ;;
+  help|*)
+    cat >&2 <<'EOF'
+Usage: tools/run_vidal_mpi_contest_cases.sh [main1|main2|strong|all|smoke]
+
+Cases:
+  main1   128 qubits, 50 layers, reversed-CNOT brickwall, chi=1024
+  main2   128 qubits, 64 layers, RXX/RZZ brickwall, chi=2048
+  strong  256 qubits, 64 layers, RXX/RZZ + periodic SWAP scramble, chi=2048
+  smoke   Small syntax/runtime check of main1
+
+Common overrides:
+  PYTHON_BIN=.venv/bin/python
+  MPIEXEC=mpiexec
+  MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2"
+  HOSTFILE=hostfile
+  RANKS=8
+  TORCH_THREADS=8
+  CUT_RATIO=1e-12
+  OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0"
+
+Per-case overrides:
+  MAIN1_NQ=128 MAIN1_LAYERS=50 MAIN1_BOND=1024 MAIN1_SEED=31001
+  MAIN2_NQ=128 MAIN2_LAYERS=64 MAIN2_BOND=2048 MAIN2_SEED=31002
+  STRONG_NQ=256 STRONG_LAYERS=64 STRONG_BOND=2048 STRONG_SEED=41001
+EOF
+    exit 2
+    ;;
+esac
diff --git a/tools/slice_existing_tree.py b/tools/slice_existing_tree.py
new file mode 100644
index 0000000..4e94e9c
--- /dev/null
+++ b/tools/slice_existing_tree.py
@@ -0,0 +1,59 @@
+"""Slice an existing saved cotengra tree without re-running path search."""
+
+from __future__ import annotations
+
+import argparse
+import pickle
+from pathlib import Path
+
+from qibotn.parallel import contraction_tree_costs
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input", help="Input pickle saved by --tn-save-tree.")
+    parser.add_argument("output", help="Output pickle path.")
+    parser.add_argument("--term", type=int, default=0)
+    parser.add_argument("--target-slices", type=int, default=2)
+    parser.add_argument("--max-repeats", type=int, default=64)
+    parser.add_argument("--seed", type=int, default=42)
+    args = parser.parse_args()
+
+    input_path = Path(args.input)
+    output_path = Path(args.output)
+    with input_path.open("rb") as f:
+        payload = pickle.load(f)
+
+    trees = payload["trees"] if isinstance(payload, dict) else payload
+    if not isinstance(trees, (list, tuple)):
+        trees = [trees]
+    tree = trees[args.term]
+
+    print("original", contraction_tree_costs(tree), flush=True)
+    sliced = tree.slice(
+        target_slices=args.target_slices,
+        max_repeats=args.max_repeats,
+        seed=args.seed,
+    )
+    print("sliced", contraction_tree_costs(sliced), flush=True)
+    print(f"sliced_inds={sliced.sliced_inds}", flush=True)
+
+    new_trees = list(trees)
+    new_trees[args.term] = sliced
+
+    if isinstance(payload, dict):
+        out_payload = dict(payload)
+        out_payload["trees"] = new_trees
+        out_payload["costs"] = [contraction_tree_costs(t) for t in new_trees]
+        out_payload["nterms"] = len(new_trees)
+    else:
+        out_payload = new_trees
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with output_path.open("wb") as f:
+        pickle.dump(out_payload, f)
+    print(f"saved {output_path}", flush=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/tn_contest_runner.py b/tools/tn_contest_runner.py
new file mode 100644
index 0000000..680cecd
--- /dev/null
+++ b/tools/tn_contest_runner.py
@@ -0,0 +1,435 @@
+#!/usr/bin/env python
+"""Contest-style CPU TN path search and contraction runner.
+
+This file is intentionally self-contained: define contest circuits and
+observables here, run path search once, then load the saved trees for repeated
+MPI contractions.
+"""
+
+from __future__ import annotations
+
+import argparse
+import math
+import os
+import subprocess
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from urllib.parse import urlparse
+
+import numpy as np
+from qibo import Circuit, gates, hamiltonians
+from qibo.symbols import X, Y, Z
+
+ROOT = Path(__file__).resolve().parents[1]
+SRC = ROOT / "src"
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+from qibotn.expectation_runner import (  # noqa: E402
+    ExpectationConfig,
+    exact_for_observable,
+    run_cpu_expectation,
+)
+
+
+@dataclass(frozen=True)
+class CaseSpec:
+    circuit_kind: str
+    observables: tuple[str, ...]
+    nqubits: int
+    nlayers: int
+    seed: int
+    target_slices: int | None = None
+
+
+CASES = {
+    "main1": CaseSpec(
+        circuit_kind="rxx_rzz_chain",
+        observables=("ring_xz",),
+        nqubits=34,
+        nlayers=20,
+        seed=31001,
+        target_slices=None,
+    ),
+    "main2": CaseSpec(
+        circuit_kind="scramble_chain",
+        observables=("open_zz", "range2_xx"),
+        nqubits=36,
+        nlayers=18,
+        seed=31002,
+        target_slices=None,
+    ),
+    "strong": CaseSpec(
+        circuit_kind="reversed_cnot",
+        observables=("ring_xz", "long_z_string"),
+        nqubits=40,
+        nlayers=24,
+        seed=41001,
+        target_slices=None,
+    ),
+}
+
+
+def optional_int(text):
+    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
+        return None
+    return int(text)
+
+
+def optional_float(text):
+    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
+        return None
+    return float(text)
+
+
+def set_torch_threads(nthreads):
+    try:
+        import torch
+
+        torch.set_num_threads(nthreads)
+    except Exception:
+        pass
+
+
+def add_single_qubit_layer(circuit, nqubits, rng, include_rx=False):
+    for qubit in range(nqubits):
+        circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
+        circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
+        if include_rx:
+            circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi)))
+
+
+def build_circuit(kind, nqubits, nlayers, seed):
+    """Define contest circuits here."""
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+
+    for layer in range(nlayers):
+        if kind == "rxx_rzz_chain":
+            add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
+
+        elif kind == "scramble_chain":
+            add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
+                if layer % 5 == 4:
+                    circuit.add(gates.SWAP(qubit, qubit + 1))
+
+        elif kind == "reversed_cnot":
+            add_single_qubit_layer(circuit, nqubits, rng)
+            for qubit in range(0, nqubits - 1, 2):
+                gate = gates.CNOT(qubit + 1, qubit) if layer % 2 else gates.CNOT(qubit, qubit + 1)
+                circuit.add(gate)
+            for qubit in range(1, nqubits - 1, 2):
+                gate = gates.CNOT(qubit + 1, qubit) if layer % 2 == 0 else gates.CNOT(qubit, qubit + 1)
+                circuit.add(gate)
+
+        else:
+            raise ValueError(f"Unknown circuit kind {kind!r}.")
+
+    return circuit
+
+
+def pauli_sum_observable(kind, nqubits, seed):
+    """Define contest observables here.
+
+    TN path currently expects Pauli products / SymbolicHamiltonian terms.
+    Keep production contest observables Hermitian unless complex output is
+    explicitly required by the scoring rule.
+    """
+    del seed
+    if kind == "ring_xz":
+        form = 0
+        for qubit in range(nqubits):
+            form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+
+    if kind == "open_zz":
+        form = 0
+        for qubit in range(nqubits - 1):
+            form += (1.0 / max(1, nqubits - 1)) * Z(qubit) * Z(qubit + 1)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+
+    if kind == "range2_xx":
+        form = 0
+        for qubit in range(nqubits - 2):
+            form += (1.0 / max(1, nqubits - 2)) * X(qubit) * X(qubit + 2)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+
+    if kind == "long_z_string":
+        stride = max(1, nqubits // 16)
+        form = None
+        for qubit in range(0, nqubits, stride):
+            form = Z(qubit) if form is None else form * Z(qubit)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+
+    if kind == "mixed_local":
+        q1 = nqubits // 4
+        q2 = nqubits // 2
+        q3 = (3 * nqubits) // 4
+        form = 0.25 * X(0) - 0.5 * Z(nqubits - 1)
+        form += 0.125 * X(q1) * Z(q2) * Y(q3)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+
+    raise ValueError(f"Unknown observable kind {kind!r}.")
+
+
+def tree_path(tree_dir, case_name, obs_name, nqubits, nlayers, target_slices):
+    slice_label = "auto" if target_slices is None else f"s{target_slices}"
+    return (
+        Path(tree_dir)
+        / f"{case_name}_{obs_name}_{nqubits}q{nlayers}l_{slice_label}.pkl"
+    )
+
+
+def build_parallel_opts(args, tree_file=None, search_only=False):
+    slicing_opts = {}
+    if args.tn_target_slices is not None:
+        slicing_opts["target_slices"] = args.tn_target_slices
+    if args.tn_target_size is not None:
+        slicing_opts["target_size"] = args.tn_target_size
+
+    opts = {
+        "slicing_opts": slicing_opts or None,
+        "search_workers": args.tn_search_workers or args.torch_threads,
+        "max_repeats": args.tn_search_repeats,
+        "max_time": args.tn_search_time,
+        "print_stats": not args.no_tn_stats,
+    }
+    if args.tn_search_backend is not None:
+        opts["search_backend"] = args.tn_search_backend
+    if args.dask_address is not None:
+        opts["dask_address"] = args.dask_address
+    if args.dask_close_workers:
+        opts["dask_close_workers"] = True
+    if args.tn_debug_trials:
+        opts["debug_trials"] = True
+    if search_only:
+        opts["search_only"] = True
+        opts["save_tree_path"] = str(tree_file)
+    elif tree_file is not None:
+        opts["load_tree_path"] = str(tree_file)
+    return opts
+
+
+def run_one(args, case_name, obs_name, mode):
+    case = CASES[case_name]
+    circuit = build_circuit(case.circuit_kind, args.nqubits, args.nlayers, args.seed)
+    observable = pauli_sum_observable(obs_name, args.nqubits, args.seed)
+    path = tree_path(
+        args.tree_dir,
+        case_name,
+        obs_name,
+        args.nqubits,
+        args.nlayers,
+        args.tn_target_slices,
+    )
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    rank = 0
+    if args.mpi:
+        from mpi4py import MPI
+
+        rank = MPI.COMM_WORLD.Get_rank()
+
+    if rank == 0:
+        print("=" * 88, flush=True)
+        print(
+            f"mode={mode} case={case_name} circuit={case.circuit_kind} "
+            f"observable={obs_name} nqubits={args.nqubits} nlayers={args.nlayers} "
+            f"seed={args.seed} gates={len(circuit.queue)} tree={path}",
+            flush=True,
+        )
+
+    if mode == "contract" and not path.exists():
+        raise FileNotFoundError(f"Missing tree file: {path}. Run search first.")
+
+    exact = None
+    if args.exact and rank == 0 and mode != "search":
+        if args.nqubits > args.exact_max_qubits:
+            raise ValueError(
+                f"--exact is limited to {args.exact_max_qubits} qubits by default."
+            )
+        exact = exact_for_observable(circuit, observable, args.nqubits)
+
+    config = ExpectationConfig(
+        ansatz="tn",
+        mpi=args.mpi,
+        bond=args.bond,
+        cut_ratio=args.cut_ratio,
+        tensor_module="torch",
+        quimb_backend=args.quimb_backend,
+        dtype=args.dtype,
+        torch_threads=args.torch_threads,
+        parallel_opts=build_parallel_opts(
+            args,
+            tree_file=path,
+            search_only=(mode == "search"),
+        ),
+    )
+    result = run_cpu_expectation(circuit, observable, config)
+    if args.mpi and result.rank != 0:
+        return
+
+    if mode == "search":
+        print(f"searched observable={obs_name} tree={path}", flush=True)
+    else:
+        abs_error = float("nan") if exact is None else abs(result.value - exact)
+        rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
+        exact_text = "nan" if exact is None else f"{exact:.16e}"
+        print(
+            f"result observable={obs_name} exact={exact_text} "
+            f"value={result.value:.16e} abs_error={abs_error:.6e} "
+            f"rel_error={rel_error:.6e} seconds={result.seconds:.3f}",
+            flush=True,
+        )
+
+    for stat in result.parallel_stats or ():
+        cost = stat["path_cost"]
+        search_stats = stat.get("search_stats", {})
+        print(
+            "tn_term_summary "
+            f"observable={obs_name} "
+            f"term={stat.get('term_index', 0)} "
+            f"search_seconds={stat.get('search_seconds', float('nan')):.3f} "
+            f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} "
+            f"completed_trials={search_stats.get('completed_trials', 'na')} "
+            f"finite_trials={search_stats.get('finite_trials', 'na')} "
+            f"failed_trials={search_stats.get('failed_trials', 'na')} "
+            f"requested_trials={search_stats.get('requested_trials', 'na')} "
+            f"best_score={search_stats.get('best_score', float('nan')):.6g} "
+            f"slices={cost.get('slices')} "
+            f"log10_flops={cost.get('log10_flops', float('nan')):.3f} "
+            f"log10_write={cost.get('log10_write', float('nan')):.3f} "
+            f"log2_size={cost.get('log2_size', float('nan')):.3f} "
+            f"peak_memory_gib={cost.get('peak_memory_gib', float('nan')):.3g} "
+            f"rank_slices={stat.get('rank_slices')}",
+            flush=True,
+        )
+
+
+def selected_observables(args, case):
+    if args.observables:
+        return tuple(args.observables)
+    if args.obs_filter:
+        return tuple(x.strip() for x in args.obs_filter.split(",") if x.strip())
+    return case.observables
+
+
+def apply_case_defaults(args):
+    case = CASES[args.case]
+    if args.nqubits is None:
+        args.nqubits = case.nqubits
+    if args.nlayers is None:
+        args.nlayers = case.nlayers
+    if args.seed is None:
+        args.seed = case.seed
+    if args.tn_target_slices is None:
+        args.tn_target_slices = case.target_slices
+    args.observables = selected_observables(args, case)
+
+
+def stop_dask_cluster(args):
+    if args.keep_dask or args.tn_search_backend != "dask" or not args.dask_address:
+        return
+    script = ROOT / "tools" / "manage_tn_dask_cluster.sh"
+    if not script.exists():
+        print(f"dask_stop_skipped reason=missing_script path={script}", flush=True)
+        return
+
+    env = os.environ.copy()
+    parsed = urlparse(args.dask_address)
+    if parsed.hostname:
+        env.setdefault("SCHEDULER_HOST", parsed.hostname)
+    if parsed.port:
+        env.setdefault("SCHEDULER_PORT", str(parsed.port))
+
+    print("dask_stop_after_search start", flush=True)
+    subprocess.run([str(script), "stop"], cwd=str(ROOT), env=env, check=False)
+    print("dask_stop_after_search done", flush=True)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("mode", choices=("search", "contract", "all", "validate", "list"))
+    parser.add_argument("--case", choices=sorted(CASES), default="main1")
+    parser.add_argument("--observables", nargs="+")
+    parser.add_argument("--obs-filter", default="")
+    parser.add_argument("--tree-dir", default="trees/contest_tn")
+    parser.add_argument("--nqubits", type=int)
+    parser.add_argument("--nlayers", type=int)
+    parser.add_argument("--seed", type=int)
+    parser.add_argument("--mpi", action="store_true")
+    parser.add_argument("--exact", action="store_true")
+    parser.add_argument("--exact-max-qubits", type=int, default=24)
+    parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024)
+    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
+    parser.add_argument("--torch-threads", type=int, default=8)
+    parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch")
+    parser.add_argument("--dtype", choices=("complex128", "complex64"), default="complex64")
+    parser.add_argument("--tn-target-slices", type=int)
+    parser.add_argument("--tn-target-size", type=int, default=2**32)
+    parser.add_argument("--tn-search-workers", type=int)
+    parser.add_argument("--tn-search-repeats", type=int, default=2048)
+    parser.add_argument("--tn-search-time", type=float, default=300.0)
+    parser.add_argument(
+        "--tn-search-backend",
+        choices=("processpool", "dask"),
+        default="dask",
+        help=(
+            "Path-search backend. Defaults to dask. Without --dask-address, "
+            "non-MPI search starts a local dask cluster."
+        ),
+    )
+    parser.add_argument("--dask-address")
+    parser.add_argument("--dask-close-workers", action="store_true")
+    parser.add_argument(
+        "--keep-dask",
+        action="store_true",
+        help=(
+            "Keep an external dask cluster running after search. By default, "
+            "tools/manage_tn_dask_cluster.sh stop is called after search when "
+            "--dask-address is used."
+        ),
+    )
+    parser.add_argument(
+        "--tn-debug-trials",
+        action="store_true",
+        help="Print dask worker summary and per-trial start/done logs.",
+    )
+    parser.add_argument("--no-tn-stats", action="store_true")
+    args = parser.parse_args()
+
+    if args.mode == "list":
+        for name, case in CASES.items():
+            print(
+                f"{name}: circuit={case.circuit_kind} "
+                f"observables={','.join(case.observables)} "
+                f"nqubits={case.nqubits} nlayers={case.nlayers} "
+                f"seed={case.seed} target_slices={case.target_slices}"
+            )
+        return
+
+    apply_case_defaults(args)
+    set_torch_threads(args.torch_threads)
+
+    modes = ("search", "contract") if args.mode == "all" else (args.mode,)
+    if args.mode == "validate":
+        args.exact = True
+        args.nqubits = min(args.nqubits, args.exact_max_qubits)
+        modes = ("search", "contract")
+
+    for mode in modes:
+        for obs_name in args.observables:
+            run_one(args, args.case, obs_name, mode)
+        if mode == "search":
+            stop_dask_cluster(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/torch_profile_tn_complex64.py b/tools/torch_profile_tn_complex64.py
new file mode 100644
index 0000000..b7392f9
--- /dev/null
+++ b/tools/torch_profile_tn_complex64.py
@@ -0,0 +1,114 @@
+"""Run the 34q/20L TN complex64 benchmark under torch.profiler briefly."""
+
+from __future__ import annotations
+
+import argparse
+import os
+import signal
+import sys
+from pathlib import Path
+
+from mpi4py import MPI
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--seconds", type=float, default=30.0)
+    parser.add_argument("--out-dir", default="torch_profiles/tn_complex64")
+    parser.add_argument("--torch-threads", type=int, default=48)
+    args = parser.parse_args()
+
+    repo_root = Path(__file__).resolve().parents[1]
+    os.chdir(repo_root)
+    sys.path.insert(0, str(repo_root))
+
+    import torch
+    from torch.profiler import ProfilerActivity, profile
+
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    out_dir = Path(args.out_dir)
+    if rank == 0:
+        out_dir.mkdir(parents=True, exist_ok=True)
+    comm.Barrier()
+
+    torch.set_num_threads(args.torch_threads)
+
+    def run_benchmark():
+        import benchmark_cpu_expectation
+
+        sys.argv = [
+            "benchmark_cpu_expectation.py",
+            "--mpi",
+            "--ansatz",
+            "tn",
+            "--nqubits",
+            "34",
+            "--nlayers",
+            "20",
+            "--circuits",
+            "rxx_rzz",
+            "--pauli-pattern",
+            "XZ",
+            "--tn-load-tree",
+            "trees/rxx_rzz_34q20l_s4.pkl",
+            "--quimb-backend",
+            "torch",
+            "--torch-threads",
+            str(args.torch_threads),
+            "--dtype",
+            "complex64",
+        ]
+        benchmark_cpu_expectation.main()
+
+    trace_path = out_dir / f"rank{rank}_trace.json"
+    stacks_path = out_dir / f"rank{rank}_stacks.txt"
+    summary_path = out_dir / f"rank{rank}_summary.txt"
+
+    prof = profile(
+        activities=[ProfilerActivity.CPU],
+        record_shapes=True,
+        profile_memory=True,
+        with_stack=True,
+    )
+
+    class ProfileTimeout(Exception):
+        pass
+
+    def alarm_handler(signum, frame):
+        raise ProfileTimeout()
+
+    old_handler = signal.signal(signal.SIGALRM, alarm_handler)
+    signal.setitimer(signal.ITIMER_REAL, args.seconds)
+    try:
+        with prof:
+            try:
+                run_benchmark()
+            except ProfileTimeout:
+                pass
+    finally:
+        signal.setitimer(signal.ITIMER_REAL, 0)
+        signal.signal(signal.SIGALRM, old_handler)
+
+    prof.export_chrome_trace(str(trace_path))
+    try:
+        prof.export_stacks(str(stacks_path), "self_cpu_time_total")
+    except Exception as exc:  # pragma: no cover - diagnostic only
+        stacks_path.write_text(f"export_stacks failed: {exc}\n", encoding="utf-8")
+
+    summary = prof.key_averages(group_by_stack_n=5).table(
+        sort_by="self_cpu_time_total",
+        row_limit=40,
+    )
+    summary_path.write_text(summary, encoding="utf-8")
+
+    print(
+        f"torch_profile_done rank={rank}/{size} "
+        f"trace={trace_path} summary={summary_path}",
+        flush=True,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/vidal_mpi_contest_runner.py b/tools/vidal_mpi_contest_runner.py
new file mode 100644
index 0000000..405f47c
--- /dev/null
+++ b/tools/vidal_mpi_contest_runner.py
@@ -0,0 +1,209 @@
+from __future__ import annotations
+
+import argparse
+import math
+import time
+
+import numpy as np
+from mpi4py import MPI
+from qibo import Circuit, gates, hamiltonians
+from qibo.symbols import X, Y, Z
+
+from qibotn.backends.vidal import VidalBackend
+
+
+def optional_int(text):
+    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
+        return None
+    return int(text)
+
+
+def optional_float(text):
+    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
+        return None
+    return float(text)
+
+
+def format_optional(value, fmt="g"):
+    return "None" if value is None else format(value, fmt)
+
+
+def set_torch_threads(nthreads):
+    try:
+        import torch
+
+        torch.set_num_threads(nthreads)
+    except Exception:
+        pass
+
+
+def build_circuit(kind, nqubits, nlayers, seed):
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+
+    for layer in range(nlayers):
+        for q in range(nqubits):
+            circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
+            circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
+            if kind in ("rxx_rzz", "scramble"):
+                circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi)))
+
+        if kind == "reversed_cnot":
+            for q in range(0, nqubits - 1, 2):
+                circuit.add(gates.CNOT(q + 1, q) if layer % 2 else gates.CNOT(q, q + 1))
+            for q in range(1, nqubits - 1, 2):
+                circuit.add(gates.CNOT(q + 1, q) if layer % 2 == 0 else gates.CNOT(q, q + 1))
+        elif kind == "rxx_rzz":
+            for q in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.9, 0.9)))
+                circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.9, 0.9)))
+        elif kind == "scramble":
+            for q in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.8, 0.8)))
+                circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.8, 0.8)))
+                if layer % 5 == 4:
+                    circuit.add(gates.SWAP(q, q + 1))
+        else:
+            raise ValueError(f"Unknown circuit kind {kind!r}.")
+
+    return circuit
+
+
+def ring_xz(nqubits):
+    form = 0
+    for q in range(nqubits):
+        form += 0.5 * X(q) * Z((q + 1) % nqubits)
+    return hamiltonians.SymbolicHamiltonian(form=form)
+
+
+def open_zz(nqubits):
+    form = 0
+    for q in range(nqubits - 1):
+        form += (1.0 / (nqubits - 1)) * Z(q) * Z(q + 1)
+    return hamiltonians.SymbolicHamiltonian(form=form)
+
+
+def range2_xx(nqubits):
+    form = 0
+    for q in range(nqubits - 2):
+        form += (1.0 / (nqubits - 2)) * X(q) * X(q + 2)
+    return hamiltonians.SymbolicHamiltonian(form=form)
+
+
+def dense_observable(nqubits, qubits, seed, dim):
+    rng = np.random.default_rng(seed)
+    raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim))
+    matrix = (raw + raw.conj().T) / 2.0
+    matrix = matrix / np.linalg.norm(matrix)
+    return {"matrix": matrix, "qubits": list(qubits)}
+
+
+def observables_for_case(nqubits, seed):
+    q1 = nqubits // 4
+    q2 = nqubits // 2
+    q3 = (3 * nqubits) // 4
+    last = nqubits - 1
+
+    return [
+        ("boundary_ZZ_q1", hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))),
+        ("boundary_ZZ_q2", hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))),
+        ("boundary_ZZ_q3", hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))),
+        (
+            "long_Z_5_sites",
+            hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last)),
+        ),
+        (
+            "mixed_XZYZX",
+            hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last)),
+        ),
+        ("ring_xz", ring_xz(nqubits)),
+        ("open_zz", open_zz(nqubits)),
+        ("range2_xx", range2_xx(nqubits)),
+        ("complex_iZ0", hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))),
+        ("dense2_mid", dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)),
+        ("dense3_spread", dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)),
+    ]
+
+
+def run_case(args):
+    set_torch_threads(args.torch_threads)
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    circuit = build_circuit(args.kind, args.nqubits, args.nlayers, args.seed)
+    observables = observables_for_case(args.nqubits, args.seed)
+    if args.obs_filter:
+        wanted = set(args.obs_filter.split(","))
+        observables = [(name, obs) for name, obs in observables if name in wanted]
+        if not observables:
+            raise ValueError(f"OBS_FILTER matched no observables: {args.obs_filter!r}")
+
+    if rank == 0:
+        print("=" * 88, flush=True)
+        print(
+            "case "
+            f"label={args.label} kind={args.kind} ranks={size} "
+            f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} "
+            f"bond={format_optional(args.bond)} "
+            f"cut_ratio={format_optional(args.cut_ratio)} "
+            f"torch_threads={args.torch_threads} seed={args.seed} "
+            f"obs_filter={args.obs_filter or 'all'}",
+            flush=True,
+        )
+        print(
+            "observable value seconds trunc_sum trunc_max status",
+            flush=True,
+        )
+
+    for obs_name, observable in observables:
+        backend = VidalBackend()
+        backend.configure_tn_simulation(
+            max_bond_dimension=args.bond,
+            cut_ratio=args.cut_ratio,
+            tensor_module="torch",
+            mpi_approach="CT",
+            mpi_num_procs=size,
+            fallback=False,
+        )
+
+        comm.Barrier()
+        start = time.perf_counter()
+        try:
+            value = backend.expectation(
+                circuit,
+                observable,
+                preprocess=True,
+                compile_circuit=False,
+            )
+            status = "ok"
+        except Exception as exc:  # pragma: no cover - printed for manual runs
+            value = np.nan
+            status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0]
+        seconds = time.perf_counter() - start
+
+        if rank == 0:
+            print(
+                f"{obs_name} {value!r} {seconds:.3f} "
+                f"{backend.last_truncation_error:.6e} "
+                f"{backend.last_max_truncation_error:.6e} {status}",
+                flush=True,
+            )
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--label", required=True)
+    parser.add_argument("--kind", choices=("reversed_cnot", "rxx_rzz", "scramble"), required=True)
+    parser.add_argument("--nqubits", type=int, required=True)
+    parser.add_argument("--nlayers", type=int, required=True)
+    parser.add_argument("--bond", type=optional_int, required=True)
+    parser.add_argument("--cut-ratio", type=optional_float, required=True)
+    parser.add_argument("--seed", type=int, required=True)
+    parser.add_argument("--torch-threads", type=int, required=True)
+    parser.add_argument("--obs-filter", default="")
+    run_case(parser.parse_args())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl b/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl
new file mode 100644
index 0000000..76eeedd
Binary files /dev/null and b/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl differ
diff --git a/trees/contest_tn/main1_ring_xz_8q2l_s1.pkl b/trees/contest_tn/main1_ring_xz_8q2l_s1.pkl
new file mode 100644
index 0000000..f2180f4
Binary files /dev/null and b/trees/contest_tn/main1_ring_xz_8q2l_s1.pkl differ
diff --git a/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_auto.pkl b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_auto.pkl
new file mode 100644
index 0000000..9c221b7
Binary files /dev/null and b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_auto.pkl differ
diff --git a/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_repeat192.pkl b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_repeat192.pkl
new file mode 100644
index 0000000..34f7300
Binary files /dev/null and b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_repeat192.pkl differ
diff --git a/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_timeout_stop.pkl b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_timeout_stop.pkl
new file mode 100644
index 0000000..dd1b7a6
Binary files /dev/null and b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_timeout_stop.pkl differ
diff --git a/trees/rxx_rzz_30q20l.pkl b/trees/rxx_rzz_30q20l.pkl
new file mode 100644
index 0000000..ea1d3e3
Binary files /dev/null and b/trees/rxx_rzz_30q20l.pkl differ
diff --git a/trees/rxx_rzz_30q20l_from_existing_s2_check.pkl b/trees/rxx_rzz_30q20l_from_existing_s2_check.pkl
new file mode 100644
index 0000000..cee2158
Binary files /dev/null and b/trees/rxx_rzz_30q20l_from_existing_s2_check.pkl differ
diff --git a/trees/rxx_rzz_30q20l_from_existing_s4.pkl b/trees/rxx_rzz_30q20l_from_existing_s4.pkl
new file mode 100644
index 0000000..cee2158
Binary files /dev/null and b/trees/rxx_rzz_30q20l_from_existing_s4.pkl differ
diff --git a/trees/rxx_rzz_34q20l_s4.pkl b/trees/rxx_rzz_34q20l_s4.pkl
new file mode 100644
index 0000000..f09fef0
Binary files /dev/null and b/trees/rxx_rzz_34q20l_s4.pkl differ