diff --git a/bench_profile.py b/bench_profile.py
deleted file mode 100644
index 26f8ac7..0000000
--- a/bench_profile.py
+++ /dev/null
@@ -1,460 +0,0 @@
-"""Benchmark: qibotn/quimb generic TN — single-process torch profiling version."""
-
-import os
-import pickle
-import time
-import argparse
-import numpy as np
-import cotengra as ctg
-import qibo
-from qibo import Circuit, gates
-
-
-def make_circuit(circuit_type, nqubits, nlayers=1):
-    c = Circuit(nqubits)
-
-    if circuit_type == "qft":
-        from qibo.models import QFT
-        return QFT(nqubits)
-
-    elif circuit_type == "variational":
-        for layer in range(nlayers):
-            for q in range(nqubits):
-                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
-
-            offset = layer % 2
-            for q in range(offset, nqubits - 1, 2):
-                c.add(gates.CZ(q, q + 1))
-
-    elif circuit_type == "ghz":
-        c.add(gates.H(0))
-        for q in range(nqubits - 1):
-            c.add(gates.CNOT(q, q + 1))
-
-    elif circuit_type == "brickwork":
-        for q in range(nqubits):
-            c.add(gates.H(q))
-
-        for layer in range(nlayers):
-            offset = layer % 2
-            for q in range(offset, nqubits - 1, 2):
-                c.add(gates.CNOT(q, q + 1))
-                c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi)))
-                c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi)))
-
-    else:
-        raise ValueError(f"Unknown circuit: {circuit_type}")
-
-    return c
-
-
-def make_z_observable(nqubits):
-    """Z on qubit 0 only — single contraction for benchmarking."""
-    return ["z"], [(0,)], [1.0]
-
-
-def export_profiler_outputs(prof, trace_path):
-    """Export Chrome trace and text table."""
-    prof.export_chrome_trace(trace_path)
-
-    table_path = trace_path.replace(".json", ".txt")
-    with open(table_path, "w") as f:
-        f.write(
-            prof.key_averages().table(
-                sort_by="self_cpu_time_total",
-                row_limit=200,
-            )
-        )
-
-    print(f"  [torch profiler trace] {trace_path}")
-    print(f"  [torch profiler table] {table_path}")
-
-
-def run_quimb_tn(
-    circuit,
-    nqubits,
-    num_slices,
-    load_path=None,
-    save_path=None,
-):
-    """Mode: expval — compute <Z_0> via local_expectation."""
-    qibo.set_backend("qibotn", platform="quimb")
-    b = qibo.get_backend()
-    b.configure_tn_simulation(ansatz="tn")
-
-    operators, sites, coeffs = make_z_observable(nqubits)
-    ops = b._string_to_quimb_operator(operators[0])
-
-    qc = b._qibo_circuit_to_quimb(
-        circuit,
-        quimb_circuit_type=b.circuit_ansatz,
-        gate_opts={"max_bond": None, "cutoff": 1e-10},
-    )
-
-    if load_path:
-        with open(load_path, "rb") as f:
-            saved = pickle.load(f)
-        tree = saved["tree"]
-        t_search = 0.0
-        print(f"  [path loaded]  {load_path}")
-    else:
-        opt = ctg.HyperOptimizer(
-            methods=["kahypar", "random-greedy", "spinglass"],
-            max_repeats=16,
-            parallel=True,
-            max_time=60,
-            slicing_opts={"target_slices": num_slices},
-            progbar=True,
-        )
-
-        t0 = time.time()
-        rehearsal = qc.local_expectation(
-            ops,
-            where=sites[0],
-            optimize=opt,
-            simplify_sequence="R",
-            rehearse=True,
-        )
-        t_search = time.time() - t0
-        tree = rehearsal["tree"]
-
-        print(
-            f"  [path search]  {t_search:.3f}s  "
-            f"flops~2^{tree.contraction_cost():.2f}  "
-            f"size~2^{tree.contraction_width():.2f}  "
-            f"slices={tree.multiplicity}"
-        )
-
-        if save_path:
-            with open(save_path, "wb") as f:
-                pickle.dump({"tree": tree}, f)
-            print(f"  [path saved]  {save_path}")
-
-    t0 = time.time()
-    expval = qc.local_expectation(
-        ops,
-        where=sites[0],
-        optimize=tree,
-        simplify_sequence="R",
-    )
-    t_contract = time.time() - t0
-
-    print(f"  [contraction]  {t_contract:.3f}s")
-
-    return float(expval.real), t_search + t_contract
-
-
-def run_quimb_tn_statevector(
-    circuit,
-    nqubits,
-    num_slices,
-    load_path=None,
-    save_path=None,
-    profile=False,
-    profile_dir="profiles",
-):
-    """Mode: statevector — contract full TN to dense vector, single process."""
-    qibo.set_backend("qibotn", platform="quimb")
-    b = qibo.get_backend()
-    b.configure_tn_simulation(ansatz="tn")
-
-    import torch
-
-    qc = b._qibo_circuit_to_quimb(
-        circuit,
-        quimb_circuit_type=b.circuit_ansatz,
-        gate_opts={"max_bond": None, "cutoff": 1e-10},
-    )
-
-    # 让 quimb 生成 torch tensor，这样 torch.profiler 能抓到 aten op。
-    qc.to_backend = torch.from_numpy
-
-    if load_path:
-        with open(load_path, "rb") as f:
-            saved = pickle.load(f)
-        tree = saved["tree"]
-        t_search = 0.0
-        print(f"  [path loaded]  {load_path}")
-    else:
-        opt = ctg.HyperOptimizer(
-            methods=["kahypar", "random-greedy", "spinglass"],
-            max_repeats=500,
-            parallel=48,
-            max_time=100,
-            minimize="size",
-            slicing_opts={"target_slices": num_slices},
-            progbar=True,
-        )
-
-        t0 = time.time()
-        rehearsal = qc.to_dense(optimize=opt, rehearse=True)
-        t_search = time.time() - t0
-        tree = rehearsal["tree"]
-
-        print(
-            f"  [path search]  {t_search:.3f}s  "
-            f"flops~2^{tree.contraction_cost():.2f}  "
-            f"size~2^{tree.contraction_width():.2f}  "
-            f"slices={tree.multiplicity}"
-        )
-
-        if save_path:
-            with open(save_path, "wb") as f:
-                pickle.dump({"tree": tree}, f)
-            print(f"  [path saved]  {save_path}")
-
-    os.makedirs(profile_dir, exist_ok=True)
-
-    if profile:
-        from torch.profiler import profile as torch_profile
-        from torch.profiler import ProfilerActivity, record_function
-
-        trace_path = os.path.join(
-            profile_dir,
-            (
-                f"trace_statevector_"
-                f"{circuit.nqubits}q_"
-                f"slices{tree.multiplicity}_"
-                f"{int(time.time())}.json"
-            ),
-        )
-
-        t0 = time.time()
-
-        with torch_profile(
-            activities=[ProfilerActivity.CPU],
-            record_shapes=True,
-            profile_memory=True,
-            with_stack=True,
-        ) as prof:
-            with record_function("qibotn_to_dense_contraction"):
-                sv = qc.to_dense(optimize=tree).reshape(-1)
-
-            with record_function("torch_to_numpy_view_or_copy"):
-                if type(sv).__module__.startswith("torch"):
-                    sv_tn = sv.detach().cpu().numpy()
-                else:
-                    sv_tn = np.asarray(sv)
-
-        t_contract = time.time() - t0
-
-        export_profiler_outputs(prof, trace_path)
-
-    else:
-        t0 = time.time()
-        sv = qc.to_dense(optimize=tree).reshape(-1)
-        t_contract = time.time() - t0
-
-        if type(sv).__module__.startswith("torch"):
-            sv_tn = sv.detach().cpu().numpy()
-        else:
-            sv_tn = np.asarray(sv)
-
-    print(f"  [contraction]  {t_contract:.3f}s")
-
-    return sv_tn, t_search + t_contract
-
-
-def run_quimb_tn_samples(circuit, nshots=1024):
-    """Mode: samples — sample from circuit output distribution."""
-    qibo.set_backend("qibotn", platform="quimb")
-    b = qibo.get_backend()
-    b.configure_tn_simulation(ansatz="tn")
-
-    t0 = time.time()
-    result = b.execute_circuit(circuit, nshots=nshots)
-    t_total = time.time() - t0
-
-    print(f"  [sampling]     {t_total:.3f}s  nshots={nshots}")
-
-    try:
-        freqs = result.frequencies()
-        print(f"  top states: {dict(list(freqs.items())[:5])}")
-    except Exception:
-        pass
-
-    return result, t_total
-
-
-def qibojit_expval(circuit, nqubits):
-    """Compute <Z_0> via qibojit statevector."""
-    qibo.set_backend("qibojit", platform="numba")
-
-    t0 = time.time()
-    result = circuit()
-    elapsed = time.time() - t0
-
-    sv = np.array(result.state(), dtype=complex).flatten()
-    probs = np.abs(sv) ** 2
-
-    bits = (np.arange(len(probs)) >> (nqubits - 1)) & 1
-    expval = float(np.dot(probs, 1 - 2 * bits))
-
-    return expval, elapsed
-
-
-def run_qibojit(circuit):
-    """Compute full statevector via qibojit."""
-    qibo.set_backend("qibojit", platform="numba")
-
-    t0 = time.time()
-    result = circuit()
-    elapsed = time.time() - t0
-
-    sv = np.array(result.state(), dtype=complex).flatten()
-
-    return sv, elapsed
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument("--nqubits", type=int, default=10)
-    parser.add_argument(
-        "--circuit",
-        type=str,
-        default="qft",
-        choices=["qft", "variational", "ghz", "brickwork"],
-    )
-    parser.add_argument("--nlayers", type=int, default=3)
-    parser.add_argument("--num-slices", type=int, default=1)
-    parser.add_argument("--nshots", type=int, default=1024)
-
-    parser.add_argument(
-        "--mode",
-        type=str,
-        default="statevector",
-        choices=["expval", "statevector", "samples"],
-        help="expval: local_expectation; statevector: to_dense; samples: sampling",
-    )
-
-    parser.add_argument(
-        "--no-compare",
-        action="store_true",
-        help="Skip qibojit reference run",
-    )
-
-    parser.add_argument(
-        "--save-path",
-        type=str,
-        default=None,
-        help="Save contraction tree to a pickle file",
-    )
-
-    parser.add_argument(
-        "--load-path",
-        type=str,
-        default=None,
-        help="Load contraction tree from a pickle file and skip path search",
-    )
-
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="Enable torch profiler for statevector contraction stage",
-    )
-
-    parser.add_argument(
-        "--profile-dir",
-        type=str,
-        default="profiles",
-        help="Directory to save torch profiler traces",
-    )
-
-    parser.add_argument(
-        "--save-statevector",
-        action="store_true",
-        help="Save TN statevector to data/sv_tn_*.npy",
-    )
-
-    args = parser.parse_args()
-
-    print(
-        f"Circuit: {args.circuit}, "
-        f"nqubits={args.nqubits}, "
-        f"nlayers={args.nlayers}, "
-        f"mode={args.mode}, "
-        f"profile={args.profile}"
-    )
-
-    np.random.seed(42)
-    circuit_tn = make_circuit(args.circuit, args.nqubits, args.nlayers)
-
-    try:
-        if args.mode == "expval":
-            expval_tn, t_tn = run_quimb_tn(
-                circuit_tn,
-                args.nqubits,
-                args.num_slices,
-                load_path=args.load_path,
-                save_path=args.save_path,
-            )
-
-            print(f"\n[quimb TN]  time={t_tn:.4f}s  <Z_0>={expval_tn:.8f}")
-
-        elif args.mode == "statevector":
-            sv_tn, t_tn = run_quimb_tn_statevector(
-                circuit_tn,
-                args.nqubits,
-                args.num_slices,
-                load_path=args.load_path,
-                save_path=args.save_path,
-                profile=args.profile,
-                profile_dir=args.profile_dir,
-            )
-
-            print(
-                f"\n[quimb TN]  time={t_tn:.4f}s  "
-                f"statevector shape={sv_tn.shape}"
-            )
-
-            if args.save_statevector:
-                os.makedirs("data", exist_ok=True)
-                out_path = f"data/sv_tn_{args.circuit}{args.nqubits}.npy"
-                np.save(out_path, sv_tn)
-                print(f"[saved statevector] {out_path}")
-
-        else:
-            _, t_tn = run_quimb_tn_samples(
-                circuit_tn,
-                nshots=args.nshots,
-            )
-
-            print(f"\n[quimb TN]  time={t_tn:.4f}s")
-            args.no_compare = True
-
-    except Exception as e:
-        print(f"[quimb TN] FAILED: {e}")
-        raise
-
-    if not args.no_compare and args.mode != "statevector":
-        np.random.seed(42)
-        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
-
-        expval_ref, t_ref = qibojit_expval(circuit_ref, args.nqubits)
-
-        print(f"[qibojit]   time={t_ref:.4f}s  <Z_0>={expval_ref:.8f}")
-        print(f"\nDiff     : {abs(expval_tn - expval_ref):.2e}")
-
-        if t_tn > 0:
-            print(f"Speedup  : {t_ref / t_tn:.2f}x")
-
-    elif not args.no_compare and args.mode == "statevector" and sv_tn is not None:
-        np.random.seed(42)
-        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
-
-        sv_ref, t_ref = run_qibojit(circuit_ref)
-
-        fid = abs(np.dot(sv_ref.conj(), sv_tn)) ** 2
-        l2_err = np.linalg.norm(sv_ref - sv_tn)
-
-        print(f"[qibojit]   time={t_ref:.4f}s")
-        print(f"Fidelity : {fid:.8f}  (1=perfect)")
-        print(f"L2 error : {l2_err:.2e}")
-
-        if t_tn > 0:
-            print(f"Speedup  : {t_ref / t_tn:.2f}x")
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/benchmark_expectation.py b/benchmark_expectation.py
new file mode 100644
index 0000000..4c70ecf
--- /dev/null
+++ b/benchmark_expectation.py
@@ -0,0 +1,56 @@
+"""Expectation value benchmark with timing breakdown."""
+import time
+import numpy as np
+import torch
+import qibo
+from qibo import Circuit, gates, hamiltonians
+from qibo.symbols import X, Z
+import quimb as qu
+from qibotn.parallel import parallel_path_search
+
+# Config
+NQUBITS = 25
+NLAYERS = 10
+WORKERS = 96
+
+# Build circuit
+np.random.seed(42)
+circuit = Circuit(NQUBITS)
+for _ in range(NLAYERS):
+    for q in range(NQUBITS):
+        circuit.add(gates.RY(q, theta=np.random.uniform(0, 2*np.pi)))
+        circuit.add(gates.RZ(q, theta=np.random.uniform(0, 2*np.pi)))
+    for q in range(NQUBITS):
+        circuit.add(gates.CNOT(q % NQUBITS, (q + 1) % NQUBITS))
+
+# Setup backend
+qibo.set_backend("qibotn", platform="quimb")
+backend = qibo.get_backend()
+backend.configure_tn_simulation(ansatz="tn")
+torch.set_num_threads(WORKERS)
+
+# Prepare TN
+t0 = time.time()
+qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
+op = qu.pauli('x') & qu.pauli('z')
+tn = qc.local_expectation(op, (0, 1), rehearse='tn')
+t_prepare = time.time() - t0
+
+# Search path
+t0 = time.time()
+tree = parallel_path_search(tn, tn.outer_inds(), 'processpool', 1024, 300, WORKERS)
+t_search = time.time() - t0
+
+# Contract
+t0 = time.time()
+for tensor in tn.tensors:
+    tensor._data = torch.from_numpy(np.asarray(tensor._data)).to(torch.complex128)
+val = tn.contract(all, output_inds=(), optimize=tree)
+t_contract = time.time() - t0
+
+# Results
+print(f"Prepare:  {t_prepare:.4f}s")
+print(f"Search:   {t_search:.4f}s")
+print(f"Contract: {t_contract:.4f}s")
+print(f"Total:    {t_prepare+t_search+t_contract:.4f}s")
+print(f"Expectation: {0.5 * complex(val).real:.10f}")
diff --git a/benchmark_mps.py b/benchmark_mps.py
deleted file mode 100644
index 827fd77..0000000
--- a/benchmark_mps.py
+++ /dev/null
@@ -1,189 +0,0 @@
-"""Benchmark: qibojit (reference) vs qibotn/quimb MPS, with error comparison."""
-import time
-import argparse
-import os
-import numpy as np
-import qibo
-import quimb.tensor as qtn
-from qibo import Circuit, gates
-
-DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
-
-
-def make_circuit(circuit_type, nqubits, nlayers=1, add_measurements=False):
-    c = Circuit(nqubits)
-    if circuit_type == "qft":
-        from qibo.models import QFT
-        c = QFT(nqubits)
-    elif circuit_type == "variational":
-        for layer in range(nlayers):
-            for q in range(nqubits):
-                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
-            offset = layer % 2
-            for q in range(offset, nqubits - 1, 2):
-                c.add(gates.CZ(q, q + 1))
-    elif circuit_type == "ghz":
-        c.add(gates.H(0))
-        for q in range(nqubits - 1):
-            c.add(gates.CNOT(q, q + 1))
-    else:
-        raise ValueError(f"Unknown circuit: {circuit_type}")
-    if add_measurements:
-        c.add(gates.M(*range(nqubits)))
-    return c
-
-
-def run_qibojit(circuit):
-    qibo.set_backend("qibojit", platform="numba")
-    t0 = time.time()
-    result = circuit()
-    elapsed = time.time() - t0
-    sv = result.state()
-    return sv, elapsed
-
-
-def run_quimb_mps(circuit, max_bond, svd_cutoff, optimizer, nshots=None):
-    qibo.set_backend("qibotn", platform="quimb")
-    b = qibo.get_backend()
-    b.configure_tn_simulation(ansatz="mps", max_bond_dimension=max_bond, svd_cutoff=svd_cutoff)
-    b.contractions_optimizer = optimizer
-
-    t0 = time.time()
-    if nshots:
-        result = b.execute_circuit(circuit, nshots=nshots)
-        elapsed = time.time() - t0
-        return result.frequencies(), elapsed, 0.0
-    else:
-        # MPS simulation
-        circ_quimb = qtn.CircuitMPS.from_openqasm2_str(
-            circuit.to_qasm(),
-            gate_opts={"max_bond": max_bond, "cutoff": svd_cutoff},
-        )
-        t_mps = time.time() - t0
-        # to_dense separately
-        t1 = time.time()
-        #sv = circ_quimb.psi.to_dense().reshape(-1)
-        sv = None
-        t_dense = time.time() - t1
-        return sv, t_mps, t_dense
-
-
-def run_quimb_permmps(circuit, max_bond, svd_cutoff, nshots=None):
-    gates_list = [
-        qtn.Gate(g.name, params=list(g.parameters), qubits=list(g.qubits))
-        for g in circuit.queue
-        if g.name.lower() != "measure"
-    ]
-    t0 = time.time()
-    circ = qtn.CircuitPermMPS.from_gates(
-        gates_list,
-        N=circuit.nqubits,
-        max_bond=max_bond,
-        cutoff=svd_cutoff,
-    )
-    if nshots:
-        from collections import Counter
-        result = Counter(circ.sample(nshots))
-        elapsed = time.time() - t0
-        return dict(result), elapsed
-    else:
-        mps = circ.get_psi_unordered()
-        sv = mps.to_dense().reshape(-1)
-        elapsed = time.time() - t0
-        return sv, elapsed
-
-
-def compare_statevector(sv_ref, sv_mps):
-    sv_ref = np.array(sv_ref, dtype=complex).flatten()
-    sv_mps = np.array(sv_mps, dtype=complex).flatten()
-    fidelity = abs(np.dot(sv_ref.conj(), sv_mps)) ** 2
-    l2_err = np.linalg.norm(sv_ref - sv_mps)
-    return fidelity, l2_err
-
-
-def compare_frequencies(freq_ref, freq_mps, nshots):
-    all_keys = set(freq_ref) | set(freq_mps)
-    tvd = 0.5 * sum(abs(freq_ref.get(k, 0) - freq_mps.get(k, 0)) for k in all_keys) / nshots
-    return tvd
-
-
-def jit_cache_path(circuit_type, nqubits, nlayers):
-    os.makedirs(DATA_DIR, exist_ok=True)
-    return os.path.join(DATA_DIR, f"jit_{circuit_type}_n{nqubits}_l{nlayers}.npy")
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--nqubits", type=int, default=10)
-    parser.add_argument("--circuit", type=str, default="ghz",
-                        choices=["qft", "variational", "ghz"])
-    parser.add_argument("--nlayers", type=int, default=3)
-    parser.add_argument("--max-bond", type=int, default=None,
-                        help="Max bond dimension for MPS (None = unlimited)")
-    parser.add_argument("--svd-cutoff", type=float, default=1e-6)
-    parser.add_argument("--optimizer", type=str, default="eager")
-    parser.add_argument("--nshots", type=int, default=None,
-                        help="Use sampling mode with given number of shots instead of statevector")
-    parser.add_argument("--permmps", action="store_true",
-                        help="Use CircuitPermMPS directly instead of qibotn backend")
-    parser.add_argument("--skip-jit", action="store_true",
-                        help="Skip qibojit run, load cached statevector if available")
-    parser.add_argument("--no-compare", action="store_true",
-                        help="Skip correctness comparison entirely")
-    args = parser.parse_args()
-
-    print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}")
-    print(f"MPS config: max_bond={args.max_bond}, svd_cutoff={args.svd_cutoff}, optimizer={args.optimizer}")
-
-    ref = None
-    t_ref = None
-
-    if not args.no_compare:
-        cache_path = jit_cache_path(args.circuit, args.nqubits, args.nlayers)
-        if args.nshots:
-            # frequency mode: run qibojit with same nshots
-            np.random.seed(42)
-            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers, add_measurements=True)
-            qibo.set_backend("qibojit", platform="numba")
-            t0 = time.time()
-            result_ref = circuit_ref(nshots=args.nshots)
-            t_ref = time.time() - t0
-            ref = dict(result_ref.frequencies())
-            print(f"\n[qibojit]  time={t_ref:.4f}s")
-        elif args.skip_jit and os.path.exists(cache_path):
-            ref = np.load(cache_path)
-            print(f"\n[qibojit]  loaded from cache: {cache_path}")
-        else:
-            np.random.seed(42)
-            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
-            ref, t_ref = run_qibojit(circuit_ref)
-            np.save(cache_path, ref)
-            print(f"\n[qibojit]  time={t_ref:.4f}s  (saved to {cache_path})")
-
-    np.random.seed(42)
-    circuit_mps = make_circuit(args.circuit, args.nqubits, args.nlayers)
-    label = "quimb PermMPS" if args.permmps else "quimb MPS"
-    try:
-        if args.permmps:
-            out, t_mps = run_quimb_permmps(circuit_mps, args.max_bond, args.svd_cutoff, args.nshots)
-            t_dense = 0.0
-        else:
-            out, t_mps, t_dense = run_quimb_mps(circuit_mps, args.max_bond, args.svd_cutoff, args.optimizer, args.nshots)
-        print(f"[{label}] MPS sim={t_mps:.4f}s  to_dense={t_dense:.4f}s  total={t_mps+t_dense:.4f}s")
-        if not args.no_compare:
-            if args.nshots:
-                tvd = compare_frequencies(ref, out, args.nshots)
-                print(f"\nTVD      : {tvd:.6f}  (0=perfect)")
-            else:
-                fidelity, l2_err = compare_statevector(ref, out)
-                print(f"\nFidelity : {fidelity:.8f}  (1=perfect)")
-                print(f"L2 error : {l2_err:.2e}")
-            if t_ref is not None and t_mps > 0:
-                print(f"Speedup  : {t_ref/t_mps:.2f}x")
-    except Exception as e:
-        print(f"[quimb MPS] FAILED: {e}")
-        raise
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark_qmatchatea.py b/benchmark_qmatchatea.py
deleted file mode 100644
index 9e7923b..0000000
--- a/benchmark_qmatchatea.py
+++ /dev/null
@@ -1,126 +0,0 @@
-"""Benchmark: qibojit (reference) vs qibotn/qmatchatea MPS."""
-import time
-import argparse
-import os
-import numpy as np
-import qibo
-from qibo import Circuit, gates
-from qibo.backends import construct_backend
-
-DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
-
-
-def make_circuit(circuit_type, nqubits, nlayers=1):
-    c = Circuit(nqubits)
-    if circuit_type == "qft":
-        from qibo.models import QFT
-        return QFT(nqubits)
-    elif circuit_type == "variational":
-        for layer in range(nlayers):
-            for q in range(nqubits):
-                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
-            offset = layer % 2
-            for q in range(offset, nqubits - 1, 2):
-                c.add(gates.CZ(q, q + 1))
-    elif circuit_type == "ghz":
-        c.add(gates.H(0))
-        for q in range(nqubits - 1):
-            c.add(gates.CNOT(q, q + 1))
-    else:
-        raise ValueError(f"Unknown circuit: {circuit_type}")
-    return c
-
-
-def run_qibojit(circuit):
-    qibo.set_backend("qibojit", platform="numba")
-    t0 = time.time()
-    result = circuit()
-    elapsed = time.time() - t0
-    return result.state(), elapsed
-
-
-def run_qmatchatea(circuit, max_bond, cut_ratio):
-    import qmatchatea, qtealeaves.observables
-    from qibo.backends import construct_backend as _cb
-    b = _cb(backend="qibotn", platform="qmatchatea")
-    b.configure_tn_simulation(ansatz="MPS", max_bond_dimension=max_bond, cut_ratio=cut_ratio)
-
-    qk_circuit = b._qibocirc_to_qiskitcirc(circuit)
-    run_qk_params = qmatchatea.preprocessing.qk_transpilation_params(False)
-    observables = qtealeaves.observables.TNObservables()
-    observables += qtealeaves.observables.TNState2File(name="temp", formatting="D")
-
-    t0 = time.time()
-    results = qmatchatea.run_simulation(
-        circ=qk_circuit,
-        convergence_parameters=b.convergence_params,
-        transpilation_parameters=run_qk_params,
-        backend=b.qmatchatea_backend,
-        observables=observables,
-    )
-    elapsed = time.time() - t0
-    tn_state = results.observables.get("tn_state")
-    if tn_state is None:
-        results.load_state()
-        tn_state = results.observables["tn_state"]
-    sv_obj = tn_state.to_statevector(qiskit_order=False, max_qubit_equivalent=40)
-    sv = np.array(sv_obj.elem, dtype=complex).flatten()
-    return sv, elapsed
-
-
-def compare(sv_ref, sv_mps):
-    sv_ref = np.array(sv_ref, dtype=complex).flatten()
-    fidelity = abs(np.dot(sv_ref.conj(), sv_mps)) ** 2
-    l2_err = np.linalg.norm(sv_ref - sv_mps)
-    return fidelity, l2_err
-
-
-def jit_cache_path(circuit_type, nqubits, nlayers):
-    os.makedirs(DATA_DIR, exist_ok=True)
-    return os.path.join(DATA_DIR, f"jit_{circuit_type}_n{nqubits}_l{nlayers}.npy")
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--nqubits", type=int, default=10)
-    parser.add_argument("--circuit", type=str, default="ghz",
-                        choices=["qft", "variational", "ghz"])
-    parser.add_argument("--nlayers", type=int, default=3)
-    parser.add_argument("--max-bond", type=int, default=64)
-    parser.add_argument("--cut-ratio", type=float, default=1e-6)
-    parser.add_argument("--skip-jit", action="store_true")
-    args = parser.parse_args()
-
-    print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}")
-    print(f"MPS config: max_bond={args.max_bond}, cut_ratio={args.cut_ratio}")
-
-    cache_path = jit_cache_path(args.circuit, args.nqubits, args.nlayers)
-    t_ref = None
-
-    if args.skip_jit and os.path.exists(cache_path):
-        sv_ref = np.load(cache_path)
-        print(f"\n[qibojit]  loaded from cache: {cache_path}")
-    else:
-        np.random.seed(42)
-        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
-        sv_ref, t_ref = run_qibojit(circuit_ref)
-        np.save(cache_path, sv_ref)
-        print(f"\n[qibojit]  time={t_ref:.4f}s  (saved to {cache_path})")
-
-    np.random.seed(42)
-    circuit_mps = make_circuit(args.circuit, args.nqubits, args.nlayers)
-    try:
-        sv_mps, t_mps = run_qmatchatea(circuit_mps, args.max_bond, args.cut_ratio)
-        fidelity, l2_err = compare(sv_ref, sv_mps)
-        print(f"[qmatchatea] time={t_mps:.4f}s")
-        print(f"\nFidelity : {fidelity:.8f}  (1=perfect)")
-        print(f"L2 error : {l2_err:.2e}")
-        if t_ref is not None and t_mps > 0:
-            print(f"Speedup  : {t_ref/t_mps:.2f}x")
-    except Exception as e:
-        print(f"[qmatchatea] FAILED: {e}")
-        raise
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark_tn.py b/benchmark_tn.py
deleted file mode 100644
index e0e9848..0000000
--- a/benchmark_tn.py
+++ /dev/null
@@ -1,386 +0,0 @@
-"""Benchmark: qibotn/quimb generic TN — expectation values."""
-import multiprocessing
-multiprocessing.set_start_method("spawn", force=True)
-import pickle
-import time
-import threading
-import argparse
-import numpy as np
-import cotengra as ctg
-import qibo
-from qibo import Circuit, gates
-
-class TimedTrialFn:
-    def __init__(self, trial_fn, timeout=15):
-        self.trial_fn = trial_fn
-        self.timeout = timeout
-
-    def __call__(self, *args, **kwargs):
-        result = [None]
-        exc = [None]
-
-        def _run():
-            try:
-                result[0] = self.trial_fn(*args, **kwargs)
-            except Exception as e:
-                exc[0] = e
-
-        t = threading.Thread(target=_run, daemon=True)
-        t.start()
-        t.join(self.timeout)
-        if t.is_alive():
-            raise TimeoutError(f"trial exceeded {self.timeout}s")
-        if exc[0] is not None:
-            raise exc[0]
-        return result[0]
-
-def make_circuit(circuit_type, nqubits, nlayers=1):
-    c = Circuit(nqubits)
-    if circuit_type == "qft":
-        from qibo.models import QFT
-        return QFT(nqubits)
-    elif circuit_type == "variational":
-        for layer in range(nlayers):
-            for q in range(nqubits):
-                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
-            offset = layer % 2
-            for q in range(offset, nqubits - 1, 2):
-                c.add(gates.CZ(q, q + 1))
-    elif circuit_type == "ghz":
-        c.add(gates.H(0))
-        for q in range(nqubits - 1):
-            c.add(gates.CNOT(q, q + 1))
-    elif circuit_type == "brickwork":
-        for q in range(nqubits):
-            c.add(gates.H(q))
-        for layer in range(nlayers):
-            offset = layer % 2
-            for q in range(offset, nqubits - 1, 2):
-                c.add(gates.CNOT(q, q + 1))
-                c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi)))
-                c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi)))
-    else:
-        raise ValueError(f"Unknown circuit: {circuit_type}")
-    return c
-
-
-
-def make_z_observable(nqubits):
-    """Z on qubit 0 only — single contraction for benchmarking"""
-    return ["z"], [(0,)], [1.0]
-
-
-def run_quimb_tn(circuit, nqubits, num_slices, load_path=None, save_path=None):
-    """Mode: expval — compute <Z_0> via local_expectation (lightcone pruning)."""
-    qibo.set_backend("qibotn", platform="quimb")
-    b = qibo.get_backend()
-    b.configure_tn_simulation(ansatz="tn")
-
-    operators, sites, coeffs = make_z_observable(nqubits)
-    ops = b._string_to_quimb_operator(operators[0])
-    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
-                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
-
-    if load_path:
-        with open(load_path, "rb") as f:
-            saved = pickle.load(f)
-        tree = saved["tree"]
-        t_search = 0.0
-        print(f"  [path loaded]  {load_path}")
-    else:
-        opt = ctg.HyperOptimizer(
-            methods=['kahypar', 'random-greedy', 'spinglass'],
-            max_repeats=16,
-            parallel=True,
-            max_time=60,
-            slicing_opts={'target_slices': num_slices},
-            progbar=True,
-        )
-        t0 = time.time()
-        rehearsal = qc.local_expectation(ops, where=sites[0], optimize=opt,
-                                         simplify_sequence="R", rehearse=True)
-        t_search = time.time() - t0
-        tree = rehearsal['tree']
-        print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}")
-
-        if save_path:
-            with open(save_path, "wb") as f:
-                pickle.dump({"tree": tree}, f)
-            print(f"  [path saved]  {save_path}")
-
-    t0 = time.time()
-    expval = qc.local_expectation(ops, where=sites[0], optimize=tree, simplify_sequence="R")
-    t_contract = time.time() - t0
-    print(f"  [contraction]  {t_contract:.3f}s")
-
-    return float(expval.real), t_search + t_contract
-
-
-def run_quimb_tn_statevector(circuit, nqubits, num_slices, load_path=None, save_path=None):
-    """Mode: statevector — contract full TN to dense vector."""
-    qibo.set_backend("qibotn", platform="quimb")
-    b = qibo.get_backend()
-    b.configure_tn_simulation(ansatz="tn")
-
-    import torch
-    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
-                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
-    qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex64)
-    if load_path:
-        with open(load_path, "rb") as f:
-            saved = pickle.load(f)
-        tree = saved["tree"]
-        t_search = 0.0
-        print(f"  [path loaded]  {load_path}")
-    else:
-        opt = ctg.HyperOptimizer(
-            #methods=['kahypar', 'random-greedy', 'spinglass'],
-            max_repeats=1024,
-            #parallel="concurrent.futures",
-            parallel=64,
-            max_time=60,
-            minimize='size',
-            slicing_opts={'target_slices': num_slices},
-            #slicing_opts={'target_size': 2**30},
-            progbar=True,
-            on_trial_error='ignore'
-        )
-        
-        t0 = time.time()
-        rehearsal = qc.to_dense(optimize=opt, rehearse=True)
-        t_search = time.time() - t0
-        tree = rehearsal['tree']
-        #print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}")
-
-        if save_path:
-            with open(save_path, "wb") as f:
-                pickle.dump({"tree": tree}, f)
-            print(f"  [path saved]  {save_path}")
-        print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}")
-        return None, t_search
-
-    t0 = time.time()
-    sv = qc.to_dense(optimize=tree).reshape(-1)
-    t_contract = time.time() - t0
-    print(f"  [contraction]  {t_contract:.3f}s")
-    sv_tn = np.array(sv)
-    return sv_tn, t_search + t_contract
-
-
-def _contract_mpi(tree, arrays, comm, root=0):
-    """Contract slices via MPI, returning result as the same array type as input.
-
-    Unlike ``cotengra.ContractionTree.contract_mpi``, this works with any
-    array backend (numpy, torch, etc.) — it only converts to numpy at the
-    MPI-reduce boundary and converts back.
-    """
-    size = comm.Get_size()
-    rank = comm.Get_rank()
-
-    result_np = None
-    is_torch = type(arrays[0]).__module__.startswith("torch")
-
-    for i in range(rank, tree.multiplicity, size):
-        x = tree.contract_slice(arrays, i)
-        x_np = np.asfortranarray(x.detach().cpu().numpy() if is_torch else np.asarray(x))
-        
-        if result_np is None:
-            result_np = x_np
-        else:
-            result_np += x_np
-
-    if result_np is None:
-        result_np = np.zeros(1, dtype=np.complex64)
-
-    if rank == root:
-        result = np.zeros_like(result_np)
-    else:
-        result = None
-    comm.Reduce(result_np, result, root=root)
-
-    if rank == root:
-        import torch
-        return torch.from_numpy(np.asarray(result)) if is_torch else result
-    return None
-
-
-def run_quimb_tn_statevector_mpi(circuit, nqubits, num_slices, load_path=None, save_path=None):
-    """MPI-parallel statevector via custom MPI contraction (supports torch backend)."""
-    from mpi4py import MPI
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-
-    qibo.set_backend("qibotn", platform="quimb")
-    b = qibo.get_backend()
-    b.configure_tn_simulation(ansatz="tn")
-
-    import torch
-    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
-                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
-    qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex64)
-
-    if load_path:
-        if rank == 0:
-            with open(load_path, "rb") as f:
-                saved = pickle.load(f)
-            tree, psi, t_search = saved["tree"], saved["psi"], 0.0
-            print(f"  [path loaded]  {load_path}")
-        else:
-            tree = psi = None
-            t_search = 0.0
-    else:
-        # each rank runs serial search over its share of trials
-        total_repeats = 1024
-        rank_repeats = max(1, total_repeats // size)
-        opt = ctg.HyperOptimizer(
-            methods=['kahypar', 'random-greedy', 'spinglass'],
-            max_repeats=rank_repeats,
-            parallel=False,
-            max_time=100,
-            minimize='size',
-            slicing_opts={'target_slices': max(num_slices, size), 'allow_outer': False},
-            progbar=(rank == 0),
-        )
-        t0 = time.time()
-        rehearsal = qc.to_dense(optimize=opt, rehearse=True)
-        t_search = time.time() - t0
-        local_tree = rehearsal['tree']
-        local_psi = rehearsal['tn']
-        local_size = local_tree.contraction_width()
-
-        # gather all trees to rank 0, pick best by contraction_width
-        all_results = comm.gather((local_size, local_tree, local_psi), root=0)
-        if rank == 0:
-            _, tree, psi = min(all_results, key=lambda x: x[0])
-            print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}  slices={tree.multiplicity}")
-            if save_path:
-                with open(save_path, "wb") as f:
-                    pickle.dump({"tree": tree, "psi": psi}, f)
-                print(f"  [path saved]   {save_path}")
-        else:
-            tree = psi = None
-
-    tree = comm.bcast(tree, root=0)
-    psi = comm.bcast(psi, root=0)
-    t_search = comm.bcast(t_search, root=0)
-
-    arrays = psi.arrays
-    t0 = time.time()
-    sv = _contract_mpi(tree, arrays, comm, root=0)
-    t_contract = time.time() - t0
-
-    if rank == 0:
-        print(f"  [contraction]  {t_contract:.3f}s")
-        return np.array(sv).reshape(-1), t_search + t_contract
-    return None, t_search + t_contract
-
-
-def run_quimb_tn_samples(circuit, nshots=1024):
-    """Mode: samples — sample from circuit output distribution."""
-    qibo.set_backend("qibotn", platform="quimb")
-    b = qibo.get_backend()
-    b.configure_tn_simulation(ansatz="tn")
-
-    t0 = time.time()
-    result = b.execute_circuit(circuit, nshots=nshots)
-    t_total = time.time() - t0
-    print(f"  [sampling]     {t_total:.3f}s  nshots={nshots}")
-    print(f"  top states: {dict(list(result.frequencies().items())[:5])}")
-    return result, t_total
-
-
-def qibojit_expval(circuit, nqubits):
-    """Compute <Z_0> via qibojit statevector."""
-    qibo.set_backend("qibojit", platform="numba")
-    t0 = time.time()
-    result = circuit()
-    elapsed = time.time() - t0
-    sv = np.array(result.state(), dtype=complex).flatten()
-    probs = np.abs(sv) ** 2
-    bits = (np.arange(len(probs)) >> (nqubits - 1)) & 1
-    expval = float(np.dot(probs, 1 - 2 * bits))
-    return expval, elapsed
-
-
-def run_qibojit(circuit):
-    """Compute full statevector via qibojit."""
-    qibo.set_backend("qibojit", platform="numba")
-    t0 = time.time()
-    result = circuit()
-    elapsed = time.time() - t0
-    sv = np.array(result.state(), dtype=complex).flatten()
-    return sv, elapsed
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--nqubits", type=int, default=10)
-    parser.add_argument("--circuit", type=str, default="qft",
-                        choices=["qft", "variational", "ghz", "brickwork"])
-    parser.add_argument("--nlayers", type=int, default=3)
-    parser.add_argument("--num-slices", type=int, default=1)
-    parser.add_argument("--nshots", type=int, default=1024)
-    parser.add_argument("--mode", type=str, default="statevector",
-                        choices=["expval", "statevector", "samples"],
-                        help="expval: local_expectation; statevector: to_dense; samples: sampling")
-    parser.add_argument("--mpi", action="store_true",
-                        help="Use MPI-parallel contraction (run with mpirun -n N)")
-    parser.add_argument("--no-compare", action="store_true",
-                        help="Skip qibojit reference run")
-    parser.add_argument("--save-path", type=str, default=None,
-                        help="Save contraction tree to a pickle file")
-    parser.add_argument("--load-path", type=str, default=None,
-                        help="Load contraction tree from a pickle file (skip path search)")
-    args = parser.parse_args()
-
-    print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}, mode={args.mode}")
-
-    np.random.seed(42)
-    circuit_tn = make_circuit(args.circuit, args.nqubits, args.nlayers)
-    try:
-        if args.mode == "expval":
-            expval_tn, t_tn = run_quimb_tn(circuit_tn, args.nqubits, args.num_slices,
-                                           load_path=args.load_path, save_path=args.save_path)
-            print(f"\n[quimb TN]  time={t_tn:.4f}s  <Z_0>={expval_tn:.8f}")
-        elif args.mode == "statevector":
-            if args.mpi:
-                sv_tn, t_tn = run_quimb_tn_statevector_mpi(circuit_tn, args.nqubits, args.num_slices,
-                                                           load_path=args.load_path, save_path=args.save_path)
-            else:
-                sv_tn, t_tn = run_quimb_tn_statevector(circuit_tn, args.nqubits, args.num_slices,
-                                                       load_path=args.load_path, save_path=args.save_path)
-            if sv_tn is not None:
-                print(f"\n[quimb TN]  time={t_tn:.4f}s  statevector shape={sv_tn.shape}")
-                np.save(f"data/sv_tn_{args.circuit}{args.nqubits}.npy", sv_tn)
-        else:
-            _, t_tn = run_quimb_tn_samples(circuit_tn, args.nqubits, args.nshots)
-            print(f"\n[quimb TN]  time={t_tn:.4f}s")
-            args.no_compare = True  # samples 模式无法和 qibojit 期望值对比
-    except Exception as e:
-        print(f"[quimb TN] FAILED: {e}")
-        raise
-
-    if not args.no_compare and args.mode != "statevector":
-        np.random.seed(42)
-        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
-        expval_ref, t_ref = qibojit_expval(circuit_ref, args.nqubits)
-        print(f"[qibojit]   time={t_ref:.4f}s  <Z_0>={expval_ref:.8f}")
-        print(f"\nDiff     : {abs(expval_tn - expval_ref):.2e}")
-        if t_tn > 0:
-            print(f"Speedup  : {t_ref/t_tn:.2f}x")
-    elif not args.no_compare and args.mode == "statevector" and sv_tn is not None:
-        np.random.seed(42)
-        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
-        sv_ref, t_ref = run_qibojit(circuit_ref)
-        fid = abs(np.dot(sv_ref.conj(), sv_tn)) ** 2
-        l2_err = np.linalg.norm(sv_ref - sv_tn)
-        print(f"[qibojit]   time={t_ref:.4f}s")
-        print(f"Fidelity : {fid:.8f}  (1=perfect)")
-        print(f"L2 error : {l2_err:.2e}")
-        if t_tn > 0:
-            print(f"Speedup  : {t_ref/t_tn:.2f}x")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmark_tn_mpi.py b/benchmark_tn_mpi.py
index ae76eca..8dc80d1 100644
--- a/benchmark_tn_mpi.py
+++ b/benchmark_tn_mpi.py
@@ -1,4 +1,5 @@
 """MPI-parallel TN benchmark: path search + contraction via MPI."""
+import json
 import pickle
 import time
 import argparse
@@ -8,9 +9,38 @@ import qibo
 from qibo import Circuit, gates
 from mpi4py import MPI
 from concurrent.futures import ProcessPoolExecutor, as_completed
+from qibotn.observables import check_observable, extract_gates_and_qubits
 
 
-def _run_serial_search(tn_bytes, output_inds, repeats, seed, num_slices, n_ranks, max_time=600):
+def _load_observable(observable_file=None, observable_json=None):
+    if observable_file:
+        with open(observable_file, "r", encoding="utf8") as f:
+            return json.load(f)
+    if observable_json:
+        return json.loads(observable_json)
+    return None
+
+
+def _term_to_quimb_operator(term):
+    """Convert one extracted Hamiltonian term to a quimb operator."""
+    import quimb as qu
+
+    coeff = complex(term[0][2]) if term else 1.0
+    op = None
+    where = []
+
+    for qubit, gate_name, _ in term:
+        qubit = int(qubit)
+        gate_name = str(gate_name).upper()
+        if gate_name == "I":
+            continue
+        where.append(qubit)
+        op = qu.pauli(gate_name.lower()) if op is None else op & qu.pauli(gate_name.lower())
+
+    return complex(coeff), op, tuple(where)
+
+
+def _run_serial_search(tn_bytes, output_inds, repeats, seed, num_slices, n_ranks, max_time):
     import pickle, cotengra as ctg, random
     random.seed(seed)
     tn = pickle.loads(tn_bytes)
@@ -29,10 +59,14 @@ def _run_serial_search(tn_bytes, output_inds, repeats, seed, num_slices, n_ranks
 
 
 def parallel_search(tn, output_inds, total_repeats, n_workers, num_slices, n_ranks,
-                    timeout=60):
+                    timeout):
     import pickle, os, signal
     from concurrent.futures import ProcessPoolExecutor, as_completed
     tn_bytes = pickle.dumps(tn)
+    if n_workers <= 1:
+        return _run_serial_search(
+            tn_bytes, output_inds, total_repeats, 0, num_slices, n_ranks, timeout
+        )[1]
     repeats_per = max(1, total_repeats // n_workers)
     best_cost, best_tree = float('inf'), None
 
@@ -152,7 +186,7 @@ def run_mpi(circuit, nqubits, num_slices, total_repeats=1024,
         psi_tn = qc.to_dense(rehearse="tn")
         local_tree = parallel_search(
             psi_tn, psi_tn.outer_inds(), rank_repeats, n_workers=48,
-            num_slices=num_slices, n_ranks=size, timeout=60,
+            num_slices=num_slices, n_ranks=size, timeout=600,
         )
         t_search = time.time() - t0
         local_psi = psi_tn
@@ -181,7 +215,7 @@ def run_mpi(circuit, nqubits, num_slices, total_repeats=1024,
 
     # --- contraction: all ranks work in parallel ---
     import torch
-    torch.set_num_threads(max(1, 48 // size))
+    torch.set_num_threads(max(1, 96 // size))
     arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in psi.arrays]
     t0 = time.time()
     sv = _contract_mpi(tree, arrays, comm, root=0)
@@ -193,6 +227,72 @@ def run_mpi(circuit, nqubits, num_slices, total_repeats=1024,
     return None, t_search + t_contract
 
 
+def run_mpi_expval(
+    circuit,
+    nqubits,
+    observable=None,
+    total_repeats=1024,
+    search_workers=1,
+    search_timeout=300,
+):
+    """Compute a Hamiltonian expectation value directly from TN via MPI.
+    MPI parallelizes over Hamiltonian terms; ProcessPool optionally helps
+    path search for each term."""
+    import torch
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    observable = check_observable(observable, nqubits)
+    ham_gate_map = extract_gates_and_qubits(observable)
+
+    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
+                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
+
+    my_terms = ham_gate_map[rank::size]
+    torch.set_num_threads(max(1, 96 // size))
+    t0 = time.time()
+
+    my_exp = 0.0 + 0.0j
+    for term in my_terms:
+        coeff, op, where = _term_to_quimb_operator(term)
+        if op is None:
+            my_exp += coeff
+            continue
+        tn = qc.local_expectation_tn(op, where=where)
+        if len(tn.outer_inds()) == 0:
+            val = complex(tn.contract())
+        else:
+            tree = parallel_search(
+                tn,
+                tn.outer_inds(),
+                total_repeats,
+                n_workers=search_workers,
+                num_slices=1,
+                n_ranks=size,
+                timeout=search_timeout,
+            )
+            if tree is None:
+                raise RuntimeError("Failed to find a contraction tree for expectation TN.")
+            arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in tn.arrays]
+            acc = sum(tree.contract_slice(arrays, i) for i in range(tree.multiplicity))
+            val = complex(acc.item() if hasattr(acc, 'item') else acc)
+        my_exp += coeff * val
+
+    t_total = time.time() - t0
+
+    all_results = comm.gather(my_exp, root=0)
+    if rank == 0:
+        total_exp = sum(all_results)
+        print(f"\n[TN expval]  time={t_total:.4f}s  expval={total_exp.real:.12f}")
+        return np.real_if_close(total_exp), t_total
+    return None, t_total
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--nqubits", type=int, default=30)
@@ -201,9 +301,14 @@ def main():
     parser.add_argument("--nlayers", type=int, default=3)
     parser.add_argument("--num-slices", type=int, default=1)
     parser.add_argument("--total-repeats", type=int, default=1024)
+    parser.add_argument("--search-workers", type=int, default=1)
+    parser.add_argument("--search-timeout", type=int, default=300)
+    parser.add_argument("--observable-file", type=str, default=None)
+    parser.add_argument("--observable-json", type=str, default=None)
     parser.add_argument("--save-path", type=str, default=None)
     parser.add_argument("--load-path", type=str, default=None)
     parser.add_argument("--no-compare", action="store_true")
+    parser.add_argument("--mode", type=str, default="sv", choices=["sv", "expval"])
     args = parser.parse_args()
 
     comm = MPI.COMM_WORLD
@@ -215,6 +320,27 @@ def main():
 
     np.random.seed(42)
     circuit = make_circuit(args.circuit, args.nqubits, args.nlayers)
+    observable = _load_observable(args.observable_file, args.observable_json)
+
+    if args.mode == "expval":
+        try:
+            expval, t_total = run_mpi_expval(
+                circuit,
+                args.nqubits,
+                observable=observable,
+                total_repeats=args.total_repeats,
+                search_workers=args.search_workers,
+                search_timeout=args.search_timeout,
+            )
+        except Exception as e:
+            if rank == 0:
+                print(f"[FAILED] {e}")
+            raise
+        if rank == 0:
+            np.save(f"data/expval_tn_{args.circuit}{args.nqubits}.npy", np.asarray(expval))
+            if not args.no_compare:
+                print("No built-in reference comparison for arbitrary observables.")
+        return
 
     try:
         sv, t_total = run_mpi(circuit, args.nqubits, args.num_slices,
@@ -230,14 +356,20 @@ def main():
         np.save(f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy", sv)
 
         if not args.no_compare:
-            from benchmark_tn import run_qibojit
+            from qibotn.bak.benchmark_tn import run_qibojit
+            import gc
             np.random.seed(42)
             circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
             sv_ref, t_ref = run_qibojit(circuit_ref)
-            fid = abs(np.dot(sv_ref.conj(), sv)) ** 2
+            np.save(f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy", sv_ref)
             print(f"[qibojit]       time={t_ref:.4f}s")
-            print(f"Fidelity : {fid:.8f}")
-            print(f"L2 error : {np.linalg.norm(sv_ref - sv):.2e}")
+            # free memory before loading via mmap for expval comparison
+            del sv, sv_ref
+            gc.collect()
+            from compare_jit_tn_quimb import check_results
+            ref_path = f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy"
+            tn_path  = f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy"
+            check_results(ref_path, tn_path, args.nqubits)
             if t_total > 0:
                 print(f"Speedup  : {t_ref/t_total:.2f}x")
 
diff --git a/compare_jit_tn_quimb.py b/compare_jit_tn_quimb.py
deleted file mode 100644
index d56132a..0000000
--- a/compare_jit_tn_quimb.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import numpy as np
-import os
-import sys
-
-def check_results(ref_path, tn_path):
-    # 1. 检查文件是否存在
-    if not os.path.exists(ref_path) or not os.path.exists(tn_path):
-        print(f"Error: 找不到文件！\n参考文件: {ref_path}\n待测文件: {tn_path}")
-        return
-
-    print(f"正在加载数据并对比: \n  [Ref] {ref_path}\n  [TN ] {tn_path}\n")
-
-    try:
-        # 2. 加载状态向量
-        # mmap_mode='r' 可以防止大文件直接撑爆内存
-        sv_ref = np.load(ref_path, mmap_mode='r')
-        sv_tn = np.load(tn_path, mmap_mode='r')
-
-        # 3. 计算保真度 (Fidelity)
-        # fid = |<ref|tn>|^2
-        inner_product = np.dot(sv_ref.conj(), sv_tn)
-        fidelity = np.abs(inner_product)**2
-
-        # 4. 计算 L2 误差 (欧氏距离)
-        l2_error = np.linalg.norm(sv_ref - sv_tn)
-
-        # 5. 打印结果
-        print("-" * 30)
-        print(f"保真度 (Fidelity): {fidelity:.12f}")
-        #print(f"L2 范数误差:       {l2_error:.2e}")
-        print("-" * 30)
-
-        # phase-invariant L2: align global phase first
-        phase = inner_product / np.abs(inner_product)
-        l2_phase_corrected = np.linalg.norm(sv_ref - sv_tn / phase)
-        print(f"L2 误差(相位校正后): {l2_phase_corrected:.2e}")
-
-        if fidelity > 0.999999:
-            print("✅ 验证通过：结果高度一致。")
-        else:
-            print("❌ 警告：保真度较低，请检查收缩路径或截断误差。")
-
-    except Exception as e:
-        print(f"计算过程中发生错误: {e}")
-
-if __name__ == "__main__":
-    # 你可以在这里直接修改文件名
-    REF_FILE = 'data/sv_qibojit_qft30.npy'
-    TN_FILE  = 'data/sv_tn_qft30_mpi.npy'
-    
-    check_results(REF_FILE, TN_FILE)
\ No newline at end of file
diff --git a/hostfile b/hostfile
deleted file mode 100644
index aa46b20..0000000
--- a/hostfile
+++ /dev/null
@@ -1,2 +0,0 @@
-10.20.6.74:1                                                                                         
-10.20.6.102:1
diff --git a/inspect_path.py b/inspect_path.py
deleted file mode 100644
index aa83a22..0000000
--- a/inspect_path.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import pickle
-import sys
-
-path = sys.argv[1] if len(sys.argv) > 1 else 'path/path.pkl.34.mpi'
-
-with open(path, 'rb') as f:
-    d = pickle.load(f)
-tree = d['tree']
-
-width = tree.contraction_width()
-slices = tree.multiplicity
-sliced_width = width - (slices.bit_length() - 1)
-
-print(f"Path: {path}")
-print(f"Width (unsliced):     {width:.1f}")
-print(f"Slices:               {slices}")
-print(f"Sliced width:         {sliced_width:.1f}")
-print(f"Peak memory (total):  {2**width * 16 / 1e9:.1f} GB")
-print(f"Peak per slice:       {2**sliced_width * 16 / 1e9:.2f} GB")
-print(f"Sliced indices:       {len(tree.sliced_inds)}")
-print(f"Cost (log2 flops):    {tree.contraction_cost(log=True):.2f}")
diff --git a/run_qibojit_ref.py b/run_qibojit_ref.py
deleted file mode 100644
index 58a9acf..0000000
--- a/run_qibojit_ref.py
+++ /dev/null
@@ -1,18 +0,0 @@
-"""Run qibojit on 30-qubit QFT and save statevector for comparison."""
-import time
-import numpy as np
-import qibo
-from qibo.models import QFT
-
-#np.random.seed(42)
-circuit = QFT(32)
-
-qibo.set_backend("qibojit", platform="numba")
-t0 = time.time()
-result = circuit()
-elapsed = time.time() - t0
-
-sv = np.array(result.state(), dtype=complex).flatten()
-np.save("data/sv_qibojit_qft30.npy", sv)
-print(f"[qibojit] time={elapsed:.4f}s  shape={sv.shape}")
-print(f"Saved to sv_qibojit_qft30.npy")
diff --git a/src/qibotn/backends/quimb.py b/src/qibotn/backends/quimb.py
index 5ece54f..3868eec 100644
--- a/src/qibotn/backends/quimb.py
+++ b/src/qibotn/backends/quimb.py
@@ -352,6 +352,149 @@ def _string_to_quimb_operator(self, op_str):
     return op
 
 
+def expectation(self, circuit, observable, parallel=None, parallel_opts=None):
+    """
+    Compute expectation value with optional parallel acceleration.
+
+    Parameters
+    ----------
+    circuit : qibo.models.Circuit
+        The quantum circuit.
+    observable : qibo.hamiltonians.SymbolicHamiltonian or form
+        The observable to measure.
+    parallel : str, optional
+        Parallelization method: 'mpi', 'processpool', or None (default).
+    parallel_opts : dict, optional
+        Options for parallel execution:
+        - max_repeats: int (default 1024)
+        - max_time: int (default 300)
+        - search_workers: int (default 48, processpool only)
+        - mpi_contract: bool (default False, use MPI for contraction)
+
+    Returns
+    -------
+    float
+        The expectation value.
+    """
+    from qibotn.observables import check_observable, extract_gates_and_qubits
+
+    if parallel_opts is None:
+        parallel_opts = {}
+
+    observable = check_observable(observable, circuit.nqubits)
+
+    if parallel is None:
+        # Use original implementation
+        from qibotn.observables import extract_gates_and_qubits
+        all_terms = extract_gates_and_qubits(observable)
+
+        qc = self._qibo_circuit_to_quimb(
+            circuit,
+            quimb_circuit_type=self.circuit_ansatz,
+            gate_opts={"max_bond": self.max_bond_dimension, "cutoff": self.svd_cutoff},
+        )
+
+        exp_val = 0.0
+        for coeff, factors in all_terms:
+            op = None
+            where = []
+            for qubit, gate_name in factors:
+                p = qu.pauli(gate_name.lower())
+                op = p if op is None else op & p
+                where.append(qubit)
+
+            val = qc.local_expectation(
+                op, tuple(where),
+                backend=self.backend,
+                optimize=self.contractions_optimizer,
+                simplify_sequence="ADCRS",
+                simplify_atol=1e-12,
+            )
+            exp_val += coeff * val
+
+        return self.real(exp_val)
+
+    else:
+        # Use parallel implementation
+        return self._expectation_parallel(circuit, observable, parallel, parallel_opts)
+
+
+def _expectation_parallel(self, circuit, observable, method, opts):
+    """Parallel expectation value computation."""
+    from qibotn.observables import extract_gates_and_qubits
+    from qibotn.parallel import parallel_path_search, parallel_contract
+    import torch
+
+    try:
+        from mpi4py import MPI
+        comm = MPI.COMM_WORLD if method == 'mpi' else None
+        rank = comm.Get_rank() if comm else 0
+        size = comm.Get_size() if comm else 1
+    except ImportError:
+        comm, rank, size = None, 0, 1
+
+    max_repeats = opts.get('max_repeats', 1024)
+    max_time = opts.get('max_time', 300)
+    search_workers = opts.get('search_workers', 48)
+    mpi_contract = opts.get('mpi_contract', False)
+    torch_threads = opts.get('torch_threads', None)
+
+    qc = self._qibo_circuit_to_quimb(
+        circuit,
+        quimb_circuit_type=self.circuit_ansatz,
+        gate_opts={"max_bond": self.max_bond_dimension, "cutoff": self.svd_cutoff},
+    )
+
+    all_terms = extract_gates_and_qubits(observable)
+    my_terms = all_terms[rank::size]
+
+    if method == 'mpi' and comm:
+        torch.set_num_threads(max(1, 96 // size))
+    elif torch_threads:
+        torch.set_num_threads(torch_threads)
+
+    my_exp = 0.0
+    for coeff, factors in my_terms:
+        op = None
+        where = []
+        for qubit, gate_name in factors:
+            p = qu.pauli(gate_name.lower())
+            op = p if op is None else op & p
+            where.append(qubit)
+
+        tn = qc.local_expectation(op, tuple(where), rehearse='tn')
+
+        tree = parallel_path_search(
+            tn, tn.outer_inds(),
+            method=method,
+            total_repeats=max_repeats,
+            max_time=max_time,
+            n_workers=search_workers
+        )
+
+        if tree is None:
+            continue
+
+        if mpi_contract and comm and size > 1:
+            arrays = [self.engine.asarray(a) for a in tn.arrays]
+            val = parallel_contract(tree, arrays, method='mpi', comm=comm)
+        else:
+            for tensor in tn.tensors:
+                tensor._data = torch.from_numpy(self.engine.asarray(tensor._data)).to(torch.complex128)
+            val = complex(tn.contract(all, output_inds=(), optimize=tree))
+
+        my_exp += coeff * complex(val)
+
+    if comm:
+        all_exp = comm.gather(my_exp, root=0)
+        if rank == 0:
+            total_exp = sum(all_exp)
+            return self.real(total_exp)
+        return 0.0
+
+    return self.real(my_exp)
+
+
 CLASSES_ROOTS = {"numpy": "Numpy", "torch": "PyTorch", "jax": "Jax"}
 
 METHODS = {
@@ -362,6 +505,8 @@ METHODS = {
     "exp_value_observable_symbolic": exp_value_observable_symbolic,
     "_qibo_circuit_to_quimb": _qibo_circuit_to_quimb,
     "_string_to_quimb_operator": _string_to_quimb_operator,
+    "expectation": expectation,
+    "_expectation_parallel": _expectation_parallel,
     "circuit_ansatz": circuit_ansatz,
 }
 
diff --git a/src/qibotn/eval.py b/src/qibotn/eval.py
index 680c817..ae36893 100644
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -4,83 +4,16 @@ from cupy.cuda import nccl
 from cupy.cuda.runtime import getDeviceCount
 from cuquantum.tensornet import Network, contract
 from mpi4py import MPI
-from qibo import hamiltonians
-from qibo.symbols import I, X, Y, Z
 
 from qibotn.circuit_convertor import QiboCircuitToEinsum
 from qibotn.circuit_to_mps import QiboCircuitToMPS
 from qibotn.mps_contraction_helper import MPSContractionHelper
-
-
-def check_observable(observable, circuit_nqubit):
-    """Checks the type of observable and returns the appropriate Hamiltonian."""
-    if observable is None:
-        return build_observable(circuit_nqubit)
-    elif isinstance(observable, dict):
-        return create_hamiltonian_from_dict(observable, circuit_nqubit)
-    elif isinstance(observable, hamiltonians.SymbolicHamiltonian):
-        # TODO: check if the observable is compatible with the circuit
-        return observable
-    else:
-        raise TypeError("Invalid observable type.")
-
-
-def build_observable(circuit_nqubit):
-    """Helper function to construct a target observable."""
-    hamiltonian_form = 0
-    for i in range(circuit_nqubit):
-        hamiltonian_form += 0.5 * X(i % circuit_nqubit) * Z((i + 1) % circuit_nqubit)
-
-    hamiltonian = hamiltonians.SymbolicHamiltonian(form=hamiltonian_form)
-    return hamiltonian
-
-
-def create_hamiltonian_from_dict(data, circuit_nqubit):
-    """Create a Qibo SymbolicHamiltonian from a dictionary representation.
-
-    Ensures that each Hamiltonian term explicitly acts on all circuit qubits
-    by adding identity (`I`) gates where needed.
-
-    Args:
-        data (dict): Dictionary containing Hamiltonian terms.
-        circuit_nqubit (int): Total number of qubits in the quantum circuit.
-
-    Returns:
-        hamiltonians.SymbolicHamiltonian: The constructed Hamiltonian.
-    """
-    PAULI_GATES = {"X": X, "Y": Y, "Z": Z}
-
-    terms = []
-
-    for term in data["terms"]:
-        coeff = term["coefficient"]
-        operators = term["operators"]  # List of tuples like [("Z", 0), ("X", 1)]
-
-        # Convert the operator list into a dictionary {qubit_index: gate}
-        operator_dict = {q: PAULI_GATES[g] for g, q in operators}
-
-        # Build the full term ensuring all qubits are covered
-        full_term_expr = [
-            operator_dict[q](q) if q in operator_dict else I(q)
-            for q in range(circuit_nqubit)
-        ]
-
-        # Multiply all operators together to form a single term
-        term_expr = full_term_expr[0]
-        for op in full_term_expr[1:]:
-            term_expr *= op
-
-        # Scale by the coefficient
-        final_term = coeff * term_expr
-        terms.append(final_term)
-
-    if not terms:
-        raise ValueError("No valid Hamiltonian terms were added.")
-
-    # Combine all terms
-    hamiltonian_form = sum(terms)
-
-    return hamiltonians.SymbolicHamiltonian(hamiltonian_form)
+from qibotn.observables import (
+    build_observable,
+    check_observable,
+    create_hamiltonian_from_dict,
+    extract_gates_and_qubits,
+)
 
 
 def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
@@ -111,45 +44,6 @@ def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
     return gates
 
 
-def extract_gates_and_qubits(hamiltonian):
-    """
-    Extracts the gates and their corresponding qubits from a Qibo Hamiltonian.
-
-    Parameters:
-        hamiltonian (qibo.hamiltonians.Hamiltonian or qibo.hamiltonians.SymbolicHamiltonian):
-            A Qibo Hamiltonian object.
-
-    Returns:
-        list of tuples: [(coefficient, [(gate, qubit), ...]), ...]
-            - coefficient: The prefactor of the term.
-            - list of (gate, qubit): Each term's gates and the qubits they act on.
-    """
-    extracted_terms = []
-
-    if isinstance(hamiltonian, hamiltonians.SymbolicHamiltonian):
-        for term in hamiltonian.terms:
-            coeff = term.coefficient  # Extract coefficient
-            gate_qubit_list = []
-
-            # Extract gate and qubit information
-            for factor in term.factors:
-                gate_name = str(factor)[
-                    0
-                ]  # Extract the gate type (X, Y, Z) from 'X0', 'Z1'
-                qubit = int(str(factor)[1:])  # Extract the qubit index
-                gate_qubit_list.append((qubit, gate_name, coeff))
-                coeff = 1.0
-
-            extracted_terms.append(gate_qubit_list)
-
-    else:
-        raise ValueError(
-            "Unsupported Hamiltonian type. Must be SymbolicHamiltonian or Hamiltonian."
-        )
-
-    return extracted_terms
-
-
 def initialize_mpi():
     """Initialize MPI communication and device selection."""
     comm = MPI.COMM_WORLD
diff --git a/src/qibotn/observables.py b/src/qibotn/observables.py
new file mode 100644
index 0000000..309ace4
--- /dev/null
+++ b/src/qibotn/observables.py
@@ -0,0 +1,71 @@
+"""Observable helpers shared by tensor-network backends and benchmarks."""
+
+from qibo import hamiltonians
+from qibo.symbols import I, X, Y, Z
+
+
+def check_observable(observable, circuit_nqubit):
+    """Checks the type of observable and returns the appropriate Hamiltonian."""
+    if observable is None:
+        return build_observable(circuit_nqubit)
+    if isinstance(observable, dict):
+        return create_hamiltonian_from_dict(observable, circuit_nqubit)
+    if isinstance(observable, hamiltonians.SymbolicHamiltonian):
+        return observable
+    raise TypeError("Invalid observable type.")
+
+
+def build_observable(circuit_nqubit):
+    """Construct the default benchmark observable used by qibotn."""
+    hamiltonian_form = 0
+    for i in range(circuit_nqubit):
+        hamiltonian_form += 0.5 * X(i % circuit_nqubit) * Z((i + 1) % circuit_nqubit)
+
+    return hamiltonians.SymbolicHamiltonian(form=hamiltonian_form)
+
+
+def create_hamiltonian_from_dict(data, circuit_nqubit):
+    """Create a Qibo SymbolicHamiltonian from the qibotn dict representation."""
+    pauli_gates = {"X": X, "Y": Y, "Z": Z}
+    terms = []
+
+    for term in data["terms"]:
+        coeff = term["coefficient"]
+        operators = term["operators"]
+        operator_dict = {q: pauli_gates[g] for g, q in operators}
+
+        full_term_expr = [
+            operator_dict[q](q) if q in operator_dict else I(q)
+            for q in range(circuit_nqubit)
+        ]
+
+        term_expr = full_term_expr[0]
+        for op in full_term_expr[1:]:
+            term_expr *= op
+
+        terms.append(coeff * term_expr)
+
+    if not terms:
+        raise ValueError("No valid Hamiltonian terms were added.")
+
+    return hamiltonians.SymbolicHamiltonian(sum(terms))
+
+
+def extract_gates_and_qubits(hamiltonian):
+    """Extract per-term Pauli factors from a Qibo SymbolicHamiltonian.
+
+    Returns list of terms, where each term is (coefficient, [(qubit, gate_name), ...]).
+    """
+    extracted_terms = []
+
+    if not isinstance(hamiltonian, hamiltonians.SymbolicHamiltonian):
+        raise ValueError(
+            "Unsupported Hamiltonian type. Must be SymbolicHamiltonian or Hamiltonian."
+        )
+
+    for term in hamiltonian.terms:
+        coeff = term.coefficient
+        factors = [(int(str(f)[1:]), str(f)[0]) for f in term.factors]
+        extracted_terms.append((coeff, factors))
+
+    return extracted_terms
diff --git a/src/qibotn/parallel.py b/src/qibotn/parallel.py
new file mode 100644
index 0000000..eab6b43
--- /dev/null
+++ b/src/qibotn/parallel.py
@@ -0,0 +1,148 @@
+"""Parallel path search and contraction utilities for tensor networks."""
+import os
+import pickle
+import signal
+import numpy as np
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+try:
+    from mpi4py import MPI
+    _HAVE_MPI = True
+except ImportError:
+    _HAVE_MPI = False
+    MPI = None
+
+
+def _serial_search(tn_bytes, output_inds, repeats, seed, max_time):
+    """Single-process path search with cotengra."""
+    import random
+    import cotengra as ctg
+    random.seed(seed)
+    tn = pickle.loads(tn_bytes)
+    opt = ctg.HyperOptimizer(
+        methods=["kahypar", "kahypar-agglom", "spinglass"],
+        max_repeats=repeats,
+        parallel=False,
+        minimize="combo-256",
+        max_time=max_time,
+        optlib="random",
+        progbar=False,
+    )
+    tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
+    return tree.combo_cost(factor=256), tree
+
+
+def parallel_path_search(tn, output_inds, method='processpool', total_repeats=1024,
+                        max_time=300, n_workers=48):
+    """Parallel contraction path search.
+
+    Args:
+        tn: Tensor network (quimb TensorNetwork)
+        output_inds: Output indices for contraction
+        method: 'processpool' or 'mpi'
+        total_repeats: Total optimization repeats
+        max_time: Timeout per worker (seconds)
+        n_workers: Number of workers (processpool only)
+
+    Returns:
+        Best contraction tree found
+    """
+    if method == 'mpi':
+        if not _HAVE_MPI:
+            raise ImportError("mpi4py not available")
+        return _mpi_search(tn, output_inds, total_repeats, max_time)
+    elif method == 'processpool':
+        return _processpool_search(tn, output_inds, total_repeats, n_workers, max_time)
+    else:
+        raise ValueError(f"Unknown method: {method}")
+
+
+def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time):
+    """ProcessPool-based parallel search."""
+    tn_bytes = pickle.dumps(tn)
+    repeats_per = max(1, total_repeats // n_workers)
+    pool = ProcessPoolExecutor(max_workers=n_workers)
+    futures = [
+        pool.submit(_serial_search, tn_bytes, output_inds, repeats_per, seed, max_time)
+        for seed in range(n_workers)
+    ]
+    best_cost, best_tree = float("inf"), None
+    try:
+        for fut in as_completed(futures, timeout=max_time + 5):
+            try:
+                cost, tree = fut.result()
+                if cost < best_cost:
+                    best_cost, best_tree = cost, tree
+            except Exception:
+                pass
+    except TimeoutError:
+        pass
+    finally:
+        for fut in futures:
+            fut.cancel()
+        for pid in list(pool._processes.keys()):
+            try:
+                os.kill(pid, signal.SIGKILL)
+            except ProcessLookupError:
+                pass
+        pool.shutdown(wait=False)
+    return best_tree
+
+
+def _mpi_search(tn, output_inds, total_repeats, max_time):
+    """MPI-based parallel search across ranks."""
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    tn_bytes = pickle.dumps(tn)
+    repeats_per = max(1, total_repeats // size)
+    local_cost, local_tree = _serial_search(tn_bytes, output_inds, repeats_per, rank, max_time)
+    all_results = comm.gather((local_cost, local_tree), root=0)
+    best_tree = None
+    if rank == 0:
+        best_cost = float("inf")
+        for cost, tree in all_results:
+            if tree is not None and cost < best_cost:
+                best_cost, best_tree = cost, tree
+    best_tree = comm.bcast(best_tree, root=0)
+    return best_tree
+
+
+def parallel_contract(tree, arrays, method='mpi', comm=None):
+    """Parallel sliced contraction.
+
+    Args:
+        tree: Contraction tree
+        arrays: List of tensor arrays
+        method: 'mpi' (only MPI supported for now)
+        comm: MPI communicator (required for MPI)
+
+    Returns:
+        Contracted result (on root rank for MPI)
+    """
+    if method == 'mpi':
+        if not _HAVE_MPI or comm is None:
+            raise ValueError("MPI method requires mpi4py and comm")
+        return _contract_mpi(tree, arrays, comm)
+    else:
+        raise ValueError(f"Unknown method: {method}")
+
+
+def _contract_mpi(tree, arrays, comm, root=0):
+    """Distribute contraction slices across MPI ranks."""
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    is_torch = type(arrays[0]).__module__.startswith("torch")
+
+    result_np = None
+    for i in range(rank, tree.multiplicity, size):
+        x = tree.contract_slice(arrays, i)
+        x_np = np.asfortranarray(x.detach().cpu().numpy() if is_torch else np.asarray(x))
+        result_np = x_np if result_np is None else result_np + x_np
+
+    if result_np is None:
+        result_np = np.zeros(1, dtype=np.complex128)
+
+    result = np.zeros_like(result_np) if rank == root else None
+    comm.Reduce(result_np, result, root=root)
+    return result
diff --git a/sweep_bond_32q.py b/sweep_bond_32q.py
deleted file mode 100644
index 5aae456..0000000
--- a/sweep_bond_32q.py
+++ /dev/null
@@ -1,39 +0,0 @@
-"""Bond dimension sweep for 32-qubit variational circuit."""
-import os
-import sys
-import numpy as np
-
-sys.path.insert(0, os.path.dirname(__file__))
-from benchmark_mps import make_circuit, run_qibojit, run_quimb_mps, compare, jit_cache_path, DATA_DIR
-
-NQUBITS = 32
-NLAYERS = 5
-BOND_VALUES = [1, 8, 16, 32, 64, 128, 256]
-SVD_CUTOFF = 1e-6
-OPTIMIZER = "auto-hq"
-
-if __name__ == "__main__":
-    cache_path = jit_cache_path("variational", NQUBITS, NLAYERS)
-
-    if os.path.exists(cache_path):
-        sv_ref = np.load(cache_path)
-        print(f"[qibojit] loaded from cache: {cache_path}\n")
-    else:
-        np.random.seed(42)
-        circuit_ref = make_circuit("variational", NQUBITS, NLAYERS)
-        sv_ref, t_ref = run_qibojit(circuit_ref)
-        np.save(cache_path, sv_ref)
-        print(f"[qibojit] time={t_ref:.4f}s  (saved to {cache_path})\n")
-
-    print(f"{'bond':>6}  {'time(s)':>10}  {'fidelity':>12}  {'l2_err':>10}")
-    print("-" * 46)
-
-    for bond in BOND_VALUES:
-        np.random.seed(42)
-        circuit_mps = make_circuit("variational", NQUBITS, NLAYERS)
-        try:
-            sv_mps, t_mps = run_quimb_mps(circuit_mps, bond, SVD_CUTOFF, OPTIMIZER)
-            fidelity, l2_err = compare(sv_ref, sv_mps)
-            print(f"{bond:>6}  {t_mps:>10.4f}  {fidelity:>12.8f}  {l2_err:>10.2e}")
-        except Exception as e:
-            print(f"{bond:>6}  FAILED: {e}")
diff --git a/tests/test_expectation.py b/tests/test_expectation.py
index d89c6f9..0cd6ac2 100644
--- a/tests/test_expectation.py
+++ b/tests/test_expectation.py
@@ -35,7 +35,7 @@ def test_observable_expval(backend, nqubits):
     numpy_backend = construct_backend("numpy")
     ham, ham_form = build_observable(nqubits)
     circ = build_circuit(nqubits=nqubits, nlayers=1)
-
+    
     exact_expval = numpy_backend.calculate_expectation_state(
         hamiltonian=ham,
         state=circ().state(),