diff --git a/.gitignore b/.gitignore
index 2fee375..e6de13b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,7 +5,8 @@ __pycache__/
 data/
 # C extensions
 *.so
-
+bak/
+perf*
 # Distribution / packaging
 .Python
 build/
diff --git a/bench_profile.py b/bench_profile.py
new file mode 100644
index 0000000..26f8ac7
--- /dev/null
+++ b/bench_profile.py
@@ -0,0 +1,460 @@
+"""Benchmark: qibotn/quimb generic TN — single-process torch profiling version."""
+
+import os
+import pickle
+import time
+import argparse
+import numpy as np
+import cotengra as ctg
+import qibo
+from qibo import Circuit, gates
+
+
+def make_circuit(circuit_type, nqubits, nlayers=1):
+    c = Circuit(nqubits)
+
+    if circuit_type == "qft":
+        from qibo.models import QFT
+        return QFT(nqubits)
+
+    elif circuit_type == "variational":
+        for layer in range(nlayers):
+            for q in range(nqubits):
+                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
+
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CZ(q, q + 1))
+
+    elif circuit_type == "ghz":
+        c.add(gates.H(0))
+        for q in range(nqubits - 1):
+            c.add(gates.CNOT(q, q + 1))
+
+    elif circuit_type == "brickwork":
+        for q in range(nqubits):
+            c.add(gates.H(q))
+
+        for layer in range(nlayers):
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CNOT(q, q + 1))
+                c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi)))
+                c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi)))
+
+    else:
+        raise ValueError(f"Unknown circuit: {circuit_type}")
+
+    return c
+
+
+def make_z_observable(nqubits):
+    """Z on qubit 0 only — single contraction for benchmarking."""
+    return ["z"], [(0,)], [1.0]
+
+
+def export_profiler_outputs(prof, trace_path):
+    """Export Chrome trace and text table."""
+    prof.export_chrome_trace(trace_path)
+
+    table_path = trace_path.replace(".json", ".txt")
+    with open(table_path, "w") as f:
+        f.write(
+            prof.key_averages().table(
+                sort_by="self_cpu_time_total",
+                row_limit=200,
+            )
+        )
+
+    print(f"  [torch profiler trace] {trace_path}")
+    print(f"  [torch profiler table] {table_path}")
+
+
+def run_quimb_tn(
+    circuit,
+    nqubits,
+    num_slices,
+    load_path=None,
+    save_path=None,
+):
+    """Mode: expval — compute <Z_0> via local_expectation."""
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    operators, sites, coeffs = make_z_observable(nqubits)
+    ops = b._string_to_quimb_operator(operators[0])
+
+    qc = b._qibo_circuit_to_quimb(
+        circuit,
+        quimb_circuit_type=b.circuit_ansatz,
+        gate_opts={"max_bond": None, "cutoff": 1e-10},
+    )
+
+    if load_path:
+        with open(load_path, "rb") as f:
+            saved = pickle.load(f)
+        tree = saved["tree"]
+        t_search = 0.0
+        print(f"  [path loaded]  {load_path}")
+    else:
+        opt = ctg.HyperOptimizer(
+            methods=["kahypar", "random-greedy", "spinglass"],
+            max_repeats=16,
+            parallel=True,
+            max_time=60,
+            slicing_opts={"target_slices": num_slices},
+            progbar=True,
+        )
+
+        t0 = time.time()
+        rehearsal = qc.local_expectation(
+            ops,
+            where=sites[0],
+            optimize=opt,
+            simplify_sequence="R",
+            rehearse=True,
+        )
+        t_search = time.time() - t0
+        tree = rehearsal["tree"]
+
+        print(
+            f"  [path search]  {t_search:.3f}s  "
+            f"flops~2^{tree.contraction_cost():.2f}  "
+            f"size~2^{tree.contraction_width():.2f}  "
+            f"slices={tree.multiplicity}"
+        )
+
+        if save_path:
+            with open(save_path, "wb") as f:
+                pickle.dump({"tree": tree}, f)
+            print(f"  [path saved]  {save_path}")
+
+    t0 = time.time()
+    expval = qc.local_expectation(
+        ops,
+        where=sites[0],
+        optimize=tree,
+        simplify_sequence="R",
+    )
+    t_contract = time.time() - t0
+
+    print(f"  [contraction]  {t_contract:.3f}s")
+
+    return float(expval.real), t_search + t_contract
+
+
+def run_quimb_tn_statevector(
+    circuit,
+    nqubits,
+    num_slices,
+    load_path=None,
+    save_path=None,
+    profile=False,
+    profile_dir="profiles",
+):
+    """Mode: statevector — contract full TN to dense vector, single process."""
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    import torch
+
+    qc = b._qibo_circuit_to_quimb(
+        circuit,
+        quimb_circuit_type=b.circuit_ansatz,
+        gate_opts={"max_bond": None, "cutoff": 1e-10},
+    )
+
+    # 让 quimb 生成 torch tensor，这样 torch.profiler 能抓到 aten op。
+    qc.to_backend = torch.from_numpy
+
+    if load_path:
+        with open(load_path, "rb") as f:
+            saved = pickle.load(f)
+        tree = saved["tree"]
+        t_search = 0.0
+        print(f"  [path loaded]  {load_path}")
+    else:
+        opt = ctg.HyperOptimizer(
+            methods=["kahypar", "random-greedy", "spinglass"],
+            max_repeats=500,
+            parallel=48,
+            max_time=100,
+            minimize="size",
+            slicing_opts={"target_slices": num_slices},
+            progbar=True,
+        )
+
+        t0 = time.time()
+        rehearsal = qc.to_dense(optimize=opt, rehearse=True)
+        t_search = time.time() - t0
+        tree = rehearsal["tree"]
+
+        print(
+            f"  [path search]  {t_search:.3f}s  "
+            f"flops~2^{tree.contraction_cost():.2f}  "
+            f"size~2^{tree.contraction_width():.2f}  "
+            f"slices={tree.multiplicity}"
+        )
+
+        if save_path:
+            with open(save_path, "wb") as f:
+                pickle.dump({"tree": tree}, f)
+            print(f"  [path saved]  {save_path}")
+
+    os.makedirs(profile_dir, exist_ok=True)
+
+    if profile:
+        from torch.profiler import profile as torch_profile
+        from torch.profiler import ProfilerActivity, record_function
+
+        trace_path = os.path.join(
+            profile_dir,
+            (
+                f"trace_statevector_"
+                f"{circuit.nqubits}q_"
+                f"slices{tree.multiplicity}_"
+                f"{int(time.time())}.json"
+            ),
+        )
+
+        t0 = time.time()
+
+        with torch_profile(
+            activities=[ProfilerActivity.CPU],
+            record_shapes=True,
+            profile_memory=True,
+            with_stack=True,
+        ) as prof:
+            with record_function("qibotn_to_dense_contraction"):
+                sv = qc.to_dense(optimize=tree).reshape(-1)
+
+            with record_function("torch_to_numpy_view_or_copy"):
+                if type(sv).__module__.startswith("torch"):
+                    sv_tn = sv.detach().cpu().numpy()
+                else:
+                    sv_tn = np.asarray(sv)
+
+        t_contract = time.time() - t0
+
+        export_profiler_outputs(prof, trace_path)
+
+    else:
+        t0 = time.time()
+        sv = qc.to_dense(optimize=tree).reshape(-1)
+        t_contract = time.time() - t0
+
+        if type(sv).__module__.startswith("torch"):
+            sv_tn = sv.detach().cpu().numpy()
+        else:
+            sv_tn = np.asarray(sv)
+
+    print(f"  [contraction]  {t_contract:.3f}s")
+
+    return sv_tn, t_search + t_contract
+
+
+def run_quimb_tn_samples(circuit, nshots=1024):
+    """Mode: samples — sample from circuit output distribution."""
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    t0 = time.time()
+    result = b.execute_circuit(circuit, nshots=nshots)
+    t_total = time.time() - t0
+
+    print(f"  [sampling]     {t_total:.3f}s  nshots={nshots}")
+
+    try:
+        freqs = result.frequencies()
+        print(f"  top states: {dict(list(freqs.items())[:5])}")
+    except Exception:
+        pass
+
+    return result, t_total
+
+
+def qibojit_expval(circuit, nqubits):
+    """Compute <Z_0> via qibojit statevector."""
+    qibo.set_backend("qibojit", platform="numba")
+
+    t0 = time.time()
+    result = circuit()
+    elapsed = time.time() - t0
+
+    sv = np.array(result.state(), dtype=complex).flatten()
+    probs = np.abs(sv) ** 2
+
+    bits = (np.arange(len(probs)) >> (nqubits - 1)) & 1
+    expval = float(np.dot(probs, 1 - 2 * bits))
+
+    return expval, elapsed
+
+
+def run_qibojit(circuit):
+    """Compute full statevector via qibojit."""
+    qibo.set_backend("qibojit", platform="numba")
+
+    t0 = time.time()
+    result = circuit()
+    elapsed = time.time() - t0
+
+    sv = np.array(result.state(), dtype=complex).flatten()
+
+    return sv, elapsed
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--nqubits", type=int, default=10)
+    parser.add_argument(
+        "--circuit",
+        type=str,
+        default="qft",
+        choices=["qft", "variational", "ghz", "brickwork"],
+    )
+    parser.add_argument("--nlayers", type=int, default=3)
+    parser.add_argument("--num-slices", type=int, default=1)
+    parser.add_argument("--nshots", type=int, default=1024)
+
+    parser.add_argument(
+        "--mode",
+        type=str,
+        default="statevector",
+        choices=["expval", "statevector", "samples"],
+        help="expval: local_expectation; statevector: to_dense; samples: sampling",
+    )
+
+    parser.add_argument(
+        "--no-compare",
+        action="store_true",
+        help="Skip qibojit reference run",
+    )
+
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default=None,
+        help="Save contraction tree to a pickle file",
+    )
+
+    parser.add_argument(
+        "--load-path",
+        type=str,
+        default=None,
+        help="Load contraction tree from a pickle file and skip path search",
+    )
+
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Enable torch profiler for statevector contraction stage",
+    )
+
+    parser.add_argument(
+        "--profile-dir",
+        type=str,
+        default="profiles",
+        help="Directory to save torch profiler traces",
+    )
+
+    parser.add_argument(
+        "--save-statevector",
+        action="store_true",
+        help="Save TN statevector to data/sv_tn_*.npy",
+    )
+
+    args = parser.parse_args()
+
+    print(
+        f"Circuit: {args.circuit}, "
+        f"nqubits={args.nqubits}, "
+        f"nlayers={args.nlayers}, "
+        f"mode={args.mode}, "
+        f"profile={args.profile}"
+    )
+
+    np.random.seed(42)
+    circuit_tn = make_circuit(args.circuit, args.nqubits, args.nlayers)
+
+    try:
+        if args.mode == "expval":
+            expval_tn, t_tn = run_quimb_tn(
+                circuit_tn,
+                args.nqubits,
+                args.num_slices,
+                load_path=args.load_path,
+                save_path=args.save_path,
+            )
+
+            print(f"\n[quimb TN]  time={t_tn:.4f}s  <Z_0>={expval_tn:.8f}")
+
+        elif args.mode == "statevector":
+            sv_tn, t_tn = run_quimb_tn_statevector(
+                circuit_tn,
+                args.nqubits,
+                args.num_slices,
+                load_path=args.load_path,
+                save_path=args.save_path,
+                profile=args.profile,
+                profile_dir=args.profile_dir,
+            )
+
+            print(
+                f"\n[quimb TN]  time={t_tn:.4f}s  "
+                f"statevector shape={sv_tn.shape}"
+            )
+
+            if args.save_statevector:
+                os.makedirs("data", exist_ok=True)
+                out_path = f"data/sv_tn_{args.circuit}{args.nqubits}.npy"
+                np.save(out_path, sv_tn)
+                print(f"[saved statevector] {out_path}")
+
+        else:
+            _, t_tn = run_quimb_tn_samples(
+                circuit_tn,
+                nshots=args.nshots,
+            )
+
+            print(f"\n[quimb TN]  time={t_tn:.4f}s")
+            args.no_compare = True
+
+    except Exception as e:
+        print(f"[quimb TN] FAILED: {e}")
+        raise
+
+    if not args.no_compare and args.mode != "statevector":
+        np.random.seed(42)
+        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+
+        expval_ref, t_ref = qibojit_expval(circuit_ref, args.nqubits)
+
+        print(f"[qibojit]   time={t_ref:.4f}s  <Z_0>={expval_ref:.8f}")
+        print(f"\nDiff     : {abs(expval_tn - expval_ref):.2e}")
+
+        if t_tn > 0:
+            print(f"Speedup  : {t_ref / t_tn:.2f}x")
+
+    elif not args.no_compare and args.mode == "statevector" and sv_tn is not None:
+        np.random.seed(42)
+        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+
+        sv_ref, t_ref = run_qibojit(circuit_ref)
+
+        fid = abs(np.dot(sv_ref.conj(), sv_tn)) ** 2
+        l2_err = np.linalg.norm(sv_ref - sv_tn)
+
+        print(f"[qibojit]   time={t_ref:.4f}s")
+        print(f"Fidelity : {fid:.8f}  (1=perfect)")
+        print(f"L2 error : {l2_err:.2e}")
+
+        if t_tn > 0:
+            print(f"Speedup  : {t_ref / t_tn:.2f}x")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/benchmark_mps.py b/benchmark_mps.py
index 90c8275..827fd77 100644
--- a/benchmark_mps.py
+++ b/benchmark_mps.py
@@ -4,16 +4,17 @@ import argparse
 import os
 import numpy as np
 import qibo
+import quimb.tensor as qtn
 from qibo import Circuit, gates
 
 DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
 
 
-def make_circuit(circuit_type, nqubits, nlayers=1):
+def make_circuit(circuit_type, nqubits, nlayers=1, add_measurements=False):
     c = Circuit(nqubits)
     if circuit_type == "qft":
         from qibo.models import QFT
-        return QFT(nqubits)
+        c = QFT(nqubits)
     elif circuit_type == "variational":
         for layer in range(nlayers):
             for q in range(nqubits):
@@ -27,6 +28,8 @@ def make_circuit(circuit_type, nqubits, nlayers=1):
             c.add(gates.CNOT(q, q + 1))
     else:
         raise ValueError(f"Unknown circuit: {circuit_type}")
+    if add_measurements:
+        c.add(gates.M(*range(nqubits)))
     return c
 
 
@@ -39,20 +42,58 @@ def run_qibojit(circuit):
     return sv, elapsed
 
 
-def run_quimb_mps(circuit, max_bond, svd_cutoff, optimizer):
+def run_quimb_mps(circuit, max_bond, svd_cutoff, optimizer, nshots=None):
     qibo.set_backend("qibotn", platform="quimb")
     b = qibo.get_backend()
     b.configure_tn_simulation(ansatz="mps", max_bond_dimension=max_bond, svd_cutoff=svd_cutoff)
     b.contractions_optimizer = optimizer
 
     t0 = time.time()
-    result = b.execute_circuit(circuit, return_array=True)
-    elapsed = time.time() - t0
-    sv = result.state()
-    return sv, elapsed
+    if nshots:
+        result = b.execute_circuit(circuit, nshots=nshots)
+        elapsed = time.time() - t0
+        return result.frequencies(), elapsed, 0.0
+    else:
+        # MPS simulation
+        circ_quimb = qtn.CircuitMPS.from_openqasm2_str(
+            circuit.to_qasm(),
+            gate_opts={"max_bond": max_bond, "cutoff": svd_cutoff},
+        )
+        t_mps = time.time() - t0
+        # to_dense separately
+        t1 = time.time()
+        #sv = circ_quimb.psi.to_dense().reshape(-1)
+        sv = None
+        t_dense = time.time() - t1
+        return sv, t_mps, t_dense
 
 
-def compare(sv_ref, sv_mps):
+def run_quimb_permmps(circuit, max_bond, svd_cutoff, nshots=None):
+    gates_list = [
+        qtn.Gate(g.name, params=list(g.parameters), qubits=list(g.qubits))
+        for g in circuit.queue
+        if g.name.lower() != "measure"
+    ]
+    t0 = time.time()
+    circ = qtn.CircuitPermMPS.from_gates(
+        gates_list,
+        N=circuit.nqubits,
+        max_bond=max_bond,
+        cutoff=svd_cutoff,
+    )
+    if nshots:
+        from collections import Counter
+        result = Counter(circ.sample(nshots))
+        elapsed = time.time() - t0
+        return dict(result), elapsed
+    else:
+        mps = circ.get_psi_unordered()
+        sv = mps.to_dense().reshape(-1)
+        elapsed = time.time() - t0
+        return sv, elapsed
+
+
+def compare_statevector(sv_ref, sv_mps):
     sv_ref = np.array(sv_ref, dtype=complex).flatten()
     sv_mps = np.array(sv_mps, dtype=complex).flatten()
     fidelity = abs(np.dot(sv_ref.conj(), sv_mps)) ** 2
@@ -60,6 +101,12 @@ def compare(sv_ref, sv_mps):
     return fidelity, l2_err
 
 
+def compare_frequencies(freq_ref, freq_mps, nshots):
+    all_keys = set(freq_ref) | set(freq_mps)
+    tvd = 0.5 * sum(abs(freq_ref.get(k, 0) - freq_mps.get(k, 0)) for k in all_keys) / nshots
+    return tvd
+
+
 def jit_cache_path(circuit_type, nqubits, nlayers):
     os.makedirs(DATA_DIR, exist_ok=True)
     return os.path.join(DATA_DIR, f"jit_{circuit_type}_n{nqubits}_l{nlayers}.npy")
@@ -74,37 +121,65 @@ def main():
     parser.add_argument("--max-bond", type=int, default=None,
                         help="Max bond dimension for MPS (None = unlimited)")
     parser.add_argument("--svd-cutoff", type=float, default=1e-6)
-    parser.add_argument("--optimizer", type=str, default="auto-hq")
+    parser.add_argument("--optimizer", type=str, default="eager")
+    parser.add_argument("--nshots", type=int, default=None,
+                        help="Use sampling mode with given number of shots instead of statevector")
+    parser.add_argument("--permmps", action="store_true",
+                        help="Use CircuitPermMPS directly instead of qibotn backend")
     parser.add_argument("--skip-jit", action="store_true",
                         help="Skip qibojit run, load cached statevector if available")
+    parser.add_argument("--no-compare", action="store_true",
+                        help="Skip correctness comparison entirely")
     args = parser.parse_args()
 
     print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}")
     print(f"MPS config: max_bond={args.max_bond}, svd_cutoff={args.svd_cutoff}, optimizer={args.optimizer}")
 
-    cache_path = jit_cache_path(args.circuit, args.nqubits, args.nlayers)
+    ref = None
     t_ref = None
 
-    if args.skip_jit and os.path.exists(cache_path):
-        sv_ref = np.load(cache_path)
-        print(f"\n[qibojit]  loaded from cache: {cache_path}")
-    else:
-        np.random.seed(42)
-        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
-        sv_ref, t_ref = run_qibojit(circuit_ref)
-        np.save(cache_path, sv_ref)
-        print(f"\n[qibojit]  time={t_ref:.4f}s  (saved to {cache_path})")
+    if not args.no_compare:
+        cache_path = jit_cache_path(args.circuit, args.nqubits, args.nlayers)
+        if args.nshots:
+            # frequency mode: run qibojit with same nshots
+            np.random.seed(42)
+            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers, add_measurements=True)
+            qibo.set_backend("qibojit", platform="numba")
+            t0 = time.time()
+            result_ref = circuit_ref(nshots=args.nshots)
+            t_ref = time.time() - t0
+            ref = dict(result_ref.frequencies())
+            print(f"\n[qibojit]  time={t_ref:.4f}s")
+        elif args.skip_jit and os.path.exists(cache_path):
+            ref = np.load(cache_path)
+            print(f"\n[qibojit]  loaded from cache: {cache_path}")
+        else:
+            np.random.seed(42)
+            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+            ref, t_ref = run_qibojit(circuit_ref)
+            np.save(cache_path, ref)
+            print(f"\n[qibojit]  time={t_ref:.4f}s  (saved to {cache_path})")
 
     np.random.seed(42)
     circuit_mps = make_circuit(args.circuit, args.nqubits, args.nlayers)
+    label = "quimb PermMPS" if args.permmps else "quimb MPS"
     try:
-        sv_mps, t_mps = run_quimb_mps(circuit_mps, args.max_bond, args.svd_cutoff, args.optimizer)
-        fidelity, l2_err = compare(sv_ref, sv_mps)
-        print(f"[quimb MPS] time={t_mps:.4f}s")
-        print(f"\nFidelity : {fidelity:.8f}  (1=perfect)")
-        print(f"L2 error : {l2_err:.2e}")
-        if t_ref is not None and t_mps > 0:
-            print(f"Speedup  : {t_ref/t_mps:.2f}x")
+        if args.permmps:
+            out, t_mps = run_quimb_permmps(circuit_mps, args.max_bond, args.svd_cutoff, args.nshots)
+            t_dense = 0.0
+        else:
+            out, t_mps, t_dense = run_quimb_mps(circuit_mps, args.max_bond, args.svd_cutoff, args.optimizer, args.nshots)
+        print(f"[{label}] MPS sim={t_mps:.4f}s  to_dense={t_dense:.4f}s  total={t_mps+t_dense:.4f}s")
+        if not args.no_compare:
+            if args.nshots:
+                tvd = compare_frequencies(ref, out, args.nshots)
+                print(f"\nTVD      : {tvd:.6f}  (0=perfect)")
+            else:
+                fidelity, l2_err = compare_statevector(ref, out)
+                print(f"\nFidelity : {fidelity:.8f}  (1=perfect)")
+                print(f"L2 error : {l2_err:.2e}")
+            if t_ref is not None and t_mps > 0:
+                print(f"Speedup  : {t_ref/t_mps:.2f}x")
     except Exception as e:
         print(f"[quimb MPS] FAILED: {e}")
         raise
diff --git a/benchmark_qmatchatea.py b/benchmark_qmatchatea.py
new file mode 100644
index 0000000..9e7923b
--- /dev/null
+++ b/benchmark_qmatchatea.py
@@ -0,0 +1,126 @@
+"""Benchmark: qibojit (reference) vs qibotn/qmatchatea MPS."""
+import time
+import argparse
+import os
+import numpy as np
+import qibo
+from qibo import Circuit, gates
+from qibo.backends import construct_backend
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+
+
+def make_circuit(circuit_type, nqubits, nlayers=1):
+    c = Circuit(nqubits)
+    if circuit_type == "qft":
+        from qibo.models import QFT
+        return QFT(nqubits)
+    elif circuit_type == "variational":
+        for layer in range(nlayers):
+            for q in range(nqubits):
+                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CZ(q, q + 1))
+    elif circuit_type == "ghz":
+        c.add(gates.H(0))
+        for q in range(nqubits - 1):
+            c.add(gates.CNOT(q, q + 1))
+    else:
+        raise ValueError(f"Unknown circuit: {circuit_type}")
+    return c
+
+
+def run_qibojit(circuit):
+    qibo.set_backend("qibojit", platform="numba")
+    t0 = time.time()
+    result = circuit()
+    elapsed = time.time() - t0
+    return result.state(), elapsed
+
+
+def run_qmatchatea(circuit, max_bond, cut_ratio):
+    import qmatchatea, qtealeaves.observables
+    from qibo.backends import construct_backend as _cb
+    b = _cb(backend="qibotn", platform="qmatchatea")
+    b.configure_tn_simulation(ansatz="MPS", max_bond_dimension=max_bond, cut_ratio=cut_ratio)
+
+    qk_circuit = b._qibocirc_to_qiskitcirc(circuit)
+    run_qk_params = qmatchatea.preprocessing.qk_transpilation_params(False)
+    observables = qtealeaves.observables.TNObservables()
+    observables += qtealeaves.observables.TNState2File(name="temp", formatting="D")
+
+    t0 = time.time()
+    results = qmatchatea.run_simulation(
+        circ=qk_circuit,
+        convergence_parameters=b.convergence_params,
+        transpilation_parameters=run_qk_params,
+        backend=b.qmatchatea_backend,
+        observables=observables,
+    )
+    elapsed = time.time() - t0
+    tn_state = results.observables.get("tn_state")
+    if tn_state is None:
+        results.load_state()
+        tn_state = results.observables["tn_state"]
+    sv_obj = tn_state.to_statevector(qiskit_order=False, max_qubit_equivalent=40)
+    sv = np.array(sv_obj.elem, dtype=complex).flatten()
+    return sv, elapsed
+
+
+def compare(sv_ref, sv_mps):
+    sv_ref = np.array(sv_ref, dtype=complex).flatten()
+    fidelity = abs(np.dot(sv_ref.conj(), sv_mps)) ** 2
+    l2_err = np.linalg.norm(sv_ref - sv_mps)
+    return fidelity, l2_err
+
+
+def jit_cache_path(circuit_type, nqubits, nlayers):
+    os.makedirs(DATA_DIR, exist_ok=True)
+    return os.path.join(DATA_DIR, f"jit_{circuit_type}_n{nqubits}_l{nlayers}.npy")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=10)
+    parser.add_argument("--circuit", type=str, default="ghz",
+                        choices=["qft", "variational", "ghz"])
+    parser.add_argument("--nlayers", type=int, default=3)
+    parser.add_argument("--max-bond", type=int, default=64)
+    parser.add_argument("--cut-ratio", type=float, default=1e-6)
+    parser.add_argument("--skip-jit", action="store_true")
+    args = parser.parse_args()
+
+    print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}")
+    print(f"MPS config: max_bond={args.max_bond}, cut_ratio={args.cut_ratio}")
+
+    cache_path = jit_cache_path(args.circuit, args.nqubits, args.nlayers)
+    t_ref = None
+
+    if args.skip_jit and os.path.exists(cache_path):
+        sv_ref = np.load(cache_path)
+        print(f"\n[qibojit]  loaded from cache: {cache_path}")
+    else:
+        np.random.seed(42)
+        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+        sv_ref, t_ref = run_qibojit(circuit_ref)
+        np.save(cache_path, sv_ref)
+        print(f"\n[qibojit]  time={t_ref:.4f}s  (saved to {cache_path})")
+
+    np.random.seed(42)
+    circuit_mps = make_circuit(args.circuit, args.nqubits, args.nlayers)
+    try:
+        sv_mps, t_mps = run_qmatchatea(circuit_mps, args.max_bond, args.cut_ratio)
+        fidelity, l2_err = compare(sv_ref, sv_mps)
+        print(f"[qmatchatea] time={t_mps:.4f}s")
+        print(f"\nFidelity : {fidelity:.8f}  (1=perfect)")
+        print(f"L2 error : {l2_err:.2e}")
+        if t_ref is not None and t_mps > 0:
+            print(f"Speedup  : {t_ref/t_mps:.2f}x")
+    except Exception as e:
+        print(f"[qmatchatea] FAILED: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark_tn.py b/benchmark_tn.py
index 4daa6fb..6dae804 100644
--- a/benchmark_tn.py
+++ b/benchmark_tn.py
@@ -1,4 +1,5 @@
 """Benchmark: qibotn/quimb generic TN — expectation values."""
+import pickle
 import time
 import argparse
 import numpy as np
@@ -22,6 +23,15 @@ def make_circuit(circuit_type, nqubits, nlayers=1):
         c.add(gates.H(0))
         for q in range(nqubits - 1):
             c.add(gates.CNOT(q, q + 1))
+    elif circuit_type == "brickwork":
+        for q in range(nqubits):
+            c.add(gates.H(q))
+        for layer in range(nlayers):
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CNOT(q, q + 1))
+                c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi)))
+                c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi)))
     else:
         raise ValueError(f"Unknown circuit: {circuit_type}")
     return c
@@ -33,80 +43,305 @@ def make_z_observable(nqubits):
     return ["z"], [(0,)], [1.0]
 
 
-def run_quimb_tn(circuit, nqubits):
+def run_quimb_tn(circuit, nqubits, num_slices, load_path=None, save_path=None):
+    """Mode: expval — compute <Z_0> via local_expectation (lightcone pruning)."""
     qibo.set_backend("qibotn", platform="quimb")
     b = qibo.get_backend()
-    b.configure_tn_simulation(ansatz="tn")  # generic TN, no MPS
-    #if max_time is not None:
-     #   opt = ctg.HyperOptimizer(max_repeats=128, max_time=max_time, minimize=minimize, parallel=True)
-    #else:
-    opt = ctg.HyperOptimizer(
-        max_repeats=16, 
-        parallel=True,                                                                                           
-        slicing_opts={'target_size': 2**24}, # 限制单个张量最大 2^28 个元素   
-        progbar=True
-        )
-    
-    b.contractions_optimizer = opt
+    b.configure_tn_simulation(ansatz="tn")
+
     operators, sites, coeffs = make_z_observable(nqubits)
+    ops = b._string_to_quimb_operator(operators[0])
+    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
+                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
+
+    if load_path:
+        with open(load_path, "rb") as f:
+            saved = pickle.load(f)
+        tree = saved["tree"]
+        t_search = 0.0
+        print(f"  [path loaded]  {load_path}")
+    else:
+        opt = ctg.HyperOptimizer(
+            methods=['kahypar', 'random-greedy', 'spinglass'],
+            max_repeats=16,
+            parallel=True,
+            max_time=60,
+            slicing_opts={'target_slices': num_slices},
+            progbar=True,
+        )
+        t0 = time.time()
+        rehearsal = qc.local_expectation(ops, where=sites[0], optimize=opt,
+                                         simplify_sequence="R", rehearse=True)
+        t_search = time.time() - t0
+        tree = rehearsal['tree']
+        print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}")
+
+        if save_path:
+            with open(save_path, "wb") as f:
+                pickle.dump({"tree": tree}, f)
+            print(f"  [path saved]  {save_path}")
+
     t0 = time.time()
-    expval = b.exp_value_observable_symbolic(circuit, operators, sites, coeffs, nqubits)
-    elapsed = time.time() - t0
-    return expval, elapsed
+    expval = qc.local_expectation(ops, where=sites[0], optimize=tree, simplify_sequence="R")
+    t_contract = time.time() - t0
+    print(f"  [contraction]  {t_contract:.3f}s")
+
+    return float(expval.real), t_search + t_contract
+
+
+def run_quimb_tn_statevector(circuit, nqubits, num_slices, load_path=None, save_path=None):
+    """Mode: statevector — contract full TN to dense vector."""
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    import torch
+    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
+                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
+    qc.to_backend = torch.from_numpy
+    if load_path:
+        with open(load_path, "rb") as f:
+            saved = pickle.load(f)
+        tree = saved["tree"]
+        t_search = 0.0
+        print(f"  [path loaded]  {load_path}")
+    else:
+        opt = ctg.HyperOptimizer(
+            methods=['kahypar', 'random-greedy', 'spinglass'],
+            max_repeats=128,
+            parallel=64,
+            max_time=100,
+            minimize='size',
+            slicing_opts={'target_slices': num_slices},
+            #slicing_opts={'target_size': 2**30},
+            progbar=True,
+        )
+        t0 = time.time()
+        rehearsal = qc.to_dense(optimize=opt, rehearse=True)
+        t_search = time.time() - t0
+        tree = rehearsal['tree']
+        print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}")
+
+        if save_path:
+            with open(save_path, "wb") as f:
+                pickle.dump({"tree": tree}, f)
+            print(f"  [path saved]  {save_path}")
+
+    t0 = time.time()
+    sv = qc.to_dense(optimize=tree,implementation="cotengra").reshape(-1)
+    t_contract = time.time() - t0
+    print(f"  [contraction]  {t_contract:.3f}s")
+    sv_tn = np.array(sv)
+    return sv_tn, t_search + t_contract
+
+
+def _contract_mpi(tree, arrays, comm, root=0):
+    """Contract slices via MPI, returning result as the same array type as input.
+
+    Unlike ``cotengra.ContractionTree.contract_mpi``, this works with any
+    array backend (numpy, torch, etc.) — it only converts to numpy at the
+    MPI-reduce boundary and converts back.
+    """
+    size = comm.Get_size()
+    rank = comm.Get_rank()
+
+    result_np = None
+    is_torch = type(arrays[0]).__module__.startswith("torch")
+
+    for i in range(rank, tree.multiplicity, size):
+        x = tree.contract_slice(arrays, i)
+        x_np = np.asfortranarray(x.detach().cpu().numpy() if is_torch else np.asarray(x))
+        
+        if result_np is None:
+            result_np = x_np
+        else:
+            result_np += x_np
+
+    if result_np is None:
+        result_np = np.zeros(1, dtype=np.complex64)
+
+    if rank == root:
+        result = np.zeros_like(result_np)
+    else:
+        result = None
+    comm.Reduce(result_np, result, root=root)
+
+    if rank == root:
+        import torch
+        return torch.from_numpy(np.asarray(result)) if is_torch else result
+    return None
+
+
+def run_quimb_tn_statevector_mpi(circuit, nqubits, num_slices, load_path=None, save_path=None):
+    """MPI-parallel statevector via custom MPI contraction (supports torch backend)."""
+    from mpi4py import MPI
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    import torch
+    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
+                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
+    qc.to_backend = torch.from_numpy
+
+    # path search on rank 0, broadcast to all
+    if rank == 0:
+        if load_path:
+            with open(load_path, "rb") as f:
+                saved = pickle.load(f)
+            tree = saved["tree"]
+            psi = saved["psi"]
+            t_search = 0.0
+            print(f"  [path loaded]  {load_path}")
+        else:
+            opt = ctg.HyperOptimizer(
+                methods=['kahypar', 'random-greedy', 'spinglass'],
+                max_repeats=128,
+                parallel=64,
+                #max_repeats=1,
+                max_time=100,
+                minimize='size',
+                slicing_opts={'target_slices': max(num_slices, size), 'allow_outer': False},
+                progbar=True,
+            )
+            t0 = time.time()
+            rehearsal = qc.to_dense(optimize=opt, rehearse=True)
+            t_search = time.time() - t0
+            tree = rehearsal['tree']
+            psi = rehearsal['tn']
+            print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}  slices={tree.multiplicity}")
+            if save_path:
+                with open(save_path, "wb") as f:
+                    pickle.dump({"tree": tree, "psi": psi}, f)
+                print(f"  [path saved]   {save_path}")
+    else:
+        tree = None
+        psi = None
+        t_search = 0.0
+
+    tree = comm.bcast(tree, root=0)
+    psi = comm.bcast(psi, root=0)
+    t_search = comm.bcast(t_search, root=0)
+
+    arrays = psi.arrays
+    t0 = time.time()
+    sv = _contract_mpi(tree, arrays, comm, root=0)
+    t_contract = time.time() - t0
+
+    if rank == 0:
+        print(f"  [contraction]  {t_contract:.3f}s")
+        return np.array(sv).reshape(-1), t_search + t_contract
+    return None, t_search + t_contract
+
+
+def run_quimb_tn_samples(circuit, nshots=1024):
+    """Mode: samples — sample from circuit output distribution."""
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    t0 = time.time()
+    result = b.execute_circuit(circuit, nshots=nshots)
+    t_total = time.time() - t0
+    print(f"  [sampling]     {t_total:.3f}s  nshots={nshots}")
+    print(f"  top states: {dict(list(result.frequencies().items())[:5])}")
+    return result, t_total
 
 
 def qibojit_expval(circuit, nqubits):
-    """Compute sum_i <Z_i> via qibojit statevector."""
+    """Compute <Z_0> via qibojit statevector."""
     qibo.set_backend("qibojit", platform="numba")
     t0 = time.time()
     result = circuit()
     elapsed = time.time() - t0
     sv = np.array(result.state(), dtype=complex).flatten()
     probs = np.abs(sv) ** 2
-    expval = sum(
-        probs[idx] * (1 - 2 * ((idx >> (nqubits - 1 - i)) & 1))
-        for i in range(nqubits)
-        for idx in range(len(probs))
-    )
-    return float(expval), elapsed
+    bits = (np.arange(len(probs)) >> (nqubits - 1)) & 1
+    expval = float(np.dot(probs, 1 - 2 * bits))
+    return expval, elapsed
+
+
+def run_qibojit(circuit):
+    """Compute full statevector via qibojit."""
+    qibo.set_backend("qibojit", platform="numba")
+    t0 = time.time()
+    result = circuit()
+    elapsed = time.time() - t0
+    sv = np.array(result.state(), dtype=complex).flatten()
+    return sv, elapsed
 
 
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--nqubits", type=int, default=10)
     parser.add_argument("--circuit", type=str, default="qft",
-                        choices=["qft", "variational", "ghz"])
+                        choices=["qft", "variational", "ghz", "brickwork"])
     parser.add_argument("--nlayers", type=int, default=3)
-    parser.add_argument("--optimizer", type=str, default="auto-hq")
-    parser.add_argument("--max-time", type=float, default=None,
-                        help="HyperOptimizer max search time (seconds); overrides --optimizer")
-    parser.add_argument("--minimize", type=str, default="flops",
-                        choices=["flops", "size", "write"],
-                        help="HyperOptimizer minimize target")
+    parser.add_argument("--num-slices", type=int, default=1)
+    parser.add_argument("--nshots", type=int, default=1024)
+    parser.add_argument("--mode", type=str, default="statevector",
+                        choices=["expval", "statevector", "samples"],
+                        help="expval: local_expectation; statevector: to_dense; samples: sampling")
+    parser.add_argument("--mpi", action="store_true",
+                        help="Use MPI-parallel contraction (run with mpirun -n N)")
     parser.add_argument("--no-compare", action="store_true",
                         help="Skip qibojit reference run")
+    parser.add_argument("--save-path", type=str, default=None,
+                        help="Save contraction tree to a pickle file")
+    parser.add_argument("--load-path", type=str, default=None,
+                        help="Load contraction tree from a pickle file (skip path search)")
     args = parser.parse_args()
 
-    print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}")
-    print(f"TN config: optimizer={args.optimizer}, max_time={args.max_time}, minimize={args.minimize}")
+    print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}, mode={args.mode}")
 
     np.random.seed(42)
     circuit_tn = make_circuit(args.circuit, args.nqubits, args.nlayers)
     try:
-        expval_tn, t_tn = run_quimb_tn(circuit_tn, args.nqubits)
-        print(f"\n[quimb TN]  time={t_tn:.4f}s  <sum Z_i>={expval_tn:.8f}")
+        if args.mode == "expval":
+            expval_tn, t_tn = run_quimb_tn(circuit_tn, args.nqubits, args.num_slices,
+                                           load_path=args.load_path, save_path=args.save_path)
+            print(f"\n[quimb TN]  time={t_tn:.4f}s  <Z_0>={expval_tn:.8f}")
+        elif args.mode == "statevector":
+            if args.mpi:
+                sv_tn, t_tn = run_quimb_tn_statevector_mpi(circuit_tn, args.nqubits, args.num_slices,
+                                                           load_path=args.load_path, save_path=args.save_path)
+            else:
+                sv_tn, t_tn = run_quimb_tn_statevector(circuit_tn, args.nqubits, args.num_slices,
+                                                       load_path=args.load_path, save_path=args.save_path)
+            if sv_tn is not None:
+                print(f"\n[quimb TN]  time={t_tn:.4f}s  statevector shape={sv_tn.shape}")
+                np.save(f"data/sv_tn_{args.circuit}{args.nqubits}.npy", sv_tn)
+        else:
+            _, t_tn = run_quimb_tn_samples(circuit_tn, args.nqubits, args.nshots)
+            print(f"\n[quimb TN]  time={t_tn:.4f}s")
+            args.no_compare = True  # samples 模式无法和 qibojit 期望值对比
     except Exception as e:
         print(f"[quimb TN] FAILED: {e}")
         raise
 
-    if not args.no_compare:
+    if not args.no_compare and args.mode != "statevector":
         np.random.seed(42)
         circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
         expval_ref, t_ref = qibojit_expval(circuit_ref, args.nqubits)
-        print(f"[qibojit]   time={t_ref:.4f}s  <sum Z_i>={expval_ref:.8f}")
+        print(f"[qibojit]   time={t_ref:.4f}s  <Z_0>={expval_ref:.8f}")
         print(f"\nDiff     : {abs(expval_tn - expval_ref):.2e}")
         if t_tn > 0:
             print(f"Speedup  : {t_ref/t_tn:.2f}x")
+    elif not args.no_compare and args.mode == "statevector" and sv_tn is not None:
+        np.random.seed(42)
+        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+        sv_ref, t_ref = run_qibojit(circuit_ref)
+        fid = abs(np.dot(sv_ref.conj(), sv_tn)) ** 2
+        l2_err = np.linalg.norm(sv_ref - sv_tn)
+        print(f"[qibojit]   time={t_ref:.4f}s")
+        print(f"Fidelity : {fid:.8f}  (1=perfect)")
+        print(f"L2 error : {l2_err:.2e}")
+        if t_tn > 0:
+            print(f"Speedup  : {t_ref/t_tn:.2f}x")
 
 
 if __name__ == "__main__":
diff --git a/benchmarks/benchmark_quimb.py b/benchmarks/benchmark_quimb.py
new file mode 100644
index 0000000..9566cdc
--- /dev/null
+++ b/benchmarks/benchmark_quimb.py
@@ -0,0 +1,519 @@
+"""Benchmark and profile the qibotn/quimb CPU backend.
+
+This script is intended to be the stable baseline for quimb backend
+optimization work. It supports:
+
+- multiple circuit families
+- MPS or generic TN execution
+- statevector, sampling, conversion, and local expectation workloads
+- warmup/repeat timing
+- optional correctness checks against qibojit
+- optional cProfile output
+"""
+
+from __future__ import annotations
+
+import argparse
+import cProfile
+import json
+import math
+import os
+import pstats
+import time
+from pathlib import Path
+from statistics import mean, pstdev
+
+import numpy as np
+import qibo
+from qibo import Circuit, gates
+
+
+def configure_runtime_env(quimb_num_procs: int | None, blas_threads: int | None):
+    """Pin process-level thread settings before heavy work starts."""
+    if quimb_num_procs is not None:
+        os.environ["QUIMB_NUM_PROCS"] = str(quimb_num_procs)
+    if blas_threads is not None:
+        value = str(blas_threads)
+        os.environ["OMP_NUM_THREADS"] = value
+        os.environ["OPENBLAS_NUM_THREADS"] = value
+        os.environ["MKL_NUM_THREADS"] = value
+        os.environ["NUMEXPR_NUM_THREADS"] = value
+
+
+def make_circuit(
+    circuit_type: str,
+    nqubits: int,
+    nlayers: int,
+    seed: int,
+    add_measurements: bool = False,
+) -> Circuit:
+    """Construct repeatable workloads covering low/high entanglement cases."""
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+
+    if circuit_type == "qft":
+        from qibo.models import QFT
+
+        circuit = QFT(nqubits)
+    elif circuit_type == "variational":
+        for layer in range(nlayers):
+            for qubit in range(nqubits):
+                circuit.add(gates.RY(qubit, theta=rng.uniform(0.0, 2.0 * np.pi)))
+            offset = layer % 2
+            for qubit in range(offset, nqubits - 1, 2):
+                circuit.add(gates.CZ(qubit, qubit + 1))
+    elif circuit_type == "ghz":
+        circuit.add(gates.H(0))
+        for qubit in range(nqubits - 1):
+            circuit.add(gates.CNOT(qubit, qubit + 1))
+    elif circuit_type == "qaoa":
+        for _ in range(nlayers):
+            for qubit in range(nqubits):
+                circuit.add(gates.RZ(qubit, theta=rng.uniform(0.0, 2.0 * np.pi)))
+            for qubit in range(0, nqubits - 1, 2):
+                circuit.add(gates.CZ(qubit, qubit + 1))
+            for qubit in range(nqubits):
+                circuit.add(gates.RX(qubit, theta=rng.uniform(0.0, 2.0 * np.pi)))
+    elif circuit_type == "ising1d":
+        for _ in range(nlayers):
+            for qubit in range(nqubits):
+                circuit.add(gates.RX(qubit, theta=rng.uniform(0.0, 2.0 * np.pi)))
+            for qubit in range(0, nqubits - 1, 2):
+                circuit.add(gates.CZ(qubit, qubit + 1))
+            for qubit in range(1, nqubits - 1, 2):
+                circuit.add(gates.CZ(qubit, qubit + 1))
+    elif circuit_type == "rcs":
+        cols = math.ceil(math.sqrt(nqubits))
+        rows = math.ceil(nqubits / cols)
+        single_qubit_gates = [gates.T, gates.X, gates.Y]
+        for layer in range(nlayers):
+            for qubit in range(nqubits):
+                gate_cls = single_qubit_gates[rng.integers(0, len(single_qubit_gates))]
+                circuit.add(gate_cls(qubit))
+            if layer % 2 == 0:
+                for row in range(rows):
+                    for col in range(0, cols - 1, 2):
+                        q1, q2 = row * cols + col, row * cols + col + 1
+                        if q2 < nqubits:
+                            circuit.add(gates.CZ(q1, q2))
+            else:
+                for row in range(0, rows - 1, 2):
+                    for col in range(cols):
+                        q1, q2 = row * cols + col, (row + 1) * cols + col
+                        if q2 < nqubits:
+                            circuit.add(gates.CZ(q1, q2))
+    else:
+        raise ValueError(f"Unknown circuit type: {circuit_type}")
+
+    if add_measurements:
+        circuit.add(gates.M(*range(nqubits)))
+    return circuit
+
+
+def prepare_quimb_backend(
+    ansatz: str,
+    max_bond: int | None,
+    svd_cutoff: float,
+    optimizer: str,
+    n_most_frequent_states: int,
+):
+    """Create and configure the qibotn/quimb backend once."""
+    qibo.set_backend("qibotn", platform="quimb")
+    backend = qibo.get_backend()
+    backend.configure_tn_simulation(
+        ansatz=ansatz,
+        max_bond_dimension=max_bond,
+        svd_cutoff=svd_cutoff,
+        n_most_frequent_states=n_most_frequent_states,
+    )
+    backend.contractions_optimizer = optimizer
+    return backend
+
+
+def run_qibojit_state(circuit: Circuit):
+    qibo.set_backend("qibojit", platform="numba")
+    t0 = time.perf_counter()
+    result = circuit()
+    elapsed = time.perf_counter() - t0
+    state = np.asarray(result.state(), dtype=complex).reshape(-1)
+    return state, elapsed
+
+
+def run_qibojit_shots(circuit: Circuit, nshots: int):
+    qibo.set_backend("qibojit", platform="numba")
+    t0 = time.perf_counter()
+    result = circuit(nshots=nshots)
+    elapsed = time.perf_counter() - t0
+    return dict(result.frequencies()), elapsed
+
+
+def z_expectation_from_statevector(statevector: np.ndarray, nqubits: int, qubit: int):
+    probs = np.abs(np.asarray(statevector).reshape(-1)) ** 2
+    bit_index = nqubits - qubit - 1
+    bits = (np.arange(len(probs)) >> bit_index) & 1
+    return float(np.dot(probs, 1.0 - 2.0 * bits))
+
+
+def fidelity_and_l2(reference: np.ndarray, candidate: np.ndarray):
+    ref = np.asarray(reference, dtype=complex).reshape(-1)
+    cand = np.asarray(candidate, dtype=complex).reshape(-1)
+    fidelity = abs(np.vdot(ref, cand)) ** 2
+    l2_error = np.linalg.norm(ref - cand)
+    return float(fidelity), float(l2_error)
+
+
+def total_variation_distance(reference: dict[str, int], candidate: dict[str, int], nshots: int):
+    keys = set(reference) | set(candidate)
+    return 0.5 * sum(abs(reference.get(key, 0) - candidate.get(key, 0)) for key in keys) / nshots
+
+
+def profile_callable(func, output_path: Path, sort_by: str):
+    """Profile a single invocation and dump textual stats."""
+    profiler = cProfile.Profile()
+    profiler.enable()
+    result = func()
+    profiler.disable()
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with output_path.open("w", encoding="utf-8") as stream:
+        stats = pstats.Stats(profiler, stream=stream)
+        stats.strip_dirs().sort_stats(sort_by).print_stats(80)
+        stats.print_callers(30)
+    return result
+
+
+def time_callable(func, repeats: int, warmup: int, profile_output: Path | None, profile_sort: str):
+    for _ in range(warmup):
+        func()
+
+    profiled_payload = None
+    if profile_output is not None:
+        profiled_payload = profile_callable(func, profile_output, profile_sort)
+
+    samples = []
+    payloads = []
+
+    for _ in range(repeats):
+        t0 = time.perf_counter()
+        payload = func()
+        elapsed = time.perf_counter() - t0
+        samples.append(elapsed)
+        payloads.append(payload)
+
+    final_payload = payloads[-1] if payloads else profiled_payload
+    return samples, final_payload
+
+
+def summarize_samples(samples: list[float]):
+    return {
+        "min_s": min(samples),
+        "mean_s": mean(samples),
+        "max_s": max(samples),
+        "std_s": pstdev(samples) if len(samples) > 1 else 0.0,
+        "repeats": len(samples),
+    }
+
+
+def workload_state(args):
+    circuit = make_circuit(args.circuit, args.nqubits, args.nlayers, args.seed)
+    backend = prepare_quimb_backend(
+        ansatz=args.ansatz,
+        max_bond=args.max_bond,
+        svd_cutoff=args.svd_cutoff,
+        optimizer=args.optimizer,
+        n_most_frequent_states=args.topk,
+    )
+
+    def run_once():
+        result = backend.execute_circuit(circuit, return_array=True)
+        return np.asarray(result.statevector).reshape(-1)
+
+    samples, statevector = time_callable(
+        run_once, args.repeats, args.warmup, args.profile_output, args.profile_sort
+    )
+    summary = summarize_samples(samples)
+
+    correctness = None
+    if not args.no_compare:
+        ref_state, ref_time = run_qibojit_state(circuit)
+        fidelity, l2_error = fidelity_and_l2(ref_state, statevector)
+        correctness = {
+            "qibojit_time_s": ref_time,
+            "fidelity": fidelity,
+            "l2_error": l2_error,
+        }
+
+    return summary, correctness
+
+
+def workload_shots(args):
+    circuit = make_circuit(
+        args.circuit, args.nqubits, args.nlayers, args.seed, add_measurements=True
+    )
+    backend = prepare_quimb_backend(
+        ansatz=args.ansatz,
+        max_bond=args.max_bond,
+        svd_cutoff=args.svd_cutoff,
+        optimizer=args.optimizer,
+        n_most_frequent_states=args.topk,
+    )
+
+    def run_once():
+        result = backend.execute_circuit(circuit, nshots=args.nshots)
+        return dict(result.frequencies())
+
+    samples, frequencies = time_callable(
+        run_once, args.repeats, args.warmup, args.profile_output, args.profile_sort
+    )
+    summary = summarize_samples(samples)
+
+    correctness = None
+    if not args.no_compare:
+        ref_freq, ref_time = run_qibojit_shots(circuit, args.nshots)
+        correctness = {
+            "qibojit_time_s": ref_time,
+            "tvd": total_variation_distance(ref_freq, frequencies, args.nshots),
+        }
+
+    return summary, correctness
+
+
+def workload_convert(args):
+    circuit = make_circuit(args.circuit, args.nqubits, args.nlayers, args.seed)
+    backend = prepare_quimb_backend(
+        ansatz=args.ansatz,
+        max_bond=args.max_bond,
+        svd_cutoff=args.svd_cutoff,
+        optimizer=args.optimizer,
+        n_most_frequent_states=args.topk,
+    )
+
+    def run_once():
+        quimb_circuit = backend._qibo_circuit_to_quimb(  # pylint: disable=protected-access
+            circuit,
+            quimb_circuit_type=backend.circuit_ansatz,
+            gate_opts={"max_bond": backend.max_bond_dimension, "cutoff": backend.svd_cutoff},
+        )
+        return len(quimb_circuit.gates)
+
+    samples, gate_count = time_callable(
+        run_once, args.repeats, args.warmup, args.profile_output, args.profile_sort
+    )
+    summary = summarize_samples(samples)
+    summary["gate_count"] = gate_count
+    return summary, None
+
+
+def workload_expectation(args):
+    circuit = make_circuit(args.circuit, args.nqubits, args.nlayers, args.seed)
+    backend = prepare_quimb_backend(
+        ansatz=args.ansatz,
+        max_bond=args.max_bond,
+        svd_cutoff=args.svd_cutoff,
+        optimizer=args.optimizer,
+        n_most_frequent_states=args.topk,
+    )
+    operators = ["z"]
+    sites = [(args.observable_qubit,)]
+    coeffs = [1.0]
+
+    def run_once():
+        return float(
+            backend.exp_value_observable_symbolic(
+                circuit, operators, sites, coeffs, args.nqubits
+            )
+        )
+
+    samples, expval = time_callable(
+        run_once, args.repeats, args.warmup, args.profile_output, args.profile_sort
+    )
+    summary = summarize_samples(samples)
+
+    correctness = None
+    if not args.no_compare:
+        ref_state, ref_time = run_qibojit_state(circuit)
+        correctness = {
+            "qibojit_time_s": ref_time,
+            "reference_expval": z_expectation_from_statevector(
+                ref_state, args.nqubits, args.observable_qubit
+            ),
+            "abs_error": abs(
+                z_expectation_from_statevector(ref_state, args.nqubits, args.observable_qubit)
+                - expval
+            ),
+        }
+
+    return summary, correctness
+
+
+def workload_raw_local_exp(args):
+    circuit = make_circuit(args.circuit, args.nqubits, args.nlayers, args.seed)
+    backend = prepare_quimb_backend(
+        ansatz=args.ansatz,
+        max_bond=args.max_bond,
+        svd_cutoff=args.svd_cutoff,
+        optimizer=args.optimizer,
+        n_most_frequent_states=args.topk,
+    )
+
+    def run_once():
+        metrics = {}
+        t0 = time.perf_counter()
+        quimb_circuit = backend._qibo_circuit_to_quimb(  # pylint: disable=protected-access
+            circuit,
+            quimb_circuit_type=backend.circuit_ansatz,
+            gate_opts={"max_bond": backend.max_bond_dimension, "cutoff": backend.svd_cutoff},
+        )
+        metrics["convert_s"] = time.perf_counter() - t0
+
+        operator = backend._string_to_quimb_operator("z")  # pylint: disable=protected-access
+        if args.rehearse:
+            t1 = time.perf_counter()
+            rehearsal = quimb_circuit.local_expectation(
+                operator,
+                where=(args.observable_qubit,),
+                backend=backend.backend,
+                optimize=backend.contractions_optimizer,
+                simplify_sequence="R",
+                rehearse=True,
+            )
+            metrics["rehearse_s"] = time.perf_counter() - t1
+            optimize = rehearsal["tree"]
+        else:
+            metrics["rehearse_s"] = 0.0
+            optimize = backend.contractions_optimizer
+
+        t2 = time.perf_counter()
+        expval = quimb_circuit.local_expectation(
+            operator,
+            where=(args.observable_qubit,),
+            backend=backend.backend,
+            optimize=optimize,
+            simplify_sequence="R",
+        )
+        metrics["contract_s"] = time.perf_counter() - t2
+        metrics["total_inner_s"] = (
+            metrics["convert_s"] + metrics["rehearse_s"] + metrics["contract_s"]
+        )
+        metrics["expval"] = float(np.real(expval))
+        return metrics
+
+    samples, metrics = time_callable(
+        run_once, args.repeats, args.warmup, args.profile_output, args.profile_sort
+    )
+    summary = summarize_samples(samples)
+    summary.update(
+        {
+            "convert_s": metrics["convert_s"],
+            "rehearse_s": metrics["rehearse_s"],
+            "contract_s": metrics["contract_s"],
+            "total_inner_s": metrics["total_inner_s"],
+        }
+    )
+
+    correctness = None
+    if not args.no_compare:
+        ref_state, ref_time = run_qibojit_state(circuit)
+        ref_expval = z_expectation_from_statevector(
+            ref_state, args.nqubits, args.observable_qubit
+        )
+        correctness = {
+            "qibojit_time_s": ref_time,
+            "reference_expval": ref_expval,
+            "abs_error": abs(ref_expval - metrics["expval"]),
+        }
+
+    return summary, correctness
+
+
+WORKLOADS = {
+    "state": workload_state,
+    "shots": workload_shots,
+    "convert": workload_convert,
+    "expectation": workload_expectation,
+    "raw-local-exp": workload_raw_local_exp,
+}
+
+
+def build_parser():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--mode",
+        choices=sorted(WORKLOADS),
+        default="raw-local-exp",
+        help="Workload to benchmark.",
+    )
+    parser.add_argument(
+        "--circuit",
+        choices=["ghz", "ising1d", "qaoa", "qft", "rcs", "variational"],
+        default="variational",
+    )
+    parser.add_argument("--nqubits", type=int, default=10)
+    parser.add_argument("--nlayers", type=int, default=3)
+    parser.add_argument("--ansatz", choices=["mps", "tn"], default="tn")
+    parser.add_argument("--max-bond", type=int, default=None)
+    parser.add_argument("--svd-cutoff", type=float, default=1e-10)
+    parser.add_argument("--optimizer", type=str, default="auto-hq")
+    parser.add_argument("--observable-qubit", type=int, default=0)
+    parser.add_argument("--nshots", type=int, default=1024)
+    parser.add_argument("--topk", type=int, default=100)
+    parser.add_argument("--warmup", type=int, default=1)
+    parser.add_argument("--repeats", type=int, default=3)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--quimb-num-procs", type=int, default=None)
+    parser.add_argument("--blas-threads", type=int, default=None)
+    parser.add_argument("--rehearse", action="store_true")
+    parser.add_argument("--no-compare", action="store_true")
+    parser.add_argument("--profile-output", type=Path, default=None)
+    parser.add_argument("--profile-sort", type=str, default="cumulative")
+    parser.add_argument("--json-output", type=Path, default=None)
+    return parser
+
+
+def main():
+    parser = build_parser()
+    args = parser.parse_args()
+
+    configure_runtime_env(args.quimb_num_procs, args.blas_threads)
+
+    print(
+        f"mode={args.mode} circuit={args.circuit} nqubits={args.nqubits} "
+        f"nlayers={args.nlayers} ansatz={args.ansatz} optimizer={args.optimizer}"
+    )
+    if args.quimb_num_procs is not None or args.blas_threads is not None:
+        print(
+            "threads:"
+            f" QUIMB_NUM_PROCS={os.environ.get('QUIMB_NUM_PROCS')}"
+            f" OMP_NUM_THREADS={os.environ.get('OMP_NUM_THREADS')}"
+        )
+
+    workload = WORKLOADS[args.mode]
+    summary, correctness = workload(args)
+
+    print("\nTiming")
+    for key, value in summary.items():
+        if isinstance(value, float):
+            print(f"{key:>16}: {value:.6f}")
+        else:
+            print(f"{key:>16}: {value}")
+
+    if correctness is not None:
+        print("\nCorrectness")
+        for key, value in correctness.items():
+            if isinstance(value, float):
+                print(f"{key:>16}: {value:.6e}")
+            else:
+                print(f"{key:>16}: {value}")
+
+    if args.profile_output is not None:
+        print(f"\nProfile written to: {args.profile_output}")
+
+    if args.json_output is not None:
+        payload = {"timing": summary, "correctness": correctness, "args": vars(args)}
+        args.json_output.parent.mkdir(parents=True, exist_ok=True)
+        args.json_output.write_text(json.dumps(payload, indent=2, default=str), encoding="utf-8")
+        print(f"JSON written to: {args.json_output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/qibotn/backends/quimb.py b/src/qibotn/backends/quimb.py
index c378aef..5ece54f 100644
--- a/src/qibotn/backends/quimb.py
+++ b/src/qibotn/backends/quimb.py
@@ -186,7 +186,16 @@ def execute_circuit(
     else:
         frequencies = None
         measured_probabilities = None
-
+    '''
+    if return_array:
+        if self.ansatz == "mps":
+            psi = circ_quimb.psi
+            statevector = psi.to_dense().reshape(-1)
+        else:
+            statevector = circ_quimb.to_dense(backend=self.backend, optimize=self.contractions_optimizer)
+    else:
+        statevector = None
+    '''
     statevector = (
         circ_quimb.to_dense(backend=self.backend, optimize=self.contractions_optimizer)
         if return_array
@@ -291,6 +300,15 @@ def _qibo_circuit_to_quimb(
         quimb_gate_name = GATE_MAP.get(gate_name, None)
         if quimb_gate_name == "measure":
             continue
+        if gate_name == "cu1":
+            theta = gate.parameters[0]
+            c, t = gate.qubits
+            circ.apply_gate("RZ", theta / 2, c)
+            circ.apply_gate("RZ", theta / 2, t)
+            circ.apply_gate("CNOT", c, t)
+            circ.apply_gate("RZ", -theta / 2, t)
+            circ.apply_gate("CNOT", c, t)
+            continue
         if quimb_gate_name is None:
             raise_error(ValueError, f"Gate {gate_name} not supported in Quimb backend.")
 
diff --git a/src/qibotn/result.py b/src/qibotn/result.py
index 6548ebe..2748cb0 100644
--- a/src/qibotn/result.py
+++ b/src/qibotn/result.py
@@ -57,10 +57,10 @@ class TensorNetworkResult:
         return self.measures
 
     def state(self):
-        """Return the statevector if the number of qubits is less than 20."""
+        """Return the statevector if the number of qubits is less than 35."""
         if self.nqubits < 35:
             return self.statevector
         raise_error(
             NotImplementedError,
-            f"Tensor network simulation cannot be used to reconstruct statevector for >= 20 .",
+            f"Tensor network simulation cannot be used to reconstruct statevector for >= 35 .",
         )