添加MPI并行TN benchmark及辅助脚本，移除旧benchmark

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
删除tn脚本implementation
2026-05-05 19:04:09 +08:00 · 2026-05-03 19:06:21 +08:00 · 2026-05-03 18:54:05 +08:00 · 2026-04-28 23:07:39 +08:00 · 2026-04-27 18:59:54 +08:00 · 2026-04-27 11:03:57 +08:00
25 changed files with 1607 additions and 579 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,10 +2,14 @@
 __pycache__/
 *.py[cod]
 *$py.class
-
+data/
 # C extensions
 *.so
+bak/
+path/
+profiles/

+perf*
 # Distribution / packaging
 .Python
 build/
--- a/bench_profile.py
+++ b/bench_profile.py
@@ -0,0 +1,460 @@
+"""Benchmark: qibotn/quimb generic TN — single-process torch profiling version."""
+
+import os
+import pickle
+import time
+import argparse
+import numpy as np
+import cotengra as ctg
+import qibo
+from qibo import Circuit, gates
+
+
+def make_circuit(circuit_type, nqubits, nlayers=1):
+    c = Circuit(nqubits)
+
+    if circuit_type == "qft":
+        from qibo.models import QFT
+        return QFT(nqubits)
+
+    elif circuit_type == "variational":
+        for layer in range(nlayers):
+            for q in range(nqubits):
+                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
+
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CZ(q, q + 1))
+
+    elif circuit_type == "ghz":
+        c.add(gates.H(0))
+        for q in range(nqubits - 1):
+            c.add(gates.CNOT(q, q + 1))
+
+    elif circuit_type == "brickwork":
+        for q in range(nqubits):
+            c.add(gates.H(q))
+
+        for layer in range(nlayers):
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CNOT(q, q + 1))
+                c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi)))
+                c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi)))
+
+    else:
+        raise ValueError(f"Unknown circuit: {circuit_type}")
+
+    return c
+
+
+def make_z_observable(nqubits):
+    """Z on qubit 0 only — single contraction for benchmarking."""
+    return ["z"], [(0,)], [1.0]
+
+
+def export_profiler_outputs(prof, trace_path):
+    """Export Chrome trace and text table."""
+    prof.export_chrome_trace(trace_path)
+
+    table_path = trace_path.replace(".json", ".txt")
+    with open(table_path, "w") as f:
+        f.write(
+            prof.key_averages().table(
+                sort_by="self_cpu_time_total",
+                row_limit=200,
+            )
+        )
+
+    print(f"  [torch profiler trace] {trace_path}")
+    print(f"  [torch profiler table] {table_path}")
+
+
+def run_quimb_tn(
+    circuit,
+    nqubits,
+    num_slices,
+    load_path=None,
+    save_path=None,
+):
+    """Mode: expval — compute <Z_0> via local_expectation."""
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    operators, sites, coeffs = make_z_observable(nqubits)
+    ops = b._string_to_quimb_operator(operators[0])
+
+    qc = b._qibo_circuit_to_quimb(
+        circuit,
+        quimb_circuit_type=b.circuit_ansatz,
+        gate_opts={"max_bond": None, "cutoff": 1e-10},
+    )
+
+    if load_path:
+        with open(load_path, "rb") as f:
+            saved = pickle.load(f)
+        tree = saved["tree"]
+        t_search = 0.0
+        print(f"  [path loaded]  {load_path}")
+    else:
+        opt = ctg.HyperOptimizer(
+            methods=["kahypar", "random-greedy", "spinglass"],
+            max_repeats=16,
+            parallel=True,
+            max_time=60,
+            slicing_opts={"target_slices": num_slices},
+            progbar=True,
+        )
+
+        t0 = time.time()
+        rehearsal = qc.local_expectation(
+            ops,
+            where=sites[0],
+            optimize=opt,
+            simplify_sequence="R",
+            rehearse=True,
+        )
+        t_search = time.time() - t0
+        tree = rehearsal["tree"]
+
+        print(
+            f"  [path search]  {t_search:.3f}s  "
+            f"flops~2^{tree.contraction_cost():.2f}  "
+            f"size~2^{tree.contraction_width():.2f}  "
+            f"slices={tree.multiplicity}"
+        )
+
+        if save_path:
+            with open(save_path, "wb") as f:
+                pickle.dump({"tree": tree}, f)
+            print(f"  [path saved]  {save_path}")
+
+    t0 = time.time()
+    expval = qc.local_expectation(
+        ops,
+        where=sites[0],
+        optimize=tree,
+        simplify_sequence="R",
+    )
+    t_contract = time.time() - t0
+
+    print(f"  [contraction]  {t_contract:.3f}s")
+
+    return float(expval.real), t_search + t_contract
+
+
+def run_quimb_tn_statevector(
+    circuit,
+    nqubits,
+    num_slices,
+    load_path=None,
+    save_path=None,
+    profile=False,
+    profile_dir="profiles",
+):
+    """Mode: statevector — contract full TN to dense vector, single process."""
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    import torch
+
+    qc = b._qibo_circuit_to_quimb(
+        circuit,
+        quimb_circuit_type=b.circuit_ansatz,
+        gate_opts={"max_bond": None, "cutoff": 1e-10},
+    )
+
+    # 让 quimb 生成 torch tensor，这样 torch.profiler 能抓到 aten op。
+    qc.to_backend = torch.from_numpy
+
+    if load_path:
+        with open(load_path, "rb") as f:
+            saved = pickle.load(f)
+        tree = saved["tree"]
+        t_search = 0.0
+        print(f"  [path loaded]  {load_path}")
+    else:
+        opt = ctg.HyperOptimizer(
+            methods=["kahypar", "random-greedy", "spinglass"],
+            max_repeats=500,
+            parallel=48,
+            max_time=100,
+            minimize="size",
+            slicing_opts={"target_slices": num_slices},
+            progbar=True,
+        )
+
+        t0 = time.time()
+        rehearsal = qc.to_dense(optimize=opt, rehearse=True)
+        t_search = time.time() - t0
+        tree = rehearsal["tree"]
+
+        print(
+            f"  [path search]  {t_search:.3f}s  "
+            f"flops~2^{tree.contraction_cost():.2f}  "
+            f"size~2^{tree.contraction_width():.2f}  "
+            f"slices={tree.multiplicity}"
+        )
+
+        if save_path:
+            with open(save_path, "wb") as f:
+                pickle.dump({"tree": tree}, f)
+            print(f"  [path saved]  {save_path}")
+
+    os.makedirs(profile_dir, exist_ok=True)
+
+    if profile:
+        from torch.profiler import profile as torch_profile
+        from torch.profiler import ProfilerActivity, record_function
+
+        trace_path = os.path.join(
+            profile_dir,
+            (
+                f"trace_statevector_"
+                f"{circuit.nqubits}q_"
+                f"slices{tree.multiplicity}_"
+                f"{int(time.time())}.json"
+            ),
+        )
+
+        t0 = time.time()
+
+        with torch_profile(
+            activities=[ProfilerActivity.CPU],
+            record_shapes=True,
+            profile_memory=True,
+            with_stack=True,
+        ) as prof:
+            with record_function("qibotn_to_dense_contraction"):
+                sv = qc.to_dense(optimize=tree).reshape(-1)
+
+            with record_function("torch_to_numpy_view_or_copy"):
+                if type(sv).__module__.startswith("torch"):
+                    sv_tn = sv.detach().cpu().numpy()
+                else:
+                    sv_tn = np.asarray(sv)
+
+        t_contract = time.time() - t0
+
+        export_profiler_outputs(prof, trace_path)
+
+    else:
+        t0 = time.time()
+        sv = qc.to_dense(optimize=tree).reshape(-1)
+        t_contract = time.time() - t0
+
+        if type(sv).__module__.startswith("torch"):
+            sv_tn = sv.detach().cpu().numpy()
+        else:
+            sv_tn = np.asarray(sv)
+
+    print(f"  [contraction]  {t_contract:.3f}s")
+
+    return sv_tn, t_search + t_contract
+
+
+def run_quimb_tn_samples(circuit, nshots=1024):
+    """Mode: samples — sample from circuit output distribution."""
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    t0 = time.time()
+    result = b.execute_circuit(circuit, nshots=nshots)
+    t_total = time.time() - t0
+
+    print(f"  [sampling]     {t_total:.3f}s  nshots={nshots}")
+
+    try:
+        freqs = result.frequencies()
+        print(f"  top states: {dict(list(freqs.items())[:5])}")
+    except Exception:
+        pass
+
+    return result, t_total
+
+
+def qibojit_expval(circuit, nqubits):
+    """Compute <Z_0> via qibojit statevector."""
+    qibo.set_backend("qibojit", platform="numba")
+
+    t0 = time.time()
+    result = circuit()
+    elapsed = time.time() - t0
+
+    sv = np.array(result.state(), dtype=complex).flatten()
+    probs = np.abs(sv) ** 2
+
+    bits = (np.arange(len(probs)) >> (nqubits - 1)) & 1
+    expval = float(np.dot(probs, 1 - 2 * bits))
+
+    return expval, elapsed
+
+
+def run_qibojit(circuit):
+    """Compute full statevector via qibojit."""
+    qibo.set_backend("qibojit", platform="numba")
+
+    t0 = time.time()
+    result = circuit()
+    elapsed = time.time() - t0
+
+    sv = np.array(result.state(), dtype=complex).flatten()
+
+    return sv, elapsed
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--nqubits", type=int, default=10)
+    parser.add_argument(
+        "--circuit",
+        type=str,
+        default="qft",
+        choices=["qft", "variational", "ghz", "brickwork"],
+    )
+    parser.add_argument("--nlayers", type=int, default=3)
+    parser.add_argument("--num-slices", type=int, default=1)
+    parser.add_argument("--nshots", type=int, default=1024)
+
+    parser.add_argument(
+        "--mode",
+        type=str,
+        default="statevector",
+        choices=["expval", "statevector", "samples"],
+        help="expval: local_expectation; statevector: to_dense; samples: sampling",
+    )
+
+    parser.add_argument(
+        "--no-compare",
+        action="store_true",
+        help="Skip qibojit reference run",
+    )
+
+    parser.add_argument(
+        "--save-path",
+        type=str,
+        default=None,
+        help="Save contraction tree to a pickle file",
+    )
+
+    parser.add_argument(
+        "--load-path",
+        type=str,
+        default=None,
+        help="Load contraction tree from a pickle file and skip path search",
+    )
+
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Enable torch profiler for statevector contraction stage",
+    )
+
+    parser.add_argument(
+        "--profile-dir",
+        type=str,
+        default="profiles",
+        help="Directory to save torch profiler traces",
+    )
+
+    parser.add_argument(
+        "--save-statevector",
+        action="store_true",
+        help="Save TN statevector to data/sv_tn_*.npy",
+    )
+
+    args = parser.parse_args()
+
+    print(
+        f"Circuit: {args.circuit}, "
+        f"nqubits={args.nqubits}, "
+        f"nlayers={args.nlayers}, "
+        f"mode={args.mode}, "
+        f"profile={args.profile}"
+    )
+
+    np.random.seed(42)
+    circuit_tn = make_circuit(args.circuit, args.nqubits, args.nlayers)
+
+    try:
+        if args.mode == "expval":
+            expval_tn, t_tn = run_quimb_tn(
+                circuit_tn,
+                args.nqubits,
+                args.num_slices,
+                load_path=args.load_path,
+                save_path=args.save_path,
+            )
+
+            print(f"\n[quimb TN]  time={t_tn:.4f}s  <Z_0>={expval_tn:.8f}")
+
+        elif args.mode == "statevector":
+            sv_tn, t_tn = run_quimb_tn_statevector(
+                circuit_tn,
+                args.nqubits,
+                args.num_slices,
+                load_path=args.load_path,
+                save_path=args.save_path,
+                profile=args.profile,
+                profile_dir=args.profile_dir,
+            )
+
+            print(
+                f"\n[quimb TN]  time={t_tn:.4f}s  "
+                f"statevector shape={sv_tn.shape}"
+            )
+
+            if args.save_statevector:
+                os.makedirs("data", exist_ok=True)
+                out_path = f"data/sv_tn_{args.circuit}{args.nqubits}.npy"
+                np.save(out_path, sv_tn)
+                print(f"[saved statevector] {out_path}")
+
+        else:
+            _, t_tn = run_quimb_tn_samples(
+                circuit_tn,
+                nshots=args.nshots,
+            )
+
+            print(f"\n[quimb TN]  time={t_tn:.4f}s")
+            args.no_compare = True
+
+    except Exception as e:
+        print(f"[quimb TN] FAILED: {e}")
+        raise
+
+    if not args.no_compare and args.mode != "statevector":
+        np.random.seed(42)
+        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+
+        expval_ref, t_ref = qibojit_expval(circuit_ref, args.nqubits)
+
+        print(f"[qibojit]   time={t_ref:.4f}s  <Z_0>={expval_ref:.8f}")
+        print(f"\nDiff     : {abs(expval_tn - expval_ref):.2e}")
+
+        if t_tn > 0:
+            print(f"Speedup  : {t_ref / t_tn:.2f}x")
+
+    elif not args.no_compare and args.mode == "statevector" and sv_tn is not None:
+        np.random.seed(42)
+        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+
+        sv_ref, t_ref = run_qibojit(circuit_ref)
+
+        fid = abs(np.dot(sv_ref.conj(), sv_tn)) ** 2
+        l2_err = np.linalg.norm(sv_ref - sv_tn)
+
+        print(f"[qibojit]   time={t_ref:.4f}s")
+        print(f"Fidelity : {fid:.8f}  (1=perfect)")
+        print(f"L2 error : {l2_err:.2e}")
+
+        if t_tn > 0:
+            print(f"Speedup  : {t_ref / t_tn:.2f}x")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmark_mps.py
+++ b/benchmark_mps.py
@@ -0,0 +1,189 @@
+"""Benchmark: qibojit (reference) vs qibotn/quimb MPS, with error comparison."""
+import time
+import argparse
+import os
+import numpy as np
+import qibo
+import quimb.tensor as qtn
+from qibo import Circuit, gates
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+
+
+def make_circuit(circuit_type, nqubits, nlayers=1, add_measurements=False):
+    c = Circuit(nqubits)
+    if circuit_type == "qft":
+        from qibo.models import QFT
+        c = QFT(nqubits)
+    elif circuit_type == "variational":
+        for layer in range(nlayers):
+            for q in range(nqubits):
+                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CZ(q, q + 1))
+    elif circuit_type == "ghz":
+        c.add(gates.H(0))
+        for q in range(nqubits - 1):
+            c.add(gates.CNOT(q, q + 1))
+    else:
+        raise ValueError(f"Unknown circuit: {circuit_type}")
+    if add_measurements:
+        c.add(gates.M(*range(nqubits)))
+    return c
+
+
+def run_qibojit(circuit):
+    qibo.set_backend("qibojit", platform="numba")
+    t0 = time.time()
+    result = circuit()
+    elapsed = time.time() - t0
+    sv = result.state()
+    return sv, elapsed
+
+
+def run_quimb_mps(circuit, max_bond, svd_cutoff, optimizer, nshots=None):
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="mps", max_bond_dimension=max_bond, svd_cutoff=svd_cutoff)
+    b.contractions_optimizer = optimizer
+
+    t0 = time.time()
+    if nshots:
+        result = b.execute_circuit(circuit, nshots=nshots)
+        elapsed = time.time() - t0
+        return result.frequencies(), elapsed, 0.0
+    else:
+        # MPS simulation
+        circ_quimb = qtn.CircuitMPS.from_openqasm2_str(
+            circuit.to_qasm(),
+            gate_opts={"max_bond": max_bond, "cutoff": svd_cutoff},
+        )
+        t_mps = time.time() - t0
+        # to_dense separately
+        t1 = time.time()
+        #sv = circ_quimb.psi.to_dense().reshape(-1)
+        sv = None
+        t_dense = time.time() - t1
+        return sv, t_mps, t_dense
+
+
+def run_quimb_permmps(circuit, max_bond, svd_cutoff, nshots=None):
+    gates_list = [
+        qtn.Gate(g.name, params=list(g.parameters), qubits=list(g.qubits))
+        for g in circuit.queue
+        if g.name.lower() != "measure"
+    ]
+    t0 = time.time()
+    circ = qtn.CircuitPermMPS.from_gates(
+        gates_list,
+        N=circuit.nqubits,
+        max_bond=max_bond,
+        cutoff=svd_cutoff,
+    )
+    if nshots:
+        from collections import Counter
+        result = Counter(circ.sample(nshots))
+        elapsed = time.time() - t0
+        return dict(result), elapsed
+    else:
+        mps = circ.get_psi_unordered()
+        sv = mps.to_dense().reshape(-1)
+        elapsed = time.time() - t0
+        return sv, elapsed
+
+
+def compare_statevector(sv_ref, sv_mps):
+    sv_ref = np.array(sv_ref, dtype=complex).flatten()
+    sv_mps = np.array(sv_mps, dtype=complex).flatten()
+    fidelity = abs(np.dot(sv_ref.conj(), sv_mps)) ** 2
+    l2_err = np.linalg.norm(sv_ref - sv_mps)
+    return fidelity, l2_err
+
+
+def compare_frequencies(freq_ref, freq_mps, nshots):
+    all_keys = set(freq_ref) | set(freq_mps)
+    tvd = 0.5 * sum(abs(freq_ref.get(k, 0) - freq_mps.get(k, 0)) for k in all_keys) / nshots
+    return tvd
+
+
+def jit_cache_path(circuit_type, nqubits, nlayers):
+    os.makedirs(DATA_DIR, exist_ok=True)
+    return os.path.join(DATA_DIR, f"jit_{circuit_type}_n{nqubits}_l{nlayers}.npy")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=10)
+    parser.add_argument("--circuit", type=str, default="ghz",
+                        choices=["qft", "variational", "ghz"])
+    parser.add_argument("--nlayers", type=int, default=3)
+    parser.add_argument("--max-bond", type=int, default=None,
+                        help="Max bond dimension for MPS (None = unlimited)")
+    parser.add_argument("--svd-cutoff", type=float, default=1e-6)
+    parser.add_argument("--optimizer", type=str, default="eager")
+    parser.add_argument("--nshots", type=int, default=None,
+                        help="Use sampling mode with given number of shots instead of statevector")
+    parser.add_argument("--permmps", action="store_true",
+                        help="Use CircuitPermMPS directly instead of qibotn backend")
+    parser.add_argument("--skip-jit", action="store_true",
+                        help="Skip qibojit run, load cached statevector if available")
+    parser.add_argument("--no-compare", action="store_true",
+                        help="Skip correctness comparison entirely")
+    args = parser.parse_args()
+
+    print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}")
+    print(f"MPS config: max_bond={args.max_bond}, svd_cutoff={args.svd_cutoff}, optimizer={args.optimizer}")
+
+    ref = None
+    t_ref = None
+
+    if not args.no_compare:
+        cache_path = jit_cache_path(args.circuit, args.nqubits, args.nlayers)
+        if args.nshots:
+            # frequency mode: run qibojit with same nshots
+            np.random.seed(42)
+            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers, add_measurements=True)
+            qibo.set_backend("qibojit", platform="numba")
+            t0 = time.time()
+            result_ref = circuit_ref(nshots=args.nshots)
+            t_ref = time.time() - t0
+            ref = dict(result_ref.frequencies())
+            print(f"\n[qibojit]  time={t_ref:.4f}s")
+        elif args.skip_jit and os.path.exists(cache_path):
+            ref = np.load(cache_path)
+            print(f"\n[qibojit]  loaded from cache: {cache_path}")
+        else:
+            np.random.seed(42)
+            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+            ref, t_ref = run_qibojit(circuit_ref)
+            np.save(cache_path, ref)
+            print(f"\n[qibojit]  time={t_ref:.4f}s  (saved to {cache_path})")
+
+    np.random.seed(42)
+    circuit_mps = make_circuit(args.circuit, args.nqubits, args.nlayers)
+    label = "quimb PermMPS" if args.permmps else "quimb MPS"
+    try:
+        if args.permmps:
+            out, t_mps = run_quimb_permmps(circuit_mps, args.max_bond, args.svd_cutoff, args.nshots)
+            t_dense = 0.0
+        else:
+            out, t_mps, t_dense = run_quimb_mps(circuit_mps, args.max_bond, args.svd_cutoff, args.optimizer, args.nshots)
+        print(f"[{label}] MPS sim={t_mps:.4f}s  to_dense={t_dense:.4f}s  total={t_mps+t_dense:.4f}s")
+        if not args.no_compare:
+            if args.nshots:
+                tvd = compare_frequencies(ref, out, args.nshots)
+                print(f"\nTVD      : {tvd:.6f}  (0=perfect)")
+            else:
+                fidelity, l2_err = compare_statevector(ref, out)
+                print(f"\nFidelity : {fidelity:.8f}  (1=perfect)")
+                print(f"L2 error : {l2_err:.2e}")
+            if t_ref is not None and t_mps > 0:
+                print(f"Speedup  : {t_ref/t_mps:.2f}x")
+    except Exception as e:
+        print(f"[quimb MPS] FAILED: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmark_qmatchatea.py
+++ b/benchmark_qmatchatea.py
@@ -0,0 +1,126 @@
+"""Benchmark: qibojit (reference) vs qibotn/qmatchatea MPS."""
+import time
+import argparse
+import os
+import numpy as np
+import qibo
+from qibo import Circuit, gates
+from qibo.backends import construct_backend
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+
+
+def make_circuit(circuit_type, nqubits, nlayers=1):
+    c = Circuit(nqubits)
+    if circuit_type == "qft":
+        from qibo.models import QFT
+        return QFT(nqubits)
+    elif circuit_type == "variational":
+        for layer in range(nlayers):
+            for q in range(nqubits):
+                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CZ(q, q + 1))
+    elif circuit_type == "ghz":
+        c.add(gates.H(0))
+        for q in range(nqubits - 1):
+            c.add(gates.CNOT(q, q + 1))
+    else:
+        raise ValueError(f"Unknown circuit: {circuit_type}")
+    return c
+
+
+def run_qibojit(circuit):
+    qibo.set_backend("qibojit", platform="numba")
+    t0 = time.time()
+    result = circuit()
+    elapsed = time.time() - t0
+    return result.state(), elapsed
+
+
+def run_qmatchatea(circuit, max_bond, cut_ratio):
+    import qmatchatea, qtealeaves.observables
+    from qibo.backends import construct_backend as _cb
+    b = _cb(backend="qibotn", platform="qmatchatea")
+    b.configure_tn_simulation(ansatz="MPS", max_bond_dimension=max_bond, cut_ratio=cut_ratio)
+
+    qk_circuit = b._qibocirc_to_qiskitcirc(circuit)
+    run_qk_params = qmatchatea.preprocessing.qk_transpilation_params(False)
+    observables = qtealeaves.observables.TNObservables()
+    observables += qtealeaves.observables.TNState2File(name="temp", formatting="D")
+
+    t0 = time.time()
+    results = qmatchatea.run_simulation(
+        circ=qk_circuit,
+        convergence_parameters=b.convergence_params,
+        transpilation_parameters=run_qk_params,
+        backend=b.qmatchatea_backend,
+        observables=observables,
+    )
+    elapsed = time.time() - t0
+    tn_state = results.observables.get("tn_state")
+    if tn_state is None:
+        results.load_state()
+        tn_state = results.observables["tn_state"]
+    sv_obj = tn_state.to_statevector(qiskit_order=False, max_qubit_equivalent=40)
+    sv = np.array(sv_obj.elem, dtype=complex).flatten()
+    return sv, elapsed
+
+
+def compare(sv_ref, sv_mps):
+    sv_ref = np.array(sv_ref, dtype=complex).flatten()
+    fidelity = abs(np.dot(sv_ref.conj(), sv_mps)) ** 2
+    l2_err = np.linalg.norm(sv_ref - sv_mps)
+    return fidelity, l2_err
+
+
+def jit_cache_path(circuit_type, nqubits, nlayers):
+    os.makedirs(DATA_DIR, exist_ok=True)
+    return os.path.join(DATA_DIR, f"jit_{circuit_type}_n{nqubits}_l{nlayers}.npy")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=10)
+    parser.add_argument("--circuit", type=str, default="ghz",
+                        choices=["qft", "variational", "ghz"])
+    parser.add_argument("--nlayers", type=int, default=3)
+    parser.add_argument("--max-bond", type=int, default=64)
+    parser.add_argument("--cut-ratio", type=float, default=1e-6)
+    parser.add_argument("--skip-jit", action="store_true")
+    args = parser.parse_args()
+
+    print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}")
+    print(f"MPS config: max_bond={args.max_bond}, cut_ratio={args.cut_ratio}")
+
+    cache_path = jit_cache_path(args.circuit, args.nqubits, args.nlayers)
+    t_ref = None
+
+    if args.skip_jit and os.path.exists(cache_path):
+        sv_ref = np.load(cache_path)
+        print(f"\n[qibojit]  loaded from cache: {cache_path}")
+    else:
+        np.random.seed(42)
+        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+        sv_ref, t_ref = run_qibojit(circuit_ref)
+        np.save(cache_path, sv_ref)
+        print(f"\n[qibojit]  time={t_ref:.4f}s  (saved to {cache_path})")
+
+    np.random.seed(42)
+    circuit_mps = make_circuit(args.circuit, args.nqubits, args.nlayers)
+    try:
+        sv_mps, t_mps = run_qmatchatea(circuit_mps, args.max_bond, args.cut_ratio)
+        fidelity, l2_err = compare(sv_ref, sv_mps)
+        print(f"[qmatchatea] time={t_mps:.4f}s")
+        print(f"\nFidelity : {fidelity:.8f}  (1=perfect)")
+        print(f"L2 error : {l2_err:.2e}")
+        if t_ref is not None and t_mps > 0:
+            print(f"Speedup  : {t_ref/t_mps:.2f}x")
+    except Exception as e:
+        print(f"[qmatchatea] FAILED: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmark_tn.py
+++ b/benchmark_tn.py
@@ -0,0 +1,386 @@
+"""Benchmark: qibotn/quimb generic TN — expectation values."""
+import multiprocessing
+multiprocessing.set_start_method("spawn", force=True)
+import pickle
+import time
+import threading
+import argparse
+import numpy as np
+import cotengra as ctg
+import qibo
+from qibo import Circuit, gates
+
+class TimedTrialFn:
+    def __init__(self, trial_fn, timeout=15):
+        self.trial_fn = trial_fn
+        self.timeout = timeout
+
+    def __call__(self, *args, **kwargs):
+        result = [None]
+        exc = [None]
+
+        def _run():
+            try:
+                result[0] = self.trial_fn(*args, **kwargs)
+            except Exception as e:
+                exc[0] = e
+
+        t = threading.Thread(target=_run, daemon=True)
+        t.start()
+        t.join(self.timeout)
+        if t.is_alive():
+            raise TimeoutError(f"trial exceeded {self.timeout}s")
+        if exc[0] is not None:
+            raise exc[0]
+        return result[0]
+
+def make_circuit(circuit_type, nqubits, nlayers=1):
+    c = Circuit(nqubits)
+    if circuit_type == "qft":
+        from qibo.models import QFT
+        return QFT(nqubits)
+    elif circuit_type == "variational":
+        for layer in range(nlayers):
+            for q in range(nqubits):
+                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CZ(q, q + 1))
+    elif circuit_type == "ghz":
+        c.add(gates.H(0))
+        for q in range(nqubits - 1):
+            c.add(gates.CNOT(q, q + 1))
+    elif circuit_type == "brickwork":
+        for q in range(nqubits):
+            c.add(gates.H(q))
+        for layer in range(nlayers):
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CNOT(q, q + 1))
+                c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi)))
+                c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi)))
+    else:
+        raise ValueError(f"Unknown circuit: {circuit_type}")
+    return c
+
+
+
+def make_z_observable(nqubits):
+    """Z on qubit 0 only — single contraction for benchmarking"""
+    return ["z"], [(0,)], [1.0]
+
+
+def run_quimb_tn(circuit, nqubits, num_slices, load_path=None, save_path=None):
+    """Mode: expval — compute <Z_0> via local_expectation (lightcone pruning)."""
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    operators, sites, coeffs = make_z_observable(nqubits)
+    ops = b._string_to_quimb_operator(operators[0])
+    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
+                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
+
+    if load_path:
+        with open(load_path, "rb") as f:
+            saved = pickle.load(f)
+        tree = saved["tree"]
+        t_search = 0.0
+        print(f"  [path loaded]  {load_path}")
+    else:
+        opt = ctg.HyperOptimizer(
+            methods=['kahypar', 'random-greedy', 'spinglass'],
+            max_repeats=16,
+            parallel=True,
+            max_time=60,
+            slicing_opts={'target_slices': num_slices},
+            progbar=True,
+        )
+        t0 = time.time()
+        rehearsal = qc.local_expectation(ops, where=sites[0], optimize=opt,
+                                         simplify_sequence="R", rehearse=True)
+        t_search = time.time() - t0
+        tree = rehearsal['tree']
+        print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}")
+
+        if save_path:
+            with open(save_path, "wb") as f:
+                pickle.dump({"tree": tree}, f)
+            print(f"  [path saved]  {save_path}")
+
+    t0 = time.time()
+    expval = qc.local_expectation(ops, where=sites[0], optimize=tree, simplify_sequence="R")
+    t_contract = time.time() - t0
+    print(f"  [contraction]  {t_contract:.3f}s")
+
+    return float(expval.real), t_search + t_contract
+
+
+def run_quimb_tn_statevector(circuit, nqubits, num_slices, load_path=None, save_path=None):
+    """Mode: statevector — contract full TN to dense vector."""
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    import torch
+    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
+                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
+    qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex64)
+    if load_path:
+        with open(load_path, "rb") as f:
+            saved = pickle.load(f)
+        tree = saved["tree"]
+        t_search = 0.0
+        print(f"  [path loaded]  {load_path}")
+    else:
+        opt = ctg.HyperOptimizer(
+            #methods=['kahypar', 'random-greedy', 'spinglass'],
+            max_repeats=1024,
+            #parallel="concurrent.futures",
+            parallel=64,
+            max_time=60,
+            minimize='size',
+            slicing_opts={'target_slices': num_slices},
+            #slicing_opts={'target_size': 2**30},
+            progbar=True,
+            on_trial_error='ignore'
+        )
+        
+        t0 = time.time()
+        rehearsal = qc.to_dense(optimize=opt, rehearse=True)
+        t_search = time.time() - t0
+        tree = rehearsal['tree']
+        #print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}")
+
+        if save_path:
+            with open(save_path, "wb") as f:
+                pickle.dump({"tree": tree}, f)
+            print(f"  [path saved]  {save_path}")
+        print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}")
+        return None, t_search
+
+    t0 = time.time()
+    sv = qc.to_dense(optimize=tree).reshape(-1)
+    t_contract = time.time() - t0
+    print(f"  [contraction]  {t_contract:.3f}s")
+    sv_tn = np.array(sv)
+    return sv_tn, t_search + t_contract
+
+
+def _contract_mpi(tree, arrays, comm, root=0):
+    """Contract slices via MPI, returning result as the same array type as input.
+
+    Unlike ``cotengra.ContractionTree.contract_mpi``, this works with any
+    array backend (numpy, torch, etc.) — it only converts to numpy at the
+    MPI-reduce boundary and converts back.
+    """
+    size = comm.Get_size()
+    rank = comm.Get_rank()
+
+    result_np = None
+    is_torch = type(arrays[0]).__module__.startswith("torch")
+
+    for i in range(rank, tree.multiplicity, size):
+        x = tree.contract_slice(arrays, i)
+        x_np = np.asfortranarray(x.detach().cpu().numpy() if is_torch else np.asarray(x))
+        
+        if result_np is None:
+            result_np = x_np
+        else:
+            result_np += x_np
+
+    if result_np is None:
+        result_np = np.zeros(1, dtype=np.complex64)
+
+    if rank == root:
+        result = np.zeros_like(result_np)
+    else:
+        result = None
+    comm.Reduce(result_np, result, root=root)
+
+    if rank == root:
+        import torch
+        return torch.from_numpy(np.asarray(result)) if is_torch else result
+    return None
+
+
+def run_quimb_tn_statevector_mpi(circuit, nqubits, num_slices, load_path=None, save_path=None):
+    """MPI-parallel statevector via custom MPI contraction (supports torch backend)."""
+    from mpi4py import MPI
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    import torch
+    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
+                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
+    qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex64)
+
+    if load_path:
+        if rank == 0:
+            with open(load_path, "rb") as f:
+                saved = pickle.load(f)
+            tree, psi, t_search = saved["tree"], saved["psi"], 0.0
+            print(f"  [path loaded]  {load_path}")
+        else:
+            tree = psi = None
+            t_search = 0.0
+    else:
+        # each rank runs serial search over its share of trials
+        total_repeats = 1024
+        rank_repeats = max(1, total_repeats // size)
+        opt = ctg.HyperOptimizer(
+            methods=['kahypar', 'random-greedy', 'spinglass'],
+            max_repeats=rank_repeats,
+            parallel=False,
+            max_time=100,
+            minimize='size',
+            slicing_opts={'target_slices': max(num_slices, size), 'allow_outer': False},
+            progbar=(rank == 0),
+        )
+        t0 = time.time()
+        rehearsal = qc.to_dense(optimize=opt, rehearse=True)
+        t_search = time.time() - t0
+        local_tree = rehearsal['tree']
+        local_psi = rehearsal['tn']
+        local_size = local_tree.contraction_width()
+
+        # gather all trees to rank 0, pick best by contraction_width
+        all_results = comm.gather((local_size, local_tree, local_psi), root=0)
+        if rank == 0:
+            _, tree, psi = min(all_results, key=lambda x: x[0])
+            print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}  slices={tree.multiplicity}")
+            if save_path:
+                with open(save_path, "wb") as f:
+                    pickle.dump({"tree": tree, "psi": psi}, f)
+                print(f"  [path saved]   {save_path}")
+        else:
+            tree = psi = None
+
+    tree = comm.bcast(tree, root=0)
+    psi = comm.bcast(psi, root=0)
+    t_search = comm.bcast(t_search, root=0)
+
+    arrays = psi.arrays
+    t0 = time.time()
+    sv = _contract_mpi(tree, arrays, comm, root=0)
+    t_contract = time.time() - t0
+
+    if rank == 0:
+        print(f"  [contraction]  {t_contract:.3f}s")
+        return np.array(sv).reshape(-1), t_search + t_contract
+    return None, t_search + t_contract
+
+
+def run_quimb_tn_samples(circuit, nshots=1024):
+    """Mode: samples — sample from circuit output distribution."""
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    t0 = time.time()
+    result = b.execute_circuit(circuit, nshots=nshots)
+    t_total = time.time() - t0
+    print(f"  [sampling]     {t_total:.3f}s  nshots={nshots}")
+    print(f"  top states: {dict(list(result.frequencies().items())[:5])}")
+    return result, t_total
+
+
+def qibojit_expval(circuit, nqubits):
+    """Compute <Z_0> via qibojit statevector."""
+    qibo.set_backend("qibojit", platform="numba")
+    t0 = time.time()
+    result = circuit()
+    elapsed = time.time() - t0
+    sv = np.array(result.state(), dtype=complex).flatten()
+    probs = np.abs(sv) ** 2
+    bits = (np.arange(len(probs)) >> (nqubits - 1)) & 1
+    expval = float(np.dot(probs, 1 - 2 * bits))
+    return expval, elapsed
+
+
+def run_qibojit(circuit):
+    """Compute full statevector via qibojit."""
+    qibo.set_backend("qibojit", platform="numba")
+    t0 = time.time()
+    result = circuit()
+    elapsed = time.time() - t0
+    sv = np.array(result.state(), dtype=complex).flatten()
+    return sv, elapsed
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=10)
+    parser.add_argument("--circuit", type=str, default="qft",
+                        choices=["qft", "variational", "ghz", "brickwork"])
+    parser.add_argument("--nlayers", type=int, default=3)
+    parser.add_argument("--num-slices", type=int, default=1)
+    parser.add_argument("--nshots", type=int, default=1024)
+    parser.add_argument("--mode", type=str, default="statevector",
+                        choices=["expval", "statevector", "samples"],
+                        help="expval: local_expectation; statevector: to_dense; samples: sampling")
+    parser.add_argument("--mpi", action="store_true",
+                        help="Use MPI-parallel contraction (run with mpirun -n N)")
+    parser.add_argument("--no-compare", action="store_true",
+                        help="Skip qibojit reference run")
+    parser.add_argument("--save-path", type=str, default=None,
+                        help="Save contraction tree to a pickle file")
+    parser.add_argument("--load-path", type=str, default=None,
+                        help="Load contraction tree from a pickle file (skip path search)")
+    args = parser.parse_args()
+
+    print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}, mode={args.mode}")
+
+    np.random.seed(42)
+    circuit_tn = make_circuit(args.circuit, args.nqubits, args.nlayers)
+    try:
+        if args.mode == "expval":
+            expval_tn, t_tn = run_quimb_tn(circuit_tn, args.nqubits, args.num_slices,
+                                           load_path=args.load_path, save_path=args.save_path)
+            print(f"\n[quimb TN]  time={t_tn:.4f}s  <Z_0>={expval_tn:.8f}")
+        elif args.mode == "statevector":
+            if args.mpi:
+                sv_tn, t_tn = run_quimb_tn_statevector_mpi(circuit_tn, args.nqubits, args.num_slices,
+                                                           load_path=args.load_path, save_path=args.save_path)
+            else:
+                sv_tn, t_tn = run_quimb_tn_statevector(circuit_tn, args.nqubits, args.num_slices,
+                                                       load_path=args.load_path, save_path=args.save_path)
+            if sv_tn is not None:
+                print(f"\n[quimb TN]  time={t_tn:.4f}s  statevector shape={sv_tn.shape}")
+                np.save(f"data/sv_tn_{args.circuit}{args.nqubits}.npy", sv_tn)
+        else:
+            _, t_tn = run_quimb_tn_samples(circuit_tn, args.nqubits, args.nshots)
+            print(f"\n[quimb TN]  time={t_tn:.4f}s")
+            args.no_compare = True  # samples 模式无法和 qibojit 期望值对比
+    except Exception as e:
+        print(f"[quimb TN] FAILED: {e}")
+        raise
+
+    if not args.no_compare and args.mode != "statevector":
+        np.random.seed(42)
+        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+        expval_ref, t_ref = qibojit_expval(circuit_ref, args.nqubits)
+        print(f"[qibojit]   time={t_ref:.4f}s  <Z_0>={expval_ref:.8f}")
+        print(f"\nDiff     : {abs(expval_tn - expval_ref):.2e}")
+        if t_tn > 0:
+            print(f"Speedup  : {t_ref/t_tn:.2f}x")
+    elif not args.no_compare and args.mode == "statevector" and sv_tn is not None:
+        np.random.seed(42)
+        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+        sv_ref, t_ref = run_qibojit(circuit_ref)
+        fid = abs(np.dot(sv_ref.conj(), sv_tn)) ** 2
+        l2_err = np.linalg.norm(sv_ref - sv_tn)
+        print(f"[qibojit]   time={t_ref:.4f}s")
+        print(f"Fidelity : {fid:.8f}  (1=perfect)")
+        print(f"L2 error : {l2_err:.2e}")
+        if t_tn > 0:
+            print(f"Speedup  : {t_ref/t_tn:.2f}x")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmark_tn_mpi.py
+++ b/benchmark_tn_mpi.py
@@ -0,0 +1,246 @@
+"""MPI-parallel TN benchmark: path search + contraction via MPI."""
+import pickle
+import time
+import argparse
+import numpy as np
+import cotengra as ctg
+import qibo
+from qibo import Circuit, gates
+from mpi4py import MPI
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+
+def _run_serial_search(tn_bytes, output_inds, repeats, seed, num_slices, n_ranks):
+    """Run one serial HyperOptimizer in a subprocess, return (width, tree)."""
+    import pickle, cotengra as ctg, random
+    random.seed(seed)
+    tn = pickle.loads(tn_bytes)
+    opt = ctg.HyperOptimizer(
+        methods=['kahypar', 'kahypar-agglom', 'spinglass'],
+        max_repeats=repeats,
+        parallel=False,
+        minimize='flops',
+        max_time=600,
+        optlib="random",
+        slicing_opts={'target_size': 2**30, 'allow_outer': False},
+        progbar=False,
+    )
+    tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
+    return tree.contraction_width(), tree
+
+
+def parallel_search(tn, output_inds, total_repeats, n_workers, num_slices, n_ranks,
+                    timeout=None):
+    """Launch n_workers subprocesses each running serial search, return best tree."""
+    import pickle, os, signal
+    from concurrent.futures import ProcessPoolExecutor, as_completed
+    tn_bytes = pickle.dumps(tn)
+    repeats_per = max(1, total_repeats // n_workers)
+    best_width, best_tree = float('inf'), None
+
+    with ProcessPoolExecutor(max_workers=n_workers) as pool:
+        futures = {
+            pool.submit(_run_serial_search, tn_bytes, output_inds,
+                        repeats_per, seed, num_slices, n_ranks): seed
+            for seed in range(n_workers)
+        }
+        pids = {f: p.pid for f, p in zip(futures, pool._processes.values())}
+        try:
+            for fut in as_completed(futures, timeout=timeout):
+                try:
+                    width, tree = fut.result()
+                    if width < best_width:
+                        best_width, best_tree = width, tree
+                except Exception as e:
+                    print(f"  [worker failed] {e}")
+        except TimeoutError:
+            pass
+        for fut, pid in pids.items():
+            if not fut.done():
+                try:
+                    os.kill(pid, signal.SIGKILL)
+                except ProcessLookupError:
+                    pass
+
+    return best_tree
+
+
+def make_circuit(circuit_type, nqubits, nlayers=1):
+    c = Circuit(nqubits)
+    if circuit_type == "qft":
+        from qibo.models import QFT
+        return QFT(nqubits)
+    elif circuit_type == "variational":
+        for layer in range(nlayers):
+            for q in range(nqubits):
+                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CZ(q, q + 1))
+    elif circuit_type == "ghz":
+        c.add(gates.H(0))
+        for q in range(nqubits - 1):
+            c.add(gates.CNOT(q, q + 1))
+    elif circuit_type == "brickwork":
+        for q in range(nqubits):
+            c.add(gates.H(q))
+        for layer in range(nlayers):
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CNOT(q, q + 1))
+                c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi)))
+                c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi)))
+    else:
+        raise ValueError(f"Unknown circuit: {circuit_type}")
+    return c
+
+
+def _contract_mpi(tree, arrays, comm, root=0):
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    is_torch = type(arrays[0]).__module__.startswith("torch")
+
+    result_np = None
+    for i in range(rank, tree.multiplicity, size):
+        x = tree.contract_slice(arrays, i)
+        x_np = np.asfortranarray(x.detach().cpu().numpy() if is_torch else np.asarray(x))
+        result_np = x_np if result_np is None else result_np + x_np
+
+    if result_np is None:
+        result_np = np.zeros(1, dtype=np.complex64)
+
+    result = np.zeros_like(result_np) if rank == root else None
+    comm.Reduce(result_np, result, root=root)
+
+    if rank == root:
+        import torch
+        return torch.from_numpy(np.asarray(result)) if is_torch else result
+    return None
+
+
+def run_mpi(circuit, nqubits, num_slices, total_repeats=1024,
+            load_path=None, save_path=None):
+    """Each MPI rank runs serial path search over total_repeats/size trials,
+    rank 0 picks the global best, then all ranks contract in parallel."""
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="tn")
+
+    import torch
+    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
+                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
+    qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex64)
+
+    # --- path search: each rank serial, gather best to rank 0 ---
+    if load_path:
+        if rank == 0:
+            with open(load_path, "rb") as f:
+                saved = pickle.load(f)
+            tree, psi, t_search = saved["tree"], saved["psi"], 0.0
+            print(f"  [path loaded]  {load_path}")
+        else:
+            tree = psi = None
+            t_search = 0.0
+    else:
+        rank_repeats = max(1, total_repeats // size)
+        t0 = time.time()
+        # get TN object first (no contraction), then run parallel search
+        psi_tn = qc.to_dense(rehearse="tn")
+        local_tree = parallel_search(
+            psi_tn, psi_tn.outer_inds(), rank_repeats, n_workers=48,
+            num_slices=num_slices, n_ranks=size, timeout=630,
+        )
+        t_search = time.time() - t0
+        local_psi = psi_tn
+
+        all_results = comm.gather((local_tree.contraction_width(), local_tree, local_psi), root=0)
+        if rank == 0:
+            _, tree, psi = min(all_results, key=lambda x: x[0])
+            print(f"  [path search]  {t_search:.3f}s  "
+                  f"flops~2^{tree.contraction_cost():.2f}  "
+                  f"size~2^{tree.contraction_width():.2f}  "
+                  f"slices={tree.multiplicity}")
+            if save_path:
+                with open(save_path, "wb") as f:
+                    pickle.dump({"tree": tree, "psi": psi}, f)
+                print(f"  [path saved]   {save_path}")
+        else:
+            tree = psi = None
+
+        if save_path:
+            t_search = comm.bcast(t_search, root=0)
+            return None, t_search
+
+    tree = comm.bcast(tree, root=0)
+    psi = comm.bcast(psi, root=0)
+    t_search = comm.bcast(t_search, root=0)
+
+    # --- contraction: all ranks work in parallel ---
+    import torch
+    torch.set_num_threads(max(1, 48 // size))
+    arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex64) for a in psi.arrays]
+    t0 = time.time()
+    sv = _contract_mpi(tree, arrays, comm, root=0)
+    t_contract = time.time() - t0
+
+    if rank == 0:
+        print(f"  [contraction]  {t_contract:.3f}s")
+        return np.array(sv).reshape(-1), t_search + t_contract
+    return None, t_search + t_contract
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=30)
+    parser.add_argument("--circuit", type=str, default="qft",
+                        choices=["qft", "variational", "ghz", "brickwork"])
+    parser.add_argument("--nlayers", type=int, default=3)
+    parser.add_argument("--num-slices", type=int, default=1)
+    parser.add_argument("--total-repeats", type=int, default=1024)
+    parser.add_argument("--save-path", type=str, default=None)
+    parser.add_argument("--load-path", type=str, default=None)
+    parser.add_argument("--no-compare", action="store_true")
+    args = parser.parse_args()
+
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+
+    if rank == 0:
+        print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, "
+              f"nlayers={args.nlayers}, ranks={comm.Get_size()}")
+
+    np.random.seed(42)
+    circuit = make_circuit(args.circuit, args.nqubits, args.nlayers)
+
+    try:
+        sv, t_total = run_mpi(circuit, args.nqubits, args.num_slices,
+                              total_repeats=args.total_repeats,
+                              load_path=args.load_path, save_path=args.save_path)
+    except Exception as e:
+        if rank == 0:
+            print(f"[FAILED] {e}")
+        raise
+
+    if rank == 0 and sv is not None:
+        print(f"\n[quimb TN MPI]  time={t_total:.4f}s  shape={sv.shape}")
+        np.save(f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy", sv)
+
+        if not args.no_compare:
+            from benchmark_tn import run_qibojit
+            np.random.seed(42)
+            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+            sv_ref, t_ref = run_qibojit(circuit_ref)
+            fid = abs(np.dot(sv_ref.conj(), sv)) ** 2
+            print(f"[qibojit]       time={t_ref:.4f}s")
+            print(f"Fidelity : {fid:.8f}")
+            print(f"L2 error : {np.linalg.norm(sv_ref - sv):.2e}")
+            if t_total > 0:
+                print(f"Speedup  : {t_ref/t_total:.2f}x")
+
+
+if __name__ == "__main__":
+    main()
--- a/compare_jit_tn_quimb.py
+++ b/compare_jit_tn_quimb.py
@@ -0,0 +1,51 @@
+import numpy as np
+import os
+import sys
+
+def check_results(ref_path, tn_path):
+    # 1. 检查文件是否存在
+    if not os.path.exists(ref_path) or not os.path.exists(tn_path):
+        print(f"Error: 找不到文件！\n参考文件: {ref_path}\n待测文件: {tn_path}")
+        return
+
+    print(f"正在加载数据并对比: \n  [Ref] {ref_path}\n  [TN ] {tn_path}\n")
+
+    try:
+        # 2. 加载状态向量
+        # mmap_mode='r' 可以防止大文件直接撑爆内存
+        sv_ref = np.load(ref_path, mmap_mode='r')
+        sv_tn = np.load(tn_path, mmap_mode='r')
+
+        # 3. 计算保真度 (Fidelity)
+        # fid = |<ref|tn>|^2
+        inner_product = np.dot(sv_ref.conj(), sv_tn)
+        fidelity = np.abs(inner_product)**2
+
+        # 4. 计算 L2 误差 (欧氏距离)
+        l2_error = np.linalg.norm(sv_ref - sv_tn)
+
+        # 5. 打印结果
+        print("-" * 30)
+        print(f"保真度 (Fidelity): {fidelity:.12f}")
+        #print(f"L2 范数误差:       {l2_error:.2e}")
+        print("-" * 30)
+
+        # phase-invariant L2: align global phase first
+        phase = inner_product / np.abs(inner_product)
+        l2_phase_corrected = np.linalg.norm(sv_ref - sv_tn / phase)
+        print(f"L2 误差(相位校正后): {l2_phase_corrected:.2e}")
+
+        if fidelity > 0.999999:
+            print("✅ 验证通过：结果高度一致。")
+        else:
+            print("❌ 警告：保真度较低，请检查收缩路径或截断误差。")
+
+    except Exception as e:
+        print(f"计算过程中发生错误: {e}")
+
+if __name__ == "__main__":
+    # 你可以在这里直接修改文件名
+    REF_FILE = 'data/sv_qibojit_qft30.npy'
+    TN_FILE  = 'data/sv_tn_qft30_mpi.npy'
+    
+    check_results(REF_FILE, TN_FILE)
--- a/doc/make.bat
+++ b/doc/make.bat
@@ -1,35 +1,35 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=source
-set BUILDDIR=build
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.https://www.sphinx-doc.org/
-	exit /b 1
-)
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
--- a/2
+++ b/2
@@ -0,0 +1,2 @@
+10.20.6.74:1                                                                                         
+10.20.6.102:1
--- a/inspect_path.py
+++ b/inspect_path.py
@@ -0,0 +1,21 @@
+import pickle
+import sys
+
+path = sys.argv[1] if len(sys.argv) > 1 else 'path/path.pkl.34.mpi'
+
+with open(path, 'rb') as f:
+    d = pickle.load(f)
+tree = d['tree']
+
+width = tree.contraction_width()
+slices = tree.multiplicity
+sliced_width = width - (slices.bit_length() - 1)
+
+print(f"Path: {path}")
+print(f"Width (unsliced):     {width:.1f}")
+print(f"Slices:               {slices}")
+print(f"Sliced width:         {sliced_width:.1f}")
+print(f"Peak memory (total):  {2**width * 16 / 1e9:.1f} GB")
+print(f"Peak per slice:       {2**sliced_width * 16 / 1e9:.2f} GB")
+print(f"Sliced indices:       {len(tree.sliced_inds)}")
+print(f"Cost (log2 flops):    {tree.contraction_cost(log=True):.2f}")
--- a/poetry.lock
+++ b/poetry.lock
@@ -1733,14 +1733,14 @@ files = [

 [[package]]
 name = "mako"
-version = "1.3.10"
+version = "1.3.11"
 description = "A super-fast templating language that borrows the best ideas from the existing templating languages."
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59"},
-    {file = "mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28"},
+    {file = "mako-1.3.11-py3-none-any.whl", hash = "sha256:e372c6e333cf004aa736a15f425087ec977e1fcbd2966aae7f17c8dc1da27a77"},
+    {file = "mako-1.3.11.tar.gz", hash = "sha256:071eb4ab4c5010443152255d77db7faa6ce5916f35226eb02dc34479b6858069"},
 ]

 [package.dependencies]
--- a/run_qibojit_ref.py
+++ b/run_qibojit_ref.py
@@ -0,0 +1,18 @@
+"""Run qibojit on 30-qubit QFT and save statevector for comparison."""
+import time
+import numpy as np
+import qibo
+from qibo.models import QFT
+
+#np.random.seed(42)
+circuit = QFT(32)
+
+qibo.set_backend("qibojit", platform="numba")
+t0 = time.time()
+result = circuit()
+elapsed = time.time() - t0
+
+sv = np.array(result.state(), dtype=complex).flatten()
+np.save("data/sv_qibojit_qft30.npy", sv)
+print(f"[qibojit] time={elapsed:.4f}s  shape={sv.shape}")
+print(f"Saved to sv_qibojit_qft30.npy")
--- a/src/qibotn/backends/quimb.py
+++ b/src/qibotn/backends/quimb.py
@@ -167,7 +167,7 @@ def execute_circuit(
        raise_error(ValueError, "Initial state not None supported only for MPS ansatz.")

    circ_quimb = self.circuit_ansatz.from_openqasm2_str(
-        circuit.to_qasm(), psi0=initial_state
+        circuit.to_qasm(), psi0=initial_state, gate_opts={"max_bond": self.max_bond_dimension, "cutoff": self.svd_cutoff}
    )

    if nshots:
@@ -186,7 +186,16 @@ def execute_circuit(
    else:
        frequencies = None
        measured_probabilities = None
-
+    '''
+    if return_array:
+        if self.ansatz == "mps":
+            psi = circ_quimb.psi
+            statevector = psi.to_dense().reshape(-1)
+        else:
+            statevector = circ_quimb.to_dense(backend=self.backend, optimize=self.contractions_optimizer)
+    else:
+        statevector = None
+    '''
    statevector = (
        circ_quimb.to_dense(backend=self.backend, optimize=self.contractions_optimizer)
        if return_array
@@ -291,6 +300,15 @@ def _qibo_circuit_to_quimb(
        quimb_gate_name = GATE_MAP.get(gate_name, None)
        if quimb_gate_name == "measure":
            continue
+        if gate_name == "cu1":
+            theta = gate.parameters[0]
+            c, t = gate.qubits
+            circ.apply_gate("RZ", theta / 2, c)
+            circ.apply_gate("RZ", theta / 2, t)
+            circ.apply_gate("CNOT", c, t)
+            circ.apply_gate("RZ", -theta / 2, t)
+            circ.apply_gate("CNOT", c, t)
+            continue
        if quimb_gate_name is None:
            raise_error(ValueError, f"Gate {gate_name} not supported in Quimb backend.")

--- a/src/qibotn/result.py
+++ b/src/qibotn/result.py
@@ -57,10 +57,10 @@ class TensorNetworkResult:
        return self.measures

    def state(self):
-        """Return the statevector if the number of qubits is less than 20."""
-        if self.nqubits < 20:
+        """Return the statevector if the number of qubits is less than 35."""
+        if self.nqubits < 35:
            return self.statevector
        raise_error(
            NotImplementedError,
-            f"Tensor network simulation cannot be used to reconstruct statevector for >= 20 .",
+            f"Tensor network simulation cannot be used to reconstruct statevector for >= 35 .",
        )
--- a/sweep_bond_32q.py
+++ b/sweep_bond_32q.py
@@ -0,0 +1,39 @@
+"""Bond dimension sweep for 32-qubit variational circuit."""
+import os
+import sys
+import numpy as np
+
+sys.path.insert(0, os.path.dirname(__file__))
+from benchmark_mps import make_circuit, run_qibojit, run_quimb_mps, compare, jit_cache_path, DATA_DIR
+
+NQUBITS = 32
+NLAYERS = 5
+BOND_VALUES = [1, 8, 16, 32, 64, 128, 256]
+SVD_CUTOFF = 1e-6
+OPTIMIZER = "auto-hq"
+
+if __name__ == "__main__":
+    cache_path = jit_cache_path("variational", NQUBITS, NLAYERS)
+
+    if os.path.exists(cache_path):
+        sv_ref = np.load(cache_path)
+        print(f"[qibojit] loaded from cache: {cache_path}\n")
+    else:
+        np.random.seed(42)
+        circuit_ref = make_circuit("variational", NQUBITS, NLAYERS)
+        sv_ref, t_ref = run_qibojit(circuit_ref)
+        np.save(cache_path, sv_ref)
+        print(f"[qibojit] time={t_ref:.4f}s  (saved to {cache_path})\n")
+
+    print(f"{'bond':>6}  {'time(s)':>10}  {'fidelity':>12}  {'l2_err':>10}")
+    print("-" * 46)
+
+    for bond in BOND_VALUES:
+        np.random.seed(42)
+        circuit_mps = make_circuit("variational", NQUBITS, NLAYERS)
+        try:
+            sv_mps, t_mps = run_quimb_mps(circuit_mps, bond, SVD_CUTOFF, OPTIMIZER)
+            fidelity, l2_err = compare(sv_ref, sv_mps)
+            print(f"{bond:>6}  {t_mps:>10.4f}  {fidelity:>12.8f}  {l2_err:>10.2e}")
+        except Exception as e:
+            print(f"{bond:>6}  FAILED: {e}")
--- a/tests/contract.py
+++ b/tests/contract.py
@@ -1,27 +0,0 @@
-import time
-import pickle
-
-
-def run(input="tree.pkl"):
-    with open(input, "rb") as f:
-        data = pickle.load(f)
-
-    sliced_tree = data["sliced_tree"]
-    arrays = data["arrays"]
-    n_slices = sliced_tree.nslices
-    print(f"Total slices: {n_slices}")
-
-    t0 = time.perf_counter()
-    total = sum(sliced_tree.contract_slice(arrays, i, backend='numpy',implementation='cotengra') for i in range(n_slices))
-    t1 = time.perf_counter()
-
-    print(f"Contract: {t1 - t0:.4f} s")
-    #print(f"Result: {total:.10f}")
-
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--input", type=str, default="tree.pkl.bak")
-    args = parser.parse_args()
-    run(args.input)
--- a/tests/gen_qasm.py
+++ b/tests/gen_qasm.py
@@ -1,60 +0,0 @@
-"""生成比赛常用测试电路的 QASM 文件。"""
-import argparse
-import qibo
-from qibo.models import QFT, Circuit
-from qibo import gates
-import numpy as np
-
-qibo.set_backend("numpy")
-
-
-def gen_qft(n_qubits):
-    return QFT(n_qubits, with_swaps=True).to_qasm()
-
-
-def gen_random(n_qubits, depth, seed):
-    rng = np.random.default_rng(seed)
-    c = Circuit(n_qubits)
-    for _ in range(depth):
-        for q in range(n_qubits):
-            c.add(gates.H(q))
-        for q in range(0, n_qubits - 1, 2):
-            c.add(gates.CZ(q, q + 1))
-    return c.to_qasm()
-
-
-def gen_supremacy(n_qubits, depth, seed):
-    """Google supremacy 风格：随机单比特门 + CZ"""
-    rng = np.random.default_rng(seed)
-    single = [gates.X, gates.Y, gates.H]
-    c = Circuit(n_qubits)
-    for _ in range(depth):
-        for q in range(n_qubits):
-            g = single[rng.integers(3)]
-            c.add(g(q))
-        for q in range(0, n_qubits - 1, 2):
-            c.add(gates.CZ(q, q + 1))
-        for q in range(1, n_qubits - 1, 2):
-            c.add(gates.CZ(q, q + 1))
-    return c.to_qasm()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--circuit", default="qft", choices=["qft", "random", "supremacy"])
-    parser.add_argument("--n_qubits", type=int, default=20)
-    parser.add_argument("--depth", type=int, default=10)
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--out", default="circuit.qasm")
-    args = parser.parse_args()
-
-    if args.circuit == "qft":
-        qasm = gen_qft(args.n_qubits)
-    elif args.circuit == "random":
-        qasm = gen_random(args.n_qubits, args.depth, args.seed)
-    else:
-        qasm = gen_supremacy(args.n_qubits, args.depth, args.seed)
-
-    with open(args.out, "w") as f:
-        f.write(qasm)
-    print(f"Written: {args.out}  ({args.n_qubits} qubits, {args.circuit})")
--- a/tests/hostfile
+++ b/tests/hostfile
@@ -1,2 +0,0 @@
-192.168.20.102
-192.168.20.101
--- a/tests/mpi_v.py
+++ b/tests/mpi_v.py
@@ -1,126 +0,0 @@
-"""
-MPI + ThreadPoolExecutor 混合并行张量网络收缩。
-每个 MPI rank 负责一部分 slice（stride 分配），
-rank 内用 ThreadPoolExecutor 并行执行各 slice（每线程一个 slice）。
-
-用法：
-    mpirun -n <N> python mpi_v.py --qasm circuit.qasm --target-slices 16 --threads 8
-"""
-import os
-import time
-import argparse
-import numpy as np
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from mpi4py import MPI
-
-comm = MPI.COMM_WORLD
-rank = comm.Get_rank()
-size = comm.Get_size()
-
-import quimb.tensor as qtn
-import cotengra as ctg
-
-
-def _contract_slice(sliced_tree, arrays, idx):
-    return sliced_tree.contract_slice(arrays, idx, backend="numpy")
-
-
-def run(qasm_path, target_slices, n_threads, max_repeats):
-    # ── 构建张量网络（rank 0，broadcast arrays）──
-    if rank == 0:
-        with open(qasm_path) as f:
-            qasm_str = f.read()
-        # 不用 full_simplify，保持 outer_inds 完整
-        psi = qtn.Circuit.from_openqasm2_str(qasm_str).psi
-        n_qubits = len([i for i in psi.outer_inds() if i.startswith("k")])
-        output_inds = [f"k{i}" for i in range(n_qubits)]
-        arrays = [t.data for t in psi.tensors]
-    else:
-        psi = None
-        n_qubits = None
-        arrays = None
-        output_inds = None
-
-    n_qubits = comm.bcast(n_qubits, root=0)
-    arrays = comm.bcast(arrays, root=0)
-    output_inds = comm.bcast(output_inds, root=0)
-
-    # ── 路径搜索（rank 0）+ broadcast ──
-    t0 = time.perf_counter()
-    if rank == 0:
-        opt = ctg.HyperOptimizer(
-            methods=["kahypar", "greedy"],
-            max_repeats=max_repeats,
-            minimize="flops",
-            parallel=min(96, os.cpu_count()),
-        )
-        tree = psi.contraction_tree(optimize=opt, output_inds=output_inds)
-        n = target_slices
-        sliced_tree = None
-        while n >= 1:
-            try:
-                sliced_tree = tree.slice(target_size=n, allow_outer=False)
-                break
-            except RuntimeError:
-                n //= 2
-        if sliced_tree is None:
-            sliced_tree = tree.slice(target_slices=1, allow_outer=True)
-        print(f"[rank 0] path search: {time.perf_counter()-t0:.2f}s  slices: {sliced_tree.nslices}", flush=True)
-    else:
-        sliced_tree = None
-
-    sliced_tree = comm.bcast(sliced_tree, root=0)
-    n_slices = sliced_tree.nslices
-
-    # ── 分布式收缩（MPI stride + ThreadPoolExecutor）──
-    my_indices = list(range(rank, n_slices, size))
-    local_result = np.zeros(2**n_qubits, dtype=np.complex128)
-
-    comm.Barrier()
-    t1 = time.perf_counter()
-
-    with ThreadPoolExecutor(max_workers=n_threads) as pool:
-        for batch_start in range(0, len(my_indices), n_threads):
-            batch = my_indices[batch_start:batch_start + n_threads]
-            futures = {pool.submit(_contract_slice, sliced_tree, arrays, i): i for i in batch}
-            for fut in as_completed(futures):
-                local_result += np.array(fut.result()).flatten()
-
-    t2 = time.perf_counter()
-    if rank == 0:
-        print(f"[rank 0] contract: {t2-t1:.2f}s", flush=True)
-
-    # ── MPI reduce ──
-    total = comm.reduce(local_result, op=MPI.SUM, root=0)
-
-    if rank == 0:
-        print(f"result norm: {np.linalg.norm(total):.10f}", flush=True)
-        print(f"total time:  {t2-t0:.2f}s", flush=True)
-        return total
-    return None
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--qasm", required=True, help="QASM 文件路径")
-    parser.add_argument("--target-slices", type=int, default=None,
-                        help="目标切片数量（优先于 target-size）")
-    parser.add_argument("--target-size", type=int, default=28,
-                        help="切片目标大小指数（2^N），默认 28")
-    parser.add_argument("--threads", type=int, default=max(1, os.cpu_count() // size),
-                        help="每个 rank 的线程数，默认 cpu_count/size")
-    parser.add_argument("--max-repeats", type=int, default=256,
-                        help="cotengra 路径搜索重复次数")
-    args = parser.parse_args()
-
-    target = args.target_slices if args.target_slices else 2**args.target_size
-    mode = "slices" if args.target_slices else f"size=2^{args.target_size}"
-
-    if rank == 0:
-        print(f"ranks={size}  threads/rank={args.threads}  target_{mode}", flush=True)
-
-    run(args.qasm, target, args.threads, args.max_repeats)
-
-
-if __name__ == "__main__":
-    main()
--- a/tests/quimb_mpi.py
+++ b/tests/quimb_mpi.py
@@ -1,68 +0,0 @@
-import os
-import time
-import numpy as np
-import quimb.tensor as qtn
-import cotengra as ctg
-'''
-# --- 1. 关键：在导入 numpy/quimb 之前设置环境变量 ---
-# 告诉底层 BLAS 库 (MKL/OpenBLAS) 使用 96 个线程
-os.environ["OMP_NUM_THREADS"] = "1"
-os.environ["MKL_NUM_THREADS"] = "1"
-os.environ["OPENBLAS_NUM_THREADS"] = "1"
-# 优化线程亲和性，避免线程在不同 CPU 核心间跳变，提升缓存命中率
-os.environ["KMP_AFFINITY"] = "granularity=fine,compact,1,0"
-os.environ["KMP_BLOCKTIME"] = "0"
-'''
-# 现在导入库
-import psutil
-
-def run_baseline(n_qubits=50, depth=20):
-    print(f"🚀 {n_qubits} Qubits, Depth {depth}")
-    print(f"💻 Detected Logical Cores: {os.cpu_count()}")
-    
-    # 1. 构建电路 (必须 complex128 保证精度)
-    circ = qtn.Circuit(n_qubits, dtype=np.complex128)
-    for d in range(depth):
-        for i in range(n_qubits):
-            circ.apply_gate('H', i)
-        for i in range(0, n_qubits - 1, 2):
-            circ.apply_gate('CZ', i, i + 1)
-            
-    psi = circ.psi
-    
-    # 2. 构建闭合网络 <psi|psi>
-    net = psi.conj() & psi
-    
-    # 3. 路径搜索参数 (Kahypar)
-    print("🔍 Searching path with Kahypar...")
-    opt = ctg.HyperOptimizer(
-        methods=['kahypar'],
-        max_repeats=128,
-        parallel=96,       
-        minimize='flops',
-        on_trial_error='ignore'
-    )
-    
-    # 4. 阶段1：路径搜索
-    t0 = time.perf_counter()
-    tree = net.contraction_tree(optimize=opt)
-    t1 = time.perf_counter()
-    print(f"🔍 Path search done: {t1 - t0:.4f} s")
-
-    # 5. 阶段2：张量收缩
-    result = net.contract(optimize=tree, backend='numpy')
-    t2 = time.perf_counter()
-    peak_mem = psutil.Process().memory_info().rss / 1024**3
-
-    print(f"✅ Done!")
-    print(f"⏱️ Contract: {t2 - t1:.4f} s  |  Total: {t2 - t0:.4f} s")
-    print(f"💾 Peak Memory: {peak_mem:.2f} GB")
-    print(f"🔢 Result: {result:.10f}")
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--n_qubits", type=int, default=50)
-    parser.add_argument("--depth", type=int, default=20)
-    args = parser.parse_args()
-    run_baseline(n_qubits=args.n_qubits, depth=args.depth)
--- a/tests/quimb_mpi2.py
+++ b/tests/quimb_mpi2.py
@@ -1,90 +0,0 @@
-import time
-import numpy as np
-import quimb.tensor as qtn
-import cotengra as ctg
-from mpi4py import MPI
-
-comm = MPI.COMM_WORLD
-rank = comm.Get_rank()
-size = comm.Get_size()
-
-def build_qft(n_qubits):
-    circ = qtn.Circuit(n_qubits, dtype=np.complex128)
-    for i in range(n_qubits):
-        circ.apply_gate('H', i)
-        for j in range(i + 1, n_qubits):
-            circ.apply_gate('CPHASE', np.pi / 2 ** (j - i), i, j)
-    return circ
-
-def run_mpi(n_qubits, depth=None):
-    if rank == 0:
-        print(f"MPI size: {size} ranks")
-        print(f"Circuit: QFT {n_qubits} qubits")
-
-    circ = build_qft(n_qubits)
-    psi = circ.psi
-
-    # 期望值网络：<psi|Z_0|psi>
-    Z = np.array([[1, 0], [0, -1]], dtype=np.complex128)
-    bra = psi.conj().reindex({f'k{i}': f'b{i}' for i in range(n_qubits)})
-    obs = qtn.Tensor(Z, inds=(f'k0', f'b0'))
-    net = psi & obs & bra
-
-    # 2. 所有 rank 并行搜索路径，rank 0 选全局最优
-    t0 = time.perf_counter()
-    repeats_per_rank = max(1, 128 // size)
-    opt = ctg.HyperOptimizer(
-        methods=['kahypar'],
-        #methods=['greedy'],
-        #max_repeats=repeats_per_rank,
-        max_repeats=repeats_per_rank,
-        minimize='flops',
-        parallel=max(1, 96 // size),
-    )
-    local_tree = net.contraction_tree(optimize=opt)
-
-    all_trees = comm.gather(local_tree, root=0)
-
-    if rank == 0:
-        tree = min(all_trees, key=lambda t: t.contraction_cost())
-        t1 = time.perf_counter()
-        print(f"[rank 0] Path search: {t1 - t0:.4f} s")
-    else:
-        tree = None
-
-    tree = comm.bcast(tree, root=0)
-
-    # 3. rank 0 切片，broadcast sliced_tree
-    if rank == 0:
-        sliced_tree = tree.slice(target_size=2**27)
-    else:
-        sliced_tree = None
-    sliced_tree = comm.bcast(sliced_tree, root=0)
-    n_slices = sliced_tree.nslices
-
-    if rank == 0:
-        print(f"Total slices: {n_slices}, each rank handles ~{n_slices // size}")
-
-    arrays = [t.data for t in net.tensors]
-
-    # 每个 rank 处理自己负责的切片
-    t2 = time.perf_counter()
-    local_result = 0.0 + 0.0j
-    for i in range(rank, n_slices, size):
-        local_result += sliced_tree.contract_slice(arrays, i, backend='numpy')
-    t3 = time.perf_counter()
-
-    # 4. reduce 汇总到 rank 0
-    total = comm.reduce(local_result, op=MPI.SUM, root=0)
-
-    if rank == 0:
-        print(f"[rank 0] Contract: {t3 - t2:.4f} s")
-        print(f"Result: {total:.10f}")
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--n_qubits", type=int, default=20)
-    parser.add_argument("--depth", type=int, default=30)
-    args = parser.parse_args()
-    run_mpi(args.n_qubits, args.depth)
--- a/tests/quimb_mpi3.py
+++ b/tests/quimb_mpi3.py
@@ -1,103 +0,0 @@
-import time
-import numpy as np
-import quimb.tensor as qtn
-import cotengra as ctg
-from mpi4py import MPI
-
-comm = MPI.COMM_WORLD
-rank = comm.Get_rank()
-size = comm.Get_size()
-
-def build_qft_circuit(n_qubits):
-    """构建标准 QFT 电路"""
-    circ = qtn.Circuit(n_qubits, dtype=np.complex128)
-    for i in range(n_qubits):
-        # 1. 施加 H 门
-        circ.apply_gate('H', i)
-        # 2. 施加受控相位旋转
-        for j in range(i + 1, n_qubits):
-            theta = np.pi / (2**(j - i))
-            circ.apply_gate('CPHASE', theta, i, j)
-    return circ
-
-def run_mpi(n_qubits):
-    if rank == 0:
-        print(f"MPI size: {size} ranks")
-        print(f"Circuit: QFT {n_qubits} qubits")
-
-    # 1. 所有 rank 独立构建 QFT 电路
-    circ = build_qft_circuit(n_qubits)
-    
-    # 物理观测：计算 <psi|psi>，结果应为 1.0
-    # 注意：QFT 是幺正变换，末态模长平方必为 1
-    psi = circ.psi
-    net = psi.conj() & psi 
-
-    # 2. 路径搜索优化
-    t0 = time.perf_counter()
-    # 每个 rank 尝试不同的种子，增加找到全局最优路径的概率
-    repeats_per_rank = max(1, 256 // size)
-    opt = ctg.HyperOptimizer(
-        methods=['kahypar'],
-        max_repeats=repeats_per_rank,
-        minimize='flops',
-        parallel=max(1, 96 // size),
-        ) 
-    # 搜索收缩树
-    local_tree = net.contraction_tree(optimize=opt)
-    
-    # 汇总所有 rank 找到的树，在 rank 0 选出 FLOPs 最低的那棵
-    all_trees = comm.gather(local_tree, root=0)
-
-    if rank == 0:
-        tree = min(all_trees, key=lambda t: t.contraction_cost())
-        t1 = time.perf_counter()
-        print(f"[rank 0] Path search: {t1 - t0:.4f} s")
-        print(f"[rank 0] Best path FLOPs: {tree.contraction_cost():.2e}")
-    else:
-        tree = None
-
-    # 将最优路径广播给所有进程
-    tree = comm.bcast(tree, root=0)
-
-    # 3. 切片处理（性能控制核心）
-    if rank == 0:
-        # 比赛建议：将 target_size 设为能填满单进程内存的 50%-70%
-        # 或者改用 target_slices=size * 4 以确保负载绝对平衡
-        sliced_tree = tree.slice(target_size=2**27) 
-    else:
-        sliced_tree = None
-        
-    sliced_tree = comm.bcast(sliced_tree, root=0)
-    n_slices = sliced_tree.nslices
-
-    if rank == 0:
-        print(f"Total slices: {n_slices}, each rank handles ~{n_slices // size + 1}")
-
-    # 获取原始张量数据
-    arrays = [t.data for t in net.tensors]
-
-    # 4. 执行收缩计算
-    t2 = time.perf_counter()
-    local_result = 0.0 + 0.0j
-    # 简单的静态负载均衡：每个 rank 跳步处理切片
-    for i in range(rank, n_slices, size):
-        local_result += sliced_tree.contract_slice(arrays, i, backend='numpy')
-    t3 = time.perf_counter()
-
-    # 5. 结果汇总
-    total = comm.reduce(local_result, op=MPI.SUM, root=0)
-
-    if rank == 0:
-        duration = t3 - t2
-        print(f"[rank 0] Contract: {duration:.4f} s")
-        # 对于 <psi|psi>，QFT 的正确结果应无限接近 1.0
-        print(f"Result (Norm): {total.real:.10f} + {total.imag:.10f}j")
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--n_qubits", type=int, default=20)
-    # QFT 的深度由比特数自动决定，所以删除了 --depth 参数
-    args = parser.parse_args()
-    run_mpi(args.n_qubits)
--- a/tests/search_tree.py
+++ b/tests/search_tree.py
@@ -1,56 +0,0 @@
-import time
-import pickle
-import numpy as np
-import quimb.tensor as qtn
-import cotengra as ctg
-
-
-def build_qft(n_qubits):
-    circ = qtn.Circuit(n_qubits, dtype=np.complex128)
-    for i in range(n_qubits):
-        circ.apply_gate('H', i)
-        for j in range(i + 1, n_qubits):
-            circ.apply_gate('CPHASE', np.pi / 2 ** (j - i), i, j)
-    return circ
-
-
-def run(n_qubits, output="tree.pkl"):
-    print(f"Circuit: QFT {n_qubits} qubits")
-
-    circ = build_qft(n_qubits)
-    psi = circ.psi
-
-    Z = np.array([[1, 0], [0, -1]], dtype=np.complex128)
-    bra = psi.conj().reindex({f'k{i}': f'b{i}' for i in range(n_qubits)})
-    obs = qtn.Tensor(Z, inds=(f'k0', f'b0'))
-    net = psi & obs & bra
-
-    t0 = time.perf_counter()
-    opt = ctg.HyperOptimizer(
-        methods=['kahypar'],
-        max_repeats=32,
-        minimize='combo',
-        parallel=8,
-    )
-    tree = net.contraction_tree(optimize=opt)
-    t1 = time.perf_counter()
-    print(f"Path search: {t1 - t0:.4f} s")
-    print(tree)
-
-    sliced_tree = tree.slice(target_size=2**28)
-    print(f"Total slices: {sliced_tree.nslices}")
-
-    arrays = [t.data for t in net.tensors]
-
-    with open(output, "wb") as f:
-        pickle.dump({"sliced_tree": sliced_tree, "arrays": arrays}, f)
-    print(f"Saved to {output}")
-
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--n_qubits", type=int, default=18)
-    parser.add_argument("--output", type=str, default="tree.pkl")
-    args = parser.parse_args()
-    run(args.n_qubits, args.output)
--- a/tests/test_quimb_backend.py
+++ b/tests/test_quimb_backend.py
@@ -61,6 +61,6 @@ def test_eval(nqubits: int, tolerance: float, is_mps: bool):
        qasm_circ, init_state_tn, gate_opt, backend=config.quimb.backend
    ).flatten()

-    #assert np.allclose(
-    #    result_sv, result_tn, atol=tolerance
-    #), "Resulting dense vectors do not match"
+    assert np.allclose(
+        result_sv, result_tn, atol=tolerance
+    ), "Resulting dense vectors do not match"
--- a/tests/tree.pkl.bak
+++ b/tests/tree.pkl.bak
Author	SHA1	Message	Date
jaunatisblue	5a692033a6	添加MPI并行TN benchmark及辅助脚本，移除旧benchmark Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-05 19:04:09 +08:00
jaunatisblue	a3f39a1d67	删除tn脚本implementation	2026-05-03 19:06:21 +08:00
jaunatisblue	dd222587b7	tn脚本更新 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-03 18:54:05 +08:00
jaunatisblue	740828872e	添加tn脚本 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-04-28 23:07:39 +08:00
jaunatisblue	80d9c1de5a	benchmark测试，发现瓶颈：路径搜索 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-04-27 18:59:54 +08:00
jaunatisblue	2c54840e7b	1.完成mps态脚本，与原始qibojit结果比对确定bond demension和cut off值；2.更新了官方库；3.新大陆 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-04-27 11:03:57 +08:00