添加MPI并行TN benchmark及辅助脚本，移除旧benchmark

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
删除tn脚本implementation
2026-05-05 19:04:09 +08:00 · 2026-05-03 19:06:21 +08:00 · 2026-05-03 18:54:05 +08:00 · 2026-04-28 23:07:39 +08:00 · 2026-04-27 18:59:54 +08:00 · 2026-04-27 11:03:57 +08:00
15 changed files with 1604 additions and 44 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,10 +2,14 @@
 __pycache__/
 *.py[cod]
 *$py.class
-
+data/
 # C extensions
 *.so
 bak/
 path/
 profiles/
 perf*
 # Distribution / packaging
 .Python
 build/
--- a/bench_profile.py
+++ b/bench_profile.py
@@ -0,0 +1,460 @@
 """Benchmark: qibotn/quimb generic TN — single-process torch profiling version."""
 import os
 import pickle
 import time
 import argparse
 import numpy as np
 import cotengra as ctg
 import qibo
 from qibo import Circuit, gates
 def make_circuit(circuit_type, nqubits, nlayers=1):
    c = Circuit(nqubits)
    if circuit_type == "qft":
        from qibo.models import QFT
        return QFT(nqubits)
    elif circuit_type == "variational":
        for layer in range(nlayers):
            for q in range(nqubits):
                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
            offset = layer % 2
            for q in range(offset, nqubits - 1, 2):
                c.add(gates.CZ(q, q + 1))
    elif circuit_type == "ghz":
        c.add(gates.H(0))
        for q in range(nqubits - 1):
            c.add(gates.CNOT(q, q + 1))
    elif circuit_type == "brickwork":
        for q in range(nqubits):
            c.add(gates.H(q))
        for layer in range(nlayers):
            offset = layer % 2
            for q in range(offset, nqubits - 1, 2):
                c.add(gates.CNOT(q, q + 1))
                c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi)))
                c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi)))
    else:
        raise ValueError(f"Unknown circuit: {circuit_type}")
    return c
 def make_z_observable(nqubits):
    """Z on qubit 0 only — single contraction for benchmarking."""
    return ["z"], [(0,)], [1.0]
 def export_profiler_outputs(prof, trace_path):
    """Export Chrome trace and text table."""
    prof.export_chrome_trace(trace_path)
    table_path = trace_path.replace(".json", ".txt")
    with open(table_path, "w") as f:
        f.write(
            prof.key_averages().table(
                sort_by="self_cpu_time_total",
                row_limit=200,
            )
        )
    print(f"  [torch profiler trace] {trace_path}")
    print(f"  [torch profiler table] {table_path}")
 def run_quimb_tn(
    circuit,
    nqubits,
    num_slices,
    load_path=None,
    save_path=None,
 ):
    """Mode: expval — compute <Z_0> via local_expectation."""
    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="tn")
    operators, sites, coeffs = make_z_observable(nqubits)
    ops = b._string_to_quimb_operator(operators[0])
    qc = b._qibo_circuit_to_quimb(
        circuit,
        quimb_circuit_type=b.circuit_ansatz,
        gate_opts={"max_bond": None, "cutoff": 1e-10},
    )
    if load_path:
        with open(load_path, "rb") as f:
            saved = pickle.load(f)
        tree = saved["tree"]
        t_search = 0.0
        print(f"  [path loaded]  {load_path}")
    else:
        opt = ctg.HyperOptimizer(
            methods=["kahypar", "random-greedy", "spinglass"],
            max_repeats=16,
            parallel=True,
            max_time=60,
            slicing_opts={"target_slices": num_slices},
            progbar=True,
        )
        t0 = time.time()
        rehearsal = qc.local_expectation(
            ops,
            where=sites[0],
            optimize=opt,
            simplify_sequence="R",
            rehearse=True,
        )
        t_search = time.time() - t0
        tree = rehearsal["tree"]
        print(
            f"  [path search]  {t_search:.3f}s  "
            f"flops~2^{tree.contraction_cost():.2f}  "
            f"size~2^{tree.contraction_width():.2f}  "
            f"slices={tree.multiplicity}"
        )
        if save_path:
            with open(save_path, "wb") as f:
                pickle.dump({"tree": tree}, f)
            print(f"  [path saved]  {save_path}")
    t0 = time.time()
    expval = qc.local_expectation(
        ops,
        where=sites[0],
        optimize=tree,
        simplify_sequence="R",
    )
    t_contract = time.time() - t0
    print(f"  [contraction]  {t_contract:.3f}s")
    return float(expval.real), t_search + t_contract
 def run_quimb_tn_statevector(
    circuit,
    nqubits,
    num_slices,
    load_path=None,
    save_path=None,
    profile=False,
    profile_dir="profiles",
 ):
    """Mode: statevector — contract full TN to dense vector, single process."""
    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="tn")
    import torch
    qc = b._qibo_circuit_to_quimb(
        circuit,
        quimb_circuit_type=b.circuit_ansatz,
        gate_opts={"max_bond": None, "cutoff": 1e-10},
    )
    # 让 quimb 生成 torch tensor，这样 torch.profiler 能抓到 aten op。
    qc.to_backend = torch.from_numpy
    if load_path:
        with open(load_path, "rb") as f:
            saved = pickle.load(f)
        tree = saved["tree"]
        t_search = 0.0
        print(f"  [path loaded]  {load_path}")
    else:
        opt = ctg.HyperOptimizer(
            methods=["kahypar", "random-greedy", "spinglass"],
            max_repeats=500,
            parallel=48,
            max_time=100,
            minimize="size",
            slicing_opts={"target_slices": num_slices},
            progbar=True,
        )
        t0 = time.time()
        rehearsal = qc.to_dense(optimize=opt, rehearse=True)
        t_search = time.time() - t0
        tree = rehearsal["tree"]
        print(
            f"  [path search]  {t_search:.3f}s  "
            f"flops~2^{tree.contraction_cost():.2f}  "
            f"size~2^{tree.contraction_width():.2f}  "
            f"slices={tree.multiplicity}"
        )
        if save_path:
            with open(save_path, "wb") as f:
                pickle.dump({"tree": tree}, f)
            print(f"  [path saved]  {save_path}")
    os.makedirs(profile_dir, exist_ok=True)
    if profile:
        from torch.profiler import profile as torch_profile
        from torch.profiler import ProfilerActivity, record_function
        trace_path = os.path.join(
            profile_dir,
            (
                f"trace_statevector_"
                f"{circuit.nqubits}q_"
                f"slices{tree.multiplicity}_"
                f"{int(time.time())}.json"
            ),
        )
        t0 = time.time()
        with torch_profile(
            activities=[ProfilerActivity.CPU],
            record_shapes=True,
            profile_memory=True,
            with_stack=True,
        ) as prof:
            with record_function("qibotn_to_dense_contraction"):
                sv = qc.to_dense(optimize=tree).reshape(-1)
            with record_function("torch_to_numpy_view_or_copy"):
                if type(sv).__module__.startswith("torch"):
                    sv_tn = sv.detach().cpu().numpy()
                else:
                    sv_tn = np.asarray(sv)
        t_contract = time.time() - t0
        export_profiler_outputs(prof, trace_path)
    else:
        t0 = time.time()
        sv = qc.to_dense(optimize=tree).reshape(-1)
        t_contract = time.time() - t0
        if type(sv).__module__.startswith("torch"):
            sv_tn = sv.detach().cpu().numpy()
        else:
            sv_tn = np.asarray(sv)
    print(f"  [contraction]  {t_contract:.3f}s")
    return sv_tn, t_search + t_contract
 def run_quimb_tn_samples(circuit, nshots=1024):
    """Mode: samples — sample from circuit output distribution."""
    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="tn")
    t0 = time.time()
    result = b.execute_circuit(circuit, nshots=nshots)
    t_total = time.time() - t0
    print(f"  [sampling]     {t_total:.3f}s  nshots={nshots}")
    try:
        freqs = result.frequencies()
        print(f"  top states: {dict(list(freqs.items())[:5])}")
    except Exception:
        pass
    return result, t_total
 def qibojit_expval(circuit, nqubits):
    """Compute <Z_0> via qibojit statevector."""
    qibo.set_backend("qibojit", platform="numba")
    t0 = time.time()
    result = circuit()
    elapsed = time.time() - t0
    sv = np.array(result.state(), dtype=complex).flatten()
    probs = np.abs(sv) ** 2
    bits = (np.arange(len(probs)) >> (nqubits - 1)) & 1
    expval = float(np.dot(probs, 1 - 2 * bits))
    return expval, elapsed
 def run_qibojit(circuit):
    """Compute full statevector via qibojit."""
    qibo.set_backend("qibojit", platform="numba")
    t0 = time.time()
    result = circuit()
    elapsed = time.time() - t0
    sv = np.array(result.state(), dtype=complex).flatten()
    return sv, elapsed
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=10)
    parser.add_argument(
        "--circuit",
        type=str,
        default="qft",
        choices=["qft", "variational", "ghz", "brickwork"],
    )
    parser.add_argument("--nlayers", type=int, default=3)
    parser.add_argument("--num-slices", type=int, default=1)
    parser.add_argument("--nshots", type=int, default=1024)
    parser.add_argument(
        "--mode",
        type=str,
        default="statevector",
        choices=["expval", "statevector", "samples"],
        help="expval: local_expectation; statevector: to_dense; samples: sampling",
    )
    parser.add_argument(
        "--no-compare",
        action="store_true",
        help="Skip qibojit reference run",
    )
    parser.add_argument(
        "--save-path",
        type=str,
        default=None,
        help="Save contraction tree to a pickle file",
    )
    parser.add_argument(
        "--load-path",
        type=str,
        default=None,
        help="Load contraction tree from a pickle file and skip path search",
    )
    parser.add_argument(
        "--profile",
        action="store_true",
        help="Enable torch profiler for statevector contraction stage",
    )
    parser.add_argument(
        "--profile-dir",
        type=str,
        default="profiles",
        help="Directory to save torch profiler traces",
    )
    parser.add_argument(
        "--save-statevector",
        action="store_true",
        help="Save TN statevector to data/sv_tn_*.npy",
    )
    args = parser.parse_args()
    print(
        f"Circuit: {args.circuit}, "
        f"nqubits={args.nqubits}, "
        f"nlayers={args.nlayers}, "
        f"mode={args.mode}, "
        f"profile={args.profile}"
    )
    np.random.seed(42)
    circuit_tn = make_circuit(args.circuit, args.nqubits, args.nlayers)
    try:
        if args.mode == "expval":
            expval_tn, t_tn = run_quimb_tn(
                circuit_tn,
                args.nqubits,
                args.num_slices,
                load_path=args.load_path,
                save_path=args.save_path,
            )
            print(f"\n[quimb TN]  time={t_tn:.4f}s  <Z_0>={expval_tn:.8f}")
        elif args.mode == "statevector":
            sv_tn, t_tn = run_quimb_tn_statevector(
                circuit_tn,
                args.nqubits,
                args.num_slices,
                load_path=args.load_path,
                save_path=args.save_path,
                profile=args.profile,
                profile_dir=args.profile_dir,
            )
            print(
                f"\n[quimb TN]  time={t_tn:.4f}s  "
                f"statevector shape={sv_tn.shape}"
            )
            if args.save_statevector:
                os.makedirs("data", exist_ok=True)
                out_path = f"data/sv_tn_{args.circuit}{args.nqubits}.npy"
                np.save(out_path, sv_tn)
                print(f"[saved statevector] {out_path}")
        else:
            _, t_tn = run_quimb_tn_samples(
                circuit_tn,
                nshots=args.nshots,
            )
            print(f"\n[quimb TN]  time={t_tn:.4f}s")
            args.no_compare = True
    except Exception as e:
        print(f"[quimb TN] FAILED: {e}")
        raise
    if not args.no_compare and args.mode != "statevector":
        np.random.seed(42)
        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
        expval_ref, t_ref = qibojit_expval(circuit_ref, args.nqubits)
        print(f"[qibojit]   time={t_ref:.4f}s  <Z_0>={expval_ref:.8f}")
        print(f"\nDiff     : {abs(expval_tn - expval_ref):.2e}")
        if t_tn > 0:
            print(f"Speedup  : {t_ref / t_tn:.2f}x")
    elif not args.no_compare and args.mode == "statevector" and sv_tn is not None:
        np.random.seed(42)
        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
        sv_ref, t_ref = run_qibojit(circuit_ref)
        fid = abs(np.dot(sv_ref.conj(), sv_tn)) ** 2
        l2_err = np.linalg.norm(sv_ref - sv_tn)
        print(f"[qibojit]   time={t_ref:.4f}s")
        print(f"Fidelity : {fid:.8f}  (1=perfect)")
        print(f"L2 error : {l2_err:.2e}")
        if t_tn > 0:
            print(f"Speedup  : {t_ref / t_tn:.2f}x")
 if __name__ == "__main__":
    main()
--- a/benchmark_mps.py
+++ b/benchmark_mps.py
@@ -0,0 +1,189 @@
 """Benchmark: qibojit (reference) vs qibotn/quimb MPS, with error comparison."""
 import time
 import argparse
 import os
 import numpy as np
 import qibo
 import quimb.tensor as qtn
 from qibo import Circuit, gates
 DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
 def make_circuit(circuit_type, nqubits, nlayers=1, add_measurements=False):
    c = Circuit(nqubits)
    if circuit_type == "qft":
        from qibo.models import QFT
        c = QFT(nqubits)
    elif circuit_type == "variational":
        for layer in range(nlayers):
            for q in range(nqubits):
                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
            offset = layer % 2
            for q in range(offset, nqubits - 1, 2):
                c.add(gates.CZ(q, q + 1))
    elif circuit_type == "ghz":
        c.add(gates.H(0))
        for q in range(nqubits - 1):
            c.add(gates.CNOT(q, q + 1))
    else:
        raise ValueError(f"Unknown circuit: {circuit_type}")
    if add_measurements:
        c.add(gates.M(*range(nqubits)))
    return c
 def run_qibojit(circuit):
    qibo.set_backend("qibojit", platform="numba")
    t0 = time.time()
    result = circuit()
    elapsed = time.time() - t0
    sv = result.state()
    return sv, elapsed
 def run_quimb_mps(circuit, max_bond, svd_cutoff, optimizer, nshots=None):
    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="mps", max_bond_dimension=max_bond, svd_cutoff=svd_cutoff)
    b.contractions_optimizer = optimizer
    t0 = time.time()
    if nshots:
        result = b.execute_circuit(circuit, nshots=nshots)
        elapsed = time.time() - t0
        return result.frequencies(), elapsed, 0.0
    else:
        # MPS simulation
        circ_quimb = qtn.CircuitMPS.from_openqasm2_str(
            circuit.to_qasm(),
            gate_opts={"max_bond": max_bond, "cutoff": svd_cutoff},
        )
        t_mps = time.time() - t0
        # to_dense separately
        t1 = time.time()
        #sv = circ_quimb.psi.to_dense().reshape(-1)
        sv = None
        t_dense = time.time() - t1
        return sv, t_mps, t_dense
 def run_quimb_permmps(circuit, max_bond, svd_cutoff, nshots=None):
    gates_list = [
        qtn.Gate(g.name, params=list(g.parameters), qubits=list(g.qubits))
        for g in circuit.queue
        if g.name.lower() != "measure"
    ]
    t0 = time.time()
    circ = qtn.CircuitPermMPS.from_gates(
        gates_list,
        N=circuit.nqubits,
        max_bond=max_bond,
        cutoff=svd_cutoff,
    )
    if nshots:
        from collections import Counter
        result = Counter(circ.sample(nshots))
        elapsed = time.time() - t0
        return dict(result), elapsed
    else:
        mps = circ.get_psi_unordered()
        sv = mps.to_dense().reshape(-1)
        elapsed = time.time() - t0
        return sv, elapsed
 def compare_statevector(sv_ref, sv_mps):
    sv_ref = np.array(sv_ref, dtype=complex).flatten()
    sv_mps = np.array(sv_mps, dtype=complex).flatten()
    fidelity = abs(np.dot(sv_ref.conj(), sv_mps)) ** 2
    l2_err = np.linalg.norm(sv_ref - sv_mps)
    return fidelity, l2_err
 def compare_frequencies(freq_ref, freq_mps, nshots):
    all_keys = set(freq_ref) | set(freq_mps)
    tvd = 0.5 * sum(abs(freq_ref.get(k, 0) - freq_mps.get(k, 0)) for k in all_keys) / nshots
    return tvd
 def jit_cache_path(circuit_type, nqubits, nlayers):
    os.makedirs(DATA_DIR, exist_ok=True)
    return os.path.join(DATA_DIR, f"jit_{circuit_type}_n{nqubits}_l{nlayers}.npy")
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=10)
    parser.add_argument("--circuit", type=str, default="ghz",
                        choices=["qft", "variational", "ghz"])
    parser.add_argument("--nlayers", type=int, default=3)
    parser.add_argument("--max-bond", type=int, default=None,
                        help="Max bond dimension for MPS (None = unlimited)")
    parser.add_argument("--svd-cutoff", type=float, default=1e-6)
    parser.add_argument("--optimizer", type=str, default="eager")
    parser.add_argument("--nshots", type=int, default=None,
                        help="Use sampling mode with given number of shots instead of statevector")
    parser.add_argument("--permmps", action="store_true",
                        help="Use CircuitPermMPS directly instead of qibotn backend")
    parser.add_argument("--skip-jit", action="store_true",
                        help="Skip qibojit run, load cached statevector if available")
    parser.add_argument("--no-compare", action="store_true",
                        help="Skip correctness comparison entirely")
    args = parser.parse_args()
    print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}")
    print(f"MPS config: max_bond={args.max_bond}, svd_cutoff={args.svd_cutoff}, optimizer={args.optimizer}")
    ref = None
    t_ref = None
    if not args.no_compare:
        cache_path = jit_cache_path(args.circuit, args.nqubits, args.nlayers)
        if args.nshots:
            # frequency mode: run qibojit with same nshots
            np.random.seed(42)
            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers, add_measurements=True)
            qibo.set_backend("qibojit", platform="numba")
            t0 = time.time()
            result_ref = circuit_ref(nshots=args.nshots)
            t_ref = time.time() - t0
            ref = dict(result_ref.frequencies())
            print(f"\n[qibojit]  time={t_ref:.4f}s")
        elif args.skip_jit and os.path.exists(cache_path):
            ref = np.load(cache_path)
            print(f"\n[qibojit]  loaded from cache: {cache_path}")
        else:
            np.random.seed(42)
            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
            ref, t_ref = run_qibojit(circuit_ref)
            np.save(cache_path, ref)
            print(f"\n[qibojit]  time={t_ref:.4f}s  (saved to {cache_path})")
    np.random.seed(42)
    circuit_mps = make_circuit(args.circuit, args.nqubits, args.nlayers)
    label = "quimb PermMPS" if args.permmps else "quimb MPS"
    try:
        if args.permmps:
            out, t_mps = run_quimb_permmps(circuit_mps, args.max_bond, args.svd_cutoff, args.nshots)
            t_dense = 0.0
        else:
            out, t_mps, t_dense = run_quimb_mps(circuit_mps, args.max_bond, args.svd_cutoff, args.optimizer, args.nshots)
        print(f"[{label}] MPS sim={t_mps:.4f}s  to_dense={t_dense:.4f}s  total={t_mps+t_dense:.4f}s")
        if not args.no_compare:
            if args.nshots:
                tvd = compare_frequencies(ref, out, args.nshots)
                print(f"\nTVD      : {tvd:.6f}  (0=perfect)")
            else:
                fidelity, l2_err = compare_statevector(ref, out)
                print(f"\nFidelity : {fidelity:.8f}  (1=perfect)")
                print(f"L2 error : {l2_err:.2e}")
            if t_ref is not None and t_mps > 0:
                print(f"Speedup  : {t_ref/t_mps:.2f}x")
    except Exception as e:
        print(f"[quimb MPS] FAILED: {e}")
        raise
 if __name__ == "__main__":
    main()
--- a/benchmark_qmatchatea.py
+++ b/benchmark_qmatchatea.py
@@ -0,0 +1,126 @@
 """Benchmark: qibojit (reference) vs qibotn/qmatchatea MPS."""
 import time
 import argparse
 import os
 import numpy as np
 import qibo
 from qibo import Circuit, gates
 from qibo.backends import construct_backend
 DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
 def make_circuit(circuit_type, nqubits, nlayers=1):
    c = Circuit(nqubits)
    if circuit_type == "qft":
        from qibo.models import QFT
        return QFT(nqubits)
    elif circuit_type == "variational":
        for layer in range(nlayers):
            for q in range(nqubits):
                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
            offset = layer % 2
            for q in range(offset, nqubits - 1, 2):
                c.add(gates.CZ(q, q + 1))
    elif circuit_type == "ghz":
        c.add(gates.H(0))
        for q in range(nqubits - 1):
            c.add(gates.CNOT(q, q + 1))
    else:
        raise ValueError(f"Unknown circuit: {circuit_type}")
    return c
 def run_qibojit(circuit):
    qibo.set_backend("qibojit", platform="numba")
    t0 = time.time()
    result = circuit()
    elapsed = time.time() - t0
    return result.state(), elapsed
 def run_qmatchatea(circuit, max_bond, cut_ratio):
    import qmatchatea, qtealeaves.observables
    from qibo.backends import construct_backend as _cb
    b = _cb(backend="qibotn", platform="qmatchatea")
    b.configure_tn_simulation(ansatz="MPS", max_bond_dimension=max_bond, cut_ratio=cut_ratio)
    qk_circuit = b._qibocirc_to_qiskitcirc(circuit)
    run_qk_params = qmatchatea.preprocessing.qk_transpilation_params(False)
    observables = qtealeaves.observables.TNObservables()
    observables += qtealeaves.observables.TNState2File(name="temp", formatting="D")
    t0 = time.time()
    results = qmatchatea.run_simulation(
        circ=qk_circuit,
        convergence_parameters=b.convergence_params,
        transpilation_parameters=run_qk_params,
        backend=b.qmatchatea_backend,
        observables=observables,
    )
    elapsed = time.time() - t0
    tn_state = results.observables.get("tn_state")
    if tn_state is None:
        results.load_state()
        tn_state = results.observables["tn_state"]
    sv_obj = tn_state.to_statevector(qiskit_order=False, max_qubit_equivalent=40)
    sv = np.array(sv_obj.elem, dtype=complex).flatten()
    return sv, elapsed
 def compare(sv_ref, sv_mps):
    sv_ref = np.array(sv_ref, dtype=complex).flatten()
    fidelity = abs(np.dot(sv_ref.conj(), sv_mps)) ** 2
    l2_err = np.linalg.norm(sv_ref - sv_mps)
    return fidelity, l2_err
 def jit_cache_path(circuit_type, nqubits, nlayers):
    os.makedirs(DATA_DIR, exist_ok=True)
    return os.path.join(DATA_DIR, f"jit_{circuit_type}_n{nqubits}_l{nlayers}.npy")
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=10)
    parser.add_argument("--circuit", type=str, default="ghz",
                        choices=["qft", "variational", "ghz"])
    parser.add_argument("--nlayers", type=int, default=3)
    parser.add_argument("--max-bond", type=int, default=64)
    parser.add_argument("--cut-ratio", type=float, default=1e-6)
    parser.add_argument("--skip-jit", action="store_true")
    args = parser.parse_args()
    print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}")
    print(f"MPS config: max_bond={args.max_bond}, cut_ratio={args.cut_ratio}")
    cache_path = jit_cache_path(args.circuit, args.nqubits, args.nlayers)
    t_ref = None
    if args.skip_jit and os.path.exists(cache_path):
        sv_ref = np.load(cache_path)
        print(f"\n[qibojit]  loaded from cache: {cache_path}")
    else:
        np.random.seed(42)
        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
        sv_ref, t_ref = run_qibojit(circuit_ref)
        np.save(cache_path, sv_ref)
        print(f"\n[qibojit]  time={t_ref:.4f}s  (saved to {cache_path})")
    np.random.seed(42)
    circuit_mps = make_circuit(args.circuit, args.nqubits, args.nlayers)
    try:
        sv_mps, t_mps = run_qmatchatea(circuit_mps, args.max_bond, args.cut_ratio)
        fidelity, l2_err = compare(sv_ref, sv_mps)
        print(f"[qmatchatea] time={t_mps:.4f}s")
        print(f"\nFidelity : {fidelity:.8f}  (1=perfect)")
        print(f"L2 error : {l2_err:.2e}")
        if t_ref is not None and t_mps > 0:
            print(f"Speedup  : {t_ref/t_mps:.2f}x")
    except Exception as e:
        print(f"[qmatchatea] FAILED: {e}")
        raise
 if __name__ == "__main__":
    main()
--- a/benchmark_tn.py
+++ b/benchmark_tn.py
@@ -0,0 +1,386 @@
 """Benchmark: qibotn/quimb generic TN — expectation values."""
 import multiprocessing
 multiprocessing.set_start_method("spawn", force=True)
 import pickle
 import time
 import threading
 import argparse
 import numpy as np
 import cotengra as ctg
 import qibo
 from qibo import Circuit, gates
 class TimedTrialFn:
    def __init__(self, trial_fn, timeout=15):
        self.trial_fn = trial_fn
        self.timeout = timeout
    def __call__(self, *args, **kwargs):
        result = [None]
        exc = [None]
        def _run():
            try:
                result[0] = self.trial_fn(*args, **kwargs)
            except Exception as e:
                exc[0] = e
        t = threading.Thread(target=_run, daemon=True)
        t.start()
        t.join(self.timeout)
        if t.is_alive():
            raise TimeoutError(f"trial exceeded {self.timeout}s")
        if exc[0] is not None:
            raise exc[0]
        return result[0]
 def make_circuit(circuit_type, nqubits, nlayers=1):
    c = Circuit(nqubits)
    if circuit_type == "qft":
        from qibo.models import QFT
        return QFT(nqubits)
    elif circuit_type == "variational":
        for layer in range(nlayers):
            for q in range(nqubits):
                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
            offset = layer % 2
            for q in range(offset, nqubits - 1, 2):
                c.add(gates.CZ(q, q + 1))
    elif circuit_type == "ghz":
        c.add(gates.H(0))
        for q in range(nqubits - 1):
            c.add(gates.CNOT(q, q + 1))
    elif circuit_type == "brickwork":
        for q in range(nqubits):
            c.add(gates.H(q))
        for layer in range(nlayers):
            offset = layer % 2
            for q in range(offset, nqubits - 1, 2):
                c.add(gates.CNOT(q, q + 1))
                c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi)))
                c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi)))
    else:
        raise ValueError(f"Unknown circuit: {circuit_type}")
    return c
 def make_z_observable(nqubits):
    """Z on qubit 0 only — single contraction for benchmarking"""
    return ["z"], [(0,)], [1.0]
 def run_quimb_tn(circuit, nqubits, num_slices, load_path=None, save_path=None):
    """Mode: expval — compute <Z_0> via local_expectation (lightcone pruning)."""
    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="tn")
    operators, sites, coeffs = make_z_observable(nqubits)
    ops = b._string_to_quimb_operator(operators[0])
    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
    if load_path:
        with open(load_path, "rb") as f:
            saved = pickle.load(f)
        tree = saved["tree"]
        t_search = 0.0
        print(f"  [path loaded]  {load_path}")
    else:
        opt = ctg.HyperOptimizer(
            methods=['kahypar', 'random-greedy', 'spinglass'],
            max_repeats=16,
            parallel=True,
            max_time=60,
            slicing_opts={'target_slices': num_slices},
            progbar=True,
        )
        t0 = time.time()
        rehearsal = qc.local_expectation(ops, where=sites[0], optimize=opt,
                                         simplify_sequence="R", rehearse=True)
        t_search = time.time() - t0
        tree = rehearsal['tree']
        print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}")
        if save_path:
            with open(save_path, "wb") as f:
                pickle.dump({"tree": tree}, f)
            print(f"  [path saved]  {save_path}")
    t0 = time.time()
    expval = qc.local_expectation(ops, where=sites[0], optimize=tree, simplify_sequence="R")
    t_contract = time.time() - t0
    print(f"  [contraction]  {t_contract:.3f}s")
    return float(expval.real), t_search + t_contract
 def run_quimb_tn_statevector(circuit, nqubits, num_slices, load_path=None, save_path=None):
    """Mode: statevector — contract full TN to dense vector."""
    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="tn")
    import torch
    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
    qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex64)
    if load_path:
        with open(load_path, "rb") as f:
            saved = pickle.load(f)
        tree = saved["tree"]
        t_search = 0.0
        print(f"  [path loaded]  {load_path}")
    else:
        opt = ctg.HyperOptimizer(
            #methods=['kahypar', 'random-greedy', 'spinglass'],
            max_repeats=1024,
            #parallel="concurrent.futures",
            parallel=64,
            max_time=60,
            minimize='size',
            slicing_opts={'target_slices': num_slices},
            #slicing_opts={'target_size': 2**30},
            progbar=True,
            on_trial_error='ignore'
        )
        t0 = time.time()
        rehearsal = qc.to_dense(optimize=opt, rehearse=True)
        t_search = time.time() - t0
        tree = rehearsal['tree']
        #print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}")
        if save_path:
            with open(save_path, "wb") as f:
                pickle.dump({"tree": tree}, f)
            print(f"  [path saved]  {save_path}")
        print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}")
        return None, t_search
    t0 = time.time()
    sv = qc.to_dense(optimize=tree).reshape(-1)
    t_contract = time.time() - t0
    print(f"  [contraction]  {t_contract:.3f}s")
    sv_tn = np.array(sv)
    return sv_tn, t_search + t_contract
 def _contract_mpi(tree, arrays, comm, root=0):
    """Contract slices via MPI, returning result as the same array type as input.
    Unlike ``cotengra.ContractionTree.contract_mpi``, this works with any
    array backend (numpy, torch, etc.) — it only converts to numpy at the
    MPI-reduce boundary and converts back.
    """
    size = comm.Get_size()
    rank = comm.Get_rank()
    result_np = None
    is_torch = type(arrays[0]).__module__.startswith("torch")
    for i in range(rank, tree.multiplicity, size):
        x = tree.contract_slice(arrays, i)
        x_np = np.asfortranarray(x.detach().cpu().numpy() if is_torch else np.asarray(x))
        if result_np is None:
            result_np = x_np
        else:
            result_np += x_np
    if result_np is None:
        result_np = np.zeros(1, dtype=np.complex64)
    if rank == root:
        result = np.zeros_like(result_np)
    else:
        result = None
    comm.Reduce(result_np, result, root=root)
    if rank == root:
        import torch
        return torch.from_numpy(np.asarray(result)) if is_torch else result
    return None
 def run_quimb_tn_statevector_mpi(circuit, nqubits, num_slices, load_path=None, save_path=None):
    """MPI-parallel statevector via custom MPI contraction (supports torch backend)."""
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="tn")
    import torch
    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
    qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex64)
    if load_path:
        if rank == 0:
            with open(load_path, "rb") as f:
                saved = pickle.load(f)
            tree, psi, t_search = saved["tree"], saved["psi"], 0.0
            print(f"  [path loaded]  {load_path}")
        else:
            tree = psi = None
            t_search = 0.0
    else:
        # each rank runs serial search over its share of trials
        total_repeats = 1024
        rank_repeats = max(1, total_repeats // size)
        opt = ctg.HyperOptimizer(
            methods=['kahypar', 'random-greedy', 'spinglass'],
            max_repeats=rank_repeats,
            parallel=False,
            max_time=100,
            minimize='size',
            slicing_opts={'target_slices': max(num_slices, size), 'allow_outer': False},
            progbar=(rank == 0),
        )
        t0 = time.time()
        rehearsal = qc.to_dense(optimize=opt, rehearse=True)
        t_search = time.time() - t0
        local_tree = rehearsal['tree']
        local_psi = rehearsal['tn']
        local_size = local_tree.contraction_width()
        # gather all trees to rank 0, pick best by contraction_width
        all_results = comm.gather((local_size, local_tree, local_psi), root=0)
        if rank == 0:
            _, tree, psi = min(all_results, key=lambda x: x[0])
            print(f"  [path search]  {t_search:.3f}s  flops~2^{tree.contraction_cost():.2f}  size~2^{tree.contraction_width():.2f}  slices={tree.multiplicity}")
            if save_path:
                with open(save_path, "wb") as f:
                    pickle.dump({"tree": tree, "psi": psi}, f)
                print(f"  [path saved]   {save_path}")
        else:
            tree = psi = None
    tree = comm.bcast(tree, root=0)
    psi = comm.bcast(psi, root=0)
    t_search = comm.bcast(t_search, root=0)
    arrays = psi.arrays
    t0 = time.time()
    sv = _contract_mpi(tree, arrays, comm, root=0)
    t_contract = time.time() - t0
    if rank == 0:
        print(f"  [contraction]  {t_contract:.3f}s")
        return np.array(sv).reshape(-1), t_search + t_contract
    return None, t_search + t_contract
 def run_quimb_tn_samples(circuit, nshots=1024):
    """Mode: samples — sample from circuit output distribution."""
    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="tn")
    t0 = time.time()
    result = b.execute_circuit(circuit, nshots=nshots)
    t_total = time.time() - t0
    print(f"  [sampling]     {t_total:.3f}s  nshots={nshots}")
    print(f"  top states: {dict(list(result.frequencies().items())[:5])}")
    return result, t_total
 def qibojit_expval(circuit, nqubits):
    """Compute <Z_0> via qibojit statevector."""
    qibo.set_backend("qibojit", platform="numba")
    t0 = time.time()
    result = circuit()
    elapsed = time.time() - t0
    sv = np.array(result.state(), dtype=complex).flatten()
    probs = np.abs(sv) ** 2
    bits = (np.arange(len(probs)) >> (nqubits - 1)) & 1
    expval = float(np.dot(probs, 1 - 2 * bits))
    return expval, elapsed
 def run_qibojit(circuit):
    """Compute full statevector via qibojit."""
    qibo.set_backend("qibojit", platform="numba")
    t0 = time.time()
    result = circuit()
    elapsed = time.time() - t0
    sv = np.array(result.state(), dtype=complex).flatten()
    return sv, elapsed
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=10)
    parser.add_argument("--circuit", type=str, default="qft",
                        choices=["qft", "variational", "ghz", "brickwork"])
    parser.add_argument("--nlayers", type=int, default=3)
    parser.add_argument("--num-slices", type=int, default=1)
    parser.add_argument("--nshots", type=int, default=1024)
    parser.add_argument("--mode", type=str, default="statevector",
                        choices=["expval", "statevector", "samples"],
                        help="expval: local_expectation; statevector: to_dense; samples: sampling")
    parser.add_argument("--mpi", action="store_true",
                        help="Use MPI-parallel contraction (run with mpirun -n N)")
    parser.add_argument("--no-compare", action="store_true",
                        help="Skip qibojit reference run")
    parser.add_argument("--save-path", type=str, default=None,
                        help="Save contraction tree to a pickle file")
    parser.add_argument("--load-path", type=str, default=None,
                        help="Load contraction tree from a pickle file (skip path search)")
    args = parser.parse_args()
    print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}, mode={args.mode}")
    np.random.seed(42)
    circuit_tn = make_circuit(args.circuit, args.nqubits, args.nlayers)
    try:
        if args.mode == "expval":
            expval_tn, t_tn = run_quimb_tn(circuit_tn, args.nqubits, args.num_slices,
                                           load_path=args.load_path, save_path=args.save_path)
            print(f"\n[quimb TN]  time={t_tn:.4f}s  <Z_0>={expval_tn:.8f}")
        elif args.mode == "statevector":
            if args.mpi:
                sv_tn, t_tn = run_quimb_tn_statevector_mpi(circuit_tn, args.nqubits, args.num_slices,
                                                           load_path=args.load_path, save_path=args.save_path)
            else:
                sv_tn, t_tn = run_quimb_tn_statevector(circuit_tn, args.nqubits, args.num_slices,
                                                       load_path=args.load_path, save_path=args.save_path)
            if sv_tn is not None:
                print(f"\n[quimb TN]  time={t_tn:.4f}s  statevector shape={sv_tn.shape}")
                np.save(f"data/sv_tn_{args.circuit}{args.nqubits}.npy", sv_tn)
        else:
            _, t_tn = run_quimb_tn_samples(circuit_tn, args.nqubits, args.nshots)
            print(f"\n[quimb TN]  time={t_tn:.4f}s")
            args.no_compare = True  # samples 模式无法和 qibojit 期望值对比
    except Exception as e:
        print(f"[quimb TN] FAILED: {e}")
        raise
    if not args.no_compare and args.mode != "statevector":
        np.random.seed(42)
        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
        expval_ref, t_ref = qibojit_expval(circuit_ref, args.nqubits)
        print(f"[qibojit]   time={t_ref:.4f}s  <Z_0>={expval_ref:.8f}")
        print(f"\nDiff     : {abs(expval_tn - expval_ref):.2e}")
        if t_tn > 0:
            print(f"Speedup  : {t_ref/t_tn:.2f}x")
    elif not args.no_compare and args.mode == "statevector" and sv_tn is not None:
        np.random.seed(42)
        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
        sv_ref, t_ref = run_qibojit(circuit_ref)
        fid = abs(np.dot(sv_ref.conj(), sv_tn)) ** 2
        l2_err = np.linalg.norm(sv_ref - sv_tn)
        print(f"[qibojit]   time={t_ref:.4f}s")
        print(f"Fidelity : {fid:.8f}  (1=perfect)")
        print(f"L2 error : {l2_err:.2e}")
        if t_tn > 0:
            print(f"Speedup  : {t_ref/t_tn:.2f}x")
 if __name__ == "__main__":
    main()
--- a/benchmark_tn_mpi.py
+++ b/benchmark_tn_mpi.py
@@ -0,0 +1,246 @@
 """MPI-parallel TN benchmark: path search + contraction via MPI."""
 import pickle
 import time
 import argparse
 import numpy as np
 import cotengra as ctg
 import qibo
 from qibo import Circuit, gates
 from mpi4py import MPI
 from concurrent.futures import ProcessPoolExecutor, as_completed
 def _run_serial_search(tn_bytes, output_inds, repeats, seed, num_slices, n_ranks):
    """Run one serial HyperOptimizer in a subprocess, return (width, tree)."""
    import pickle, cotengra as ctg, random
    random.seed(seed)
    tn = pickle.loads(tn_bytes)
    opt = ctg.HyperOptimizer(
        methods=['kahypar', 'kahypar-agglom', 'spinglass'],
        max_repeats=repeats,
        parallel=False,
        minimize='flops',
        max_time=600,
        optlib="random",
        slicing_opts={'target_size': 2**30, 'allow_outer': False},
        progbar=False,
    )
    tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
    return tree.contraction_width(), tree
 def parallel_search(tn, output_inds, total_repeats, n_workers, num_slices, n_ranks,
                    timeout=None):
    """Launch n_workers subprocesses each running serial search, return best tree."""
    import pickle, os, signal
    from concurrent.futures import ProcessPoolExecutor, as_completed
    tn_bytes = pickle.dumps(tn)
    repeats_per = max(1, total_repeats // n_workers)
    best_width, best_tree = float('inf'), None
    with ProcessPoolExecutor(max_workers=n_workers) as pool:
        futures = {
            pool.submit(_run_serial_search, tn_bytes, output_inds,
                        repeats_per, seed, num_slices, n_ranks): seed
            for seed in range(n_workers)
        }
        pids = {f: p.pid for f, p in zip(futures, pool._processes.values())}
        try:
            for fut in as_completed(futures, timeout=timeout):
                try:
                    width, tree = fut.result()
                    if width < best_width:
                        best_width, best_tree = width, tree
                except Exception as e:
                    print(f"  [worker failed] {e}")
        except TimeoutError:
            pass
        for fut, pid in pids.items():
            if not fut.done():
                try:
                    os.kill(pid, signal.SIGKILL)
                except ProcessLookupError:
                    pass
    return best_tree
 def make_circuit(circuit_type, nqubits, nlayers=1):
    c = Circuit(nqubits)
    if circuit_type == "qft":
        from qibo.models import QFT
        return QFT(nqubits)
    elif circuit_type == "variational":
        for layer in range(nlayers):
            for q in range(nqubits):
                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
            offset = layer % 2
            for q in range(offset, nqubits - 1, 2):
                c.add(gates.CZ(q, q + 1))
    elif circuit_type == "ghz":
        c.add(gates.H(0))
        for q in range(nqubits - 1):
            c.add(gates.CNOT(q, q + 1))
    elif circuit_type == "brickwork":
        for q in range(nqubits):
            c.add(gates.H(q))
        for layer in range(nlayers):
            offset = layer % 2
            for q in range(offset, nqubits - 1, 2):
                c.add(gates.CNOT(q, q + 1))
                c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi)))
                c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi)))
    else:
        raise ValueError(f"Unknown circuit: {circuit_type}")
    return c
 def _contract_mpi(tree, arrays, comm, root=0):
    rank = comm.Get_rank()
    size = comm.Get_size()
    is_torch = type(arrays[0]).__module__.startswith("torch")
    result_np = None
    for i in range(rank, tree.multiplicity, size):
        x = tree.contract_slice(arrays, i)
        x_np = np.asfortranarray(x.detach().cpu().numpy() if is_torch else np.asarray(x))
        result_np = x_np if result_np is None else result_np + x_np
    if result_np is None:
        result_np = np.zeros(1, dtype=np.complex64)
    result = np.zeros_like(result_np) if rank == root else None
    comm.Reduce(result_np, result, root=root)
    if rank == root:
        import torch
        return torch.from_numpy(np.asarray(result)) if is_torch else result
    return None
 def run_mpi(circuit, nqubits, num_slices, total_repeats=1024,
            load_path=None, save_path=None):
    """Each MPI rank runs serial path search over total_repeats/size trials,
    rank 0 picks the global best, then all ranks contract in parallel."""
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="tn")
    import torch
    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
    qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex64)
    # --- path search: each rank serial, gather best to rank 0 ---
    if load_path:
        if rank == 0:
            with open(load_path, "rb") as f:
                saved = pickle.load(f)
            tree, psi, t_search = saved["tree"], saved["psi"], 0.0
            print(f"  [path loaded]  {load_path}")
        else:
            tree = psi = None
            t_search = 0.0
    else:
        rank_repeats = max(1, total_repeats // size)
        t0 = time.time()
        # get TN object first (no contraction), then run parallel search
        psi_tn = qc.to_dense(rehearse="tn")
        local_tree = parallel_search(
            psi_tn, psi_tn.outer_inds(), rank_repeats, n_workers=48,
            num_slices=num_slices, n_ranks=size, timeout=630,
        )
        t_search = time.time() - t0
        local_psi = psi_tn
        all_results = comm.gather((local_tree.contraction_width(), local_tree, local_psi), root=0)
        if rank == 0:
            _, tree, psi = min(all_results, key=lambda x: x[0])
            print(f"  [path search]  {t_search:.3f}s  "
                  f"flops~2^{tree.contraction_cost():.2f}  "
                  f"size~2^{tree.contraction_width():.2f}  "
                  f"slices={tree.multiplicity}")
            if save_path:
                with open(save_path, "wb") as f:
                    pickle.dump({"tree": tree, "psi": psi}, f)
                print(f"  [path saved]   {save_path}")
        else:
            tree = psi = None
        if save_path:
            t_search = comm.bcast(t_search, root=0)
            return None, t_search
    tree = comm.bcast(tree, root=0)
    psi = comm.bcast(psi, root=0)
    t_search = comm.bcast(t_search, root=0)
    # --- contraction: all ranks work in parallel ---
    import torch
    torch.set_num_threads(max(1, 48 // size))
    arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex64) for a in psi.arrays]
    t0 = time.time()
    sv = _contract_mpi(tree, arrays, comm, root=0)
    t_contract = time.time() - t0
    if rank == 0:
        print(f"  [contraction]  {t_contract:.3f}s")
        return np.array(sv).reshape(-1), t_search + t_contract
    return None, t_search + t_contract
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=30)
    parser.add_argument("--circuit", type=str, default="qft",
                        choices=["qft", "variational", "ghz", "brickwork"])
    parser.add_argument("--nlayers", type=int, default=3)
    parser.add_argument("--num-slices", type=int, default=1)
    parser.add_argument("--total-repeats", type=int, default=1024)
    parser.add_argument("--save-path", type=str, default=None)
    parser.add_argument("--load-path", type=str, default=None)
    parser.add_argument("--no-compare", action="store_true")
    args = parser.parse_args()
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    if rank == 0:
        print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, "
              f"nlayers={args.nlayers}, ranks={comm.Get_size()}")
    np.random.seed(42)
    circuit = make_circuit(args.circuit, args.nqubits, args.nlayers)
    try:
        sv, t_total = run_mpi(circuit, args.nqubits, args.num_slices,
                              total_repeats=args.total_repeats,
                              load_path=args.load_path, save_path=args.save_path)
    except Exception as e:
        if rank == 0:
            print(f"[FAILED] {e}")
        raise
    if rank == 0 and sv is not None:
        print(f"\n[quimb TN MPI]  time={t_total:.4f}s  shape={sv.shape}")
        np.save(f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy", sv)
        if not args.no_compare:
            from benchmark_tn import run_qibojit
            np.random.seed(42)
            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
            sv_ref, t_ref = run_qibojit(circuit_ref)
            fid = abs(np.dot(sv_ref.conj(), sv)) ** 2
            print(f"[qibojit]       time={t_ref:.4f}s")
            print(f"Fidelity : {fid:.8f}")
            print(f"L2 error : {np.linalg.norm(sv_ref - sv):.2e}")
            if t_total > 0:
                print(f"Speedup  : {t_ref/t_total:.2f}x")
 if __name__ == "__main__":
    main()
--- a/compare_jit_tn_quimb.py
+++ b/compare_jit_tn_quimb.py
@@ -0,0 +1,51 @@
 import numpy as np
 import os
 import sys
 def check_results(ref_path, tn_path):
    # 1. 检查文件是否存在
    if not os.path.exists(ref_path) or not os.path.exists(tn_path):
        print(f"Error: 找不到文件！\n参考文件: {ref_path}\n待测文件: {tn_path}")
        return
    print(f"正在加载数据并对比: \n  [Ref] {ref_path}\n  [TN ] {tn_path}\n")
    try:
        # 2. 加载状态向量
        # mmap_mode='r' 可以防止大文件直接撑爆内存
        sv_ref = np.load(ref_path, mmap_mode='r')
        sv_tn = np.load(tn_path, mmap_mode='r')
        # 3. 计算保真度 (Fidelity)
        # fid = |<ref|tn>|^2
        inner_product = np.dot(sv_ref.conj(), sv_tn)
        fidelity = np.abs(inner_product)**2
        # 4. 计算 L2 误差 (欧氏距离)
        l2_error = np.linalg.norm(sv_ref - sv_tn)
        # 5. 打印结果
        print("-" * 30)
        print(f"保真度 (Fidelity): {fidelity:.12f}")
        #print(f"L2 范数误差:       {l2_error:.2e}")
        print("-" * 30)
        # phase-invariant L2: align global phase first
        phase = inner_product / np.abs(inner_product)
        l2_phase_corrected = np.linalg.norm(sv_ref - sv_tn / phase)
        print(f"L2 误差(相位校正后): {l2_phase_corrected:.2e}")
        if fidelity > 0.999999:
            print("✅ 验证通过：结果高度一致。")
        else:
            print("❌ 警告：保真度较低，请检查收缩路径或截断误差。")
    except Exception as e:
        print(f"计算过程中发生错误: {e}")
 if __name__ == "__main__":
    # 你可以在这里直接修改文件名
    REF_FILE = 'data/sv_qibojit_qft30.npy'
    TN_FILE  = 'data/sv_tn_qft30_mpi.npy'
    check_results(REF_FILE, TN_FILE)
--- a/2
+++ b/2
@@ -0,0 +1,2 @@
 10.20.6.74:1                                                                                         
 10.20.6.102:1
--- a/inspect_path.py
+++ b/inspect_path.py
@@ -0,0 +1,21 @@
 import pickle
 import sys
 path = sys.argv[1] if len(sys.argv) > 1 else 'path/path.pkl.34.mpi'
 with open(path, 'rb') as f:
    d = pickle.load(f)
 tree = d['tree']
 width = tree.contraction_width()
 slices = tree.multiplicity
 sliced_width = width - (slices.bit_length() - 1)
 print(f"Path: {path}")
 print(f"Width (unsliced):     {width:.1f}")
 print(f"Slices:               {slices}")
 print(f"Sliced width:         {sliced_width:.1f}")
 print(f"Peak memory (total):  {2**width * 16 / 1e9:.1f} GB")
 print(f"Peak per slice:       {2**sliced_width * 16 / 1e9:.2f} GB")
 print(f"Sliced indices:       {len(tree.sliced_inds)}")
 print(f"Cost (log2 flops):    {tree.contraction_cost(log=True):.2f}")
--- a/poetry.lock
+++ b/poetry.lock
@@ -1733,14 +1733,14 @@ files = [
 [[package]]
 name = "mako"
-version = "1.3.10"
+version = "1.3.11"
 description = "A super-fast templating language that borrows the best ideas from the existing templating languages."
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59"},
+    {file = "mako-1.3.11-py3-none-any.whl", hash = "sha256:e372c6e333cf004aa736a15f425087ec977e1fcbd2966aae7f17c8dc1da27a77"},
-    {file = "mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28"},
+    {file = "mako-1.3.11.tar.gz", hash = "sha256:071eb4ab4c5010443152255d77db7faa6ce5916f35226eb02dc34479b6858069"},
 ]
 [package.dependencies]
--- a/run_qibojit_ref.py
+++ b/run_qibojit_ref.py
@@ -0,0 +1,18 @@
 """Run qibojit on 30-qubit QFT and save statevector for comparison."""
 import time
 import numpy as np
 import qibo
 from qibo.models import QFT
 #np.random.seed(42)
 circuit = QFT(32)
 qibo.set_backend("qibojit", platform="numba")
 t0 = time.time()
 result = circuit()
 elapsed = time.time() - t0
 sv = np.array(result.state(), dtype=complex).flatten()
 np.save("data/sv_qibojit_qft30.npy", sv)
 print(f"[qibojit] time={elapsed:.4f}s  shape={sv.shape}")
 print(f"Saved to sv_qibojit_qft30.npy")
--- a/src/qibotn/backends/quimb.py
+++ b/src/qibotn/backends/quimb.py
@@ -167,7 +167,7 @@ def execute_circuit(
        raise_error(ValueError, "Initial state not None supported only for MPS ansatz.")
    circ_quimb = self.circuit_ansatz.from_openqasm2_str(
-        circuit.to_qasm(), psi0=initial_state
+        circuit.to_qasm(), psi0=initial_state, gate_opts={"max_bond": self.max_bond_dimension, "cutoff": self.svd_cutoff}
    )
    if nshots:
@@ -186,7 +186,16 @@ def execute_circuit(
    else:
        frequencies = None
        measured_probabilities = None
-
+    '''
    if return_array:
        if self.ansatz == "mps":
            psi = circ_quimb.psi
            statevector = psi.to_dense().reshape(-1)
        else:
            statevector = circ_quimb.to_dense(backend=self.backend, optimize=self.contractions_optimizer)
    else:
        statevector = None
    '''
    statevector = (
        circ_quimb.to_dense(backend=self.backend, optimize=self.contractions_optimizer)
        if return_array
@@ -291,6 +300,15 @@ def _qibo_circuit_to_quimb(
        quimb_gate_name = GATE_MAP.get(gate_name, None)
        if quimb_gate_name == "measure":
            continue
        if gate_name == "cu1":
            theta = gate.parameters[0]
            c, t = gate.qubits
            circ.apply_gate("RZ", theta / 2, c)
            circ.apply_gate("RZ", theta / 2, t)
            circ.apply_gate("CNOT", c, t)
            circ.apply_gate("RZ", -theta / 2, t)
            circ.apply_gate("CNOT", c, t)
            continue
        if quimb_gate_name is None:
            raise_error(ValueError, f"Gate {gate_name} not supported in Quimb backend.")
--- a/src/qibotn/result.py
+++ b/src/qibotn/result.py
@@ -57,10 +57,10 @@ class TensorNetworkResult:
        return self.measures
    def state(self):
-        """Return the statevector if the number of qubits is less than 20."""
+        """Return the statevector if the number of qubits is less than 35."""
-        if self.nqubits < 20:
+        if self.nqubits < 35:
            return self.statevector
        raise_error(
            NotImplementedError,
-            f"Tensor network simulation cannot be used to reconstruct statevector for >= 20 .",
+            f"Tensor network simulation cannot be used to reconstruct statevector for >= 35 .",
        )
--- a/sweep_bond_32q.py
+++ b/sweep_bond_32q.py
@@ -0,0 +1,39 @@
 """Bond dimension sweep for 32-qubit variational circuit."""
 import os
 import sys
 import numpy as np
 sys.path.insert(0, os.path.dirname(__file__))
 from benchmark_mps import make_circuit, run_qibojit, run_quimb_mps, compare, jit_cache_path, DATA_DIR
 NQUBITS = 32
 NLAYERS = 5
 BOND_VALUES = [1, 8, 16, 32, 64, 128, 256]
 SVD_CUTOFF = 1e-6
 OPTIMIZER = "auto-hq"
 if __name__ == "__main__":
    cache_path = jit_cache_path("variational", NQUBITS, NLAYERS)
    if os.path.exists(cache_path):
        sv_ref = np.load(cache_path)
        print(f"[qibojit] loaded from cache: {cache_path}\n")
    else:
        np.random.seed(42)
        circuit_ref = make_circuit("variational", NQUBITS, NLAYERS)
        sv_ref, t_ref = run_qibojit(circuit_ref)
        np.save(cache_path, sv_ref)
        print(f"[qibojit] time={t_ref:.4f}s  (saved to {cache_path})\n")
    print(f"{'bond':>6}  {'time(s)':>10}  {'fidelity':>12}  {'l2_err':>10}")
    print("-" * 46)
    for bond in BOND_VALUES:
        np.random.seed(42)
        circuit_mps = make_circuit("variational", NQUBITS, NLAYERS)
        try:
            sv_mps, t_mps = run_quimb_mps(circuit_mps, bond, SVD_CUTOFF, OPTIMIZER)
            fidelity, l2_err = compare(sv_ref, sv_mps)
            print(f"{bond:>6}  {t_mps:>10.4f}  {fidelity:>12.8f}  {l2_err:>10.2e}")
        except Exception as e:
            print(f"{bond:>6}  FAILED: {e}")
Author	SHA1	Message	Date
jaunatisblue	5a692033a6	添加MPI并行TN benchmark及辅助脚本，移除旧benchmark Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-05 19:04:09 +08:00
jaunatisblue	a3f39a1d67	删除tn脚本implementation	2026-05-03 19:06:21 +08:00
jaunatisblue	dd222587b7	tn脚本更新 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-03 18:54:05 +08:00
jaunatisblue	740828872e	添加tn脚本 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-04-28 23:07:39 +08:00
jaunatisblue	80d9c1de5a	benchmark测试，发现瓶颈：路径搜索 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-04-27 18:59:54 +08:00
jaunatisblue	2c54840e7b	1.完成mps态脚本，与原始qibojit结果比对确定bond demension和cut off值；2.更新了官方库；3.新大陆 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-04-27 11:03:57 +08:00