benchmark测试，发现瓶颈：路径搜索

1.完成mps态脚本，与原始qibojit结果比对确定bond dimension和cut off值；2.更新了官方库；3.新大陆
2026-04-27 18:59:54 +08:00 · 2026-04-27 11:03:57 +08:00
18 changed files with 208 additions and 576 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
 __pycache__/
 *.py[cod]
 *$py.class
-
+data/
 # C extensions
 *.so

--- a/benchmark_mps.py
+++ b/benchmark_mps.py
@@ -0,0 +1,114 @@
+"""Benchmark: qibojit (reference) vs qibotn/quimb MPS, with error comparison."""
+import time
+import argparse
+import os
+import numpy as np
+import qibo
+from qibo import Circuit, gates
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+
+
+def make_circuit(circuit_type, nqubits, nlayers=1):
+    c = Circuit(nqubits)
+    if circuit_type == "qft":
+        from qibo.models import QFT
+        return QFT(nqubits)
+    elif circuit_type == "variational":
+        for layer in range(nlayers):
+            for q in range(nqubits):
+                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
+            offset = layer % 2
+            for q in range(offset, nqubits - 1, 2):
+                c.add(gates.CZ(q, q + 1))
+    elif circuit_type == "ghz":
+        c.add(gates.H(0))
+        for q in range(nqubits - 1):
+            c.add(gates.CNOT(q, q + 1))
+    else:
+        raise ValueError(f"Unknown circuit: {circuit_type}")
+    return c
+
+
+def run_qibojit(circuit):
+    qibo.set_backend("qibojit", platform="numba")
+    t0 = time.time()
+    result = circuit()
+    elapsed = time.time() - t0
+    sv = result.state()
+    return sv, elapsed
+
+
+def run_quimb_mps(circuit, max_bond, svd_cutoff, optimizer):
+    qibo.set_backend("qibotn", platform="quimb")
+    b = qibo.get_backend()
+    b.configure_tn_simulation(ansatz="mps", max_bond_dimension=max_bond, svd_cutoff=svd_cutoff)
+    b.contractions_optimizer = optimizer
+
+    t0 = time.time()
+    result = b.execute_circuit(circuit, return_array=True)
+    elapsed = time.time() - t0
+    sv = result.state()
+    return sv, elapsed
+
+
+def compare(sv_ref, sv_mps):
+    sv_ref = np.array(sv_ref, dtype=complex).flatten()
+    sv_mps = np.array(sv_mps, dtype=complex).flatten()
+    fidelity = abs(np.dot(sv_ref.conj(), sv_mps)) ** 2
+    l2_err = np.linalg.norm(sv_ref - sv_mps)
+    return fidelity, l2_err
+
+
+def jit_cache_path(circuit_type, nqubits, nlayers):
+    os.makedirs(DATA_DIR, exist_ok=True)
+    return os.path.join(DATA_DIR, f"jit_{circuit_type}_n{nqubits}_l{nlayers}.npy")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=10)
+    parser.add_argument("--circuit", type=str, default="ghz",
+                        choices=["qft", "variational", "ghz"])
+    parser.add_argument("--nlayers", type=int, default=3)
+    parser.add_argument("--max-bond", type=int, default=None,
+                        help="Max bond dimension for MPS (None = unlimited)")
+    parser.add_argument("--svd-cutoff", type=float, default=1e-6)
+    parser.add_argument("--optimizer", type=str, default="auto-hq")
+    parser.add_argument("--skip-jit", action="store_true",
+                        help="Skip qibojit run, load cached statevector if available")
+    args = parser.parse_args()
+
+    print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, nlayers={args.nlayers}")
+    print(f"MPS config: max_bond={args.max_bond}, svd_cutoff={args.svd_cutoff}, optimizer={args.optimizer}")
+
+    cache_path = jit_cache_path(args.circuit, args.nqubits, args.nlayers)
+    t_ref = None
+
+    if args.skip_jit and os.path.exists(cache_path):
+        sv_ref = np.load(cache_path)
+        print(f"\n[qibojit]  loaded from cache: {cache_path}")
+    else:
+        np.random.seed(42)
+        circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
+        sv_ref, t_ref = run_qibojit(circuit_ref)
+        np.save(cache_path, sv_ref)
+        print(f"\n[qibojit]  time={t_ref:.4f}s  (saved to {cache_path})")
+
+    np.random.seed(42)
+    circuit_mps = make_circuit(args.circuit, args.nqubits, args.nlayers)
+    try:
+        sv_mps, t_mps = run_quimb_mps(circuit_mps, args.max_bond, args.svd_cutoff, args.optimizer)
+        fidelity, l2_err = compare(sv_ref, sv_mps)
+        print(f"[quimb MPS] time={t_mps:.4f}s")
+        print(f"\nFidelity : {fidelity:.8f}  (1=perfect)")
+        print(f"L2 error : {l2_err:.2e}")
+        if t_ref is not None and t_mps > 0:
+            print(f"Speedup  : {t_ref/t_mps:.2f}x")
+    except Exception as e:
+        print(f"[quimb MPS] FAILED: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    main()
--- a/11
+++ b/11
@@ -0,0 +1,11 @@
+[qibojit] loaded from cache: /home/yx/qibotn/data/jit_variational_n32_l5.npy
+
+  bond     time(s)      fidelity      l2_err
+----------------------------------------------
+     1    157.4587    0.00000280    9.99e-01
+     8     61.9126    0.99999014    2.22e-03
+    16     63.4902    0.99999014    2.22e-03
+    32     58.3594    0.99999014    2.22e-03
+    64     59.7043    0.99999014    2.22e-03
+   128     64.6368    0.99999014    2.22e-03
+   256     64.9058    0.99999014    2.22e-03
--- a/poetry.lock
+++ b/poetry.lock
@@ -1733,14 +1733,14 @@ files = [

 [[package]]
 name = "mako"
-version = "1.3.10"
+version = "1.3.11"
 description = "A super-fast templating language that borrows the best ideas from the existing templating languages."
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59"},
-    {file = "mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28"},
+    {file = "mako-1.3.11-py3-none-any.whl", hash = "sha256:e372c6e333cf004aa736a15f425087ec977e1fcbd2966aae7f17c8dc1da27a77"},
+    {file = "mako-1.3.11.tar.gz", hash = "sha256:071eb4ab4c5010443152255d77db7faa6ce5916f35226eb02dc34479b6858069"},
 ]

 [package.dependencies]
--- a/src/qibotn/backends/quimb.py
+++ b/src/qibotn/backends/quimb.py
@@ -167,7 +167,7 @@ def execute_circuit(
        raise_error(ValueError, "Initial state not None supported only for MPS ansatz.")

    circ_quimb = self.circuit_ansatz.from_openqasm2_str(
-        circuit.to_qasm(), psi0=initial_state
+        circuit.to_qasm(), psi0=initial_state, gate_opts={"max_bond": self.max_bond_dimension, "cutoff": self.svd_cutoff}
    )

    if nshots:
--- a/src/qibotn/result.py
+++ b/src/qibotn/result.py
@@ -58,7 +58,7 @@ class TensorNetworkResult:

    def state(self):
        """Return the statevector if the number of qubits is less than 20."""
-        if self.nqubits < 20:
+        if self.nqubits < 35:
            return self.statevector
        raise_error(
            NotImplementedError,
--- a/sweep_bond_32q.py
+++ b/sweep_bond_32q.py
@@ -0,0 +1,39 @@
+"""Bond dimension sweep for 32-qubit variational circuit."""
+import os
+import sys
+import numpy as np
+
+sys.path.insert(0, os.path.dirname(__file__))
+from benchmark_mps import make_circuit, run_qibojit, run_quimb_mps, compare, jit_cache_path, DATA_DIR
+
+NQUBITS = 32
+NLAYERS = 5
+BOND_VALUES = [1, 8, 16, 32, 64, 128, 256]
+SVD_CUTOFF = 1e-6
+OPTIMIZER = "auto-hq"
+
+if __name__ == "__main__":
+    cache_path = jit_cache_path("variational", NQUBITS, NLAYERS)
+
+    if os.path.exists(cache_path):
+        sv_ref = np.load(cache_path)
+        print(f"[qibojit] loaded from cache: {cache_path}\n")
+    else:
+        np.random.seed(42)
+        circuit_ref = make_circuit("variational", NQUBITS, NLAYERS)
+        sv_ref, t_ref = run_qibojit(circuit_ref)
+        np.save(cache_path, sv_ref)
+        print(f"[qibojit] time={t_ref:.4f}s  (saved to {cache_path})\n")
+
+    print(f"{'bond':>6}  {'time(s)':>10}  {'fidelity':>12}  {'l2_err':>10}")
+    print("-" * 46)
+
+    for bond in BOND_VALUES:
+        np.random.seed(42)
+        circuit_mps = make_circuit("variational", NQUBITS, NLAYERS)
+        try:
+            sv_mps, t_mps = run_quimb_mps(circuit_mps, bond, SVD_CUTOFF, OPTIMIZER)
+            fidelity, l2_err = compare(sv_ref, sv_mps)
+            print(f"{bond:>6}  {t_mps:>10.4f}  {fidelity:>12.8f}  {l2_err:>10.2e}")
+        except Exception as e:
+            print(f"{bond:>6}  FAILED: {e}")
--- a/tests/contract.py
+++ b/tests/contract.py
@@ -1,27 +0,0 @@
-import time
-import pickle
-
-
-def run(input="tree.pkl"):
-    with open(input, "rb") as f:
-        data = pickle.load(f)
-
-    sliced_tree = data["sliced_tree"]
-    arrays = data["arrays"]
-    n_slices = sliced_tree.nslices
-    print(f"Total slices: {n_slices}")
-
-    t0 = time.perf_counter()
-    total = sum(sliced_tree.contract_slice(arrays, i, backend='numpy',implementation='cotengra') for i in range(n_slices))
-    t1 = time.perf_counter()
-
-    print(f"Contract: {t1 - t0:.4f} s")
-    #print(f"Result: {total:.10f}")
-
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--input", type=str, default="tree.pkl.bak")
-    args = parser.parse_args()
-    run(args.input)
--- a/tests/gen_qasm.py
+++ b/tests/gen_qasm.py
@@ -1,60 +0,0 @@
-"""生成比赛常用测试电路的 QASM 文件。"""
-import argparse
-import qibo
-from qibo.models import QFT, Circuit
-from qibo import gates
-import numpy as np
-
-qibo.set_backend("numpy")
-
-
-def gen_qft(n_qubits):
-    return QFT(n_qubits, with_swaps=True).to_qasm()
-
-
-def gen_random(n_qubits, depth, seed):
-    rng = np.random.default_rng(seed)
-    c = Circuit(n_qubits)
-    for _ in range(depth):
-        for q in range(n_qubits):
-            c.add(gates.H(q))
-        for q in range(0, n_qubits - 1, 2):
-            c.add(gates.CZ(q, q + 1))
-    return c.to_qasm()
-
-
-def gen_supremacy(n_qubits, depth, seed):
-    """Google supremacy 风格：随机单比特门 + CZ"""
-    rng = np.random.default_rng(seed)
-    single = [gates.X, gates.Y, gates.H]
-    c = Circuit(n_qubits)
-    for _ in range(depth):
-        for q in range(n_qubits):
-            g = single[rng.integers(3)]
-            c.add(g(q))
-        for q in range(0, n_qubits - 1, 2):
-            c.add(gates.CZ(q, q + 1))
-        for q in range(1, n_qubits - 1, 2):
-            c.add(gates.CZ(q, q + 1))
-    return c.to_qasm()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--circuit", default="qft", choices=["qft", "random", "supremacy"])
-    parser.add_argument("--n_qubits", type=int, default=20)
-    parser.add_argument("--depth", type=int, default=10)
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--out", default="circuit.qasm")
-    args = parser.parse_args()
-
-    if args.circuit == "qft":
-        qasm = gen_qft(args.n_qubits)
-    elif args.circuit == "random":
-        qasm = gen_random(args.n_qubits, args.depth, args.seed)
-    else:
-        qasm = gen_supremacy(args.n_qubits, args.depth, args.seed)
-
-    with open(args.out, "w") as f:
-        f.write(qasm)
-    print(f"Written: {args.out}  ({args.n_qubits} qubits, {args.circuit})")
--- a/tests/hostfile
+++ b/tests/hostfile
@@ -1,2 +0,0 @@
-192.168.20.102
-192.168.20.101
--- a/tests/mpi_v.py
+++ b/tests/mpi_v.py
@@ -1,126 +0,0 @@
-"""
-MPI + ThreadPoolExecutor 混合并行张量网络收缩。
-每个 MPI rank 负责一部分 slice（stride 分配），
-rank 内用 ThreadPoolExecutor 并行执行各 slice（每线程一个 slice）。
-
-用法：
-    mpirun -n <N> python mpi_v.py --qasm circuit.qasm --target-slices 16 --threads 8
-"""
-import os
-import time
-import argparse
-import numpy as np
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from mpi4py import MPI
-
-comm = MPI.COMM_WORLD
-rank = comm.Get_rank()
-size = comm.Get_size()
-
-import quimb.tensor as qtn
-import cotengra as ctg
-
-
-def _contract_slice(sliced_tree, arrays, idx):
-    return sliced_tree.contract_slice(arrays, idx, backend="numpy")
-
-
-def run(qasm_path, target_slices, n_threads, max_repeats):
-    # ── 构建张量网络（rank 0，broadcast arrays）──
-    if rank == 0:
-        with open(qasm_path) as f:
-            qasm_str = f.read()
-        # 不用 full_simplify，保持 outer_inds 完整
-        psi = qtn.Circuit.from_openqasm2_str(qasm_str).psi
-        n_qubits = len([i for i in psi.outer_inds() if i.startswith("k")])
-        output_inds = [f"k{i}" for i in range(n_qubits)]
-        arrays = [t.data for t in psi.tensors]
-    else:
-        psi = None
-        n_qubits = None
-        arrays = None
-        output_inds = None
-
-    n_qubits = comm.bcast(n_qubits, root=0)
-    arrays = comm.bcast(arrays, root=0)
-    output_inds = comm.bcast(output_inds, root=0)
-
-    # ── 路径搜索（rank 0）+ broadcast ──
-    t0 = time.perf_counter()
-    if rank == 0:
-        opt = ctg.HyperOptimizer(
-            methods=["kahypar", "greedy"],
-            max_repeats=max_repeats,
-            minimize="flops",
-            parallel=min(96, os.cpu_count()),
-        )
-        tree = psi.contraction_tree(optimize=opt, output_inds=output_inds)
-        n = target_slices
-        sliced_tree = None
-        while n >= 1:
-            try:
-                sliced_tree = tree.slice(target_size=n, allow_outer=False)
-                break
-            except RuntimeError:
-                n //= 2
-        if sliced_tree is None:
-            sliced_tree = tree.slice(target_slices=1, allow_outer=True)
-        print(f"[rank 0] path search: {time.perf_counter()-t0:.2f}s  slices: {sliced_tree.nslices}", flush=True)
-    else:
-        sliced_tree = None
-
-    sliced_tree = comm.bcast(sliced_tree, root=0)
-    n_slices = sliced_tree.nslices
-
-    # ── 分布式收缩（MPI stride + ThreadPoolExecutor）──
-    my_indices = list(range(rank, n_slices, size))
-    local_result = np.zeros(2**n_qubits, dtype=np.complex128)
-
-    comm.Barrier()
-    t1 = time.perf_counter()
-
-    with ThreadPoolExecutor(max_workers=n_threads) as pool:
-        for batch_start in range(0, len(my_indices), n_threads):
-            batch = my_indices[batch_start:batch_start + n_threads]
-            futures = {pool.submit(_contract_slice, sliced_tree, arrays, i): i for i in batch}
-            for fut in as_completed(futures):
-                local_result += np.array(fut.result()).flatten()
-
-    t2 = time.perf_counter()
-    if rank == 0:
-        print(f"[rank 0] contract: {t2-t1:.2f}s", flush=True)
-
-    # ── MPI reduce ──
-    total = comm.reduce(local_result, op=MPI.SUM, root=0)
-
-    if rank == 0:
-        print(f"result norm: {np.linalg.norm(total):.10f}", flush=True)
-        print(f"total time:  {t2-t0:.2f}s", flush=True)
-        return total
-    return None
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--qasm", required=True, help="QASM 文件路径")
-    parser.add_argument("--target-slices", type=int, default=None,
-                        help="目标切片数量（优先于 target-size）")
-    parser.add_argument("--target-size", type=int, default=28,
-                        help="切片目标大小指数（2^N），默认 28")
-    parser.add_argument("--threads", type=int, default=max(1, os.cpu_count() // size),
-                        help="每个 rank 的线程数，默认 cpu_count/size")
-    parser.add_argument("--max-repeats", type=int, default=256,
-                        help="cotengra 路径搜索重复次数")
-    args = parser.parse_args()
-
-    target = args.target_slices if args.target_slices else 2**args.target_size
-    mode = "slices" if args.target_slices else f"size=2^{args.target_size}"
-
-    if rank == 0:
-        print(f"ranks={size}  threads/rank={args.threads}  target_{mode}", flush=True)
-
-    run(args.qasm, target, args.threads, args.max_repeats)
-
-
-if __name__ == "__main__":
-    main()
--- a/tests/quimb_mpi.py
+++ b/tests/quimb_mpi.py
@@ -1,68 +0,0 @@
-import os
-import time
-import numpy as np
-import quimb.tensor as qtn
-import cotengra as ctg
-'''
-# --- 1. 关键：在导入 numpy/quimb 之前设置环境变量 ---
-# 告诉底层 BLAS 库 (MKL/OpenBLAS) 使用 96 个线程
-os.environ["OMP_NUM_THREADS"] = "1"
-os.environ["MKL_NUM_THREADS"] = "1"
-os.environ["OPENBLAS_NUM_THREADS"] = "1"
-# 优化线程亲和性，避免线程在不同 CPU 核心间跳变，提升缓存命中率
-os.environ["KMP_AFFINITY"] = "granularity=fine,compact,1,0"
-os.environ["KMP_BLOCKTIME"] = "0"
-'''
-# 现在导入库
-import psutil
-
-def run_baseline(n_qubits=50, depth=20):
-    print(f"🚀 {n_qubits} Qubits, Depth {depth}")
-    print(f"💻 Detected Logical Cores: {os.cpu_count()}")
-    
-    # 1. 构建电路 (必须 complex128 保证精度)
-    circ = qtn.Circuit(n_qubits, dtype=np.complex128)
-    for d in range(depth):
-        for i in range(n_qubits):
-            circ.apply_gate('H', i)
-        for i in range(0, n_qubits - 1, 2):
-            circ.apply_gate('CZ', i, i + 1)
-            
-    psi = circ.psi
-    
-    # 2. 构建闭合网络 <psi|psi>
-    net = psi.conj() & psi
-    
-    # 3. 路径搜索参数 (Kahypar)
-    print("🔍 Searching path with Kahypar...")
-    opt = ctg.HyperOptimizer(
-        methods=['kahypar'],
-        max_repeats=128,
-        parallel=96,       
-        minimize='flops',
-        on_trial_error='ignore'
-    )
-    
-    # 4. 阶段1：路径搜索
-    t0 = time.perf_counter()
-    tree = net.contraction_tree(optimize=opt)
-    t1 = time.perf_counter()
-    print(f"🔍 Path search done: {t1 - t0:.4f} s")
-
-    # 5. 阶段2：张量收缩
-    result = net.contract(optimize=tree, backend='numpy')
-    t2 = time.perf_counter()
-    peak_mem = psutil.Process().memory_info().rss / 1024**3
-
-    print(f"✅ Done!")
-    print(f"⏱️ Contract: {t2 - t1:.4f} s  |  Total: {t2 - t0:.4f} s")
-    print(f"💾 Peak Memory: {peak_mem:.2f} GB")
-    print(f"🔢 Result: {result:.10f}")
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--n_qubits", type=int, default=50)
-    parser.add_argument("--depth", type=int, default=20)
-    args = parser.parse_args()
-    run_baseline(n_qubits=args.n_qubits, depth=args.depth)
--- a/tests/quimb_mpi2.py
+++ b/tests/quimb_mpi2.py
@@ -1,90 +0,0 @@
-import time
-import numpy as np
-import quimb.tensor as qtn
-import cotengra as ctg
-from mpi4py import MPI
-
-comm = MPI.COMM_WORLD
-rank = comm.Get_rank()
-size = comm.Get_size()
-
-def build_qft(n_qubits):
-    circ = qtn.Circuit(n_qubits, dtype=np.complex128)
-    for i in range(n_qubits):
-        circ.apply_gate('H', i)
-        for j in range(i + 1, n_qubits):
-            circ.apply_gate('CPHASE', np.pi / 2 ** (j - i), i, j)
-    return circ
-
-def run_mpi(n_qubits, depth=None):
-    if rank == 0:
-        print(f"MPI size: {size} ranks")
-        print(f"Circuit: QFT {n_qubits} qubits")
-
-    circ = build_qft(n_qubits)
-    psi = circ.psi
-
-    # 期望值网络：<psi|Z_0|psi>
-    Z = np.array([[1, 0], [0, -1]], dtype=np.complex128)
-    bra = psi.conj().reindex({f'k{i}': f'b{i}' for i in range(n_qubits)})
-    obs = qtn.Tensor(Z, inds=(f'k0', f'b0'))
-    net = psi & obs & bra
-
-    # 2. 所有 rank 并行搜索路径，rank 0 选全局最优
-    t0 = time.perf_counter()
-    repeats_per_rank = max(1, 128 // size)
-    opt = ctg.HyperOptimizer(
-        methods=['kahypar'],
-        #methods=['greedy'],
-        #max_repeats=repeats_per_rank,
-        max_repeats=repeats_per_rank,
-        minimize='flops',
-        parallel=max(1, 96 // size),
-    )
-    local_tree = net.contraction_tree(optimize=opt)
-
-    all_trees = comm.gather(local_tree, root=0)
-
-    if rank == 0:
-        tree = min(all_trees, key=lambda t: t.contraction_cost())
-        t1 = time.perf_counter()
-        print(f"[rank 0] Path search: {t1 - t0:.4f} s")
-    else:
-        tree = None
-
-    tree = comm.bcast(tree, root=0)
-
-    # 3. rank 0 切片，broadcast sliced_tree
-    if rank == 0:
-        sliced_tree = tree.slice(target_size=2**27)
-    else:
-        sliced_tree = None
-    sliced_tree = comm.bcast(sliced_tree, root=0)
-    n_slices = sliced_tree.nslices
-
-    if rank == 0:
-        print(f"Total slices: {n_slices}, each rank handles ~{n_slices // size}")
-
-    arrays = [t.data for t in net.tensors]
-
-    # 每个 rank 处理自己负责的切片
-    t2 = time.perf_counter()
-    local_result = 0.0 + 0.0j
-    for i in range(rank, n_slices, size):
-        local_result += sliced_tree.contract_slice(arrays, i, backend='numpy')
-    t3 = time.perf_counter()
-
-    # 4. reduce 汇总到 rank 0
-    total = comm.reduce(local_result, op=MPI.SUM, root=0)
-
-    if rank == 0:
-        print(f"[rank 0] Contract: {t3 - t2:.4f} s")
-        print(f"Result: {total:.10f}")
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--n_qubits", type=int, default=20)
-    parser.add_argument("--depth", type=int, default=30)
-    args = parser.parse_args()
-    run_mpi(args.n_qubits, args.depth)
--- a/tests/quimb_mpi3.py
+++ b/tests/quimb_mpi3.py
@@ -1,103 +0,0 @@
-import time
-import numpy as np
-import quimb.tensor as qtn
-import cotengra as ctg
-from mpi4py import MPI
-
-comm = MPI.COMM_WORLD
-rank = comm.Get_rank()
-size = comm.Get_size()
-
-def build_qft_circuit(n_qubits):
-    """构建标准 QFT 电路"""
-    circ = qtn.Circuit(n_qubits, dtype=np.complex128)
-    for i in range(n_qubits):
-        # 1. 施加 H 门
-        circ.apply_gate('H', i)
-        # 2. 施加受控相位旋转
-        for j in range(i + 1, n_qubits):
-            theta = np.pi / (2**(j - i))
-            circ.apply_gate('CPHASE', theta, i, j)
-    return circ
-
-def run_mpi(n_qubits):
-    if rank == 0:
-        print(f"MPI size: {size} ranks")
-        print(f"Circuit: QFT {n_qubits} qubits")
-
-    # 1. 所有 rank 独立构建 QFT 电路
-    circ = build_qft_circuit(n_qubits)
-    
-    # 物理观测：计算 <psi|psi>，结果应为 1.0
-    # 注意：QFT 是幺正变换，末态模长平方必为 1
-    psi = circ.psi
-    net = psi.conj() & psi 
-
-    # 2. 路径搜索优化
-    t0 = time.perf_counter()
-    # 每个 rank 尝试不同的种子，增加找到全局最优路径的概率
-    repeats_per_rank = max(1, 256 // size)
-    opt = ctg.HyperOptimizer(
-        methods=['kahypar'],
-        max_repeats=repeats_per_rank,
-        minimize='flops',
-        parallel=max(1, 96 // size),
-        ) 
-    # 搜索收缩树
-    local_tree = net.contraction_tree(optimize=opt)
-    
-    # 汇总所有 rank 找到的树，在 rank 0 选出 FLOPs 最低的那棵
-    all_trees = comm.gather(local_tree, root=0)
-
-    if rank == 0:
-        tree = min(all_trees, key=lambda t: t.contraction_cost())
-        t1 = time.perf_counter()
-        print(f"[rank 0] Path search: {t1 - t0:.4f} s")
-        print(f"[rank 0] Best path FLOPs: {tree.contraction_cost():.2e}")
-    else:
-        tree = None
-
-    # 将最优路径广播给所有进程
-    tree = comm.bcast(tree, root=0)
-
-    # 3. 切片处理（性能控制核心）
-    if rank == 0:
-        # 比赛建议：将 target_size 设为能填满单进程内存的 50%-70%
-        # 或者改用 target_slices=size * 4 以确保负载绝对平衡
-        sliced_tree = tree.slice(target_size=2**27) 
-    else:
-        sliced_tree = None
-        
-    sliced_tree = comm.bcast(sliced_tree, root=0)
-    n_slices = sliced_tree.nslices
-
-    if rank == 0:
-        print(f"Total slices: {n_slices}, each rank handles ~{n_slices // size + 1}")
-
-    # 获取原始张量数据
-    arrays = [t.data for t in net.tensors]
-
-    # 4. 执行收缩计算
-    t2 = time.perf_counter()
-    local_result = 0.0 + 0.0j
-    # 简单的静态负载均衡：每个 rank 跳步处理切片
-    for i in range(rank, n_slices, size):
-        local_result += sliced_tree.contract_slice(arrays, i, backend='numpy')
-    t3 = time.perf_counter()
-
-    # 5. 结果汇总
-    total = comm.reduce(local_result, op=MPI.SUM, root=0)
-
-    if rank == 0:
-        duration = t3 - t2
-        print(f"[rank 0] Contract: {duration:.4f} s")
-        # 对于 <psi|psi>，QFT 的正确结果应无限接近 1.0
-        print(f"Result (Norm): {total.real:.10f} + {total.imag:.10f}j")
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--n_qubits", type=int, default=20)
-    # QFT 的深度由比特数自动决定，所以删除了 --depth 参数
-    args = parser.parse_args()
-    run_mpi(args.n_qubits)
--- a/tests/search_tree.py
+++ b/tests/search_tree.py
@@ -1,56 +0,0 @@
-import time
-import pickle
-import numpy as np
-import quimb.tensor as qtn
-import cotengra as ctg
-
-
-def build_qft(n_qubits):
-    circ = qtn.Circuit(n_qubits, dtype=np.complex128)
-    for i in range(n_qubits):
-        circ.apply_gate('H', i)
-        for j in range(i + 1, n_qubits):
-            circ.apply_gate('CPHASE', np.pi / 2 ** (j - i), i, j)
-    return circ
-
-
-def run(n_qubits, output="tree.pkl"):
-    print(f"Circuit: QFT {n_qubits} qubits")
-
-    circ = build_qft(n_qubits)
-    psi = circ.psi
-
-    Z = np.array([[1, 0], [0, -1]], dtype=np.complex128)
-    bra = psi.conj().reindex({f'k{i}': f'b{i}' for i in range(n_qubits)})
-    obs = qtn.Tensor(Z, inds=(f'k0', f'b0'))
-    net = psi & obs & bra
-
-    t0 = time.perf_counter()
-    opt = ctg.HyperOptimizer(
-        methods=['kahypar'],
-        max_repeats=32,
-        minimize='combo',
-        parallel=8,
-    )
-    tree = net.contraction_tree(optimize=opt)
-    t1 = time.perf_counter()
-    print(f"Path search: {t1 - t0:.4f} s")
-    print(tree)
-
-    sliced_tree = tree.slice(target_size=2**28)
-    print(f"Total slices: {sliced_tree.nslices}")
-
-    arrays = [t.data for t in net.tensors]
-
-    with open(output, "wb") as f:
-        pickle.dump({"sliced_tree": sliced_tree, "arrays": arrays}, f)
-    print(f"Saved to {output}")
-
-
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--n_qubits", type=int, default=18)
-    parser.add_argument("--output", type=str, default="tree.pkl")
-    args = parser.parse_args()
-    run(args.n_qubits, args.output)
--- a/tests/test_quimb_backend.py
+++ b/tests/test_quimb_backend.py
@@ -61,6 +61,6 @@ def test_eval(nqubits: int, tolerance: float, is_mps: bool):
        qasm_circ, init_state_tn, gate_opt, backend=config.quimb.backend
    ).flatten()

-    #assert np.allclose(
-    #    result_sv, result_tn, atol=tolerance
-    #), "Resulting dense vectors do not match"
+    assert np.allclose(
+        result_sv, result_tn, atol=tolerance
+    ), "Resulting dense vectors do not match"
--- a/tests/tree.pkl.bak
+++ b/tests/tree.pkl.bak
Author	SHA1	Message	Date
jaunatisblue	80d9c1de5a	benchmark测试，发现瓶颈：路径搜索 Some checks are pending Build wheels / build (ubuntu-latest, 3.11) (push) Waiting to run Details Build wheels / build (ubuntu-latest, 3.12) (push) Waiting to run Details Build wheels / build (ubuntu-latest, 3.13) (push) Waiting to run Details Tests / check (push) Waiting to run Details Tests / build (ubuntu-latest, 3.11) (push) Blocked by required conditions Details Tests / build (ubuntu-latest, 3.12) (push) Blocked by required conditions Details Tests / build (ubuntu-latest, 3.13) (push) Blocked by required conditions Details	2026-04-27 18:59:54 +08:00
jaunatisblue	2c54840e7b	1.完成mps态脚本，与原始qibojit结果比对确定bond dimension和cut off值；2.更新了官方库；3.新大陆 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-04-27 11:03:57 +08:00