From 2f5c86395293d68970d65b76c7d453ba2e66180c Mon Sep 17 00:00:00 2001
From: jaunatisblue <jaunatisblue@gmail.com>
Date: Thu, 7 May 2026 23:26:53 +0800
Subject: [PATCH] =?UTF-8?q?=E5=B9=B6=E8=A1=8C=E5=8C=96=E6=94=AF=E6=8C=81?=
 =?UTF-8?q?=E5=AE=8C=E5=96=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                   |  2 +-
 benchmark_contract_sliced.py | 53 ++++++++++++++++++++
 benchmark_expectation.py     | 56 ---------------------
 benchmark_search.py          | 34 +++++++++++++
 benchmark_slice.py           | 16 ++++++
 check_tree.py                | 13 +++++
 src/qibotn/backends/quimb.py |  4 +-
 src/qibotn/observables.py    | 15 ++++++
 src/qibotn/parallel.py       | 95 +++++++++++++++++++++++-------------
 9 files changed, 195 insertions(+), 93 deletions(-)
 create mode 100644 benchmark_contract_sliced.py
 delete mode 100644 benchmark_expectation.py
 create mode 100644 benchmark_search.py
 create mode 100644 benchmark_slice.py
 create mode 100644 check_tree.py

diff --git a/.gitignore b/.gitignore
index 1c33bfb..2705f4d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,7 +8,7 @@ data/
 bak/
 path/
 profiles/
-
+vtune_expval/
 perf*
 # Distribution / packaging
 .Python
diff --git a/benchmark_contract_sliced.py b/benchmark_contract_sliced.py
new file mode 100644
index 0000000..0f56cd1
--- /dev/null
+++ b/benchmark_contract_sliced.py
@@ -0,0 +1,53 @@
+"""MPI parallel sliced contraction using pre-sliced tree."""
+import time, pickle, os
+import numpy as np
+from mpi4py import MPI
+
+NQUBITS, NLAYERS, NCORES = 25, 10, 96
+
+comm = MPI.COMM_WORLD
+rank, size = comm.Get_rank(), comm.Get_size()
+
+os.environ['OMP_NUM_THREADS'] = str(max(1, NCORES // size))
+os.environ['MKL_NUM_THREADS'] = str(max(1, NCORES // size))
+
+import torch
+import qibo, quimb as qu
+from qibotn.observables import build_random_circuit
+
+torch.set_num_threads(max(1, NCORES // size))
+
+circuit = build_random_circuit(NQUBITS, NLAYERS)
+qibo.set_backend("qibotn", platform="quimb")
+backend = qibo.get_backend()
+backend.configure_tn_simulation(ansatz="tn")
+qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
+tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn')
+
+with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'rb') as f:
+#with open(f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl", 'rb') as f:
+    tree = pickle.load(f)
+
+arrays = [torch.from_numpy(np.ascontiguousarray(t._data, dtype=np.complex128)) for t in tn.tensors]
+n_slices = tree.multiplicity
+
+if rank == 0:
+    print(f"Slices: {n_slices}, Ranks: {size}, "
+          f"Peak: {tree.max_size() * 16 / 1e9:.2f} GB, "
+          f"Threads/rank: {max(1, NCORES // size)}, Backend: torch")
+
+t0 = time.time()
+result = None
+for i in range(rank, n_slices, size):
+    val = tree.contract_slice(arrays, i, backend='torch')
+    val_np = val.cpu().numpy().reshape(-1)
+    result = val_np if result is None else result + val_np
+
+if result is None:
+    result = np.zeros(1, dtype=np.complex128)
+
+total = np.zeros_like(result) if rank == 0 else None
+comm.Reduce(result, total, root=0)
+
+if rank == 0:
+    print(f"Contract: {time.time() - t0:.4f}s  Expectation: {0.5 * total[0].real:.10f}")
diff --git a/benchmark_expectation.py b/benchmark_expectation.py
deleted file mode 100644
index 4c70ecf..0000000
--- a/benchmark_expectation.py
+++ /dev/null
@@ -1,56 +0,0 @@
-"""Expectation value benchmark with timing breakdown."""
-import time
-import numpy as np
-import torch
-import qibo
-from qibo import Circuit, gates, hamiltonians
-from qibo.symbols import X, Z
-import quimb as qu
-from qibotn.parallel import parallel_path_search
-
-# Config
-NQUBITS = 25
-NLAYERS = 10
-WORKERS = 96
-
-# Build circuit
-np.random.seed(42)
-circuit = Circuit(NQUBITS)
-for _ in range(NLAYERS):
-    for q in range(NQUBITS):
-        circuit.add(gates.RY(q, theta=np.random.uniform(0, 2*np.pi)))
-        circuit.add(gates.RZ(q, theta=np.random.uniform(0, 2*np.pi)))
-    for q in range(NQUBITS):
-        circuit.add(gates.CNOT(q % NQUBITS, (q + 1) % NQUBITS))
-
-# Setup backend
-qibo.set_backend("qibotn", platform="quimb")
-backend = qibo.get_backend()
-backend.configure_tn_simulation(ansatz="tn")
-torch.set_num_threads(WORKERS)
-
-# Prepare TN
-t0 = time.time()
-qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
-op = qu.pauli('x') & qu.pauli('z')
-tn = qc.local_expectation(op, (0, 1), rehearse='tn')
-t_prepare = time.time() - t0
-
-# Search path
-t0 = time.time()
-tree = parallel_path_search(tn, tn.outer_inds(), 'processpool', 1024, 300, WORKERS)
-t_search = time.time() - t0
-
-# Contract
-t0 = time.time()
-for tensor in tn.tensors:
-    tensor._data = torch.from_numpy(np.asarray(tensor._data)).to(torch.complex128)
-val = tn.contract(all, output_inds=(), optimize=tree)
-t_contract = time.time() - t0
-
-# Results
-print(f"Prepare:  {t_prepare:.4f}s")
-print(f"Search:   {t_search:.4f}s")
-print(f"Contract: {t_contract:.4f}s")
-print(f"Total:    {t_prepare+t_search+t_contract:.4f}s")
-print(f"Expectation: {0.5 * complex(val).real:.10f}")
diff --git a/benchmark_search.py b/benchmark_search.py
new file mode 100644
index 0000000..e110f8a
--- /dev/null
+++ b/benchmark_search.py
@@ -0,0 +1,34 @@
+"""Search contraction path and save."""
+import time, os, pickle
+from qibotn.parallel import parallel_path_search
+from qibotn.observables import build_random_circuit
+import qibo, quimb as qu
+
+from mpi4py import MPI
+
+NQUBITS, NLAYERS, WORKERS = 30, 10, 96
+
+comm = MPI.COMM_WORLD
+rank, size = comm.Get_rank(), comm.Get_size()
+method = 'mpi' if size > 1 else 'processpool'
+
+circuit = build_random_circuit(NQUBITS, NLAYERS)
+qibo.set_backend("qibotn", platform="quimb")
+backend = qibo.get_backend()
+backend.configure_tn_simulation(ansatz="tn")
+qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
+tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn')
+
+if rank == 0:
+    print(f"Searching {NQUBITS}q {NLAYERS}l, method={method}, ranks={size}, workers/rank={WORKERS}...")
+t0 = time.time()
+tree = parallel_path_search(tn, tn.outer_inds(), method=method,
+    total_repeats=1024, max_time=300, n_workers=WORKERS)
+t_search = time.time() - t0
+
+if rank == 0:
+    os.makedirs('data', exist_ok=True)
+    path = f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl"
+    with open(path, 'wb') as f:
+        pickle.dump(tree, f)
+    print(f"Search: {t_search:.2f}s  Peak: {tree.max_size() * 16 / 1e9:.2f} GB  Saved: {path}")
diff --git a/benchmark_slice.py b/benchmark_slice.py
new file mode 100644
index 0000000..8e7daae
--- /dev/null
+++ b/benchmark_slice.py
@@ -0,0 +1,16 @@
+"""Slice saved tree and save."""
+import pickle
+
+NQUBITS, NLAYERS = 25, 10
+
+with open(f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl", 'rb') as f:
+    tree = pickle.load(f)
+
+print(f"Original peak: {tree.max_size() * 16 / 1e9:.2f} GB")
+
+tree_sliced = tree.slice_and_reconfigure(target_size=2**30)  # 2^29 = 8 GB
+
+with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'wb') as f:
+    pickle.dump(tree_sliced, f)
+
+print(f"Sliced peak: {tree_sliced.max_size() * 16 / 1e9:.2f} GB  Slices: {tree_sliced.multiplicity}")
diff --git a/check_tree.py b/check_tree.py
new file mode 100644
index 0000000..dfd8d40
--- /dev/null
+++ b/check_tree.py
@@ -0,0 +1,13 @@
+"""Check contraction tree statistics."""
+import pickle, sys
+
+path = sys.argv[1] if len(sys.argv) > 1 else "data/tree_q25_l10.pkl"
+with open(path, 'rb') as f:
+    tree = pickle.load(f)
+
+print(f"File: {path}")
+print(f"Peak memory elements: {tree.max_size():.2e}")
+print(f"Peak memory (GB): {tree.max_size() * 16 / 1e9:.2f}")  # complex128 = 16 bytes
+print(f"Total FLOPs: {tree.total_flops():.2e}")
+print(f"Contraction width: {tree.contraction_width()}")
+print(f"Multiplicity (slices): {tree.multiplicity}")
diff --git a/src/qibotn/backends/quimb.py b/src/qibotn/backends/quimb.py
index 3868eec..3e532be 100644
--- a/src/qibotn/backends/quimb.py
+++ b/src/qibotn/backends/quimb.py
@@ -438,6 +438,7 @@ def _expectation_parallel(self, circuit, observable, method, opts):
     search_workers = opts.get('search_workers', 48)
     mpi_contract = opts.get('mpi_contract', False)
     torch_threads = opts.get('torch_threads', None)
+    slicing_opts = opts.get('slicing_opts', None)
 
     qc = self._qibo_circuit_to_quimb(
         circuit,
@@ -469,7 +470,8 @@ def _expectation_parallel(self, circuit, observable, method, opts):
             method=method,
             total_repeats=max_repeats,
             max_time=max_time,
-            n_workers=search_workers
+            n_workers=search_workers,
+            slicing_opts=slicing_opts,
         )
 
         if tree is None:
diff --git a/src/qibotn/observables.py b/src/qibotn/observables.py
index 309ace4..cca738e 100644
--- a/src/qibotn/observables.py
+++ b/src/qibotn/observables.py
@@ -51,6 +51,21 @@ def create_hamiltonian_from_dict(data, circuit_nqubit):
     return hamiltonians.SymbolicHamiltonian(sum(terms))
 
 
+def build_random_circuit(nqubits, nlayers, seed=42):
+    """Build a random circuit with RY+RZ+CNOT layers for benchmarks."""
+    import numpy as np
+    from qibo import Circuit, gates
+    np.random.seed(seed)
+    c = Circuit(nqubits)
+    for _ in range(nlayers):
+        for q in range(nqubits):
+            c.add(gates.RY(q, theta=np.random.uniform(0, 2*np.pi)))
+            c.add(gates.RZ(q, theta=np.random.uniform(0, 2*np.pi)))
+        for q in range(nqubits):
+            c.add(gates.CNOT(q % nqubits, (q + 1) % nqubits))
+    return c
+
+
 def extract_gates_and_qubits(hamiltonian):
     """Extract per-term Pauli factors from a Qibo SymbolicHamiltonian.
 
diff --git a/src/qibotn/parallel.py b/src/qibotn/parallel.py
index eab6b43..5073ee7 100644
--- a/src/qibotn/parallel.py
+++ b/src/qibotn/parallel.py
@@ -13,7 +13,7 @@ except ImportError:
     MPI = None
 
 
-def _serial_search(tn_bytes, output_inds, repeats, seed, max_time):
+def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts=None):
     """Single-process path search with cotengra."""
     import random
     import cotengra as ctg
@@ -26,44 +26,20 @@ def _serial_search(tn_bytes, output_inds, repeats, seed, max_time):
         minimize="combo-256",
         max_time=max_time,
         optlib="random",
+        slicing_opts=slicing_opts,
         progbar=False,
     )
     tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
     return tree.combo_cost(factor=256), tree
 
 
-def parallel_path_search(tn, output_inds, method='processpool', total_repeats=1024,
-                        max_time=300, n_workers=48):
-    """Parallel contraction path search.
-
-    Args:
-        tn: Tensor network (quimb TensorNetwork)
-        output_inds: Output indices for contraction
-        method: 'processpool' or 'mpi'
-        total_repeats: Total optimization repeats
-        max_time: Timeout per worker (seconds)
-        n_workers: Number of workers (processpool only)
-
-    Returns:
-        Best contraction tree found
-    """
-    if method == 'mpi':
-        if not _HAVE_MPI:
-            raise ImportError("mpi4py not available")
-        return _mpi_search(tn, output_inds, total_repeats, max_time)
-    elif method == 'processpool':
-        return _processpool_search(tn, output_inds, total_repeats, n_workers, max_time)
-    else:
-        raise ValueError(f"Unknown method: {method}")
-
-
-def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time):
+def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts=None):
     """ProcessPool-based parallel search."""
     tn_bytes = pickle.dumps(tn)
     repeats_per = max(1, total_repeats // n_workers)
     pool = ProcessPoolExecutor(max_workers=n_workers)
     futures = [
-        pool.submit(_serial_search, tn_bytes, output_inds, repeats_per, seed, max_time)
+        pool.submit(_serial_search, tn_bytes, output_inds, repeats_per, seed, max_time, slicing_opts)
         for seed in range(n_workers)
     ]
     best_cost, best_tree = float("inf"), None
@@ -89,15 +65,30 @@ def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time):
     return best_tree
 
 
-def _mpi_search(tn, output_inds, total_repeats, max_time):
-    """MPI-based parallel search across ranks."""
+def _mpi_search(tn, output_inds, total_repeats, max_time, n_workers=None, slicing_opts=None):
+    """MPI+ProcessPool hybrid search.
+
+    Each MPI rank uses a local ProcessPool for parallel search,
+    then the best tree is gathered and broadcast.
+    """
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     size = comm.Get_size()
     tn_bytes = pickle.dumps(tn)
     repeats_per = max(1, total_repeats // size)
-    local_cost, local_tree = _serial_search(tn_bytes, output_inds, repeats_per, rank, max_time)
-    all_results = comm.gather((local_cost, local_tree), root=0)
+
+    if n_workers and n_workers > 1:
+        # Hybrid: each MPI rank uses ProcessPool
+        local_tree = _processpool_search(
+            tn, output_inds, repeats_per, n_workers, max_time, slicing_opts
+        )
+    else:
+        # Pure MPI: each rank runs serial
+        local_cost, local_tree = _serial_search(
+            tn_bytes, output_inds, repeats_per, rank, max_time, slicing_opts
+        )
+
+    all_results = comm.gather((local_tree.combo_cost(factor=256), local_tree), root=0)
     best_tree = None
     if rank == 0:
         best_cost = float("inf")
@@ -108,6 +99,36 @@ def _mpi_search(tn, output_inds, total_repeats, max_time):
     return best_tree
 
 
+def parallel_path_search(tn, output_inds, method='processpool', total_repeats=1024,
+                         max_time=300, n_workers=48, slicing_opts=None):
+    """Parallel contraction path search.
+
+    Args:
+        tn: Tensor network (quimb TensorNetwork)
+        output_inds: Output indices
+        method: 'processpool' | 'mpi' | 'serial'
+        total_repeats: Total optimization repeats
+        max_time: Timeout per worker (seconds)
+        n_workers: Number of workers (processpool only, or per-MPI-rank if MPI)
+        slicing_opts: dict for cotengra slicing_opts (memory control)
+
+    Returns:
+        Best contraction tree
+    """
+    if method == 'serial':
+        tn_bytes = pickle.dumps(tn)
+        _, tree = _serial_search(tn_bytes, output_inds, total_repeats, 0, max_time, slicing_opts)
+        return tree
+    elif method == 'mpi':
+        if not _HAVE_MPI:
+            raise ImportError("mpi4py not available")
+        return _mpi_search(tn, output_inds, total_repeats, max_time, n_workers, slicing_opts)
+    elif method == 'processpool':
+        return _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts)
+    else:
+        raise ValueError(f"Unknown method: {method}")
+
+
 def parallel_contract(tree, arrays, method='mpi', comm=None):
     """Parallel sliced contraction.
 
@@ -115,10 +136,10 @@ def parallel_contract(tree, arrays, method='mpi', comm=None):
         tree: Contraction tree
         arrays: List of tensor arrays
         method: 'mpi' (only MPI supported for now)
-        comm: MPI communicator (required for MPI)
+        comm: MPI communicator
 
     Returns:
-        Contracted result (on root rank for MPI)
+        Contracted result (on root rank for MPI, otherwise for all)
     """
     if method == 'mpi':
         if not _HAVE_MPI or comm is None:
@@ -129,7 +150,7 @@ def parallel_contract(tree, arrays, method='mpi', comm=None):
 
 
 def _contract_mpi(tree, arrays, comm, root=0):
-    """Distribute contraction slices across MPI ranks."""
+    """Distribute contraction slices across MPI ranks with Reduce."""
     rank = comm.Get_rank()
     size = comm.Get_size()
     is_torch = type(arrays[0]).__module__.startswith("torch")
@@ -145,4 +166,8 @@ def _contract_mpi(tree, arrays, comm, root=0):
 
     result = np.zeros_like(result_np) if rank == root else None
     comm.Reduce(result_np, result, root=root)
+
+    if rank == root and is_torch:
+        import torch
+        return torch.from_numpy(np.asarray(result))
     return result