完善mps的vidal机制,多节点并行;补充tn搜索时dask集群搜索的方式
Some checks failed
Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled
Tests / check (push) Has been cancelled
Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled

This commit is contained in:
2026-05-12 15:44:19 +08:00
parent aa122964b4
commit 72f95599bb
32 changed files with 3529 additions and 320 deletions

15
.gitignore vendored
View File

@@ -5,11 +5,6 @@ __pycache__/
data/
# C extensions
*.so
bak/
path/
profiles/
vtune_expval/
perf*
# Distribution / packaging
.Python
build/
@@ -164,3 +159,13 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.devenv
# yx
bak/
path/
profiles/
vtune_expval/
perf*
experiments/
references/

View File

@@ -12,7 +12,7 @@ Tensor Network Types:
Tensor Network contractions to:
- dense vectors
- expecation values of given Pauli string
- expectation values of given Pauli strings or Pauli-sum observables
The supported HPC configurations are:
@@ -26,6 +26,18 @@ Currently, the supported tensor network libraries are:
- [cuQuantum](https://github.com/NVIDIA/cuQuantum), an NVIDIA SDK of optimized libraries and tools for accelerating quantum computing workflows.
- [quimb](https://quimb.readthedocs.io/en/latest/), an easy but fast python library for quantum information many-body calculations, focusing primarily on tensor networks.
## CPU expectation benchmarks
The current CPU expectation entrypoint is:
```sh
python -u benchmark_cpu_expectation.py --ansatz mps --nqubits 40 --nlayers 10 --bond 2048 --circuits brickwall_cnot --observables ring_xz
```
Use `--ansatz tn` for the generic TN path and `--mpi` under `mpiexec` for MPI runs.
Reusable circuit and observable builders live in `src/qibotn/benchmark_cases.py`; execution logic lives in `src/qibotn/expectation_runner.py`.
For Vidal/MPS 1D-chain scale tests, use `run_vidal_mps_cases.sh`.
## Installation
To get started:

View File

@@ -0,0 +1,145 @@
"""CLI for CPU TN/MPS expectation benchmarks."""
from __future__ import annotations
import argparse
from qibotn.benchmark_cases import (
CIRCUITS,
OBSERVABLES,
build_circuit,
observable_terms,
parse_names,
terms_to_dict,
)
from qibotn.expectation_runner import (
ExpectationConfig,
exact_for_observable,
run_cpu_expectation,
)
def build_parallel_opts(args):
slicing_opts = {}
if args.tn_target_slices is not None:
slicing_opts["target_slices"] = args.tn_target_slices
if args.tn_target_size is not None:
slicing_opts["target_size"] = args.tn_target_size
opts = {
"slicing_opts": slicing_opts or None,
"search_workers": args.tn_search_workers or args.torch_threads,
"max_repeats": args.tn_search_repeats,
"max_time": args.tn_search_time,
}
if args.tn_search_backend is not None:
opts["search_backend"] = args.tn_search_backend
if args.dask_address is not None:
opts["dask_address"] = args.dask_address
return opts
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--nqubits", type=int, default=40)
parser.add_argument("--nlayers", type=int, default=30)
parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=1024)
parser.add_argument("--cut-ratio", type=float, default=1e-12)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--torch-threads", type=int, default=8)
parser.add_argument("--ansatz", choices=("tn", "mps"), default=None)
parser.add_argument("--mps", action="store_true")
parser.add_argument("--mpi", action="store_true")
parser.add_argument("--exact", action="store_true")
parser.add_argument("--exact-max-qubits", type=int, default=24)
parser.add_argument("--circuits", nargs="+", default=["brickwall_cnot"])
parser.add_argument("--observables", nargs="+", default=["ring_xz"])
parser.add_argument("--pauli-pattern")
parser.add_argument("--tn-target-slices", type=int)
parser.add_argument("--tn-target-size", type=int)
parser.add_argument("--tn-search-workers", type=int)
parser.add_argument("--tn-search-repeats", type=int, default=128)
parser.add_argument("--tn-search-time", type=float, default=60.0)
parser.add_argument(
"--tn-search-backend",
choices=("processpool", "dask"),
help="Path-search backend. In MPI mode, dask search runs only on rank 0 and broadcasts the tree.",
)
parser.add_argument(
"--dask-address",
help="Dask scheduler address, for example tcp://host:8786. If omitted with dask search, a local cluster is created.",
)
args = parser.parse_args()
ansatz = "mps" if args.mps else (args.ansatz or "tn")
circuits = parse_names(args.circuits, CIRCUITS, "circuits")
observables = [] if args.pauli_pattern else parse_names(
args.observables, OBSERVABLES, "observables"
)
rank = 0
if args.mpi:
from mpi4py import MPI
rank = MPI.COMM_WORLD.Get_rank()
config = ExpectationConfig(
ansatz=ansatz,
mpi=args.mpi,
bond=args.bond,
cut_ratio=args.cut_ratio,
tensor_module="torch",
torch_threads=args.torch_threads,
parallel_opts=build_parallel_opts(args),
)
if rank == 0:
mode = "MPI" if args.mpi else "serial"
print(
f"backend=cpu ansatz={ansatz.upper()} mode={mode} "
f"nqubits={args.nqubits} nlayers={args.nlayers} "
f"bond={args.bond} cut_ratio={args.cut_ratio:g} seed={args.seed} "
f"torch_threads={args.torch_threads} "
f"tn_search_backend={args.tn_search_backend or 'processpool'}"
)
print("circuit observable exact value abs_error rel_error seconds")
for circuit_kind in circuits:
circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed)
named_observables = (
[(f"pattern:{args.pauli_pattern}", {"pauli_string_pattern": args.pauli_pattern})]
if args.pauli_pattern
else [
(obs_kind, terms_to_dict(observable_terms(obs_kind, args.nqubits)))
for obs_kind in observables
]
)
for obs_name, observable in named_observables:
exact = None
if args.exact and rank == 0:
if args.nqubits > args.exact_max_qubits:
raise ValueError(
f"--exact is limited to {args.exact_max_qubits} qubits by default."
)
exact = exact_for_observable(circuit, observable, args.nqubits)
result = run_cpu_expectation(circuit, observable, config)
if args.mpi and result.rank != 0:
continue
abs_error = float("nan") if exact is None else abs(result.value - exact)
rel_error = (
float("nan")
if exact is None
else abs_error / max(abs(exact), 1e-15)
)
exact_text = "nan" if exact is None else f"{exact:.16e}"
print(
f"{circuit_kind} {obs_name} {exact_text} {result.value:.16e} "
f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}"
)
if __name__ == "__main__":
main()

View File

@@ -31,11 +31,13 @@ cuquantum-python-cu12 = { version = "^25.9.1", optional = true }
qmatchatea = { version = "^1.4.3", optional = true }
qiskit = { version = "^1.4.0", optional = true }
qtealeaves = { version = "^1.5.20", optional = true }
distributed = { version = ">=2024", optional = true }
[tool.poetry.extras]
cuda = ["cupy-cuda12x", "cuda-toolkit", "nvidia-nccl-cu12", "cuquantum-python-cu12", "mpi4py"]
qmatchatea = ["qmatchatea"]
dask = ["distributed"]
[tool.poetry.group.docs]
optional = true

134
run_vidal_mps_cases.sh Executable file
View File

@@ -0,0 +1,134 @@
#!/usr/bin/env bash
set -euo pipefail
# Focused Vidal/MPS expectation test cases for 1D chain circuits.
#
# These cases intentionally avoid qmatchatea and generic TN paths. They target
# the current supported scope: one-qubit gates, adjacent two-qubit gates, and
# Pauli-sum expectation values on a 1D chain.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$ROOT_DIR"
PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
MPIEXEC="${MPIEXEC:-mpiexec}"
HOSTFILE="${HOSTFILE:-hostfile}"
THREADS="${THREADS:-32}"
MPI_RANKS="${MPI_RANKS:-16}"
MPI_THREADS="${MPI_THREADS:-12}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
run() {
echo
echo "--------------------------------------------------------------------------------"
echo "$*"
echo "--------------------------------------------------------------------------------"
"$@"
}
case "${1:-help}" in
smoke)
# Short correctness-oriented run. Useful before starting long jobs.
run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
--mps \
--nqubits 40 \
--nlayers 10 \
--bond 2048 \
--torch-threads "$THREADS" \
--circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
--observables ring_xz open_zz range2_xx long_z_string
;;
convergence)
# Same circuit/observable, increasing bond. Check value convergence.
for bond in ${BONDS:-4096 16384 65536}; do
run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
--mps \
--nqubits "${NQ:-80}" \
--nlayers "${LAYERS:-16}" \
--bond "$bond" \
--torch-threads "$THREADS" \
--circuits "${CIRCUIT:-brickwall_cnot}" \
--observables "${OBSERVABLE:-ring_xz}"
done
;;
single-long)
# Single long Vidal run. On node-3, a similar n=40,l=30,bond=2048 case
# took about 9 minutes for one expectation. This one is meant to be longer.
run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
--mps \
--nqubits "${NQ:-80}" \
--nlayers "${LAYERS:-16}" \
--bond "${BOND:-65536}" \
--torch-threads "$THREADS" \
--circuits "${CIRCUIT:-brickwall_cnot}" \
--observables "${OBSERVABLE:-ring_xz}"
;;
suite-long)
# Application-style multi-circuit, multi-observable MPS run.
# This is intentionally multi-term and should run much longer than single-long.
run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
--mps \
--nqubits "${NQ:-80}" \
--nlayers "${LAYERS:-16}" \
--bond "${BOND:-65536}" \
--torch-threads "$THREADS" \
--circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
--observables ring_xz open_zz mixed_local range2_xx long_z_string
;;
mpi-long)
# Multi-node Vidal segmented MPS run. Uses HOSTFILE.
run "$MPIEXEC" -hostfile "$HOSTFILE" -n "$MPI_RANKS" "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
--mpi --mps \
--nqubits "${NQ:-80}" \
--nlayers "${LAYERS:-16}" \
--bond "${BOND:-65536}" \
--torch-threads "$MPI_THREADS" \
--circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
--observables ring_xz open_zz mixed_local range2_xx long_z_string
;;
stress)
# Heavier entanglement. Start only after single-long is stable.
run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
--mps \
--nqubits "${NQ:-80}" \
--nlayers "${LAYERS:-18}" \
--bond "${BOND:-262144}" \
--torch-threads "${THREADS:-48}" \
--circuits "${CIRCUIT:-rxx_rzz}" \
--observables ring_xz open_zz range2_xx
;;
help|*)
cat <<'EOF'
Usage: ./run_vidal_mps_cases.sh [smoke|convergence|single-long|suite-long|mpi-long|stress]
Common overrides:
PYTHON_BIN=.venv/bin/python
THREADS=32
OMP_NUM_THREADS=1 MKL_NUM_THREADS=1
Single-node scale overrides:
NQ=80 LAYERS=16 BOND=65536
CIRCUIT=brickwall_cnot
OBSERVABLE=ring_xz
BONDS="4096 16384 65536" # for convergence mode
Multi-node overrides:
HOSTFILE=hostfile
MPI_RANKS=16 MPI_THREADS=12
Recommended first runs:
./run_vidal_mps_cases.sh smoke
./run_vidal_mps_cases.sh convergence
./run_vidal_mps_cases.sh single-long
EOF
;;
esac

View File

@@ -3,9 +3,10 @@ from typing import Union
from qibo.config import raise_error
from qibotn.backends.abstract import QibotnBackend
from qibotn.backends.cpu import CpuTensorNet
from qibotn.backends.cutensornet import CuTensorNet # pylint: disable=E0401
PLATFORMS = ("cutensornet", "quimb", "qmatchatea")
PLATFORMS = ("cutensornet", "cpu", "quimb", "qmatchatea", "vidal")
class MetaBackend:
@@ -24,6 +25,8 @@ class MetaBackend:
if platform == "cutensornet": # pragma: no cover
return CuTensorNet(runcard)
elif platform == "cpu":
return CpuTensorNet(runcard)
elif platform == "quimb": # pragma: no cover
import qibotn.backends.quimb as qmb
@@ -36,6 +39,10 @@ class MetaBackend:
from qibotn.backends.qmatchatea import QMatchaTeaBackend
return QMatchaTeaBackend()
elif platform == "vidal":
from qibotn.backends.vidal import VidalBackend
return VidalBackend()
else:
raise_error(
NotImplementedError,

415
src/qibotn/backends/cpu.py Normal file
View File

@@ -0,0 +1,415 @@
"""CPU tensor-network backend with cutensornet-style runcard support."""
from __future__ import annotations
import os
import numpy as np
from qibo import hamiltonians
from qibo.backends import NumpyBackend
from qibo.config import raise_error
from qibotn.backends.abstract import QibotnBackend
from qibotn.backends.vidal import _symbolic_hamiltonian_to_pauli_terms, _unsupported_reason
from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
from qibotn.backends.vidal_tebd import VidalTEBDExecutor
from qibotn.observables import check_observable
from qibotn.result import TensorNetworkResult
def _as_bool_or_dict(value, name):
if isinstance(value, (bool, dict)):
return value
raise TypeError(f"{name} has an unexpected type")
def _parse_cpu_list(text):
cpus = set()
for item in text.split(","):
item = item.strip()
if not item:
continue
if "-" in item:
start, stop = item.split("-", 1)
cpus.update(range(int(start), int(stop) + 1))
else:
cpus.add(int(item))
return cpus
class CpuTensorNet(QibotnBackend, NumpyBackend):
"""CPU replacement for the cutensornet runcard execution surface.
The backend preserves the high-level runcard knobs used by the GPU backend:
``MPI_enabled``, ``MPS_enabled`` and ``expectation_enabled``. Generic TN
work is delegated to quimb on CPU; MPS expectation uses the Vidal fast path
when the circuit is nearest-neighbor and falls back to quimb otherwise.
"""
def __init__(self, runcard=None):
super().__init__()
self.name = "qibotn"
self.platform = "cpu"
self.precision = "double"
self.configure_tn_simulation(runcard)
def configure_tn_simulation(self, runcard=None):
runcard = {} if runcard is None else runcard
self.rank = 0
self.MPI_enabled = bool(runcard.get("MPI_enabled", False))
self.NCCL_enabled = bool(runcard.get("NCCL_enabled", False))
if self.NCCL_enabled:
raise_error(NotImplementedError, "NCCL is only available for GPU backends.")
expectation = runcard.get("expectation_enabled", False)
if expectation is True:
self.expectation_enabled = True
self.observable = None
elif expectation is False:
self.expectation_enabled = False
self.observable = None
elif isinstance(expectation, (dict, hamiltonians.SymbolicHamiltonian)):
self.expectation_enabled = True
self.observable = expectation
else:
raise TypeError("expectation_enabled has an unexpected type")
mps = _as_bool_or_dict(runcard.get("MPS_enabled", False), "MPS_enabled")
self.MPS_enabled = bool(mps)
self.mps_options = mps if isinstance(mps, dict) else {}
self.max_bond_dimension = runcard.get(
"max_bond_dimension",
self.mps_options.get("max_bond_dimension", 512),
)
self.cut_ratio = runcard.get(
"cut_ratio",
self.mps_options.get(
"cut_ratio",
self.mps_options.get("svd_method", {}).get("abs_cutoff", 1e-12),
),
)
self.tensor_module = runcard.get("tensor_module", "torch")
self.torch_threads = runcard.get("torch_threads", None)
self.quimb_backend = runcard.get("quimb_backend", "numpy")
self.contraction_optimizer = runcard.get("contraction_optimizer", "auto-hq")
self.parallel_opts = runcard.get("parallel_opts", {})
self.parallel_stats = []
def execute_circuit(
self,
circuit,
initial_state=None,
nshots=None,
prob_type=None,
return_array=False,
**prob_kwargs,
):
if initial_state is not None:
raise_error(NotImplementedError, "QiboTN CPU backend does not support initial state.")
if self.torch_threads is not None and self.tensor_module == "torch":
import torch
torch.set_num_threads(self.torch_threads)
if self.expectation_enabled:
value = self.expectation(circuit, self.observable)
if self.MPI_enabled and self.rank > 0:
return np.asarray([0], dtype=np.int64)
return np.asarray([value], dtype=np.float64)
backend = self._quimb_backend()
backend.configure_tn_simulation(
ansatz="mps" if self.MPS_enabled else None,
max_bond_dimension=self.max_bond_dimension if self.MPS_enabled else None,
svd_cutoff=self.cut_ratio,
)
return backend.execute_circuit(
circuit=circuit,
nshots=nshots,
return_array=return_array,
)
def expectation(self, circuit, observable=None, preprocess=True, compile_circuit=None):
observable = check_observable(observable, circuit.nqubits)
if self.MPS_enabled:
reason = _unsupported_reason(circuit)
if reason is None:
return self._vidal_expectation(circuit, observable)
backend = self._quimb_backend()
backend.configure_tn_simulation(
ansatz="mps" if self.MPS_enabled else None,
max_bond_dimension=self.max_bond_dimension if self.MPS_enabled else None,
svd_cutoff=self.cut_ratio,
)
if self.MPI_enabled:
return self._quimb_expectation_mpi(backend, circuit, observable)
return self._quimb_expectation_processpool(backend, circuit, observable)
def _vidal_expectation(self, circuit, observable):
terms = _symbolic_hamiltonian_to_pauli_terms(observable)
if self.MPI_enabled:
from mpi4py import MPI
comm = MPI.COMM_WORLD
self.rank = comm.Get_rank()
executor = SegmentVidalMPIExecutor(
nqubits=circuit.nqubits,
max_bond=self.max_bond_dimension,
cut_ratio=self.cut_ratio,
tensor_module=self.tensor_module,
comm=comm,
)
executor.run_circuit(circuit)
value = executor.expectation_pauli_sum_root(terms)
return np.nan if self.rank != 0 else float(np.real(value))
executor = VidalTEBDExecutor(
nqubits=circuit.nqubits,
max_bond=self.max_bond_dimension,
cut_ratio=self.cut_ratio,
tensor_module=self.tensor_module,
)
executor.run_circuit(circuit)
return float(np.real(executor.expectation_pauli_sum(terms)))
def _quimb_backend(self):
import qibotn.backends.quimb as qmb
return qmb.BACKENDS[self.quimb_backend](
quimb_backend=self.quimb_backend,
contraction_optimizer=self.contraction_optimizer,
)
def _bind_rank_to_numa_domain(self, rank):
domain = rank % 2
cpulist = f"/sys/devices/system/node/node{domain}/cpulist"
try:
cpus = _parse_cpu_list(open(cpulist, encoding="utf-8").read().strip())
if cpus:
os.sched_setaffinity(0, cpus)
except (FileNotFoundError, OSError):
pass
try:
import ctypes
libnuma = ctypes.CDLL("libnuma.so.1")
if libnuma.numa_available() >= 0:
libnuma.numa_run_on_node(ctypes.c_int(domain))
libnuma.numa_set_preferred(ctypes.c_int(domain))
except Exception:
pass
self.numa_domain = domain
def _default_search_workers(self, nranks=1):
if self.torch_threads:
return max(1, int(self.torch_threads))
return max(1, (os.cpu_count() or 1) // max(1, nranks))
def _quimb_expectation_processpool(self, backend, circuit, observable):
return self._quimb_expectation_search(
backend,
circuit,
observable,
method="processpool",
comm=None,
)
def _quimb_expectation_mpi(self, backend, circuit, observable):
from mpi4py import MPI
comm = MPI.COMM_WORLD
self.rank = comm.Get_rank()
self._bind_rank_to_numa_domain(self.rank)
return self._quimb_expectation_search(
backend,
circuit,
observable,
method="mpi",
comm=comm,
)
def _quimb_expectation_search(self, backend, circuit, observable, method, comm=None):
rank = comm.Get_rank() if comm is not None else 0
size = comm.Get_size() if comm is not None else 1
self.rank = rank
from qibotn.observables import extract_gates_and_qubits
from qibotn.parallel import (
contraction_tree_costs,
parallel_contract,
parallel_path_search,
)
from qibotn.backends.quimb import (
PAULI_DENSE_MAX_QUBITS,
_pauli_term_to_dense_operator,
pauli_product_expectation_tn,
)
opts = dict(self.parallel_opts)
user_slicing_opts = opts.get("slicing_opts")
search_workers = opts.get("search_workers", self._default_search_workers(size))
search_repeats = opts.get("max_repeats", 128)
search_time = opts.get("max_time", 60)
search_backend = opts.get("search_backend")
dask_address = opts.get("dask_address")
qc = backend._qibo_circuit_to_quimb(
circuit,
quimb_circuit_type=backend.circuit_ansatz,
gate_opts={
"max_bond": self.max_bond_dimension,
"cutoff": self.cut_ratio,
},
)
total_value = 0.0 + 0.0j
terms = extract_gates_and_qubits(observable)
for coeff, factors in terms:
if not factors:
if self.rank == 0:
total_value += coeff
continue
if len(factors) > PAULI_DENSE_MAX_QUBITS:
tn = pauli_product_expectation_tn(
qc,
factors,
simplify_sequence="ADCRS",
simplify_atol=1e-12,
)
else:
op, where = _pauli_term_to_dense_operator(factors)
tn = qc.local_expectation(
op,
where,
rehearse="tn",
simplify_sequence="ADCRS",
simplify_atol=1e-12,
)
slicing_opts = self._mpi_slicing_opts(
user_slicing_opts,
)
tree = parallel_path_search(
tn,
tn.outer_inds(),
method="dask" if method != "mpi" and search_backend == "dask" else method,
total_repeats=search_repeats,
max_time=search_time,
n_workers=search_workers,
slicing_opts=slicing_opts,
trial_timeout=opts.get("trial_timeout"),
search_backend=search_backend,
dask_address=dask_address,
)
if tree is None:
raise RuntimeError("Failed to find a contraction tree for CPU TN MPI.")
if int(getattr(tree, "multiplicity", 1)) <= 1:
if self.rank == 0:
value = self._contract_term_unsliced(tn, tree, backend)
self.parallel_stats.append(
{
"term_factors": tuple(factors),
"path_cost": contraction_tree_costs(tree),
"tree_slices": 1,
"slice_assignment": "root",
"rank_slices": [1] + [0] * (size - 1),
"search_workers": search_workers,
"search_repeats": search_repeats,
"search_time": search_time,
"search_backend": search_backend or method,
"dask_address": dask_address,
"numa_domain": getattr(self, "numa_domain", None),
}
)
total_value += coeff * complex(value)
continue
if comm is None:
value = self._contract_term_unsliced(tn, tree, backend)
self.parallel_stats.append(
{
"term_factors": tuple(factors),
"path_cost": contraction_tree_costs(tree),
"tree_slices": int(getattr(tree, "multiplicity", 1)),
"slice_assignment": "local",
"rank_slices": [int(getattr(tree, "multiplicity", 1))],
"search_workers": search_workers,
"search_repeats": search_repeats,
"search_time": search_time,
"search_backend": search_backend or method,
"dask_address": dask_address,
"numa_domain": getattr(self, "numa_domain", None),
}
)
total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
continue
arrays = self._term_arrays(tn, backend)
value, stats = parallel_contract(
tree,
arrays,
method="mpi",
comm=comm,
return_stats=True,
)
gathered_stats = comm.gather(stats, root=0)
if rank == 0:
self.parallel_stats.append(
{
"term_factors": tuple(factors),
"path_cost": contraction_tree_costs(tree),
"tree_slices": stats.nslices,
"slice_assignment": stats.assignment,
"rank_slices": [
item.local_slices for item in gathered_stats
],
"search_workers": search_workers,
"search_repeats": search_repeats,
"search_time": search_time,
"search_backend": search_backend or method,
"dask_address": dask_address,
"numa_domain": getattr(self, "numa_domain", None),
}
)
total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
return np.nan if rank != 0 else float(np.real(total_value))
def _contract_term_unsliced(self, tn, tree, backend):
if backend.backend == "torch":
import torch
from qibotn.backends.quimb import _torch_cpu_array
for tensor in tn.tensors:
tensor._data = _torch_cpu_array(tensor._data, dtype=torch.complex128)
return tn.contract(all, output_inds=(), optimize=tree, backend="torch")
return tn.contract(
all,
output_inds=(),
optimize=tree,
backend=backend.backend,
)
def _mpi_slicing_opts(self, user_slicing_opts):
return None if user_slicing_opts is None else dict(user_slicing_opts)
def _term_arrays(self, tn, backend):
if backend.backend == "torch":
import torch
from qibotn.backends.quimb import _torch_cpu_array
return [
_torch_cpu_array(array, dtype=torch.complex128)
for array in tn.arrays
]
return [backend.engine.asarray(array) for array in tn.arrays]

View File

@@ -37,6 +37,8 @@ GATE_MAP = {
"measure": "measure",
}
PAULI_DENSE_MAX_QUBITS = 8
def _torch_cpu_array(data, dtype=None):
"""Convert array-like data to a contiguous CPU torch tensor."""
@@ -68,6 +70,81 @@ def _arrays_to_backend(arrays, backend, engine):
return [engine.asarray(array) for array in arrays]
def _pauli_term_to_dense_operator(factors):
op = None
where = []
for qubit, gate_name in factors:
pauli = qu.pauli(gate_name.lower())
op = pauli if op is None else op & pauli
where.append(qubit)
return op, tuple(where)
def pauli_product_expectation_tn(
quimb_circuit,
factors,
simplify_sequence="ADCRS",
simplify_atol=1e-12,
simplify_equalize_norms=True,
):
"""Build the scalar TN for ``<psi|P|psi>`` without dense Pauli strings."""
import numpy as np
op_by_site = {
int(qubit): qu.pauli(str(gate_name).lower())
for qubit, gate_name in factors
if str(gate_name).upper() != "I"
}
ket = quimb_circuit.get_psi_simplified(
seq=simplify_sequence,
atol=simplify_atol,
equalize_norms=simplify_equalize_norms,
)
bra = ket.conj().reindex(
{
quimb_circuit.ket_site_ind(qubit): quimb_circuit.bra_site_ind(qubit)
for qubit in range(quimb_circuit.N)
}
)
tn = bra | ket
identity = np.eye(2, dtype=complex)
for qubit in range(quimb_circuit.N):
data = op_by_site.get(qubit, identity)
tn |= qtn.Tensor(
data=data,
inds=(
quimb_circuit.bra_site_ind(qubit),
quimb_circuit.ket_site_ind(qubit),
),
)
tn.full_simplify_(
output_inds=(),
seq=simplify_sequence,
atol=simplify_atol,
equalize_norms=simplify_equalize_norms,
)
return tn
def pauli_product_expectation(
quimb_circuit,
factors,
backend,
optimize,
simplify_sequence="ADCRS",
simplify_atol=1e-12,
):
tn = pauli_product_expectation_tn(
quimb_circuit,
factors,
simplify_sequence=simplify_sequence,
simplify_atol=simplify_atol,
)
return tn.contract(all, output_inds=(), optimize=optimize, backend=backend)
def __init__(self, quimb_backend="numpy", contraction_optimizer="auto-hq"):
super(self.__class__, self).__init__()
@@ -340,6 +417,9 @@ def _qibo_circuit_to_quimb(
circ.apply_gate("CNOT", c, t)
continue
if quimb_gate_name is None:
if hasattr(gate, "matrix"):
circ.apply_gate_raw(gate.matrix(), getattr(gate, "qubits", ()))
continue
raise_error(ValueError, f"Gate {gate_name} not supported in Quimb backend.")
params = getattr(gate, "parameters", ())
@@ -426,20 +506,24 @@ def expectation(self, circuit, observable, parallel=None, parallel_opts=None):
exp_val = 0.0
for coeff, factors in all_terms:
op = None
where = []
for qubit, gate_name in factors:
p = qu.pauli(gate_name.lower())
op = p if op is None else op & p
where.append(qubit)
val = qc.local_expectation(
op, tuple(where),
backend=self.backend,
optimize=self.contractions_optimizer,
simplify_sequence="ADCRS",
simplify_atol=1e-12,
)
if len(factors) > PAULI_DENSE_MAX_QUBITS:
val = pauli_product_expectation(
qc,
factors,
backend=self.backend,
optimize=self.contractions_optimizer,
simplify_sequence="ADCRS",
simplify_atol=1e-12,
)
else:
op, where = _pauli_term_to_dense_operator(factors)
val = qc.local_expectation(
op, where,
backend=self.backend,
optimize=self.contractions_optimizer,
simplify_sequence="ADCRS",
simplify_atol=1e-12,
)
exp_val += coeff * val
return self.real(exp_val)
@@ -487,14 +571,11 @@ def _expectation_parallel(self, circuit, observable, method, opts):
my_exp = 0.0
for coeff, factors in my_terms:
op = None
where = []
for qubit, gate_name in factors:
p = qu.pauli(gate_name.lower())
op = p if op is None else op & p
where.append(qubit)
tn = qc.local_expectation(op, tuple(where), rehearse='tn')
if len(factors) > PAULI_DENSE_MAX_QUBITS:
tn = pauli_product_expectation_tn(qc, factors)
else:
op, where = _pauli_term_to_dense_operator(factors)
tn = qc.local_expectation(op, where, rehearse='tn')
tree = parallel_path_search(
tn, tn.outer_inds(),

View File

@@ -0,0 +1,250 @@
"""Vidal/TEBD fast-path backend with qmatchatea fallback.
This backend targets MPS-friendly one-dimensional circuits: one-qubit gates and
adjacent two-qubit gates, measured with Pauli-sum expectation values. Unsupported
features fall back to the qmatchatea backend so the public behavior remains
usable while the fast path is expanded.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
import numpy as np
from qibo.backends import NumpyBackend
from qibotn.backends.abstract import QibotnBackend
from qibotn.backends.qmatchatea import QMatchaTeaBackend
from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
from qibotn.backends.vidal_tebd import VidalTEBDExecutor, _gate_sites
from qibotn.observables import check_observable
def _symbolic_hamiltonian_to_pauli_terms(hamiltonian):
terms = []
factor_pattern = re.compile(r"([^\d]+)(\d+)")
for term in hamiltonian.terms:
ops = []
for factor in term.factors:
match = factor_pattern.match(str(factor))
if match is None:
raise ValueError(f"Unsupported observable factor {factor!r}.")
name = match.group(1).upper()
if name not in ("I", "X", "Y", "Z"):
raise ValueError(f"Unsupported observable operator {name!r}.")
if name != "I":
ops.append((name, int(match.group(2))))
terms.append((complex(term.coefficient), tuple(ops)))
return terms
def _unsupported_reason(circuit):
for gate in circuit.queue:
name = getattr(gate, "name", gate.__class__.__name__)
sites = _gate_sites(gate)
if not sites:
return f"gate {name} has no target qubits"
if len(sites) > 2:
return f"gate {name} acts on {len(sites)} qubits"
if len(sites) == 2 and abs(sites[0] - sites[1]) != 1:
return f"gate {name} is non-adjacent on qubits {sites}"
if not hasattr(gate, "matrix"):
return f"gate {name} does not expose a matrix"
return None
def _can_route_non_adjacent(circuit):
"""True if the circuit's only unsupported feature is non-adjacent 2Q gates.
SWAP routing can fix non-adjacent gates at compile time. Multi-qubit
gates and matrix-less gates are truly unsupported.
"""
for gate in circuit.queue:
sites = _gate_sites(gate)
if not sites:
return False
if len(sites) > 2:
return False
if not hasattr(gate, "matrix"):
return False
return True
@dataclass
class VidalBackend(QibotnBackend, NumpyBackend):
"""QiboTN backend using Vidal/TEBD when possible.
The fast path supports:
- one-qubit gates with ``gate.matrix()``;
- adjacent two-qubit gates with ``gate.matrix()``;
- Qibo ``SymbolicHamiltonian`` / qibotn dict Pauli-sum expectation values;
- MPI chain segmentation through ``mpi_approach="CT"``.
Unsupported operations are delegated to qmatchatea.
"""
def __init__(self):
super().__init__()
self.name = "qibotn"
self.platform = "vidal"
self.precision = "double"
self.last_truncation_error = 0.0
self.configure_tn_simulation()
def configure_tn_simulation(
self,
ansatz: str = "MPS",
max_bond_dimension: int = 10,
cut_ratio: float = 1e-9,
trunc_tracking_mode: str = "C",
svd_control: str = "E!",
ini_bond_dimension: int = 1,
tensor_module: str = "torch",
compile_circuit: bool = False,
cache_gate_tensors: bool = True,
track_memory: bool = False,
mpi_approach: str = "SR",
mpi_num_procs: int = 1,
mpi_where_barriers: int = -1,
mpi_isometrization: int = -1,
fallback: bool = True,
):
self.ansatz = ansatz
self.max_bond_dimension = max_bond_dimension
self.cut_ratio = cut_ratio
self.trunc_tracking_mode = trunc_tracking_mode
self.svd_control = svd_control
self.ini_bond_dimension = ini_bond_dimension
self.tensor_module = tensor_module
self.compile_circuit = compile_circuit
self.cache_gate_tensors = cache_gate_tensors
self.track_memory = track_memory
self.mpi_approach = mpi_approach.upper()
self.mpi_num_procs = mpi_num_procs
self.mpi_where_barriers = mpi_where_barriers
self.mpi_isometrization = mpi_isometrization
self.fallback = fallback
self._fallback_backend = None
def _setup_backend_specifics(self):
return None
def _qmatchatea_fallback(self):
if self._fallback_backend is None:
backend = QMatchaTeaBackend()
backend.configure_tn_simulation(
ansatz=self.ansatz,
max_bond_dimension=self.max_bond_dimension,
cut_ratio=self.cut_ratio,
trunc_tracking_mode=self.trunc_tracking_mode,
svd_control=self.svd_control,
ini_bond_dimension=self.ini_bond_dimension,
tensor_module=self.tensor_module,
compile_circuit=self.compile_circuit,
cache_gate_tensors=self.cache_gate_tensors,
track_memory=self.track_memory,
mpi_approach=self.mpi_approach,
mpi_num_procs=self.mpi_num_procs,
mpi_where_barriers=self.mpi_where_barriers,
mpi_isometrization=self.mpi_isometrization,
)
self._fallback_backend = backend
return self._fallback_backend
def _fallback_or_raise(self, reason):
if not self.fallback:
raise NotImplementedError(reason)
return self._qmatchatea_fallback()
def _run_fast_executor(self, circuit, compile_circuit=True):
if self.mpi_approach == "CT":
from mpi4py import MPI
executor = SegmentVidalMPIExecutor(
nqubits=circuit.nqubits,
max_bond=self.max_bond_dimension,
cut_ratio=self.cut_ratio,
tensor_module=self.tensor_module,
comm=MPI.COMM_WORLD,
)
else:
executor = VidalTEBDExecutor(
nqubits=circuit.nqubits,
max_bond=self.max_bond_dimension,
cut_ratio=self.cut_ratio,
tensor_module=self.tensor_module,
)
executor.run_circuit(circuit, compile_circuit=compile_circuit)
return executor
def expectation(self, circuit, observable, preprocess=True, compile_circuit=None):
if self.ansatz.upper() != "MPS":
backend = self._fallback_or_raise("VidalBackend supports only MPS.")
return backend.expectation(circuit, observable, preprocess, compile_circuit)
if compile_circuit is None:
compile_circuit = self.compile_circuit
reason = _unsupported_reason(circuit)
if reason is not None:
# Non-adjacent gates can be routed at compile time
if compile_circuit and _can_route_non_adjacent(circuit):
pass # proceed with Vidal + SWAP routing
else:
backend = self._fallback_or_raise(reason)
return backend.expectation(circuit, observable, preprocess, compile_circuit)
# preprocess=True requests qmatchatea-style transpilation to MPS topology.
# The fast path validates MPS-compatibility via _unsupported_reason above;
# if the circuit is already MPS-compatible, preprocessing is a no-op.
# If it isn't, _unsupported_reason already routed to fallback.
# So the fast path skips preprocessing — just warn once.
if preprocess:
import warnings
warnings.warn(
"VidalBackend fast path does not preprocess circuits. "
"If the circuit passes _unsupported_reason (all gates are "
"MPS-compatible), fast-path execution proceeds directly.",
stacklevel=2,
)
hamiltonian = check_observable(observable, circuit.nqubits)
try:
terms = _symbolic_hamiltonian_to_pauli_terms(hamiltonian)
except ValueError as exc:
backend = self._fallback_or_raise(str(exc))
return backend.expectation(circuit, observable, preprocess, compile_circuit)
executor = self._run_fast_executor(circuit, compile_circuit=compile_circuit)
self.last_truncation_error = float(executor.truncation_error)
if self.mpi_approach == "CT":
value = executor.expectation_pauli_sum_root(terms)
from qtealeaves.tooling.mpisupport import MPI
if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
return np.nan
return np.real(value)
return np.real(executor.expectation_pauli_sum(terms))
def execute_circuit(
self,
circuit,
initial_state=None,
nshots=None,
prob_type=None,
return_array=False,
**prob_kwargs,
):
backend = self._fallback_or_raise(
"VidalBackend.execute_circuit is delegated to qmatchatea."
)
return backend.execute_circuit(
circuit,
initial_state=initial_state,
nshots=nshots,
prob_type=prob_type,
return_array=return_array,
**prob_kwargs,
)

View File

@@ -0,0 +1,338 @@
"""Segmented MPI Vidal/TEBD executor.
Each rank owns a contiguous interval of sites. Gates fully inside an interval
are applied locally. Only two-site gates crossing a rank boundary communicate
the neighboring edge tensor and the resulting boundary update.
"""
from __future__ import annotations
import time
from dataclasses import dataclass
import numpy as np
from mpi4py import MPI
from qibotn.backends.vidal_tebd import (
_asarray,
_backend_module,
_build_theta_svd_matrix,
_disjoint_batches,
_fuse_one_site_blocks,
_gate_sites,
_is_two_qubit_batch,
_make_two_site_update,
_ones,
_route_non_adjacent_gates,
_svd,
_tensor_update_from_numpy,
_tensor_update_to_numpy,
_to_float,
_to_numpy,
_transpose,
VidalTEBDExecutor,
)
_EDGE_TAG = 1701
_UPDATE_TAG = 1702
def _partition_sites(nsites, nranks):
base = nsites // nranks
rem = nsites % nranks
starts = [0]
for rank in range(nranks):
starts.append(starts[-1] + base + int(rank < rem))
return starts
@dataclass
class SegmentVidalMPIExecutor:
nqubits: int
max_bond: int
comm: object
cut_ratio: float = 1e-12
tensor_module: str = "torch"
def __post_init__(self):
self.rank = self.comm.Get_rank()
self.size = self.comm.Get_size()
self.starts = _partition_sites(self.nqubits, self.size)
self.start = self.starts[self.rank]
self.end = self.starts[self.rank + 1]
if self.start == self.end:
raise ValueError("SegmentVidalMPIExecutor requires at least one site per rank.")
self.xp = _backend_module(self.tensor_module)
if self.xp is np:
self.dtype = np.complex128
self.device = None
else:
self.dtype = self.xp.complex128
self.device = self.xp.device("cpu")
self.gammas = {}
for site in range(self.start, self.end):
self.gammas[site] = _asarray(
self.xp, [[[1.0 + 0.0j], [0.0 + 0.0j]]], self.dtype
)
self.lambdas = {
bond: _ones(self.xp, 1, self.dtype, self.device)
for bond in range(self.start, self.end + 1)
}
self._accumulated_truncation_error = 0.0
@property
def truncation_error(self):
return self._accumulated_truncation_error
def owns_site(self, site):
return self.start <= site < self.end
def owner_of(self, site):
return int(np.searchsorted(self.starts, site, side="right") - 1)
def run_circuit(self, circuit, compile_circuit=True):
timings = {
"local_compute": 0.0,
"edge_exchange": 0.0,
"boundary_compute": 0.0,
"boundary_update": 0.0,
"one_site": 0.0,
"gather": 0.0,
}
gates = circuit.queue
if compile_circuit:
gates = _route_non_adjacent_gates(gates, circuit.nqubits)
gates = _fuse_one_site_blocks(gates)
for batch in _disjoint_batches(gates):
if _is_two_qubit_batch(batch):
self._apply_two_site_batch(batch, timings)
else:
tic = time.perf_counter()
for gate in batch:
sites = _gate_sites(gate)
if len(sites) == 1 and self.owns_site(sites[0]):
op = _asarray(self.xp, gate.matrix(), self.dtype)
self.apply_one_site(op, sites[0])
elif len(sites) == 2:
self._apply_two_site_batch([gate], timings)
elif len(sites) > 2:
raise NotImplementedError("Only one- and two-qubit gates are supported.")
timings["one_site"] += time.perf_counter() - tic
return timings
def apply_one_site(self, op, pos):
self.gammas[pos] = self.xp.einsum("st,atb->asb", op, self.gammas[pos])
def _apply_two_site_batch(self, batch, timings):
local_gates = []
boundary_specs = []
recv_left_update = False
for gate in batch:
sites = _gate_sites(gate)
if abs(sites[0] - sites[1]) != 1:
raise NotImplementedError("Segment Vidal supports adjacent two-qubit gates only.")
left, right = sorted(sites)
left_owner = self.owner_of(left)
right_owner = self.owner_of(right)
if left_owner == self.rank and right_owner == self.rank:
local_gates.append(gate)
elif left_owner == self.rank:
boundary_specs.append((gate, left, right))
elif right_owner == self.rank:
recv_left_update = True
tic = time.perf_counter()
edge_send_req = None
if recv_left_update:
edge_send_req = self.comm.isend(
self._edge_payload(), dest=self.rank - 1, tag=_EDGE_TAG
)
right_edge = (
self.comm.recv(source=self.rank + 1, tag=_EDGE_TAG)
if boundary_specs
else None
)
timings["edge_exchange"] += time.perf_counter() - tic
boundary_update = None
tic = time.perf_counter()
for gate, left, right in boundary_specs:
boundary_update = self._compute_boundary_update(
gate, left, right, right_edge
)
timings["boundary_compute"] += time.perf_counter() - tic
tic = time.perf_counter()
update_send_req = None
if boundary_update is not None:
update_send_req = self.comm.isend(
boundary_update, dest=self.rank + 1, tag=_UPDATE_TAG
)
timings["boundary_update"] += time.perf_counter() - tic
tic = time.perf_counter()
local_items = [
self._compute_owned_two_site_update(gate)
for gate in local_gates
]
timings["local_compute"] += time.perf_counter() - tic
tic = time.perf_counter()
left_boundary_update = (
self.comm.recv(source=self.rank - 1, tag=_UPDATE_TAG)
if recv_left_update
else None
)
if update_send_req is not None:
update_send_req.wait()
if edge_send_req is not None:
edge_send_req.wait()
timings["boundary_update"] += time.perf_counter() - tic
for update in local_items:
self._install_update(update)
if boundary_update is not None:
self._install_update(boundary_update)
if left_boundary_update is not None:
self._install_update(left_boundary_update)
def _edge_payload(self):
return {
"start": self.start,
"end": self.end,
"gamma_start": _to_numpy(self.gammas[self.start]),
"lambda_after_start": _to_numpy(self.lambdas[self.start + 1]),
}
def _compute_owned_two_site_update(self, gate):
sites = _gate_sites(gate)
op = _asarray(self.xp, gate.matrix(), self.dtype)
left, right = sites
if left > right:
left, right = right, left
op = _transpose(self.xp, op.reshape(2, 2, 2, 2), (1, 0, 3, 2)).reshape(4, 4)
item = self._build_item(
left,
op,
self.lambdas[left],
self.lambdas[left + 1],
self.lambdas[left + 2],
self.gammas[left],
self.gammas[right],
)
split = _svd(self.xp, item["matrix"])
return _make_two_site_update(
item, *split, self.max_bond, self.cut_ratio, self.xp
)
def _compute_boundary_update(self, gate, left, right, remote):
op = _asarray(self.xp, gate.matrix(), self.dtype)
sites = _gate_sites(gate)
if sites[0] > sites[1]:
op = _transpose(self.xp, op.reshape(2, 2, 2, 2), (1, 0, 3, 2)).reshape(4, 4)
gamma_right = _asarray(self.xp, remote["gamma_start"], self.dtype)
lam_right = _asarray(
self.xp,
remote["lambda_after_start"],
self.xp.float64 if self.xp is not np else np.float64,
)
item = self._build_item(
left,
op,
self.lambdas[left],
self.lambdas[left + 1],
lam_right,
self.gammas[left],
gamma_right,
)
split = _svd(self.xp, item["matrix"])
return _tensor_update_to_numpy(
_make_two_site_update(
item, *split, self.max_bond, self.cut_ratio, self.xp
)
)
def _build_item(self, site, op, lam_left, lam_mid, lam_right, gamma_left, gamma_right):
result = _build_theta_svd_matrix(
op, self.xp, lam_left, lam_mid, lam_right, gamma_left, gamma_right
)
result["site"] = site
result["lam_left"] = lam_left
result["lam_right"] = lam_right
return result
def _install_update(self, update):
if isinstance(update["left"], np.ndarray):
update = _tensor_update_from_numpy(self.xp, update, self.dtype)
self._accumulated_truncation_error += update.get("truncation_error", 0.0)
site = update["site"]
if self.owns_site(site):
self.gammas[site] = update["left"]
if self.owns_site(site + 1):
self.gammas[site + 1] = update["right"]
if self.start <= site + 1 <= self.end:
self.lambdas[site + 1] = update["lambda"]
def gather_full_state(self):
payload = {
"start": self.start,
"end": self.end,
"gammas": {site: _to_numpy(tensor) for site, tensor in self.gammas.items()},
"lambdas": {bond: _to_numpy(tensor) for bond, tensor in self.lambdas.items()},
}
return self.comm.gather(payload, root=0)
def expectation_pauli_sum_root(self, terms):
payloads = self.gather_full_state()
if self.rank != 0:
return None
full = VidalTEBDExecutor(
self.nqubits,
self.max_bond,
cut_ratio=self.cut_ratio,
tensor_module=self.tensor_module,
)
for payload in payloads:
for site, tensor in payload["gammas"].items():
full.gammas[int(site)] = _asarray(self.xp, tensor, self.dtype)
for bond, tensor in payload["lambdas"].items():
full.lambdas[int(bond)] = _asarray(
self.xp,
tensor,
self.xp.float64 if self.xp is not np else np.float64,
)
return full.expectation_pauli_sum(terms)
def expectation_ring_xz_root(self):
terms = [
(0.5, (("X", site), ("Z", (site + 1) % self.nqubits)))
for site in range(self.nqubits)
]
return self.expectation_pauli_sum_root(terms)
def run_segment_vidal_mpi_ring_xz(
circuit,
max_bond,
comm,
cut_ratio=1e-12,
tensor_module="torch",
):
executor = SegmentVidalMPIExecutor(
nqubits=circuit.nqubits,
max_bond=max_bond,
cut_ratio=cut_ratio,
tensor_module=tensor_module,
comm=comm,
)
timings = executor.run_circuit(circuit)
tic = time.perf_counter()
value = executor.expectation_ring_xz_root()
timings["gather"] = time.perf_counter() - tic
return value, timings

View File

@@ -62,6 +62,41 @@ def _to_float(x):
return float(x)
def _to_numpy(tensor):
if hasattr(tensor, "detach"):
return tensor.detach().cpu().numpy()
return np.asarray(tensor)
def _tensor_update_to_numpy(update):
result = {
"site": int(update["site"]),
"left": _to_numpy(update["left"]),
"right": _to_numpy(update["right"]),
"lambda": _to_numpy(update["lambda"]),
}
if "truncation_error" in update:
result["truncation_error"] = float(update["truncation_error"])
return result
def _tensor_update_from_numpy(xp, update, dtype):
if xp is np:
return update
result = {
"site": update["site"],
"left": _asarray(xp, update["left"], dtype),
"right": _asarray(xp, update["right"], dtype),
"lambda": xp.as_tensor(
update["lambda"],
dtype=xp.float64 if dtype == xp.complex128 else xp.float32,
),
}
if "truncation_error" in update:
result["truncation_error"] = float(update["truncation_error"])
return result
def _svd(xp, matrix):
return _svd_eigh(xp, matrix)
@@ -92,32 +127,6 @@ def _svd_eigh(xp, matrix):
return umat, singvals, _conj(xp, eigvecs).T
def _batched_svd_eigh(xp, matrices):
"""Batched EVD SVD for a stack of same-shape torch matrices."""
m_dim, n_dim = matrices.shape[-2:]
if m_dim <= n_dim:
grams = matrices @ _conj(xp, matrices).transpose(-1, -2)
eigvals, eigvecs = xp.linalg.eigh(grams)
idx = xp.argsort(eigvals, dim=-1, descending=True)
eigvals = xp.gather(eigvals, -1, idx)
eigvecs = xp.gather(eigvecs, -1, idx.unsqueeze(-2).expand_as(eigvecs))
singvals = _sqrt_clamped(xp, eigvals)
inv_s = _safe_inverse(xp, singvals)
vhs = (eigvecs.conj().transpose(-1, -2) @ matrices) * inv_s.unsqueeze(-1)
return eigvecs, singvals, vhs
grams = _conj(xp, matrices).transpose(-1, -2) @ matrices
eigvals, eigvecs = xp.linalg.eigh(grams)
idx = xp.argsort(eigvals, dim=-1, descending=True)
eigvals = xp.gather(eigvals, -1, idx)
eigvecs = xp.gather(eigvecs, -1, idx.unsqueeze(-2).expand_as(eigvecs))
singvals = _sqrt_clamped(xp, eigvals)
inv_s = _safe_inverse(xp, singvals)
umats = (matrices @ eigvecs) * inv_s.unsqueeze(-2)
return umats, singvals, eigvecs.conj().transpose(-1, -2)
def _eigh(xp, matrix):
if xp is np:
return np.linalg.eigh(matrix)
@@ -126,10 +135,8 @@ def _eigh(xp, matrix):
def _sort_eigh_desc(xp, eigvals, eigvecs):
if xp is np:
idx = np.argsort(eigvals)[::-1].copy()
return eigvals[idx], eigvecs[:, idx]
idx = xp.argsort(eigvals, descending=True)
return eigvals[idx], eigvecs[:, idx]
return eigvals[::-1].copy(), eigvecs[:, ::-1].copy()
return xp.flip(eigvals, dims=(0,)), xp.flip(eigvecs, dims=(1,))
def _sqrt_clamped(xp, eigvals):
@@ -150,8 +157,6 @@ class VidalTEBDExecutor:
max_bond: int
cut_ratio: float = 1e-12
tensor_module: str = "torch"
workers: int = 1
use_batched: bool = False
def __post_init__(self):
self.xp = _backend_module(self.tensor_module)
@@ -169,19 +174,20 @@ class VidalTEBDExecutor:
self.lambdas = [
_ones(self.xp, 1, self.dtype, self.device) for _ in range(self.nqubits + 1)
]
self._accumulated_truncation_error = 0.0
def run_circuit(self, circuit):
for batch in _disjoint_batches(circuit.queue):
if (
self.use_batched
and _is_two_qubit_batch(batch)
and self.workers > 1
and len(batch) > 1
):
self.apply_two_site_batch(batch)
else:
for gate in batch:
self._apply_gate(gate)
def run_circuit(self, circuit, compile_circuit=True):
gates = circuit.queue
if compile_circuit:
gates = _route_non_adjacent_gates(gates, circuit.nqubits)
gates = _fuse_one_site_blocks(gates)
for batch in _disjoint_batches(gates):
for gate in batch:
self._apply_gate(gate)
@property
def truncation_error(self):
return self._accumulated_truncation_error
def _apply_gate(self, gate):
sites = _gate_sites(gate)
@@ -204,40 +210,6 @@ class VidalTEBDExecutor:
umat, singvals, vh = _svd(self.xp, item["matrix"])
self._install_two_site_split(item, umat, singvals, vh)
def apply_two_site_batch(self, batch):
items = []
for gate in batch:
sites = _gate_sites(gate)
op = _asarray(self.xp, gate.matrix(), self.dtype)
items.append(self._build_two_site_matrix(op, sites[0], sites[1]))
if self.xp is not np:
grouped = {}
for idx, item in enumerate(items):
grouped.setdefault(tuple(item["matrix"].shape), []).append(idx)
for indices in grouped.values():
if len(indices) == 1:
item = items[indices[0]]
umat, singvals, vh = _svd(self.xp, item["matrix"])
item["split"] = (umat, singvals, vh)
continue
matrices = self.xp.stack([items[idx]["matrix"] for idx in indices])
umats, singvals, vhs = _batched_svd_eigh(self.xp, matrices)
for out_idx, item_idx in enumerate(indices):
items[item_idx]["split"] = (
umats[out_idx],
singvals[out_idx],
vhs[out_idx],
)
else:
for item in items:
item["split"] = _svd(self.xp, item["matrix"])
for item in items:
self._install_two_site_split(item, *item["split"])
def _build_two_site_matrix(self, op, left_pos, right_pos):
if left_pos > right_pos:
left_pos, right_pos = right_pos, left_pos
@@ -246,101 +218,69 @@ class VidalTEBDExecutor:
)
i = left_pos
lam_left = self.lambdas[i]
lam_mid = self.lambdas[i + 1]
lam_right = self.lambdas[i + 2]
gamma_left = self.gammas[i]
gamma_right = self.gammas[i + 1]
theta = self.xp.einsum(
"a,asb,b,btc,c->astc",
lam_left,
gamma_left,
lam_mid,
gamma_right,
lam_right,
result = _build_theta_svd_matrix(
op, self.xp,
self.lambdas[i], self.lambdas[i + 1], self.lambdas[i + 2],
self.gammas[i], self.gammas[i + 1],
)
gate = op.reshape(2, 2, 2, 2)
theta = self.xp.einsum("uvst,astc->auvc", gate, theta)
chi_left = theta.shape[0]
chi_right = theta.shape[3]
matrix = theta.reshape(chi_left * 2, 2 * chi_right)
return {
"site": i,
"chi_left": chi_left,
"chi_right": chi_right,
"lam_left": lam_left,
"lam_right": lam_right,
"matrix": matrix,
}
result["site"] = i
result["lam_left"] = self.lambdas[i]
result["lam_right"] = self.lambdas[i + 2]
return result
def _install_two_site_split(self, item, umat, singvals, vh):
keep = self._choose_bond(singvals)
umat = umat[:, :keep]
kept = singvals[:keep]
cut = singvals[keep:]
vh = vh[:keep, :]
if cut.shape[0] > 0:
norm_kept = (kept * kept).sum()
norm_cut = (cut * cut).sum()
kept = kept / self.xp.sqrt(norm_kept / (norm_kept + norm_cut))
new_left = umat.reshape(item["chi_left"], 2, keep)
new_right = vh.reshape(keep, 2, item["chi_right"])
new_left = self._divide_left_lambda(new_left, item["lam_left"])
new_right = self._divide_right_lambda(new_right, item["lam_right"])
i = item["site"]
self.gammas[i] = new_left
self.gammas[i + 1] = new_right
self.lambdas[i + 1] = kept
def _choose_bond(self, singvals):
max_possible = int(singvals.shape[0])
keep = min(max_possible, self.max_bond)
if self.cut_ratio > 0 and max_possible > 0:
threshold = singvals[0] * self.cut_ratio
if self.xp is np:
ratio_keep = int(np.count_nonzero(singvals > threshold))
else:
ratio_keep = int((singvals > threshold).sum().detach().cpu().item())
keep = min(keep, max(1, ratio_keep))
return keep
def _divide_left_lambda(self, tensor, lambdas):
if self.xp is np:
safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
else:
safe = self.xp.where(
self.xp.abs(lambdas) > 1e-300,
lambdas,
self.xp.ones_like(lambdas),
)
return tensor / safe.reshape(-1, 1, 1)
def _divide_right_lambda(self, tensor, lambdas):
if self.xp is np:
safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
else:
safe = self.xp.where(
self.xp.abs(lambdas) > 1e-300,
lambdas,
self.xp.ones_like(lambdas),
)
return tensor / safe.reshape(1, 1, -1)
update = _make_two_site_update(item, umat, singvals, vh,
self.max_bond, self.cut_ratio, self.xp)
self._accumulated_truncation_error += update["truncation_error"]
i = update["site"]
self.gammas[i] = update["left"]
self.gammas[i + 1] = update["right"]
self.lambdas[i + 1] = update["lambda"]
def expectation_ring_xz(self):
xmat = _asarray(self.xp, [[0, 1], [1, 0]], self.dtype)
zmat = _asarray(self.xp, [[1, 0], [0, -1]], self.dtype)
value = 0.0
for site in range(self.nqubits - 1):
value += 0.5 * _to_float(self._expect_adjacent(site, xmat, zmat))
value += 0.5 * _to_float(
self.expect_product_operators({self.nqubits - 1: xmat, 0: zmat})
return self.expectation_pauli_sum(
[
(0.5, (("X", site), ("Z", (site + 1) % self.nqubits)))
for site in range(self.nqubits)
]
)
return value / self.norm()
def expectation_pauli_sum(self, terms):
paulis = {
"I": _eye(self.xp, 2, self.dtype, self.device),
"X": _asarray(self.xp, [[0, 1], [1, 0]], self.dtype),
"Y": _asarray(self.xp, [[0, -1j], [1j, 0]], self.dtype),
"Z": _asarray(self.xp, [[1, 0], [0, -1]], self.dtype),
}
value = 0.0
norm = self.norm()
for coeff, ops in terms:
ops = tuple((name.upper(), site) for name, site in ops)
if len(ops) == 0:
term_value = norm
elif len(ops) == 1:
name, site = ops[0]
term_value = _to_float(self._expect_one_site(site, paulis[name]))
elif len(ops) == 2 and abs(ops[0][1] - ops[1][1]) == 1:
(name0, site0), (name1, site1) = sorted(ops, key=lambda item: item[1])
term_value = _to_float(
self._expect_adjacent(site0, paulis[name0], paulis[name1])
)
else:
operators = {site: paulis[name] for name, site in ops}
term_value = _to_float(self.expect_product_operators(operators))
value += float(np.real(coeff)) * term_value
return value / norm
def _expect_one_site(self, site, op):
theta = self.xp.einsum(
"a,asb,b->asb",
self.lambdas[site],
self.gammas[site],
self.lambdas[site + 1],
)
op_theta = self.xp.einsum("us,asb->aub", op, theta)
return _vdot_real(self.xp, theta, op_theta)
def _expect_adjacent(self, site, op_left, op_right):
theta = self.xp.einsum(
@@ -369,6 +309,79 @@ class VidalTEBDExecutor:
return _to_float(self.expect_product_operators({}))
def _build_theta_svd_matrix(op, xp, lam_left, lam_mid, lam_right, gamma_left, gamma_right):
"""Merge and apply a two-site gate, returning the SVD-ready matrix."""
theta = xp.einsum(
"a,asb,b,btc,c->astc",
lam_left, gamma_left, lam_mid, gamma_right, lam_right,
)
gate = op.reshape(2, 2, 2, 2)
theta = xp.einsum("uvst,astc->auvc", gate, theta)
chi_left = theta.shape[0]
chi_right = theta.shape[3]
return {
"chi_left": chi_left,
"chi_right": chi_right,
"matrix": theta.reshape(chi_left * 2, 2 * chi_right),
}
def _choose_bond(singvals, max_bond, cut_ratio, xp):
max_possible = int(singvals.shape[0])
keep = min(max_possible, max_bond)
if cut_ratio > 0 and max_possible > 0:
threshold = singvals[0] * cut_ratio
if xp is np:
ratio_keep = int(np.count_nonzero(singvals > threshold))
else:
ratio_keep = int((singvals > threshold).sum().detach().cpu().item())
keep = min(keep, max(1, ratio_keep))
return keep
def _divide_left_lambda(tensor, lambdas, xp):
if xp is np:
safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
else:
safe = xp.where(xp.abs(lambdas) > 1e-300, lambdas, xp.ones_like(lambdas))
return tensor / safe.reshape(-1, 1, 1)
def _divide_right_lambda(tensor, lambdas, xp):
if xp is np:
safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
else:
safe = xp.where(xp.abs(lambdas) > 1e-300, lambdas, xp.ones_like(lambdas))
return tensor / safe.reshape(1, 1, -1)
def _make_two_site_update(item, umat, singvals, vh, max_bond, cut_ratio, xp):
keep = _choose_bond(singvals, max_bond, cut_ratio, xp)
umat = umat[:, :keep]
kept = singvals[:keep]
cut = singvals[keep:]
vh = vh[:keep, :]
discarded_weight = 0.0
if cut.shape[0] > 0:
norm_kept = (kept * kept).sum()
norm_cut = (cut * cut).sum()
discarded_weight = float(_to_float(norm_cut))
kept = kept / xp.sqrt(norm_kept / (norm_kept + norm_cut))
new_left = umat.reshape(item["chi_left"], 2, keep)
new_right = vh.reshape(keep, 2, item["chi_right"])
new_left = _divide_left_lambda(new_left, item["lam_left"], xp)
new_right = _divide_right_lambda(new_right, item["lam_right"], xp)
return {
"site": item["site"],
"left": new_left,
"right": new_right,
"lambda": kept,
"truncation_error": discarded_weight,
}
def _gate_sites(gate):
controls = tuple(getattr(gate, "control_qubits", ()))
targets = tuple(getattr(gate, "target_qubits", ()))
@@ -377,19 +390,90 @@ def _gate_sites(gate):
return targets
# ── SWAP routing for non-adjacent two-qubit gates ──────────────────────
class _SWAPGate:
"""Minimal SWAP gate wrapper for routing non-adjacent gates."""
name = "swap"
control_qubits = ()
def __init__(self, left, right):
self.target_qubits = (left, right)
def matrix(self):
return np.array(
[[1, 0, 0, 0],
[0, 0, 1, 0],
[0, 1, 0, 0],
[0, 0, 0, 1]],
dtype=complex,
)
class _RoutedTwoQubitGate:
"""Wraps a two-qubit gate with remapped physical sites after SWAP routing."""
name = "routed_two_qubit"
control_qubits = ()
def __init__(self, original_gate, left_phys, right_phys):
self.target_qubits = (left_phys, right_phys)
self._matrix = original_gate.matrix()
def matrix(self):
return self._matrix
def _route_non_adjacent_gates(gates, nqubits):
"""Insert SWAP networks to make all two-qubit gates adjacent.
For each non-adjacent two-qubit gate, inserts SWAP gates to bring the
farther qubit adjacent, applies the original gate, then inserts reverse
SWAPs to restore the qubit ordering. The resulting gate sequence
contains only adjacent two-qubit gates and is safe for VidalTEBDExecutor.
"""
routed = []
for gate in gates:
sites = _gate_sites(gate)
if len(sites) <= 1:
routed.append(gate)
continue
left, right = sorted(sites)
if right - left == 1:
routed.append(gate)
continue
# Move qubit 'right' leftwards to sit at left+1
for pos in range(right - 1, left, -1):
routed.append(_SWAPGate(pos, pos + 1))
# Apply the original gate at the adjacent physical positions
routed.append(_RoutedTwoQubitGate(gate, left, left + 1))
# Reverse SWAPs to restore original ordering
for pos in range(left + 1, right):
routed.append(_SWAPGate(pos, pos + 1))
return routed
def _disjoint_batches(gates):
batches = []
current = []
touched = set()
current_arity = None
for gate in gates:
sites = _gate_sites(gate)
arity = len(sites)
site_set = set(sites)
if current and touched & site_set:
if current and (current_arity != arity or touched & site_set):
batches.append(current)
current = []
touched = set()
current_arity = None
current.append(gate)
touched |= site_set
current_arity = arity
if current:
batches.append(current)
return batches
@@ -399,21 +483,59 @@ def _is_two_qubit_batch(batch):
return batch and all(len(_gate_sites(gate)) == 2 for gate in batch)
class _FusedOneSiteGate:
name = "fused_one_site"
def __init__(self, site, matrix):
self.target_qubits = (site,)
self.control_qubits = ()
self._matrix = matrix
def matrix(self):
return self._matrix
def _fuse_one_site_blocks(gates):
fused = []
block = []
def flush_block():
nonlocal block
if not block:
return
per_site = {}
for gate in block:
site = _gate_sites(gate)[0]
mat = gate.matrix()
if site in per_site:
per_site[site] = mat @ per_site[site]
else:
per_site[site] = mat
for site in sorted(per_site):
fused.append(_FusedOneSiteGate(site, per_site[site]))
block = []
for gate in gates:
if len(_gate_sites(gate)) == 1:
block.append(gate)
continue
flush_block()
fused.append(gate)
flush_block()
return fused
def run_vidal_ring_xz(
circuit,
max_bond,
cut_ratio=1e-12,
tensor_module="torch",
workers=1,
use_batched=False,
):
executor = VidalTEBDExecutor(
nqubits=circuit.nqubits,
max_bond=max_bond,
cut_ratio=cut_ratio,
tensor_module=tensor_module,
workers=workers,
use_batched=use_batched,
)
executor.run_circuit(circuit)
return executor.expectation_ring_xz()

View File

@@ -0,0 +1,151 @@
"""Reusable benchmark circuits and observables for expectation runs."""
from __future__ import annotations
import math
import numpy as np
from qibo import Circuit, gates
CIRCUITS = (
"brickwall_cnot",
"reversed_cnot",
"shifted_cz",
"rxx_rzz",
"swap_scramble",
"ghz_ladder",
)
OBSERVABLES = (
"ring_xz",
"open_zz",
"mixed_local",
"range2_xx",
"long_z_string",
)
def parse_names(raw, valid, label):
if raw == ["all"]:
return list(valid)
unknown = sorted(set(raw) - set(valid))
if unknown:
raise ValueError(f"Unknown {label}: {', '.join(unknown)}")
return raw
def build_circuit(kind, nqubits, nlayers, seed):
rng = np.random.default_rng(seed)
circuit = Circuit(nqubits)
if kind == "ghz_ladder":
circuit.add(gates.H(0))
for qubit in range(nqubits - 1):
circuit.add(gates.CNOT(qubit, qubit + 1))
return circuit
for layer in range(nlayers):
for qubit in range(nqubits):
circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
if kind in ("rxx_rzz", "swap_scramble"):
circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi)))
if kind == "brickwall_cnot":
add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=False)
elif kind == "reversed_cnot":
add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=True)
elif kind == "shifted_cz":
for qubit in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.CZ(qubit, qubit + 1))
elif kind == "rxx_rzz":
for qubit in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
elif kind == "swap_scramble":
for qubit in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.CZ(qubit, qubit + 1))
if layer % 4 == 3:
circuit.add(gates.SWAP(qubit, qubit + 1))
else:
raise ValueError(f"Unknown circuit kind {kind!r}.")
return circuit
def add_brickwall(circuit, nqubits, gate, layer, reverse):
for qubit in range(0, nqubits - 1, 2):
if reverse and layer % 2:
circuit.add(gate(qubit + 1, qubit))
else:
circuit.add(gate(qubit, qubit + 1))
for qubit in range(1, nqubits - 1, 2):
if reverse and not layer % 2:
circuit.add(gate(qubit + 1, qubit))
else:
circuit.add(gate(qubit, qubit + 1))
def observable_terms(kind, nqubits):
if kind == "ring_xz":
return [
(0.5, (("X", site), ("Z", (site + 1) % nqubits)))
for site in range(nqubits)
]
if kind == "open_zz":
return [
(1.0 / (nqubits - 1), (("Z", site), ("Z", site + 1)))
for site in range(nqubits - 1)
]
if kind == "mixed_local":
terms = [(0.25, (("X", 0),)), (-0.5, (("Z", nqubits - 1),))]
terms += [
(0.125, (("Y", site), ("Y", site + 1)))
for site in range(0, nqubits - 1, 3)
]
return terms
if kind == "range2_xx":
return [
(1.0 / max(1, nqubits - 2), (("X", site), ("X", site + 2)))
for site in range(nqubits - 2)
]
if kind == "long_z_string":
stride = max(1, nqubits // 16)
return [(1.0, tuple(("Z", site) for site in range(0, nqubits, stride)))]
raise ValueError(f"Unknown observable kind {kind!r}.")
def terms_to_dict(terms):
return {
"terms": [
{
"coefficient": float(np.real(coeff)),
"operators": [(name, int(site)) for name, site in ops],
}
for coeff, ops in terms
]
}
def exact_pauli_sum(circuit, terms, nqubits):
state = circuit().state(numpy=True).reshape(-1)
indices = np.arange(state.size, dtype=np.int64)
value = 0.0 + 0.0j
for coeff, ops in terms:
flipped = indices.copy()
phase = np.ones(state.size, dtype=np.complex128)
for name, site in ops:
shift = nqubits - 1 - site
bit = (indices >> shift) & 1
if name == "X":
flipped ^= 1 << shift
elif name == "Y":
flipped ^= 1 << shift
phase *= 1j * (1 - 2 * bit)
elif name == "Z":
phase *= 1 - 2 * bit
elif name != "I":
raise ValueError(f"Unsupported Pauli {name!r}.")
value += coeff * np.vdot(state[flipped], phase * state)
return float(value.real)

View File

@@ -0,0 +1,71 @@
"""High-level CPU expectation runner used by CLI scripts."""
from __future__ import annotations
import time
from dataclasses import dataclass
import numpy as np
from qibo.backends import construct_backend
from qibotn.benchmark_cases import exact_pauli_sum
from qibotn.observables import check_observable
@dataclass
class ExpectationConfig:
ansatz: str = "tn"
mpi: bool = False
bond: int = 1024
cut_ratio: float = 1e-12
tensor_module: str = "torch"
torch_threads: int = 8
parallel_opts: dict | None = None
@dataclass
class ExpectationResult:
value: float
seconds: float
rank: int = 0
def exact_for_observable(circuit, observable, nqubits):
if isinstance(observable, dict) and "terms" in observable:
terms = [
(
term["coefficient"],
tuple((name, site) for name, site in term["operators"]),
)
for term in observable["terms"]
]
return exact_pauli_sum(circuit, terms, nqubits)
hamiltonian = check_observable(observable, nqubits)
return float(hamiltonian.expectation_from_state(circuit().state(numpy=True)).real)
def run_cpu_expectation(circuit, observable, config):
runcard = {
"MPI_enabled": config.mpi,
"MPS_enabled": config.ansatz.lower() == "mps",
"NCCL_enabled": False,
"expectation_enabled": observable,
"max_bond_dimension": config.bond,
"cut_ratio": config.cut_ratio,
"tensor_module": config.tensor_module,
"torch_threads": config.torch_threads,
"parallel_opts": config.parallel_opts or {},
}
backend = construct_backend(
backend="qibotn",
platform="cpu",
runcard=runcard,
)
start = time.perf_counter()
value = backend.execute_circuit(circuit)[0]
elapsed = time.perf_counter() - start
rank = getattr(backend, "rank", 0)
return ExpectationResult(float(np.real(value)), elapsed, rank=rank)

View File

@@ -29,6 +29,11 @@ def build_observable(circuit_nqubit):
def create_hamiltonian_from_dict(data, circuit_nqubit):
"""Create a Qibo SymbolicHamiltonian from the qibotn dict representation."""
if "pauli_string_pattern" in data:
return create_hamiltonian_from_pauli_pattern(
data["pauli_string_pattern"], circuit_nqubit
)
pauli_gates = {"X": X, "Y": Y, "Z": Z}
terms = []
@@ -54,6 +59,38 @@ def create_hamiltonian_from_dict(data, circuit_nqubit):
return hamiltonians.SymbolicHamiltonian(sum(terms))
def create_hamiltonian_from_pauli_pattern(pattern, circuit_nqubit):
"""Create a single Pauli-string Hamiltonian by repeating ``pattern``.
Example: pattern ``"IXZ"`` on 5 qubits becomes ``I0 * X1 * Z2 * I3 * X4``.
Identity factors are omitted except for the all-identity case.
"""
if not isinstance(pattern, str) or not pattern:
raise ValueError("pauli_string_pattern must be a non-empty string.")
pauli_gates = {"X": X, "Y": Y, "Z": Z}
pattern = pattern.upper()
invalid = sorted(set(pattern) - {"I", "X", "Y", "Z"})
if invalid:
raise ValueError(
"pauli_string_pattern characters must be one of I/X/Y/Z; "
f"got {''.join(invalid)!r}."
)
expr = None
for qubit in range(circuit_nqubit):
name = pattern[qubit % len(pattern)]
if name == "I":
continue
factor = pauli_gates[name](qubit)
expr = factor if expr is None else expr * factor
if expr is None:
expr = I(0)
return hamiltonians.SymbolicHamiltonian(form=expr)
def build_random_circuit(nqubits, nlayers, seed=42):
"""Build a random circuit with RY+RZ+CNOT layers for benchmarks."""
import numpy as np

View File

@@ -2,8 +2,11 @@
import os
import pickle
import signal
import time
from math import log2, log10
import numpy as np
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass
from concurrent.futures import ProcessPoolExecutor, TimeoutError, as_completed
try:
from mpi4py import MPI
@@ -13,25 +16,59 @@ except ImportError:
MPI = None
def _run_single_trial(tn_bytes, output_inds, seed, slicing_opts):
SEARCH_METHODS = ("greedy", "kahypar", "kahypar-agglom", "spinglass")
def _search_chunk(
tn_bytes,
output_inds,
repeats,
seed,
max_time,
slicing_opts,
optlib=None,
):
import random, cotengra as ctg
random.seed(seed)
tn = pickle.loads(tn_bytes)
kwargs = {}
if optlib is not None:
kwargs["optlib"] = optlib
opt = ctg.HyperOptimizer(
methods=["kahypar", "kahypar-agglom", "spinglass"],
max_repeats=1,
methods=SEARCH_METHODS,
max_repeats=repeats,
max_time=max_time,
parallel=False,
minimize="combo-256",
optlib="random",
slicing_opts=slicing_opts,
progbar=False,
**kwargs,
)
tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
return tree.combo_cost(factor=256), tree
def _run_single_trial(tn_bytes, output_inds, seed, slicing_opts):
return _search_chunk(
tn_bytes,
output_inds,
repeats=1,
seed=seed,
max_time=None,
slicing_opts=slicing_opts,
optlib="random",
)
def _kill_pool(pool):
for pid in list(pool._processes.keys()):
processes = getattr(pool, "_processes", None)
if processes:
pids = list(processes.keys())
else:
pids = []
for pid in pids:
try:
os.kill(pid, signal.SIGKILL)
except ProcessLookupError:
@@ -43,21 +80,14 @@ def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts=
import time
if trial_timeout is None:
import random, cotengra as ctg
random.seed(seed)
tn = pickle.loads(tn_bytes)
opt = ctg.HyperOptimizer(
methods=["kahypar", "kahypar-agglom", "spinglass"],
max_repeats=repeats,
parallel=False,
minimize="combo-256",
return _search_chunk(
tn_bytes,
output_inds,
repeats=repeats,
seed=seed,
max_time=max_time,
optlib="random",
slicing_opts=slicing_opts,
progbar=False,
)
tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
return tree.combo_cost(factor=256), tree
deadline = time.time() + max_time
best_cost, best_tree = float("inf"), None
@@ -80,17 +110,36 @@ def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts=
return best_cost, best_tree
def _split_repeats(total_repeats, n_workers):
n_workers = max(1, int(n_workers))
total_repeats = max(1, int(total_repeats))
chunk, extra = divmod(total_repeats, n_workers)
return [chunk + (1 if i < extra else 0) for i in range(n_workers) if chunk + (1 if i < extra else 0) > 0]
def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts=None, trial_timeout=None):
tn_bytes = pickle.dumps(tn)
repeats_per = max(1, total_repeats // n_workers)
pool = ProcessPoolExecutor(max_workers=n_workers)
futures = [
pool.submit(_serial_search, tn_bytes, output_inds, repeats_per, seed, max_time, slicing_opts, trial_timeout)
for seed in range(n_workers)
]
repeat_chunks = _split_repeats(total_repeats, n_workers)
pool = ProcessPoolExecutor(max_workers=len(repeat_chunks))
futures = []
for seed, repeats in enumerate(repeat_chunks):
futures.append(
pool.submit(
_serial_search,
tn_bytes,
output_inds,
repeats,
seed,
max_time,
slicing_opts,
trial_timeout,
)
)
best_cost, best_tree = float("inf"), None
deadline = time.monotonic() + max_time if max_time is not None else None
try:
for fut in as_completed(futures, timeout=max_time + 5):
timeout = None if deadline is None else max(0.0, deadline - time.monotonic())
for fut in as_completed(futures, timeout=timeout):
try:
cost, tree = fut.result()
if cost < best_cost:
@@ -106,21 +155,133 @@ def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, sli
return best_tree
def _mpi_search(tn, output_inds, total_repeats, max_time, n_workers=None, slicing_opts=None, trial_timeout=None):
def _dask_search(
tn,
output_inds,
total_repeats,
max_time,
slicing_opts=None,
dask_address=None,
n_workers=None,
optlib=None,
):
"""Run one centralized cotengra hyper-optimizer over a dask pool.
With ``dask_address`` this connects to an external distributed scheduler.
Without it, a local dask cluster is created for single-node smoke testing.
"""
try:
from distributed import Client, LocalCluster, get_client
except ImportError as exc:
raise ImportError(
"Dask search requires `distributed`. Install it with "
"`pip install distributed` or the package extra that provides it."
) from exc
import cotengra as ctg
close_client = False
close_cluster = False
cluster = None
if dask_address:
client = Client(dask_address)
close_client = True
else:
try:
client = get_client()
except ValueError:
cluster = LocalCluster(
n_workers=max(1, int(n_workers or os.cpu_count() or 1)),
threads_per_worker=1,
processes=True,
memory_limit=0,
)
client = Client(cluster)
close_client = True
close_cluster = True
kwargs = {}
if optlib is not None:
kwargs["optlib"] = optlib
try:
opt = ctg.HyperOptimizer(
methods=SEARCH_METHODS,
max_repeats=total_repeats,
max_time=max_time,
parallel=client,
minimize="combo-256",
slicing_opts=slicing_opts,
progbar=False,
**kwargs,
)
return tn.contraction_tree(optimize=opt, output_inds=output_inds)
finally:
if close_client:
client.close()
if close_cluster:
cluster.close()
def _mpi_search(
tn,
output_inds,
total_repeats,
max_time,
n_workers=None,
slicing_opts=None,
trial_timeout=None,
search_backend="processpool",
dask_address=None,
):
comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()
tn_bytes = pickle.dumps(tn)
search_backend = search_backend or "processpool"
if search_backend == "dask":
if not dask_address:
raise ValueError(
"MPI + dask search requires an external dask scheduler. Start "
"dask-scheduler/dask-worker outside mpiexec and pass "
"`--dask-address tcp://host:8786`."
)
payload = None
if rank == 0:
try:
tree = _dask_search(
tn,
output_inds,
total_repeats,
max_time,
slicing_opts=slicing_opts,
dask_address=dask_address,
n_workers=n_workers,
)
payload = ("ok", tree)
except Exception as exc:
payload = ("error", repr(exc))
status, value = comm.bcast(payload, root=0)
if status == "error":
raise RuntimeError(f"Dask path search failed on rank 0: {value}")
return value
repeats_per = max(1, total_repeats // size)
if n_workers and n_workers > 1:
local_tree = _processpool_search(
tn, output_inds, repeats_per, n_workers, max_time, slicing_opts, trial_timeout
)
local_cost = local_tree.combo_cost(factor=256) if local_tree else float("inf")
else:
local_cost, local_tree = _serial_search(
tn_bytes, output_inds, repeats_per, rank, max_time, slicing_opts, trial_timeout
)
# Run search work in child processes even when n_workers == 1, so the parent
# MPI rank can enforce the global timeout by killing active trials.
local_tree = _processpool_search(
tn,
output_inds,
repeats_per,
max(1, n_workers or 1),
max_time,
slicing_opts,
trial_timeout,
)
local_cost = local_tree.combo_cost(factor=256) if local_tree else float("inf")
all_results = comm.gather((local_cost, local_tree), root=0)
best_tree = None
@@ -133,11 +294,13 @@ def _mpi_search(tn, output_inds, total_repeats, max_time, n_workers=None, slicin
def parallel_path_search(tn, output_inds, method='processpool', total_repeats=1024,
max_time=300, n_workers=48, slicing_opts=None, trial_timeout=None):
max_time=300, n_workers=48, slicing_opts=None,
trial_timeout=None, search_backend=None,
dask_address=None):
"""Parallel contraction path search.
Args:
method: 'processpool' | 'mpi' | 'serial'
method: 'processpool' | 'dask' | 'mpi' | 'serial'
total_repeats: Total optimization repeats across all workers
max_time: Global timeout per worker (seconds)
n_workers: Workers per MPI rank (or total for processpool)
@@ -151,45 +314,185 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
elif method == 'mpi':
if not _HAVE_MPI:
raise ImportError("mpi4py not available")
return _mpi_search(tn, output_inds, total_repeats, max_time, n_workers, slicing_opts, trial_timeout)
return _mpi_search(
tn,
output_inds,
total_repeats,
max_time,
n_workers,
slicing_opts,
trial_timeout,
search_backend=search_backend,
dask_address=dask_address,
)
elif method == 'processpool':
return _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts, trial_timeout)
elif method == 'dask':
return _dask_search(
tn,
output_inds,
total_repeats,
max_time,
slicing_opts=slicing_opts,
dask_address=dask_address,
n_workers=n_workers,
)
else:
raise ValueError(f"Unknown method: {method}")
def parallel_contract(tree, arrays, method='mpi', comm=None):
def contraction_tree_costs(tree, dtype_bytes=16, combo_factor=256):
"""Return comparable cost estimates for a cotengra contraction tree.
These values are estimates, not profiling results. They are the right first
signal for path quality: lower ``combo`` usually means lower CPU contraction
time, while ``peak_memory_gib`` estimates the largest intermediate tensor.
"""
stats = tree.contract_stats()
flops = float(stats["flops"])
write = float(stats["write"])
size = float(stats["size"])
combo = float(tree.combo_cost(factor=combo_factor))
nslices = int(getattr(tree, "multiplicity", 1))
original_flops = float(stats.get("original_flops", flops))
return {
"flops": flops,
"write": write,
"size": size,
"combo": combo,
"log10_flops": log10(flops) if flops > 0 else float("-inf"),
"log10_write": log10(write) if write > 0 else float("-inf"),
"log2_size": log2(size) if size > 0 else float("-inf"),
"log10_combo": log10(combo) if combo > 0 else float("-inf"),
"nslices": nslices,
"slicing_overhead": flops / original_flops if original_flops > 0 else float("nan"),
"peak_memory_gib": size * dtype_bytes / 1024**3,
}
@dataclass(frozen=True)
class SlicePlan:
"""Slice ownership for one MPI rank."""
rank: int
size: int
nslices: int
indices: tuple
assignment: str = "block"
@dataclass(frozen=True)
class SlicedContractStats:
"""Diagnostics for one sliced contraction."""
rank: int
size: int
nslices: int
local_slices: int
assignment: str
def mpi_slice_plan(nslices, rank, size, assignment="block"):
"""Return the contraction slice ids assigned to one MPI rank.
``block`` gives each rank a contiguous range, mirroring cutensornet's
slice-range style. ``cyclic`` gives rank ``r`` slices ``r, r + size, ...``,
which can balance better if individual slice costs vary.
"""
if nslices < 0:
raise ValueError("nslices must be non-negative.")
if size <= 0:
raise ValueError("size must be positive.")
if not 0 <= rank < size:
raise ValueError("rank must satisfy 0 <= rank < size.")
if assignment == "block":
chunk, extra = divmod(nslices, size)
start = rank * chunk + min(rank, extra)
stop = start + chunk + (1 if rank < extra else 0)
indices = tuple(range(start, stop))
elif assignment == "cyclic":
indices = tuple(range(rank, nslices, size))
else:
raise ValueError("assignment must be 'block' or 'cyclic'.")
return SlicePlan(rank, size, nslices, indices, assignment)
def _array_backend(arrays):
return "torch" if type(arrays[0]).__module__.startswith("torch") else "numpy"
def _to_numpy_vector(value, is_torch):
if is_torch:
return value.detach().cpu().numpy().reshape(-1)
return np.asarray(value).reshape(-1)
def _zero_vector_like(arrays):
return np.zeros(1, dtype=np.complex128)
def contract_tree_slices(tree, arrays, slice_indices, backend=None):
"""Contract a subset of cotengra slices and return their local sum."""
backend = backend or _array_backend(arrays)
is_torch = backend == "torch"
local = None
for slice_id in slice_indices:
value = tree.contract_slice(arrays, slice_id, backend=backend)
value = _to_numpy_vector(value, is_torch)
local = value if local is None else local + value
return _zero_vector_like(arrays) if local is None else local
def parallel_contract(
tree,
arrays,
method='mpi',
comm=None,
assignment="block",
return_stats=False,
):
if method == 'mpi':
if not _HAVE_MPI or comm is None:
raise ValueError("MPI method requires mpi4py and comm")
return _contract_mpi(tree, arrays, comm)
return _contract_mpi(
tree,
arrays,
comm,
assignment=assignment,
return_stats=return_stats,
)
raise ValueError(f"Unknown method: {method}")
def _contract_mpi(tree, arrays, comm, root=0):
def _contract_mpi(tree, arrays, comm, root=0, assignment="block", return_stats=False):
rank, size = comm.Get_rank(), comm.Get_size()
is_torch = type(arrays[0]).__module__.startswith("torch")
backend = _array_backend(arrays)
is_torch = backend == "torch"
nslices = int(getattr(tree, "multiplicity", 1))
stats = SlicedContractStats(rank, size, nslices, 0, assignment)
if is_torch:
result_torch = None
for i in range(rank, tree.multiplicity, size):
x = tree.contract_slice(arrays, i, backend="torch").reshape(-1)
result_torch = x if result_torch is None else result_torch + x
if not set(getattr(tree, "sliced_inds", ())).isdisjoint(set(getattr(tree, "output", ()))):
raise NotImplementedError(
"MPI sliced contraction currently requires sliced indices not to "
"appear in the output."
)
if result_torch is None:
result_np = np.zeros(1, dtype=np.complex128)
else:
result_np = result_torch.detach().cpu().numpy()
else:
result_np = None
for i in range(rank, tree.multiplicity, size):
x = tree.contract_slice(arrays, i)
x_np = np.asarray(x).reshape(-1)
result_np = x_np if result_np is None else result_np + x_np
if nslices <= 1:
if rank != root:
return (None, stats) if return_stats else None
result = tree.contract(arrays, backend=backend)
result = _to_numpy_vector(result, is_torch)
return (result, stats) if return_stats else result
if result_np is None:
result_np = np.zeros(1, dtype=np.complex128)
plan = mpi_slice_plan(nslices, rank, size, assignment=assignment)
local = contract_tree_slices(tree, arrays, plan.indices, backend=backend)
stats = SlicedContractStats(rank, size, nslices, len(plan.indices), assignment)
result = np.zeros_like(result_np) if rank == root else None
comm.Reduce(result_np, result, root=root)
return result
result = np.zeros_like(local) if rank == root else None
comm.Reduce(local, result, root=root)
return (result, stats) if return_stats else result

130
tests/test_cpu_backend.py Normal file
View File

@@ -0,0 +1,130 @@
import math
import numpy as np
from qibo import Circuit, gates, hamiltonians
from qibo.symbols import X, Z
from qibotn.backends.cpu import CpuTensorNet
from qibotn.benchmark_cases import build_circuit as build_benchmark_circuit
def build_circuit(nqubits=6):
circuit = Circuit(nqubits)
for qubit in range(nqubits):
circuit.add(gates.RY(qubit, theta=0.1 * (qubit + 1)))
circuit.add(gates.RZ(qubit, theta=-0.05 * (qubit + 1)))
for qubit in range(nqubits - 1):
circuit.add(gates.CNOT(qubit, qubit + 1))
return circuit
def build_observable(nqubits):
form = 0
for qubit in range(nqubits):
form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
return hamiltonians.SymbolicHamiltonian(form=form)
def test_cpu_generic_tn_expectation_matches_statevector():
circuit = build_circuit()
observable = build_observable(circuit.nqubits)
exact = observable.expectation_from_state(circuit().state(numpy=True))
backend = CpuTensorNet(
{
"MPI_enabled": False,
"MPS_enabled": False,
"NCCL_enabled": False,
"expectation_enabled": observable,
}
)
value = backend.execute_circuit(circuit)[0]
assert math.isclose(value, exact, abs_tol=1e-12)
def test_cpu_mps_expectation_matches_statevector():
circuit = build_circuit()
observable = build_observable(circuit.nqubits)
exact = observable.expectation_from_state(circuit().state(numpy=True))
backend = CpuTensorNet(
{
"MPI_enabled": False,
"MPS_enabled": True,
"NCCL_enabled": False,
"expectation_enabled": observable,
"max_bond_dimension": 64,
"tensor_module": "torch",
"torch_threads": 1,
}
)
value = backend.execute_circuit(circuit)[0]
assert math.isclose(value, exact, abs_tol=1e-12)
def test_cpu_runcard_pauli_pattern_matches_statevector():
circuit = build_circuit()
observable = {"pauli_string_pattern": "IXZ"}
exact_hamiltonian = hamiltonians.SymbolicHamiltonian(
form=X(1) * Z(2) * X(4) * Z(5)
)
exact = exact_hamiltonian.expectation_from_state(circuit().state(numpy=True))
for mps_enabled in (False, True):
backend = CpuTensorNet(
{
"MPI_enabled": False,
"MPS_enabled": mps_enabled,
"NCCL_enabled": False,
"expectation_enabled": observable,
"max_bond_dimension": 64,
"tensor_module": "torch",
"torch_threads": 1,
}
)
value = backend.execute_circuit(circuit)[0]
assert math.isclose(value, exact, abs_tol=1e-12)
def test_cpu_mps_sampling_uses_nshots():
circuit = Circuit(4)
circuit.add(gates.H(0))
for qubit in range(3):
circuit.add(gates.CNOT(qubit, qubit + 1))
backend = CpuTensorNet(
{
"MPI_enabled": False,
"MPS_enabled": True,
"NCCL_enabled": False,
"expectation_enabled": False,
}
)
result = backend.execute_circuit(circuit, nshots=100)
assert sum(result.frequencies().values()) == 100
assert set(result.frequencies()) <= {"0000", "1111"}
def test_cpu_generic_tn_long_pauli_string_matches_statevector():
circuit = build_benchmark_circuit("rxx_rzz", 10, 2, 42)
observable = {"pauli_string_pattern": "XZ"}
exact_hamiltonian = hamiltonians.SymbolicHamiltonian(
form=X(0) * Z(1) * X(2) * Z(3) * X(4) * Z(5) * X(6) * Z(7) * X(8) * Z(9)
)
exact = exact_hamiltonian.expectation_from_state(circuit().state(numpy=True))
backend = CpuTensorNet(
{
"MPI_enabled": False,
"MPS_enabled": False,
"NCCL_enabled": False,
"expectation_enabled": observable,
}
)
value = backend.execute_circuit(circuit)[0]
assert math.isclose(value, exact, abs_tol=1e-12)

46
tests/test_parallel.py Normal file
View File

@@ -0,0 +1,46 @@
import numpy as np
from qibotn.parallel import _split_repeats, contract_tree_slices, mpi_slice_plan
def test_mpi_slice_plan_block_balances_contiguous_ranges():
plans = [mpi_slice_plan(10, rank, 4, assignment="block") for rank in range(4)]
assert [plan.indices for plan in plans] == [
(0, 1, 2),
(3, 4, 5),
(6, 7),
(8, 9),
]
def test_mpi_slice_plan_cyclic_balances_round_robin():
plans = [mpi_slice_plan(10, rank, 4, assignment="cyclic") for rank in range(4)]
assert [plan.indices for plan in plans] == [
(0, 4, 8),
(1, 5, 9),
(2, 6),
(3, 7),
]
class DummyTree:
def contract_slice(self, arrays, i, backend=None):
return arrays[0] * (i + 1)
def test_contract_tree_slices_sums_numpy_slices():
result = contract_tree_slices(
DummyTree(),
[np.asarray([2.0 + 0.0j])],
(0, 2, 3),
backend="numpy",
)
np.testing.assert_allclose(result, np.asarray([16.0 + 0.0j]))
def test_split_repeats_balances_workers():
assert _split_repeats(10, 4) == [3, 3, 2, 2]
assert _split_repeats(2, 4) == [1, 1]

167
tests/test_vidal_backend.py Normal file
View File

@@ -0,0 +1,167 @@
import math
import numpy as np
from qibo import Circuit, gates, hamiltonians
from qibo.symbols import X, Y, Z
from qibotn.backends.vidal import VidalBackend, _can_route_non_adjacent, _unsupported_reason
from qibotn.backends.vidal_tebd import (
_route_non_adjacent_gates,
_gate_sites,
)
def build_local_circuit(nqubits=8, nlayers=3, seed=42):
rng = np.random.default_rng(seed)
circuit = Circuit(nqubits)
for layer in range(nlayers):
for q in range(nqubits):
circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
for q in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.CNOT(q, q + 1))
return circuit
def test_vidal_backend_expectation_matches_statevector():
circuit = build_local_circuit()
observable = hamiltonians.SymbolicHamiltonian(
form=0.5 * X(0) * Z(1) + 0.25 * Y(2) * Y(3) - 0.7 * Z(7)
)
exact = observable.expectation_from_state(circuit().state(numpy=True))
backend = VidalBackend()
backend.configure_tn_simulation(max_bond_dimension=128, tensor_module="torch")
value = backend.expectation(circuit, observable)
np.testing.assert_allclose(value, exact, atol=1e-12)
def test_vidal_backend_fallback_for_non_adjacent_gate():
"""compile_circuit=False (default) → falls back to qmatchatea for non-adjacent."""
circuit = Circuit(4)
circuit.add(gates.H(0))
circuit.add(gates.CNOT(0, 3))
observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
backend = VidalBackend()
backend.configure_tn_simulation(max_bond_dimension=32, tensor_module="torch")
value = backend.expectation(circuit, observable)
exact = observable.expectation_from_state(circuit().state(numpy=True))
np.testing.assert_allclose(value, exact, atol=1e-12)
def test_vidal_backend_routes_non_adjacent_with_compile():
"""Non-adjacent gate with compile_circuit=True goes through Vidal SWAP routing."""
circuit = Circuit(4)
circuit.add(gates.H(0))
circuit.add(gates.CNOT(0, 3))
observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=32, tensor_module="torch", compile_circuit=True,
)
value = backend.expectation(circuit, observable)
exact = observable.expectation_from_state(circuit().state(numpy=True))
np.testing.assert_allclose(value, exact, atol=1e-12)
def test_can_route_non_adjacent():
"""_can_route_non_adjacent correctly identifies routable circuits."""
circuit = Circuit(4)
circuit.add(gates.H(0))
circuit.add(gates.CNOT(0, 3))
assert _can_route_non_adjacent(circuit)
circuit.add(gates.CNOT(0, 1))
assert _can_route_non_adjacent(circuit)
def test_cannot_route_multi_qubit():
"""Circuits with 3+ qubit gates cannot be routed."""
circuit = Circuit(3)
circuit.add(gates.TOFFOLI(0, 1, 2))
assert not _can_route_non_adjacent(circuit)
def test_routing_preserves_adjacent_gates():
"""_route_non_adjacent_gates leaves adjacent gates unchanged."""
circuit = build_local_circuit(nqubits=4, nlayers=2)
original = list(circuit.queue)
routed = _route_non_adjacent_gates(original, 4)
# Count 2Q gates — should be more due to inserted SWAPs, so just
# check that all 2-site gates ARE adjacent.
for gate in routed:
sites = _gate_sites(gate)
if len(sites) == 2:
diff = abs(sites[0] - sites[1])
assert diff == 1, f"Non-adjacent gate after routing: {gate.name} on {sites}"
def test_routing_non_adjacent_cnot():
"""Manually verify SWAP+CNOT+unSWAP for CNOT(0,3)."""
circuit = Circuit(4)
circuit.add(gates.H(0))
circuit.add(gates.H(3))
circuit.add(gates.CNOT(0, 3))
routed = _route_non_adjacent_gates(list(circuit.queue), 4)
# Expected: H(0), H(3), SWAP(2,3), SWAP(1,2), routed CNOT on (0,1), SWAP(1,2), SWAP(2,3)
names = [getattr(g, "name", g.__class__.__name__) for g in routed]
assert names == ["h", "h", "swap", "swap", "routed_two_qubit", "swap", "swap"], f"Got {names}"
# Verify expectation through full pipeline
observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
exact = observable.expectation_from_state(circuit().state(numpy=True))
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=32, tensor_module="torch", compile_circuit=True,
)
value = backend.expectation(circuit, observable)
np.testing.assert_allclose(value, exact, atol=1e-12)
def test_truncation_error_no_truncation():
"""With large bond, truncation error should be essentially zero."""
circuit = build_local_circuit(nqubits=6, nlayers=2)
observable = hamiltonians.SymbolicHamiltonian(form=0.5 * X(0) * Z(1))
backend = VidalBackend()
backend.configure_tn_simulation(max_bond_dimension=256, tensor_module="torch")
value = backend.expectation(circuit, observable)
_ = value # ensure computation runs
assert backend.last_truncation_error < 1e-14, (
f"Expected near-zero truncation error, got {backend.last_truncation_error}"
)
def test_vidal_backend_matches_statevector_multiterm():
"""Multi-term observable with non-adjacent gates, compile_circuit=True."""
circuit = Circuit(5)
for q in range(5):
circuit.add(gates.RY(q, theta=0.7))
circuit.add(gates.RZ(q, theta=0.3))
circuit.add(gates.CNOT(0, 2))
circuit.add(gates.CNOT(1, 4))
observable = hamiltonians.SymbolicHamiltonian(
form=(0.3 * X(0) * Z(2) + 0.7 * Y(1) * Y(4) - 0.5 * Z(0) * X(4))
)
exact_state = circuit().state(numpy=True)
exact = observable.expectation_from_state(exact_state)
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=64, tensor_module="torch", compile_circuit=True,
)
value = backend.expectation(circuit, observable)
np.testing.assert_allclose(value, exact, atol=1e-10)

18
tools/README.md Normal file
View File

@@ -0,0 +1,18 @@
# Tools
Auxiliary scripts for profiling, legacy comparisons, and scale probes.
The main CPU expectation entrypoint is `../benchmark_cpu_expectation.py`.
For the current Vidal/MPS 1D-chain tests, prefer `../run_vidal_mps_cases.sh`.
Files here are intentionally secondary:
- `compare_vidal_backend_qmatchatea.py`: diagnostic comparison against QMatchaTea.
- `profile_vidal_chrome.py`: PyTorch CPU profiler for the Vidal path.
- `run_cpu_single_cases.sh`: single-node scale probes.
- `run_cpu_large_cases.sh`: two-node MPI scale probes.
- `run_vidal_segment_mpi_scan.sh`: rank/thread scaling scan for Vidal segmented MPI.
- `baseline_mps_expectation.py`: legacy MPS comparison CLI kept for old commands.
- `benchmark_tn_mpi.py`, `benchmark_search.py`, `benchmark_slice.py`, `benchmark_contract_sliced.py`, `check_tree.py`: old TN path-search/slicing experiments.
- `qibojit_reference_expectation.py`: state-vector reference helper.
- `validate_vidal_mpi_correctness.py`: focused Vidal MPI correctness helper.

View File

@@ -1,62 +1,34 @@
"""Baseline MPS expectation scan with the qmatchatea backend."""
"""MPS expectation benchmark for qmatchatea and Vidal backends."""
import argparse
import json
import logging
import math
import os
import socket
import time
import numpy as np
from qibo import Circuit, gates, hamiltonians
from qibo.symbols import X, Z
from qibotn.benchmark_cases import (
build_circuit as build_benchmark_circuit,
exact_pauli_sum,
observable_terms,
terms_to_dict,
)
from qibotn.backends.qmatchatea import QMatchaTeaBackend
from qibotn.backends.vidal_tebd import run_vidal_ring_xz
def build_circuit(nqubits, nlayers, seed):
import numpy as np
rng = np.random.default_rng(seed)
circuit = Circuit(nqubits)
for _ in range(nlayers):
for qubit in range(nqubits):
circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
for qubit in range(0, nqubits - 1, 2):
circuit.add(gates.CNOT(qubit, qubit + 1))
for qubit in range(1, nqubits - 1, 2):
circuit.add(gates.CNOT(qubit, qubit + 1))
return circuit
return build_benchmark_circuit("brickwall_cnot", nqubits, nlayers, seed)
def build_observable(nqubits):
form = 0
for qubit in range(nqubits):
form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
return hamiltonians.SymbolicHamiltonian(form=form)
return terms_to_dict(observable_terms("ring_xz", nqubits))
def exact_expectation(circuit, nqubits):
import numpy as np
state = circuit().state(numpy=True).reshape(-1)
value = 0.0
chunk_size = 1 << 20
for qubit in range(nqubits):
next_qubit = (qubit + 1) % nqubits
x_flip = 1 << (nqubits - 1 - qubit)
z_shift = nqubits - 1 - next_qubit
term = 0.0
for start in range(0, state.size, chunk_size):
stop = min(start + chunk_size, state.size)
indices = np.arange(start, stop, dtype=np.int64)
z_bit = (indices >> z_shift) & 1
z_phase = 1 - 2 * z_bit
term += np.vdot(state[indices ^ x_flip], z_phase * state[start:stop]).real
value += 0.5 * term
return float(value)
return exact_pauli_sum(circuit, observable_terms("ring_xz", nqubits), nqubits)
def main():
@@ -68,16 +40,21 @@ def main():
parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="torch")
parser.add_argument("--torch-threads", type=int, default=32)
parser.add_argument(
"--executor", choices=("qmatchatea", "vidal"), default="qmatchatea"
"--executor",
choices=("qmatchatea", "vidal", "vidal-mpi"),
default="qmatchatea",
)
parser.add_argument("--vidal-workers", type=int, default=1)
parser.add_argument("--vidal-batched", action="store_true")
parser.add_argument("--mpi-ct", action="store_true")
parser.add_argument("--mpi-barriers", type=int, default=-1)
parser.add_argument("--mpi-isometrization", type=int, default=-1)
parser.add_argument("--exact", action="store_true")
parser.add_argument("--exact-max-qubits", type=int, default=24)
parser.add_argument("--reference-file")
parser.add_argument(
"--mpi-rank-map",
action="store_true",
help="Print MPI rank, host, pid, and torch thread placement metadata.",
)
args = parser.parse_args()
logging.getLogger("qibo.config").setLevel(logging.ERROR)
logging.getLogger("qtealeaves").setLevel(logging.ERROR)
@@ -91,6 +68,26 @@ def main():
rank = MPI.COMM_WORLD.Get_rank()
size = MPI.COMM_WORLD.Get_size()
if args.mpi_rank_map:
rank_info = {
"rank": rank,
"size": size,
"host": socket.gethostname(),
"pid": os.getpid(),
"torch_threads": args.torch_threads,
"omp_num_threads": os.environ.get("OMP_NUM_THREADS", ""),
"mkl_num_threads": os.environ.get("MKL_NUM_THREADS", ""),
}
rank_infos = MPI.COMM_WORLD.gather(rank_info, root=0)
if rank == 0:
print("mpi_rank_map")
for item in sorted(rank_infos, key=lambda row: row["rank"]):
print(
"rank={rank} size={size} host={host} pid={pid} "
"torch_threads={torch_threads} "
"OMP_NUM_THREADS={omp_num_threads} "
"MKL_NUM_THREADS={mkl_num_threads}".format(**item)
)
circuit = build_circuit(args.nqubits, args.nlayers, args.seed)
observable = build_observable(args.nqubits)
@@ -106,30 +103,42 @@ def main():
exact = exact_expectation(circuit, args.nqubits)
if rank == 0:
mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR"
if args.mpi_ct and args.executor in ("vidal", "vidal-mpi"):
mpi_label = f"VidalSegment/{size}"
else:
mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR"
print(
f"nqubits={args.nqubits} nlayers={args.nlayers} "
f"bond={args.bond} seed={args.seed} "
f"tensor_module={args.tensor_module} svd_control=E! "
f"compile_circuit=True mpi={mpi_label} executor={args.executor} "
f"vidal_workers={args.vidal_workers} vidal_batched={args.vidal_batched}"
f"compile_circuit=True mpi={mpi_label} executor={args.executor}"
)
if exact is not None:
print(f"exact={exact:.16e}")
print("expval abs_error rel_error seconds")
start = time.perf_counter()
if args.executor == "vidal":
timings = None
if args.executor in ("vidal", "vidal-mpi"):
if args.executor == "vidal-mpi" and not args.mpi_ct:
raise ValueError("--executor vidal-mpi requires --mpi-ct.")
if args.mpi_ct:
raise ValueError("--executor vidal is a single-process executor.")
value = run_vidal_ring_xz(
circuit,
max_bond=args.bond,
cut_ratio=1e-12,
tensor_module=args.tensor_module,
workers=args.vidal_workers,
use_batched=args.vidal_batched,
)
from qibotn.backends.vidal_mpi_segment import run_segment_vidal_mpi_ring_xz
value, timings = run_segment_vidal_mpi_ring_xz(
circuit,
max_bond=args.bond,
cut_ratio=1e-12,
tensor_module=args.tensor_module,
comm=MPI.COMM_WORLD,
)
else:
value = run_vidal_ring_xz(
circuit,
max_bond=args.bond,
cut_ratio=1e-12,
tensor_module=args.tensor_module,
)
else:
backend = QMatchaTeaBackend()
backend.configure_tn_simulation(
@@ -151,6 +160,12 @@ def main():
preprocess=False,
compile_circuit=True,
)
max_timings = None
if timings:
max_timings = {
key: MPI.COMM_WORLD.reduce(local_value, op=MPI.MAX, root=0)
for key, local_value in timings.items()
}
if rank != 0:
return
value = float(np.real(value))
@@ -158,6 +173,10 @@ def main():
abs_error = float("nan") if exact is None else abs(value - exact)
rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
print(f"{value:.16e} {abs_error:.6e} {rel_error:.6e} {elapsed:.3f}")
if max_timings:
print("timing_section max_seconds")
for key, max_value in max_timings.items():
print(f"{key} {max_value:.6f}")
if __name__ == "__main__":

View File

@@ -0,0 +1,137 @@
"""Compare QMatchaTeaBackend with the VidalBackend fast path."""
from __future__ import annotations
import argparse
import json
import math
import time
import numpy as np
import torch
from qibo import Circuit, gates, hamiltonians
from qibo.symbols import X, Y, Z
from qibotn.backends.qmatchatea import QMatchaTeaBackend
from qibotn.backends.vidal import VidalBackend
def build_circuit(nqubits, nlayers, seed, kind):
rng = np.random.default_rng(seed)
circuit = Circuit(nqubits)
for layer in range(nlayers):
for q in range(nqubits):
circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
if kind == "brickwall":
for q in range(0, nqubits - 1, 2):
circuit.add(gates.CNOT(q, q + 1))
for q in range(1, nqubits - 1, 2):
circuit.add(gates.CNOT(q, q + 1))
elif kind == "shifted-cz":
for q in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.CZ(q, q + 1))
elif kind == "reversed-cnot":
for q in range(0, nqubits - 1, 2):
circuit.add(gates.CNOT(q + 1, q))
for q in range(1, nqubits - 1, 2):
circuit.add(gates.CNOT(q, q + 1))
else:
raise ValueError(f"Unknown circuit kind {kind!r}.")
return circuit
def build_observable(nqubits, kind):
form = 0
if kind == "ring-xz":
for q in range(nqubits):
form += 0.5 * X(q) * Z((q + 1) % nqubits)
elif kind == "open-zz":
for q in range(nqubits - 1):
form += Z(q) * Z(q + 1) / (nqubits - 1)
elif kind == "mixed":
form += 0.25 * X(0) - 0.5 * Z(nqubits - 1)
for q in range(0, nqubits - 1, 3):
form += 0.125 * Y(q) * Y(q + 1)
else:
raise ValueError(f"Unknown observable kind {kind!r}.")
return hamiltonians.SymbolicHamiltonian(form=form)
def run_backend(backend, circuit, observable):
start = time.perf_counter()
value = backend.expectation(circuit, observable, preprocess=False, compile_circuit=True)
return float(np.real(value)), time.perf_counter() - start
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--nqubits", type=int, default=34)
parser.add_argument("--nlayers", type=int, default=20)
parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch")
parser.add_argument("--torch-threads", type=int, default=32)
parser.add_argument(
"--circuit-kind",
choices=("brickwall", "shifted-cz", "reversed-cnot"),
default="brickwall",
)
parser.add_argument(
"--observable-kind",
choices=("ring-xz", "open-zz", "mixed"),
default="ring-xz",
)
parser.add_argument("--reference-file")
parser.add_argument("--skip-qmatchatea", action="store_true")
args = parser.parse_args()
torch.set_num_threads(args.torch_threads)
circuit = build_circuit(args.nqubits, args.nlayers, args.seed, args.circuit_kind)
observable = build_observable(args.nqubits, args.observable_kind)
exact = None
if args.reference_file:
with open(args.reference_file, "r", encoding="utf-8") as f:
exact = float(json.load(f)["expectation"])
print(
f"nqubits={args.nqubits} nlayers={args.nlayers} bond={args.bond} "
f"circuit={args.circuit_kind} observable={args.observable_kind} "
f"tensor_module={args.tensor_module} torch_threads={args.torch_threads}"
)
if exact is not None:
print(f"exact={exact:.16e}")
print("backend value abs_error seconds")
if not args.skip_qmatchatea:
qmt = QMatchaTeaBackend()
qmt.configure_tn_simulation(
ansatz="MPS",
max_bond_dimension=args.bond,
cut_ratio=1e-12,
svd_control="E!",
tensor_module=args.tensor_module,
compile_circuit=True,
track_memory=False,
)
value, seconds = run_backend(qmt, circuit, observable)
error = float("nan") if exact is None else abs(value - exact)
print(f"qmatchatea {value:.16e} {error:.6e} {seconds:.3f}")
vidal = VidalBackend()
vidal.configure_tn_simulation(
ansatz="MPS",
max_bond_dimension=args.bond,
cut_ratio=1e-12,
tensor_module=args.tensor_module,
compile_circuit=True,
fallback=True,
)
value, seconds = run_backend(vidal, circuit, observable)
error = float("nan") if exact is None else abs(value - exact)
print(f"vidal {value:.16e} {error:.6e} {seconds:.3f}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,72 @@
"""Chrome trace profiler for the VidalBackend fast path."""
from __future__ import annotations
import argparse
from pathlib import Path
import torch
from torch.profiler import ProfilerActivity, profile
from qibotn.benchmark_cases import build_circuit, terms_to_dict, observable_terms
from qibotn.expectation_runner import ExpectationConfig, run_cpu_expectation
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--nqubits", type=int, default=34)
parser.add_argument("--nlayers", type=int, default=20)
parser.add_argument("--bond", type=int, default=512)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--torch-threads", type=int, default=32)
parser.add_argument("--cut-ratio", type=float, default=1e-12)
parser.add_argument("--profile-memory", action="store_true")
parser.add_argument("--rows", type=int, default=60)
args = parser.parse_args()
torch.set_num_threads(args.torch_threads)
prefix = f"profiles/vidal_n{args.nqubits}_l{args.nlayers}_b{args.bond}_t{args.torch_threads}"
trace_path = Path(f"{prefix}.json")
table_path = Path(f"{prefix}.txt")
trace_path.parent.mkdir(parents=True, exist_ok=True)
circuit = build_circuit("brickwall_cnot", args.nqubits, args.nlayers, args.seed)
observable = terms_to_dict(observable_terms("ring_xz", args.nqubits))
config = ExpectationConfig(
ansatz="mps",
bond=args.bond,
cut_ratio=args.cut_ratio,
tensor_module="torch",
torch_threads=args.torch_threads,
)
print(
f"profile vidal nqubits={args.nqubits} nlayers={args.nlayers} "
f"bond={args.bond} threads={args.torch_threads}"
)
with profile(
activities=[ProfilerActivity.CPU],
record_shapes=args.profile_memory,
profile_memory=args.profile_memory,
with_stack=args.profile_memory,
) as prof:
result = run_cpu_expectation(circuit, observable, config)
table = (
f"expval={result.value:.16e}\n\n"
f"# sorted by self_cpu_time_total\n"
f"{prof.key_averages().table(sort_by='self_cpu_time_total', row_limit=args.rows)}\n\n"
f"# sorted by cpu_time_total\n"
f"{prof.key_averages().table(sort_by='cpu_time_total', row_limit=args.rows)}\n"
)
print(table, end="")
table_path.write_text(table, encoding="utf-8")
prof.export_chrome_trace(str(trace_path))
print(f"trace={trace_path}\ntable={table_path}")
if __name__ == "__main__":
main()

127
tools/run_cpu_large_cases.sh Executable file
View File

@@ -0,0 +1,127 @@
#!/usr/bin/env bash
set -euo pipefail
# Large CPU expectation benchmarks for two-server runs.
#
# Defaults assume two Intel Xeon Platinum 8558P servers with about 500 GiB RAM
# each. Override HOSTFILE, PYTHON_BIN, MPIEXEC, or the per-case knobs below as
# needed.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"
PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
MPIEXEC="${MPIEXEC:-mpiexec}"
HOSTFILE="${HOSTFILE:-hostfile}"
MPS_RANKS="${MPS_RANKS:-8}"
MPS_THREADS="${MPS_THREADS:-12}"
TN_RANKS="${TN_RANKS:-12}"
TN_THREADS="${TN_THREADS:-8}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
run_mpi() {
local ranks="$1"
shift
"$MPIEXEC" -hostfile "$HOSTFILE" -n "$ranks" "$PYTHON_BIN" "$@"
}
run_case() {
local title="$1"
shift
echo
echo "================================================================================"
echo "$title"
echo "================================================================================"
echo "HOSTFILE=$HOSTFILE PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
echo "$*"
"$@"
}
case "${1:-help}" in
smoke)
run_case "MPS MPI smoke: n=40 layers=30 bond=2048" \
run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
--mpi --mps \
--nqubits "${MPS_SMOKE_NQ:-40}" \
--nlayers "${MPS_SMOKE_LAYERS:-30}" \
--bond "${MPS_SMOKE_BOND:-2048}" \
--torch-threads "$MPS_THREADS" \
--circuits brickwall_cnot reversed_cnot shifted_cz \
--observables ring_xz open_zz range2_xx
run_case "TN MPI smoke: n=32 layers=16 target_slices=12" \
run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \
--mpi \
--nqubits "${TN_SMOKE_NQ:-32}" \
--nlayers "${TN_SMOKE_LAYERS:-16}" \
--torch-threads "$TN_THREADS" \
--circuits brickwall_cnot shifted_cz rxx_rzz \
--observables ring_xz open_zz range2_xx \
--tn-target-slices "${TN_SMOKE_SLICES:-12}"
;;
mps-long)
run_case "MPS MPI long: n=64 layers=48 bond=4096" \
run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
--mpi --mps \
--nqubits "${MPS_LONG_NQ:-64}" \
--nlayers "${MPS_LONG_LAYERS:-48}" \
--bond "${MPS_LONG_BOND:-4096}" \
--torch-threads "$MPS_THREADS" \
--circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
--observables ring_xz open_zz mixed_local range2_xx
;;
mps-pressure)
run_case "MPS MPI pressure: n=80 layers=64 bond=4096" \
run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
--mpi --mps \
--nqubits "${MPS_PRESSURE_NQ:-80}" \
--nlayers "${MPS_PRESSURE_LAYERS:-64}" \
--bond "${MPS_PRESSURE_BOND:-4096}" \
--torch-threads "$MPS_THREADS" \
--circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz swap_scramble \
--observables ring_xz open_zz mixed_local range2_xx long_z_string
;;
tn-long)
run_case "TN MPI long: n=36 layers=20 target_slices=24" \
run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \
--mpi \
--nqubits "${TN_LONG_NQ:-36}" \
--nlayers "${TN_LONG_LAYERS:-20}" \
--torch-threads "$TN_THREADS" \
--circuits brickwall_cnot shifted_cz rxx_rzz \
--observables ring_xz open_zz range2_xx \
--tn-target-slices "${TN_LONG_SLICES:-24}"
;;
all)
"$0" smoke
"$0" mps-long
"$0" tn-long
;;
help|*)
cat >&2 <<'EOF'
Usage: tools/run_cpu_large_cases.sh [smoke|mps-long|mps-pressure|tn-long|all]
Common overrides:
HOSTFILE=hostfile
PYTHON_BIN=.venv/bin/python
MPIEXEC=mpiexec
MPS_RANKS=8 MPS_THREADS=12
TN_RANKS=12 TN_THREADS=8
Scale overrides:
MPS_LONG_NQ=64 MPS_LONG_LAYERS=48 MPS_LONG_BOND=4096
MPS_PRESSURE_NQ=80 MPS_PRESSURE_LAYERS=64 MPS_PRESSURE_BOND=4096
TN_LONG_NQ=36 TN_LONG_LAYERS=20 TN_LONG_SLICES=24
EOF
exit 2
;;
esac

148
tools/run_cpu_single_cases.sh Executable file
View File

@@ -0,0 +1,148 @@
#!/usr/bin/env bash
set -euo pipefail
# Single-node CPU scale probes for expectation benchmarks.
#
# Intended for one 96-core / ~500 GiB RAM node. The default "probe" mode runs
# moderate MPS and TN cases first. Larger modes are available after checking
# runtime and memory from the probe output.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"
PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
PYTHON_FLAGS="${PYTHON_FLAGS:--u}"
MPIEXEC="${MPIEXEC:-mpiexec}"
TIME_BIN="${TIME_BIN:-/usr/bin/time}"
MPS_RANKS="${MPS_RANKS:-8}"
MPS_THREADS="${MPS_THREADS:-12}"
TN_RANKS="${TN_RANKS:-8}"
TN_THREADS="${TN_THREADS:-12}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
estimate_mps_memory() {
local nqubits="$1"
local bond="$2"
"$PYTHON_BIN" - "$nqubits" "$bond" "$MPS_RANKS" <<'PY'
import sys
n = int(sys.argv[1])
chi = int(sys.argv[2])
ranks = int(sys.argv[3])
resident = n * 2 * chi * chi * 16
per_rank = resident / ranks
print(
"MPS rough resident memory: "
f"total={resident / 1024**3:.1f} GiB "
f"per_rank={per_rank / 1024**3:.1f} GiB "
"(temporary eig/SVD workspaces are additional)"
)
PY
}
run_timed() {
echo
echo "--------------------------------------------------------------------------------"
echo "$*"
echo "--------------------------------------------------------------------------------"
"$TIME_BIN" -v "$@"
}
run_mps_case() {
local label="$1"
local nqubits="$2"
local nlayers="$3"
local bond="$4"
shift 4
echo
echo "================================================================================"
echo "$label"
echo "================================================================================"
echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
echo "MPS_RANKS=$MPS_RANKS MPS_THREADS=$MPS_THREADS"
echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
estimate_mps_memory "$nqubits" "$bond"
run_timed "$MPIEXEC" -n "$MPS_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \
--mpi --mps \
--nqubits "$nqubits" \
--nlayers "$nlayers" \
--bond "$bond" \
--torch-threads "$MPS_THREADS" \
"$@"
}
run_tn_case() {
local label="$1"
local nqubits="$2"
local nlayers="$3"
shift 3
echo
echo "================================================================================"
echo "$label"
echo "================================================================================"
echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
echo "TN_RANKS=$TN_RANKS TN_THREADS=$TN_THREADS"
echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
echo "TN memory is contraction-tree dependent; increase --tn-target-slices if RSS is high."
run_timed "$MPIEXEC" -n "$TN_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \
--mpi \
--nqubits "$nqubits" \
--nlayers "$nlayers" \
--torch-threads "$TN_THREADS" \
"$@"
}
case "${1:-help}" in
probe)
run_mps_case "MPS probe: n=40 layers=30 bond=2048" 40 30 2048 \
--circuits brickwall_cnot \
--observables ring_xz
run_tn_case "TN probe: n=28 layers=12 target_slices=8" 28 12 \
--circuits brickwall_cnot \
--observables ring_xz \
--tn-target-slices 8
;;
mps-medium)
run_mps_case "MPS medium: n=56 layers=40 bond=3072" 56 40 3072 \
--circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
--observables ring_xz open_zz mixed_local range2_xx
;;
mps-long)
run_mps_case "MPS long: n=64 layers=48 bond=4096" 64 48 4096 \
--circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
--observables ring_xz open_zz mixed_local range2_xx
;;
tn-medium)
run_tn_case "TN medium: n=32 layers=16 target_slices=16" 32 16 \
--circuits brickwall_cnot shifted_cz rxx_rzz \
--observables ring_xz open_zz range2_xx \
--tn-target-slices 16
;;
tn-long)
run_tn_case "TN long: n=36 layers=20 target_slices=32" 36 20 \
--circuits brickwall_cnot shifted_cz rxx_rzz \
--observables ring_xz open_zz range2_xx \
--tn-target-slices 32
;;
help|*)
cat >&2 <<'EOF'
Usage: tools/run_cpu_single_cases.sh [probe|mps-medium|mps-long|tn-medium|tn-long]
Common overrides:
PYTHON_BIN=.venv/bin/python
MPIEXEC=mpiexec
MPS_RANKS=8 MPS_THREADS=12
TN_RANKS=8 TN_THREADS=12
OMP_NUM_THREADS=1 MKL_NUM_THREADS=1
EOF
exit 2
;;
esac

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env bash
set -euo pipefail
NQ="${NQ:-34}"
LAYERS="${LAYERS:-20}"
BOND="${BOND:-512}"
SEED="${SEED:-42}"
RANKS="${RANKS:-1 2 4}"
THREADS="${THREADS:-32 32 16}"
PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
MPIEXEC="${MPIEXEC:-mpiexec}"
CIRCUIT="${CIRCUIT:-brickwall_cnot}"
OBSERVABLE="${OBSERVABLE:-ring_xz}"
EXACT="${EXACT:-0}"
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"
if [[ "${1:-help}" != "run" ]]; then
cat >&2 <<'EOF'
Usage: tools/run_vidal_segment_mpi_scan.sh run
Overrides:
NQ=34 LAYERS=20 BOND=512 SEED=42
RANKS="1 2 4" THREADS="32 32 16"
CIRCUIT=brickwall_cnot OBSERVABLE=ring_xz
EXACT=1
PYTHON_BIN=.venv/bin/python MPIEXEC=mpiexec
EOF
if [[ "${1:-help}" == "help" ]]; then
exit 0
fi
exit 2
fi
read -r -a ranks <<< "$RANKS"
read -r -a threads <<< "$THREADS"
if [[ "${#ranks[@]}" != "${#threads[@]}" ]]; then
echo "RANKS and THREADS must have the same number of entries." >&2
exit 2
fi
common=(
--nqubits "$NQ"
--nlayers "$LAYERS"
--bond "$BOND"
--seed "$SEED"
--mps
--circuits "$CIRCUIT"
--observables "$OBSERVABLE"
)
if [[ "$EXACT" == "1" ]]; then
common+=(--exact)
fi
for idx in "${!ranks[@]}"; do
nrank="${ranks[$idx]}"
nthr="${threads[$idx]}"
if [[ "$nrank" == "1" ]]; then
echo "== Vidal serial ranks=1 torch_threads=$nthr =="
"$PYTHON_BIN" -u benchmark_cpu_expectation.py \
"${common[@]}" --torch-threads "$nthr"
else
echo "== Vidal segmented MPI ranks=$nrank torch_threads=$nthr =="
"$MPIEXEC" -n "$nrank" "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
"${common[@]}" --torch-threads "$nthr" --mpi
fi
done

View File

@@ -0,0 +1,202 @@
"""Correctness checks for the Vidal/TEBD MPS fast path.
The cases here intentionally cover more than the benchmark ring-XZ observable:
different nearest-neighbor gate orientations and several Pauli-sum observables.
Run serially to compare qibojit/statevector vs Vidal, or under MPI to compare
the segmented Vidal executor.
"""
from __future__ import annotations
import argparse
import math
import time
import numpy as np
import torch
from qibo import Circuit, gates
from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
from qibotn.backends.vidal_tebd import VidalTEBDExecutor
def build_circuit(kind, nqubits, nlayers, seed):
rng = np.random.default_rng(seed)
circuit = Circuit(nqubits)
for layer in range(nlayers):
for q in range(nqubits):
circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
if kind == "rx_ry_cz":
circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi)))
if kind in ("brickwall", "reversed_cnot"):
for q in range(0, nqubits - 1, 2):
if kind == "reversed_cnot" and (layer % 2):
circuit.add(gates.CNOT(q + 1, q))
else:
circuit.add(gates.CNOT(q, q + 1))
for q in range(1, nqubits - 1, 2):
if kind == "reversed_cnot" and not (layer % 2):
circuit.add(gates.CNOT(q + 1, q))
else:
circuit.add(gates.CNOT(q, q + 1))
elif kind == "rx_ry_cz":
for q in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.CZ(q, q + 1))
else:
raise ValueError(f"Unknown circuit kind {kind!r}.")
return circuit
def observable_terms(kind, nqubits):
if kind == "ring_xz":
return [
(0.5, (("X", site), ("Z", (site + 1) % nqubits)))
for site in range(nqubits)
]
if kind == "open_zz":
return [
(1.0 / (nqubits - 1), (("Z", site), ("Z", site + 1)))
for site in range(nqubits - 1)
]
if kind == "mixed_local":
terms = [(0.25, (("X", 0),)), (-0.5, (("Z", nqubits - 1),))]
terms += [
(0.125, (("Y", site), ("Y", site + 1)))
for site in range(0, nqubits - 1, 3)
]
return terms
raise ValueError(f"Unknown observable kind {kind!r}.")
def exact_pauli_sum(circuit, terms, nqubits):
state = circuit().state(numpy=True).reshape(-1)
indices = np.arange(state.size, dtype=np.int64)
value = 0.0 + 0.0j
for coeff, ops in terms:
flipped = indices.copy()
phase = np.ones(state.size, dtype=np.complex128)
for name, site in ops:
shift = nqubits - 1 - site
bit = (indices >> shift) & 1
name = name.upper()
if name == "X":
flipped ^= 1 << shift
elif name == "Y":
flipped ^= 1 << shift
phase *= 1j * (1 - 2 * bit)
elif name == "Z":
phase *= 1 - 2 * bit
elif name != "I":
raise ValueError(f"Unsupported Pauli {name!r}.")
value += coeff * np.vdot(state[flipped], phase * state)
return float(value.real)
def run_vidal(circuit, terms, nqubits, bond, tensor_module):
executor = VidalTEBDExecutor(
nqubits=nqubits,
max_bond=bond,
cut_ratio=1e-12,
tensor_module=tensor_module,
)
executor.run_circuit(circuit)
return float(executor.expectation_pauli_sum(terms))
def run_segment_mpi(circuit, terms, nqubits, bond, tensor_module, comm):
executor = SegmentVidalMPIExecutor(
nqubits=nqubits,
max_bond=bond,
cut_ratio=1e-12,
tensor_module=tensor_module,
comm=comm,
)
executor.run_circuit(circuit)
return executor.expectation_pauli_sum_root(terms)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--nqubits", type=int, default=16)
parser.add_argument("--nlayers", type=int, default=6)
parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch")
parser.add_argument("--torch-threads", type=int, default=32)
parser.add_argument("--mpi", action="store_true")
parser.add_argument(
"--circuits",
nargs="+",
default=("brickwall", "reversed_cnot", "rx_ry_cz"),
)
parser.add_argument(
"--observables",
nargs="+",
default=("ring_xz", "open_zz", "mixed_local"),
)
args = parser.parse_args()
torch.set_num_threads(args.torch_threads)
comm = None
rank = 0
size = 1
if args.mpi:
from mpi4py import MPI
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
if rank == 0:
mode = f"vidal-segment-mpi/{size}" if args.mpi else "vidal"
print(
f"mode={mode} nqubits={args.nqubits} nlayers={args.nlayers} "
f"bond={args.bond} tensor_module={args.tensor_module}"
)
print("circuit observable exact value abs_error seconds")
for circuit_kind in args.circuits:
circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed)
exact = None
if rank == 0:
exact_values = {
obs: exact_pauli_sum(
circuit, observable_terms(obs, args.nqubits), args.nqubits
)
for obs in args.observables
}
else:
exact_values = None
if comm is not None:
exact_values = comm.bcast(exact_values, root=0)
for obs_kind in args.observables:
terms = observable_terms(obs_kind, args.nqubits)
start = time.perf_counter()
if args.mpi:
value = run_segment_mpi(
circuit,
terms,
args.nqubits,
args.bond,
args.tensor_module,
comm,
)
else:
value = run_vidal(
circuit, terms, args.nqubits, args.bond, args.tensor_module
)
if rank != 0:
continue
elapsed = time.perf_counter() - start
exact = exact_values[obs_kind]
print(
f"{circuit_kind} {obs_kind} {exact:.16e} {value:.16e} "
f"{abs(value - exact):.6e} {elapsed:.3f}"
)
if __name__ == "__main__":
main()