diff --git a/benchmark_cpu_expectation.py b/benchmark_cpu_expectation.py index 9141849..3d5897d 100644 --- a/benchmark_cpu_expectation.py +++ b/benchmark_cpu_expectation.py @@ -3,6 +3,10 @@ from __future__ import annotations import argparse +import os +import subprocess +from pathlib import Path +from urllib.parse import urlparse from qibotn.benchmark_cases import ( CIRCUITS, @@ -19,6 +23,51 @@ from qibotn.expectation_runner import ( ) +def optional_int(text): + if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: + return None + return int(text) + + +def optional_float(text): + if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: + return None + return float(text) + + +def format_optional(value, fmt="g"): + return "None" if value is None else format(value, fmt) + + +def should_stop_dask(args): + return ( + not args.keep_dask + and args.tn_search_backend == "dask" + and args.dask_address is not None + and args.tn_load_tree is None + ) + + +def stop_dask_cluster(args, rank): + if rank != 0 or not should_stop_dask(args): + return + script = Path(__file__).resolve().parent / "tools" / "manage_tn_dask_cluster.sh" + if not script.exists(): + print(f"dask_stop_skipped reason=missing_script path={script}", flush=True) + return + + env = os.environ.copy() + parsed = urlparse(args.dask_address) + if parsed.hostname: + env.setdefault("SCHEDULER_HOST", parsed.hostname) + if parsed.port: + env.setdefault("SCHEDULER_PORT", str(parsed.port)) + + print("dask_stop_after_search start", flush=True) + subprocess.run([str(script), "stop"], cwd=str(script.parent.parent), env=env, check=False) + print("dask_stop_after_search done", flush=True) + + def build_parallel_opts(args): slicing_opts = {} if args.tn_target_slices is not None: @@ -31,11 +80,24 @@ def build_parallel_opts(args): "search_workers": args.tn_search_workers or args.torch_threads, "max_repeats": args.tn_search_repeats, "max_time": args.tn_search_time, + "print_stats": not args.no_tn_stats, } if args.tn_search_backend is not None: opts["search_backend"] = args.tn_search_backend if args.dask_address is not None: opts["dask_address"] = args.dask_address + if args.tn_save_tree is not None: + opts["save_tree_path"] = args.tn_save_tree + if args.tn_load_tree is not None: + opts["load_tree_path"] = args.tn_load_tree + if args.tn_search_only: + opts["search_only"] = True + if args.tn_debug_trials: + opts["debug_trials"] = True + if args.tn_contract_implementation is not None: + opts["contract_implementation"] = args.tn_contract_implementation + if args.dask_close_workers: + opts["dask_close_workers"] = True return opts @@ -43,10 +105,16 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("--nqubits", type=int, default=40) parser.add_argument("--nlayers", type=int, default=30) - parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=1024) - parser.add_argument("--cut-ratio", type=float, default=1e-12) + parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024) + parser.add_argument("--cut-ratio", type=optional_float, default=1e-12) parser.add_argument("--seed", type=int, default=42) parser.add_argument("--torch-threads", type=int, default=8) + parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch") + parser.add_argument( + "--dtype", + choices=("complex128", "complex64"), + default="complex128", + ) parser.add_argument("--ansatz", choices=("tn", "mps"), default=None) parser.add_argument("--mps", action="store_true") parser.add_argument("--mpi", action="store_true") @@ -56,19 +124,62 @@ def main(): parser.add_argument("--observables", nargs="+", default=["ring_xz"]) parser.add_argument("--pauli-pattern") parser.add_argument("--tn-target-slices", type=int) - parser.add_argument("--tn-target-size", type=int) + parser.add_argument("--tn-target-size", type=int,default=2**32) parser.add_argument("--tn-search-workers", type=int) parser.add_argument("--tn-search-repeats", type=int, default=128) parser.add_argument("--tn-search-time", type=float, default=60.0) + parser.add_argument( + "--no-tn-stats", + action="store_true", + help="Do not print per-term TN search/contraction diagnostics.", + ) parser.add_argument( "--tn-search-backend", choices=("processpool", "dask"), + default="dask", help="Path-search backend. In MPI mode, dask search runs only on rank 0 and broadcasts the tree.", ) parser.add_argument( "--dask-address", help="Dask scheduler address, for example tcp://host:8786. If omitted with dask search, a local cluster is created.", ) + parser.add_argument( + "--dask-close-workers", + action="store_true", + help="After dask path search, ask the scheduler to close all currently connected workers.", + ) + parser.add_argument( + "--keep-dask", + action="store_true", + help=( + "Keep an external dask cluster running after search. By default, " + "tools/manage_tn_dask_cluster.sh stop is called after search when " + "--dask-address is used." + ), + ) + parser.add_argument( + "--tn-save-tree", + help="Save searched cotengra contraction tree(s) to this pickle file.", + ) + parser.add_argument( + "--tn-load-tree", + help="Load cotengra contraction tree(s) from this pickle file and skip path search.", + ) + parser.add_argument( + "--tn-search-only", + action="store_true", + help="Only run path search and optional --tn-save-tree; skip contraction.", + ) + parser.add_argument( + "--tn-debug-trials", + action="store_true", + help="Print dask worker summary and per-trial worker start/done logs.", + ) + parser.add_argument( + "--tn-contract-implementation", + choices=("auto", "cotengra", "autoray", "cpp"), + help="cotengra contraction implementation for TN contraction.", + ) args = parser.parse_args() ansatz = "mps" if args.mps else (args.ansatz or "tn") @@ -89,6 +200,8 @@ def main(): bond=args.bond, cut_ratio=args.cut_ratio, tensor_module="torch", + quimb_backend=args.quimb_backend, + dtype=args.dtype, torch_threads=args.torch_threads, parallel_opts=build_parallel_opts(args), ) @@ -98,47 +211,74 @@ def main(): print( f"backend=cpu ansatz={ansatz.upper()} mode={mode} " f"nqubits={args.nqubits} nlayers={args.nlayers} " - f"bond={args.bond} cut_ratio={args.cut_ratio:g} seed={args.seed} " + f"bond={format_optional(args.bond)} " + f"cut_ratio={format_optional(args.cut_ratio)} seed={args.seed} " + f"quimb_backend={args.quimb_backend} dtype={args.dtype} " f"torch_threads={args.torch_threads} " - f"tn_search_backend={args.tn_search_backend or 'processpool'}" + f"tn_search_backend={args.tn_search_backend}" ) print("circuit observable exact value abs_error rel_error seconds") - for circuit_kind in circuits: - circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed) - named_observables = ( - [(f"pattern:{args.pauli_pattern}", {"pauli_string_pattern": args.pauli_pattern})] - if args.pauli_pattern - else [ - (obs_kind, terms_to_dict(observable_terms(obs_kind, args.nqubits))) - for obs_kind in observables - ] - ) + try: + for circuit_kind in circuits: + circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed) + named_observables = ( + [(f"pattern:{args.pauli_pattern}", {"pauli_string_pattern": args.pauli_pattern})] + if args.pauli_pattern + else [ + (obs_kind, terms_to_dict(observable_terms(obs_kind, args.nqubits))) + for obs_kind in observables + ] + ) - for obs_name, observable in named_observables: - exact = None - if args.exact and rank == 0: - if args.nqubits > args.exact_max_qubits: - raise ValueError( - f"--exact is limited to {args.exact_max_qubits} qubits by default." + for obs_name, observable in named_observables: + exact = None + if args.exact and rank == 0: + if args.nqubits > args.exact_max_qubits: + raise ValueError( + f"--exact is limited to {args.exact_max_qubits} qubits by default." + ) + exact = exact_for_observable(circuit, observable, args.nqubits) + + result = run_cpu_expectation(circuit, observable, config) + if args.mpi and result.rank != 0: + continue + + abs_error = float("nan") if exact is None else abs(result.value - exact) + rel_error = ( + float("nan") + if exact is None + else abs_error / max(abs(exact), 1e-15) + ) + exact_text = "nan" if exact is None else f"{exact:.16e}" + print( + f"{circuit_kind} {obs_name} {exact_text} {result.value:.16e} " + f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}" + ) + for stat in result.parallel_stats or (): + cost = stat["path_cost"] + search_stats = stat.get("search_stats", {}) + print( + "tn_term_summary " + f"term={stat.get('term_index', 0)} " + f"search_seconds={stat.get('search_seconds', float('nan')):.3f} " + f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} " + f"completed_trials={search_stats.get('completed_trials', 'na')} " + f"finite_trials={search_stats.get('finite_trials', 'na')} " + f"failed_trials={search_stats.get('failed_trials', 'na')} " + f"requested_trials={search_stats.get('requested_trials', 'na')} " + f"best_score={search_stats.get('best_score', float('nan')):.6g} " + f"slices={cost['nslices']} " + f"log10_flops={cost['log10_flops']:.3f} " + f"log10_write={cost['log10_write']:.3f} " + f"log2_size={cost['log2_size']:.3f} " + f"log10_combo={cost['log10_combo']:.3f} " + f"peak_memory_gib={cost['peak_memory_gib']:.6g} " + f"slicing_overhead={cost['slicing_overhead']:.6g} " + f"rank_slices={stat.get('rank_slices', 'na')}" ) - exact = exact_for_observable(circuit, observable, args.nqubits) - - result = run_cpu_expectation(circuit, observable, config) - if args.mpi and result.rank != 0: - continue - - abs_error = float("nan") if exact is None else abs(result.value - exact) - rel_error = ( - float("nan") - if exact is None - else abs_error / max(abs(exact), 1e-15) - ) - exact_text = "nan" if exact is None else f"{exact:.16e}" - print( - f"{circuit_kind} {obs_name} {exact_text} {result.value:.16e} " - f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}" - ) + finally: + stop_dask_cluster(args, rank) if __name__ == "__main__": diff --git a/docs/contest_runners.md b/docs/contest_runners.md new file mode 100644 index 0000000..28c0d33 --- /dev/null +++ b/docs/contest_runners.md @@ -0,0 +1,254 @@ +# Contest Runners + +This directory contains two self-contained contest entrypoints: + +- `tools/tn_contest_runner.py`: general tensor-network path search and contraction. +- `tools/mps_contest_runner.py`: Vidal/MPS multi-node expectation runner. + +Both scripts keep circuit and observable definitions inside the script so a +contest case can be edited in one place. + +## Environment + +Run commands from the repository root: + +```bash +cd /home/yx/qibotn +``` + +For Intel MPI on two nodes, use the known working style: + +```bash +mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 ... +``` + +Set `TCM_ENABLE=1` for CPU runs: + +```bash +export TCM_ENABLE=1 +``` + +## TN Workflow + +List built-in TN contest cases: + +```bash +python -u tools/tn_contest_runner.py list +``` + +TN path search uses dask by default. Without `--dask-address`, the script starts +a local dask cluster. For multiple servers, start one scheduler and workers +with the helper script, then pass the scheduler address to the search command. + +Start the default two-node dask cluster: + +```bash +cd /home/yx/qibotn +tools/manage_tn_dask_cluster.sh start +``` + +Check status: + +```bash +cd /home/yx/qibotn +tools/manage_tn_dask_cluster.sh status +``` + +Stop the cluster: + +```bash +cd /home/yx/qibotn +tools/manage_tn_dask_cluster.sh stop +``` + +The helper defaults are: + +```bash +SCHEDULER_HOST=10.20.1.103 +WORKER_HOSTS="10.20.1.103 10.20.1.102" +NWORKERS=48 +NTHREADS=1 +ROOT_DIR=/home/yx/qibotn +PYTHON_BIN=.venv/bin/python +DASK_WORKER_TTL="24 hours" +DASK_TICK_LIMIT="30 minutes" +DASK_LOST_WORKER_TIMEOUT="30 minutes" +``` + +Override them inline if needed: + +```bash +WORKER_HOSTS="10.20.1.103 10.20.1.102" NWORKERS=48 \ + tools/manage_tn_dask_cluster.sh restart +``` + +Check that both nodes are connected by adding `--tn-debug-trials` to a small +search. The output should include `qibotn_dask_workers` with both hosts. + +`tools/tn_contest_runner.py search` stops the external dask cluster after the +search phase by default. Pass `--keep-dask` if you want to reuse the same dask +cluster for several searches. + +Use enough trials to fill the cluster. With the default two-node setup there are +96 worker slots, so `--tn-search-repeats` should be at least 96. The contest +runner default is 2048. + +Cotengra trials are CPU-bound and can hold the Python GIL long enough for dask +to report `Event loop was unresponsive`. Dask defaults are much more aggressive: +`scheduler.worker-ttl=5 minutes`, `admin.tick.limit=3s`, and +`deploy.lost-worker-timeout=15s`. The helper script raises these limits so +workers are not killed by dask during search. The intended timeout is +`--tn-search-time`; after that, the runner stops the external dask cluster. + +Small correctness check against statevector: + +```bash +python -u tools/tn_contest_runner.py validate \ + --case main1 \ + --nqubits 8 \ + --nlayers 2 \ + --torch-threads 4 \ + --tn-search-repeats 8 \ + --tn-search-time 5 +``` + +Search and save contraction trees: + +```bash +TCM_ENABLE=1 python -u tools/tn_contest_runner.py search \ + --case main1 \ + --torch-threads 48 \ + --dtype complex64 \ + --dask-address tcp://10.20.1.103:8786 \ + --tn-search-repeats 2048 \ + --tn-search-time 300 +``` + +Contract using the saved tree on one node: + +```bash +TCM_ENABLE=1 mpirun -np 2 python -u tools/tn_contest_runner.py contract \ + --mpi \ + --case main1 \ + --torch-threads 48 \ + --dtype complex64 +``` + +Contract using the saved tree on two nodes: + +```bash +TCM_ENABLE=1 mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \ + python -u tools/tn_contest_runner.py contract \ + --mpi \ + --case main1 \ + --torch-threads 48 \ + --dtype complex64 +``` + +Run search and contract in one command: + +```bash +TCM_ENABLE=1 python -u tools/tn_contest_runner.py all \ + --case main1 \ + --torch-threads 48 \ + --dtype complex64 \ + --dask-address tcp://10.20.1.103:8786 \ + --tn-search-repeats 2048 \ + --tn-search-time 300 +``` + +Run only selected observables: + +```bash +python -u tools/tn_contest_runner.py search \ + --case main2 \ + --observables open_zz +``` + +Tree files are written to `trees/contest_tn/` by default. The tree filename +contains case, observable, qubit count, layer count, and target slice count. +If any of these change, search again. + +Edit TN contest cases in `tools/tn_contest_runner.py`: + +- `CASES`: case name, circuit kind, observable list, default scale. +- `build_circuit`: circuit definitions. +- `pauli_sum_observable`: observable definitions. + +## MPS Workflow + +List built-in Vidal/MPS contest cases: + +```bash +python -u tools/mps_contest_runner.py list +``` + +Small correctness check against statevector: + +```bash +mpirun -np 2 python -u tools/mps_contest_runner.py validate \ + --case main1 \ + --nqubits 8 \ + --nlayers 2 \ + --bond 64 \ + --torch-threads 4 +``` + +Run one MPS case on one node: + +```bash +TCM_ENABLE=1 mpirun -np 2 python -u tools/mps_contest_runner.py run \ + --case main1 \ + --torch-threads 48 +``` + +Run one MPS case on two nodes: + +```bash +TCM_ENABLE=1 mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \ + python -u tools/mps_contest_runner.py run \ + --case main1 \ + --torch-threads 48 +``` + +Run only one observable: + +```bash +TCM_ENABLE=1 mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \ + python -u tools/mps_contest_runner.py run \ + --case main1 \ + --observables ring_xz \ + --torch-threads 48 +``` + +Override scale: + +```bash +TCM_ENABLE=1 mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \ + python -u tools/mps_contest_runner.py run \ + --case main1 \ + --nqubits 128 \ + --nlayers 24 \ + --bond 1024 \ + --torch-threads 48 +``` + +Edit MPS contest cases in `tools/mps_contest_runner.py`: + +- `CASES`: case name, circuit kind, observable list, default scale and bond. +- `build_circuit`: circuit definitions. +- `observable`: observable definitions, including dense local terms. + +## Notes + +- TN uses path search plus contraction. Reuse tree files only for the exact same + circuit, observable, qubit count, layer count, seed, and slicing setup. +- TN path search defaults to dask. Use `--tn-search-backend processpool` only + for fallback/debugging. +- Prefer the default `--tn-target-size 4294967296` memory target. Do not force + `--tn-target-slices` unless you have already verified that cotengra can find + valid trees for that exact setting. +- MPS/Vidal does not use contraction-tree search. It runs the circuit directly + and reports `trunc_sum` and `trunc_max`. +- Default TN contraction is the stable torch/quimb path. Do not pass + `--tn-contract-implementation cpp` for contest runs. diff --git a/hostfile b/hostfile index 338d16a..e596b93 100644 --- a/hostfile +++ b/hostfile @@ -1,2 +1,2 @@ -10.20.6.74 -#10.20.6.102 \ No newline at end of file +10.20.1.103:2 +10.20.1.102:2 diff --git a/src/qibotn/backends/__init__.py b/src/qibotn/backends/__init__.py index f2c1189..954f793 100644 --- a/src/qibotn/backends/__init__.py +++ b/src/qibotn/backends/__init__.py @@ -30,7 +30,7 @@ class MetaBackend: elif platform == "quimb": # pragma: no cover import qibotn.backends.quimb as qmb - quimb_backend = kwargs.get("quimb_backend", "numpy") + quimb_backend = kwargs.get("quimb_backend", "torch") contraction_optimizer = kwargs.get("contraction_optimizer", "auto-hq") return qmb.BACKENDS[quimb_backend]( quimb_backend=quimb_backend, contraction_optimizer=contraction_optimizer diff --git a/src/qibotn/backends/cpu.py b/src/qibotn/backends/cpu.py index 0abe8f7..5259fe0 100644 --- a/src/qibotn/backends/cpu.py +++ b/src/qibotn/backends/cpu.py @@ -3,6 +3,9 @@ from __future__ import annotations import os +import pickle +import time +from pathlib import Path import numpy as np from qibo import hamiltonians @@ -10,7 +13,12 @@ from qibo.backends import NumpyBackend from qibo.config import raise_error from qibotn.backends.abstract import QibotnBackend -from qibotn.backends.vidal import _symbolic_hamiltonian_to_pauli_terms, _unsupported_reason +from qibotn.backends.vidal import ( + _observable_mpo_tensors, + _operator_terms_to_mpo, + _symbolic_hamiltonian_to_operator_terms, + _unsupported_reason, +) from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor from qibotn.backends.vidal_tebd import VidalTEBDExecutor from qibotn.observables import check_observable @@ -23,6 +31,51 @@ def _as_bool_or_dict(value, name): raise TypeError(f"{name} has an unexpected type") +def _bind_numa_node(rank): + """Bind the calling process (or thread) to the NUMA node for *rank*. + + The MPI rank is converted to a local (per-node) rank through the + environment variables commonly set by Open MPI, MVAPICH, and Slurm. + The process CPU affinity and NUMA memory policy are set accordingly. + + Returns the NUMA domain that was selected, or ``None`` if the binding + could not be determined. + """ + local_rank = rank + for name in ( + "OMPI_COMM_WORLD_LOCAL_RANK", + "MV2_COMM_WORLD_LOCAL_RANK", + "MPI_LOCALRANKID", + "SLURM_LOCALID", + ): + try: + local_rank = int(os.environ[name]) + break + except (KeyError, ValueError): + pass + + domain = local_rank % 2 + cpulist = f"/sys/devices/system/node/node{domain}/cpulist" + try: + cpus = _parse_cpu_list(open(cpulist, encoding="utf-8").read().strip()) + if cpus: + os.sched_setaffinity(0, cpus) + except (FileNotFoundError, OSError): + pass + + try: + import ctypes + + libnuma = ctypes.CDLL("libnuma.so.1") + if libnuma.numa_available() >= 0: + libnuma.numa_run_on_node(ctypes.c_int(domain)) + libnuma.numa_set_preferred(ctypes.c_int(domain)) + except Exception: + pass + + return domain + + def _parse_cpu_list(text): cpus = set() for item in text.split(","): @@ -90,8 +143,15 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): ), ) self.tensor_module = runcard.get("tensor_module", "torch") + self.dtype = runcard.get("dtype", "complex128") + self.compile_circuit = bool(runcard.get("compile_circuit", False)) + self.preprocess = bool(runcard.get("preprocess", False)) + self.mpi_term_batch_size = runcard.get( + "mpi_term_batch_size", + runcard.get("term_batch_size", None), + ) self.torch_threads = runcard.get("torch_threads", None) - self.quimb_backend = runcard.get("quimb_backend", "numpy") + self.quimb_backend = runcard.get("quimb_backend", "torch") self.contraction_optimizer = runcard.get("contraction_optimizer", "auto-hq") self.parallel_opts = runcard.get("parallel_opts", {}) self.parallel_stats = [] @@ -117,7 +177,8 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): value = self.expectation(circuit, self.observable) if self.MPI_enabled and self.rank > 0: return np.asarray([0], dtype=np.int64) - return np.asarray([value], dtype=np.float64) + dtype = np.complex128 if np.iscomplexobj(value) else np.float64 + return np.asarray([value], dtype=dtype) backend = self._quimb_backend() backend.configure_tn_simulation( @@ -131,13 +192,26 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): return_array=return_array, ) - def expectation(self, circuit, observable=None, preprocess=True, compile_circuit=None): - observable = check_observable(observable, circuit.nqubits) + def expectation(self, circuit, observable=None, preprocess=None, compile_circuit=None): + mpo_tensors = _observable_mpo_tensors(observable, circuit.nqubits) + if mpo_tensors is None: + observable = check_observable(observable, circuit.nqubits) + use_preprocess = self.preprocess if preprocess is None else preprocess + if mpo_tensors is not None and not self.MPS_enabled: + raise_error( + NotImplementedError, + "MPO expectation is currently supported only by the Vidal MPS path.", + ) if self.MPS_enabled: reason = _unsupported_reason(circuit) - if reason is None: - return self._vidal_expectation(circuit, observable) + if reason is None or self.compile_circuit or use_preprocess: + return self._vidal_expectation( + circuit, + observable, + preprocess=use_preprocess, + compile_circuit=compile_circuit, + ) backend = self._quimb_backend() backend.configure_tn_simulation( @@ -149,8 +223,45 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): return self._quimb_expectation_mpi(backend, circuit, observable) return self._quimb_expectation_processpool(backend, circuit, observable) - def _vidal_expectation(self, circuit, observable): - terms = _symbolic_hamiltonian_to_pauli_terms(observable) + def _vidal_expectation( + self, circuit, observable, preprocess=False, compile_circuit=None + ): + if compile_circuit is None: + compile_circuit = self.compile_circuit + if preprocess: + if self.MPI_enabled: + from mpi4py import MPI + + self.rank = MPI.COMM_WORLD.Get_rank() + + from qibotn.backends.vidal import VidalBackend + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=self.max_bond_dimension, + cut_ratio=self.cut_ratio, + tensor_module=self.tensor_module, + compile_circuit=compile_circuit, + mpi_approach="CT" if self.MPI_enabled else "SR", + mpi_term_batch_size=self.mpi_term_batch_size, + fallback=False, + ) + value = backend.expectation( + circuit, + observable, + preprocess=True, + compile_circuit=compile_circuit, + ) + self.rank = getattr(backend, "rank", self.rank) + self.last_truncation_error = getattr( + backend, "last_truncation_error", np.nan + ) + self.last_max_truncation_error = getattr( + backend, "last_max_truncation_error", np.nan + ) + return value + + mpo_tensors = _observable_mpo_tensors(observable, circuit.nqubits) if self.MPI_enabled: from mpi4py import MPI @@ -164,8 +275,18 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): comm=comm, ) executor.run_circuit(circuit) - value = executor.expectation_pauli_sum_root(terms) - return np.nan if self.rank != 0 else float(np.real(value)) + self.last_truncation_error = float(executor.global_truncation_error()) + self.last_max_truncation_error = float( + executor.global_max_truncation_error() + ) + if mpo_tensors is not None: + value = executor.expectation_mpo_root(mpo_tensors) + else: + terms = _symbolic_hamiltonian_to_operator_terms(observable) + value = executor.expectation_mpo_root( + _operator_terms_to_mpo(terms, circuit.nqubits) + ) + return np.nan if self.rank != 0 else value executor = VidalTEBDExecutor( nqubits=circuit.nqubits, @@ -174,7 +295,12 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): tensor_module=self.tensor_module, ) executor.run_circuit(circuit) - return float(np.real(executor.expectation_pauli_sum(terms))) + self.last_truncation_error = float(executor.truncation_error) + self.last_max_truncation_error = float(executor.max_truncation_error) + if mpo_tensors is not None: + return executor.expectation_mpo(mpo_tensors) + terms = _symbolic_hamiltonian_to_operator_terms(observable) + return executor.expectation_mpo(_operator_terms_to_mpo(terms, circuit.nqubits)) def _quimb_backend(self): import qibotn.backends.quimb as qmb @@ -185,26 +311,7 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): ) def _bind_rank_to_numa_domain(self, rank): - domain = rank % 2 - cpulist = f"/sys/devices/system/node/node{domain}/cpulist" - try: - cpus = _parse_cpu_list(open(cpulist, encoding="utf-8").read().strip()) - if cpus: - os.sched_setaffinity(0, cpus) - except (FileNotFoundError, OSError): - pass - - try: - import ctypes - - libnuma = ctypes.CDLL("libnuma.so.1") - if libnuma.numa_available() >= 0: - libnuma.numa_run_on_node(ctypes.c_int(domain)) - libnuma.numa_set_preferred(ctypes.c_int(domain)) - except Exception: - pass - - self.numa_domain = domain + self.numa_domain = _bind_numa_node(rank) def _default_search_workers(self, nranks=1): if self.torch_threads: @@ -259,6 +366,22 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): search_time = opts.get("max_time", 60) search_backend = opts.get("search_backend") dask_address = opts.get("dask_address") + dask_close_workers = bool(opts.get("dask_close_workers", False)) + print_stats = bool(opts.get("print_stats", False)) + debug_trials = bool(opts.get("debug_trials", False)) + search_only = bool(opts.get("search_only", False)) + save_tree_path = opts.get("save_tree_path") + load_tree_path = opts.get("load_tree_path") + loaded_trees = None + saved_trees = [] + saved_costs = [] + + if load_tree_path: + with Path(load_tree_path).open("rb") as f: + payload = pickle.load(f) + loaded_trees = payload["trees"] if isinstance(payload, dict) else payload + if not isinstance(loaded_trees, (list, tuple)): + loaded_trees = [loaded_trees] qc = backend._qibo_circuit_to_quimb( circuit, @@ -271,7 +394,7 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): total_value = 0.0 + 0.0j terms = extract_gates_and_qubits(observable) - for coeff, factors in terms: + for term_index, (coeff, factors) in enumerate(terms): if not factors: if self.rank == 0: total_value += coeff @@ -297,31 +420,112 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): user_slicing_opts, ) - tree = parallel_path_search( - tn, - tn.outer_inds(), - method="dask" if method != "mpi" and search_backend == "dask" else method, - total_repeats=search_repeats, - max_time=search_time, - n_workers=search_workers, - slicing_opts=slicing_opts, - trial_timeout=opts.get("trial_timeout"), - search_backend=search_backend, - dask_address=dask_address, - ) + if loaded_trees is not None: + if term_index >= len(loaded_trees): + raise ValueError( + f"Loaded tree file has {len(loaded_trees)} tree(s), " + f"but term {term_index} was requested." + ) + tree = loaded_trees[term_index] + search_seconds = 0.0 + if self.rank == 0 and print_stats: + print( + f"tn_tree_loaded term={term_index} path={load_tree_path}", + flush=True, + ) + else: + search_start = time.perf_counter() + tree = parallel_path_search( + tn, + tn.outer_inds(), + method="dask" if method != "mpi" and search_backend == "dask" else method, + total_repeats=search_repeats, + max_time=search_time, + n_workers=search_workers, + slicing_opts=slicing_opts, + trial_timeout=opts.get("trial_timeout"), + search_backend=search_backend, + dask_address=dask_address, + debug_trials=debug_trials, + dask_close_workers=dask_close_workers, + ) + search_seconds = time.perf_counter() - search_start if tree is None: raise RuntimeError("Failed to find a contraction tree for CPU TN MPI.") + if self.parallel_opts.get("contract_implementation") == "cpp": + from qibotn.torch_contractor import prepare_torch_cpp_contractor - if int(getattr(tree, "multiplicity", 1)) <= 1: + prepare_torch_cpp_contractor(tree) + + path_cost = contraction_tree_costs(tree) + search_stats = getattr(tree, "qibotn_search_stats", {}) + if save_tree_path and loaded_trees is None: + saved_trees.append(tree) + saved_costs.append(path_cost) + if self.rank == 0 and print_stats: + print( + "tn_search_done " + f"term={term_index} " + f"search_seconds={search_seconds:.3f} " + f"completed_trials={search_stats.get('completed_trials', 'na')} " + f"finite_trials={search_stats.get('finite_trials', 'na')} " + f"failed_trials={search_stats.get('failed_trials', 'na')} " + f"requested_trials={search_stats.get('requested_trials', search_repeats)} " + f"best_score={search_stats.get('best_score', float('nan')):.6g} " + f"slices={path_cost['nslices']} " + f"log10_flops={path_cost['log10_flops']:.3f} " + f"log10_write={path_cost['log10_write']:.3f} " + f"log2_size={path_cost['log2_size']:.3f} " + f"log10_combo={path_cost['log10_combo']:.3f} " + f"peak_memory_gib={path_cost['peak_memory_gib']:.6g}", + flush=True, + ) + + if search_only: + self.parallel_stats.append( + { + "term_index": term_index, + "term_factors": tuple(factors), + "path_cost": path_cost, + "search_stats": search_stats, + "tree_slices": int(getattr(tree, "multiplicity", 1)), + "slice_assignment": "search_only", + "rank_slices": [], + "search_seconds": search_seconds, + "contract_seconds": 0.0, + "search_workers": search_workers, + "search_repeats": search_repeats, + "search_time": search_time, + "search_backend": search_backend or method, + "dask_address": dask_address, + "numa_domain": getattr(self, "numa_domain", None), + } + ) + continue + + if comm is None and int(getattr(tree, "multiplicity", 1)) <= 1: if self.rank == 0: + contract_start = time.perf_counter() value = self._contract_term_unsliced(tn, tree, backend) + contract_seconds = time.perf_counter() - contract_start + if print_stats: + print( + "tn_contract_done " + f"term={term_index} " + f"contract_seconds={contract_seconds:.3f}", + flush=True, + ) self.parallel_stats.append( { + "term_index": term_index, "term_factors": tuple(factors), - "path_cost": contraction_tree_costs(tree), + "path_cost": path_cost, + "search_stats": search_stats, "tree_slices": 1, "slice_assignment": "root", "rank_slices": [1] + [0] * (size - 1), + "search_seconds": search_seconds, + "contract_seconds": contract_seconds, "search_workers": search_workers, "search_repeats": search_repeats, "search_time": search_time, @@ -334,14 +538,27 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): continue if comm is None: + contract_start = time.perf_counter() value = self._contract_term_unsliced(tn, tree, backend) + contract_seconds = time.perf_counter() - contract_start + if print_stats: + print( + "tn_contract_done " + f"term={term_index} " + f"contract_seconds={contract_seconds:.3f}", + flush=True, + ) self.parallel_stats.append( { + "term_index": term_index, "term_factors": tuple(factors), - "path_cost": contraction_tree_costs(tree), + "path_cost": path_cost, + "search_stats": search_stats, "tree_slices": int(getattr(tree, "multiplicity", 1)), "slice_assignment": "local", "rank_slices": [int(getattr(tree, "multiplicity", 1))], + "search_seconds": search_seconds, + "contract_seconds": contract_seconds, "search_workers": search_workers, "search_repeats": search_repeats, "search_time": search_time, @@ -353,6 +570,7 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): total_value += coeff * complex(np.asarray(value).reshape(-1)[0]) continue + contract_start = time.perf_counter() arrays = self._term_arrays(tn, backend) value, stats = parallel_contract( tree, @@ -360,18 +578,31 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): method="mpi", comm=comm, return_stats=True, + implementation=self.parallel_opts.get("contract_implementation"), ) + contract_seconds = time.perf_counter() - contract_start gathered_stats = comm.gather(stats, root=0) if rank == 0: + if print_stats: + print( + "tn_contract_done " + f"term={term_index} " + f"contract_seconds={contract_seconds:.3f}", + flush=True, + ) self.parallel_stats.append( { + "term_index": term_index, "term_factors": tuple(factors), - "path_cost": contraction_tree_costs(tree), + "path_cost": path_cost, + "search_stats": search_stats, "tree_slices": stats.nslices, "slice_assignment": stats.assignment, "rank_slices": [ item.local_slices for item in gathered_stats ], + "search_seconds": search_seconds, + "contract_seconds": contract_seconds, "search_workers": search_workers, "search_repeats": search_repeats, "search_time": search_time, @@ -382,22 +613,73 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): ) total_value += coeff * complex(np.asarray(value).reshape(-1)[0]) + if self.rank == 0 and save_tree_path and loaded_trees is None: + path = Path(save_tree_path) + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("wb") as f: + pickle.dump( + { + "trees": saved_trees, + "costs": saved_costs, + "nterms": len(saved_trees), + }, + f, + protocol=pickle.HIGHEST_PROTOCOL, + ) + if print_stats: + print( + f"tn_tree_saved path={save_tree_path} nterms={len(saved_trees)}", + flush=True, + ) + + if search_only: + return np.nan + return np.nan if rank != 0 else float(np.real(total_value)) def _contract_term_unsliced(self, tn, tree, backend): + contract_implementation = self.parallel_opts.get("contract_implementation") + if contract_implementation == "cpp": + if backend.backend != "torch": + raise ValueError("contract_implementation='cpp' requires torch backend.") + from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype + from qibotn.torch_contractor import contract_tree_cpp + + arrays = [ + _torch_cpu_array(array, dtype=_torch_dtype(self.dtype)) + for array in tn.arrays + ] + nslices = int(getattr(tree, "multiplicity", 1)) + if nslices > 1: + total = None + for slice_id in range(nslices): + value = contract_tree_cpp(tree, tree.slice_arrays(arrays, slice_id)) + total = value if total is None else total + value + return total + return contract_tree_cpp(tree, arrays) + if backend.backend == "torch": - import torch - from qibotn.backends.quimb import _torch_cpu_array + from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype for tensor in tn.tensors: - tensor._data = _torch_cpu_array(tensor._data, dtype=torch.complex128) - return tn.contract(all, output_inds=(), optimize=tree, backend="torch") + tensor._data = _torch_cpu_array( + tensor._data, + dtype=_torch_dtype(self.dtype), + ) + return tn.contract( + all, + output_inds=(), + optimize=tree, + backend="torch", + implementation=contract_implementation, + ) return tn.contract( all, output_inds=(), optimize=tree, backend=backend.backend, + implementation=contract_implementation, ) def _mpi_slicing_opts(self, user_slicing_opts): @@ -405,11 +687,12 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): def _term_arrays(self, tn, backend): if backend.backend == "torch": - import torch - from qibotn.backends.quimb import _torch_cpu_array + from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype return [ - _torch_cpu_array(array, dtype=torch.complex128) + _torch_cpu_array(array, dtype=_torch_dtype(self.dtype)) for array in tn.arrays ] - return [backend.engine.asarray(array) for array in tn.arrays] + from qibotn.backends.quimb import _numpy_dtype + + return [backend.engine.asarray(array, dtype=_numpy_dtype(self.dtype)) for array in tn.arrays] diff --git a/src/qibotn/backends/quimb.py b/src/qibotn/backends/quimb.py index ea894d9..3d49b00 100644 --- a/src/qibotn/backends/quimb.py +++ b/src/qibotn/backends/quimb.py @@ -62,12 +62,26 @@ def _torch_cpu_array(data, dtype=None): return x -def _arrays_to_backend(arrays, backend, engine): - if backend == "torch": - import torch +def _torch_dtype(dtype): + import torch - return [_torch_cpu_array(array, dtype=torch.complex128) for array in arrays] - return [engine.asarray(array) for array in arrays] + if dtype in ("complex64", "single"): + return torch.complex64 + return torch.complex128 + + +def _numpy_dtype(dtype): + import numpy as np + + if dtype in ("complex64", "single"): + return np.complex64 + return np.complex128 + + +def _arrays_to_backend(arrays, backend, engine, dtype="complex128"): + if backend == "torch": + return [_torch_cpu_array(array, dtype=_torch_dtype(dtype)) for array in arrays] + return [engine.asarray(array, dtype=_numpy_dtype(dtype)) for array in arrays] def _pauli_term_to_dense_operator(factors): @@ -145,7 +159,7 @@ def pauli_product_expectation( return tn.contract(all, output_inds=(), optimize=optimize, backend=backend) -def __init__(self, quimb_backend="numpy", contraction_optimizer="auto-hq"): +def __init__(self, quimb_backend="torch", contraction_optimizer="auto-hq"): super(self.__class__, self).__init__() self.name = "qibotn" @@ -198,7 +212,7 @@ def circuit_ansatz(self): def setup_backend_specifics( - self, quimb_backend="numpy", contractions_optimizer="auto-hq" + self, quimb_backend="torch", contractions_optimizer="auto-hq" ): """Setup backend specifics. Args: @@ -645,7 +659,7 @@ METHODS = { } -def _generate_backend(quimb_backend: str = "numpy"): +def _generate_backend(quimb_backend: str = "torch"): bases = (QibotnBackend,) if quimb_backend == "numpy": @@ -653,9 +667,14 @@ def _generate_backend(quimb_backend: str = "numpy"): bases += (NumpyBackend,) elif quimb_backend == "torch": - from qiboml.backends import PyTorchBackend + try: + from qiboml.backends import PyTorchBackend + except ImportError: + from qibo.backends import NumpyBackend - bases += (PyTorchBackend,) + bases += (NumpyBackend,) + else: + bases += (PyTorchBackend,) elif quimb_backend == "jax": from qiboml.backends import JaxBackend diff --git a/src/qibotn/backends/vidal.py b/src/qibotn/backends/vidal.py index 02c8a93..8fbd4d5 100644 --- a/src/qibotn/backends/vidal.py +++ b/src/qibotn/backends/vidal.py @@ -39,6 +39,162 @@ def _symbolic_hamiltonian_to_pauli_terms(hamiltonian): return terms +def _symbolic_hamiltonian_to_operator_terms(hamiltonian): + terms = [] + factor_pattern = re.compile(r"([^\d]+)(\d+)") + paulis = { + "I": np.eye(2, dtype=np.complex128), + "X": np.array([[0, 1], [1, 0]], dtype=np.complex128), + "Y": np.array([[0, -1j], [1j, 0]], dtype=np.complex128), + "Z": np.array([[1, 0], [0, -1]], dtype=np.complex128), + } + for term in hamiltonian.terms: + ops_by_site = {} + for factor in term.factors: + site = getattr(factor, "target_qubit", None) + matrix = getattr(factor, "matrix", None) + if site is None or matrix is None: + match = factor_pattern.match(str(factor)) + if match is None: + raise ValueError(f"Unsupported observable factor {factor!r}.") + name = match.group(1).upper() + if name not in paulis: + raise ValueError(f"Unsupported observable operator {name!r}.") + site = int(match.group(2)) + matrix = paulis[name] + matrix = np.asarray(matrix, dtype=np.complex128) + site = int(site) + if site in ops_by_site: + ops_by_site[site] = ops_by_site[site] @ matrix + else: + ops_by_site[site] = matrix + terms.append((complex(term.coefficient), tuple(ops_by_site.items()))) + return terms + + +def _dense_operator_to_product_terms(coeff, qubits, matrix): + """Expand a dense k-local operator into product-matrix terms. + + The dense matrix basis is ordered by the provided ``qubits`` sequence. For + example, ``qubits=[2, 5]`` means matrix rows/columns are ordered as + ``|q2 q5>``. + """ + qubits = tuple(int(qubit) for qubit in qubits) + if len(set(qubits)) != len(qubits): + raise ValueError("Dense observable qubits must be unique.") + matrix = np.asarray(matrix, dtype=np.complex128) + dim = 2 ** len(qubits) + if matrix.shape != (dim, dim): + raise ValueError( + "Dense observable matrix shape must be " + f"({dim}, {dim}) for {len(qubits)} qubits." + ) + + units = [ + np.array([[1, 0], [0, 0]], dtype=np.complex128), + np.array([[0, 1], [0, 0]], dtype=np.complex128), + np.array([[0, 0], [1, 0]], dtype=np.complex128), + np.array([[0, 0], [0, 1]], dtype=np.complex128), + ] + terms = [] + for row in range(dim): + for col in range(dim): + value = complex(coeff) * complex(matrix[row, col]) + if value == 0: + continue + ops = [] + for offset, site in enumerate(qubits): + shift = len(qubits) - offset - 1 + out_bit = (row >> shift) & 1 + in_bit = (col >> shift) & 1 + ops.append((site, units[2 * out_bit + in_bit])) + terms.append((value, tuple(ops))) + return terms + + +def _dense_observable_to_operator_terms(observable): + if not isinstance(observable, dict): + return None + + if "matrix" in observable: + terms = [observable] + else: + terms = observable.get("dense_terms") + if terms is None: + raw_terms = observable.get("terms") + if not raw_terms or not any("matrix" in term for term in raw_terms): + return None + terms = raw_terms + + operator_terms = [] + for term in terms: + if "matrix" not in term: + raise ValueError("Dense observable terms must include a matrix.") + qubits = term.get("qubits", term.get("sites")) + if qubits is None: + raise ValueError("Dense observable terms must include qubits or sites.") + operator_terms.extend( + _dense_operator_to_product_terms( + term.get("coefficient", 1.0), + qubits, + term["matrix"], + ) + ) + return operator_terms + + +def _operator_terms_to_mpo(terms, nqubits): + """Build an exact direct-sum MPO for product-operator terms. + + This intentionally favors correctness and generality over compression: an + ``m``-term sum becomes an MPO with bond dimension ``m``. Local Hamiltonians + can be compressed later without changing the public expectation path. + """ + identity = np.eye(2, dtype=np.complex128) + expanded_terms = [] + for coeff, ops in terms: + local_ops = [identity for _ in range(nqubits)] + for site, matrix in ops: + site = int(site) + if site < 0 or site >= nqubits: + raise ValueError(f"Observable site {site} is outside the circuit.") + matrix = np.asarray(matrix, dtype=np.complex128) + if matrix.shape != (2, 2): + raise ValueError("Only qubit local operators with shape (2, 2) are supported.") + local_ops[site] = matrix + expanded_terms.append((complex(coeff), local_ops)) + + if not expanded_terms: + raise ValueError("Cannot build an MPO from an empty observable.") + + bond_dim = len(expanded_terms) + mpo = [] + for site in range(nqubits): + left_dim = 1 if site == 0 else bond_dim + right_dim = 1 if site == nqubits - 1 else bond_dim + tensor = np.zeros((left_dim, 2, 2, right_dim), dtype=np.complex128) + for term_index, (coeff, local_ops) in enumerate(expanded_terms): + left = 0 if site == 0 else term_index + right = 0 if site == nqubits - 1 else term_index + op = coeff * local_ops[site] if site == 0 else local_ops[site] + tensor[left, :, :, right] += op + mpo.append(tensor) + return mpo + + +def _observable_mpo_tensors(observable, nqubits=None): + if isinstance(observable, dict): + if "mpo_tensors" in observable: + return observable["mpo_tensors"] + if "mpo" in observable: + return observable["mpo"] + if nqubits is not None: + terms = _dense_observable_to_operator_terms(observable) + if terms is not None: + return _operator_terms_to_mpo(terms, nqubits) + return None + + def _unsupported_reason(circuit): for gate in circuit.queue: name = getattr(gate, "name", gate.__class__.__name__) @@ -71,6 +227,44 @@ def _can_route_non_adjacent(circuit): return True +@dataclass +class _PreparedCircuit: + nqubits: int + queue: list + + +def _decompose_gate_for_mps(gate, nqubits, stack=()): + sites = _gate_sites(gate) + if len(sites) <= 2: + return [gate] + if gate in stack or not hasattr(gate, "decompose"): + name = getattr(gate, "name", gate.__class__.__name__) + raise ValueError(f"gate {name} acts on {len(sites)} qubits") + + free = [qubit for qubit in range(nqubits) if qubit not in sites] + try: + decomposed = gate.decompose(*free, use_toffolis=False, method="standard") + except TypeError: + decomposed = gate.decompose(*free) + if not decomposed or decomposed == [gate]: + name = getattr(gate, "name", gate.__class__.__name__) + raise ValueError(f"gate {name} could not be decomposed for Vidal MPS") + + result = [] + for item in decomposed: + result.extend(_decompose_gate_for_mps(item, nqubits, stack + (gate,))) + return result + + +def _prepare_circuit_for_mps(circuit, decompose=True): + if not decompose: + return circuit + queue = [] + for gate in circuit.queue: + queue.extend(_decompose_gate_for_mps(gate, circuit.nqubits)) + return _PreparedCircuit(nqubits=circuit.nqubits, queue=queue) + + @dataclass class VidalBackend(QibotnBackend, NumpyBackend): """QiboTN backend using Vidal/TEBD when possible. @@ -89,14 +283,16 @@ class VidalBackend(QibotnBackend, NumpyBackend): self.name = "qibotn" self.platform = "vidal" self.precision = "double" + self.rank = 0 self.last_truncation_error = 0.0 + self.last_max_truncation_error = 0.0 self.configure_tn_simulation() def configure_tn_simulation( self, ansatz: str = "MPS", - max_bond_dimension: int = 10, - cut_ratio: float = 1e-9, + max_bond_dimension: int | None = 10, + cut_ratio: float | None = 1e-9, trunc_tracking_mode: str = "C", svd_control: str = "E!", ini_bond_dimension: int = 1, @@ -108,6 +304,7 @@ class VidalBackend(QibotnBackend, NumpyBackend): mpi_num_procs: int = 1, mpi_where_barriers: int = -1, mpi_isometrization: int = -1, + mpi_term_batch_size: int | None = None, fallback: bool = True, ): self.ansatz = ansatz @@ -124,6 +321,7 @@ class VidalBackend(QibotnBackend, NumpyBackend): self.mpi_num_procs = mpi_num_procs self.mpi_where_barriers = mpi_where_barriers self.mpi_isometrization = mpi_isometrization + self.mpi_term_batch_size = mpi_term_batch_size self.fallback = fallback self._fallback_backend = None @@ -157,10 +355,15 @@ class VidalBackend(QibotnBackend, NumpyBackend): raise NotImplementedError(reason) return self._qmatchatea_fallback() + def _preprocess_circuit(self, circuit, compile_circuit): + """Decompose unsupported multi-qubit gates for the local Vidal path.""" + return _prepare_circuit_for_mps(circuit, decompose=True) + def _run_fast_executor(self, circuit, compile_circuit=True): if self.mpi_approach == "CT": from mpi4py import MPI + self.rank = MPI.COMM_WORLD.Get_rank() executor = SegmentVidalMPIExecutor( nqubits=circuit.nqubits, max_bond=self.max_bond_dimension, @@ -169,6 +372,7 @@ class VidalBackend(QibotnBackend, NumpyBackend): comm=MPI.COMM_WORLD, ) else: + self.rank = 0 executor = VidalTEBDExecutor( nqubits=circuit.nqubits, max_bond=self.max_bond_dimension, @@ -183,9 +387,21 @@ class VidalBackend(QibotnBackend, NumpyBackend): backend = self._fallback_or_raise("VidalBackend supports only MPS.") return backend.expectation(circuit, observable, preprocess, compile_circuit) + original_circuit = circuit if compile_circuit is None: compile_circuit = self.compile_circuit + if preprocess: + try: + circuit = self._preprocess_circuit(circuit, compile_circuit) + except Exception as exc: + backend = self._fallback_or_raise( + f"VidalBackend preprocessing failed: {exc}" + ) + return backend.expectation( + original_circuit, observable, preprocess, compile_circuit + ) + reason = _unsupported_reason(circuit) if reason is not None: # Non-adjacent gates can be routed at compile time @@ -193,40 +409,51 @@ class VidalBackend(QibotnBackend, NumpyBackend): pass # proceed with Vidal + SWAP routing else: backend = self._fallback_or_raise(reason) - return backend.expectation(circuit, observable, preprocess, compile_circuit) + return backend.expectation( + original_circuit, observable, preprocess, compile_circuit + ) - # preprocess=True requests qmatchatea-style transpilation to MPS topology. - # The fast path validates MPS-compatibility via _unsupported_reason above; - # if the circuit is already MPS-compatible, preprocessing is a no-op. - # If it isn't, _unsupported_reason already routed to fallback. - # So the fast path skips preprocessing — just warn once. - if preprocess: - import warnings + executor = self._run_fast_executor(circuit, compile_circuit=compile_circuit) + self.last_truncation_error = float( + executor.global_truncation_error() + if hasattr(executor, "global_truncation_error") + else executor.truncation_error + ) + self.last_max_truncation_error = float( + executor.global_max_truncation_error() + if hasattr(executor, "global_max_truncation_error") + else executor.max_truncation_error + ) - warnings.warn( - "VidalBackend fast path does not preprocess circuits. " - "If the circuit passes _unsupported_reason (all gates are " - "MPS-compatible), fast-path execution proceeds directly.", - stacklevel=2, - ) + mpo_tensors = _observable_mpo_tensors(observable, circuit.nqubits) + if mpo_tensors is not None: + if self.mpi_approach == "CT": + value = executor.expectation_mpo_root(mpo_tensors) + from qtealeaves.tooling.mpisupport import MPI + + if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0: + return np.nan + return value + return executor.expectation_mpo(mpo_tensors) hamiltonian = check_observable(observable, circuit.nqubits) try: - terms = _symbolic_hamiltonian_to_pauli_terms(hamiltonian) + terms = _symbolic_hamiltonian_to_operator_terms(hamiltonian) except ValueError as exc: backend = self._fallback_or_raise(str(exc)) - return backend.expectation(circuit, observable, preprocess, compile_circuit) + return backend.expectation( + original_circuit, observable, preprocess, compile_circuit + ) - executor = self._run_fast_executor(circuit, compile_circuit=compile_circuit) - self.last_truncation_error = float(executor.truncation_error) + mpo_tensors = _operator_terms_to_mpo(terms, circuit.nqubits) if self.mpi_approach == "CT": - value = executor.expectation_pauli_sum_root(terms) + value = executor.expectation_mpo_root(mpo_tensors) from qtealeaves.tooling.mpisupport import MPI if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0: return np.nan - return np.real(value) - return np.real(executor.expectation_pauli_sum(terms)) + return value + return executor.expectation_mpo(mpo_tensors) def execute_circuit( self, diff --git a/src/qibotn/backends/vidal_mpi_segment.py b/src/qibotn/backends/vidal_mpi_segment.py index c0742c6..1014588 100644 --- a/src/qibotn/backends/vidal_mpi_segment.py +++ b/src/qibotn/backends/vidal_mpi_segment.py @@ -23,6 +23,7 @@ from qibotn.backends.vidal_tebd import ( _is_two_qubit_batch, _make_two_site_update, _ones, + _real_if_close, _route_non_adjacent_gates, _svd, _tensor_update_from_numpy, @@ -35,6 +36,8 @@ from qibotn.backends.vidal_tebd import ( _EDGE_TAG = 1701 _UPDATE_TAG = 1702 +_EXPECT_ENV_TAG = 1703 +_EXPECT_RESULT_TAG = 1704 def _partition_sites(nsites, nranks): @@ -49,9 +52,9 @@ def _partition_sites(nsites, nranks): @dataclass class SegmentVidalMPIExecutor: nqubits: int - max_bond: int + max_bond: int | None comm: object - cut_ratio: float = 1e-12 + cut_ratio: float | None = 1e-12 tensor_module: str = "torch" def __post_init__(self): @@ -63,6 +66,10 @@ class SegmentVidalMPIExecutor: if self.start == self.end: raise ValueError("SegmentVidalMPIExecutor requires at least one site per rank.") + from qibotn.backends.cpu import _bind_numa_node + + self.numa_domain = _bind_numa_node(self.rank) + self.xp = _backend_module(self.tensor_module) if self.xp is np: self.dtype = np.complex128 @@ -82,11 +89,22 @@ class SegmentVidalMPIExecutor: for bond in range(self.start, self.end + 1) } self._accumulated_truncation_error = 0.0 + self._max_truncation_error = 0.0 @property def truncation_error(self): return self._accumulated_truncation_error + def global_truncation_error(self): + return self.comm.allreduce(self._accumulated_truncation_error, op=MPI.SUM) + + @property + def max_truncation_error(self): + return self._max_truncation_error + + def global_max_truncation_error(self): + return self.comm.allreduce(self._max_truncation_error, op=MPI.MAX) + def owns_site(self, site): return self.start <= site < self.end @@ -270,7 +288,12 @@ class SegmentVidalMPIExecutor: def _install_update(self, update): if isinstance(update["left"], np.ndarray): update = _tensor_update_from_numpy(self.xp, update, self.dtype) - self._accumulated_truncation_error += update.get("truncation_error", 0.0) + truncation_error = update.get("truncation_error", 0.0) + self._accumulated_truncation_error += truncation_error + self._max_truncation_error = max( + self._max_truncation_error, + truncation_error, + ) site = update["site"] if self.owns_site(site): self.gammas[site] = update["left"] @@ -288,26 +311,189 @@ class SegmentVidalMPIExecutor: } return self.comm.gather(payload, root=0) - def expectation_pauli_sum_root(self, terms): - payloads = self.gather_full_state() - if self.rank != 0: - return None - full = VidalTEBDExecutor( - self.nqubits, - self.max_bond, - cut_ratio=self.cut_ratio, - tensor_module=self.tensor_module, + def expectation_pauli_sum_root(self, terms, term_batch_size=None): + paulis = { + "I": self._eye(2), + "X": _asarray(self.xp, [[0, 1], [1, 0]], self.dtype), + "Y": _asarray(self.xp, [[0, -1j], [1j, 0]], self.dtype), + "Z": _asarray(self.xp, [[1, 0], [0, -1]], self.dtype), + } + operator_terms = [ + ( + coeff, + tuple((site, paulis[name.upper()]) for name, site in ops), + ) + for coeff, ops in terms + ] + return self.expectation_operator_sum_root( + operator_terms, + term_batch_size=term_batch_size, ) - for payload in payloads: - for site, tensor in payload["gammas"].items(): - full.gammas[int(site)] = _asarray(self.xp, tensor, self.dtype) - for bond, tensor in payload["lambdas"].items(): - full.lambdas[int(bond)] = _asarray( - self.xp, - tensor, - self.xp.float64 if self.xp is not np else np.float64, + + def expectation_operator_sum_root(self, terms, term_batch_size=None): + if term_batch_size is None: + term_batch_size = max(1, len(terms)) + norm = self._distributed_product_expectation({}) + total = 0.0 + 0.0j + for start in range(0, len(terms), int(term_batch_size)): + batch = terms[start : start + int(term_batch_size)] + values = self._distributed_operator_batch_expectation(batch, norm) + if self.rank == 0: + for (coeff, _), term_value in zip(batch, values): + total += complex(coeff) * complex(term_value) + return None if self.rank != 0 else _real_if_close(total / norm) + + def _eye(self, size): + if self.xp is np: + return np.eye(size, dtype=self.dtype) + return self.xp.eye(size, dtype=self.dtype, device=self.device) + + def _distributed_product_expectation(self, operators): + if self.rank == 0: + env = self._segment_product_environment(operators) + if self.size == 1: + return env.reshape(-1)[0] + self.comm.send(_to_numpy(env), dest=1, tag=_EXPECT_ENV_TAG) + return self.comm.recv(source=self.size - 1, tag=_EXPECT_RESULT_TAG) + + incoming = self.comm.recv(source=self.rank - 1, tag=_EXPECT_ENV_TAG) + env = self._segment_product_environment(operators, incoming) + if self.rank == self.size - 1: + self.comm.send(_to_numpy(env).reshape(-1)[0], dest=0, tag=_EXPECT_RESULT_TAG) + else: + self.comm.send(_to_numpy(env), dest=self.rank + 1, tag=_EXPECT_ENV_TAG) + return None + + def _segment_product_environment(self, operators, incoming=None): + if incoming is None: + env = _asarray( + self.xp, + np.eye(len(self.lambdas[self.start]), dtype=np.complex128), + self.dtype, + ) + else: + env = _asarray(self.xp, incoming, self.dtype) + + identity = self._eye(2) + for site in range(self.start, self.end): + tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1) + op = operators.get(site, identity) + env = self.xp.einsum( + "xy,xsb,st,ytd->bd", env, self._conj(tensor), op, tensor + ) + return env + + def _distributed_operator_batch_expectation(self, terms, norm): + if not terms: + return [] + if all(not ops for _, ops in terms): + return [norm] * len(terms) if self.rank == 0 else None + + batch_ops = [ + {int(site): _asarray(self.xp, matrix, self.dtype) for site, matrix in ops} + for _, ops in terms + ] + if self.rank == 0: + env = self._segment_operator_batch_environment(batch_ops) + if self.size == 1: + return list(env.reshape(len(terms), -1)[:, 0]) + self.comm.send(_to_numpy(env), dest=1, tag=_EXPECT_ENV_TAG) + return self.comm.recv(source=self.size - 1, tag=_EXPECT_RESULT_TAG) + + incoming = self.comm.recv(source=self.rank - 1, tag=_EXPECT_ENV_TAG) + env = self._segment_operator_batch_environment(batch_ops, incoming) + if self.rank == self.size - 1: + values = list(_to_numpy(env).reshape(len(terms), -1)[:, 0]) + self.comm.send(values, dest=0, tag=_EXPECT_RESULT_TAG) + else: + self.comm.send(_to_numpy(env), dest=self.rank + 1, tag=_EXPECT_ENV_TAG) + return None + + def _segment_operator_batch_environment(self, batch_ops, incoming=None): + batch_size = len(batch_ops) + if incoming is None: + dim = len(self.lambdas[self.start]) + env = _asarray( + self.xp, + np.tile(np.eye(dim, dtype=np.complex128), (batch_size, 1, 1)), + self.dtype, + ) + else: + env = _asarray(self.xp, incoming, self.dtype) + + identity = self._eye(2) + for site in range(self.start, self.end): + tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1) + ops = self.xp.stack( + [operators.get(site, identity) for operators in batch_ops], + axis=0, + ) + env = self.xp.einsum( + "nxy,xsb,nst,ytd->nbd", + env, + self._conj(tensor), + ops, + tensor, + ) + return env + + def _conj(self, tensor): + return np.conjugate(tensor) if self.xp is np else tensor.conj() + + def expectation_mpo_root(self, mpo_tensors): + if len(mpo_tensors) != self.nqubits: + raise ValueError( + f"Expected {self.nqubits} MPO tensors, got {len(mpo_tensors)}." + ) + norm = self._distributed_product_expectation({}) + if self.rank == 0: + env = self._segment_mpo_environment(mpo_tensors) + if self.size == 1: + return _real_if_close(env.reshape(-1)[0] / norm) + self.comm.send(_to_numpy(env), dest=1, tag=_EXPECT_ENV_TAG) + value = self.comm.recv(source=self.size - 1, tag=_EXPECT_RESULT_TAG) + return _real_if_close(value / norm) + + incoming = self.comm.recv(source=self.rank - 1, tag=_EXPECT_ENV_TAG) + env = self._segment_mpo_environment(mpo_tensors, incoming) + if self.rank == self.size - 1: + self.comm.send( + _to_numpy(env).reshape(-1)[0], + dest=0, + tag=_EXPECT_RESULT_TAG, + ) + else: + self.comm.send(_to_numpy(env), dest=self.rank + 1, tag=_EXPECT_ENV_TAG) + return None + + def _segment_mpo_environment(self, mpo_tensors, incoming=None): + if incoming is None: + left_dim = len(self.lambdas[self.start]) + env = _asarray( + self.xp, + np.zeros((left_dim, 1, left_dim), dtype=np.complex128), + self.dtype, + ) + env[:, 0, :] = self._eye(left_dim) + else: + env = _asarray(self.xp, incoming, self.dtype) + + for site in range(self.start, self.end): + mpo = _asarray(self.xp, mpo_tensors[site], self.dtype) + if mpo.ndim != 4 or mpo.shape[1:3] != (2, 2): + raise ValueError( + "Each MPO tensor must have shape " + "(left_bond, 2, 2, right_bond)." ) - return full.expectation_pauli_sum(terms) + tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1) + env = self.xp.einsum( + "xlc,xub,lutr,ctd->brd", + env, + self._conj(tensor), + mpo, + tensor, + ) + return env def expectation_ring_xz_root(self): terms = [ diff --git a/src/qibotn/backends/vidal_tebd.py b/src/qibotn/backends/vidal_tebd.py index e2b2640..25f001e 100644 --- a/src/qibotn/backends/vidal_tebd.py +++ b/src/qibotn/backends/vidal_tebd.py @@ -50,10 +50,10 @@ def _transpose(xp, tensor, axes): return np.transpose(tensor, axes) if xp is np else tensor.permute(*axes) -def _vdot_real(xp, left, right): +def _vdot(xp, left, right): if xp is np: - return np.vdot(left.reshape(-1), right.reshape(-1)).real - return xp.vdot(left.reshape(-1), right.reshape(-1)).real + return np.vdot(left.reshape(-1), right.reshape(-1)) + return xp.vdot(left.reshape(-1), right.reshape(-1)) def _to_float(x): @@ -62,6 +62,19 @@ def _to_float(x): return float(x) +def _to_scalar(x): + if hasattr(x, "detach"): + return x.detach().cpu().item() + if isinstance(x, np.ndarray): + return x.item() + return x + + +def _real_if_close(x, tol=1000): + value = np.real_if_close(x, tol=tol) + return value.item() if isinstance(value, np.ndarray) else value + + def _to_numpy(tensor): if hasattr(tensor, "detach"): return tensor.detach().cpu().numpy() @@ -154,8 +167,8 @@ def _safe_inverse(xp, values): @dataclass class VidalTEBDExecutor: nqubits: int - max_bond: int - cut_ratio: float = 1e-12 + max_bond: int | None + cut_ratio: float | None = 1e-12 tensor_module: str = "torch" def __post_init__(self): @@ -175,6 +188,7 @@ class VidalTEBDExecutor: _ones(self.xp, 1, self.dtype, self.device) for _ in range(self.nqubits + 1) ] self._accumulated_truncation_error = 0.0 + self._max_truncation_error = 0.0 def run_circuit(self, circuit, compile_circuit=True): gates = circuit.queue @@ -189,6 +203,10 @@ class VidalTEBDExecutor: def truncation_error(self): return self._accumulated_truncation_error + @property + def max_truncation_error(self): + return self._max_truncation_error + def _apply_gate(self, gate): sites = _gate_sites(gate) matrix = _asarray(self.xp, gate.matrix(), self.dtype) @@ -232,6 +250,10 @@ class VidalTEBDExecutor: update = _make_two_site_update(item, umat, singvals, vh, self.max_bond, self.cut_ratio, self.xp) self._accumulated_truncation_error += update["truncation_error"] + self._max_truncation_error = max( + self._max_truncation_error, + update["truncation_error"], + ) i = update["site"] self.gammas[i] = update["left"] self.gammas[i + 1] = update["right"] @@ -252,25 +274,37 @@ class VidalTEBDExecutor: "Y": _asarray(self.xp, [[0, -1j], [1j, 0]], self.dtype), "Z": _asarray(self.xp, [[1, 0], [0, -1]], self.dtype), } - value = 0.0 + operator_terms = [ + ( + coeff, + tuple((site, paulis[name.upper()]) for name, site in ops), + ) + for coeff, ops in terms + ] + return self.expectation_operator_sum(operator_terms) + + def expectation_operator_sum(self, terms): + value = 0.0 + 0.0j norm = self.norm() for coeff, ops in terms: - ops = tuple((name.upper(), site) for name, site in ops) + operators = { + int(site): _asarray(self.xp, matrix, self.dtype) + for site, matrix in ops + } if len(ops) == 0: term_value = norm - elif len(ops) == 1: - name, site = ops[0] - term_value = _to_float(self._expect_one_site(site, paulis[name])) - elif len(ops) == 2 and abs(ops[0][1] - ops[1][1]) == 1: - (name0, site0), (name1, site1) = sorted(ops, key=lambda item: item[1]) - term_value = _to_float( - self._expect_adjacent(site0, paulis[name0], paulis[name1]) + elif len(operators) == 1: + site, matrix = next(iter(operators.items())) + term_value = _to_scalar(self._expect_one_site(site, matrix)) + elif len(operators) == 2 and abs(max(operators) - min(operators)) == 1: + site0, site1 = sorted(operators) + term_value = _to_scalar( + self._expect_adjacent(site0, operators[site0], operators[site1]) ) else: - operators = {site: paulis[name] for name, site in ops} - term_value = _to_float(self.expect_product_operators(operators)) - value += float(np.real(coeff)) * term_value - return value / norm + term_value = _to_scalar(self.expect_product_operators(operators)) + value += complex(coeff) * complex(term_value) + return _real_if_close(value / norm) def _expect_one_site(self, site, op): theta = self.xp.einsum( @@ -280,7 +314,7 @@ class VidalTEBDExecutor: self.lambdas[site + 1], ) op_theta = self.xp.einsum("us,asb->aub", op, theta) - return _vdot_real(self.xp, theta, op_theta) + return _vdot(self.xp, theta, op_theta) def _expect_adjacent(self, site, op_left, op_right): theta = self.xp.einsum( @@ -292,7 +326,7 @@ class VidalTEBDExecutor: self.lambdas[site + 2], ) op_theta = self.xp.einsum("us,vt,astc->auvc", op_left, op_right, theta) - return _vdot_real(self.xp, theta, op_theta) + return _vdot(self.xp, theta, op_theta) def expect_product_operators(self, operators): env = _asarray(self.xp, [[1.0 + 0.0j]], self.dtype) @@ -303,10 +337,38 @@ class VidalTEBDExecutor: env = self.xp.einsum( "xy,xsb,st,ytd->bd", env, _conj(self.xp, tensor), op, tensor ) - return env.reshape(-1)[0].real + return env.reshape(-1)[0] def norm(self): - return _to_float(self.expect_product_operators({})) + return float(np.real(_to_scalar(self.expect_product_operators({})))) + + def expectation_mpo(self, mpo_tensors): + """Compute `` / ``. + + MPO tensors are expected in ``(left_bond, phys_out, phys_in, right_bond)`` + order, with physical dimension 2 on every site. + """ + if len(mpo_tensors) != self.nqubits: + raise ValueError( + f"Expected {self.nqubits} MPO tensors, got {len(mpo_tensors)}." + ) + env = _asarray(self.xp, [[[1.0 + 0.0j]]], self.dtype) + for site, raw_mpo in enumerate(mpo_tensors): + mpo = _asarray(self.xp, raw_mpo, self.dtype) + if mpo.ndim != 4 or mpo.shape[1:3] != (2, 2): + raise ValueError( + "Each MPO tensor must have shape " + "(left_bond, 2, 2, right_bond)." + ) + tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1) + env = self.xp.einsum( + "xlc,xub,lutr,ctd->brd", + env, + _conj(self.xp, tensor), + mpo, + tensor, + ) + return _real_if_close(_to_scalar(env.reshape(-1)[0]) / self.norm()) def _build_theta_svd_matrix(op, xp, lam_left, lam_mid, lam_right, gamma_left, gamma_right): @@ -328,8 +390,8 @@ def _build_theta_svd_matrix(op, xp, lam_left, lam_mid, lam_right, gamma_left, ga def _choose_bond(singvals, max_bond, cut_ratio, xp): max_possible = int(singvals.shape[0]) - keep = min(max_possible, max_bond) - if cut_ratio > 0 and max_possible > 0: + keep = max_possible if max_bond is None else min(max_possible, int(max_bond)) + if cut_ratio is not None and cut_ratio > 0 and max_possible > 0: threshold = singvals[0] * cut_ratio if xp is np: ratio_keep = int(np.count_nonzero(singvals > threshold)) @@ -415,8 +477,8 @@ class _RoutedTwoQubitGate: name = "routed_two_qubit" control_qubits = () - def __init__(self, original_gate, left_phys, right_phys): - self.target_qubits = (left_phys, right_phys) + def __init__(self, original_gate, physical_sites): + self.target_qubits = tuple(physical_sites) self._matrix = original_gate.matrix() def matrix(self): @@ -447,8 +509,10 @@ def _route_non_adjacent_gates(gates, nqubits): for pos in range(right - 1, left, -1): routed.append(_SWAPGate(pos, pos + 1)) - # Apply the original gate at the adjacent physical positions - routed.append(_RoutedTwoQubitGate(gate, left, left + 1)) + # Apply the original gate in its original qubit order. For gates like + # CNOT(5, 0), sorting the routed sites would swap control and target. + physical_map = {left: left, right: left + 1} + routed.append(_RoutedTwoQubitGate(gate, [physical_map[site] for site in sites])) # Reverse SWAPs to restore original ordering for pos in range(left + 1, right): diff --git a/src/qibotn/circuit_convertor.py b/src/qibotn/circuit_convertor.py index 1c8b3ee..900cdf7 100644 --- a/src/qibotn/circuit_convertor.py +++ b/src/qibotn/circuit_convertor.py @@ -1,6 +1,19 @@ -import cupy as cp import numpy as np +try: + import cupy as cp +except ImportError: # pragma: no cover - exercised on CPU-only installations + cp = None + + +def _require_cupy(): + if cp is None: + raise ImportError( + "The cuQuantum circuit converter requires cupy. " + "Install the GPU dependencies or use the CPU backend." + ) + return cp + # Reference: https://github.com/NVIDIA/cuQuantum/tree/main/python/samples/cutensornet/circuit_converter @@ -19,7 +32,7 @@ class QiboCircuitToEinsum: """ def __init__(self, circuit, dtype="complex128"): - self.backend = cp + self.backend = _require_cupy() self.dtype = getattr(self.backend, dtype) self.init_basis_map(self.backend, dtype) self.init_intermediate_circuit(circuit) @@ -116,7 +129,9 @@ class QiboCircuitToEinsum: required_shape = self.op_shape_from_qubits(len(gate_qubits)) self.gate_tensors.append( ( - cp.asarray(gate.matrix(), dtype=self.dtype).reshape(required_shape), + self.backend.asarray(gate.matrix(), dtype=self.dtype).reshape( + required_shape + ), gate_qubits, ) ) @@ -161,7 +176,7 @@ class QiboCircuitToEinsum: required_shape = self.op_shape_from_qubits(len(gate_qubits)) self.gate_tensors_inverse.append( ( - cp.asarray(gate.matrix()).reshape(required_shape), + self.backend.asarray(gate.matrix()).reshape(required_shape), gate_qubits, ) ) @@ -169,7 +184,7 @@ class QiboCircuitToEinsum: # self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit. self.active_qubits_inverse = np.unique(gates_qubits_inverse) - def get_pauli_gates(self, pauli_map, dtype="complex128", backend=cp): + def get_pauli_gates(self, pauli_map, dtype="complex128", backend=None): """Populate the gates for all pauli operators. Parameters: @@ -180,6 +195,8 @@ class QiboCircuitToEinsum: Returns: A sequence of pauli gates. """ + if backend is None: + backend = _require_cupy() asarray = backend.asarray pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype) pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype) diff --git a/src/qibotn/circuit_to_mps.py b/src/qibotn/circuit_to_mps.py index 680ef9b..48cf55d 100644 --- a/src/qibotn/circuit_to_mps.py +++ b/src/qibotn/circuit_to_mps.py @@ -1,10 +1,23 @@ -import cupy as cp -import cuquantum.bindings.cutensornet as cutn import numpy as np from qibotn.circuit_convertor import QiboCircuitToEinsum from qibotn.mps_utils import apply_gate, initial +try: + import cupy as cp + import cuquantum.bindings.cutensornet as cutn +except ImportError: # pragma: no cover - exercised on CPU-only installations + cp = None + cutn = None + + +def _require_cuquantum(): + if cp is None or cutn is None: + raise ImportError( + "The cuQuantum MPS converter requires cupy and cuquantum. " + "Install the GPU dependencies or use the CPU backend." + ) + class QiboCircuitToMPS: """A helper class to convert Qibo circuit to MPS. @@ -23,6 +36,7 @@ class QiboCircuitToMPS: dtype="complex128", rand_seed=0, ): + _require_cuquantum() np.random.seed(rand_seed) cp.random.seed(rand_seed) @@ -44,4 +58,6 @@ class QiboCircuitToMPS: ) def __del__(self): - cutn.destroy(self.handle) + handle = getattr(self, "handle", None) + if cutn is not None and handle is not None: + cutn.destroy(handle) diff --git a/src/qibotn/csrc/torch_contractor.cpp b/src/qibotn/csrc/torch_contractor.cpp new file mode 100644 index 0000000..9551460 --- /dev/null +++ b/src/qibotn/csrc/torch_contractor.cpp @@ -0,0 +1,1024 @@ +#include + +#include +#include +#include + +#ifdef QIBOTN_USE_MKL +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +using Tensor = torch::Tensor; + +struct ContractOp { + int64_t out_id; + int64_t left_id; + int64_t right_id; + int64_t kind; + std::vector left_perm; + std::vector left_shape; + std::vector right_perm; + std::vector right_shape; + std::vector out_shape; + std::vector out_perm; +}; + +bool env_enabled(const char* name) { + const char* raw = std::getenv(name); + if (raw == nullptr) { + return false; + } + std::string value(raw); + return value == "1" || value == "true" || value == "TRUE" || + value == "yes" || value == "on"; +} + +int env_int(const char* name, int default_value) { + const char* raw = std::getenv(name); + if (raw == nullptr || raw[0] == '\0') { + return default_value; + } + char* end = nullptr; + long value = std::strtol(raw, &end, 10); + if (end == raw || value <= 0) { + return default_value; + } + return static_cast(value); +} + +int64_t env_i64(const char* name, int64_t default_value) { + const char* raw = std::getenv(name); + if (raw == nullptr || raw[0] == '\0') { + return default_value; + } + char* end = nullptr; + long long value = std::strtoll(raw, &end, 10); + if (end == raw || value <= 0) { + return default_value; + } + return static_cast(value); +} + +struct WorkspaceKey { + c10::ScalarType dtype; + c10::DeviceType device_type; + c10::DeviceIndex device_index; + int64_t numel; + + bool operator==(const WorkspaceKey& other) const { + return dtype == other.dtype && device_type == other.device_type && + device_index == other.device_index && numel == other.numel; + } +}; + +struct WorkspaceKeyHash { + size_t operator()(const WorkspaceKey& key) const { + size_t h = static_cast(key.numel); + h ^= static_cast(key.dtype) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2); + h ^= static_cast(key.device_type) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2); + h ^= static_cast(key.device_index) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2); + return h; + } +}; + +class Workspace { + public: + Workspace() + : max_cached_bytes_(read_size_env( + "QIBOTN_CPP_WORKSPACE_MAX_BYTES", + 1LL * 1024 * 1024 * 1024)), + max_buffer_bytes_(read_size_env( + "QIBOTN_CPP_WORKSPACE_MAX_BUFFER_BYTES", + 256LL * 1024 * 1024)) {} + + Tensor acquire( + const std::vector& shape, + c10::ScalarType dtype, + c10::Device device) { + const auto numel = product(shape); + WorkspaceKey key{dtype, device.type(), device.index(), numel}; + auto& bucket = pool_[key]; + Tensor out; + if (!bucket.empty()) { + out = bucket.back(); + bucket.pop_back(); + cached_bytes_ -= tensor_bytes(out); + out.resize_(shape); + } else { + out = torch::empty(shape, torch::TensorOptions().dtype(dtype).device(device)); + } + return out; + } + + void release(const Tensor& x) { + if (!x.defined() || !x.is_contiguous() || x.storage_offset() != 0) { + return; + } + const auto bytes = tensor_bytes(x); + if (bytes > max_buffer_bytes_) { + return; + } + if (cached_bytes_ + bytes > max_cached_bytes_) { + return; + } + WorkspaceKey key{ + x.scalar_type(), + x.device().type(), + x.device().index(), + x.numel(), + }; + pool_[key].push_back(x); + cached_bytes_ += bytes; + } + + private: + static int64_t read_size_env(const char* name, int64_t default_value) { + const char* raw = std::getenv(name); + if (raw == nullptr || raw[0] == '\0') { + return default_value; + } + char* end = nullptr; + const auto value = std::strtod(raw, &end); + if (end == raw || value < 0) { + return default_value; + } + std::string suffix(end); + int64_t multiplier = 1; + if (suffix == "g" || suffix == "G" || suffix == "gb" || suffix == "GB") { + multiplier = 1024LL * 1024 * 1024; + } else if ( + suffix == "m" || suffix == "M" || suffix == "mb" || suffix == "MB") { + multiplier = 1024LL * 1024; + } else if ( + suffix == "k" || suffix == "K" || suffix == "kb" || suffix == "KB") { + multiplier = 1024LL; + } + return static_cast(value * multiplier); + } + + static int64_t product(const std::vector& shape) { + int64_t out = 1; + for (const auto value : shape) { + out *= value; + } + return out; + } + + static int64_t tensor_bytes(const Tensor& x) { + return x.numel() * x.element_size(); + } + + std::unordered_map, WorkspaceKeyHash> pool_; + int64_t cached_bytes_ = 0; + int64_t max_cached_bytes_; + int64_t max_buffer_bytes_; +}; + +struct PreparedTensor { + Tensor tensor; + bool workspace_owned; +}; + +std::vector tensor_sizes(const Tensor& x) { + return std::vector(x.sizes().begin(), x.sizes().end()); +} + +PreparedTensor apply_preparation( + Workspace& workspace, + Tensor x, + const std::vector& perm, + const std::vector& shape) { + bool workspace_owned = false; + if (!perm.empty()) { + x = x.permute(perm); + } + if (!shape.empty()) { + try { + x = x.view(shape); + } catch (const c10::Error&) { + Tensor out = workspace.acquire(shape, x.scalar_type(), x.device()); + out.view(tensor_sizes(x)).copy_(x); + x = out; + workspace_owned = true; + } + } + return PreparedTensor{x, workspace_owned}; +} + +std::vector broadcast_shape( + c10::IntArrayRef left, + c10::IntArrayRef right) { + const auto ndim = std::max(left.size(), right.size()); + std::vector out(ndim, 1); + for (size_t i = 0; i < ndim; ++i) { + const auto li = left.size() < ndim - i ? 1 : left[i - (ndim - left.size())]; + const auto ri = right.size() < ndim - i ? 1 : right[i - (ndim - right.size())]; + if (li != ri && li != 1 && ri != 1) { + throw std::runtime_error("Shapes are not broadcastable."); + } + out[i] = std::max(li, ri); + } + return out; +} + +std::vector matmul_output_shape(const Tensor& left, const Tensor& right) { + if (left.dim() == 2 && right.dim() == 2) { + return {left.size(-2), right.size(-1)}; + } + auto batch = broadcast_shape( + left.sizes().slice(0, left.dim() - 2), + right.sizes().slice(0, right.dim() - 2)); + batch.push_back(left.size(-2)); + batch.push_back(right.size(-1)); + return batch; +} + +std::vector multiply_output_shape(const Tensor& left, const Tensor& right) { + return broadcast_shape(left.sizes(), right.sizes()); +} + +bool direct_cblas_mm(const Tensor& left, const Tensor& right, Tensor& out) { +#ifndef QIBOTN_USE_MKL + return false; +#else + static const bool direct_mkl_enabled = env_enabled("QIBOTN_CPP_DIRECT_MKL"); + if (!direct_mkl_enabled) { + return false; + } + if (left.device().type() != c10::DeviceType::CPU || + right.device().type() != c10::DeviceType::CPU || + out.device().type() != c10::DeviceType::CPU) { + return false; + } + if (left.dim() != 2 || right.dim() != 2 || out.dim() != 2) { + return false; + } + if (!left.is_contiguous() || !right.is_contiguous() || !out.is_contiguous()) { + return false; + } + if (left.scalar_type() != right.scalar_type() || + left.scalar_type() != out.scalar_type()) { + return false; + } + if (left.size(1) != right.size(0) || + out.size(0) != left.size(0) || + out.size(1) != right.size(1)) { + return false; + } + + const int m = static_cast(left.size(0)); + const int k = static_cast(left.size(1)); + const int n = static_cast(right.size(1)); + const int lda = static_cast(left.stride(0)); + const int ldb = static_cast(right.stride(0)); + const int ldc = static_cast(out.stride(0)); + const int mkl_threads = env_int("QIBOTN_CPP_MKL_THREADS", 0); + const int previous_threads = + mkl_threads > 0 ? mkl_set_num_threads_local(mkl_threads) : 0; + + if (left.scalar_type() == c10::ScalarType::ComplexFloat) { + const std::complex alpha(1.0f, 0.0f); + const std::complex beta(0.0f, 0.0f); + cblas_cgemm( + CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + m, + n, + k, + &alpha, + left.data_ptr(), + lda, + right.data_ptr(), + ldb, + &beta, + out.data_ptr(), + ldc); + if (mkl_threads > 0) { + mkl_set_num_threads_local(previous_threads); + } + return true; + } + if (left.scalar_type() == c10::ScalarType::ComplexDouble) { + const std::complex alpha(1.0, 0.0); + const std::complex beta(0.0, 0.0); + cblas_zgemm( + CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + m, + n, + k, + &alpha, + left.data_ptr(), + lda, + right.data_ptr(), + ldb, + &beta, + out.data_ptr(), + ldc); + if (mkl_threads > 0) { + mkl_set_num_threads_local(previous_threads); + } + return true; + } + if (mkl_threads > 0) { + mkl_set_num_threads_local(previous_threads); + } + return false; +#endif +} + +Tensor apply_output_preparation( + Tensor x, + const std::vector& shape, + const std::vector& perm) { + if (!shape.empty()) { + x = x.reshape(shape); + } + if (!perm.empty()) { + x = x.permute(perm); + } + return x; +} + +struct ExecutedOp { + Tensor left; + Tensor right; + bool left_workspace_owned = false; + bool right_workspace_owned = false; +}; + +void release_inputs( + const ContractOp& op, + const ExecutedOp& executed, + size_t ninputs, + std::vector& temps, + Workspace& workspace) { + if (executed.left_workspace_owned) { + workspace.release(executed.left); + } + if (executed.right_workspace_owned) { + workspace.release(executed.right); + } + if (op.left_id >= static_cast(ninputs)) { + workspace.release(temps.at(static_cast(op.left_id))); + temps.at(static_cast(op.left_id)) = Tensor(); + } + if (op.right_id >= static_cast(ninputs)) { + workspace.release(temps.at(static_cast(op.right_id))); + temps.at(static_cast(op.right_id)) = Tensor(); + } +} + +ExecutedOp execute_single_op( + const ContractOp& op, + std::vector& temps, + Workspace& workspace) { + Tensor left = temps.at(static_cast(op.left_id)); + Tensor right = temps.at(static_cast(op.right_id)); + if (!left.defined() || !right.defined()) { + throw std::runtime_error("Contraction plan referenced a released tensor."); + } + + auto prepared_left = apply_preparation(workspace, left, op.left_perm, op.left_shape); + auto prepared_right = apply_preparation(workspace, right, op.right_perm, op.right_shape); + left = prepared_left.tensor; + right = prepared_right.tensor; + + Tensor out; + if (op.kind == 0) { + auto out_shape = matmul_output_shape(left, right); + out = workspace.acquire(out_shape, left.scalar_type(), left.device()); + if (left.dim() == 2 && right.dim() == 2) { + if (!direct_cblas_mm(left, right, out)) { + at::_ops::mm_out::call(left, right, out); + } + } else if (left.dim() == 3 && right.dim() == 3) { + at::_ops::bmm_out::call(left, right, out); + } else { + out = torch::matmul(left, right); + } + } else if (op.kind == 1) { + auto out_shape = multiply_output_shape(left, right); + out = workspace.acquire(out_shape, left.scalar_type(), left.device()); + at::_ops::mul_out::call(left, right, out); + } else { + throw std::runtime_error("Unknown contraction op kind."); + } + + out = apply_output_preparation(out, op.out_shape, op.out_perm); + temps.at(static_cast(op.out_id)) = out; + return ExecutedOp{left, right, prepared_left.workspace_owned, prepared_right.workspace_owned}; +} + +struct GemmBatchKey { + c10::ScalarType dtype; + int64_t m; + int64_t k; + int64_t n; + + bool operator==(const GemmBatchKey& other) const { + return dtype == other.dtype && m == other.m && k == other.k && n == other.n; + } +}; + +struct GemmBatchKeyHash { + size_t operator()(const GemmBatchKey& key) const { + size_t h = static_cast(key.m); + h ^= static_cast(key.k) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2); + h ^= static_cast(key.n) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2); + h ^= static_cast(key.dtype) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2); + return h; + } +}; + +bool static_small_2d_gemm_key( + const ContractOp& op, + int64_t max_flops, + GemmBatchKey& key) { + if (op.kind != 0 || op.left_shape.size() != 2 || op.right_shape.size() != 2) { + return false; + } + const int64_t m = op.left_shape[0]; + const int64_t k = op.left_shape[1]; + const int64_t n = op.right_shape[1]; + if (op.right_shape[0] != k || m * k * n > max_flops) { + return false; + } + key = GemmBatchKey{c10::ScalarType::Undefined, m, k, n}; + return true; +} + +#ifdef QIBOTN_USE_MKL +bool execute_mkl_batch( + const std::vector& op_indices, + const std::vector& ops, + std::vector& temps, + Workspace& workspace, + std::vector& executed) { + if (op_indices.empty()) { + return false; + } + + const auto& first_op = ops[op_indices[0]]; + std::vector lefts; + std::vector rights; + std::vector outs; + lefts.reserve(op_indices.size()); + rights.reserve(op_indices.size()); + outs.reserve(op_indices.size()); + executed.clear(); + executed.reserve(op_indices.size()); + + c10::ScalarType dtype = c10::ScalarType::Undefined; + int m = 0; + int k = 0; + int n = 0; + for (const int op_index : op_indices) { + const auto& op = ops[op_index]; + Tensor left = temps.at(static_cast(op.left_id)); + Tensor right = temps.at(static_cast(op.right_id)); + if (!left.defined() || !right.defined()) { + throw std::runtime_error("Batch contraction referenced a released tensor."); + } + auto prepared_left = apply_preparation(workspace, left, op.left_perm, op.left_shape); + auto prepared_right = apply_preparation(workspace, right, op.right_perm, op.right_shape); + left = prepared_left.tensor; + right = prepared_right.tensor; + if (left.dim() != 2 || right.dim() != 2 || !left.is_contiguous() || + !right.is_contiguous() || left.device().type() != c10::DeviceType::CPU || + right.device().type() != c10::DeviceType::CPU || + left.scalar_type() != right.scalar_type()) { + return false; + } + if (dtype == c10::ScalarType::Undefined) { + dtype = left.scalar_type(); + m = static_cast(left.size(0)); + k = static_cast(left.size(1)); + n = static_cast(right.size(1)); + } + if (left.scalar_type() != dtype || left.size(0) != m || left.size(1) != k || + right.size(0) != k || right.size(1) != n) { + return false; + } + Tensor out = workspace.acquire({m, n}, dtype, left.device()); + if (!out.is_contiguous()) { + return false; + } + lefts.push_back(left); + rights.push_back(right); + outs.push_back(out); + executed.push_back( + ExecutedOp{left, right, prepared_left.workspace_owned, prepared_right.workspace_owned}); + } + + if (dtype != c10::ScalarType::ComplexFloat && dtype != c10::ScalarType::ComplexDouble) { + return false; + } + + const int group_count = 1; + const int group_size[] = {static_cast(op_indices.size())}; + const CBLAS_TRANSPOSE transa[] = {CblasNoTrans}; + const CBLAS_TRANSPOSE transb[] = {CblasNoTrans}; + const int m_array[] = {m}; + const int n_array[] = {n}; + const int k_array[] = {k}; + const int lda[] = {k}; + const int ldb[] = {n}; + const int ldc[] = {n}; + std::vector a_array(op_indices.size()); + std::vector b_array(op_indices.size()); + std::vector c_array(op_indices.size()); + for (size_t i = 0; i < op_indices.size(); ++i) { + a_array[i] = lefts[i].data_ptr(); + b_array[i] = rights[i].data_ptr(); + c_array[i] = outs[i].data_ptr(); + } + + const int mkl_threads = env_int("QIBOTN_CPP_BATCH_MKL_THREADS", 0); + const int previous_threads = + mkl_threads > 0 ? mkl_set_num_threads_local(mkl_threads) : 0; + if (dtype == c10::ScalarType::ComplexFloat) { + const std::complex alpha(1.0f, 0.0f); + const std::complex beta(0.0f, 0.0f); + const void* alpha_array[] = {&alpha}; + const void* beta_array[] = {&beta}; + cblas_cgemm_batch( + CblasRowMajor, + transa, + transb, + m_array, + n_array, + k_array, + alpha_array, + a_array.data(), + lda, + b_array.data(), + ldb, + beta_array, + c_array.data(), + ldc, + group_count, + group_size); + } else { + const std::complex alpha(1.0, 0.0); + const std::complex beta(0.0, 0.0); + const void* alpha_array[] = {&alpha}; + const void* beta_array[] = {&beta}; + cblas_zgemm_batch( + CblasRowMajor, + transa, + transb, + m_array, + n_array, + k_array, + alpha_array, + a_array.data(), + lda, + b_array.data(), + ldb, + beta_array, + c_array.data(), + ldc, + group_count, + group_size); + } + if (mkl_threads > 0) { + mkl_set_num_threads_local(previous_threads); + } + + for (size_t i = 0; i < op_indices.size(); ++i) { + const auto& op = ops[op_indices[i]]; + Tensor out = apply_output_preparation(outs[i], op.out_shape, op.out_perm); + temps.at(static_cast(op.out_id)) = out; + } + return true; +} +#endif + +std::vector list_to_i64_vector(const pybind11::handle& object) { + std::vector values; + for (const auto item : object) { + values.push_back(pybind11::cast(item)); + } + return values; +} + +ContractOp parse_op(const pybind11::handle& object) { + auto tuple = pybind11::cast(object); + if (tuple.size() != 10) { + throw std::runtime_error("Compiled contraction op must have 10 fields."); + } + + ContractOp op; + op.out_id = pybind11::cast(tuple[0]); + op.left_id = pybind11::cast(tuple[1]); + op.right_id = pybind11::cast(tuple[2]); + op.kind = pybind11::cast(tuple[3]); + op.left_perm = list_to_i64_vector(tuple[4]); + op.left_shape = list_to_i64_vector(tuple[5]); + op.right_perm = list_to_i64_vector(tuple[6]); + op.right_shape = list_to_i64_vector(tuple[7]); + op.out_shape = list_to_i64_vector(tuple[8]); + op.out_perm = list_to_i64_vector(tuple[9]); + return op; +} + +std::vector parse_plan(const pybind11::list& plan) { + std::vector ops; + ops.reserve(plan.size()); + for (const auto object : plan) { + ops.push_back(parse_op(object)); + } + return ops; +} + +Tensor run_contract( + const std::vector& arrays, + const std::vector& ops, + int64_t ntemps, + int64_t root_id, + Workspace& workspace) { + c10::InferenceMode inference_guard(true); + std::vector temps(static_cast(ntemps)); + + if (arrays.size() > temps.size()) { + throw std::runtime_error("Temporary tensor table is smaller than input arrays."); + } + for (size_t i = 0; i < arrays.size(); ++i) { + temps[i] = arrays[i]; + } + + for (const auto& op : ops) { + Tensor left = temps.at(static_cast(op.left_id)); + Tensor right = temps.at(static_cast(op.right_id)); + if (!left.defined() || !right.defined()) { + throw std::runtime_error("Contraction plan referenced a released tensor."); + } + + auto prepared_left = apply_preparation(workspace, left, op.left_perm, op.left_shape); + auto prepared_right = apply_preparation(workspace, right, op.right_perm, op.right_shape); + left = prepared_left.tensor; + right = prepared_right.tensor; + + Tensor out; + if (op.kind == 0) { + auto out_shape = matmul_output_shape(left, right); + out = workspace.acquire(out_shape, left.scalar_type(), left.device()); + if (left.dim() == 2 && right.dim() == 2) { + if (!direct_cblas_mm(left, right, out)) { + at::_ops::mm_out::call(left, right, out); + } + } else if (left.dim() == 3 && right.dim() == 3) { + at::_ops::bmm_out::call(left, right, out); + } else { + out = torch::matmul(left, right); + } + } else if (op.kind == 1) { + auto out_shape = multiply_output_shape(left, right); + out = workspace.acquire(out_shape, left.scalar_type(), left.device()); + at::_ops::mul_out::call(left, right, out); + } else { + throw std::runtime_error("Unknown contraction op kind."); + } + + out = apply_output_preparation(out, op.out_shape, op.out_perm); + temps.at(static_cast(op.out_id)) = out; + + if (prepared_left.workspace_owned) { + workspace.release(left); + } + if (prepared_right.workspace_owned) { + workspace.release(right); + } + if (op.left_id >= static_cast(arrays.size())) { + workspace.release(temps.at(static_cast(op.left_id))); + temps.at(static_cast(op.left_id)) = Tensor(); + } + if (op.right_id >= static_cast(arrays.size())) { + workspace.release(temps.at(static_cast(op.right_id))); + temps.at(static_cast(op.right_id)) = Tensor(); + } + } + + Tensor result = temps.at(static_cast(root_id)); + if (!result.defined()) { + throw std::runtime_error("Contraction plan did not produce a result tensor."); + } + return result; +} + +Tensor run_contract_dag( + const std::vector& arrays, + const std::vector& ops, + int64_t ntemps, + int64_t root_id, + Workspace& workspace) { + c10::InferenceMode inference_guard(true); + std::vector temps(static_cast(ntemps)); + if (arrays.size() > temps.size()) { + throw std::runtime_error("Temporary tensor table is smaller than input arrays."); + } + for (size_t i = 0; i < arrays.size(); ++i) { + temps[i] = arrays[i]; + } + + std::unordered_map producer; + producer.reserve(ops.size()); + for (size_t i = 0; i < ops.size(); ++i) { + producer[ops[i].out_id] = static_cast(i); + } + + std::vector> dependents(ops.size()); + std::vector pending_inputs(ops.size(), 0); + std::vector remaining_consumers(static_cast(ntemps), 0); + for (size_t i = 0; i < ops.size(); ++i) { + for (const auto tensor_id : {ops[i].left_id, ops[i].right_id}) { + if (tensor_id >= 0 && tensor_id < ntemps) { + remaining_consumers[static_cast(tensor_id)] += 1; + } + auto it = producer.find(tensor_id); + if (it != producer.end()) { + pending_inputs[i] += 1; + dependents[it->second].push_back(static_cast(i)); + } + } + } + + std::vector ready; + ready.reserve(ops.size()); + for (size_t i = 0; i < ops.size(); ++i) { + if (pending_inputs[i] == 0) { + ready.push_back(static_cast(i)); + } + } + + const bool batch_enabled = env_enabled("QIBOTN_CPP_BATCH_SMALL_GEMM"); + const int min_batch = env_int("QIBOTN_CPP_BATCH_MIN_SIZE", 4); + const int64_t max_flops = env_i64("QIBOTN_CPP_BATCH_MAX_FLOPS", 1000000); + std::vector done(ops.size(), 0); + size_t completed = 0; + + auto mark_consumed_and_release = [&](const ContractOp& op, const ExecutedOp& executed) { + if (executed.left_workspace_owned) { + workspace.release(executed.left); + } + if (executed.right_workspace_owned) { + workspace.release(executed.right); + } + for (const auto tensor_id : {op.left_id, op.right_id}) { + if (tensor_id < static_cast(arrays.size()) || tensor_id < 0) { + continue; + } + auto& count = remaining_consumers[static_cast(tensor_id)]; + count -= 1; + if (count == 0) { + workspace.release(temps.at(static_cast(tensor_id))); + temps.at(static_cast(tensor_id)) = Tensor(); + } + } + }; + + while (completed < ops.size()) { + if (ready.empty()) { + throw std::runtime_error("DAG contractor has no ready ops before completion."); + } + + std::vector to_execute; +#ifdef QIBOTN_USE_MKL + if (batch_enabled) { + std::unordered_map, GemmBatchKeyHash> groups; + for (const int op_index : ready) { + if (done[op_index]) { + continue; + } + GemmBatchKey key; + if (!static_small_2d_gemm_key(ops[op_index], max_flops, key)) { + continue; + } + Tensor left = temps.at(static_cast(ops[op_index].left_id)); + if (!left.defined()) { + continue; + } + key.dtype = left.scalar_type(); + groups[key].push_back(op_index); + } + for (const auto& item : groups) { + if (static_cast(item.second.size()) >= min_batch) { + to_execute = item.second; + break; + } + } + } +#endif + + if (to_execute.empty()) { + to_execute.push_back(ready.back()); + } + + std::vector executed_batch; + bool batch_ok = false; +#ifdef QIBOTN_USE_MKL + if (to_execute.size() > 1) { + batch_ok = execute_mkl_batch(to_execute, ops, temps, workspace, executed_batch); + } +#endif + if (!batch_ok) { + if (to_execute.size() > 1) { + to_execute.resize(1); + } + executed_batch.clear(); + executed_batch.push_back(execute_single_op(ops[to_execute[0]], temps, workspace)); + } + + for (size_t j = 0; j < to_execute.size(); ++j) { + const int op_index = to_execute[j]; + if (done[op_index]) { + continue; + } + done[op_index] = 1; + completed += 1; + mark_consumed_and_release(ops[op_index], executed_batch[j]); + for (const int dep : dependents[op_index]) { + pending_inputs[dep] -= 1; + if (pending_inputs[dep] == 0) { + ready.push_back(dep); + } + } + } + + ready.erase( + std::remove_if( + ready.begin(), + ready.end(), + [&](int op_index) { return done[op_index] != 0; }), + ready.end()); + } + + Tensor result = temps.at(static_cast(root_id)); + if (!result.defined()) { + throw std::runtime_error("DAG contractor did not produce a result tensor."); + } + return result; +} + +Tensor contract_impl( + const std::vector& arrays, + const pybind11::list& plan, + int64_t ntemps, + int64_t root_id) { + auto ops = parse_plan(plan); + Workspace workspace; + return run_contract(arrays, ops, ntemps, root_id, workspace); +} + +class Contractor { + public: + Contractor(const pybind11::list& plan, int64_t ntemps, int64_t root_id) + : ops_(parse_plan(plan)), ntemps_(ntemps), root_id_(root_id) { + compute_batch_summary(); + } + + Tensor contract(const std::vector& arrays) { + if (env_enabled("QIBOTN_CPP_DAG_SCHEDULER")) { + return run_contract_dag(arrays, ops_, ntemps_, root_id_, workspace_); + } + return run_contract(arrays, ops_, ntemps_, root_id_, workspace_); + } + + int64_t nsteps() const { + return static_cast(ops_.size()); + } + + pybind11::dict batch_summary() const { + pybind11::dict out; + out["groups"] = batch_groups_; + out["covered_ops"] = batch_covered_ops_; + out["calls_saved"] = batch_calls_saved_; + return out; + } + + private: + void compute_batch_summary() { + const int min_batch = env_int("QIBOTN_CPP_BATCH_MIN_SIZE", 4); + const int64_t max_flops = env_i64("QIBOTN_CPP_BATCH_MAX_FLOPS", 1000000); + std::unordered_map producer; + for (size_t i = 0; i < ops_.size(); ++i) { + producer[ops_[i].out_id] = static_cast(i); + } + std::vector pending_inputs(ops_.size(), 0); + std::vector> dependents(ops_.size()); + for (size_t i = 0; i < ops_.size(); ++i) { + for (const auto tensor_id : {ops_[i].left_id, ops_[i].right_id}) { + auto it = producer.find(tensor_id); + if (it != producer.end()) { + pending_inputs[i] += 1; + dependents[it->second].push_back(static_cast(i)); + } + } + } + std::vector ready; + for (size_t i = 0; i < ops_.size(); ++i) { + if (pending_inputs[i] == 0) { + ready.push_back(static_cast(i)); + } + } + std::vector done(ops_.size(), 0); + size_t completed = 0; + while (completed < ops_.size() && !ready.empty()) { + std::unordered_map counts; + int best_count = 0; + for (const int op_index : ready) { + GemmBatchKey key; + if (!static_small_2d_gemm_key(ops_[op_index], max_flops, key)) { + continue; + } + key.dtype = c10::ScalarType::ComplexFloat; + int count = ++counts[key]; + best_count = std::max(best_count, count); + } + if (best_count >= min_batch) { + batch_groups_ += 1; + batch_covered_ops_ += best_count; + batch_calls_saved_ += best_count - 1; + } + // Simulate one scheduler step: batch best group if present, otherwise one op. + std::vector executed; + if (best_count >= min_batch) { + GemmBatchKey best_key; + for (const auto& item : counts) { + if (item.second == best_count) { + best_key = item.first; + break; + } + } + for (const int op_index : ready) { + GemmBatchKey key; + if (static_small_2d_gemm_key(ops_[op_index], max_flops, key)) { + key.dtype = c10::ScalarType::ComplexFloat; + if (key == best_key) { + executed.push_back(op_index); + } + } + } + } else { + executed.push_back(ready.back()); + } + for (const int op_index : executed) { + if (done[op_index]) { + continue; + } + done[op_index] = 1; + completed += 1; + for (const int dep : dependents[op_index]) { + pending_inputs[dep] -= 1; + if (pending_inputs[dep] == 0) { + ready.push_back(dep); + } + } + } + ready.erase( + std::remove_if( + ready.begin(), + ready.end(), + [&](int op_index) { return done[op_index] != 0; }), + ready.end()); + } + } + + std::vector ops_; + int64_t ntemps_; + int64_t root_id_; + Workspace workspace_; + int64_t batch_groups_ = 0; + int64_t batch_covered_ops_ = 0; + int64_t batch_calls_saved_ = 0; +}; + +} // namespace + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def( + "contract", + &contract_impl, + "Run a compiled cotengra contraction plan with torch CPU tensors."); + pybind11::class_(m, "Contractor") + .def(pybind11::init()) + .def("contract", &Contractor::contract) + .def("batch_summary", &Contractor::batch_summary) + .def_property_readonly("nsteps", &Contractor::nsteps); +} diff --git a/src/qibotn/eval.py b/src/qibotn/eval.py index ae36893..f2fbf71 100644 --- a/src/qibotn/eval.py +++ b/src/qibotn/eval.py @@ -1,8 +1,3 @@ -import cupy as cp -import cuquantum.bindings.cutensornet as cutn -from cupy.cuda import nccl -from cupy.cuda.runtime import getDeviceCount -from cuquantum.tensornet import Network, contract from mpi4py import MPI from qibotn.circuit_convertor import QiboCircuitToEinsum @@ -15,8 +10,37 @@ from qibotn.observables import ( extract_gates_and_qubits, ) +try: + import cupy as cp + import cuquantum.bindings.cutensornet as cutn + from cupy.cuda import nccl + from cupy.cuda.runtime import getDeviceCount + from cuquantum.tensornet import Network, contract +except ImportError: # pragma: no cover - exercised on CPU-only installations + cp = None + cutn = None + nccl = None + getDeviceCount = None + Network = None + contract = None -def get_ham_gates(pauli_map, dtype="complex128", backend=cp): + +def _require_cuquantum(): + if ( + cp is None + or cutn is None + or nccl is None + or getDeviceCount is None + or Network is None + or contract is None + ): + raise ImportError( + "The legacy GPU evaluation helpers require cupy and cuquantum. " + "Install the GPU dependencies or use the CPU backend." + ) + + +def get_ham_gates(pauli_map, dtype="complex128", backend=None): """Populate the gates for all pauli operators. Parameters: @@ -27,6 +51,13 @@ def get_ham_gates(pauli_map, dtype="complex128", backend=cp): Returns: A sequence of pauli gates. """ + if backend is None: + backend = cp + if backend is None: + raise ImportError( + "get_ham_gates requires an array backend; cupy is unavailable " + "in this CPU-only environment." + ) asarray = backend.asarray pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype) pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype) @@ -46,6 +77,7 @@ def get_ham_gates(pauli_map, dtype="complex128", backend=cp): def initialize_mpi(): """Initialize MPI communication and device selection.""" + _require_cuquantum() comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() @@ -56,6 +88,7 @@ def initialize_mpi(): def initialize_nccl(comm_mpi, rank, size): """Initialize NCCL communication.""" + _require_cuquantum() nccl_id = nccl.get_unique_id() if rank == 0 else None nccl_id = comm_mpi.bcast(nccl_id, root=0) return nccl.NcclCommunicator(size, nccl_id, rank) @@ -73,6 +106,7 @@ def get_operands(qibo_circ, datatype, rank, comm): def compute_optimal_path(network, n_samples, size, comm): """Compute contraction path and broadcast optimal selection.""" + _require_cuquantum() path, info = network.contract_path( optimize={ "samples": n_samples, @@ -101,6 +135,8 @@ def compute_slices(info, rank, size): def reduce_result(result, comm, method="MPI", root=0): """Reduce results across processes.""" + if method == "NCCL": + _require_cuquantum() if method == "MPI": return comm.reduce(sendobj=result, op=MPI.SUM, root=root) @@ -148,6 +184,7 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8): Returns: Dense vector of quantum circuit. """ + _require_cuquantum() comm, rank, size, device_id = initialize_mpi() operands = get_operands(qibo_circ, datatype, rank, comm) network = Network(*operands, options={"device_id": device_id}) @@ -179,6 +216,7 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8): Returns: Dense vector of quantum circuit. """ + _require_cuquantum() comm_mpi, rank, size, device_id = initialize_mpi() comm_nccl = initialize_nccl(comm_mpi, rank, size) operands = get_operands(qibo_circ, datatype, rank, comm_mpi) @@ -203,6 +241,7 @@ def dense_vector_tn(qibo_circ, datatype): Returns: Dense vector of quantum circuit. """ + _require_cuquantum() myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) return contract(*myconvertor.state_vector_operands()) @@ -231,6 +270,7 @@ def expectation_tn_nccl(qibo_circ, datatype, observable, n_samples=8): Expectation of quantum circuit due to pauli string. """ + _require_cuquantum() comm_mpi, rank, size, device_id = initialize_mpi() comm_nccl = initialize_nccl(comm_mpi, rank, size) @@ -299,6 +339,7 @@ def expectation_tn_MPI(qibo_circ, datatype, observable, n_samples=8): Returns: Expectation of quantum circuit due to pauli string. """ + _require_cuquantum() # Initialize MPI and device comm, rank, size, device_id = initialize_mpi() @@ -358,6 +399,7 @@ def expectation_tn(qibo_circ, datatype, observable): Returns: Expectation of quantum circuit due to pauli string. """ + _require_cuquantum() myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) observable = check_observable(observable, qibo_circ.nqubits) @@ -383,6 +425,7 @@ def dense_vector_mps(qibo_circ, gate_algo, datatype): Returns: Dense vector of quantum circuit. """ + _require_cuquantum() myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype) mps_helper = MPSContractionHelper(myconvertor.num_qubits) diff --git a/src/qibotn/expectation_runner.py b/src/qibotn/expectation_runner.py index 22c2f55..6af6de8 100644 --- a/src/qibotn/expectation_runner.py +++ b/src/qibotn/expectation_runner.py @@ -16,9 +16,11 @@ from qibotn.observables import check_observable class ExpectationConfig: ansatz: str = "tn" mpi: bool = False - bond: int = 1024 - cut_ratio: float = 1e-12 + bond: int | None = 1024 + cut_ratio: float | None = 1e-12 tensor_module: str = "torch" + quimb_backend: str = "torch" + dtype: str = "complex128" torch_threads: int = 8 parallel_opts: dict | None = None @@ -28,6 +30,7 @@ class ExpectationResult: value: float seconds: float rank: int = 0 + parallel_stats: list | None = None def exact_for_observable(circuit, observable, nqubits): @@ -54,6 +57,8 @@ def run_cpu_expectation(circuit, observable, config): "max_bond_dimension": config.bond, "cut_ratio": config.cut_ratio, "tensor_module": config.tensor_module, + "quimb_backend": config.quimb_backend, + "dtype": config.dtype, "torch_threads": config.torch_threads, "parallel_opts": config.parallel_opts or {}, } @@ -68,4 +73,10 @@ def run_cpu_expectation(circuit, observable, config): elapsed = time.perf_counter() - start rank = getattr(backend, "rank", 0) - return ExpectationResult(float(np.real(value)), elapsed, rank=rank) + stats = getattr(backend, "parallel_stats", None) + return ExpectationResult( + float(np.real(value)), + elapsed, + rank=rank, + parallel_stats=list(stats) if stats is not None else None, + ) diff --git a/src/qibotn/mps_contraction_helper.py b/src/qibotn/mps_contraction_helper.py index fcb4441..b44cfb2 100644 --- a/src/qibotn/mps_contraction_helper.py +++ b/src/qibotn/mps_contraction_helper.py @@ -1,4 +1,16 @@ -from cuquantum.tensornet import contract, contract_path +try: + from cuquantum.tensornet import contract, contract_path +except ImportError: # pragma: no cover - exercised on CPU-only installations + contract = None + contract_path = None + + +def _require_cuquantum(): + if contract is None or contract_path is None: + raise ImportError( + "The cuQuantum MPS contraction helper requires cuquantum. " + "Install the GPU dependencies or use the CPU backend." + ) # Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb @@ -113,6 +125,7 @@ class MPSContractionHelper: return self._contract(interleaved_inputs, options=options) / norm def _contract(self, interleaved_inputs, options=None): + _require_cuquantum() path = contract_path(*interleaved_inputs, options=options)[0] return contract(*interleaved_inputs, options=options, optimize={"path": path}) diff --git a/src/qibotn/mps_utils.py b/src/qibotn/mps_utils.py index 1450837..ff3d010 100644 --- a/src/qibotn/mps_utils.py +++ b/src/qibotn/mps_utils.py @@ -1,6 +1,19 @@ -import cupy as cp -from cuquantum.tensornet import contract -from cuquantum.tensornet.experimental import contract_decompose +try: + import cupy as cp + from cuquantum.tensornet import contract + from cuquantum.tensornet.experimental import contract_decompose +except ImportError: # pragma: no cover - exercised on CPU-only installations + cp = None + contract = None + contract_decompose = None + + +def _require_cuquantum(): + if cp is None or contract is None or contract_decompose is None: + raise ImportError( + "The cuQuantum MPS helpers require cupy and cuquantum. " + "Install the GPU dependencies or use the CPU backend." + ) def initial(num_qubits, dtype): @@ -13,6 +26,7 @@ def initial(num_qubits, dtype): Returns: The initial MPS tensors. """ + _require_cuquantum() state_tensor = cp.asarray([1, 0], dtype=dtype).reshape(1, 2, 1) mps_tensors = [state_tensor] * num_qubits return mps_tensors @@ -28,6 +42,7 @@ def mps_site_right_swap(mps_tensors, i, **kwargs): Returns: The updated MPS tensors. """ + _require_cuquantum() # contraction followed by QR decomposition a, _, b = contract_decompose( "ipj,jqk->iqj,jpk", @@ -60,6 +75,7 @@ def apply_gate(mps_tensors, gate, qubits, **kwargs): The updated MPS tensors. """ + _require_cuquantum() n_qubits = len(qubits) if n_qubits == 1: # single-qubit gate diff --git a/src/qibotn/parallel.py b/src/qibotn/parallel.py index 9445c97..2603b4f 100644 --- a/src/qibotn/parallel.py +++ b/src/qibotn/parallel.py @@ -17,6 +17,196 @@ except ImportError: SEARCH_METHODS = ("greedy", "kahypar", "kahypar-agglom", "spinglass") +_COTENGRA_DASK_PATCHED = False +_COTENGRA_DASK_SUBMIT_PATCHED = False +_DASK_TRIAL_DEBUG = False + + +def _optimizer_search_stats(opt): + scores = list(getattr(opt, "scores", ())) + finite_scores = [score for score in scores if np.isfinite(score)] + times = list(getattr(opt, "times", ())) + best = getattr(opt, "best", {}) or {} + return { + "completed_trials": len(scores), + "finite_trials": len(finite_scores), + "failed_trials": len(scores) - len(finite_scores), + "requested_trials": int(getattr(opt, "max_repeats", 0) or 0), + "trial_seconds_sum": float(sum(times)), + "best_score": float(best.get("score", float("inf"))), + "best_flops": float(best.get("flops", float("inf"))), + "best_write": float(best.get("write", float("inf"))), + "best_size": float(best.get("size", float("inf"))), + } + + +def _attach_search_stats(tree, opt): + try: + tree.qibotn_search_stats = _optimizer_search_stats(opt) + except Exception: + pass + return tree + + +def _dask_worker_slots(client): + info = client.scheduler_info(n_workers=-1) + workers = info.get("workers", {}) + return workers, sum(int(w.get("nthreads", 1) or 1) for w in workers.values()) + + +def _print_dask_worker_summary(client): + workers, slots = _dask_worker_slots(client) + by_host = {} + for worker in workers.values(): + host = worker.get("host", "unknown") + by_host.setdefault(host, {"workers": 0, "threads": 0}) + by_host[host]["workers"] += 1 + by_host[host]["threads"] += int(worker.get("nthreads", 1) or 1) + print( + "qibotn_dask_workers " + f"workers={len(workers)} threads={slots} by_host={by_host}", + flush=True, + ) + + +def _run_trial_with_debug(fn, *args, **kwargs): + import os + import socket + + try: + from distributed import get_worker + + worker = get_worker() + worker_address = worker.address + except Exception: + worker_address = "unknown" + + method = kwargs.get("method", "unknown") + pid = os.getpid() + host = socket.gethostname() + print( + "qibotn_trial_start " + f"worker={worker_address} host={host} pid={pid} method={method}", + flush=True, + ) + start = time.perf_counter() + try: + trial = fn(*args, **kwargs) + except Exception as exc: + elapsed = time.perf_counter() - start + print( + "qibotn_trial_error " + f"worker={worker_address} host={host} pid={pid} " + f"method={method} seconds={elapsed:.3f} error={exc!r}", + flush=True, + ) + raise + elapsed = time.perf_counter() - start + print( + "qibotn_trial_done " + f"worker={worker_address} host={host} pid={pid} method={method} " + f"seconds={elapsed:.3f} score={trial.get('score', float('nan')):.6g} " + f"flops={trial.get('flops', float('nan')):.6g} " + f"size={trial.get('size', float('nan')):.6g}", + flush=True, + ) + return trial + + +def _patch_cotengra_dask_submit(debug_trials=False): + global _COTENGRA_DASK_SUBMIT_PATCHED, _DASK_TRIAL_DEBUG + _DASK_TRIAL_DEBUG = bool(debug_trials) + if _COTENGRA_DASK_SUBMIT_PATCHED: + return + + import cotengra.parallel as ctg_parallel + import cotengra.hyperoptimizers.hyper as hyper + + original_submit = ctg_parallel.submit + + def submit(pool, fn, *args, **kwargs): + backend = pool.__class__.__module__.split(".", 1)[0] + if _DASK_TRIAL_DEBUG and backend == "distributed": + return original_submit( + pool, + _run_trial_with_debug, + fn, + *args, + **kwargs, + ) + return original_submit(pool, fn, *args, **kwargs) + + ctg_parallel.submit = submit + hyper.submit = submit + _COTENGRA_DASK_SUBMIT_PATCHED = True + + +def _patch_cotengra_dask_as_completed(): + """Make cotengra 0.7.5 handle distributed.Future objects. + + This cotengra release routes all parallel futures through + ``concurrent.futures.as_completed()``, which does not accept dask + ``distributed.Future`` instances. Keep cotengra's optimizer/reporting logic + intact and only swap the wait primitive when the futures are from dask. + """ + global _COTENGRA_DASK_PATCHED + if _COTENGRA_DASK_PATCHED: + return + + from cotengra.hyperoptimizers.hyper import HyperOptimizer + + def _get_and_report_next_future(self): + futures_map = {future: setting for setting, future in self._futures} + if not futures_map: + return { + "score": float("inf"), + "flops": float("inf"), + "write": float("inf"), + "size": float("inf"), + "time": 0.0, + } + + future0 = next(iter(futures_map)) + if future0.__class__.__module__.split(".", 1)[0] == "distributed": + from distributed import as_completed + + deadline = getattr(self, "_qibotn_deadline", None) + timeout = None if deadline is None else max(0.0, deadline - time.time()) + try: + future = next(iter(as_completed(futures_map, timeout=timeout))) + except TimeoutError: + for future in futures_map: + future.cancel() + self._futures = [] + return { + "score": float("inf"), + "flops": float("inf"), + "write": float("inf"), + "size": float("inf"), + "time": 0.0, + } + else: + import concurrent.futures as _cf + + future = next(_cf.as_completed(futures_map)) + + setting = futures_map[future] + self._futures = [(s, f) for s, f in self._futures if f is not future] + try: + trial = future.result() + except Exception: + trial = { + "score": float("inf"), + "flops": float("inf"), + "write": float("inf"), + "size": float("inf"), + "time": 0.0, + } + self._maybe_report_result(setting, trial) + return trial + + HyperOptimizer._get_and_report_next_future = _get_and_report_next_future + _COTENGRA_DASK_PATCHED = True def _search_chunk( @@ -46,7 +236,7 @@ def _search_chunk( **kwargs, ) tree = tn.contraction_tree(optimize=opt, output_inds=output_inds) - return tree.combo_cost(factor=256), tree + return tree.combo_cost(factor=256), _attach_search_stats(tree, opt) def _run_single_trial(tn_bytes, output_inds, seed, slicing_opts): @@ -164,6 +354,8 @@ def _dask_search( dask_address=None, n_workers=None, optlib=None, + debug_trials=False, + close_workers=False, ): """Run one centralized cotengra hyper-optimizer over a dask pool. @@ -180,6 +372,9 @@ def _dask_search( import cotengra as ctg + _patch_cotengra_dask_as_completed() + _patch_cotengra_dask_submit(debug_trials=debug_trials) + close_client = False close_cluster = False cluster = None @@ -205,7 +400,20 @@ def _dask_search( if optlib is not None: kwargs["optlib"] = optlib + retire_workers = [] try: + workers, worker_slots = _dask_worker_slots(client) + if close_workers: + retire_workers = list(workers) + if debug_trials: + _print_dask_worker_summary(client) + if total_repeats < worker_slots: + print( + "qibotn_dask_underutilized " + f"requested_trials={total_repeats} worker_slots={worker_slots} " + "hint='increase --tn-search-repeats to at least worker_slots'", + flush=True, + ) opt = ctg.HyperOptimizer( methods=SEARCH_METHODS, max_repeats=total_repeats, @@ -216,8 +424,31 @@ def _dask_search( progbar=False, **kwargs, ) - return tn.contraction_tree(optimize=opt, output_inds=output_inds) + opt._num_workers = max(1, worker_slots) + opt.pre_dispatch = max(1, min(int(total_repeats), worker_slots)) + if max_time is not None: + opt._qibotn_deadline = time.time() + max_time + tree = tn.contraction_tree(optimize=opt, output_inds=output_inds) + return _attach_search_stats(tree, opt) finally: + if close_workers and retire_workers: + try: + retired = client.retire_workers( + workers=retire_workers, + close_workers=True, + remove=True, + ) + print( + "qibotn_dask_workers_closed " + f"requested={len(retire_workers)} retired={len(retired)}", + flush=True, + ) + except Exception as exc: + print( + "qibotn_dask_workers_close_failed " + f"requested={len(retire_workers)} error={exc!r}", + flush=True, + ) if close_client: client.close() if close_cluster: @@ -234,6 +465,8 @@ def _mpi_search( trial_timeout=None, search_backend="processpool", dask_address=None, + debug_trials=False, + dask_close_workers=False, ): comm = MPI.COMM_WORLD rank, size = comm.Get_rank(), comm.Get_size() @@ -258,6 +491,8 @@ def _mpi_search( slicing_opts=slicing_opts, dask_address=dask_address, n_workers=n_workers, + debug_trials=debug_trials, + close_workers=dask_close_workers, ) payload = ("ok", tree) except Exception as exc: @@ -296,7 +531,8 @@ def _mpi_search( def parallel_path_search(tn, output_inds, method='processpool', total_repeats=1024, max_time=300, n_workers=48, slicing_opts=None, trial_timeout=None, search_backend=None, - dask_address=None): + dask_address=None, debug_trials=False, + dask_close_workers=False): """Parallel contraction path search. Args: @@ -324,6 +560,8 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10 trial_timeout, search_backend=search_backend, dask_address=dask_address, + debug_trials=debug_trials, + dask_close_workers=dask_close_workers, ) elif method == 'processpool': return _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts, trial_timeout) @@ -336,6 +574,8 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10 slicing_opts=slicing_opts, dask_address=dask_address, n_workers=n_workers, + debug_trials=debug_trials, + close_workers=dask_close_workers, ) else: raise ValueError(f"Unknown method: {method}") @@ -431,17 +671,37 @@ def _to_numpy_vector(value, is_torch): def _zero_vector_like(arrays): - return np.zeros(1, dtype=np.complex128) + array = arrays[0] + if type(array).__module__.startswith("torch"): + return np.zeros(1, dtype=np.complex64 if "64" in str(array.dtype) else np.complex128) + return np.zeros(1, dtype=np.asarray(array).dtype) -def contract_tree_slices(tree, arrays, slice_indices, backend=None): +def contract_tree_slices(tree, arrays, slice_indices, backend=None, implementation=None): """Contract a subset of cotengra slices and return their local sum.""" backend = backend or _array_backend(arrays) is_torch = backend == "torch" local = None + cpp_contract = None + if implementation == "cpp": + if backend != "torch": + raise ValueError("implementation='cpp' requires torch arrays.") + from qibotn.torch_contractor import contract_tree_cpp + + cpp_contract = contract_tree_cpp for slice_id in slice_indices: - value = tree.contract_slice(arrays, slice_id, backend=backend) + if cpp_contract is not None: + value = cpp_contract(tree, tree.slice_arrays(arrays, slice_id)) + elif implementation is None: + value = tree.contract_slice(arrays, slice_id, backend=backend) + else: + value = tree.contract_slice( + arrays, + slice_id, + backend=backend, + implementation=implementation, + ) value = _to_numpy_vector(value, is_torch) local = value if local is None else local + value @@ -455,6 +715,7 @@ def parallel_contract( comm=None, assignment="block", return_stats=False, + implementation=None, ): if method == 'mpi': if not _HAVE_MPI or comm is None: @@ -465,11 +726,20 @@ def parallel_contract( comm, assignment=assignment, return_stats=return_stats, + implementation=implementation, ) raise ValueError(f"Unknown method: {method}") -def _contract_mpi(tree, arrays, comm, root=0, assignment="block", return_stats=False): +def _contract_mpi( + tree, + arrays, + comm, + root=0, + assignment="block", + return_stats=False, + implementation=None, +): rank, size = comm.Get_rank(), comm.Get_size() backend = _array_backend(arrays) is_torch = backend == "torch" @@ -482,15 +752,14 @@ def _contract_mpi(tree, arrays, comm, root=0, assignment="block", return_stats=F "appear in the output." ) - if nslices <= 1: - if rank != root: - return (None, stats) if return_stats else None - result = tree.contract(arrays, backend=backend) - result = _to_numpy_vector(result, is_torch) - return (result, stats) if return_stats else result - plan = mpi_slice_plan(nslices, rank, size, assignment=assignment) - local = contract_tree_slices(tree, arrays, plan.indices, backend=backend) + local = contract_tree_slices( + tree, + arrays, + plan.indices, + backend=backend, + implementation=implementation, + ) stats = SlicedContractStats(rank, size, nslices, len(plan.indices), assignment) result = np.zeros_like(local) if rank == root else None diff --git a/src/qibotn/torch_contractor.py b/src/qibotn/torch_contractor.py new file mode 100644 index 0000000..f2e7a67 --- /dev/null +++ b/src/qibotn/torch_contractor.py @@ -0,0 +1,252 @@ +"""Torch C++ contraction backend for cotengra trees. + +This module compiles a restricted cotengra contraction tree into a compact +execution plan, then executes that plan in a C++ torch extension. It is an +experimental CPU path for reducing Python-level overhead between many +pairwise contractions. +""" + +from __future__ import annotations + +import importlib +import os +from functools import lru_cache +from pathlib import Path +from collections import defaultdict + + +_EXTENSION = None +_CONTRACTORS = {} +SMALL_GEMM_BATCH_FLOPS = 1_000_000 + + +def _load_extension(): + global _EXTENSION + if _EXTENSION is not None: + return _EXTENSION + + from torch.utils.cpp_extension import load + + source = Path(__file__).resolve().parent / "csrc" / "torch_contractor.cpp" + mklroot = os.environ.get("MKLROOT") + extra_cflags = ["-O3"] + extra_ldflags = [] + extra_include_paths = [] + if mklroot: + mklroot_path = Path(mklroot) + mkl_include = mklroot_path / "include" + mkl_lib = mklroot_path / "lib" + if (mkl_include / "mkl_cblas.h").exists() and ( + (mkl_lib / "libmkl_rt.so").exists() + or (mkl_lib / "libmkl_rt.so.2").exists() + ): + extra_cflags.append("-DQIBOTN_USE_MKL") + extra_include_paths.append(str(mkl_include)) + extra_ldflags.extend([f"-L{mkl_lib}", "-lmkl_rt"]) + + _EXTENSION = load( + name="qibotn_torch_contractor", + sources=[str(source)], + extra_cflags=extra_cflags, + extra_ldflags=extra_ldflags, + extra_include_paths=extra_include_paths, + verbose=False, + ) + return _EXTENSION + + +def _is_plain_permutation(expr): + if expr is None: + return None + if isinstance(expr, tuple): + return tuple(int(i) for i in expr) + if not isinstance(expr, str): + return None + if "," in expr or "->" not in expr: + return None + source, target = expr.split("->", 1) + if len(source) != len(target): + return None + if len(set(source)) != len(source) or set(source) != set(target): + return None + return tuple(source.index(ix) for ix in target) + + +def _maybe_tuple(values): + return () if values is None else tuple(int(x) for x in values) + + +def _shape_from_inds(tree, node): + return tuple(int(tree.size_dict[ix]) for ix in tree.get_inds(node)) + + +def _matmul_signature(op): + kind = op[3] + if kind != 0: + return None + left_shape = op[5] + right_shape = op[7] + if len(left_shape) == 2 and len(right_shape) == 2: + m, k, n = left_shape[-2], left_shape[-1], right_shape[-1] + return ("mm", int(m), int(k), int(n), int(m * k * n)) + return None + + +def _normalize_node_ids(tree, contractions): + leaf_to_id = { + frozenset((i,)): i + for i in range(tree.N) + } + next_id = len(leaf_to_id) + node_to_id = dict(leaf_to_id) + for parent, _left, _right, _tdot, _arg, _perm in contractions: + if parent not in node_to_id: + node_to_id[parent] = next_id + next_id += 1 + + return node_to_id, next_id + + +@lru_cache(maxsize=32) +def compile_torch_plan(tree): + """Compile ``tree`` into C++ contractor plan fields. + + The supported subset is the same pairwise matmul lowering used by + cotengra for torch CPU. Single-tensor diagonal/sum preprocessing is not + supported yet because it appears only in less common trees; callers should + fall back to cotengra for those cases. + """ + + contract_mod = importlib.import_module("cotengra.contract") + contractions = contract_mod.extract_contractions(tree) + node_to_id, ntemps = _normalize_node_ids(tree, contractions) + plan = [] + + for parent, left, right, tdot, arg, perm in contractions: + if left is None or right is None: + raise NotImplementedError( + "C++ torch contractor does not support cotengra preprocessing." + ) + + left_shape = _shape_from_inds(tree, left) + right_shape = _shape_from_inds(tree, right) + if tdot: + parsed = contract_mod._parse_tensordot_axes_to_matmul( + arg, + left_shape, + right_shape, + ) + else: + parsed = contract_mod._parse_eq_to_batch_matmul( + arg, + left_shape, + right_shape, + ) + + ( + eq_a, + eq_b, + new_shape_a, + new_shape_b, + new_shape_ab, + perm_ab, + pure_multiplication, + ) = parsed + + left_perm = _is_plain_permutation(eq_a) + right_perm = _is_plain_permutation(eq_b) + if left_perm is None and eq_a is not None: + raise NotImplementedError(f"Unsupported left preparation: {eq_a!r}") + if right_perm is None and eq_b is not None: + raise NotImplementedError(f"Unsupported right preparation: {eq_b!r}") + + plan.append( + ( + node_to_id[parent], + node_to_id[left], + node_to_id[right], + 1 if pure_multiplication else 0, + left_perm or (), + _maybe_tuple(new_shape_a), + right_perm or (), + _maybe_tuple(new_shape_b), + _maybe_tuple(new_shape_ab), + _maybe_tuple(perm_ab), + ) + ) + + if perm is not None: + raise NotImplementedError( + "C++ torch contractor does not support cotengra tensordot perm." + ) + + root_id = node_to_id[tree.root] + return tuple(plan), int(ntemps), int(root_id) + + +@lru_cache(maxsize=32) +def compile_batch_groups(tree, max_flops=SMALL_GEMM_BATCH_FLOPS): + plan, _ntemps, _root_id = compile_torch_plan(tree) + contractions = importlib.import_module("cotengra.contract").extract_contractions(tree) + node_to_id, _ntemps = _normalize_node_ids(tree, contractions) + depth = {frozenset((i,)): 0 for i in range(tree.N)} + tensor_depth = {i: 0 for i in range(tree.N)} + groups = defaultdict(list) + + for op_index, (contract_op, contraction) in enumerate(zip(plan, contractions)): + parent, left, right, _tdot, _arg, _perm = contraction + d = max(depth[left], depth[right]) + 1 + depth[parent] = d + tensor_depth[contract_op[0]] = d + sig = _matmul_signature(contract_op) + if sig is None: + continue + kind, m, k, n, flops = sig + if flops > max_flops: + continue + groups[(d, kind, m, k, n)].append(op_index) + + batch_groups = tuple( + tuple(items) + for _key, items in sorted(groups.items(), key=lambda item: (item[0], item[1][0])) + if len(items) >= 2 + ) + return batch_groups + + +def batch_group_summary(tree, max_flops=SMALL_GEMM_BATCH_FLOPS): + plan, _ntemps, _root_id = compile_torch_plan(tree) + groups = compile_batch_groups(tree, max_flops=max_flops) + covered = sum(len(group) for group in groups) + calls_saved = sum(len(group) - 1 for group in groups) + by_shape = [] + for group in groups: + op = plan[group[0]] + sig = _matmul_signature(op) + by_shape.append((sig[1:4], len(group), group[:8])) + return { + "groups": len(groups), + "covered_ops": covered, + "calls_saved": calls_saved, + "by_shape": by_shape, + } + + +def contract_tree_cpp(tree, arrays): + """Contract a cotengra tree using the experimental C++ torch contractor.""" + + contractor = prepare_torch_cpp_contractor(tree) + return contractor.contract(list(arrays)) + + +def prepare_torch_cpp_contractor(tree): + """Load the extension and compile ``tree`` without running contraction.""" + + ext = _load_extension() + key = id(tree) + contractor = _CONTRACTORS.get(key) + if contractor is None: + plan, ntemps, root_id = compile_torch_plan(tree) + contractor = ext.Contractor(list(plan), ntemps, root_id) + _CONTRACTORS[key] = contractor + return contractor diff --git a/tests/test_cpu_backend.py b/tests/test_cpu_backend.py index 300c6fe..877ddd8 100644 --- a/tests/test_cpu_backend.py +++ b/tests/test_cpu_backend.py @@ -5,7 +5,10 @@ from qibo import Circuit, gates, hamiltonians from qibo.symbols import X, Z from qibotn.backends.cpu import CpuTensorNet -from qibotn.benchmark_cases import build_circuit as build_benchmark_circuit +from qibotn.benchmark_cases import ( + build_circuit as build_benchmark_circuit, + exact_pauli_sum, +) def build_circuit(nqubits=6): @@ -109,6 +112,59 @@ def test_cpu_mps_sampling_uses_nshots(): assert set(result.frequencies()) <= {"0000", "1111"} +def test_cpu_mps_mpo_expectation_matches_statevector(): + circuit = build_circuit(nqubits=4) + x = np.array([[0, 1], [1, 0]], dtype=complex) + z = np.array([[1, 0], [0, -1]], dtype=complex) + i2 = np.eye(2, dtype=complex) + mpo = [ + x.reshape(1, 2, 2, 1), + z.reshape(1, 2, 2, 1), + i2.reshape(1, 2, 2, 1), + i2.reshape(1, 2, 2, 1), + ] + exact = exact_pauli_sum(circuit, [(1.0, (("X", 0), ("Z", 1)))], 4) + + backend = CpuTensorNet( + { + "MPI_enabled": False, + "MPS_enabled": True, + "NCCL_enabled": False, + "expectation_enabled": {"mpo_tensors": mpo}, + "max_bond_dimension": 64, + "tensor_module": "torch", + "torch_threads": 1, + } + ) + value = backend.execute_circuit(circuit)[0] + + assert math.isclose(value, exact, abs_tol=1e-12) + + +def test_cpu_mps_dense_observable_dict_matches_known_value(): + circuit = Circuit(2) + circuit.add(gates.H(0)) + circuit.add(gates.CNOT(0, 1)) + + bell = np.zeros((4, 4), dtype=complex) + bell[0, 0] = bell[0, 3] = bell[3, 0] = bell[3, 3] = 0.5 + + backend = CpuTensorNet( + { + "MPI_enabled": False, + "MPS_enabled": True, + "NCCL_enabled": False, + "expectation_enabled": {"matrix": bell, "qubits": [0, 1]}, + "max_bond_dimension": 16, + "tensor_module": "torch", + "torch_threads": 1, + } + ) + value = backend.execute_circuit(circuit)[0] + + assert math.isclose(value, 1.0, abs_tol=1e-12) + + def test_cpu_generic_tn_long_pauli_string_matches_statevector(): circuit = build_benchmark_circuit("rxx_rzz", 10, 2, 42) observable = {"pauli_string_pattern": "XZ"} diff --git a/tests/test_vidal_backend.py b/tests/test_vidal_backend.py index 149cd0c..0af5736 100644 --- a/tests/test_vidal_backend.py +++ b/tests/test_vidal_backend.py @@ -2,10 +2,18 @@ import math import numpy as np from qibo import Circuit, gates, hamiltonians -from qibo.symbols import X, Y, Z +from qibo.symbols import Symbol, X, Y, Z -from qibotn.backends.vidal import VidalBackend, _can_route_non_adjacent, _unsupported_reason +from qibotn.benchmark_cases import exact_pauli_sum +from qibotn.backends.vidal import ( + VidalBackend, + _can_route_non_adjacent, + _unsupported_reason, + _operator_terms_to_mpo, + _symbolic_hamiltonian_to_operator_terms, +) from qibotn.backends.vidal_tebd import ( + VidalTEBDExecutor, _route_non_adjacent_gates, _gate_sites, ) @@ -37,6 +45,25 @@ def test_vidal_backend_expectation_matches_statevector(): np.testing.assert_allclose(value, exact, atol=1e-12) +def test_vidal_backend_accepts_unlimited_bond_and_no_cutoff(): + circuit = build_local_circuit(nqubits=6, nlayers=2) + observable = hamiltonians.SymbolicHamiltonian( + form=0.5 * X(0) * Z(1) - 0.7 * Z(5) + ) + exact = observable.expectation_from_state(circuit().state(numpy=True)) + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=None, + cut_ratio=None, + tensor_module="torch", + fallback=False, + ) + value = backend.expectation(circuit, observable, preprocess=False) + + np.testing.assert_allclose(value, exact, atol=1e-12) + + def test_vidal_backend_fallback_for_non_adjacent_gate(): """compile_circuit=False (default) → falls back to qmatchatea for non-adjacent.""" circuit = Circuit(4) @@ -128,6 +155,208 @@ def test_routing_non_adjacent_cnot(): np.testing.assert_allclose(value, exact, atol=1e-12) +def test_routing_preserves_reversed_non_adjacent_gate_order(): + circuit = Circuit(6) + circuit.add(gates.X(5)) + circuit.add(gates.H(0)) + circuit.add(gates.CNOT(5, 0)) + + observable = hamiltonians.SymbolicHamiltonian(form=X(0) + Z(5) + Z(0) * Z(5)) + exact = observable.expectation_from_state(circuit().state(numpy=True)) + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=64, + tensor_module="torch", + compile_circuit=True, + fallback=False, + ) + value = backend.expectation(circuit, observable, preprocess=False) + + np.testing.assert_allclose(value, exact, atol=1e-12) + + +def test_vidal_backend_preprocesses_non_adjacent_circuit(): + circuit = Circuit(4) + circuit.add(gates.H(0)) + circuit.add(gates.CNOT(0, 3)) + observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3)) + exact = observable.expectation_from_state(circuit().state(numpy=True)) + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=64, + tensor_module="torch", + compile_circuit=True, + fallback=False, + ) + value = backend.expectation(circuit, observable, preprocess=True) + + np.testing.assert_allclose(value, exact, atol=1e-12) + + +def test_vidal_backend_preprocesses_toffoli_locally(): + circuit = Circuit(4) + circuit.add(gates.H(0)) + circuit.add(gates.H(1)) + circuit.add(gates.TOFFOLI(0, 1, 3)) + observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3)) + exact = observable.expectation_from_state(circuit().state(numpy=True)) + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=128, + tensor_module="torch", + compile_circuit=True, + fallback=False, + ) + value = backend.expectation(circuit, observable, preprocess=True) + + np.testing.assert_allclose(value, exact, atol=1e-12) + + +def test_vidal_expectation_preserves_complex_coefficients(): + circuit = Circuit(1) + observable = hamiltonians.SymbolicHamiltonian(form=(1.0 + 2.0j) * Z(0)) + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=8, + tensor_module="torch", + fallback=False, + ) + value = backend.expectation(circuit, observable, preprocess=False) + + np.testing.assert_allclose(value, 1.0 + 2.0j, atol=1e-12) + + +def test_vidal_expectation_supports_custom_local_symbols(): + circuit = build_local_circuit(nqubits=4, nlayers=2) + a0 = Symbol(0, np.array([[0.2, 1.0], [1.0, -0.3]], dtype=complex), name="A") + b2 = Symbol(2, np.array([[0.7, -0.4j], [0.4j, 0.1]], dtype=complex), name="B") + a3 = Symbol(3, np.array([[0.5, 0.2], [0.2, -0.8]], dtype=complex), name="A") + observable = hamiltonians.SymbolicHamiltonian(form=0.7 * a0 * b2 - 0.4 * a3) + exact = observable.expectation_from_state(circuit().state(numpy=True)) + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=64, + tensor_module="torch", + fallback=False, + ) + value = backend.expectation(circuit, observable, preprocess=False) + + np.testing.assert_allclose(value, exact, atol=1e-12) + + +def test_vidal_executor_mpo_expectation_matches_pauli_sum(): + circuit = build_local_circuit(nqubits=4, nlayers=2) + executor = VidalTEBDExecutor( + nqubits=circuit.nqubits, + max_bond=64, + tensor_module="torch", + ) + executor.run_circuit(circuit) + + x = np.array([[0, 1], [1, 0]], dtype=complex) + z = np.array([[1, 0], [0, -1]], dtype=complex) + i2 = np.eye(2, dtype=complex) + mpo = [ + x.reshape(1, 2, 2, 1), + z.reshape(1, 2, 2, 1), + i2.reshape(1, 2, 2, 1), + i2.reshape(1, 2, 2, 1), + ] + mpo_value = executor.expectation_mpo(mpo) + pauli_value = executor.expectation_pauli_sum([(1.0, (("X", 0), ("Z", 1)))]) + + np.testing.assert_allclose(mpo_value, pauli_value, atol=1e-12) + + +def test_vidal_backend_accepts_mpo_observable_dict(): + circuit = build_local_circuit(nqubits=4, nlayers=2) + x = np.array([[0, 1], [1, 0]], dtype=complex) + z = np.array([[1, 0], [0, -1]], dtype=complex) + i2 = np.eye(2, dtype=complex) + mpo = [ + x.reshape(1, 2, 2, 1), + z.reshape(1, 2, 2, 1), + i2.reshape(1, 2, 2, 1), + i2.reshape(1, 2, 2, 1), + ] + exact = exact_pauli_sum(circuit, [(1.0, (("X", 0), ("Z", 1)))], 4) + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=64, + tensor_module="torch", + fallback=False, + ) + value = backend.expectation(circuit, {"mpo_tensors": mpo}, preprocess=False) + + np.testing.assert_allclose(value, exact, atol=1e-12) + + +def test_vidal_symbolic_hamiltonian_auto_mpo_matches_operator_sum(): + circuit = build_local_circuit(nqubits=5, nlayers=2) + observable = hamiltonians.SymbolicHamiltonian( + form=0.3 * X(0) * Z(1) - 0.2j * Y(2) + 0.7 * Z(3) * X(4) + ) + + executor = VidalTEBDExecutor( + nqubits=circuit.nqubits, + max_bond=64, + tensor_module="torch", + ) + executor.run_circuit(circuit) + terms = _symbolic_hamiltonian_to_operator_terms(observable) + + term_value = executor.expectation_operator_sum(terms) + mpo_value = executor.expectation_mpo(_operator_terms_to_mpo(terms, circuit.nqubits)) + + np.testing.assert_allclose(mpo_value, term_value, atol=1e-12) + + +def test_vidal_backend_accepts_dense_two_qubit_observable(): + circuit = Circuit(2) + circuit.add(gates.H(0)) + circuit.add(gates.CNOT(0, 1)) + + bell = np.zeros((4, 4), dtype=complex) + bell[0, 0] = bell[0, 3] = bell[3, 0] = bell[3, 3] = 0.5 + observable = {"matrix": bell, "qubits": [0, 1]} + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=16, + tensor_module="torch", + fallback=False, + ) + value = backend.expectation(circuit, observable, preprocess=False) + + np.testing.assert_allclose(value, 1.0, atol=1e-12) + + +def test_vidal_backend_dense_observable_preserves_complex_value(): + circuit = Circuit(2) + circuit.add(gates.H(0)) + circuit.add(gates.H(1)) + + op = np.zeros((4, 4), dtype=complex) + op[0, 3] = 1.0 + observable = {"coefficient": 1.0j, "matrix": op, "qubits": [0, 1]} + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=16, + tensor_module="torch", + fallback=False, + ) + value = backend.expectation(circuit, observable, preprocess=False) + + np.testing.assert_allclose(value, 0.25j, atol=1e-12) + + def test_truncation_error_no_truncation(): """With large bond, truncation error should be essentially zero.""" circuit = build_local_circuit(nqubits=6, nlayers=2) @@ -141,6 +370,10 @@ def test_truncation_error_no_truncation(): assert backend.last_truncation_error < 1e-14, ( f"Expected near-zero truncation error, got {backend.last_truncation_error}" ) + assert backend.last_max_truncation_error < 1e-14, ( + "Expected near-zero max truncation error, got " + f"{backend.last_max_truncation_error}" + ) def test_vidal_backend_matches_statevector_multiterm(): diff --git a/tools/baseline_mps_expectation.py b/tools/baseline_mps_expectation.py index 311cef7..ef12ae3 100644 --- a/tools/baseline_mps_expectation.py +++ b/tools/baseline_mps_expectation.py @@ -19,6 +19,22 @@ from qibotn.backends.qmatchatea import QMatchaTeaBackend from qibotn.backends.vidal_tebd import run_vidal_ring_xz +def optional_int(text): + if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: + return None + return int(text) + + +def optional_float(text): + if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: + return None + return float(text) + + +def format_optional(value, fmt="g"): + return "None" if value is None else format(value, fmt) + + def build_circuit(nqubits, nlayers, seed): return build_benchmark_circuit("brickwall_cnot", nqubits, nlayers, seed) @@ -35,7 +51,8 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("--nqubits", type=int, default=40) parser.add_argument("--nlayers", type=int, default=30) - parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512) + parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=512) + parser.add_argument("--cut-ratio", type=optional_float, default=1e-12) parser.add_argument("--seed", type=int, default=42) parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="torch") parser.add_argument("--torch-threads", type=int, default=32) @@ -109,7 +126,8 @@ def main(): mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR" print( f"nqubits={args.nqubits} nlayers={args.nlayers} " - f"bond={args.bond} seed={args.seed} " + f"bond={format_optional(args.bond)} " + f"cut_ratio={format_optional(args.cut_ratio)} seed={args.seed} " f"tensor_module={args.tensor_module} svd_control=E! " f"compile_circuit=True mpi={mpi_label} executor={args.executor}" ) @@ -128,7 +146,7 @@ def main(): value, timings = run_segment_vidal_mpi_ring_xz( circuit, max_bond=args.bond, - cut_ratio=1e-12, + cut_ratio=args.cut_ratio, tensor_module=args.tensor_module, comm=MPI.COMM_WORLD, ) @@ -136,7 +154,7 @@ def main(): value = run_vidal_ring_xz( circuit, max_bond=args.bond, - cut_ratio=1e-12, + cut_ratio=args.cut_ratio, tensor_module=args.tensor_module, ) else: @@ -144,7 +162,7 @@ def main(): backend.configure_tn_simulation( ansatz="MPS", max_bond_dimension=args.bond, - cut_ratio=1e-12, + cut_ratio=args.cut_ratio, svd_control="E!", tensor_module=args.tensor_module, compile_circuit=True, diff --git a/tools/example_tn_case.py b/tools/example_tn_case.py new file mode 100644 index 0000000..c35f057 --- /dev/null +++ b/tools/example_tn_case.py @@ -0,0 +1,33 @@ +"""Example custom case for tools/run_tn_custom.py.""" + +from __future__ import annotations + +import math + +import numpy as np +from qibo import Circuit, gates + + +def build_circuit(nqubits, nlayers, seed): + rng = np.random.default_rng(seed) + circuit = Circuit(nqubits) + for layer in range(nlayers): + for qubit in range(nqubits): + circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi))) + circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi))) + for qubit in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7))) + circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7))) + return circuit + + +def build_observable(nqubits, seed): + return { + "terms": [ + { + "coefficient": 1.0 / max(1, nqubits - 1), + "operators": [("Z", site), ("Z", site + 1)], + } + for site in range(nqubits - 1) + ] + } diff --git a/tools/inspect_contraction_tree.py b/tools/inspect_contraction_tree.py new file mode 100644 index 0000000..a6422ba --- /dev/null +++ b/tools/inspect_contraction_tree.py @@ -0,0 +1,208 @@ +"""Inspect cotengra contraction trees for dominant torch matmul shapes.""" + +from __future__ import annotations + +import argparse +import importlib +import math +import pickle +from collections import Counter, defaultdict +from pathlib import Path + + +def _prod(values): + out = 1 + for value in values: + out *= int(value) + return out + + +def _broadcast_batch(a_batch, b_batch): + if a_batch == b_batch: + return _prod(a_batch) + if not a_batch: + return _prod(b_batch) + if not b_batch: + return _prod(a_batch) + + ndim = max(len(a_batch), len(b_batch)) + a_batch = (1,) * (ndim - len(a_batch)) + tuple(a_batch) + b_batch = (1,) * (ndim - len(b_batch)) + tuple(b_batch) + return _prod(max(a, b) for a, b in zip(a_batch, b_batch)) + + +def _load_tree(path, index): + with Path(path).open("rb") as f: + payload = pickle.load(f) + trees = payload["trees"] if isinstance(payload, dict) else payload + if not isinstance(trees, (list, tuple)): + trees = [trees] + return trees[index] + + +def _analyze_tree(tree): + contract_mod = importlib.import_module("cotengra.contract") + contractions = contract_mod.extract_contractions(tree) + size_dict = tree.size_dict + ops = [] + counts = Counter() + + for op_index, (parent, left, right, tdot, arg, perm) in enumerate(contractions): + if left is None and right is None: + counts["preprocess"] += 1 + continue + + left_inds = tree.get_inds(left) + right_inds = tree.get_inds(right) + parent_inds = tree.get_inds(parent) + left_shape = tuple(size_dict[ix] for ix in left_inds) + right_shape = tuple(size_dict[ix] for ix in right_inds) + + if tdot: + parsed = contract_mod._parse_tensordot_axes_to_matmul( + arg, + left_shape, + right_shape, + ) + else: + parsed = contract_mod._parse_eq_to_batch_matmul( + arg, + left_shape, + right_shape, + ) + + ( + _eq_a, + _eq_b, + new_shape_a, + new_shape_b, + _new_shape_ab, + _perm_ab, + pure_multiplication, + ) = parsed + + matmul_shape = None + matmul_flops = 0 + if pure_multiplication: + kind = "mul" + else: + a_shape = tuple(new_shape_a or left_shape) + b_shape = tuple(new_shape_b or right_shape) + batch = _broadcast_batch(a_shape[:-2], b_shape[:-2]) + m, k, n = int(a_shape[-2]), int(a_shape[-1]), int(b_shape[-1]) + kind = "mm" if batch == 1 else "bmm" + matmul_shape = (batch, m, k, n) + matmul_flops = batch * m * k * n + + tree_flops = int(tree.get_flops(parent)) + out_size = int(tree.get_size(parent)) + ops.append( + { + "index": op_index, + "kind": kind, + "matmul_shape": matmul_shape, + "matmul_flops": matmul_flops, + "tree_flops": tree_flops, + "out_size": out_size, + "left_shape": left_shape, + "right_shape": right_shape, + "left_rank": len(left_inds), + "right_rank": len(right_inds), + "out_rank": len(parent_inds), + "perm": perm, + } + ) + counts[kind] += 1 + + return contractions, ops, counts + + +def _format_log(value, base): + return "-inf" if value <= 0 else f"{math.log(value, base):.3f}" + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("tree", help="Pickle file containing one tree or {'trees': [...]}.") + parser.add_argument("--index", type=int, default=0, help="Tree index in the file.") + parser.add_argument("--top", type=int, default=20, help="Number of top ops to print.") + parser.add_argument( + "--dtype-bytes", + type=int, + default=8, + help="Bytes per element for memory estimates, for example 8 for complex64.", + ) + args = parser.parse_args() + + tree = _load_tree(args.tree, args.index) + contractions, ops, counts = _analyze_tree(tree) + nslices = int(getattr(tree, "multiplicity", 1)) + per_slice_flops = sum(op["tree_flops"] for op in ops) + per_slice_write = sum(op["out_size"] for op in ops) + max_out = max((op["out_size"] for op in ops), default=0) + all_flops = per_slice_flops * nslices + all_write = per_slice_write * nslices + + print(f"tree={args.tree} index={args.index}") + print( + "summary " + f"slices={nslices} contractions={len(contractions)} " + f"counts={dict(counts)}" + ) + print( + "per_slice " + f"log10_flops={_format_log(per_slice_flops, 10)} " + f"log10_write={_format_log(per_slice_write, 10)} " + f"log2_max_output={_format_log(max_out, 2)} " + f"max_output_gib={max_out * args.dtype_bytes / 1024**3:.6g}" + ) + print( + "all_slices " + f"log10_flops={_format_log(all_flops, 10)} " + f"log10_write={_format_log(all_write, 10)}" + ) + + print(f"\ntop_{args.top}_ops_by_flops") + for op in sorted(ops, key=lambda item: item["tree_flops"], reverse=True)[: args.top]: + print( + f"op={op['index']} kind={op['kind']} " + f"flops={op['tree_flops']:.6e} out={op['out_size']:.6e} " + f"matmul={op['matmul_shape']} " + f"ranks=({op['left_rank']},{op['right_rank']}->{op['out_rank']}) " + f"lhs={op['left_shape']} rhs={op['right_shape']}" + ) + + by_shape = defaultdict(lambda: [0, 0, 0]) + for op in ops: + shape = op["matmul_shape"] + if shape is None: + continue + by_shape[shape][0] += 1 + by_shape[shape][1] += op["tree_flops"] + by_shape[shape][2] += op["out_size"] + + print(f"\ntop_{args.top}_matmul_shapes_by_flops") + for shape, (count, flops, out_size) in sorted( + by_shape.items(), + key=lambda item: item[1][1], + reverse=True, + )[: args.top]: + print( + f"shape={shape} count={count} " + f"flops={flops:.6e} output={out_size:.6e}" + ) + + print(f"\ntop_{args.top}_matmul_shapes_by_count") + for shape, (count, flops, out_size) in sorted( + by_shape.items(), + key=lambda item: item[1][0], + reverse=True, + )[: args.top]: + print( + f"shape={shape} count={count} " + f"flops={flops:.6e} output={out_size:.6e}" + ) + + +if __name__ == "__main__": + main() diff --git a/tools/manage_tn_dask_cluster.sh b/tools/manage_tn_dask_cluster.sh new file mode 100755 index 0000000..2fb7446 --- /dev/null +++ b/tools/manage_tn_dask_cluster.sh @@ -0,0 +1,223 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Manage the dask cluster used by TN path search. +# +# Defaults target two servers: +# scheduler: 10.20.1.103:8786 +# workers: 10.20.1.103, 10.20.1.102 +# +# Usage: +# tools/manage_tn_dask_cluster.sh start +# tools/manage_tn_dask_cluster.sh status +# tools/manage_tn_dask_cluster.sh stop +# +# Common overrides: +# SCHEDULER_HOST=10.20.1.103 +# WORKER_HOSTS="10.20.1.103 10.20.1.102" +# NWORKERS=48 +# NTHREADS=1 +# ROOT_DIR=/home/yx/qibotn +# PYTHON_BIN=.venv/bin/python + +ROOT_DIR="${ROOT_DIR:-/home/yx/qibotn}" +PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" +SCHEDULER_HOST="${SCHEDULER_HOST:-10.20.1.103}" +SCHEDULER_PORT="${SCHEDULER_PORT:-8786}" +DASHBOARD_ADDRESS="${DASHBOARD_ADDRESS:-:8787}" +WORKER_HOSTS="${WORKER_HOSTS:-10.20.1.103 10.20.1.102}" +NWORKERS="${NWORKERS:-48}" +NTHREADS="${NTHREADS:-1}" +MEMORY_LIMIT="${MEMORY_LIMIT:-0}" +LOCAL_DIRECTORY="${LOCAL_DIRECTORY:-/tmp/qibotn-dask}" +LOG_DIR="${LOG_DIR:-$ROOT_DIR/logs/dask}" +SSH_BIN="${SSH_BIN:-ssh}" +DASK_WORKER_TTL="${DASK_WORKER_TTL:-24 hours}" +DASK_TICK_LIMIT="${DASK_TICK_LIMIT:-30 minutes}" +DASK_LOST_WORKER_TIMEOUT="${DASK_LOST_WORKER_TIMEOUT:-30 minutes}" + +SCHEDULER_ADDR="tcp://${SCHEDULER_HOST}:${SCHEDULER_PORT}" + +is_local_host() { + local host="$1" + [[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0 + [[ "$host" == "$(hostname)" ]] && return 0 + [[ "$host" == "$(hostname -f 2>/dev/null || true)" ]] && return 0 + hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx "$host" +} + +run_on_host() { + local host="$1" + shift + local cmd="$*" + if is_local_host "$host"; then + bash -lc "$cmd" + else + "$SSH_BIN" "$host" "bash -lc $(printf '%q' "$cmd")" + fi +} + +start_scheduler() { + local host="$SCHEDULER_HOST" + local log="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.log" + local pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid" + run_on_host "$host" " + set -euo pipefail + cd '$ROOT_DIR' + mkdir -p '$LOG_DIR' + if [[ -s '$pid_file' ]]; then + pid=\$(cat '$pid_file') + if kill -0 \"\$pid\" 2>/dev/null; then + echo \"scheduler already running on $host pid=\$pid\" + exit 0 + fi + fi + DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \ + DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \ + DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \ + setsid '$PYTHON_BIN' -m distributed.cli.dask_scheduler \ + --host '$SCHEDULER_HOST' \ + --port '$SCHEDULER_PORT' \ + --dashboard-address '$DASHBOARD_ADDRESS' \ + > '$log' 2>&1 < /dev/null & + pid=\$! + echo \"\$pid\" > '$pid_file' + echo \"scheduler host=$host pid=\$pid addr=$SCHEDULER_ADDR log=$log\" + " +} + +start_worker() { + local host="$1" + local log="$LOG_DIR/worker_${host}.log" + local pid_file="$LOG_DIR/worker_${host}.pid" + run_on_host "$host" " + set -euo pipefail + cd '$ROOT_DIR' + mkdir -p '$LOG_DIR' '$LOCAL_DIRECTORY' + if [[ -s '$pid_file' ]]; then + pid=\$(cat '$pid_file') + if kill -0 \"\$pid\" 2>/dev/null; then + echo \"worker already running on $host pid=\$pid\" + exit 0 + fi + fi + TCM_ENABLE=1 \ + DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \ + DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \ + DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \ + setsid '$PYTHON_BIN' -m distributed.cli.dask_worker \ + '$SCHEDULER_ADDR' \ + --host '$host' \ + --nworkers '$NWORKERS' \ + --nthreads '$NTHREADS' \ + --memory-limit '$MEMORY_LIMIT' \ + --local-directory '$LOCAL_DIRECTORY' \ + > '$log' 2>&1 < /dev/null & + pid=\$! + echo \"\$pid\" > '$pid_file' + echo \"worker host=$host pid=\$pid scheduler=$SCHEDULER_ADDR log=$log\" + " +} + +stop_host() { + local host="$1" + local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid" + local worker_pid_file="$LOG_DIR/worker_${host}.pid" + run_on_host "$host" " + set +e + for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do + [[ -f \"\$pid_file\" ]] || continue + if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then + continue + fi + pid=\$(cat \"\$pid_file\") + kill \"\$pid\" 2>/dev/null || true + rm -f \"\$pid_file\" + done + pkill -f '[d]istributed.cli.dask_worker.*$SCHEDULER_ADDR' + pkill -f '[d]istributed.cli.dask_scheduler.*--port $SCHEDULER_PORT' + true + " +} + +status_host() { + local host="$1" + local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid" + local worker_pid_file="$LOG_DIR/worker_${host}.pid" + echo "--------------------------------------------------------------------------------" + echo "host=$host" + run_on_host "$host" " + set +e + for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do + [[ -f \"\$pid_file\" ]] || continue + if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then + continue + fi + pid=\$(cat \"\$pid_file\") + if kill -0 \"\$pid\" 2>/dev/null; then + ps -p \"\$pid\" -o pid,ppid,stat,etime,cmd --no-headers + else + echo \"stale pid_file=\$pid_file pid=\$pid\" + fi + done + pgrep -af '[d]istributed.cli.dask' || true + " +} + +case "${1:-help}" in + start) + start_scheduler + sleep 2 + for host in $WORKER_HOSTS; do + start_worker "$host" + done + echo + echo "Dask scheduler: $SCHEDULER_ADDR" + echo "Dashboard: http://$SCHEDULER_HOST$DASHBOARD_ADDRESS" + ;; + stop) + for host in $WORKER_HOSTS; do + stop_host "$host" + done + stop_host "$SCHEDULER_HOST" + ;; + status) + status_host "$SCHEDULER_HOST" + for host in $WORKER_HOSTS; do + [[ "$host" == "$SCHEDULER_HOST" ]] && continue + status_host "$host" + done + ;; + restart) + "$0" stop + sleep 2 + "$0" start + ;; + help|*) + cat < args.exact_max_qubits: + raise ValueError( + f"--exact is limited to {args.exact_max_qubits} qubits by default." + ) + exact = exact_for_observable(circuit, obs, args.nqubits) + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=args.bond, + cut_ratio=args.cut_ratio, + tensor_module="torch", + mpi_approach="CT", + mpi_num_procs=size, + fallback=False, + ) + + comm.Barrier() + start = time.perf_counter() + try: + value = backend.expectation( + circuit, + obs, + preprocess=True, + compile_circuit=False, + ) + status = "ok" + except Exception as exc: + value = np.nan + status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0] + seconds = time.perf_counter() - start + + if rank == 0: + abs_error = float("nan") if exact is None else abs(value - exact) + rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15) + exact_text = "nan" if exact is None else f"{exact:.16e}" + print( + f"{obs_name} {exact_text} {value!r} " + f"{abs_error:.6e} {rel_error:.6e} {seconds:.3f} " + f"{backend.last_truncation_error:.6e} " + f"{backend.last_max_truncation_error:.6e} {status}", + flush=True, + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("mode", choices=("run", "validate", "list")) + parser.add_argument("--case", choices=sorted(CASES), default="main1") + parser.add_argument("--observables", nargs="+") + parser.add_argument("--obs-filter", default="") + parser.add_argument("--nqubits", type=int) + parser.add_argument("--nlayers", type=int) + parser.add_argument("--bond", "--bonds", dest="bond", default="case-default") + parser.add_argument("--cut-ratio", type=optional_float, default=1e-12) + parser.add_argument("--seed", type=int) + parser.add_argument("--torch-threads", type=int, default=8) + parser.add_argument("--exact", action="store_true") + parser.add_argument("--exact-max-qubits", type=int, default=24) + args = parser.parse_args() + + if args.mode == "list": + for name, case in CASES.items(): + print( + f"{name}: circuit={case.circuit_kind} " + f"observables={','.join(case.observables)} " + f"nqubits={case.nqubits} nlayers={case.nlayers} " + f"bond={case.bond} seed={case.seed}" + ) + return + + apply_case_defaults(args) + if isinstance(args.bond, str): + args.bond = optional_int(args.bond) + + if args.mode == "validate": + args.exact = True + args.nqubits = min(args.nqubits, args.exact_max_qubits) + + run_case(args) + + +if __name__ == "__main__": + main() diff --git a/tools/run_tn_custom.py b/tools/run_tn_custom.py new file mode 100644 index 0000000..049ebed --- /dev/null +++ b/tools/run_tn_custom.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python +"""Run TN expectation for a user-provided circuit and observable. + +The case module should define: + + def build_circuit(nqubits, nlayers, seed): ... + def build_observable(nqubits, seed): ... + +``build_observable`` may return a Qibo SymbolicHamiltonian/form or the qibotn +dict form: + + {"terms": [ + {"coefficient": 1.0, "operators": [("X", 0), ("Z", 1)]}, + ]} + +For a single repeated Pauli string, pass ``--pauli-pattern`` instead of +defining ``build_observable``. +""" + +from __future__ import annotations + +import argparse +import importlib.util +import inspect +import json +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +SRC = ROOT / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) + +from qibotn.expectation_runner import ( # noqa: E402 + ExpectationConfig, + exact_for_observable, + run_cpu_expectation, +) + + +def optional_int(text): + if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: + return None + return int(text) + + +def optional_float(text): + if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: + return None + return float(text) + + +def load_module(path): + path = Path(path).resolve() + spec = importlib.util.spec_from_file_location(path.stem, path) + if spec is None or spec.loader is None: + raise RuntimeError(f"Cannot import case module from {path}.") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def call_builder(fn, **kwargs): + sig = inspect.signature(fn) + if any(p.kind == p.VAR_KEYWORD for p in sig.parameters.values()): + return fn(**kwargs) + accepted = { + name: value + for name, value in kwargs.items() + if name in sig.parameters + } + return fn(**accepted) + + +def load_observable(args, module): + if args.pauli_pattern: + return {"pauli_string_pattern": args.pauli_pattern} + if args.observable_json: + with Path(args.observable_json).open() as f: + return json.load(f) + if hasattr(module, "build_observable"): + return call_builder( + module.build_observable, + nqubits=args.nqubits, + nlayers=args.nlayers, + seed=args.seed, + ) + if hasattr(module, "OBSERVABLE"): + return module.OBSERVABLE + raise ValueError( + "No observable supplied. Define build_observable/OBSERVABLE in the case " + "module, or pass --pauli-pattern / --observable-json." + ) + + +def build_parallel_opts(args): + slicing_opts = {} + if args.tn_target_slices is not None: + slicing_opts["target_slices"] = args.tn_target_slices + if args.tn_target_size is not None: + slicing_opts["target_size"] = args.tn_target_size + + opts = { + "slicing_opts": slicing_opts or None, + "search_workers": args.tn_search_workers or args.torch_threads, + "max_repeats": args.tn_search_repeats, + "max_time": args.tn_search_time, + "print_stats": not args.no_tn_stats, + } + if args.tn_search_backend is not None: + opts["search_backend"] = args.tn_search_backend + if args.dask_address is not None: + opts["dask_address"] = args.dask_address + if args.dask_close_workers: + opts["dask_close_workers"] = True + if args.tn_save_tree is not None: + opts["save_tree_path"] = args.tn_save_tree + if args.tn_load_tree is not None: + opts["load_tree_path"] = args.tn_load_tree + if args.tn_search_only: + opts["search_only"] = True + return opts + + +def main(): + parser = argparse.ArgumentParser( + description="Run CPU TN expectation for a custom qibo circuit module." + ) + parser.add_argument("case_module", help="Python file defining build_circuit.") + parser.add_argument("--nqubits", type=int, required=True) + parser.add_argument("--nlayers", type=int, default=0) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--mpi", action="store_true") + parser.add_argument("--exact", action="store_true") + parser.add_argument("--exact-max-qubits", type=int, default=24) + parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024) + parser.add_argument("--cut-ratio", type=optional_float, default=1e-12) + parser.add_argument("--torch-threads", type=int, default=8) + parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch") + parser.add_argument("--dtype", choices=("complex128", "complex64"), default="complex128") + parser.add_argument("--pauli-pattern") + parser.add_argument("--observable-json") + parser.add_argument("--tn-target-slices", type=int) + parser.add_argument("--tn-target-size", type=int, default=2**32) + parser.add_argument("--tn-search-workers", type=int) + parser.add_argument("--tn-search-repeats", type=int, default=128) + parser.add_argument("--tn-search-time", type=float, default=60.0) + parser.add_argument("--tn-search-backend", choices=("processpool", "dask")) + parser.add_argument("--dask-address") + parser.add_argument("--dask-close-workers", action="store_true") + parser.add_argument("--tn-save-tree") + parser.add_argument("--tn-load-tree") + parser.add_argument("--tn-search-only", action="store_true") + parser.add_argument("--no-tn-stats", action="store_true") + args = parser.parse_args() + + rank = 0 + if args.mpi: + from mpi4py import MPI + + rank = MPI.COMM_WORLD.Get_rank() + + module = load_module(args.case_module) + if not hasattr(module, "build_circuit"): + raise ValueError("case_module must define build_circuit.") + + circuit = call_builder( + module.build_circuit, + nqubits=args.nqubits, + nlayers=args.nlayers, + seed=args.seed, + ) + observable = load_observable(args, module) + + config = ExpectationConfig( + ansatz="tn", + mpi=args.mpi, + bond=args.bond, + cut_ratio=args.cut_ratio, + tensor_module="torch", + quimb_backend=args.quimb_backend, + dtype=args.dtype, + torch_threads=args.torch_threads, + parallel_opts=build_parallel_opts(args), + ) + + if rank == 0: + mode = "MPI" if args.mpi else "serial" + print( + f"backend=cpu ansatz=TN mode={mode} case={Path(args.case_module).name} " + f"nqubits={args.nqubits} nlayers={args.nlayers} seed={args.seed} " + f"quimb_backend={args.quimb_backend} dtype={args.dtype} " + f"torch_threads={args.torch_threads}", + flush=True, + ) + print("observable exact value abs_error rel_error seconds", flush=True) + + exact = None + if args.exact and rank == 0: + if args.nqubits > args.exact_max_qubits: + raise ValueError( + f"--exact is limited to {args.exact_max_qubits} qubits by default." + ) + exact = exact_for_observable(circuit, observable, args.nqubits) + + result = run_cpu_expectation(circuit, observable, config) + if args.mpi and result.rank != 0: + return + + abs_error = float("nan") if exact is None else abs(result.value - exact) + rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15) + exact_text = "nan" if exact is None else f"{exact:.16e}" + print( + f"custom {exact_text} {result.value:.16e} " + f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}", + flush=True, + ) + + for stat in result.parallel_stats or (): + cost = stat["path_cost"] + search_stats = stat.get("search_stats", {}) + print( + "tn_term_summary " + f"term={stat.get('term_index', 0)} " + f"search_seconds={stat.get('search_seconds', float('nan')):.3f} " + f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} " + f"completed_trials={search_stats.get('completed_trials', 'na')} " + f"finite_trials={search_stats.get('finite_trials', 'na')} " + f"failed_trials={search_stats.get('failed_trials', 'na')} " + f"requested_trials={search_stats.get('requested_trials', 'na')} " + f"best_score={search_stats.get('best_score', float('nan')):.6g} " + f"slices={cost.get('slices')} " + f"log10_flops={cost.get('log10_flops', float('nan')):.3f} " + f"log10_write={cost.get('log10_write', float('nan')):.3f} " + f"log2_size={cost.get('log2_size', float('nan')):.3f} " + f"peak_memory_gib={cost.get('peak_memory_gib', float('nan')):.3g} " + f"rank_slices={stat.get('rank_slices')}", + flush=True, + ) + + +if __name__ == "__main__": + main() diff --git a/tools/run_vidal_mpi_contest_cases.sh b/tools/run_vidal_mpi_contest_cases.sh new file mode 100755 index 0000000..f2524e7 --- /dev/null +++ b/tools/run_vidal_mpi_contest_cases.sh @@ -0,0 +1,340 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Contest-style Vidal/MPI MPS cases. +# +# Usage: +# tools/run_vidal_mpi_contest_cases.sh main1 +# tools/run_vidal_mpi_contest_cases.sh main2 +# tools/run_vidal_mpi_contest_cases.sh strong +# tools/run_vidal_mpi_contest_cases.sh all +# +# Common overrides: +# PYTHON_BIN=.venv/bin/python +# MPIEXEC=mpiexec +# MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2" +# HOSTFILE=hostfile # optional; used only if the file exists +# RANKS=8 +# TORCH_THREADS=8 +# CUT_RATIO=1e-12 +# OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0" +# +# Per-case overrides: +# MAIN1_NQ=128 MAIN1_LAYERS=50 MAIN1_BOND=1024 MAIN1_SEED=31001 +# MAIN2_NQ=128 MAIN2_LAYERS=64 MAIN2_BOND=2048 MAIN2_SEED=31002 +# STRONG_NQ=256 STRONG_LAYERS=64 STRONG_BOND=2048 STRONG_SEED=41001 + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$ROOT_DIR" + +PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" +MPIEXEC="${MPIEXEC:-mpiexec}" +HOSTFILE="${HOSTFILE:-}" +RANKS="${RANKS:-4}" +TORCH_THREADS="${TORCH_THREADS:-1}" +CUT_RATIO="${CUT_RATIO:-1e-12}" +OBS_FILTER="${OBS_FILTER:-}" + +RUNNER_DIR="$ROOT_DIR/.tmp" +mkdir -p "$RUNNER_DIR" +RUNNER="$(mktemp "$RUNNER_DIR/qibotn_vidal_contest.XXXXXX.py")" +cleanup() { + rm -f "$RUNNER" +} +trap cleanup EXIT + +cat > "$RUNNER" <<'PY' +from __future__ import annotations + +import argparse +import math +import time + +import numpy as np +from mpi4py import MPI +from qibo import Circuit, gates, hamiltonians +from qibo.symbols import X, Y, Z + +from qibotn.backends.vidal import VidalBackend + + +def set_torch_threads(nthreads): + try: + import torch + + torch.set_num_threads(nthreads) + except Exception: + pass + + +def build_circuit(kind, nqubits, nlayers, seed): + rng = np.random.default_rng(seed) + circuit = Circuit(nqubits) + + for layer in range(nlayers): + for q in range(nqubits): + circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi))) + circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi))) + if kind in ("rxx_rzz", "scramble"): + circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi))) + + if kind == "reversed_cnot": + for q in range(0, nqubits - 1, 2): + circuit.add(gates.CNOT(q + 1, q) if layer % 2 else gates.CNOT(q, q + 1)) + for q in range(1, nqubits - 1, 2): + circuit.add(gates.CNOT(q + 1, q) if layer % 2 == 0 else gates.CNOT(q, q + 1)) + elif kind == "rxx_rzz": + for q in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.9, 0.9))) + circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.9, 0.9))) + elif kind == "scramble": + for q in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.8, 0.8))) + circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.8, 0.8))) + if layer % 5 == 4: + circuit.add(gates.SWAP(q, q + 1)) + else: + raise ValueError(f"Unknown circuit kind {kind!r}.") + + return circuit + + +def ring_xz(nqubits): + form = 0 + for q in range(nqubits): + form += 0.5 * X(q) * Z((q + 1) % nqubits) + return hamiltonians.SymbolicHamiltonian(form=form) + + +def open_zz(nqubits): + form = 0 + for q in range(nqubits - 1): + form += (1.0 / (nqubits - 1)) * Z(q) * Z(q + 1) + return hamiltonians.SymbolicHamiltonian(form=form) + + +def range2_xx(nqubits): + form = 0 + for q in range(nqubits - 2): + form += (1.0 / (nqubits - 2)) * X(q) * X(q + 2) + return hamiltonians.SymbolicHamiltonian(form=form) + + +def dense_observable(nqubits, qubits, seed, dim): + rng = np.random.default_rng(seed) + raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim)) + matrix = (raw + raw.conj().T) / 2.0 + matrix = matrix / np.linalg.norm(matrix) + return {"matrix": matrix, "qubits": list(qubits)} + + +def observables_for_case(nqubits, seed): + q1 = nqubits // 4 + q2 = nqubits // 2 + q3 = (3 * nqubits) // 4 + last = nqubits - 1 + + return [ + ("boundary_ZZ_q1", hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))), + ("boundary_ZZ_q2", hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))), + ("boundary_ZZ_q3", hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))), + ( + "long_Z_5_sites", + hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last)), + ), + ( + "mixed_XZYZX", + hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last)), + ), + ("ring_xz", ring_xz(nqubits)), + ("open_zz", open_zz(nqubits)), + ("range2_xx", range2_xx(nqubits)), + ("complex_iZ0", hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))), + ("dense2_mid", dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)), + ("dense3_spread", dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)), + ] + + +def run_case(args): + set_torch_threads(args.torch_threads) + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() + + circuit = build_circuit(args.kind, args.nqubits, args.nlayers, args.seed) + observables = observables_for_case(args.nqubits, args.seed) + if args.obs_filter: + wanted = set(args.obs_filter.split(",")) + observables = [(name, obs) for name, obs in observables if name in wanted] + if not observables: + raise ValueError(f"OBS_FILTER matched no observables: {args.obs_filter!r}") + + if rank == 0: + print("=" * 88, flush=True) + print( + "case " + f"label={args.label} kind={args.kind} ranks={size} " + f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} " + f"bond={args.bond} cut_ratio={args.cut_ratio:g} " + f"torch_threads={args.torch_threads} seed={args.seed} " + f"obs_filter={args.obs_filter or 'all'}", + flush=True, + ) + print( + "observable value seconds trunc_sum trunc_max status", + flush=True, + ) + + for obs_name, observable in observables: + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=args.bond, + cut_ratio=args.cut_ratio, + tensor_module="torch", + mpi_approach="CT", + mpi_num_procs=size, + fallback=False, + ) + + comm.Barrier() + start = time.perf_counter() + try: + value = backend.expectation( + circuit, + observable, + preprocess=True, + compile_circuit=False, + ) + status = "ok" + except Exception as exc: # pragma: no cover - printed for manual runs + value = np.nan + status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0] + seconds = time.perf_counter() - start + + if rank == 0: + print( + f"{obs_name} {value!r} {seconds:.3f} " + f"{backend.last_truncation_error:.6e} " + f"{backend.last_max_truncation_error:.6e} {status}", + flush=True, + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--label", required=True) + parser.add_argument("--kind", choices=("reversed_cnot", "rxx_rzz", "scramble"), required=True) + parser.add_argument("--nqubits", type=int, required=True) + parser.add_argument("--nlayers", type=int, required=True) + parser.add_argument("--bond", type=int, required=True) + parser.add_argument("--cut-ratio", type=float, required=True) + parser.add_argument("--seed", type=int, required=True) + parser.add_argument("--torch-threads", type=int, required=True) + parser.add_argument("--obs-filter", default="") + run_case(parser.parse_args()) + + +if __name__ == "__main__": + main() +PY + +if [[ -n "${MPIEXEC_FULL:-}" ]]; then + read -r -a mpi_prefix <<< "$MPIEXEC_FULL" +else + mpi_prefix=("$MPIEXEC") + if [[ -n "$HOSTFILE" && -f "$HOSTFILE" ]]; then + mpi_prefix+=("-hostfile" "$HOSTFILE") + fi + mpi_prefix+=("-n" "$RANKS") +fi + +run_case() { + local label="$1" + local kind="$2" + local nq="$3" + local layers="$4" + local bond="$5" + local seed="$6" + + echo + echo "Running $label: kind=$kind nqubits=$nq layers=$layers bond=$bond seed=$seed" + echo "MPI: ${mpi_prefix[*]}" + "${mpi_prefix[@]}" "$PYTHON_BIN" -u "$ROOT_DIR/tools/vidal_mpi_contest_runner.py" \ + --label "$label" \ + --kind "$kind" \ + --nqubits "$nq" \ + --nlayers "$layers" \ + --bond "$bond" \ + --cut-ratio "$CUT_RATIO" \ + --seed "$seed" \ + --torch-threads "$TORCH_THREADS" \ + --obs-filter "$(tr ' ' ',' <<< "$OBS_FILTER")" +} + +case "${1:-help}" in + main1) + run_case \ + "main1-reversed-cnot" \ + "reversed_cnot" \ + "${MAIN1_NQ:-128}" \ + "${MAIN1_LAYERS:-50}" \ + "${MAIN1_BOND:-1024}" \ + "${MAIN1_SEED:-31001}" + ;; + main2) + run_case \ + "main2-rxx-rzz" \ + "rxx_rzz" \ + "${MAIN2_NQ:-128}" \ + "${MAIN2_LAYERS:-64}" \ + "${MAIN2_BOND:-2048}" \ + "${MAIN2_SEED:-31002}" + ;; + strong) + run_case \ + "strong-scramble" \ + "scramble" \ + "${STRONG_NQ:-256}" \ + "${STRONG_LAYERS:-64}" \ + "${STRONG_BOND:-2048}" \ + "${STRONG_SEED:-41001}" + ;; + all) + "$0" main1 + "$0" main2 + "$0" strong + ;; + smoke) + MAIN1_NQ="${MAIN1_NQ:-32}" \ + MAIN1_LAYERS="${MAIN1_LAYERS:-6}" \ + MAIN1_BOND="${MAIN1_BOND:-128}" \ + "$0" main1 + ;; + help|*) + cat >&2 <<'EOF' +Usage: tools/run_vidal_mpi_contest_cases.sh [main1|main2|strong|all|smoke] + +Cases: + main1 128 qubits, 50 layers, reversed-CNOT brickwall, chi=1024 + main2 128 qubits, 64 layers, RXX/RZZ brickwall, chi=2048 + strong 256 qubits, 64 layers, RXX/RZZ + periodic SWAP scramble, chi=2048 + smoke Small syntax/runtime check of main1 + +Common overrides: + PYTHON_BIN=.venv/bin/python + MPIEXEC=mpiexec + MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2" + HOSTFILE=hostfile + RANKS=8 + TORCH_THREADS=8 + CUT_RATIO=1e-12 + OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0" + +Per-case overrides: + MAIN1_NQ=128 MAIN1_LAYERS=50 MAIN1_BOND=1024 MAIN1_SEED=31001 + MAIN2_NQ=128 MAIN2_LAYERS=64 MAIN2_BOND=2048 MAIN2_SEED=31002 + STRONG_NQ=256 STRONG_LAYERS=64 STRONG_BOND=2048 STRONG_SEED=41001 +EOF + exit 2 + ;; +esac diff --git a/tools/slice_existing_tree.py b/tools/slice_existing_tree.py new file mode 100644 index 0000000..4e94e9c --- /dev/null +++ b/tools/slice_existing_tree.py @@ -0,0 +1,59 @@ +"""Slice an existing saved cotengra tree without re-running path search.""" + +from __future__ import annotations + +import argparse +import pickle +from pathlib import Path + +from qibotn.parallel import contraction_tree_costs + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("input", help="Input pickle saved by --tn-save-tree.") + parser.add_argument("output", help="Output pickle path.") + parser.add_argument("--term", type=int, default=0) + parser.add_argument("--target-slices", type=int, default=2) + parser.add_argument("--max-repeats", type=int, default=64) + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + input_path = Path(args.input) + output_path = Path(args.output) + with input_path.open("rb") as f: + payload = pickle.load(f) + + trees = payload["trees"] if isinstance(payload, dict) else payload + if not isinstance(trees, (list, tuple)): + trees = [trees] + tree = trees[args.term] + + print("original", contraction_tree_costs(tree), flush=True) + sliced = tree.slice( + target_slices=args.target_slices, + max_repeats=args.max_repeats, + seed=args.seed, + ) + print("sliced", contraction_tree_costs(sliced), flush=True) + print(f"sliced_inds={sliced.sliced_inds}", flush=True) + + new_trees = list(trees) + new_trees[args.term] = sliced + + if isinstance(payload, dict): + out_payload = dict(payload) + out_payload["trees"] = new_trees + out_payload["costs"] = [contraction_tree_costs(t) for t in new_trees] + out_payload["nterms"] = len(new_trees) + else: + out_payload = new_trees + + output_path.parent.mkdir(parents=True, exist_ok=True) + with output_path.open("wb") as f: + pickle.dump(out_payload, f) + print(f"saved {output_path}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/tools/tn_contest_runner.py b/tools/tn_contest_runner.py new file mode 100644 index 0000000..680cecd --- /dev/null +++ b/tools/tn_contest_runner.py @@ -0,0 +1,435 @@ +#!/usr/bin/env python +"""Contest-style CPU TN path search and contraction runner. + +This file is intentionally self-contained: define contest circuits and +observables here, run path search once, then load the saved trees for repeated +MPI contractions. +""" + +from __future__ import annotations + +import argparse +import math +import os +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from urllib.parse import urlparse + +import numpy as np +from qibo import Circuit, gates, hamiltonians +from qibo.symbols import X, Y, Z + +ROOT = Path(__file__).resolve().parents[1] +SRC = ROOT / "src" +if str(SRC) not in sys.path: + sys.path.insert(0, str(SRC)) + +from qibotn.expectation_runner import ( # noqa: E402 + ExpectationConfig, + exact_for_observable, + run_cpu_expectation, +) + + +@dataclass(frozen=True) +class CaseSpec: + circuit_kind: str + observables: tuple[str, ...] + nqubits: int + nlayers: int + seed: int + target_slices: int | None = None + + +CASES = { + "main1": CaseSpec( + circuit_kind="rxx_rzz_chain", + observables=("ring_xz",), + nqubits=34, + nlayers=20, + seed=31001, + target_slices=None, + ), + "main2": CaseSpec( + circuit_kind="scramble_chain", + observables=("open_zz", "range2_xx"), + nqubits=36, + nlayers=18, + seed=31002, + target_slices=None, + ), + "strong": CaseSpec( + circuit_kind="reversed_cnot", + observables=("ring_xz", "long_z_string"), + nqubits=40, + nlayers=24, + seed=41001, + target_slices=None, + ), +} + + +def optional_int(text): + if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: + return None + return int(text) + + +def optional_float(text): + if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: + return None + return float(text) + + +def set_torch_threads(nthreads): + try: + import torch + + torch.set_num_threads(nthreads) + except Exception: + pass + + +def add_single_qubit_layer(circuit, nqubits, rng, include_rx=False): + for qubit in range(nqubits): + circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi))) + circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi))) + if include_rx: + circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi))) + + +def build_circuit(kind, nqubits, nlayers, seed): + """Define contest circuits here.""" + rng = np.random.default_rng(seed) + circuit = Circuit(nqubits) + + for layer in range(nlayers): + if kind == "rxx_rzz_chain": + add_single_qubit_layer(circuit, nqubits, rng, include_rx=True) + for qubit in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9))) + circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9))) + + elif kind == "scramble_chain": + add_single_qubit_layer(circuit, nqubits, rng, include_rx=True) + for qubit in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8))) + circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8))) + if layer % 5 == 4: + circuit.add(gates.SWAP(qubit, qubit + 1)) + + elif kind == "reversed_cnot": + add_single_qubit_layer(circuit, nqubits, rng) + for qubit in range(0, nqubits - 1, 2): + gate = gates.CNOT(qubit + 1, qubit) if layer % 2 else gates.CNOT(qubit, qubit + 1) + circuit.add(gate) + for qubit in range(1, nqubits - 1, 2): + gate = gates.CNOT(qubit + 1, qubit) if layer % 2 == 0 else gates.CNOT(qubit, qubit + 1) + circuit.add(gate) + + else: + raise ValueError(f"Unknown circuit kind {kind!r}.") + + return circuit + + +def pauli_sum_observable(kind, nqubits, seed): + """Define contest observables here. + + TN path currently expects Pauli products / SymbolicHamiltonian terms. + Keep production contest observables Hermitian unless complex output is + explicitly required by the scoring rule. + """ + del seed + if kind == "ring_xz": + form = 0 + for qubit in range(nqubits): + form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits) + return hamiltonians.SymbolicHamiltonian(form=form) + + if kind == "open_zz": + form = 0 + for qubit in range(nqubits - 1): + form += (1.0 / max(1, nqubits - 1)) * Z(qubit) * Z(qubit + 1) + return hamiltonians.SymbolicHamiltonian(form=form) + + if kind == "range2_xx": + form = 0 + for qubit in range(nqubits - 2): + form += (1.0 / max(1, nqubits - 2)) * X(qubit) * X(qubit + 2) + return hamiltonians.SymbolicHamiltonian(form=form) + + if kind == "long_z_string": + stride = max(1, nqubits // 16) + form = None + for qubit in range(0, nqubits, stride): + form = Z(qubit) if form is None else form * Z(qubit) + return hamiltonians.SymbolicHamiltonian(form=form) + + if kind == "mixed_local": + q1 = nqubits // 4 + q2 = nqubits // 2 + q3 = (3 * nqubits) // 4 + form = 0.25 * X(0) - 0.5 * Z(nqubits - 1) + form += 0.125 * X(q1) * Z(q2) * Y(q3) + return hamiltonians.SymbolicHamiltonian(form=form) + + raise ValueError(f"Unknown observable kind {kind!r}.") + + +def tree_path(tree_dir, case_name, obs_name, nqubits, nlayers, target_slices): + slice_label = "auto" if target_slices is None else f"s{target_slices}" + return ( + Path(tree_dir) + / f"{case_name}_{obs_name}_{nqubits}q{nlayers}l_{slice_label}.pkl" + ) + + +def build_parallel_opts(args, tree_file=None, search_only=False): + slicing_opts = {} + if args.tn_target_slices is not None: + slicing_opts["target_slices"] = args.tn_target_slices + if args.tn_target_size is not None: + slicing_opts["target_size"] = args.tn_target_size + + opts = { + "slicing_opts": slicing_opts or None, + "search_workers": args.tn_search_workers or args.torch_threads, + "max_repeats": args.tn_search_repeats, + "max_time": args.tn_search_time, + "print_stats": not args.no_tn_stats, + } + if args.tn_search_backend is not None: + opts["search_backend"] = args.tn_search_backend + if args.dask_address is not None: + opts["dask_address"] = args.dask_address + if args.dask_close_workers: + opts["dask_close_workers"] = True + if args.tn_debug_trials: + opts["debug_trials"] = True + if search_only: + opts["search_only"] = True + opts["save_tree_path"] = str(tree_file) + elif tree_file is not None: + opts["load_tree_path"] = str(tree_file) + return opts + + +def run_one(args, case_name, obs_name, mode): + case = CASES[case_name] + circuit = build_circuit(case.circuit_kind, args.nqubits, args.nlayers, args.seed) + observable = pauli_sum_observable(obs_name, args.nqubits, args.seed) + path = tree_path( + args.tree_dir, + case_name, + obs_name, + args.nqubits, + args.nlayers, + args.tn_target_slices, + ) + path.parent.mkdir(parents=True, exist_ok=True) + + rank = 0 + if args.mpi: + from mpi4py import MPI + + rank = MPI.COMM_WORLD.Get_rank() + + if rank == 0: + print("=" * 88, flush=True) + print( + f"mode={mode} case={case_name} circuit={case.circuit_kind} " + f"observable={obs_name} nqubits={args.nqubits} nlayers={args.nlayers} " + f"seed={args.seed} gates={len(circuit.queue)} tree={path}", + flush=True, + ) + + if mode == "contract" and not path.exists(): + raise FileNotFoundError(f"Missing tree file: {path}. Run search first.") + + exact = None + if args.exact and rank == 0 and mode != "search": + if args.nqubits > args.exact_max_qubits: + raise ValueError( + f"--exact is limited to {args.exact_max_qubits} qubits by default." + ) + exact = exact_for_observable(circuit, observable, args.nqubits) + + config = ExpectationConfig( + ansatz="tn", + mpi=args.mpi, + bond=args.bond, + cut_ratio=args.cut_ratio, + tensor_module="torch", + quimb_backend=args.quimb_backend, + dtype=args.dtype, + torch_threads=args.torch_threads, + parallel_opts=build_parallel_opts( + args, + tree_file=path, + search_only=(mode == "search"), + ), + ) + result = run_cpu_expectation(circuit, observable, config) + if args.mpi and result.rank != 0: + return + + if mode == "search": + print(f"searched observable={obs_name} tree={path}", flush=True) + else: + abs_error = float("nan") if exact is None else abs(result.value - exact) + rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15) + exact_text = "nan" if exact is None else f"{exact:.16e}" + print( + f"result observable={obs_name} exact={exact_text} " + f"value={result.value:.16e} abs_error={abs_error:.6e} " + f"rel_error={rel_error:.6e} seconds={result.seconds:.3f}", + flush=True, + ) + + for stat in result.parallel_stats or (): + cost = stat["path_cost"] + search_stats = stat.get("search_stats", {}) + print( + "tn_term_summary " + f"observable={obs_name} " + f"term={stat.get('term_index', 0)} " + f"search_seconds={stat.get('search_seconds', float('nan')):.3f} " + f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} " + f"completed_trials={search_stats.get('completed_trials', 'na')} " + f"finite_trials={search_stats.get('finite_trials', 'na')} " + f"failed_trials={search_stats.get('failed_trials', 'na')} " + f"requested_trials={search_stats.get('requested_trials', 'na')} " + f"best_score={search_stats.get('best_score', float('nan')):.6g} " + f"slices={cost.get('slices')} " + f"log10_flops={cost.get('log10_flops', float('nan')):.3f} " + f"log10_write={cost.get('log10_write', float('nan')):.3f} " + f"log2_size={cost.get('log2_size', float('nan')):.3f} " + f"peak_memory_gib={cost.get('peak_memory_gib', float('nan')):.3g} " + f"rank_slices={stat.get('rank_slices')}", + flush=True, + ) + + +def selected_observables(args, case): + if args.observables: + return tuple(args.observables) + if args.obs_filter: + return tuple(x.strip() for x in args.obs_filter.split(",") if x.strip()) + return case.observables + + +def apply_case_defaults(args): + case = CASES[args.case] + if args.nqubits is None: + args.nqubits = case.nqubits + if args.nlayers is None: + args.nlayers = case.nlayers + if args.seed is None: + args.seed = case.seed + if args.tn_target_slices is None: + args.tn_target_slices = case.target_slices + args.observables = selected_observables(args, case) + + +def stop_dask_cluster(args): + if args.keep_dask or args.tn_search_backend != "dask" or not args.dask_address: + return + script = ROOT / "tools" / "manage_tn_dask_cluster.sh" + if not script.exists(): + print(f"dask_stop_skipped reason=missing_script path={script}", flush=True) + return + + env = os.environ.copy() + parsed = urlparse(args.dask_address) + if parsed.hostname: + env.setdefault("SCHEDULER_HOST", parsed.hostname) + if parsed.port: + env.setdefault("SCHEDULER_PORT", str(parsed.port)) + + print("dask_stop_after_search start", flush=True) + subprocess.run([str(script), "stop"], cwd=str(ROOT), env=env, check=False) + print("dask_stop_after_search done", flush=True) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("mode", choices=("search", "contract", "all", "validate", "list")) + parser.add_argument("--case", choices=sorted(CASES), default="main1") + parser.add_argument("--observables", nargs="+") + parser.add_argument("--obs-filter", default="") + parser.add_argument("--tree-dir", default="trees/contest_tn") + parser.add_argument("--nqubits", type=int) + parser.add_argument("--nlayers", type=int) + parser.add_argument("--seed", type=int) + parser.add_argument("--mpi", action="store_true") + parser.add_argument("--exact", action="store_true") + parser.add_argument("--exact-max-qubits", type=int, default=24) + parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024) + parser.add_argument("--cut-ratio", type=optional_float, default=1e-12) + parser.add_argument("--torch-threads", type=int, default=8) + parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch") + parser.add_argument("--dtype", choices=("complex128", "complex64"), default="complex64") + parser.add_argument("--tn-target-slices", type=int) + parser.add_argument("--tn-target-size", type=int, default=2**32) + parser.add_argument("--tn-search-workers", type=int) + parser.add_argument("--tn-search-repeats", type=int, default=2048) + parser.add_argument("--tn-search-time", type=float, default=300.0) + parser.add_argument( + "--tn-search-backend", + choices=("processpool", "dask"), + default="dask", + help=( + "Path-search backend. Defaults to dask. Without --dask-address, " + "non-MPI search starts a local dask cluster." + ), + ) + parser.add_argument("--dask-address") + parser.add_argument("--dask-close-workers", action="store_true") + parser.add_argument( + "--keep-dask", + action="store_true", + help=( + "Keep an external dask cluster running after search. By default, " + "tools/manage_tn_dask_cluster.sh stop is called after search when " + "--dask-address is used." + ), + ) + parser.add_argument( + "--tn-debug-trials", + action="store_true", + help="Print dask worker summary and per-trial start/done logs.", + ) + parser.add_argument("--no-tn-stats", action="store_true") + args = parser.parse_args() + + if args.mode == "list": + for name, case in CASES.items(): + print( + f"{name}: circuit={case.circuit_kind} " + f"observables={','.join(case.observables)} " + f"nqubits={case.nqubits} nlayers={case.nlayers} " + f"seed={case.seed} target_slices={case.target_slices}" + ) + return + + apply_case_defaults(args) + set_torch_threads(args.torch_threads) + + modes = ("search", "contract") if args.mode == "all" else (args.mode,) + if args.mode == "validate": + args.exact = True + args.nqubits = min(args.nqubits, args.exact_max_qubits) + modes = ("search", "contract") + + for mode in modes: + for obs_name in args.observables: + run_one(args, args.case, obs_name, mode) + if mode == "search": + stop_dask_cluster(args) + + +if __name__ == "__main__": + main() diff --git a/tools/torch_profile_tn_complex64.py b/tools/torch_profile_tn_complex64.py new file mode 100644 index 0000000..b7392f9 --- /dev/null +++ b/tools/torch_profile_tn_complex64.py @@ -0,0 +1,114 @@ +"""Run the 34q/20L TN complex64 benchmark under torch.profiler briefly.""" + +from __future__ import annotations + +import argparse +import os +import signal +import sys +from pathlib import Path + +from mpi4py import MPI + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--seconds", type=float, default=30.0) + parser.add_argument("--out-dir", default="torch_profiles/tn_complex64") + parser.add_argument("--torch-threads", type=int, default=48) + args = parser.parse_args() + + repo_root = Path(__file__).resolve().parents[1] + os.chdir(repo_root) + sys.path.insert(0, str(repo_root)) + + import torch + from torch.profiler import ProfilerActivity, profile + + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() + out_dir = Path(args.out_dir) + if rank == 0: + out_dir.mkdir(parents=True, exist_ok=True) + comm.Barrier() + + torch.set_num_threads(args.torch_threads) + + def run_benchmark(): + import benchmark_cpu_expectation + + sys.argv = [ + "benchmark_cpu_expectation.py", + "--mpi", + "--ansatz", + "tn", + "--nqubits", + "34", + "--nlayers", + "20", + "--circuits", + "rxx_rzz", + "--pauli-pattern", + "XZ", + "--tn-load-tree", + "trees/rxx_rzz_34q20l_s4.pkl", + "--quimb-backend", + "torch", + "--torch-threads", + str(args.torch_threads), + "--dtype", + "complex64", + ] + benchmark_cpu_expectation.main() + + trace_path = out_dir / f"rank{rank}_trace.json" + stacks_path = out_dir / f"rank{rank}_stacks.txt" + summary_path = out_dir / f"rank{rank}_summary.txt" + + prof = profile( + activities=[ProfilerActivity.CPU], + record_shapes=True, + profile_memory=True, + with_stack=True, + ) + + class ProfileTimeout(Exception): + pass + + def alarm_handler(signum, frame): + raise ProfileTimeout() + + old_handler = signal.signal(signal.SIGALRM, alarm_handler) + signal.setitimer(signal.ITIMER_REAL, args.seconds) + try: + with prof: + try: + run_benchmark() + except ProfileTimeout: + pass + finally: + signal.setitimer(signal.ITIMER_REAL, 0) + signal.signal(signal.SIGALRM, old_handler) + + prof.export_chrome_trace(str(trace_path)) + try: + prof.export_stacks(str(stacks_path), "self_cpu_time_total") + except Exception as exc: # pragma: no cover - diagnostic only + stacks_path.write_text(f"export_stacks failed: {exc}\n", encoding="utf-8") + + summary = prof.key_averages(group_by_stack_n=5).table( + sort_by="self_cpu_time_total", + row_limit=40, + ) + summary_path.write_text(summary, encoding="utf-8") + + print( + f"torch_profile_done rank={rank}/{size} " + f"trace={trace_path} summary={summary_path}", + flush=True, + ) + + +if __name__ == "__main__": + main() diff --git a/tools/vidal_mpi_contest_runner.py b/tools/vidal_mpi_contest_runner.py new file mode 100644 index 0000000..405f47c --- /dev/null +++ b/tools/vidal_mpi_contest_runner.py @@ -0,0 +1,209 @@ +from __future__ import annotations + +import argparse +import math +import time + +import numpy as np +from mpi4py import MPI +from qibo import Circuit, gates, hamiltonians +from qibo.symbols import X, Y, Z + +from qibotn.backends.vidal import VidalBackend + + +def optional_int(text): + if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: + return None + return int(text) + + +def optional_float(text): + if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: + return None + return float(text) + + +def format_optional(value, fmt="g"): + return "None" if value is None else format(value, fmt) + + +def set_torch_threads(nthreads): + try: + import torch + + torch.set_num_threads(nthreads) + except Exception: + pass + + +def build_circuit(kind, nqubits, nlayers, seed): + rng = np.random.default_rng(seed) + circuit = Circuit(nqubits) + + for layer in range(nlayers): + for q in range(nqubits): + circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi))) + circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi))) + if kind in ("rxx_rzz", "scramble"): + circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi))) + + if kind == "reversed_cnot": + for q in range(0, nqubits - 1, 2): + circuit.add(gates.CNOT(q + 1, q) if layer % 2 else gates.CNOT(q, q + 1)) + for q in range(1, nqubits - 1, 2): + circuit.add(gates.CNOT(q + 1, q) if layer % 2 == 0 else gates.CNOT(q, q + 1)) + elif kind == "rxx_rzz": + for q in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.9, 0.9))) + circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.9, 0.9))) + elif kind == "scramble": + for q in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.8, 0.8))) + circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.8, 0.8))) + if layer % 5 == 4: + circuit.add(gates.SWAP(q, q + 1)) + else: + raise ValueError(f"Unknown circuit kind {kind!r}.") + + return circuit + + +def ring_xz(nqubits): + form = 0 + for q in range(nqubits): + form += 0.5 * X(q) * Z((q + 1) % nqubits) + return hamiltonians.SymbolicHamiltonian(form=form) + + +def open_zz(nqubits): + form = 0 + for q in range(nqubits - 1): + form += (1.0 / (nqubits - 1)) * Z(q) * Z(q + 1) + return hamiltonians.SymbolicHamiltonian(form=form) + + +def range2_xx(nqubits): + form = 0 + for q in range(nqubits - 2): + form += (1.0 / (nqubits - 2)) * X(q) * X(q + 2) + return hamiltonians.SymbolicHamiltonian(form=form) + + +def dense_observable(nqubits, qubits, seed, dim): + rng = np.random.default_rng(seed) + raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim)) + matrix = (raw + raw.conj().T) / 2.0 + matrix = matrix / np.linalg.norm(matrix) + return {"matrix": matrix, "qubits": list(qubits)} + + +def observables_for_case(nqubits, seed): + q1 = nqubits // 4 + q2 = nqubits // 2 + q3 = (3 * nqubits) // 4 + last = nqubits - 1 + + return [ + ("boundary_ZZ_q1", hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))), + ("boundary_ZZ_q2", hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))), + ("boundary_ZZ_q3", hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))), + ( + "long_Z_5_sites", + hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last)), + ), + ( + "mixed_XZYZX", + hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last)), + ), + ("ring_xz", ring_xz(nqubits)), + ("open_zz", open_zz(nqubits)), + ("range2_xx", range2_xx(nqubits)), + ("complex_iZ0", hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))), + ("dense2_mid", dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)), + ("dense3_spread", dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)), + ] + + +def run_case(args): + set_torch_threads(args.torch_threads) + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() + + circuit = build_circuit(args.kind, args.nqubits, args.nlayers, args.seed) + observables = observables_for_case(args.nqubits, args.seed) + if args.obs_filter: + wanted = set(args.obs_filter.split(",")) + observables = [(name, obs) for name, obs in observables if name in wanted] + if not observables: + raise ValueError(f"OBS_FILTER matched no observables: {args.obs_filter!r}") + + if rank == 0: + print("=" * 88, flush=True) + print( + "case " + f"label={args.label} kind={args.kind} ranks={size} " + f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} " + f"bond={format_optional(args.bond)} " + f"cut_ratio={format_optional(args.cut_ratio)} " + f"torch_threads={args.torch_threads} seed={args.seed} " + f"obs_filter={args.obs_filter or 'all'}", + flush=True, + ) + print( + "observable value seconds trunc_sum trunc_max status", + flush=True, + ) + + for obs_name, observable in observables: + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=args.bond, + cut_ratio=args.cut_ratio, + tensor_module="torch", + mpi_approach="CT", + mpi_num_procs=size, + fallback=False, + ) + + comm.Barrier() + start = time.perf_counter() + try: + value = backend.expectation( + circuit, + observable, + preprocess=True, + compile_circuit=False, + ) + status = "ok" + except Exception as exc: # pragma: no cover - printed for manual runs + value = np.nan + status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0] + seconds = time.perf_counter() - start + + if rank == 0: + print( + f"{obs_name} {value!r} {seconds:.3f} " + f"{backend.last_truncation_error:.6e} " + f"{backend.last_max_truncation_error:.6e} {status}", + flush=True, + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--label", required=True) + parser.add_argument("--kind", choices=("reversed_cnot", "rxx_rzz", "scramble"), required=True) + parser.add_argument("--nqubits", type=int, required=True) + parser.add_argument("--nlayers", type=int, required=True) + parser.add_argument("--bond", type=optional_int, required=True) + parser.add_argument("--cut-ratio", type=optional_float, required=True) + parser.add_argument("--seed", type=int, required=True) + parser.add_argument("--torch-threads", type=int, required=True) + parser.add_argument("--obs-filter", default="") + run_case(parser.parse_args()) + + +if __name__ == "__main__": + main() diff --git a/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl b/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl new file mode 100644 index 0000000..76eeedd Binary files /dev/null and b/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl differ diff --git a/trees/contest_tn/main1_ring_xz_8q2l_s1.pkl b/trees/contest_tn/main1_ring_xz_8q2l_s1.pkl new file mode 100644 index 0000000..f2180f4 Binary files /dev/null and b/trees/contest_tn/main1_ring_xz_8q2l_s1.pkl differ diff --git a/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_auto.pkl b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_auto.pkl new file mode 100644 index 0000000..9c221b7 Binary files /dev/null and b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_auto.pkl differ diff --git a/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_repeat192.pkl b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_repeat192.pkl new file mode 100644 index 0000000..34f7300 Binary files /dev/null and b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_repeat192.pkl differ diff --git a/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_timeout_stop.pkl b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_timeout_stop.pkl new file mode 100644 index 0000000..dd1b7a6 Binary files /dev/null and b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_timeout_stop.pkl differ diff --git a/trees/rxx_rzz_30q20l.pkl b/trees/rxx_rzz_30q20l.pkl new file mode 100644 index 0000000..ea1d3e3 Binary files /dev/null and b/trees/rxx_rzz_30q20l.pkl differ diff --git a/trees/rxx_rzz_30q20l_from_existing_s2_check.pkl b/trees/rxx_rzz_30q20l_from_existing_s2_check.pkl new file mode 100644 index 0000000..cee2158 Binary files /dev/null and b/trees/rxx_rzz_30q20l_from_existing_s2_check.pkl differ diff --git a/trees/rxx_rzz_30q20l_from_existing_s4.pkl b/trees/rxx_rzz_30q20l_from_existing_s4.pkl new file mode 100644 index 0000000..cee2158 Binary files /dev/null and b/trees/rxx_rzz_30q20l_from_existing_s4.pkl differ diff --git a/trees/rxx_rzz_34q20l_s4.pkl b/trees/rxx_rzz_34q20l_s4.pkl new file mode 100644 index 0000000..f09fef0 Binary files /dev/null and b/trees/rxx_rzz_34q20l_s4.pkl differ