完善mps的vidal机制,多节点并行;补充tn搜索时dask集群搜索的方式
Some checks failed
Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled
Tests / check (push) Has been cancelled
Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled

This commit is contained in:
2026-05-12 15:44:19 +08:00
parent aa122964b4
commit 72f95599bb
32 changed files with 3529 additions and 320 deletions

18
tools/README.md Normal file
View File

@@ -0,0 +1,18 @@
# Tools
Auxiliary scripts for profiling, legacy comparisons, and scale probes.
The main CPU expectation entrypoint is `../benchmark_cpu_expectation.py`.
For the current Vidal/MPS 1D-chain tests, prefer `../run_vidal_mps_cases.sh`.
Files here are intentionally secondary:
- `compare_vidal_backend_qmatchatea.py`: diagnostic comparison against QMatchaTea.
- `profile_vidal_chrome.py`: PyTorch CPU profiler for the Vidal path.
- `run_cpu_single_cases.sh`: single-node scale probes.
- `run_cpu_large_cases.sh`: two-node MPI scale probes.
- `run_vidal_segment_mpi_scan.sh`: rank/thread scaling scan for Vidal segmented MPI.
- `baseline_mps_expectation.py`: legacy MPS comparison CLI kept for old commands.
- `benchmark_tn_mpi.py`, `benchmark_search.py`, `benchmark_slice.py`, `benchmark_contract_sliced.py`, `check_tree.py`: old TN path-search/slicing experiments.
- `qibojit_reference_expectation.py`: state-vector reference helper.
- `validate_vidal_mpi_correctness.py`: focused Vidal MPI correctness helper.

View File

@@ -0,0 +1,183 @@
"""MPS expectation benchmark for qmatchatea and Vidal backends."""
import argparse
import json
import logging
import os
import socket
import time
import numpy as np
from qibotn.benchmark_cases import (
build_circuit as build_benchmark_circuit,
exact_pauli_sum,
observable_terms,
terms_to_dict,
)
from qibotn.backends.qmatchatea import QMatchaTeaBackend
from qibotn.backends.vidal_tebd import run_vidal_ring_xz
def build_circuit(nqubits, nlayers, seed):
    """Build the ``brickwall_cnot`` benchmark circuit used by this script."""
    circuit_kind = "brickwall_cnot"
    return build_benchmark_circuit(circuit_kind, nqubits, nlayers, seed)
def build_observable(nqubits):
    """Return the ``ring_xz`` observable terms converted with ``terms_to_dict``."""
    ring_terms = observable_terms("ring_xz", nqubits)
    return terms_to_dict(ring_terms)
def exact_expectation(circuit, nqubits):
    """Evaluate the ``ring_xz`` terms on ``circuit`` through ``exact_pauli_sum``."""
    ring_terms = observable_terms("ring_xz", nqubits)
    return exact_pauli_sum(circuit, ring_terms, nqubits)
def main():
    """Run one MPS expectation benchmark selected on the command line.

    Builds the brickwall-CNOT circuit and the ring-XZ observable, evaluates
    the expectation with the chosen executor (``qmatchatea``, ``vidal`` or
    ``vidal-mpi``) and, on rank 0, prints the value, the error against an
    optional reference and the elapsed wall time.

    Raises:
        ValueError: if ``--exact`` is requested (without a reference file)
            for more than ``--exact-max-qubits`` qubits, or if
            ``--executor vidal-mpi`` is used without ``--mpi-ct``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--nqubits", type=int, default=40)
    cli.add_argument("--nlayers", type=int, default=30)
    cli.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
    cli.add_argument("--seed", type=int, default=42)
    cli.add_argument("--tensor-module", choices=("numpy", "torch"), default="torch")
    cli.add_argument("--torch-threads", type=int, default=32)
    cli.add_argument(
        "--executor",
        choices=("qmatchatea", "vidal", "vidal-mpi"),
        default="qmatchatea",
    )
    cli.add_argument("--mpi-ct", action="store_true")
    cli.add_argument("--mpi-barriers", type=int, default=-1)
    cli.add_argument("--mpi-isometrization", type=int, default=-1)
    cli.add_argument("--exact", action="store_true")
    cli.add_argument("--exact-max-qubits", type=int, default=24)
    cli.add_argument("--reference-file")
    cli.add_argument(
        "--mpi-rank-map",
        action="store_true",
        help="Print MPI rank, host, pid, and torch thread placement metadata.",
    )
    args = cli.parse_args()

    # Silence the chatty third-party loggers.
    for logger_name in ("qibo.config", "qtealeaves"):
        logging.getLogger(logger_name).setLevel(logging.ERROR)

    import torch

    torch.set_num_threads(args.torch_threads)

    rank, size = 0, 1
    if args.mpi_ct:
        # mpi4py is only imported when an MPI run was requested.
        from mpi4py import MPI

        world = MPI.COMM_WORLD
        rank = world.Get_rank()
        size = world.Get_size()
        if args.mpi_rank_map:
            placement = {
                "rank": rank,
                "size": size,
                "host": socket.gethostname(),
                "pid": os.getpid(),
                "torch_threads": args.torch_threads,
                "omp_num_threads": os.environ.get("OMP_NUM_THREADS", ""),
                "mkl_num_threads": os.environ.get("MKL_NUM_THREADS", ""),
            }
            gathered = world.gather(placement, root=0)
            if rank == 0:
                print("mpi_rank_map")
                row_format = (
                    "rank={rank} size={size} host={host} pid={pid} "
                    "torch_threads={torch_threads} "
                    "OMP_NUM_THREADS={omp_num_threads} "
                    "MKL_NUM_THREADS={mkl_num_threads}"
                )
                for row in sorted(gathered, key=lambda entry: entry["rank"]):
                    print(row_format.format(**row))

    circuit = build_circuit(args.nqubits, args.nlayers, args.seed)
    observable = build_observable(args.nqubits)

    # Reference value: a cached file wins over the in-process exact evaluation.
    exact = None
    if args.reference_file:
        with open(args.reference_file, "r", encoding="utf-8") as f:
            exact = float(json.load(f)["expectation"])
    elif args.exact:
        if args.nqubits > args.exact_max_qubits:
            raise ValueError(
                f"--exact is limited to {args.exact_max_qubits} qubits by default."
            )
        exact = exact_expectation(circuit, args.nqubits)

    uses_vidal = args.executor in ("vidal", "vidal-mpi")

    if rank == 0:
        if not args.mpi_ct:
            mpi_label = "SR"
        elif uses_vidal:
            mpi_label = f"VidalSegment/{size}"
        else:
            mpi_label = f"MPIMPS/{size}"
        print(
            f"nqubits={args.nqubits} nlayers={args.nlayers} "
            f"bond={args.bond} seed={args.seed} "
            f"tensor_module={args.tensor_module} svd_control=E! "
            f"compile_circuit=True mpi={mpi_label} executor={args.executor}"
        )
        if exact is not None:
            print(f"exact={exact:.16e}")
        print("expval abs_error rel_error seconds")

    start = time.perf_counter()
    timings = None
    if uses_vidal:
        if args.executor == "vidal-mpi" and not args.mpi_ct:
            raise ValueError("--executor vidal-mpi requires --mpi-ct.")
        vidal_options = {
            "max_bond": args.bond,
            "cut_ratio": 1e-12,
            "tensor_module": args.tensor_module,
        }
        if args.mpi_ct:
            from qibotn.backends.vidal_mpi_segment import run_segment_vidal_mpi_ring_xz

            value, timings = run_segment_vidal_mpi_ring_xz(
                circuit, comm=MPI.COMM_WORLD, **vidal_options
            )
        else:
            value = run_vidal_ring_xz(circuit, **vidal_options)
    else:
        backend = QMatchaTeaBackend()
        backend.configure_tn_simulation(
            ansatz="MPS",
            max_bond_dimension=args.bond,
            cut_ratio=1e-12,
            svd_control="E!",
            tensor_module=args.tensor_module,
            compile_circuit=True,
            track_memory=False,
            mpi_approach="CT" if args.mpi_ct else "SR",
            mpi_num_procs=size,
            mpi_where_barriers=args.mpi_barriers if args.mpi_ct else -1,
            mpi_isometrization=args.mpi_isometrization,
        )
        value = backend.expectation(
            circuit,
            observable,
            preprocess=False,
            compile_circuit=True,
        )

    # Per-section timings are reduced to their maximum over ranks on rank 0.
    max_timings = None
    if timings:
        max_timings = {}
        for section, local_seconds in timings.items():
            max_timings[section] = MPI.COMM_WORLD.reduce(
                local_seconds, op=MPI.MAX, root=0
            )

    # Only rank 0 reports.
    if rank != 0:
        return

    value = float(np.real(value))
    elapsed = time.perf_counter() - start
    if exact is None:
        abs_error = float("nan")
        rel_error = float("nan")
    else:
        abs_error = abs(value - exact)
        rel_error = abs_error / max(abs(exact), 1e-15)
    print(f"{value:.16e} {abs_error:.6e} {rel_error:.6e} {elapsed:.3f}")

    if max_timings:
        print("timing_section max_seconds")
        for section, seconds in max_timings.items():
            print(f"{section} {seconds:.6f}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,56 @@
"""MPI parallel sliced contraction using pre-sliced tree."""
import time, pickle, os
import numpy as np
from mpi4py import MPI
# Fixed problem size and per-rank thread count for this experiment.
NQUBITS, NLAYERS, NCORES = 25, 10, 48

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

# Export the thread-count variables before torch is imported below.
os.environ.update({
    'OMP_NUM_THREADS': str(NCORES),
    'MKL_NUM_THREADS': str(NCORES),
})

import torch
import qibo, quimb as qu
from qibotn.observables import build_random_circuit

torch.set_num_threads(NCORES)

circuit = build_random_circuit(NQUBITS, NLAYERS)
qibo.set_backend("qibotn", platform="quimb")
backend = qibo.get_backend()
backend.configure_tn_simulation(ansatz="tn")
qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn')

# Rank 0 reads the pre-sliced tree and broadcasts it to every other rank.
# NOTE: pickle.load can execute arbitrary code; only load trusted tree files.
tree = None
if rank == 0:
    with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'rb') as f:
        tree = pickle.load(f)
tree = comm.bcast(tree, root=0)

arrays = [torch.from_numpy(np.asarray(t._data)) for t in tn.tensors]
n_slices = tree.multiplicity

if rank == 0:
    print(f"Slices: {n_slices}, Ranks: {size}, "
          f"Peak: {tree.max_size() * 16 / 1e9:.2f} GB, "
          f"Threads/rank: {NCORES}, Backend: torch")

t0 = time.time()

# Round-robin slice assignment: this rank handles rank, rank + size, ...
result = None
for i in range(rank, n_slices, size):
    val = tree.contract_slice(arrays, i, backend='torch')
    val_np = val.cpu().numpy().reshape(-1)
    if result is None:
        result = val_np
    else:
        result = result + val_np

# A rank without any slice contributes a single complex zero.
if result is None:
    result = np.zeros(1, dtype=np.complex128)

total = None
if rank == 0:
    total = np.zeros_like(result)
comm.Reduce(result, total, root=0)

if rank == 0:
    print(f"Contract: {time.time() - t0:.4f}s Expectation: {0.5 * total[0].real:.10f}")

34
tools/benchmark_search.py Normal file
View File

@@ -0,0 +1,34 @@
"""Search contraction path and save."""
import time, os, pickle
from qibotn.parallel import parallel_path_search
from qibotn.observables import build_random_circuit
import qibo, quimb as qu
from mpi4py import MPI
# Fixed problem size and number of search workers per rank.
NQUBITS, NLAYERS, WORKERS = 20, 10, 96

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

# A single-rank launch uses the local process pool instead of MPI.
method = 'processpool' if size <= 1 else 'mpi'

circuit = build_random_circuit(NQUBITS, NLAYERS)
qibo.set_backend("qibotn", platform="quimb")
backend = qibo.get_backend()
backend.configure_tn_simulation(ansatz="tn")
qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn')

if rank == 0:
    print(f"Searching {NQUBITS}q {NLAYERS}l, method={method}, ranks={size}, workers/rank={WORKERS}...")

t0 = time.time()
tree = parallel_path_search(
    tn,
    tn.outer_inds(),
    method=method,
    total_repeats=1024,
    max_time=300,
    n_workers=WORKERS,
    trial_timeout=60,
)
t_search = time.time() - t0

# Only rank 0 writes the resulting tree to disk.
if rank == 0:
    os.makedirs('data', exist_ok=True)
    path = f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl"
    with open(path, 'wb') as f:
        pickle.dump(tree, f)
    print(f"Search: {t_search:.2f}s Peak: {tree.max_size() * 16 / 1e9:.2f} GB Saved: {path}")

16
tools/benchmark_slice.py Normal file
View File

@@ -0,0 +1,16 @@
"""Slice saved tree and save."""
import pickle
# Problem size encoded in the input and output file names.
NQUBITS, NLAYERS = 25, 10

# NOTE: pickle.load can execute arbitrary code; only load trusted tree files.
with open(f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl", 'rb') as f:
    tree = pickle.load(f)

print(f"Original peak: {tree.max_size() * 16 / 1e9:.2f} GB")

# Slice the tree towards a target intermediate size of 2**28 elements.
tree_sliced = tree.slice_and_reconfigure(target_size=2**28)

with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'wb') as f:
    pickle.dump(tree_sliced, f)

print(f"Sliced peak: {tree_sliced.max_size() * 16 / 1e9:.2f} GB Slices: {tree_sliced.multiplicity}")

378
tools/benchmark_tn_mpi.py Normal file
View File

@@ -0,0 +1,378 @@
"""MPI-parallel TN benchmark: path search + contraction via MPI."""
import json
import pickle
import time
import argparse
import numpy as np
import cotengra as ctg
import qibo
from qibo import Circuit, gates
from mpi4py import MPI
from concurrent.futures import ProcessPoolExecutor, as_completed
from qibotn.observables import check_observable, extract_gates_and_qubits
def _load_observable(observable_file=None, observable_json=None):
if observable_file:
with open(observable_file, "r", encoding="utf8") as f:
return json.load(f)
if observable_json:
return json.loads(observable_json)
return None
def _term_to_quimb_operator(term):
    """Convert one extracted Hamiltonian term to a quimb operator.

    ``term`` is iterated as ``(qubit, gate_name, _)`` triples; the coefficient
    is read from the third entry of the first triple (``1.0`` for an empty
    term). Identity factors (``"I"``) are skipped.

    Returns:
        ``(coeff, op, where)`` where ``op`` is the ``&``-combination of the
        Pauli factors (``None`` if every factor is the identity) and ``where``
        is the tuple of qubits those factors act on.
    """
    import quimb as qu

    coeff = complex(term[0][2]) if term else 1.0

    op = None
    sites = []
    for qubit, gate_name, _ in term:
        site = int(qubit)
        label = str(gate_name).upper()
        if label != "I":
            sites.append(site)
            pauli = qu.pauli(label.lower())
            if op is None:
                op = pauli
            else:
                op = op & pauli
    return complex(coeff), op, tuple(sites)
def _run_serial_search(tn_bytes, output_inds, repeats, seed, num_slices, n_ranks, max_time):
    """Run one serial cotengra hyper-optimisation on a pickled tensor network.

    ``num_slices`` and ``n_ranks`` are accepted but not used by this function.

    Returns:
        ``(cost, tree)`` where ``cost`` is ``tree.combo_cost(factor=256)``.
    """
    # Imports are local to this function, which is also submitted to worker
    # processes by ``parallel_search``.
    import pickle
    import random

    import cotengra as ctg

    random.seed(seed)
    network = pickle.loads(tn_bytes)

    optimizer_options = {
        "methods": ['kahypar', 'kahypar-agglom', 'spinglass'],
        "max_repeats": repeats,
        "parallel": False,
        "minimize": 'combo-256',
        "max_time": max_time,
        "optlib": "random",
        "slicing_opts": {'target_size': 2**29, 'allow_outer': True},
        "progbar": False,
    }
    optimizer = ctg.HyperOptimizer(**optimizer_options)

    tree = network.contraction_tree(optimize=optimizer, output_inds=output_inds)
    return tree.combo_cost(factor=256), tree
def parallel_search(tn, output_inds, total_repeats, n_workers, num_slices, n_ranks,
                    timeout):
    """Search for a contraction tree with several independent worker processes.

    The tensor network is pickled once and handed to ``n_workers`` processes,
    each running ``_run_serial_search`` with its own seed and
    ``total_repeats // n_workers`` repeats (at least one). The tree with the
    lowest returned cost wins.

    Args:
        tn: tensor network to search a contraction tree for (must be picklable).
        output_inds: output indices forwarded to the search.
        total_repeats: total number of optimiser repeats across all workers.
        n_workers: number of worker processes; ``<= 1`` runs the search
            serially in the calling process.
        num_slices, n_ranks: forwarded unchanged to ``_run_serial_search``.
        timeout: per-worker ``max_time``; results are collected for at most
            ``timeout + 5`` seconds.

    Returns:
        The best tree found, or ``None`` if no worker delivered a result in time.
    """
    import os
    import pickle
    import signal
    from concurrent.futures import ProcessPoolExecutor, as_completed
    # Before Python 3.11 this is NOT the builtin ``TimeoutError``, so catching
    # the builtin would let the ``as_completed`` timeout escape.
    from concurrent.futures import TimeoutError as FuturesTimeoutError

    tn_bytes = pickle.dumps(tn)
    if n_workers <= 1:
        return _run_serial_search(
            tn_bytes, output_inds, total_repeats, 0, num_slices, n_ranks, timeout
        )[1]

    repeats_per = max(1, total_repeats // n_workers)
    best_cost, best_tree = float('inf'), None
    pool = ProcessPoolExecutor(max_workers=n_workers)
    futures = [
        pool.submit(_run_serial_search, tn_bytes, output_inds,
                    repeats_per, seed, num_slices, n_ranks, timeout)
        for seed in range(n_workers)
    ]
    try:
        for fut in as_completed(futures, timeout=timeout + 5):
            try:
                cost, tree = fut.result()
                if cost < best_cost:
                    best_cost, best_tree = cost, tree
            except Exception as e:
                # A failed worker only loses its own candidate tree.
                print(f" [worker failed] {e}")
    except FuturesTimeoutError:
        # Keep whatever finished in time; stragglers are killed below.
        pass
    finally:
        for fut in futures:
            fut.cancel()
        # Workers may still be inside a long search, so they are killed
        # instead of joined. ``_processes`` is a private attribute and can be
        # ``None``, hence the guard.
        worker_pids = list((getattr(pool, "_processes", None) or {}).keys())
        for pid in worker_pids:
            try:
                os.kill(pid, signal.SIGKILL)
            except ProcessLookupError:
                pass
        pool.shutdown(wait=False)
    return best_tree
def make_circuit(circuit_type, nqubits, nlayers=1):
    """Build one of the benchmark circuits.

    Supported ``circuit_type`` values are ``"qft"``, ``"variational"``,
    ``"ghz"`` and ``"brickwork"``. Rotation angles are drawn from the global
    NumPy random state, so seed ``np.random`` for reproducible circuits.

    Raises:
        ValueError: for any other ``circuit_type``.
    """
    c = Circuit(nqubits)

    def random_angle():
        return np.random.uniform(0, 2 * np.pi)

    if circuit_type == "qft":
        from qibo.models import QFT

        return QFT(nqubits)

    if circuit_type == "variational":
        for layer in range(nlayers):
            for q in range(nqubits):
                c.add(gates.RY(q, theta=random_angle()))
            # Alternate between even and odd nearest-neighbour pairs.
            for q in range(layer % 2, nqubits - 1, 2):
                c.add(gates.CZ(q, q + 1))
        return c

    if circuit_type == "ghz":
        c.add(gates.H(0))
        for q in range(nqubits - 1):
            c.add(gates.CNOT(q, q + 1))
        return c

    if circuit_type == "brickwork":
        for q in range(nqubits):
            c.add(gates.H(q))
        for layer in range(nlayers):
            for q in range(layer % 2, nqubits - 1, 2):
                c.add(gates.CNOT(q, q + 1))
                c.add(gates.RZ(q, theta=random_angle()))
                c.add(gates.RZ(q + 1, theta=random_angle()))
        return c

    raise ValueError(f"Unknown circuit: {circuit_type}")
def _contract_mpi(tree, arrays, comm, root=0):
rank = comm.Get_rank()
size = comm.Get_size()
is_torch = type(arrays[0]).__module__.startswith("torch")
result_np = None
for i in range(rank, tree.multiplicity, size):
x = tree.contract_slice(arrays, i)
x_np = np.asfortranarray(x.detach().cpu().numpy() if is_torch else np.asarray(x))
result_np = x_np if result_np is None else result_np + x_np
if result_np is None:
result_np = np.zeros(1, dtype=np.complex128)
result = np.zeros_like(result_np) if rank == root else None
comm.Reduce(result_np, result, root=root)
if rank == root:
import torch
return torch.from_numpy(np.asarray(result)) if is_torch else result
return None
def run_mpi(circuit, nqubits, num_slices, total_repeats=1024,
load_path=None, save_path=None):
"""Each MPI rank runs serial path search over total_repeats/size trials,
rank 0 picks the global best, then all ranks contract in parallel."""
# Returns a pair (state vector, seconds): on rank 0 the flattened contraction
# result, on other ranks None; the time is search time plus contraction time.
# `nqubits` is not referenced anywhere in this function body.
# NOTE(review): indentation is not visible in this view, so the nesting of the
# `if save_path:` early return relative to the load/search branches could not
# be confirmed - verify before restructuring this function.
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
qibo.set_backend("qibotn", platform="quimb")
b = qibo.get_backend()
b.configure_tn_simulation(ansatz="tn")
import torch
qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
gate_opts={"max_bond": None, "cutoff": 1e-10})
# Replace the circuit's to_backend hook with a NumPy -> complex128 torch conversion.
qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex128)
# --- path search: each rank serial, gather best to rank 0 ---
if load_path:
# Only rank 0 reads the saved tree/network; search time is reported as 0.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
if rank == 0:
with open(load_path, "rb") as f:
saved = pickle.load(f)
tree, psi, t_search = saved["tree"], saved["psi"], 0.0
print(f" [path loaded] {load_path}")
else:
tree = psi = None
t_search = 0.0
else:
# Split the repeat budget evenly over ranks, with at least one repeat each.
rank_repeats = max(1, total_repeats // size)
t0 = time.time()
# get TN object first (no contraction), then run parallel search
psi_tn = qc.to_dense(rehearse="tn")
# The worker count (48) and timeout (600) are hard-coded here.
local_tree = parallel_search(
psi_tn, psi_tn.outer_inds(), rank_repeats, n_workers=48,
num_slices=num_slices, n_ranks=size, timeout=600,
)
t_search = time.time() - t0
local_psi = psi_tn
# Every rank sends (combo cost, tree, network) to rank 0, which keeps the
# entry with the lowest cost.
all_results = comm.gather((local_tree.combo_cost(factor=256), local_tree, local_psi), root=0)
if rank == 0:
_, tree, psi = min(all_results, key=lambda x: x[0])
print(f" [path search] {t_search:.3f}s "
f"flops~2^{tree.contraction_cost(log=2):.2f} "
f"size~2^{tree.contraction_width():.2f} "
f"slices={tree.multiplicity}")
if save_path:
# The chosen tree and network are pickled together under "tree" and "psi".
with open(save_path, "wb") as f:
pickle.dump({"tree": tree, "psi": psi}, f)
print(f" [path saved] {save_path}")
else:
tree = psi = None
# With save_path set, return (None, t_search) without contracting.
if save_path:
t_search = comm.bcast(t_search, root=0)
return None, t_search
# Share rank 0's tree, network and search time with every rank.
tree = comm.bcast(tree, root=0)
psi = comm.bcast(psi, root=0)
t_search = comm.bcast(t_search, root=0)
# --- contraction: all ranks work in parallel ---
import torch
# 96 is hard-coded - presumably the core count of the target node; confirm.
torch.set_num_threads(max(1, 96 // size))
arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in psi.arrays]
t0 = time.time()
sv = _contract_mpi(tree, arrays, comm, root=0)
t_contract = time.time() - t0
if rank == 0:
print(f" [contraction] {t_contract:.3f}s")
return np.array(sv).reshape(-1), t_search + t_contract
return None, t_search + t_contract
def run_mpi_expval(
    circuit,
    nqubits,
    observable=None,
    total_repeats=1024,
    search_workers=1,
    search_timeout=300,
):
    """Compute a Hamiltonian expectation value directly from the TN via MPI.

    The Hamiltonian terms are distributed round-robin over the MPI ranks;
    each rank contracts its own terms, optionally using a process pool for
    the contraction-path search of every term.

    Returns:
        ``(expectation, seconds)`` on rank 0 and ``(None, seconds)`` on every
        other rank, where ``seconds`` is the local time spent on the terms.

    Raises:
        RuntimeError: if no contraction tree is found for a term.
    """
    import torch

    comm = MPI.COMM_WORLD
    rank, size = comm.Get_rank(), comm.Get_size()

    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="tn")

    observable = check_observable(observable, nqubits)
    ham_gate_map = extract_gates_and_qubits(observable)
    qc = b._qibo_circuit_to_quimb(
        circuit,
        quimb_circuit_type=b.circuit_ansatz,
        gate_opts={"max_bond": None, "cutoff": 1e-10},
    )

    # This rank handles terms rank, rank + size, ...
    my_terms = ham_gate_map[rank::size]
    torch.set_num_threads(max(1, 96 // size))

    t0 = time.time()
    my_exp = 0.0 + 0.0j
    for term in my_terms:
        coeff, op, where = _term_to_quimb_operator(term)
        if op is None:
            # Pure-identity term: only the coefficient contributes.
            my_exp += coeff
            continue

        tn = qc.local_expectation_tn(op, where=where)
        if len(tn.outer_inds()) == 0:
            my_exp += coeff * complex(tn.contract())
            continue

        tree = parallel_search(
            tn,
            tn.outer_inds(),
            total_repeats,
            n_workers=search_workers,
            num_slices=1,
            n_ranks=size,
            timeout=search_timeout,
        )
        if tree is None:
            raise RuntimeError("Failed to find a contraction tree for expectation TN.")

        arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in tn.arrays]
        acc = sum(tree.contract_slice(arrays, i) for i in range(tree.multiplicity))
        if hasattr(acc, 'item'):
            val = complex(acc.item())
        else:
            val = complex(acc)
        my_exp += coeff * val
    t_total = time.time() - t0

    all_results = comm.gather(my_exp, root=0)
    if rank != 0:
        return None, t_total

    total_exp = sum(all_results)
    print(f"\n[TN expval] time={t_total:.4f}s expval={total_exp.real:.12f}")
    return np.real_if_close(total_exp), t_total
def main():
    """Command-line entry point for the MPI tensor-network benchmark.

    ``--mode expval`` computes an expectation value with ``run_mpi_expval``;
    ``--mode sv`` (default) computes the state vector with ``run_mpi`` and,
    unless ``--no-compare`` is given, compares it against qibojit. Rank 0
    writes the results as ``.npy`` files under ``data/``.

    Any exception raised by the run is reported on rank 0 and re-raised.
    """
    # Local import: ``os`` is only needed here to create the output directory.
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=30)
    parser.add_argument("--circuit", type=str, default="qft",
                        choices=["qft", "variational", "ghz", "brickwork"])
    parser.add_argument("--nlayers", type=int, default=3)
    parser.add_argument("--num-slices", type=int, default=1)
    parser.add_argument("--total-repeats", type=int, default=1024)
    parser.add_argument("--search-workers", type=int, default=1)
    parser.add_argument("--search-timeout", type=int, default=300)
    parser.add_argument("--observable-file", type=str, default=None)
    parser.add_argument("--observable-json", type=str, default=None)
    parser.add_argument("--save-path", type=str, default=None)
    parser.add_argument("--load-path", type=str, default=None)
    parser.add_argument("--no-compare", action="store_true")
    parser.add_argument("--mode", type=str, default="sv", choices=["sv", "expval"])
    args = parser.parse_args()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    if rank == 0:
        print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, "
              f"nlayers={args.nlayers}, ranks={comm.Get_size()}")

    # Fixed seed so every rank builds the same random circuit.
    np.random.seed(42)
    circuit = make_circuit(args.circuit, args.nqubits, args.nlayers)
    observable = _load_observable(args.observable_file, args.observable_json)

    if args.mode == "expval":
        try:
            expval, t_total = run_mpi_expval(
                circuit,
                args.nqubits,
                observable=observable,
                total_repeats=args.total_repeats,
                search_workers=args.search_workers,
                search_timeout=args.search_timeout,
            )
        except Exception as e:
            if rank == 0:
                print(f"[FAILED] {e}")
            raise
        if rank == 0:
            # np.save does not create missing directories.
            os.makedirs("data", exist_ok=True)
            np.save(f"data/expval_tn_{args.circuit}{args.nqubits}.npy", np.asarray(expval))
            if not args.no_compare:
                print("No built-in reference comparison for arbitrary observables.")
        return

    try:
        sv, t_total = run_mpi(circuit, args.nqubits, args.num_slices,
                              total_repeats=args.total_repeats,
                              load_path=args.load_path, save_path=args.save_path)
    except Exception as e:
        if rank == 0:
            print(f"[FAILED] {e}")
        raise

    # ``sv`` is None on non-root ranks and when run_mpi only saved a path.
    if rank == 0 and sv is not None:
        print(f"\n[quimb TN MPI] time={t_total:.4f}s shape={sv.shape}")
        # np.save does not create missing directories.
        os.makedirs("data", exist_ok=True)
        np.save(f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy", sv)
        if not args.no_compare:
            from qibotn.bak.benchmark_tn import run_qibojit
            import gc

            # Re-seed so the reference circuit matches the benchmarked one.
            np.random.seed(42)
            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
            sv_ref, t_ref = run_qibojit(circuit_ref)
            np.save(f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy", sv_ref)
            print(f"[qibojit] time={t_ref:.4f}s")
            # free memory before loading via mmap for expval comparison
            del sv, sv_ref
            gc.collect()
            from compare_jit_tn_quimb import check_results

            ref_path = f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy"
            tn_path = f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy"
            check_results(ref_path, tn_path, args.nqubits)
            if t_total > 0:
                print(f"Speedup : {t_ref/t_total:.2f}x")
if __name__ == "__main__":
main()

25
tools/check_tree.py Normal file
View File

@@ -0,0 +1,25 @@
"""Check contraction tree statistics."""
import pickle, sys
# Tree file: first command-line argument, or the default 25-qubit tree.
path = "data/tree_q25_l10.pkl" if len(sys.argv) <= 1 else sys.argv[1]

# NOTE: pickle.load can execute arbitrary code; only load trusted tree files.
with open(path, 'rb') as f:
    tree = pickle.load(f)

# Intel 8558P: 96 cores, 2.1GHz, AVX-512 (16 FP64/cycle), FMA x2
# complex128 multiply-add = 6 real FLOPs
CORES = 96
FREQ = 2.1e9
AVX512_FP64 = 16
TFLOPS = CORES * FREQ * AVX512_FP64 * 2 / 1e12  # ~6.45 TFLOPS real FP64
COMPLEX_FLOPS = TFLOPS / 6  # complex128 effective

flops = tree.total_flops()
slices = tree.multiplicity
# NOTE(review): if tree.total_flops() already accounts for tree.multiplicity,
# multiplying by `slices` here double-counts - confirm against cotengra.
total_work = flops * slices
est_seconds = total_work / (COMPLEX_FLOPS * 1e12)

print(f"File: {path}")
print(f"Peak memory (GB): {tree.max_size() * 16 / 1e9:.2f}")
print(f"Total FLOPs: {flops:.2e} x{slices} slices = {total_work:.2e}")
print(f"Contraction width: {tree.contraction_width()}")
print(f"Multiplicity (slices): {slices}")
print(f"Estimated time (96 cores): {est_seconds:.1f}s ({est_seconds/3600:.2f}h)")

View File

@@ -0,0 +1,137 @@
"""Compare QMatchaTeaBackend with the VidalBackend fast path."""
from __future__ import annotations
import argparse
import json
import math
import time
import numpy as np
import torch
from qibo import Circuit, gates, hamiltonians
from qibo.symbols import X, Y, Z
from qibotn.backends.qmatchatea import QMatchaTeaBackend
from qibotn.backends.vidal import VidalBackend
def build_circuit(nqubits, nlayers, seed, kind):
    """Build a layered test circuit with seeded random single-qubit rotations.

    Every layer applies ``RY`` then ``RZ`` with random angles to each qubit,
    followed by an entangling pattern selected by ``kind``
    (``"brickwall"``, ``"shifted-cz"`` or ``"reversed-cnot"``).

    Raises:
        ValueError: for an unknown ``kind`` (raised while building a layer).
    """
    rng = np.random.default_rng(seed)
    circuit = Circuit(nqubits)
    even_sites = range(0, nqubits - 1, 2)
    odd_sites = range(1, nqubits - 1, 2)

    for layer in range(nlayers):
        for q in range(nqubits):
            ry_angle = rng.uniform(-math.pi, math.pi)
            circuit.add(gates.RY(q, theta=ry_angle))
            rz_angle = rng.uniform(-math.pi, math.pi)
            circuit.add(gates.RZ(q, theta=rz_angle))

        if kind == "brickwall":
            pairs = [(q, q + 1) for q in even_sites] + [(q, q + 1) for q in odd_sites]
            for control, target in pairs:
                circuit.add(gates.CNOT(control, target))
        elif kind == "shifted-cz":
            for q in range(layer % 2, nqubits - 1, 2):
                circuit.add(gates.CZ(q, q + 1))
        elif kind == "reversed-cnot":
            # Even pairs are controlled from the right-hand qubit.
            pairs = [(q + 1, q) for q in even_sites] + [(q, q + 1) for q in odd_sites]
            for control, target in pairs:
                circuit.add(gates.CNOT(control, target))
        else:
            raise ValueError(f"Unknown circuit kind {kind!r}.")
    return circuit
def build_observable(nqubits, kind):
    """Build the symbolic Hamiltonian selected by ``kind``.

    Supported kinds are ``"ring-xz"``, ``"open-zz"`` and ``"mixed"``.

    Raises:
        ValueError: for an unknown ``kind``.
    """
    if kind == "ring-xz":
        terms = (0.5 * X(q) * Z((q + 1) % nqubits) for q in range(nqubits))
        form = sum(terms, 0)
    elif kind == "open-zz":
        terms = (Z(q) * Z(q + 1) / (nqubits - 1) for q in range(nqubits - 1))
        form = sum(terms, 0)
    elif kind == "mixed":
        boundary = 0 + (0.25 * X(0) - 0.5 * Z(nqubits - 1))
        pair_terms = (0.125 * Y(q) * Y(q + 1) for q in range(0, nqubits - 1, 3))
        form = sum(pair_terms, boundary)
    else:
        raise ValueError(f"Unknown observable kind {kind!r}.")
    return hamiltonians.SymbolicHamiltonian(form=form)
def run_backend(backend, circuit, observable):
    """Time one ``backend.expectation`` call.

    Returns:
        ``(value, seconds)``: the real part of the expectation as a float and
        the elapsed wall time.
    """
    t_begin = time.perf_counter()
    raw_value = backend.expectation(
        circuit,
        observable,
        preprocess=False,
        compile_circuit=True,
    )
    real_value = float(np.real(raw_value))
    return real_value, time.perf_counter() - t_begin
def main():
    """Run QMatchaTea (unless skipped) and Vidal on one case and print both.

    Each backend is reported on one line with its value, the absolute error
    against the optional reference file, and the wall time.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=34)
    parser.add_argument("--nlayers", type=int, default=20)
    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch")
    parser.add_argument("--torch-threads", type=int, default=32)
    parser.add_argument(
        "--circuit-kind",
        choices=("brickwall", "shifted-cz", "reversed-cnot"),
        default="brickwall",
    )
    parser.add_argument(
        "--observable-kind",
        choices=("ring-xz", "open-zz", "mixed"),
        default="ring-xz",
    )
    parser.add_argument("--reference-file")
    parser.add_argument("--skip-qmatchatea", action="store_true")
    args = parser.parse_args()

    torch.set_num_threads(args.torch_threads)
    circuit = build_circuit(args.nqubits, args.nlayers, args.seed, args.circuit_kind)
    observable = build_observable(args.nqubits, args.observable_kind)

    exact = None
    if args.reference_file:
        with open(args.reference_file, "r", encoding="utf-8") as f:
            exact = float(json.load(f)["expectation"])

    print(
        f"nqubits={args.nqubits} nlayers={args.nlayers} bond={args.bond} "
        f"circuit={args.circuit_kind} observable={args.observable_kind} "
        f"tensor_module={args.tensor_module} torch_threads={args.torch_threads}"
    )
    if exact is not None:
        print(f"exact={exact:.16e}")
    print("backend value abs_error seconds")

    def report(label, backend):
        """Run one configured backend and print its result line."""
        value, seconds = run_backend(backend, circuit, observable)
        error = float("nan") if exact is None else abs(value - exact)
        print(f"{label} {value:.16e} {error:.6e} {seconds:.3f}")

    if not args.skip_qmatchatea:
        qmt = QMatchaTeaBackend()
        qmt.configure_tn_simulation(
            ansatz="MPS",
            max_bond_dimension=args.bond,
            cut_ratio=1e-12,
            svd_control="E!",
            tensor_module=args.tensor_module,
            compile_circuit=True,
            track_memory=False,
        )
        report("qmatchatea", qmt)

    vidal = VidalBackend()
    vidal.configure_tn_simulation(
        ansatz="MPS",
        max_bond_dimension=args.bond,
        cut_ratio=1e-12,
        tensor_module=args.tensor_module,
        compile_circuit=True,
        fallback=True,
    )
    report("vidal", vidal)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,72 @@
"""Chrome trace profiler for the VidalBackend fast path."""
from __future__ import annotations
import argparse
from pathlib import Path
import torch
from torch.profiler import ProfilerActivity, profile
from qibotn.benchmark_cases import build_circuit, terms_to_dict, observable_terms
from qibotn.expectation_runner import ExpectationConfig, run_cpu_expectation
def main():
    """Profile one Vidal expectation run with the PyTorch CPU profiler.

    Writes a Chrome trace (``.json``) and the profiler tables (``.txt``)
    under ``profiles/``; the file names encode the case parameters.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=34)
    parser.add_argument("--nlayers", type=int, default=20)
    parser.add_argument("--bond", type=int, default=512)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--torch-threads", type=int, default=32)
    parser.add_argument("--cut-ratio", type=float, default=1e-12)
    parser.add_argument("--profile-memory", action="store_true")
    parser.add_argument("--rows", type=int, default=60)
    args = parser.parse_args()

    torch.set_num_threads(args.torch_threads)

    prefix = f"profiles/vidal_n{args.nqubits}_l{args.nlayers}_b{args.bond}_t{args.torch_threads}"
    trace_path = Path(f"{prefix}.json")
    table_path = Path(f"{prefix}.txt")
    trace_path.parent.mkdir(parents=True, exist_ok=True)

    circuit = build_circuit("brickwall_cnot", args.nqubits, args.nlayers, args.seed)
    observable = terms_to_dict(observable_terms("ring_xz", args.nqubits))
    config = ExpectationConfig(
        ansatz="mps",
        bond=args.bond,
        cut_ratio=args.cut_ratio,
        tensor_module="torch",
        torch_threads=args.torch_threads,
    )

    print(
        f"profile vidal nqubits={args.nqubits} nlayers={args.nlayers} "
        f"bond={args.bond} threads={args.torch_threads}"
    )

    # Shape, memory and stack recording are all switched by --profile-memory.
    detailed = args.profile_memory
    with profile(
        activities=[ProfilerActivity.CPU],
        record_shapes=detailed,
        profile_memory=detailed,
        with_stack=detailed,
    ) as prof:
        result = run_cpu_expectation(circuit, observable, config)

    averages = prof.key_averages()
    by_self = averages.table(sort_by='self_cpu_time_total', row_limit=args.rows)
    by_total = averages.table(sort_by='cpu_time_total', row_limit=args.rows)
    table = (
        f"expval={result.value:.16e}\n\n"
        "# sorted by self_cpu_time_total\n"
        f"{by_self}\n\n"
        "# sorted by cpu_time_total\n"
        f"{by_total}\n"
    )
    print(table, end="")
    table_path.write_text(table, encoding="utf-8")
    prof.export_chrome_trace(str(trace_path))
    print(f"trace={trace_path}\ntable={table_path}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,109 @@
"""Compute and cache a qibojit state-vector reference for the ring-XZ observable."""
import argparse
import json
import math
import time
from pathlib import Path
import numpy as np
import qibo
from qibo import Circuit, gates
def build_circuit(nqubits, nlayers, seed):
    """Build the brickwall-CNOT circuit with seeded random RY/RZ rotations.

    Each layer rotates every qubit (``RY`` then ``RZ``) and then applies CNOTs
    on the even nearest-neighbour pairs followed by the odd pairs.
    """
    rng = np.random.default_rng(seed)
    circuit = Circuit(nqubits)
    # Even pairs first, then odd pairs; identical for every layer.
    cnot_controls = list(range(0, nqubits - 1, 2)) + list(range(1, nqubits - 1, 2))

    for _ in range(nlayers):
        for qubit in range(nqubits):
            ry_angle = rng.uniform(-math.pi, math.pi)
            circuit.add(gates.RY(qubit, theta=ry_angle))
            rz_angle = rng.uniform(-math.pi, math.pi)
            circuit.add(gates.RZ(qubit, theta=rz_angle))
        for control in cnot_controls:
            circuit.add(gates.CNOT(control, control + 1))
    return circuit
def ring_xz_expectation(state, nqubits, chunk_size):
    """Evaluate ``0.5 * sum_i <X_i Z_((i+1) mod n)>`` on a state vector.

    The state is processed in index chunks of ``chunk_size`` amplitudes so
    that the temporary index arrays stay small. Qubit 0 is taken as the most
    significant bit of the basis-state index.

    Returns:
        The expectation value as a Python ``float``.
    """
    n_amplitudes = state.size
    total = 0.0
    for qubit in range(nqubits):
        neighbour = (qubit + 1) % nqubits
        # Bit mask flipped by X on `qubit`, and bit position read by Z on `neighbour`.
        flip_mask = 1 << (nqubits - 1 - qubit)
        z_position = nqubits - 1 - neighbour

        term = 0.0
        for lo in range(0, n_amplitudes, chunk_size):
            hi = min(lo + chunk_size, n_amplitudes)
            basis = np.arange(lo, hi, dtype=np.int64)
            sign = 1 - 2 * ((basis >> z_position) & 1)
            flipped = state[basis ^ flip_mask]
            term += np.vdot(flipped, sign * state[lo:hi]).real
        total += 0.5 * term
    return float(total)
def default_output_path(nqubits, nlayers, seed):
    """Return the default JSON cache path under ``references/`` for one case."""
    file_name = f"qibojit_ring_xz_n{nqubits}_l{nlayers}_seed{seed}.json"
    return Path("references") / file_name
def main():
    """Compute, or load from cache, the qibojit ring-XZ reference expectation.

    An existing output file is reused unless ``--force`` is given. Otherwise
    the circuit is simulated with qibojit and the result is written as JSON.

    Raises:
        MemoryError: if the estimated state-vector size exceeds
            ``--max-state-gb`` and ``--allow-large`` was not passed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=32)
    parser.add_argument("--nlayers", type=int, default=3)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--output")
    parser.add_argument("--force", action="store_true")
    parser.add_argument("--allow-large", action="store_true")
    parser.add_argument("--max-state-gb", type=float, default=32.0)
    parser.add_argument("--chunk-size", type=int, default=1 << 20)
    args = parser.parse_args()

    if args.output:
        output = Path(args.output)
    else:
        output = default_output_path(args.nqubits, args.nlayers, args.seed)

    # Reuse a cached reference unless a recomputation was forced.
    if output.exists() and not args.force:
        with open(output, "r", encoding="utf-8") as f:
            cached = json.load(f)
        print(f"loaded {output}")
        print(f"expectation={float(cached['expectation']):.16e}")
        return

    # Size of the complex128 state vector alone, in GiB.
    amplitude_bytes = np.dtype(np.complex128).itemsize
    state_gb = (2**args.nqubits) * amplitude_bytes / (1024**3)
    too_large = state_gb > args.max_state_gb
    if too_large and not args.allow_large:
        raise MemoryError(
            f"Estimated state vector alone is {state_gb:.1f} GiB. "
            "Pass --allow-large after confirming the node has enough memory."
        )

    qibo.set_backend("qibojit")
    circuit = build_circuit(args.nqubits, args.nlayers, args.seed)

    start = time.perf_counter()
    state = circuit().state(numpy=True).reshape(-1)
    expectation = ring_xz_expectation(state, args.nqubits, args.chunk_size)
    elapsed = time.perf_counter() - start

    record = {
        "backend": "qibojit",
        "observable": "0.5 * sum_i X_i Z_((i+1) mod n)",
        "nqubits": args.nqubits,
        "nlayers": args.nlayers,
        "seed": args.seed,
        "expectation": expectation,
        "seconds": elapsed,
        "state_vector_gib_estimate": state_gb,
    }
    output.parent.mkdir(parents=True, exist_ok=True)
    with open(output, "w", encoding="utf-8") as f:
        json.dump(record, f, indent=2, sort_keys=True)
        f.write("\n")

    print(f"saved {output}")
    print(f"expectation={expectation:.16e}")
    print(f"seconds={elapsed:.3f}")
if __name__ == "__main__":
main()

127
tools/run_cpu_large_cases.sh Executable file
View File

@@ -0,0 +1,127 @@
#!/usr/bin/env bash
set -euo pipefail

# Two-server CPU expectation benchmarks (large cases).
#
# The defaults target a pair of Intel Xeon Platinum 8558P servers with roughly
# 500 GiB of RAM each. Every knob below can be overridden from the
# environment: HOSTFILE, PYTHON_BIN, MPIEXEC, and the per-case settings.

# Work from the parent of this script's directory so that the relative
# defaults (interpreter, hostfile, benchmark script) resolve from there.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"

# `:=` assigns the default only when the variable is unset or empty.
: "${PYTHON_BIN:=.venv/bin/python}"
: "${MPIEXEC:=mpiexec}"
: "${HOSTFILE:=hostfile}"

# Rank counts and the per-rank --torch-threads values for MPS and TN runs.
: "${MPS_RANKS:=8}"
: "${MPS_THREADS:=12}"
: "${TN_RANKS:=12}"
: "${TN_THREADS:=8}"

# OpenMP/MKL thread counts default to 1 and are exported to child processes.
: "${OMP_NUM_THREADS:=1}"
: "${MKL_NUM_THREADS:=1}"
export OMP_NUM_THREADS MKL_NUM_THREADS
# Launch "$PYTHON_BIN <args...>" under MPI using $MPIEXEC and $HOSTFILE.
#   $1    number of ranks, passed to `-n`
#   $2..  arguments forwarded to the Python interpreter
run_mpi() {
  local nprocs="$1"
  local -a launcher=("$MPIEXEC" -hostfile "$HOSTFILE" -n "$nprocs")
  "${launcher[@]}" "$PYTHON_BIN" "${@:2}"
}
# Print a banner describing one benchmark case, then run it.
#   $1    human-readable title
#   $2..  the command (and its arguments) to execute
run_case() {
  local heading="$1"
  local rule="================================================================================"
  shift
  printf '\n%s\n%s\n%s\n' "$rule" "$heading" "$rule"
  printf 'HOSTFILE=%s PYTHON_BIN=%s MPIEXEC=%s\n' "$HOSTFILE" "$PYTHON_BIN" "$MPIEXEC"
  printf 'OMP_NUM_THREADS=%s MKL_NUM_THREADS=%s\n' "$OMP_NUM_THREADS" "$MKL_NUM_THREADS"
  # Echo the exact command line before executing it.
  printf '%s\n' "$*"
  "$@"
}
# Run one named benchmark group.
#   $1  mode: smoke | mps-long | mps-pressure | tn-long | all | help
#
# "all" dispatches through this function instead of re-executing "$0": the
# script has already changed directory to $ROOT_DIR, so a relative invocation
# path in "$0" would no longer resolve. Any other value (including "help")
# prints the usage text to stderr and exits with status 2.
run_mode() {
  case "$1" in
    smoke)
      run_case "MPS MPI smoke: n=40 layers=30 bond=2048" \
        run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
        --mpi --mps \
        --nqubits "${MPS_SMOKE_NQ:-40}" \
        --nlayers "${MPS_SMOKE_LAYERS:-30}" \
        --bond "${MPS_SMOKE_BOND:-2048}" \
        --torch-threads "$MPS_THREADS" \
        --circuits brickwall_cnot reversed_cnot shifted_cz \
        --observables ring_xz open_zz range2_xx
      run_case "TN MPI smoke: n=32 layers=16 target_slices=12" \
        run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \
        --mpi \
        --nqubits "${TN_SMOKE_NQ:-32}" \
        --nlayers "${TN_SMOKE_LAYERS:-16}" \
        --torch-threads "$TN_THREADS" \
        --circuits brickwall_cnot shifted_cz rxx_rzz \
        --observables ring_xz open_zz range2_xx \
        --tn-target-slices "${TN_SMOKE_SLICES:-12}"
      ;;
    mps-long)
      run_case "MPS MPI long: n=64 layers=48 bond=4096" \
        run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
        --mpi --mps \
        --nqubits "${MPS_LONG_NQ:-64}" \
        --nlayers "${MPS_LONG_LAYERS:-48}" \
        --bond "${MPS_LONG_BOND:-4096}" \
        --torch-threads "$MPS_THREADS" \
        --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
        --observables ring_xz open_zz mixed_local range2_xx
      ;;
    mps-pressure)
      run_case "MPS MPI pressure: n=80 layers=64 bond=4096" \
        run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
        --mpi --mps \
        --nqubits "${MPS_PRESSURE_NQ:-80}" \
        --nlayers "${MPS_PRESSURE_LAYERS:-64}" \
        --bond "${MPS_PRESSURE_BOND:-4096}" \
        --torch-threads "$MPS_THREADS" \
        --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz swap_scramble \
        --observables ring_xz open_zz mixed_local range2_xx long_z_string
      ;;
    tn-long)
      run_case "TN MPI long: n=36 layers=20 target_slices=24" \
        run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \
        --mpi \
        --nqubits "${TN_LONG_NQ:-36}" \
        --nlayers "${TN_LONG_LAYERS:-20}" \
        --torch-threads "$TN_THREADS" \
        --circuits brickwall_cnot shifted_cz rxx_rzz \
        --observables ring_xz open_zz range2_xx \
        --tn-target-slices "${TN_LONG_SLICES:-24}"
      ;;
    all)
      # With `set -e` active, a failing group aborts the remaining ones.
      run_mode smoke
      run_mode mps-long
      run_mode tn-long
      ;;
    help|*)
      cat >&2 <<'EOF'
Usage: tools/run_cpu_large_cases.sh [smoke|mps-long|mps-pressure|tn-long|all]
Common overrides:
HOSTFILE=hostfile
PYTHON_BIN=.venv/bin/python
MPIEXEC=mpiexec
MPS_RANKS=8 MPS_THREADS=12
TN_RANKS=12 TN_THREADS=8
Scale overrides:
MPS_LONG_NQ=64 MPS_LONG_LAYERS=48 MPS_LONG_BOND=4096
MPS_PRESSURE_NQ=80 MPS_PRESSURE_LAYERS=64 MPS_PRESSURE_BOND=4096
TN_LONG_NQ=36 TN_LONG_LAYERS=20 TN_LONG_SLICES=24
EOF
      exit 2
      ;;
  esac
}

run_mode "${1:-help}"

148
tools/run_cpu_single_cases.sh Executable file
View File

@@ -0,0 +1,148 @@
#!/usr/bin/env bash
set -euo pipefail

# Single-node CPU scale probes for the expectation benchmarks.
#
# Intended for one 96-core / ~500 GiB RAM node. Run the "probe" mode first: it
# covers moderate MPS and TN cases. Move on to the larger modes only after
# checking runtime and memory in the probe output. Invoked without a mode, the
# script prints its usage text.

# Work from the parent of this script's directory so that the relative
# defaults (interpreter, benchmark script) resolve from there.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"

# `:=` assigns the default only when the variable is unset or empty.
: "${PYTHON_BIN:=.venv/bin/python}"
: "${PYTHON_FLAGS:=-u}"
: "${MPIEXEC:=mpiexec}"
: "${TIME_BIN:=/usr/bin/time}"

# Rank counts and the per-rank --torch-threads values for MPS and TN runs.
: "${MPS_RANKS:=8}"
: "${MPS_THREADS:=12}"
: "${TN_RANKS:=8}"
: "${TN_THREADS:=12}"

# OpenMP/MKL thread counts default to 1 and are exported to child processes.
: "${OMP_NUM_THREADS:=1}"
: "${MKL_NUM_THREADS:=1}"
export OMP_NUM_THREADS MKL_NUM_THREADS
# Print a rough estimate of the memory held by the MPS tensors.
#   $1  number of qubits
#   $2  bond dimension
# The estimate is nqubits * 2 * bond^2 * 16 bytes, split evenly over
# $MPS_RANKS (presumably one complex128 tensor of shape (bond, 2, bond) per
# site -- confirm against the executor).
estimate_mps_memory() {
  local n_sites="$1"
  local chi="$2"
  "$PYTHON_BIN" - "$n_sites" "$chi" "$MPS_RANKS" <<'PY'
import sys

nqubits, bond, nranks = (int(arg) for arg in sys.argv[1:4])
GIB = 1024**3
total_bytes = nqubits * 2 * bond * bond * 16
rank_bytes = total_bytes / nranks
summary = (
    "MPS rough resident memory: "
    f"total={total_bytes / GIB:.1f} GiB "
    f"per_rank={rank_bytes / GIB:.1f} GiB "
    "(temporary eig/SVD workspaces are additional)"
)
print(summary)
PY
}
# Echo a command between separator lines, then run it under "$TIME_BIN -v" so
# that a resource report (e.g. peak RSS) follows the command's own output.
#   $@  the command and its arguments
# If $TIME_BIN cannot be found (e.g. /usr/bin/time is not installed on a
# minimal image), warn on stderr and run the command without the report
# instead of aborting the whole benchmark.
run_timed() {
  echo
  echo "--------------------------------------------------------------------------------"
  echo "$*"
  echo "--------------------------------------------------------------------------------"
  if command -v "$TIME_BIN" >/dev/null 2>&1; then
    "$TIME_BIN" -v "$@"
  else
    echo "warning: TIME_BIN=$TIME_BIN not found; running without resource report." >&2
    "$@"
  fi
}
# Run one MPS benchmark case under MPI with timing.
#   $1    label printed in the banner
#   $2    number of qubits
#   $3    number of layers
#   $4    bond dimension
#   $5..  extra arguments forwarded to benchmark_cpu_expectation.py
run_mps_case() {
  local title="$1"
  local n_sites="$2"
  local depth="$3"
  local chi="$4"
  local rule="================================================================================"
  printf '\n%s\n%s\n%s\n' "$rule" "$title" "$rule"
  printf 'PYTHON_BIN=%s MPIEXEC=%s\n' "$PYTHON_BIN" "$MPIEXEC"
  printf 'MPS_RANKS=%s MPS_THREADS=%s\n' "$MPS_RANKS" "$MPS_THREADS"
  printf 'OMP_NUM_THREADS=%s MKL_NUM_THREADS=%s\n' "$OMP_NUM_THREADS" "$MKL_NUM_THREADS"
  estimate_mps_memory "$n_sites" "$chi"
  # PYTHON_FLAGS is intentionally unquoted so that it splits into several flags.
  run_timed "$MPIEXEC" -n "$MPS_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \
    --mpi --mps \
    --nqubits "$n_sites" \
    --nlayers "$depth" \
    --bond "$chi" \
    --torch-threads "$MPS_THREADS" \
    "${@:5}"
}
# Run one tensor-network (non-MPS) benchmark case under MPI with timing.
#   $1    label printed in the banner
#   $2    number of qubits
#   $3    number of layers
#   $4..  extra arguments forwarded to benchmark_cpu_expectation.py
run_tn_case() {
  local title="$1"
  local n_sites="$2"
  local depth="$3"
  local rule="================================================================================"
  printf '\n%s\n%s\n%s\n' "$rule" "$title" "$rule"
  printf 'PYTHON_BIN=%s MPIEXEC=%s\n' "$PYTHON_BIN" "$MPIEXEC"
  printf 'TN_RANKS=%s TN_THREADS=%s\n' "$TN_RANKS" "$TN_THREADS"
  printf 'OMP_NUM_THREADS=%s MKL_NUM_THREADS=%s\n' "$OMP_NUM_THREADS" "$MKL_NUM_THREADS"
  printf '%s\n' "TN memory is contraction-tree dependent; increase --tn-target-slices if RSS is high."
  # PYTHON_FLAGS is intentionally unquoted so that it splits into several flags.
  run_timed "$MPIEXEC" -n "$TN_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \
    --mpi \
    --nqubits "$n_sites" \
    --nlayers "$depth" \
    --torch-threads "$TN_THREADS" \
    "${@:4}"
}
# Print the usage text on stdout; the caller decides where to redirect it.
print_usage() {
  cat <<'EOF'
Usage: tools/run_cpu_single_cases.sh [probe|mps-medium|mps-long|tn-medium|tn-long]
Common overrides:
PYTHON_BIN=.venv/bin/python
MPIEXEC=mpiexec
MPS_RANKS=8 MPS_THREADS=12
TN_RANKS=8 TN_THREADS=12
OMP_NUM_THREADS=1 MKL_NUM_THREADS=1
EOF
}

# Mode dispatch. A missing, "help", or unrecognized mode prints the usage text
# to stderr and exits with status 2.
selected_mode="${1:-help}"
case "$selected_mode" in
  probe)
    # Moderate MPS and TN cases; run these before the larger modes.
    run_mps_case "MPS probe: n=40 layers=30 bond=2048" 40 30 2048 \
      --circuits brickwall_cnot \
      --observables ring_xz
    run_tn_case "TN probe: n=28 layers=12 target_slices=8" 28 12 \
      --circuits brickwall_cnot \
      --observables ring_xz \
      --tn-target-slices 8
    ;;
  mps-medium)
    run_mps_case "MPS medium: n=56 layers=40 bond=3072" 56 40 3072 \
      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
      --observables ring_xz open_zz mixed_local range2_xx
    ;;
  mps-long)
    run_mps_case "MPS long: n=64 layers=48 bond=4096" 64 48 4096 \
      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
      --observables ring_xz open_zz mixed_local range2_xx
    ;;
  tn-medium)
    run_tn_case "TN medium: n=32 layers=16 target_slices=16" 32 16 \
      --circuits brickwall_cnot shifted_cz rxx_rzz \
      --observables ring_xz open_zz range2_xx \
      --tn-target-slices 16
    ;;
  tn-long)
    run_tn_case "TN long: n=36 layers=20 target_slices=32" 36 20 \
      --circuits brickwall_cnot shifted_cz rxx_rzz \
      --observables ring_xz open_zz range2_xx \
      --tn-target-slices 32
    ;;
  help|*)
    print_usage >&2
    exit 2
    ;;
esac

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env bash
set -euo pipefail

# Defaults for the scan; `:=` assigns only when the variable is unset or empty.

# Problem definition.
: "${NQ:=34}"
: "${LAYERS:=20}"
: "${BOND:=512}"
: "${SEED:=42}"
: "${CIRCUIT:=brickwall_cnot}"
: "${OBSERVABLE:=ring_xz}"
: "${EXACT:=0}"

# Scan points: the i-th entry of RANKS is paired with the i-th entry of THREADS.
: "${RANKS:=1 2 4}"
: "${THREADS:=32 32 16}"

# Launchers.
: "${PYTHON_BIN:=.venv/bin/python}"
: "${MPIEXEC:=mpiexec}"

# Work from the parent of this script's directory so that the relative
# interpreter and benchmark-script paths resolve from there.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"
# Print the usage text on stdout; the caller decides where to redirect it.
print_usage() {
  cat <<'EOF'
Usage: tools/run_vidal_segment_mpi_scan.sh run
Overrides:
NQ=34 LAYERS=20 BOND=512 SEED=42
RANKS="1 2 4" THREADS="32 32 16"
CIRCUIT=brickwall_cnot OBSERVABLE=ring_xz
EXACT=1
PYTHON_BIN=.venv/bin/python MPIEXEC=mpiexec
EOF
}

# Only "run" proceeds to the scan. A missing argument or "help" prints the
# usage to stderr and exits 0; anything else prints it and exits 2.
case "${1:-help}" in
  run)
    ;;
  help)
    print_usage >&2
    exit 0
    ;;
  *)
    print_usage >&2
    exit 2
    ;;
esac
# Split the whitespace-separated scan lists into arrays; entries are paired
# by position, so both lists must have the same length.
read -r -a ranks <<< "$RANKS"
read -r -a threads <<< "$THREADS"
if (( ${#ranks[@]} != ${#threads[@]} )); then
  printf '%s\n' "RANKS and THREADS must have the same number of entries." >&2
  exit 2
fi

# Benchmark arguments shared by every scan point.
common=(
  --nqubits "$NQ"
  --nlayers "$LAYERS"
  --bond "$BOND"
  --seed "$SEED"
  --mps
  --circuits "$CIRCUIT"
  --observables "$OBSERVABLE"
)
case "$EXACT" in
  1) common+=(--exact) ;;
esac

for point in "${!ranks[@]}"; do
  n_ranks="${ranks[$point]}"
  n_threads="${threads[$point]}"
  job=(
    "$PYTHON_BIN" -u benchmark_cpu_expectation.py
    "${common[@]}" --torch-threads "$n_threads"
  )
  if [[ "$n_ranks" != "1" ]]; then
    # More than one rank: launch through MPI and enable the --mpi path.
    printf '%s\n' "== Vidal segmented MPI ranks=$n_ranks torch_threads=$n_threads =="
    "$MPIEXEC" -n "$n_ranks" "${job[@]}" --mpi
    continue
  fi
  # A single rank runs the plain serial executor without mpiexec.
  printf '%s\n' "== Vidal serial ranks=1 torch_threads=$n_threads =="
  "${job[@]}"
done

View File

@@ -0,0 +1,202 @@
"""Correctness checks for the Vidal/TEBD MPS fast path.
The cases here intentionally cover more than the benchmark ring-XZ observable:
different nearest-neighbor gate orientations and several Pauli-sum observables.
Run serially to compare qibojit/statevector vs Vidal, or under MPI to compare
the segmented Vidal executor.
"""
from __future__ import annotations
import argparse
import math
import time
import numpy as np
import torch
from qibo import Circuit, gates
from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
from qibotn.backends.vidal_tebd import VidalTEBDExecutor
def build_circuit(kind, nqubits, nlayers, seed):
    """Build a layered nearest-neighbor test circuit with random rotations.

    Every layer applies ``RY`` then ``RZ`` (and, for ``"rx_ry_cz"``, also
    ``RX``) with angles drawn uniformly from ``[-pi, pi)`` to each qubit,
    followed by an entangling pattern selected by ``kind``:

    * ``"brickwall"``: ``CNOT(q, q + 1)`` on bonds starting at even ``q``,
      then on bonds starting at odd ``q``.
    * ``"reversed_cnot"``: same bonds, but control and target are swapped on
      the even-start bonds in odd layers and on the odd-start bonds in even
      layers.
    * ``"rx_ry_cz"``: ``CZ(q, q + 1)`` on bonds starting at ``layer % 2``.

    Args:
        kind: one of ``"brickwall"``, ``"reversed_cnot"``, ``"rx_ry_cz"``.
        nqubits: number of qubits in the circuit.
        nlayers: number of layers.
        seed: seed for ``numpy.random.default_rng``.

    Returns:
        The assembled ``Circuit``.

    Raises:
        ValueError: if ``kind`` is not recognized. This is checked before any
            gate is created, so it is also reported when ``nlayers`` is 0.
    """
    if kind not in ("brickwall", "reversed_cnot", "rx_ry_cz"):
        raise ValueError(f"Unknown circuit kind {kind!r}.")

    rng = np.random.default_rng(seed)
    circuit = Circuit(nqubits)
    with_rx = kind == "rx_ry_cz"
    reversed_kind = kind == "reversed_cnot"

    for layer in range(nlayers):
        # Single-qubit rotations; the draw order (RY, RZ, RX per qubit) fixes
        # the angles for a given seed.
        for q in range(nqubits):
            circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
            circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
            if with_rx:
                circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi)))

        if with_rx:
            for q in range(layer % 2, nqubits - 1, 2):
                circuit.add(gates.CZ(q, q + 1))
            continue

        odd_layer = bool(layer % 2)
        for q in range(0, nqubits - 1, 2):
            if reversed_kind and odd_layer:
                circuit.add(gates.CNOT(q + 1, q))
            else:
                circuit.add(gates.CNOT(q, q + 1))
        for q in range(1, nqubits - 1, 2):
            if reversed_kind and not odd_layer:
                circuit.add(gates.CNOT(q + 1, q))
            else:
                circuit.add(gates.CNOT(q, q + 1))
    return circuit
def observable_terms(kind, nqubits):
    """Return the Pauli-sum terms of a named test observable.

    Each term is ``(coefficient, ops)`` where ``ops`` is a tuple of
    ``(pauli_name, site)`` pairs.

    * ``"ring_xz"``: ``0.5 * X_i Z_((i + 1) mod nqubits)`` for every site.
    * ``"open_zz"``: ``Z_i Z_(i + 1) / (nqubits - 1)`` for every open-chain bond.
    * ``"mixed_local"``: ``0.25 * X_0``, ``-0.5 * Z_(nqubits - 1)`` and
      ``0.125 * Y_i Y_(i + 1)`` on every third bond starting at site 0.

    Raises:
        ValueError: if ``kind`` is not one of the names above.
    """
    terms = []
    if kind == "ring_xz":
        for site in range(nqubits):
            neighbor = (site + 1) % nqubits
            terms.append((0.5, (("X", site), ("Z", neighbor))))
    elif kind == "open_zz":
        for site in range(nqubits - 1):
            terms.append((1.0 / (nqubits - 1), (("Z", site), ("Z", site + 1))))
    elif kind == "mixed_local":
        terms.append((0.25, (("X", 0),)))
        terms.append((-0.5, (("Z", nqubits - 1),)))
        for site in range(0, nqubits - 1, 3):
            terms.append((0.125, (("Y", site), ("Y", site + 1))))
    else:
        raise ValueError(f"Unknown observable kind {kind!r}.")
    return terms
def exact_pauli_sum(circuit, terms, nqubits):
    """Evaluate a Pauli-sum expectation value from the dense state vector.

    The circuit is executed once and its state flattened. Each Pauli string is
    applied as an index permutation (bit flips for X/Y) combined with a
    per-amplitude phase (signs for Z, ``+/-1j`` for Y), so no operator matrix
    is built. Site 0 maps to the most significant bit of the basis index.

    Args:
        circuit: callable whose result exposes ``state(numpy=True)``.
        terms: iterable of ``(coefficient, ops)`` with ``ops`` an iterable of
            ``(pauli_name, site)``; names are case-insensitive and may be
            ``"I"``, ``"X"``, ``"Y"`` or ``"Z"``.
        nqubits: number of qubits.

    Returns:
        The real part of the summed expectation value, as a ``float``.

    Raises:
        ValueError: if a Pauli name is not one of I, X, Y, Z.
    """
    psi = circuit().state(numpy=True).reshape(-1)
    dim = psi.size
    basis = np.arange(dim, dtype=np.int64)
    total = 0.0 + 0.0j
    for coeff, ops in terms:
        # target[i] is the basis state that the Pauli string maps |i> onto.
        target = basis.copy()
        phase = np.ones(dim, dtype=np.complex128)
        for label, site in ops:
            shift = nqubits - 1 - site
            bit = (basis >> shift) & 1
            pauli = label.upper()
            if pauli == "I":
                continue
            if pauli not in ("X", "Y", "Z"):
                raise ValueError(f"Unsupported Pauli {pauli!r}.")
            if pauli != "Z":
                target ^= 1 << shift
            if pauli == "Y":
                phase *= 1j * (1 - 2 * bit)
            elif pauli == "Z":
                phase *= 1 - 2 * bit
        # <psi|P|psi> = sum_i conj(psi[target[i]]) * phase[i] * psi[i]
        total += coeff * np.vdot(psi[target], phase * psi)
    return float(total.real)
def run_vidal(circuit, terms, nqubits, bond, tensor_module):
    """Run ``circuit`` on the serial Vidal/TEBD executor and evaluate ``terms``.

    Args:
        circuit: circuit passed to ``VidalTEBDExecutor.run_circuit``.
        terms: Pauli-sum terms passed to ``expectation_pauli_sum``.
        nqubits: number of qubits.
        bond: value forwarded as the executor's ``max_bond``.
        tensor_module: value forwarded as the executor's ``tensor_module``.

    Returns:
        The expectation value converted to ``float``.
    """
    settings = {
        "nqubits": nqubits,
        "max_bond": bond,
        # Fixed for all checks; presumably the truncation threshold -- confirm
        # against VidalTEBDExecutor.
        "cut_ratio": 1e-12,
        "tensor_module": tensor_module,
    }
    vidal = VidalTEBDExecutor(**settings)
    vidal.run_circuit(circuit)
    expectation = vidal.expectation_pauli_sum(terms)
    return float(expectation)
def run_segment_mpi(circuit, terms, nqubits, bond, tensor_module, comm):
    """Run ``circuit`` on the segmented Vidal MPI executor and evaluate ``terms``.

    Args:
        circuit: circuit passed to ``SegmentVidalMPIExecutor.run_circuit``.
        terms: Pauli-sum terms passed to ``expectation_pauli_sum_root``.
        nqubits: number of qubits.
        bond: value forwarded as the executor's ``max_bond``.
        tensor_module: value forwarded as the executor's ``tensor_module``.
        comm: MPI communicator handed to the executor.

    Returns:
        Whatever ``expectation_pauli_sum_root`` returns, unconverted. The
        caller in this file only uses the value on rank 0.
    """
    settings = {
        "nqubits": nqubits,
        "max_bond": bond,
        # Same fixed cut_ratio as the serial helper.
        "cut_ratio": 1e-12,
        "tensor_module": tensor_module,
        "comm": comm,
    }
    segmented = SegmentVidalMPIExecutor(**settings)
    segmented.run_circuit(circuit)
    return segmented.expectation_pauli_sum_root(terms)
def main():
    """Compare Vidal expectation values against the dense state-vector result.

    For every requested circuit kind, rank 0 computes the exact value of every
    requested observable from the state vector. Each (circuit, observable)
    pair is then evaluated with the serial Vidal executor, or with the
    segmented MPI executor when ``--mpi`` is given, and rank 0 prints one line
    with the exact value, the computed value, the absolute error and the
    elapsed seconds.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=16)
    parser.add_argument("--nlayers", type=int, default=6)
    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch")
    parser.add_argument("--torch-threads", type=int, default=32)
    parser.add_argument("--mpi", action="store_true")
    parser.add_argument(
        "--circuits",
        nargs="+",
        default=("brickwall", "reversed_cnot", "rx_ry_cz"),
    )
    parser.add_argument(
        "--observables",
        nargs="+",
        default=("ring_xz", "open_zz", "mixed_local"),
    )
    args = parser.parse_args()
    torch.set_num_threads(args.torch_threads)

    # Without --mpi the script behaves as a single "rank 0" process.
    comm, rank, size = None, 0, 1
    if args.mpi:
        # Imported lazily so serial runs do not need mpi4py.
        from mpi4py import MPI

        comm = MPI.COMM_WORLD
        rank, size = comm.Get_rank(), comm.Get_size()
    is_root = rank == 0

    if is_root:
        mode = "vidal"
        if args.mpi:
            mode = f"vidal-segment-mpi/{size}"
        print(
            f"mode={mode} nqubits={args.nqubits} nlayers={args.nlayers} "
            f"bond={args.bond} tensor_module={args.tensor_module}"
        )
        print("circuit observable exact value abs_error seconds")

    for circuit_kind in args.circuits:
        circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed)

        # Only rank 0 computes the dense references; they are then broadcast.
        exact_values = None
        if is_root:
            exact_values = {}
            for obs in args.observables:
                reference_terms = observable_terms(obs, args.nqubits)
                exact_values[obs] = exact_pauli_sum(
                    circuit, reference_terms, args.nqubits
                )
        if comm is not None:
            exact_values = comm.bcast(exact_values, root=0)

        for obs_kind in args.observables:
            terms = observable_terms(obs_kind, args.nqubits)
            start = time.perf_counter()
            if args.mpi:
                value = run_segment_mpi(
                    circuit,
                    terms,
                    args.nqubits,
                    args.bond,
                    args.tensor_module,
                    comm,
                )
            else:
                value = run_vidal(
                    circuit, terms, args.nqubits, args.bond, args.tensor_module
                )
            # Every rank takes part in the run above; only rank 0 reports.
            if not is_root:
                continue
            elapsed = time.perf_counter() - start
            exact = exact_values[obs_kind]
            print(
                f"{circuit_kind} {obs_kind} {exact:.16e} {value:.16e} "
                f"{abs(value - exact):.6e} {elapsed:.3f}"
            )


if __name__ == "__main__":
    main()