This commit is contained in:
tankya2
2024-02-01 14:42:17 +08:00
parent 6f4ffa777a
commit 31fe1ef0c9
4 changed files with 49 additions and 139 deletions

View File

@@ -2,6 +2,8 @@ import cupy as cp
from cuquantum.cutensornet.experimental import contract_decompose
from cuquantum import contract
# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
def initial(num_qubits, dtype):
"""

View File

@@ -1,6 +1,8 @@
import cupy as cp
import numpy as np
# Reference: https://github.com/NVIDIA/cuQuantum/tree/main/python/samples/cutensornet/circuit_converter
class QiboCircuitToEinsum:
"""Convert a circuit to a Tensor Network (TN) representation.
@@ -159,9 +161,7 @@ class QiboCircuitToEinsum:
return gates
def expectation_operands(self, pauli_string):
# assign pauli string to qubit
# _get_forward_inverse_metadata()
input_bitstring = "0" * self.circuit.nqubits # Need all qubits!
input_bitstring = "0" * self.circuit.nqubits
input_operands = self._get_bitstring_tensors(input_bitstring)
pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string))
@@ -185,8 +185,6 @@ class QiboCircuitToEinsum:
next_frontier = max(qubits_frontier.values()) + 1
# input_mode_labels, input_operands, qubits_frontier, next_frontier, inverse_gates = self._get_forward_inverse_metadata(coned_qubits)
pauli_gates = self.get_pauli_gates(
pauli_map, dtype=self.dtype, backend=self.backend
)
@@ -208,18 +206,4 @@ class QiboCircuitToEinsum:
operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
# expec = contract(*operand_exp_interleave)
# print(expec)
"""
gate_mode_labels, gate_operands = circ_utils.parse_gates_to_mode_labels_operands(gates,
qubits_frontier,
next_frontier)
mode_labels = input_mode_labels + gate_mode_labels + [[qubits_frontier[ix]] for ix in self.qubits]
operands = input_operands + gate_operands + input_operands[:n_qubits]
output_mode_labels = []
expression = circ_utils.convert_mode_labels_to_expression(mode_labels, output_mode_labels)
"""
return operand_exp_interleave

View File

@@ -1,7 +1,5 @@
from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum
from cuquantum import contract
from cuquantum import cutensornet as cutn
import multiprocessing
from cupy.cuda.runtime import getDeviceCount
import cupy as cp
@@ -10,15 +8,17 @@ from qibotn.mps_contraction_helper import MPSContractionHelper
def dense_vector_tn(qibo_circ, datatype):
"""Convert qibo circuit to tensornet (TN) format and perform contraction to dense vector."""
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
return contract(*myconvertor.state_vector_operands())
def expectation_pauli_tn(qibo_circ, datatype, pauli_string):
def expectation_pauli_tn(qibo_circ, datatype, pauli_string_pattern):
"""Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string."""
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
return contract(
*myconvertor.expectation_operands(
pauli_string_gen(qibo_circ.nqubits, pauli_string)
pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
)
)
@@ -30,54 +30,35 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
"""
from mpi4py import MPI # this line initializes MPI
import socket
from mpi4py import MPI
from cuquantum import Network
# Get the hostname
# hostname = socket.gethostname()
root = 0
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount()
# Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.state_vector_operands()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
# Broadcast the operand data.
# operands = comm.bcast(operands, root)
operands = myconvertor.state_vector_operands()
# Assign the device for each process.
device_id = rank % getDeviceCount()
# dev = cp.cuda.Device(device_id)
# free_mem, total_mem = dev.mem_info
# print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
# Create network object.
network = Network(*operands, options={"device_id": device_id})
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
)
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks.
opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
# if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks.
info = comm.bcast(info, sender)
@@ -95,45 +76,30 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
)
slices = range(slice_begin, slice_end)
# print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for.
result = network.contract(slices=slices)
# print(f"Process {rank} result shape is : {result.shape}.")
# print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root.
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
"""
path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
num_slices = opt_info.num_slices#Andy
chunk, extra = num_slices // size, num_slices % size#Andy
slice_begin = rank * chunk + min(rank, extra)#Andy
slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy
slices = range(slice_begin, slice_end)#Andy
result = network.contract(slices=slices)
"""
return result, rank
def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
from mpi4py import MPI # this line initializes MPI
import socket
"""Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through NCCL.
The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
"""
from mpi4py import MPI
from cuquantum import Network
from cupy.cuda import nccl
# Get the hostname
# hostname = socket.gethostname()
root = 0
comm_mpi = MPI.COMM_WORLD
rank = comm_mpi.Get_rank()
size = comm_mpi.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount()
cp.cuda.Device(device_id).use()
@@ -145,27 +111,18 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
# Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.state_vector_operands()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
network = Network(*operands)
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
)
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks.
opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
# if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks.
info = comm_mpi.bcast(info, sender)
@@ -183,12 +140,8 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
)
slices = range(slice_begin, slice_end)
# print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for.
result = network.contract(slices=slices)
# print(f"Process {rank} result shape is : {result.shape}.")
# print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root.
stream_ptr = cp.cuda.get_current_stream().ptr
@@ -205,21 +158,22 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
return result, rank
def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
from mpi4py import MPI # this line initializes MPI
import socket
def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
"""Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through NCCL.
The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system.
The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
After pathfinding the optimal path is used in the actual contraction to give an expectation value.
"""
from mpi4py import MPI
from cuquantum import Network
from cupy.cuda import nccl
# Get the hostname
# hostname = socket.gethostname()
root = 0
comm_mpi = MPI.COMM_WORLD
rank = comm_mpi.Get_rank()
size = comm_mpi.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount()
cp.cuda.Device(device_id).use()
@@ -231,30 +185,20 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
# Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.expectation_operands(
pauli_string_gen(qibo_circ.nqubits, pauli_string)
pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
network = Network(*operands)
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
)
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks.
opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
# if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks.
info = comm_mpi.bcast(info, sender)
@@ -272,12 +216,8 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
)
slices = range(slice_begin, slice_end)
# print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for.
result = network.contract(slices=slices)
# print(f"Process {rank} result shape is : {result.shape}.")
# print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root.
stream_ptr = cp.cuda.get_current_stream().ptr
@@ -294,57 +234,44 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
return result, rank
def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
"""Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through MPI.
The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system.
The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
After pathfinding the optimal path is used in the actual contraction to give an expectation value.
"""
from mpi4py import MPI # this line initializes MPI
import socket
from cuquantum import Network
# Get the hostname
# hostname = socket.gethostname()
root = 0
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount()
# Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.expectation_operands(
pauli_string_gen(qibo_circ.nqubits, pauli_string)
)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
# Broadcast the operand data.
# operands = comm.bcast(operands, root)
operands = myconvertor.expectation_operands(
pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
)
# Assign the device for each process.
device_id = rank % getDeviceCount()
# dev = cp.cuda.Device(device_id)
# free_mem, total_mem = dev.mem_info
# print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
# Create network object.
network = Network(*operands, options={"device_id": device_id})
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
)
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks.
opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
# if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks.
info = comm.bcast(info, sender)
@@ -362,12 +289,8 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
)
slices = range(slice_begin, slice_end)
# print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for.
result = network.contract(slices=slices)
# print(f"Process {rank} result shape is : {result.shape}.")
# print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root.
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
@@ -376,8 +299,7 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
def dense_vector_mps(qibo_circ, gate_algo, datatype):
"""Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector.
"""
"""Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector."""
myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype)
mps_helper = MPSContractionHelper(myconvertor.num_qubits)
@@ -387,7 +309,7 @@ def dense_vector_mps(qibo_circ, gate_algo, datatype):
def pauli_string_gen(nqubits, pauli_string_pattern):
""" Used internally to generate the string based on given pattern and number of qubit.
"""Used internally to generate the string based on given pattern and number of qubit.
Example: pattern: "XZ", number of qubit: 7, output = XZXZXZX
"""
if nqubits <= 0:

View File

@@ -1,5 +1,7 @@
from cuquantum import contract, contract_path, CircuitToEinsum, tensor
# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
class MPSContractionHelper:
"""
@@ -85,7 +87,7 @@ class MPSContractionHelper:
self, mps_tensors, operator, qubits, options=None, normalize=False
):
"""
Contract the corresponding tensor network to form the state vector representation of the MPS.
Contract the corresponding tensor network to form the expectation of the MPS.
Args:
mps_tensors: A list of rank-3 ndarray-like tensor objects.