This commit is contained in:
tankya2
2024-02-01 14:42:17 +08:00
committed by yangliwei
parent 50f1fcd6e8
commit 09c741ea80
4 changed files with 51 additions and 138 deletions

View File

@@ -2,6 +2,8 @@ import cupy as cp
from cuquantum import contract from cuquantum import contract
from cuquantum.cutensornet.experimental import contract_decompose from cuquantum.cutensornet.experimental import contract_decompose
# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
def initial(num_qubits, dtype): def initial(num_qubits, dtype):
""" """

View File

@@ -1,6 +1,8 @@
import cupy as cp import cupy as cp
import numpy as np import numpy as np
# Reference: https://github.com/NVIDIA/cuQuantum/tree/main/python/samples/cutensornet/circuit_converter
class QiboCircuitToEinsum: class QiboCircuitToEinsum:
"""Convert a circuit to a Tensor Network (TN) representation. """Convert a circuit to a Tensor Network (TN) representation.
@@ -159,9 +161,7 @@ class QiboCircuitToEinsum:
return gates return gates
def expectation_operands(self, pauli_string): def expectation_operands(self, pauli_string):
# assign pauli string to qubit input_bitstring = "0" * self.circuit.nqubits
# _get_forward_inverse_metadata()
input_bitstring = "0" * self.circuit.nqubits # Need all qubits!
input_operands = self._get_bitstring_tensors(input_bitstring) input_operands = self._get_bitstring_tensors(input_bitstring)
pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string)) pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string))
@@ -185,8 +185,6 @@ class QiboCircuitToEinsum:
next_frontier = max(qubits_frontier.values()) + 1 next_frontier = max(qubits_frontier.values()) + 1
# input_mode_labels, input_operands, qubits_frontier, next_frontier, inverse_gates = self._get_forward_inverse_metadata(coned_qubits)
pauli_gates = self.get_pauli_gates( pauli_gates = self.get_pauli_gates(
pauli_map, dtype=self.dtype, backend=self.backend pauli_map, dtype=self.dtype, backend=self.backend
) )
@@ -208,18 +206,4 @@ class QiboCircuitToEinsum:
operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y] operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
# expec = contract(*operand_exp_interleave)
# print(expec)
"""
gate_mode_labels, gate_operands = circ_utils.parse_gates_to_mode_labels_operands(gates,
qubits_frontier,
next_frontier)
mode_labels = input_mode_labels + gate_mode_labels + [[qubits_frontier[ix]] for ix in self.qubits]
operands = input_operands + gate_operands + input_operands[:n_qubits]
output_mode_labels = []
expression = circ_utils.convert_mode_labels_to_expression(mode_labels, output_mode_labels)
"""
return operand_exp_interleave return operand_exp_interleave

View File

@@ -3,7 +3,8 @@ import multiprocessing
import cupy as cp import cupy as cp
from cupy.cuda.runtime import getDeviceCount from cupy.cuda.runtime import getDeviceCount
from cuquantum import contract from cuquantum import contract
from cuquantum import cutensornet as cutn from cupy.cuda.runtime import getDeviceCount
import cupy as cp
from qibotn.mps_contraction_helper import MPSContractionHelper from qibotn.mps_contraction_helper import MPSContractionHelper
from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum
@@ -11,15 +12,17 @@ from qibotn.QiboCircuitToMPS import QiboCircuitToMPS
def dense_vector_tn(qibo_circ, datatype): def dense_vector_tn(qibo_circ, datatype):
"""Convert qibo circuit to tensornet (TN) format and perform contraction to dense vector."""
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
return contract(*myconvertor.state_vector_operands()) return contract(*myconvertor.state_vector_operands())
def expectation_pauli_tn(qibo_circ, datatype, pauli_string): def expectation_pauli_tn(qibo_circ, datatype, pauli_string_pattern):
"""Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string."""
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
return contract( return contract(
*myconvertor.expectation_operands( *myconvertor.expectation_operands(
pauli_string_gen(qibo_circ.nqubits, pauli_string) pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
) )
) )
@@ -31,54 +34,35 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN. After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
""" """
from mpi4py import MPI # this line initializes MPI from mpi4py import MPI
import socket
from cuquantum import Network from cuquantum import Network
# Get the hostname
# hostname = socket.gethostname()
root = 0 root = 0
comm = MPI.COMM_WORLD comm = MPI.COMM_WORLD
rank = comm.Get_rank() rank = comm.Get_rank()
size = comm.Get_size() size = comm.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
# Perform circuit conversion # Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.state_vector_operands()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
# Broadcast the operand data. operands = myconvertor.state_vector_operands()
# operands = comm.bcast(operands, root)
# Assign the device for each process. # Assign the device for each process.
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
# dev = cp.cuda.Device(device_id)
# free_mem, total_mem = dev.mem_info
# print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
# Create network object. # Create network object.
network = Network(*operands, options={"device_id": device_id}) network = Network(*operands, options={"device_id": device_id})
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path( path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
) )
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks. # Select the best path from all ranks.
opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
# if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks. # Broadcast info from the sender to all other ranks.
info = comm.bcast(info, sender) info = comm.bcast(info, sender)
@@ -96,45 +80,30 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
) )
slices = range(slice_begin, slice_end) slices = range(slice_begin, slice_end)
# print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for. # Contract the group of slices the process is responsible for.
result = network.contract(slices=slices) result = network.contract(slices=slices)
# print(f"Process {rank} result shape is : {result.shape}.")
# print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root. # Sum the partial contribution from each process on root.
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root) result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
"""
path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
num_slices = opt_info.num_slices#Andy
chunk, extra = num_slices // size, num_slices % size#Andy
slice_begin = rank * chunk + min(rank, extra)#Andy
slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy
slices = range(slice_begin, slice_end)#Andy
result = network.contract(slices=slices)
"""
return result, rank return result, rank
def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8): def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
from mpi4py import MPI # this line initializes MPI """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through NCCL.
import socket The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
"""
from mpi4py import MPI
from cuquantum import Network from cuquantum import Network
from cupy.cuda import nccl from cupy.cuda import nccl
# Get the hostname
# hostname = socket.gethostname()
root = 0 root = 0
comm_mpi = MPI.COMM_WORLD comm_mpi = MPI.COMM_WORLD
rank = comm_mpi.Get_rank() rank = comm_mpi.Get_rank()
size = comm_mpi.Get_size() size = comm_mpi.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
cp.cuda.Device(device_id).use() cp.cuda.Device(device_id).use()
@@ -146,27 +115,18 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
# Perform circuit conversion # Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.state_vector_operands() operands = myconvertor.state_vector_operands()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
network = Network(*operands) network = Network(*operands)
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path( path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
) )
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks. # Select the best path from all ranks.
opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
# if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks. # Broadcast info from the sender to all other ranks.
info = comm_mpi.bcast(info, sender) info = comm_mpi.bcast(info, sender)
@@ -184,12 +144,8 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
) )
slices = range(slice_begin, slice_end) slices = range(slice_begin, slice_end)
# print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for. # Contract the group of slices the process is responsible for.
result = network.contract(slices=slices) result = network.contract(slices=slices)
# print(f"Process {rank} result shape is : {result.shape}.")
# print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root. # Sum the partial contribution from each process on root.
stream_ptr = cp.cuda.get_current_stream().ptr stream_ptr = cp.cuda.get_current_stream().ptr
@@ -206,21 +162,22 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
return result, rank return result, rank
def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8): def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
from mpi4py import MPI # this line initializes MPI """Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through NCCL.
import socket The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system.
The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
After pathfinding the optimal path is used in the actual contraction to give an expectation value.
"""
from mpi4py import MPI
from cuquantum import Network from cuquantum import Network
from cupy.cuda import nccl from cupy.cuda import nccl
# Get the hostname
# hostname = socket.gethostname()
root = 0 root = 0
comm_mpi = MPI.COMM_WORLD comm_mpi = MPI.COMM_WORLD
rank = comm_mpi.Get_rank() rank = comm_mpi.Get_rank()
size = comm_mpi.Get_size() size = comm_mpi.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
cp.cuda.Device(device_id).use() cp.cuda.Device(device_id).use()
@@ -232,30 +189,20 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
# Perform circuit conversion # Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.expectation_operands( operands = myconvertor.expectation_operands(
pauli_string_gen(qibo_circ.nqubits, pauli_string) pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
) )
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
network = Network(*operands) network = Network(*operands)
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path( path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
) )
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks. # Select the best path from all ranks.
opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
# if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks. # Broadcast info from the sender to all other ranks.
info = comm_mpi.bcast(info, sender) info = comm_mpi.bcast(info, sender)
@@ -273,12 +220,8 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
) )
slices = range(slice_begin, slice_end) slices = range(slice_begin, slice_end)
# print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for. # Contract the group of slices the process is responsible for.
result = network.contract(slices=slices) result = network.contract(slices=slices)
# print(f"Process {rank} result shape is : {result.shape}.")
# print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root. # Sum the partial contribution from each process on root.
stream_ptr = cp.cuda.get_current_stream().ptr stream_ptr = cp.cuda.get_current_stream().ptr
@@ -295,57 +238,44 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
return result, rank return result, rank
def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8): def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
"""Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through MPI.
The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system.
The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
After pathfinding the optimal path is used in the actual contraction to give an expectation value.
"""
from mpi4py import MPI # this line initializes MPI from mpi4py import MPI # this line initializes MPI
import socket
from cuquantum import Network from cuquantum import Network
# Get the hostname
# hostname = socket.gethostname()
root = 0 root = 0
comm = MPI.COMM_WORLD comm = MPI.COMM_WORLD
rank = comm.Get_rank() rank = comm.Get_rank()
size = comm.Get_size() size = comm.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
# Perform circuit conversion # Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.expectation_operands(
pauli_string_gen(qibo_circ.nqubits, pauli_string)
)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
# Broadcast the operand data. operands = myconvertor.expectation_operands(
# operands = comm.bcast(operands, root) pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
)
# Assign the device for each process. # Assign the device for each process.
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
# dev = cp.cuda.Device(device_id)
# free_mem, total_mem = dev.mem_info
# print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
# Create network object. # Create network object.
network = Network(*operands, options={"device_id": device_id}) network = Network(*operands, options={"device_id": device_id})
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path( path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
) )
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks. # Select the best path from all ranks.
opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
# if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks. # Broadcast info from the sender to all other ranks.
info = comm.bcast(info, sender) info = comm.bcast(info, sender)
@@ -363,12 +293,8 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
) )
slices = range(slice_begin, slice_end) slices = range(slice_begin, slice_end)
# print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for. # Contract the group of slices the process is responsible for.
result = network.contract(slices=slices) result = network.contract(slices=slices)
# print(f"Process {rank} result shape is : {result.shape}.")
# print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root. # Sum the partial contribution from each process on root.
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root) result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
@@ -377,8 +303,7 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
def dense_vector_mps(qibo_circ, gate_algo, datatype): def dense_vector_mps(qibo_circ, gate_algo, datatype):
"""Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector. """Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector."""
"""
myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype) myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype)
mps_helper = MPSContractionHelper(myconvertor.num_qubits) mps_helper = MPSContractionHelper(myconvertor.num_qubits)
@@ -388,7 +313,7 @@ def dense_vector_mps(qibo_circ, gate_algo, datatype):
def pauli_string_gen(nqubits, pauli_string_pattern): def pauli_string_gen(nqubits, pauli_string_pattern):
""" Used internally to generate the string based on given pattern and number of qubit. """Used internally to generate the string based on given pattern and number of qubit.
Example: pattern: "XZ", number of qubit: 7, output = XZXZXZX Example: pattern: "XZ", number of qubit: 7, output = XZXZXZX
""" """
if nqubits <= 0: if nqubits <= 0:

View File

@@ -1,5 +1,7 @@
from cuquantum import CircuitToEinsum, contract, contract_path, tensor from cuquantum import CircuitToEinsum, contract, contract_path, tensor
# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
class MPSContractionHelper: class MPSContractionHelper:
""" """
@@ -85,7 +87,7 @@ class MPSContractionHelper:
self, mps_tensors, operator, qubits, options=None, normalize=False self, mps_tensors, operator, qubits, options=None, normalize=False
): ):
""" """
Contract the corresponding tensor network to form the state vector representation of the MPS. Contract the corresponding tensor network to form the expectation of the MPS.
Args: Args:
mps_tensors: A list of rank-3 ndarray-like tensor objects. mps_tensors: A list of rank-3 ndarray-like tensor objects.