|
|
|
|
@@ -1,7 +1,5 @@
|
|
|
|
|
from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum
|
|
|
|
|
from cuquantum import contract
|
|
|
|
|
from cuquantum import cutensornet as cutn
|
|
|
|
|
import multiprocessing
|
|
|
|
|
from cupy.cuda.runtime import getDeviceCount
|
|
|
|
|
import cupy as cp
|
|
|
|
|
|
|
|
|
|
@@ -10,15 +8,17 @@ from qibotn.mps_contraction_helper import MPSContractionHelper
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def dense_vector_tn(qibo_circ, datatype):
|
|
|
|
|
"""Convert qibo circuit to tensornet (TN) format and perform contraction to dense vector."""
|
|
|
|
|
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
|
|
|
|
|
return contract(*myconvertor.state_vector_operands())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def expectation_pauli_tn(qibo_circ, datatype, pauli_string):
|
|
|
|
|
def expectation_pauli_tn(qibo_circ, datatype, pauli_string_pattern):
|
|
|
|
|
"""Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string."""
|
|
|
|
|
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
|
|
|
|
|
return contract(
|
|
|
|
|
*myconvertor.expectation_operands(
|
|
|
|
|
pauli_string_gen(qibo_circ.nqubits, pauli_string)
|
|
|
|
|
pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
@@ -30,54 +30,35 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
|
|
|
|
|
After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from mpi4py import MPI # this line initializes MPI
|
|
|
|
|
import socket
|
|
|
|
|
from mpi4py import MPI
|
|
|
|
|
from cuquantum import Network
|
|
|
|
|
|
|
|
|
|
# Get the hostname
|
|
|
|
|
# hostname = socket.gethostname()
|
|
|
|
|
|
|
|
|
|
root = 0
|
|
|
|
|
comm = MPI.COMM_WORLD
|
|
|
|
|
rank = comm.Get_rank()
|
|
|
|
|
size = comm.Get_size()
|
|
|
|
|
# mem_avail = cp.cuda.Device().mem_info[0]
|
|
|
|
|
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
|
|
|
|
|
|
|
|
|
|
device_id = rank % getDeviceCount()
|
|
|
|
|
|
|
|
|
|
# Perform circuit conversion
|
|
|
|
|
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
|
|
|
|
|
# mem_avail = cp.cuda.Device().mem_info[0]
|
|
|
|
|
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
|
|
|
|
|
operands = myconvertor.state_vector_operands()
|
|
|
|
|
# mem_avail = cp.cuda.Device().mem_info[0]
|
|
|
|
|
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
|
|
|
|
|
|
|
|
|
|
# Broadcast the operand data.
|
|
|
|
|
# operands = comm.bcast(operands, root)
|
|
|
|
|
operands = myconvertor.state_vector_operands()
|
|
|
|
|
|
|
|
|
|
# Assign the device for each process.
|
|
|
|
|
device_id = rank % getDeviceCount()
|
|
|
|
|
|
|
|
|
|
# dev = cp.cuda.Device(device_id)
|
|
|
|
|
# free_mem, total_mem = dev.mem_info
|
|
|
|
|
# print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
|
|
|
|
|
|
|
|
|
|
# Create network object.
|
|
|
|
|
network = Network(*operands, options={"device_id": device_id})
|
|
|
|
|
|
|
|
|
|
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
|
|
|
|
|
path, info = network.contract_path(
|
|
|
|
|
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
|
|
|
|
|
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
|
|
|
|
|
)
|
|
|
|
|
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
|
|
|
|
|
|
|
|
|
|
# Select the best path from all ranks.
|
|
|
|
|
opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
|
|
|
|
|
|
|
|
|
|
# if rank == root:
|
|
|
|
|
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
|
|
|
|
|
|
|
|
|
|
# Broadcast info from the sender to all other ranks.
|
|
|
|
|
info = comm.bcast(info, sender)
|
|
|
|
|
|
|
|
|
|
@@ -95,45 +76,30 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
|
|
|
|
|
)
|
|
|
|
|
slices = range(slice_begin, slice_end)
|
|
|
|
|
|
|
|
|
|
# print(f"Process {rank} is processing slice range: {slices}.")
|
|
|
|
|
|
|
|
|
|
# Contract the group of slices the process is responsible for.
|
|
|
|
|
result = network.contract(slices=slices)
|
|
|
|
|
# print(f"Process {rank} result shape is : {result.shape}.")
|
|
|
|
|
# print(f"Process {rank} result size is : {result.nbytes}.")
|
|
|
|
|
|
|
|
|
|
# Sum the partial contribution from each process on root.
|
|
|
|
|
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
|
|
|
|
|
|
|
|
|
|
num_slices = opt_info.num_slices#Andy
|
|
|
|
|
chunk, extra = num_slices // size, num_slices % size#Andy
|
|
|
|
|
slice_begin = rank * chunk + min(rank, extra)#Andy
|
|
|
|
|
slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy
|
|
|
|
|
slices = range(slice_begin, slice_end)#Andy
|
|
|
|
|
result = network.contract(slices=slices)
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
return result, rank
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
|
|
|
|
|
from mpi4py import MPI # this line initializes MPI
|
|
|
|
|
import socket
|
|
|
|
|
"""Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through NCCL.
|
|
|
|
|
The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
|
|
|
|
|
The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
|
|
|
|
|
After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
|
|
|
|
|
"""
|
|
|
|
|
from mpi4py import MPI
|
|
|
|
|
from cuquantum import Network
|
|
|
|
|
from cupy.cuda import nccl
|
|
|
|
|
|
|
|
|
|
# Get the hostname
|
|
|
|
|
# hostname = socket.gethostname()
|
|
|
|
|
|
|
|
|
|
root = 0
|
|
|
|
|
comm_mpi = MPI.COMM_WORLD
|
|
|
|
|
rank = comm_mpi.Get_rank()
|
|
|
|
|
size = comm_mpi.Get_size()
|
|
|
|
|
# mem_avail = cp.cuda.Device().mem_info[0]
|
|
|
|
|
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
|
|
|
|
|
|
|
|
|
|
device_id = rank % getDeviceCount()
|
|
|
|
|
|
|
|
|
|
cp.cuda.Device(device_id).use()
|
|
|
|
|
@@ -145,27 +111,18 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
|
|
|
|
|
|
|
|
|
|
# Perform circuit conversion
|
|
|
|
|
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
|
|
|
|
|
# mem_avail = cp.cuda.Device().mem_info[0]
|
|
|
|
|
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
|
|
|
|
|
operands = myconvertor.state_vector_operands()
|
|
|
|
|
# mem_avail = cp.cuda.Device().mem_info[0]
|
|
|
|
|
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
|
|
|
|
|
|
|
|
|
|
network = Network(*operands)
|
|
|
|
|
|
|
|
|
|
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
|
|
|
|
|
path, info = network.contract_path(
|
|
|
|
|
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
|
|
|
|
|
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
|
|
|
|
|
|
|
|
|
|
# Select the best path from all ranks.
|
|
|
|
|
opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
|
|
|
|
|
|
|
|
|
|
# if rank == root:
|
|
|
|
|
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
|
|
|
|
|
|
|
|
|
|
# Broadcast info from the sender to all other ranks.
|
|
|
|
|
info = comm_mpi.bcast(info, sender)
|
|
|
|
|
|
|
|
|
|
@@ -183,12 +140,8 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
|
|
|
|
|
)
|
|
|
|
|
slices = range(slice_begin, slice_end)
|
|
|
|
|
|
|
|
|
|
# print(f"Process {rank} is processing slice range: {slices}.")
|
|
|
|
|
|
|
|
|
|
# Contract the group of slices the process is responsible for.
|
|
|
|
|
result = network.contract(slices=slices)
|
|
|
|
|
# print(f"Process {rank} result shape is : {result.shape}.")
|
|
|
|
|
# print(f"Process {rank} result size is : {result.nbytes}.")
|
|
|
|
|
|
|
|
|
|
# Sum the partial contribution from each process on root.
|
|
|
|
|
stream_ptr = cp.cuda.get_current_stream().ptr
|
|
|
|
|
@@ -205,21 +158,22 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
|
|
|
|
|
return result, rank
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
|
|
|
|
|
from mpi4py import MPI # this line initializes MPI
|
|
|
|
|
import socket
|
|
|
|
|
def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
|
|
|
|
|
"""Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through NCCL.
|
|
|
|
|
The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
|
|
|
|
|
The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system.
|
|
|
|
|
The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
|
|
|
|
|
After pathfinding the optimal path is used in the actual contraction to give an expectation value.
|
|
|
|
|
"""
|
|
|
|
|
from mpi4py import MPI
|
|
|
|
|
from cuquantum import Network
|
|
|
|
|
from cupy.cuda import nccl
|
|
|
|
|
|
|
|
|
|
# Get the hostname
|
|
|
|
|
# hostname = socket.gethostname()
|
|
|
|
|
|
|
|
|
|
root = 0
|
|
|
|
|
comm_mpi = MPI.COMM_WORLD
|
|
|
|
|
rank = comm_mpi.Get_rank()
|
|
|
|
|
size = comm_mpi.Get_size()
|
|
|
|
|
# mem_avail = cp.cuda.Device().mem_info[0]
|
|
|
|
|
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
|
|
|
|
|
|
|
|
|
|
device_id = rank % getDeviceCount()
|
|
|
|
|
|
|
|
|
|
cp.cuda.Device(device_id).use()
|
|
|
|
|
@@ -231,30 +185,20 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
|
|
|
|
|
|
|
|
|
|
# Perform circuit conversion
|
|
|
|
|
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
|
|
|
|
|
# mem_avail = cp.cuda.Device().mem_info[0]
|
|
|
|
|
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
|
|
|
|
|
operands = myconvertor.expectation_operands(
|
|
|
|
|
pauli_string_gen(qibo_circ.nqubits, pauli_string)
|
|
|
|
|
pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# mem_avail = cp.cuda.Device().mem_info[0]
|
|
|
|
|
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
|
|
|
|
|
|
|
|
|
|
network = Network(*operands)
|
|
|
|
|
|
|
|
|
|
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
|
|
|
|
|
path, info = network.contract_path(
|
|
|
|
|
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
|
|
|
|
|
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
|
|
|
|
|
|
|
|
|
|
# Select the best path from all ranks.
|
|
|
|
|
opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
|
|
|
|
|
|
|
|
|
|
# if rank == root:
|
|
|
|
|
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
|
|
|
|
|
|
|
|
|
|
# Broadcast info from the sender to all other ranks.
|
|
|
|
|
info = comm_mpi.bcast(info, sender)
|
|
|
|
|
|
|
|
|
|
@@ -272,12 +216,8 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
|
|
|
|
|
)
|
|
|
|
|
slices = range(slice_begin, slice_end)
|
|
|
|
|
|
|
|
|
|
# print(f"Process {rank} is processing slice range: {slices}.")
|
|
|
|
|
|
|
|
|
|
# Contract the group of slices the process is responsible for.
|
|
|
|
|
result = network.contract(slices=slices)
|
|
|
|
|
# print(f"Process {rank} result shape is : {result.shape}.")
|
|
|
|
|
# print(f"Process {rank} result size is : {result.nbytes}.")
|
|
|
|
|
|
|
|
|
|
# Sum the partial contribution from each process on root.
|
|
|
|
|
stream_ptr = cp.cuda.get_current_stream().ptr
|
|
|
|
|
@@ -294,57 +234,44 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
|
|
|
|
|
return result, rank
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
|
|
|
|
|
def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
|
|
|
|
|
"""Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through MPI.
|
|
|
|
|
The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
|
|
|
|
|
The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system.
|
|
|
|
|
The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
|
|
|
|
|
After pathfinding the optimal path is used in the actual contraction to give an expectation value.
|
|
|
|
|
"""
|
|
|
|
|
from mpi4py import MPI # this line initializes MPI
|
|
|
|
|
import socket
|
|
|
|
|
from cuquantum import Network
|
|
|
|
|
|
|
|
|
|
# Get the hostname
|
|
|
|
|
# hostname = socket.gethostname()
|
|
|
|
|
|
|
|
|
|
root = 0
|
|
|
|
|
comm = MPI.COMM_WORLD
|
|
|
|
|
rank = comm.Get_rank()
|
|
|
|
|
size = comm.Get_size()
|
|
|
|
|
# mem_avail = cp.cuda.Device().mem_info[0]
|
|
|
|
|
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
|
|
|
|
|
|
|
|
|
|
device_id = rank % getDeviceCount()
|
|
|
|
|
|
|
|
|
|
# Perform circuit conversion
|
|
|
|
|
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
|
|
|
|
|
# mem_avail = cp.cuda.Device().mem_info[0]
|
|
|
|
|
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
|
|
|
|
|
operands = myconvertor.expectation_operands(
|
|
|
|
|
pauli_string_gen(qibo_circ.nqubits, pauli_string)
|
|
|
|
|
)
|
|
|
|
|
# mem_avail = cp.cuda.Device().mem_info[0]
|
|
|
|
|
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
|
|
|
|
|
|
|
|
|
|
# Broadcast the operand data.
|
|
|
|
|
# operands = comm.bcast(operands, root)
|
|
|
|
|
operands = myconvertor.expectation_operands(
|
|
|
|
|
pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
# Assign the device for each process.
|
|
|
|
|
device_id = rank % getDeviceCount()
|
|
|
|
|
|
|
|
|
|
# dev = cp.cuda.Device(device_id)
|
|
|
|
|
# free_mem, total_mem = dev.mem_info
|
|
|
|
|
# print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
|
|
|
|
|
|
|
|
|
|
# Create network object.
|
|
|
|
|
network = Network(*operands, options={"device_id": device_id})
|
|
|
|
|
|
|
|
|
|
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
|
|
|
|
|
path, info = network.contract_path(
|
|
|
|
|
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
|
|
|
|
|
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
|
|
|
|
|
)
|
|
|
|
|
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
|
|
|
|
|
|
|
|
|
|
# Select the best path from all ranks.
|
|
|
|
|
opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
|
|
|
|
|
|
|
|
|
|
# if rank == root:
|
|
|
|
|
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
|
|
|
|
|
|
|
|
|
|
# Broadcast info from the sender to all other ranks.
|
|
|
|
|
info = comm.bcast(info, sender)
|
|
|
|
|
|
|
|
|
|
@@ -362,12 +289,8 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
|
|
|
|
|
)
|
|
|
|
|
slices = range(slice_begin, slice_end)
|
|
|
|
|
|
|
|
|
|
# print(f"Process {rank} is processing slice range: {slices}.")
|
|
|
|
|
|
|
|
|
|
# Contract the group of slices the process is responsible for.
|
|
|
|
|
result = network.contract(slices=slices)
|
|
|
|
|
# print(f"Process {rank} result shape is : {result.shape}.")
|
|
|
|
|
# print(f"Process {rank} result size is : {result.nbytes}.")
|
|
|
|
|
|
|
|
|
|
# Sum the partial contribution from each process on root.
|
|
|
|
|
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
|
|
|
|
|
@@ -376,8 +299,7 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def dense_vector_mps(qibo_circ, gate_algo, datatype):
|
|
|
|
|
"""Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector.
|
|
|
|
|
"""
|
|
|
|
|
"""Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector."""
|
|
|
|
|
myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype)
|
|
|
|
|
mps_helper = MPSContractionHelper(myconvertor.num_qubits)
|
|
|
|
|
|
|
|
|
|
@@ -387,7 +309,7 @@ def dense_vector_mps(qibo_circ, gate_algo, datatype):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def pauli_string_gen(nqubits, pauli_string_pattern):
|
|
|
|
|
""" Used internally to generate the string based on given pattern and number of qubit.
|
|
|
|
|
"""Used internally to generate the string based on given pattern and number of qubit.
|
|
|
|
|
Example: pattern: "XZ", number of qubit: 7, output = XZXZXZX
|
|
|
|
|
"""
|
|
|
|
|
if nqubits <= 0:
|
|
|
|
|
|