Files
final-qibotn/src/qibotn/cutn.py
2024-01-30 10:22:39 +08:00

533 lines
20 KiB
Python

from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum
from cuquantum import contract
from cuquantum import cutensornet as cutn
import multiprocessing
from cupy.cuda.runtime import getDeviceCount
import cupy as cp
from qibotn.QiboCircuitToMPS import QiboCircuitToMPS
from qibotn.mps_contraction_helper import MPSContractionHelper
def eval(qibo_circ, datatype):
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
return contract(*myconvertor.state_vector_operands())
def eval_expectation(qibo_circ, datatype):
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
return contract(
*myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
)
def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8):
"""Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through MPI.
The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
"""
from mpi4py import MPI # this line initializes MPI
import socket
from cuquantum import Network
# Get the hostname
# hostname = socket.gethostname()
root = 0
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount()
# Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.state_vector_operands()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
# Broadcast the operand data.
# operands = comm.bcast(operands, root)
# Assign the device for each process.
device_id = rank % getDeviceCount()
# dev = cp.cuda.Device(device_id)
# free_mem, total_mem = dev.mem_info
# print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
# Create network object.
network = Network(*operands, options={"device_id": device_id})
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
)
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks.
opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
# if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks.
info = comm.bcast(info, sender)
# Set path and slices.
path, info = network.contract_path(
optimize={"path": info.path, "slicing": info.slices}
)
# Calculate this process's share of the slices.
num_slices = info.num_slices
chunk, extra = num_slices // size, num_slices % size
slice_begin = rank * chunk + min(rank, extra)
slice_end = (
num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
)
slices = range(slice_begin, slice_end)
# print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for.
result = network.contract(slices=slices)
# print(f"Process {rank} result shape is : {result.shape}.")
# print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root.
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
"""
path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
num_slices = opt_info.num_slices#Andy
chunk, extra = num_slices // size, num_slices % size#Andy
slice_begin = rank * chunk + min(rank, extra)#Andy
slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy
slices = range(slice_begin, slice_end)#Andy
result = network.contract(slices=slices)
"""
return result, rank
def eval_tn_nccl(qibo_circ, datatype, n_samples=8):
from mpi4py import MPI # this line initializes MPI
import socket
from cuquantum import Network
from cupy.cuda import nccl
# Get the hostname
# hostname = socket.gethostname()
root = 0
comm_mpi = MPI.COMM_WORLD
rank = comm_mpi.Get_rank()
size = comm_mpi.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount()
cp.cuda.Device(device_id).use()
# Set up the NCCL communicator.
nccl_id = nccl.get_unique_id() if rank == root else None
nccl_id = comm_mpi.bcast(nccl_id, root)
comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)
# Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.state_vector_operands()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
network = Network(*operands)
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
)
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks.
opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
# if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks.
info = comm_mpi.bcast(info, sender)
# Set path and slices.
path, info = network.contract_path(
optimize={"path": info.path, "slicing": info.slices}
)
# Calculate this process's share of the slices.
num_slices = info.num_slices
chunk, extra = num_slices // size, num_slices % size
slice_begin = rank * chunk + min(rank, extra)
slice_end = (
num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
)
slices = range(slice_begin, slice_end)
# print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for.
result = network.contract(slices=slices)
# print(f"Process {rank} result shape is : {result.shape}.")
# print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root.
stream_ptr = cp.cuda.get_current_stream().ptr
comm_nccl.reduce(
result.data.ptr,
result.data.ptr,
result.size,
nccl.NCCL_FLOAT64,
nccl.NCCL_SUM,
root,
stream_ptr,
)
return result, rank
def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8):
from mpi4py import MPI # this line initializes MPI
import socket
from cuquantum import Network
from cupy.cuda import nccl
# Get the hostname
# hostname = socket.gethostname()
root = 0
comm_mpi = MPI.COMM_WORLD
rank = comm_mpi.Get_rank()
size = comm_mpi.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount()
cp.cuda.Device(device_id).use()
# Set up the NCCL communicator.
nccl_id = nccl.get_unique_id() if rank == root else None
nccl_id = comm_mpi.bcast(nccl_id, root)
comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)
# Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
network = Network(*operands)
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
)
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks.
opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
# if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks.
info = comm_mpi.bcast(info, sender)
# Set path and slices.
path, info = network.contract_path(
optimize={"path": info.path, "slicing": info.slices}
)
# Calculate this process's share of the slices.
num_slices = info.num_slices
chunk, extra = num_slices // size, num_slices % size
slice_begin = rank * chunk + min(rank, extra)
slice_end = (
num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
)
slices = range(slice_begin, slice_end)
# print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for.
result = network.contract(slices=slices)
# print(f"Process {rank} result shape is : {result.shape}.")
# print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root.
stream_ptr = cp.cuda.get_current_stream().ptr
comm_nccl.reduce(
result.data.ptr,
result.data.ptr,
result.size,
nccl.NCCL_FLOAT64,
nccl.NCCL_SUM,
root,
stream_ptr,
)
return result, rank
def eval_tn_MPI_2_expectation(qibo_circ, datatype, n_samples=8):
from mpi4py import MPI # this line initializes MPI
import socket
from cuquantum import Network
# Get the hostname
# hostname = socket.gethostname()
root = 0
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount()
# Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
# Broadcast the operand data.
# operands = comm.bcast(operands, root)
# Assign the device for each process.
device_id = rank % getDeviceCount()
# dev = cp.cuda.Device(device_id)
# free_mem, total_mem = dev.mem_info
# print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
# Create network object.
network = Network(*operands, options={"device_id": device_id})
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
)
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks.
opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
# if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks.
info = comm.bcast(info, sender)
# Set path and slices.
path, info = network.contract_path(
optimize={"path": info.path, "slicing": info.slices}
)
# Calculate this process's share of the slices.
num_slices = info.num_slices
chunk, extra = num_slices // size, num_slices % size
slice_begin = rank * chunk + min(rank, extra)
slice_end = (
num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
)
slices = range(slice_begin, slice_end)
# print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for.
result = network.contract(slices=slices)
# print(f"Process {rank} result shape is : {result.shape}.")
# print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root.
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
return result, rank
def eval_tn_MPI_expectation(qibo_circ, datatype, n_samples=8):
from mpi4py import MPI # this line initializes MPI
import socket
# Get the hostname
# hostname = socket.gethostname()
ncpu_threads = multiprocessing.cpu_count() // 2
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount()
cp.cuda.Device(device_id).use()
handle = cutn.create()
network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft network opts",mem_avail, "rank =",rank)
cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank)
# Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
operands_interleave = myconvertor.expectation_operands(
PauliStringGen(qibo_circ.nqubits)
)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
# Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object.
network = cutn.Network(*operands_interleave, options=network_opts)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank)
path, opt_info = network.contract_path(
optimize={
"samples": n_samples,
"threads": ncpu_threads,
"slicing": {"min_slices": max(16, size)},
}
)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft contract path",mem_avail, "rank =",rank)
# Execution: To execute the contraction using the optimal path found previously
# print("opt_cost",opt_info.opt_cost, "Process =",rank)
num_slices = opt_info.num_slices # Andy
chunk, extra = num_slices // size, num_slices % size # Andy
slice_begin = rank * chunk + min(rank, extra) # Andy
slice_end = (
num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
) # Andy
slices = range(slice_begin, slice_end) # Andy
result = network.contract(slices=slices)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft contract",mem_avail, "rank =",rank)
cutn.destroy(handle)
return result, rank
def eval_tn_MPI(qibo_circ, datatype, n_samples=8):
"""Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through MPI.
The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
"""
from mpi4py import MPI # this line initializes MPI
import socket
# Get the hostname
# hostname = socket.gethostname()
ncpu_threads = multiprocessing.cpu_count() // 2
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount()
cp.cuda.Device(device_id).use()
handle = cutn.create()
network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft network opts",mem_avail, "rank =",rank)
cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank)
# Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands_interleave = myconvertor.state_vector_operands()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
# Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object.
network = cutn.Network(*operands_interleave, options=network_opts)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank)
network.contract_path(
optimize={
"samples": n_samples,
"threads": ncpu_threads,
"slicing": {"min_slices": max(16, size)},
}
)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft contract path",mem_avail, "rank =",rank)
# Execution: To execute the contraction using the optimal path found previously
# print("opt_cost",opt_info.opt_cost, "Process =",rank)
"""
path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
num_slices = opt_info.num_slices#Andy
chunk, extra = num_slices // size, num_slices % size#Andy
slice_begin = rank * chunk + min(rank, extra)#Andy
slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy
slices = range(slice_begin, slice_end)#Andy
result = network.contract(slices=slices)
"""
result = network.contract()
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft contract",mem_avail, "rank =",rank)
cutn.destroy(handle)
return result, rank
def eval_mps(qibo_circ, gate_algo, datatype):
myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype)
mps_helper = MPSContractionHelper(myconvertor.num_qubits)
return mps_helper.contract_state_vector(
myconvertor.mps_tensors, {"handle": myconvertor.handle}
)
def PauliStringGen(nqubits):
if nqubits <= 0:
return "Invalid input. N should be a positive integer."
# characters = 'IXYZ'
characters = "XXXZ"
result = ""
for i in range(nqubits):
char_to_add = characters[i % len(characters)]
result += char_to_add
return result