Format with black

This commit is contained in:
tankya2
2024-01-24 11:47:32 +08:00
committed by yangliwei
parent fe36a84e74
commit c2d2c8318f
3 changed files with 275 additions and 214 deletions

View File

@@ -110,7 +110,6 @@ class QiboCircuitToEinsum:
self.basis_map = {"0": state_0, "1": state_1} self.basis_map = {"0": state_0, "1": state_1}
def init_inverse_circuit(self, circuit): def init_inverse_circuit(self, circuit):
self.gate_tensors_inverse = [] self.gate_tensors_inverse = []
gates_qubits_inverse = [] gates_qubits_inverse = []
@@ -131,14 +130,13 @@ class QiboCircuitToEinsum:
# self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit. # self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit.
self.active_qubits_inverse = np.unique(gates_qubits_inverse) self.active_qubits_inverse = np.unique(gates_qubits_inverse)
def get_pauli_gates(self, pauli_map, dtype="complex128", backend=cp):
def get_pauli_gates(self, pauli_map, dtype='complex128', backend=cp):
""" """
Populate the gates for all pauli operators. Populate the gates for all pauli operators.
Args: Args:
pauli_map: A dictionary mapping qubits to pauli operators. pauli_map: A dictionary mapping qubits to pauli operators.
dtype: Data type for the tensor operands. dtype: Data type for the tensor operands.
backend: The package the tensor operands belong to. backend: The package the tensor operands belong to.
@@ -146,70 +144,74 @@ class QiboCircuitToEinsum:
A sequence of pauli gates. A sequence of pauli gates.
""" """
asarray = backend.asarray asarray = backend.asarray
pauli_i = asarray([[1,0], [0,1]], dtype=dtype) pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype)
pauli_x = asarray([[0,1], [1,0]], dtype=dtype) pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype)
pauli_y = asarray([[0,-1j], [1j,0]], dtype=dtype) pauli_y = asarray([[0, -1j], [1j, 0]], dtype=dtype)
pauli_z = asarray([[1,0], [0,-1]], dtype=dtype) pauli_z = asarray([[1, 0], [0, -1]], dtype=dtype)
operand_map = {'I': pauli_i, operand_map = {"I": pauli_i, "X": pauli_x, "Y": pauli_y, "Z": pauli_z}
'X': pauli_x,
'Y': pauli_y,
'Z': pauli_z}
gates = [] gates = []
for qubit, pauli_char in pauli_map.items(): for qubit, pauli_char in pauli_map.items():
operand = operand_map.get(pauli_char) operand = operand_map.get(pauli_char)
if operand is None: if operand is None:
raise ValueError('pauli string character must be one of I/X/Y/Z') raise ValueError("pauli string character must be one of I/X/Y/Z")
gates.append((operand, (qubit,))) gates.append((operand, (qubit,)))
return gates return gates
def expectation_operands(self, pauli_string): def expectation_operands(self, pauli_string):
#assign pauli string to qubit # assign pauli string to qubit
#_get_forward_inverse_metadata() # _get_forward_inverse_metadata()
input_bitstring = "0" * self.circuit.nqubits #Need all qubits! input_bitstring = "0" * self.circuit.nqubits # Need all qubits!
input_operands = self._get_bitstring_tensors(input_bitstring) input_operands = self._get_bitstring_tensors(input_bitstring)
pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string)) pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string))
pauli_map = pauli_string pauli_map = pauli_string
coned_qubits = pauli_map.keys() coned_qubits = pauli_map.keys()
( (
mode_labels, mode_labels,
qubits_frontier, qubits_frontier,
next_frontier, next_frontier,
) = self._init_mode_labels_from_qubits(range(self.circuit.nqubits)) ) = self._init_mode_labels_from_qubits(range(self.circuit.nqubits))
gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands( gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands(
self.gate_tensors, qubits_frontier, next_frontier self.gate_tensors, qubits_frontier, next_frontier
) )
operands = input_operands + gate_operands operands = input_operands + gate_operands
mode_labels += gate_mode_labels mode_labels += gate_mode_labels
self.init_inverse_circuit(self.circuit.invert()) self.init_inverse_circuit(self.circuit.invert())
next_frontier = max(qubits_frontier.values()) + 1 next_frontier = max(qubits_frontier.values()) + 1
#input_mode_labels, input_operands, qubits_frontier, next_frontier, inverse_gates = self._get_forward_inverse_metadata(coned_qubits) # input_mode_labels, input_operands, qubits_frontier, next_frontier, inverse_gates = self._get_forward_inverse_metadata(coned_qubits)
pauli_gates = self.get_pauli_gates(
pauli_map, dtype=self.dtype, backend=self.backend
)
pauli_gates = self.get_pauli_gates(pauli_map, dtype=self.dtype, backend=self.backend)
gates_inverse = pauli_gates + self.gate_tensors_inverse gates_inverse = pauli_gates + self.gate_tensors_inverse
gate_mode_labels_inverse, gate_operands_inverse = self._parse_gates_to_mode_labels_operands( (
gate_mode_labels_inverse,
gate_operands_inverse,
) = self._parse_gates_to_mode_labels_operands(
gates_inverse, qubits_frontier, next_frontier gates_inverse, qubits_frontier, next_frontier
) )
mode_labels = mode_labels + gate_mode_labels_inverse + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)] mode_labels = (
operands = operands + gate_operands_inverse + operands[:self.circuit.nqubits] mode_labels
+ gate_mode_labels_inverse
operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y] + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)]
)
#expec = contract(*operand_exp_interleave) operands = operands + gate_operands_inverse + operands[: self.circuit.nqubits]
#print(expec)
''' operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
# expec = contract(*operand_exp_interleave)
# print(expec)
"""
gate_mode_labels, gate_operands = circ_utils.parse_gates_to_mode_labels_operands(gates, gate_mode_labels, gate_operands = circ_utils.parse_gates_to_mode_labels_operands(gates,
qubits_frontier, qubits_frontier,
next_frontier) next_frontier)
@@ -219,5 +221,5 @@ class QiboCircuitToEinsum:
output_mode_labels = [] output_mode_labels = []
expression = circ_utils.convert_mode_labels_to_expression(mode_labels, output_mode_labels) expression = circ_utils.convert_mode_labels_to_expression(mode_labels, output_mode_labels)
''' """
return operand_exp_interleave return operand_exp_interleave

View File

@@ -19,8 +19,6 @@ class QiboTNBackend(NumpyBackend):
or platform == "cu_tensornet_expectation" or platform == "cu_tensornet_expectation"
or platform == "cu_tensornet_nccl" or platform == "cu_tensornet_nccl"
or platform == "cu_tensornet_nccl_expectation" or platform == "cu_tensornet_nccl_expectation"
): # pragma: no cover ): # pragma: no cover
self.platform = platform self.platform = platform
else: else:
@@ -72,45 +70,44 @@ class QiboTNBackend(NumpyBackend):
state = cutn.eval_mps(circuit, gate_algo, self.dtype) state = cutn.eval_mps(circuit, gate_algo, self.dtype)
if self.platform == "qu_tensornet": if self.platform == "qu_tensornet":
# init_state = np.random.random(2**circuit.nqubits) + 1j * np.random.random(2**circuit.nqubits)
#init_state = np.random.random(2**circuit.nqubits) + 1j * np.random.random(2**circuit.nqubits) # init_state = init_state / np.sqrt((np.abs(init_state) ** 2).sum())
#init_state = init_state / np.sqrt((np.abs(init_state) ** 2).sum())
init_state = np.zeros(2**circuit.nqubits, dtype=self.dtype) init_state = np.zeros(2**circuit.nqubits, dtype=self.dtype)
init_state[0] = 1.0 init_state[0] = 1.0
state = quimb.eval(circuit.to_qasm(), init_state, backend="numpy") state = quimb.eval(circuit.to_qasm(), init_state, backend="numpy")
if self.platform == "cu_tensornet_mpi": if self.platform == "cu_tensornet_mpi":
if initial_state is not None: if initial_state is not None:
raise_error(NotImplementedError, "QiboTN cannot support initial state.") raise_error(NotImplementedError, "QiboTN cannot support initial state.")
#state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) # state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32)
state, rank = cutn.eval_tn_MPI_2(circuit, self.dtype,32) state, rank = cutn.eval_tn_MPI_2(circuit, self.dtype, 32)
if rank > 0: if rank > 0:
state = np.array(0) state = np.array(0)
if self.platform == "cu_tensornet_nccl": if self.platform == "cu_tensornet_nccl":
if initial_state is not None: if initial_state is not None:
raise_error(NotImplementedError, "QiboTN cannot support initial state.") raise_error(NotImplementedError, "QiboTN cannot support initial state.")
#state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) # state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32)
state, rank = cutn.eval_tn_nccl(circuit, self.dtype,32) state, rank = cutn.eval_tn_nccl(circuit, self.dtype, 32)
if rank > 0: if rank > 0:
state = np.array(0) state = np.array(0)
if self.platform == "cu_tensornet_expectation": if self.platform == "cu_tensornet_expectation":
if initial_state is not None: if initial_state is not None:
raise_error(NotImplementedError, "QiboTN cannot support initial state.") raise_error(NotImplementedError, "QiboTN cannot support initial state.")
state = cutn.eval_expectation(circuit, self.dtype) state = cutn.eval_expectation(circuit, self.dtype)
if self.platform == "cu_tensornet_mpi_expectation": if self.platform == "cu_tensornet_mpi_expectation":
if initial_state is not None: if initial_state is not None:
raise_error(NotImplementedError, "QiboTN cannot support initial state.") raise_error(NotImplementedError, "QiboTN cannot support initial state.")
#state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) # state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32)
#state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32) # state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32)
state, rank = cutn.eval_tn_MPI_2_expectation(circuit, self.dtype,32) state, rank = cutn.eval_tn_MPI_2_expectation(circuit, self.dtype, 32)
if rank > 0: if rank > 0:
state = np.array(0) state = np.array(0)
@@ -118,10 +115,10 @@ class QiboTNBackend(NumpyBackend):
if initial_state is not None: if initial_state is not None:
raise_error(NotImplementedError, "QiboTN cannot support initial state.") raise_error(NotImplementedError, "QiboTN cannot support initial state.")
#state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32) # state, rank = cutn.eval_tn_MPI(circuit, self.dtype,32)
#state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32) # state, rank = cutn.eval_tn_MPI_expectation(circuit, self.dtype,32)
state, rank = cutn.eval_tn_nccl_expectation(circuit, self.dtype,32) state, rank = cutn.eval_tn_nccl_expectation(circuit, self.dtype, 32)
if rank > 0: if rank > 0:
state = np.array(0) state = np.array(0)

View File

@@ -14,9 +14,13 @@ def eval(qibo_circ, datatype):
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
return contract(*myconvertor.state_vector_operands()) return contract(*myconvertor.state_vector_operands())
def eval_expectation(qibo_circ, datatype): def eval_expectation(qibo_circ, datatype):
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
return contract(*myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))) return contract(
*myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
)
def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8): def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8):
from mpi4py import MPI # this line initializes MPI from mpi4py import MPI # this line initializes MPI
@@ -24,73 +28,79 @@ def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8):
from cuquantum import Network from cuquantum import Network
# Get the hostname # Get the hostname
#hostname = socket.gethostname() # hostname = socket.gethostname()
root = 0 root = 0
comm = MPI.COMM_WORLD comm = MPI.COMM_WORLD
rank = comm.Get_rank() rank = comm.Get_rank()
size = comm.Get_size() size = comm.Get_size()
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
# Perform circuit conversion # Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft convetor",mem_avail, "rank =",rank) # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.state_vector_operands() operands = myconvertor.state_vector_operands()
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
# Broadcast the operand data. # Broadcast the operand data.
#operands = comm.bcast(operands, root) # operands = comm.bcast(operands, root)
# Assign the device for each process. # Assign the device for each process.
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
#dev = cp.cuda.Device(device_id) # dev = cp.cuda.Device(device_id)
#free_mem, total_mem = dev.mem_info # free_mem, total_mem = dev.mem_info
#print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
# Create network object. # Create network object.
network = Network(*operands, options={'device_id' : device_id}) network = Network(*operands, options={"device_id": device_id})
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) path, info = network.contract_path(
#print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
)
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks. # Select the best path from all ranks.
opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
#if rank == root: # if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks. # Broadcast info from the sender to all other ranks.
info = comm.bcast(info, sender) info = comm.bcast(info, sender)
# Set path and slices. # Set path and slices.
path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) path, info = network.contract_path(
optimize={"path": info.path, "slicing": info.slices}
)
# Calculate this process's share of the slices. # Calculate this process's share of the slices.
num_slices = info.num_slices num_slices = info.num_slices
chunk, extra = num_slices // size, num_slices % size chunk, extra = num_slices // size, num_slices % size
slice_begin = rank * chunk + min(rank, extra) slice_begin = rank * chunk + min(rank, extra)
slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) slice_end = (
num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
)
slices = range(slice_begin, slice_end) slices = range(slice_begin, slice_end)
#print(f"Process {rank} is processing slice range: {slices}.") # print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for. # Contract the group of slices the process is responsible for.
result = network.contract(slices=slices) result = network.contract(slices=slices)
#print(f"Process {rank} result shape is : {result.shape}.") # print(f"Process {rank} result shape is : {result.shape}.")
#print(f"Process {rank} result size is : {result.nbytes}.") # print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root. # Sum the partial contribution from each process on root.
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root) result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
return result, rank return result, rank
def eval_tn_nccl(qibo_circ, datatype, n_samples=8): def eval_tn_nccl(qibo_circ, datatype, n_samples=8):
from mpi4py import MPI # this line initializes MPI from mpi4py import MPI # this line initializes MPI
import socket import socket
@@ -98,18 +108,18 @@ def eval_tn_nccl(qibo_circ, datatype, n_samples=8):
from cupy.cuda import nccl from cupy.cuda import nccl
# Get the hostname # Get the hostname
#hostname = socket.gethostname() # hostname = socket.gethostname()
root = 0 root = 0
comm_mpi = MPI.COMM_WORLD comm_mpi = MPI.COMM_WORLD
rank = comm_mpi.Get_rank() rank = comm_mpi.Get_rank()
size = comm_mpi.Get_size() size = comm_mpi.Get_size()
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
cp.cuda.Device(device_id).use() cp.cuda.Device(device_id).use()
# Set up the NCCL communicator. # Set up the NCCL communicator.
nccl_id = nccl.get_unique_id() if rank == root else None nccl_id = nccl.get_unique_id() if rank == root else None
nccl_id = comm_mpi.bcast(nccl_id, root) nccl_id = comm_mpi.bcast(nccl_id, root)
@@ -117,51 +127,66 @@ def eval_tn_nccl(qibo_circ, datatype, n_samples=8):
# Perform circuit conversion # Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft convetor",mem_avail, "rank =",rank) # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.state_vector_operands() operands = myconvertor.state_vector_operands()
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
network = Network(*operands) network = Network(*operands)
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
)
#print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks. # Select the best path from all ranks.
opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
#if rank == root: # if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks. # Broadcast info from the sender to all other ranks.
info = comm_mpi.bcast(info, sender) info = comm_mpi.bcast(info, sender)
# Set path and slices. # Set path and slices.
path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) path, info = network.contract_path(
optimize={"path": info.path, "slicing": info.slices}
)
# Calculate this process's share of the slices. # Calculate this process's share of the slices.
num_slices = info.num_slices num_slices = info.num_slices
chunk, extra = num_slices // size, num_slices % size chunk, extra = num_slices // size, num_slices % size
slice_begin = rank * chunk + min(rank, extra) slice_begin = rank * chunk + min(rank, extra)
slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) slice_end = (
num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
)
slices = range(slice_begin, slice_end) slices = range(slice_begin, slice_end)
#print(f"Process {rank} is processing slice range: {slices}.") # print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for. # Contract the group of slices the process is responsible for.
result = network.contract(slices=slices) result = network.contract(slices=slices)
#print(f"Process {rank} result shape is : {result.shape}.") # print(f"Process {rank} result shape is : {result.shape}.")
#print(f"Process {rank} result size is : {result.nbytes}.") # print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root. # Sum the partial contribution from each process on root.
stream_ptr = cp.cuda.get_current_stream().ptr stream_ptr = cp.cuda.get_current_stream().ptr
comm_nccl.reduce(result.data.ptr, result.data.ptr, result.size, nccl.NCCL_FLOAT64, nccl.NCCL_SUM, root, stream_ptr) comm_nccl.reduce(
result.data.ptr,
result.data.ptr,
result.size,
nccl.NCCL_FLOAT64,
nccl.NCCL_SUM,
root,
stream_ptr,
)
return result, rank return result, rank
def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8): def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8):
from mpi4py import MPI # this line initializes MPI from mpi4py import MPI # this line initializes MPI
import socket import socket
@@ -169,18 +194,18 @@ def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8):
from cupy.cuda import nccl from cupy.cuda import nccl
# Get the hostname # Get the hostname
#hostname = socket.gethostname() # hostname = socket.gethostname()
root = 0 root = 0
comm_mpi = MPI.COMM_WORLD comm_mpi = MPI.COMM_WORLD
rank = comm_mpi.Get_rank() rank = comm_mpi.Get_rank()
size = comm_mpi.Get_size() size = comm_mpi.Get_size()
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
cp.cuda.Device(device_id).use() cp.cuda.Device(device_id).use()
# Set up the NCCL communicator. # Set up the NCCL communicator.
nccl_id = nccl.get_unique_id() if rank == root else None nccl_id = nccl.get_unique_id() if rank == root else None
nccl_id = comm_mpi.bcast(nccl_id, root) nccl_id = comm_mpi.bcast(nccl_id, root)
@@ -188,50 +213,64 @@ def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8):
# Perform circuit conversion # Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft convetor",mem_avail, "rank =",rank) # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
network = Network(*operands) network = Network(*operands)
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) path, info = network.contract_path(
optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
)
#print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks. # Select the best path from all ranks.
opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
#if rank == root: # if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks. # Broadcast info from the sender to all other ranks.
info = comm_mpi.bcast(info, sender) info = comm_mpi.bcast(info, sender)
# Set path and slices. # Set path and slices.
path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) path, info = network.contract_path(
optimize={"path": info.path, "slicing": info.slices}
)
# Calculate this process's share of the slices. # Calculate this process's share of the slices.
num_slices = info.num_slices num_slices = info.num_slices
chunk, extra = num_slices // size, num_slices % size chunk, extra = num_slices // size, num_slices % size
slice_begin = rank * chunk + min(rank, extra) slice_begin = rank * chunk + min(rank, extra)
slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) slice_end = (
num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
)
slices = range(slice_begin, slice_end) slices = range(slice_begin, slice_end)
#print(f"Process {rank} is processing slice range: {slices}.") # print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for. # Contract the group of slices the process is responsible for.
result = network.contract(slices=slices) result = network.contract(slices=slices)
#print(f"Process {rank} result shape is : {result.shape}.") # print(f"Process {rank} result shape is : {result.shape}.")
#print(f"Process {rank} result size is : {result.nbytes}.") # print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root. # Sum the partial contribution from each process on root.
stream_ptr = cp.cuda.get_current_stream().ptr stream_ptr = cp.cuda.get_current_stream().ptr
comm_nccl.reduce(result.data.ptr, result.data.ptr, result.size, nccl.NCCL_FLOAT64, nccl.NCCL_SUM, root, stream_ptr) comm_nccl.reduce(
result.data.ptr,
result.data.ptr,
result.size,
nccl.NCCL_FLOAT64,
nccl.NCCL_SUM,
root,
stream_ptr,
)
return result, rank return result, rank
@@ -241,128 +280,144 @@ def eval_tn_MPI_2_expectation(qibo_circ, datatype, n_samples=8):
from cuquantum import Network from cuquantum import Network
# Get the hostname # Get the hostname
#hostname = socket.gethostname() # hostname = socket.gethostname()
root = 0 root = 0
comm = MPI.COMM_WORLD comm = MPI.COMM_WORLD
rank = comm.Get_rank() rank = comm.Get_rank()
size = comm.Get_size() size = comm.Get_size()
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
# Perform circuit conversion # Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft convetor",mem_avail, "rank =",rank) # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
# Broadcast the operand data. # Broadcast the operand data.
#operands = comm.bcast(operands, root) # operands = comm.bcast(operands, root)
# Assign the device for each process. # Assign the device for each process.
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
#dev = cp.cuda.Device(device_id) # dev = cp.cuda.Device(device_id)
#free_mem, total_mem = dev.mem_info # free_mem, total_mem = dev.mem_info
#print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
# Create network object. # Create network object.
network = Network(*operands, options={'device_id' : device_id}) network = Network(*operands, options={"device_id": device_id})
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}}) path, info = network.contract_path(
#print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
)
# print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.")
# Select the best path from all ranks. # Select the best path from all ranks.
opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
#if rank == root: # if rank == root:
# print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
# Broadcast info from the sender to all other ranks. # Broadcast info from the sender to all other ranks.
info = comm.bcast(info, sender) info = comm.bcast(info, sender)
# Set path and slices. # Set path and slices.
path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices}) path, info = network.contract_path(
optimize={"path": info.path, "slicing": info.slices}
)
# Calculate this process's share of the slices. # Calculate this process's share of the slices.
num_slices = info.num_slices num_slices = info.num_slices
chunk, extra = num_slices // size, num_slices % size chunk, extra = num_slices // size, num_slices % size
slice_begin = rank * chunk + min(rank, extra) slice_begin = rank * chunk + min(rank, extra)
slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra) slice_end = (
num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
)
slices = range(slice_begin, slice_end) slices = range(slice_begin, slice_end)
#print(f"Process {rank} is processing slice range: {slices}.") # print(f"Process {rank} is processing slice range: {slices}.")
# Contract the group of slices the process is responsible for. # Contract the group of slices the process is responsible for.
result = network.contract(slices=slices) result = network.contract(slices=slices)
#print(f"Process {rank} result shape is : {result.shape}.") # print(f"Process {rank} result shape is : {result.shape}.")
#print(f"Process {rank} result size is : {result.nbytes}.") # print(f"Process {rank} result size is : {result.nbytes}.")
# Sum the partial contribution from each process on root. # Sum the partial contribution from each process on root.
result = comm.reduce(sendobj=result, op=MPI.SUM, root=root) result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
return result, rank return result, rank
def eval_tn_MPI_expectation(qibo_circ, datatype, n_samples=8): def eval_tn_MPI_expectation(qibo_circ, datatype, n_samples=8):
from mpi4py import MPI # this line initializes MPI from mpi4py import MPI # this line initializes MPI
import socket import socket
# Get the hostname # Get the hostname
#hostname = socket.gethostname() # hostname = socket.gethostname()
ncpu_threads = multiprocessing.cpu_count() // 2 ncpu_threads = multiprocessing.cpu_count() // 2
comm = MPI.COMM_WORLD comm = MPI.COMM_WORLD
rank = comm.Get_rank() rank = comm.Get_rank()
size = comm.Get_size() size = comm.Get_size()
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
cp.cuda.Device(device_id).use() cp.cuda.Device(device_id).use()
handle = cutn.create() handle = cutn.create()
network_opts = cutn.NetworkOptions(handle=handle, blocking="auto") network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft network opts",mem_avail, "rank =",rank) # print("Mem avail: aft network opts",mem_avail, "rank =",rank)
cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm)) cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank) # print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank)
# Perform circuit conversion # Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
operands_interleave = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)) operands_interleave = myconvertor.expectation_operands(
#mem_avail = cp.cuda.Device().mem_info[0] PauliStringGen(qibo_circ.nqubits)
#print("Mem avail: aft convetor",mem_avail, "rank =",rank) )
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
# Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object. # Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object.
network = cutn.Network(*operands_interleave, options=network_opts) network = cutn.Network(*operands_interleave, options=network_opts)
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank) # print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank)
path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}}) path, opt_info = network.contract_path(
#mem_avail = cp.cuda.Device().mem_info[0] optimize={
#print("Mem avail: aft contract path",mem_avail, "rank =",rank) "samples": n_samples,
"threads": ncpu_threads,
"slicing": {"min_slices": max(16, size)},
}
)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft contract path",mem_avail, "rank =",rank)
# Execution: To execute the contraction using the optimal path found previously # Execution: To execute the contraction using the optimal path found previously
#print("opt_cost",opt_info.opt_cost, "Process =",rank) # print("opt_cost",opt_info.opt_cost, "Process =",rank)
num_slices = opt_info.num_slices # Andy
num_slices = opt_info.num_slices#Andy chunk, extra = num_slices // size, num_slices % size # Andy
chunk, extra = num_slices // size, num_slices % size#Andy slice_begin = rank * chunk + min(rank, extra) # Andy
slice_begin = rank * chunk + min(rank, extra)#Andy slice_end = (
slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
slices = range(slice_begin, slice_end)#Andy ) # Andy
slices = range(slice_begin, slice_end) # Andy
result = network.contract(slices=slices) result = network.contract(slices=slices)
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft contract",mem_avail, "rank =",rank) # print("Mem avail: aft contract",mem_avail, "rank =",rank)
cutn.destroy(handle) cutn.destroy(handle)
return result, rank return result, rank
def eval_tn_MPI(qibo_circ, datatype, n_samples=8): def eval_tn_MPI(qibo_circ, datatype, n_samples=8):
"""Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through MPI. """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through MPI.
The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution. The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
@@ -372,45 +427,52 @@ def eval_tn_MPI(qibo_circ, datatype, n_samples=8):
from mpi4py import MPI # this line initializes MPI from mpi4py import MPI # this line initializes MPI
import socket import socket
# Get the hostname # Get the hostname
#hostname = socket.gethostname() # hostname = socket.gethostname()
ncpu_threads = multiprocessing.cpu_count() // 2 ncpu_threads = multiprocessing.cpu_count() // 2
comm = MPI.COMM_WORLD comm = MPI.COMM_WORLD
rank = comm.Get_rank() rank = comm.Get_rank()
size = comm.Get_size() size = comm.Get_size()
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
device_id = rank % getDeviceCount() device_id = rank % getDeviceCount()
cp.cuda.Device(device_id).use() cp.cuda.Device(device_id).use()
handle = cutn.create() handle = cutn.create()
network_opts = cutn.NetworkOptions(handle=handle, blocking="auto") network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft network opts",mem_avail, "rank =",rank) # print("Mem avail: aft network opts",mem_avail, "rank =",rank)
cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm)) cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank) # print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank)
# Perform circuit conversion # Perform circuit conversion
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft convetor",mem_avail, "rank =",rank) # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
operands_interleave = myconvertor.state_vector_operands() operands_interleave = myconvertor.state_vector_operands()
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
# Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object. # Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object.
network = cutn.Network(*operands_interleave, options=network_opts) network = cutn.Network(*operands_interleave, options=network_opts)
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank) # print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank)
network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}}) network.contract_path(
#mem_avail = cp.cuda.Device().mem_info[0] optimize={
#print("Mem avail: aft contract path",mem_avail, "rank =",rank) "samples": n_samples,
"threads": ncpu_threads,
"slicing": {"min_slices": max(16, size)},
}
)
# mem_avail = cp.cuda.Device().mem_info[0]
# print("Mem avail: aft contract path",mem_avail, "rank =",rank)
# Execution: To execute the contraction using the optimal path found previously # Execution: To execute the contraction using the optimal path found previously
#print("opt_cost",opt_info.opt_cost, "Process =",rank) # print("opt_cost",opt_info.opt_cost, "Process =",rank)
''' """
path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}}) path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
num_slices = opt_info.num_slices#Andy num_slices = opt_info.num_slices#Andy
@@ -419,11 +481,11 @@ def eval_tn_MPI(qibo_circ, datatype, n_samples=8):
slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy
slices = range(slice_begin, slice_end)#Andy slices = range(slice_begin, slice_end)#Andy
result = network.contract(slices=slices) result = network.contract(slices=slices)
''' """
result = network.contract() result = network.contract()
#mem_avail = cp.cuda.Device().mem_info[0] # mem_avail = cp.cuda.Device().mem_info[0]
#print("Mem avail: aft contract",mem_avail, "rank =",rank) # print("Mem avail: aft contract",mem_avail, "rank =",rank)
cutn.destroy(handle) cutn.destroy(handle)
return result, rank return result, rank
@@ -437,18 +499,18 @@ def eval_mps(qibo_circ, gate_algo, datatype):
myconvertor.mps_tensors, {"handle": myconvertor.handle} myconvertor.mps_tensors, {"handle": myconvertor.handle}
) )
def PauliStringGen(nqubits): def PauliStringGen(nqubits):
if nqubits <= 0: if nqubits <= 0:
return "Invalid input. N should be a positive integer." return "Invalid input. N should be a positive integer."
#characters = 'IXYZ' # characters = 'IXYZ'
characters = 'XXXZ' characters = "XXXZ"
result = '' result = ""
for i in range(nqubits): for i in range(nqubits):
char_to_add = characters[i % len(characters)] char_to_add = characters[i % len(characters)]
result += char_to_add result += char_to_add
return result return result