diff --git a/src/qibotn/MPSUtils.py b/src/qibotn/MPSUtils.py index fd1b4c7..4f84f67 100644 --- a/src/qibotn/MPSUtils.py +++ b/src/qibotn/MPSUtils.py @@ -2,6 +2,8 @@ import cupy as cp from cuquantum.cutensornet.experimental import contract_decompose from cuquantum import contract +# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb + def initial(num_qubits, dtype): """ diff --git a/src/qibotn/QiboCircuitConvertor.py b/src/qibotn/QiboCircuitConvertor.py index d3a0569..c59745b 100644 --- a/src/qibotn/QiboCircuitConvertor.py +++ b/src/qibotn/QiboCircuitConvertor.py @@ -1,6 +1,8 @@ import cupy as cp import numpy as np +# Reference: https://github.com/NVIDIA/cuQuantum/tree/main/python/samples/cutensornet/circuit_converter + class QiboCircuitToEinsum: """Convert a circuit to a Tensor Network (TN) representation. @@ -159,9 +161,7 @@ class QiboCircuitToEinsum: return gates def expectation_operands(self, pauli_string): - # assign pauli string to qubit - # _get_forward_inverse_metadata() - input_bitstring = "0" * self.circuit.nqubits # Need all qubits! + input_bitstring = "0" * self.circuit.nqubits input_operands = self._get_bitstring_tensors(input_bitstring) pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string)) @@ -185,8 +185,6 @@ class QiboCircuitToEinsum: next_frontier = max(qubits_frontier.values()) + 1 - # input_mode_labels, input_operands, qubits_frontier, next_frontier, inverse_gates = self._get_forward_inverse_metadata(coned_qubits) - pauli_gates = self.get_pauli_gates( pauli_map, dtype=self.dtype, backend=self.backend ) @@ -208,18 +206,4 @@ class QiboCircuitToEinsum: operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y] - # expec = contract(*operand_exp_interleave) - # print(expec) - - """ - gate_mode_labels, gate_operands = circ_utils.parse_gates_to_mode_labels_operands(gates, - qubits_frontier, - next_frontier) - - mode_labels = input_mode_labels + gate_mode_labels + [[qubits_frontier[ix]] for ix in self.qubits] - operands = input_operands + gate_operands + input_operands[:n_qubits] - - output_mode_labels = [] - expression = circ_utils.convert_mode_labels_to_expression(mode_labels, output_mode_labels) - """ return operand_exp_interleave diff --git a/src/qibotn/eval.py b/src/qibotn/eval.py index a9aeaac..b73418a 100644 --- a/src/qibotn/eval.py +++ b/src/qibotn/eval.py @@ -1,7 +1,5 @@ from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum from cuquantum import contract -from cuquantum import cutensornet as cutn -import multiprocessing from cupy.cuda.runtime import getDeviceCount import cupy as cp @@ -10,15 +8,17 @@ from qibotn.mps_contraction_helper import MPSContractionHelper def dense_vector_tn(qibo_circ, datatype): + """Convert qibo circuit to tensornet (TN) format and perform contraction to dense vector.""" myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) return contract(*myconvertor.state_vector_operands()) -def expectation_pauli_tn(qibo_circ, datatype, pauli_string): +def expectation_pauli_tn(qibo_circ, datatype, pauli_string_pattern): + """Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string.""" myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) return contract( *myconvertor.expectation_operands( - pauli_string_gen(qibo_circ.nqubits, pauli_string) + pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern) ) ) @@ -30,54 +30,35 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8): After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN. """ - from mpi4py import MPI # this line initializes MPI - import socket + from mpi4py import MPI from cuquantum import Network - # Get the hostname - # hostname = socket.gethostname() - root = 0 comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() - # mem_avail = cp.cuda.Device().mem_info[0] - # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + device_id = rank % getDeviceCount() # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - # mem_avail = cp.cuda.Device().mem_info[0] - # print("Mem avail: aft convetor",mem_avail, "rank =",rank) - operands = myconvertor.state_vector_operands() - # mem_avail = cp.cuda.Device().mem_info[0] - # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) - # Broadcast the operand data. - # operands = comm.bcast(operands, root) + operands = myconvertor.state_vector_operands() # Assign the device for each process. device_id = rank % getDeviceCount() - # dev = cp.cuda.Device(device_id) - # free_mem, total_mem = dev.mem_info - # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) - # Create network object. network = Network(*operands, options={"device_id": device_id}) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. path, info = network.contract_path( - optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} + optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}} ) - # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") # Select the best path from all ranks. opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) - # if rank == root: - # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") - # Broadcast info from the sender to all other ranks. info = comm.bcast(info, sender) @@ -95,45 +76,30 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8): ) slices = range(slice_begin, slice_end) - # print(f"Process {rank} is processing slice range: {slices}.") - # Contract the group of slices the process is responsible for. result = network.contract(slices=slices) - # print(f"Process {rank} result shape is : {result.shape}.") - # print(f"Process {rank} result size is : {result.nbytes}.") # Sum the partial contribution from each process on root. result = comm.reduce(sendobj=result, op=MPI.SUM, root=root) - """ - path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}}) - - num_slices = opt_info.num_slices#Andy - chunk, extra = num_slices // size, num_slices % size#Andy - slice_begin = rank * chunk + min(rank, extra)#Andy - slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy - slices = range(slice_begin, slice_end)#Andy - result = network.contract(slices=slices) - """ - return result, rank def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8): - from mpi4py import MPI # this line initializes MPI - import socket + """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through NCCL. + The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution. + The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread. + After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN. + """ + from mpi4py import MPI from cuquantum import Network from cupy.cuda import nccl - # Get the hostname - # hostname = socket.gethostname() - root = 0 comm_mpi = MPI.COMM_WORLD rank = comm_mpi.Get_rank() size = comm_mpi.Get_size() - # mem_avail = cp.cuda.Device().mem_info[0] - # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + device_id = rank % getDeviceCount() cp.cuda.Device(device_id).use() @@ -145,27 +111,18 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8): # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - # mem_avail = cp.cuda.Device().mem_info[0] - # print("Mem avail: aft convetor",mem_avail, "rank =",rank) operands = myconvertor.state_vector_operands() - # mem_avail = cp.cuda.Device().mem_info[0] - # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) network = Network(*operands) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. path, info = network.contract_path( - optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} + optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}} ) - # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") - # Select the best path from all ranks. opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) - # if rank == root: - # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") - # Broadcast info from the sender to all other ranks. info = comm_mpi.bcast(info, sender) @@ -183,12 +140,8 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8): ) slices = range(slice_begin, slice_end) - # print(f"Process {rank} is processing slice range: {slices}.") - # Contract the group of slices the process is responsible for. result = network.contract(slices=slices) - # print(f"Process {rank} result shape is : {result.shape}.") - # print(f"Process {rank} result size is : {result.nbytes}.") # Sum the partial contribution from each process on root. stream_ptr = cp.cuda.get_current_stream().ptr @@ -205,21 +158,22 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8): return result, rank -def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8): - from mpi4py import MPI # this line initializes MPI - import socket +def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_samples=8): + """Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through NCCL. + The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution. + The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system. + The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread. + After pathfinding the optimal path is used in the actual contraction to give an expectation value. + """ + from mpi4py import MPI from cuquantum import Network from cupy.cuda import nccl - # Get the hostname - # hostname = socket.gethostname() - root = 0 comm_mpi = MPI.COMM_WORLD rank = comm_mpi.Get_rank() size = comm_mpi.Get_size() - # mem_avail = cp.cuda.Device().mem_info[0] - # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + device_id = rank % getDeviceCount() cp.cuda.Device(device_id).use() @@ -231,30 +185,20 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8): # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - # mem_avail = cp.cuda.Device().mem_info[0] - # print("Mem avail: aft convetor",mem_avail, "rank =",rank) operands = myconvertor.expectation_operands( - pauli_string_gen(qibo_circ.nqubits, pauli_string) + pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern) ) - # mem_avail = cp.cuda.Device().mem_info[0] - # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) - network = Network(*operands) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. path, info = network.contract_path( - optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} + optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}} ) - # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") - # Select the best path from all ranks. opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) - # if rank == root: - # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") - # Broadcast info from the sender to all other ranks. info = comm_mpi.bcast(info, sender) @@ -272,12 +216,8 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8): ) slices = range(slice_begin, slice_end) - # print(f"Process {rank} is processing slice range: {slices}.") - # Contract the group of slices the process is responsible for. result = network.contract(slices=slices) - # print(f"Process {rank} result shape is : {result.shape}.") - # print(f"Process {rank} result size is : {result.nbytes}.") # Sum the partial contribution from each process on root. stream_ptr = cp.cuda.get_current_stream().ptr @@ -294,57 +234,44 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8): return result, rank -def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8): +def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_samples=8): + """Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through MPI. + The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution. + The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system. + The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread. + After pathfinding the optimal path is used in the actual contraction to give an expectation value. + """ from mpi4py import MPI # this line initializes MPI - import socket from cuquantum import Network - # Get the hostname - # hostname = socket.gethostname() - root = 0 comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() - # mem_avail = cp.cuda.Device().mem_info[0] - # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname) + device_id = rank % getDeviceCount() # Perform circuit conversion myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - # mem_avail = cp.cuda.Device().mem_info[0] - # print("Mem avail: aft convetor",mem_avail, "rank =",rank) - operands = myconvertor.expectation_operands( - pauli_string_gen(qibo_circ.nqubits, pauli_string) - ) - # mem_avail = cp.cuda.Device().mem_info[0] - # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank) - # Broadcast the operand data. - # operands = comm.bcast(operands, root) + operands = myconvertor.expectation_operands( + pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern) + ) # Assign the device for each process. device_id = rank % getDeviceCount() - # dev = cp.cuda.Device(device_id) - # free_mem, total_mem = dev.mem_info - # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank) - # Create network object. network = Network(*operands, options={"device_id": device_id}) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. path, info = network.contract_path( - optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}} + optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}} ) - # print(f"Process {rank} has the path with the FLOP count {info.opt_cost}.") # Select the best path from all ranks. opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC) - # if rank == root: - # print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.") - # Broadcast info from the sender to all other ranks. info = comm.bcast(info, sender) @@ -362,12 +289,8 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8): ) slices = range(slice_begin, slice_end) - # print(f"Process {rank} is processing slice range: {slices}.") - # Contract the group of slices the process is responsible for. result = network.contract(slices=slices) - # print(f"Process {rank} result shape is : {result.shape}.") - # print(f"Process {rank} result size is : {result.nbytes}.") # Sum the partial contribution from each process on root. result = comm.reduce(sendobj=result, op=MPI.SUM, root=root) @@ -376,8 +299,7 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8): def dense_vector_mps(qibo_circ, gate_algo, datatype): - """Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector. - """ + """Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector.""" myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype) mps_helper = MPSContractionHelper(myconvertor.num_qubits) @@ -387,7 +309,7 @@ def dense_vector_mps(qibo_circ, gate_algo, datatype): def pauli_string_gen(nqubits, pauli_string_pattern): - """ Used internally to generate the string based on given pattern and number of qubit. + """Used internally to generate the string based on given pattern and number of qubit. Example: pattern: "XZ", number of qubit: 7, output = XZXZXZX """ if nqubits <= 0: diff --git a/src/qibotn/mps_contraction_helper.py b/src/qibotn/mps_contraction_helper.py index ee8e4e4..29d5e25 100644 --- a/src/qibotn/mps_contraction_helper.py +++ b/src/qibotn/mps_contraction_helper.py @@ -1,5 +1,7 @@ from cuquantum import contract, contract_path, CircuitToEinsum, tensor +# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb + class MPSContractionHelper: """ @@ -85,7 +87,7 @@ class MPSContractionHelper: self, mps_tensors, operator, qubits, options=None, normalize=False ): """ - Contract the corresponding tensor network to form the state vector representation of the MPS. + Contract the corresponding tensor network to form the expectation of the MPS. Args: mps_tensors: A list of rank-3 ndarray-like tensor objects.