Tidy up

2024-02-01 14:42:17 +08:00
parent 50f1fcd6e8
commit 09c741ea80
4 changed files with 51 additions and 138 deletions
--- a/src/qibotn/MPSUtils.py
+++ b/src/qibotn/MPSUtils.py
@@ -2,6 +2,8 @@ import cupy as cp
 from cuquantum import contract
 from cuquantum.cutensornet.experimental import contract_decompose
 # Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
 def initial(num_qubits, dtype):
    """
--- a/src/qibotn/QiboCircuitConvertor.py
+++ b/src/qibotn/QiboCircuitConvertor.py
@@ -1,6 +1,8 @@
 import cupy as cp
 import numpy as np
 # Reference: https://github.com/NVIDIA/cuQuantum/tree/main/python/samples/cutensornet/circuit_converter
 class QiboCircuitToEinsum:
    """Convert a circuit to a Tensor Network (TN) representation.
@@ -159,9 +161,7 @@ class QiboCircuitToEinsum:
        return gates
    def expectation_operands(self, pauli_string):
-        # assign pauli string to qubit
+        input_bitstring = "0" * self.circuit.nqubits 
        # _get_forward_inverse_metadata()
        input_bitstring = "0" * self.circuit.nqubits  # Need all qubits!
        input_operands = self._get_bitstring_tensors(input_bitstring)
        pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string))
@@ -185,8 +185,6 @@ class QiboCircuitToEinsum:
        next_frontier = max(qubits_frontier.values()) + 1
        # input_mode_labels, input_operands, qubits_frontier, next_frontier, inverse_gates = self._get_forward_inverse_metadata(coned_qubits)
        pauli_gates = self.get_pauli_gates(
            pauli_map, dtype=self.dtype, backend=self.backend
        )
@@ -208,18 +206,4 @@ class QiboCircuitToEinsum:
        operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
        # expec = contract(*operand_exp_interleave)
        # print(expec)
        """
        gate_mode_labels, gate_operands = circ_utils.parse_gates_to_mode_labels_operands(gates, 
                                                                                         qubits_frontier, 
                                                                                         next_frontier)
        mode_labels = input_mode_labels + gate_mode_labels + [[qubits_frontier[ix]] for ix in self.qubits]
        operands = input_operands + gate_operands + input_operands[:n_qubits]
        output_mode_labels = []
        expression = circ_utils.convert_mode_labels_to_expression(mode_labels, output_mode_labels)
        """
        return operand_exp_interleave
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -3,7 +3,8 @@ import multiprocessing
 import cupy as cp
 from cupy.cuda.runtime import getDeviceCount
 from cuquantum import contract
-from cuquantum import cutensornet as cutn
+from cupy.cuda.runtime import getDeviceCount
 import cupy as cp
 from qibotn.mps_contraction_helper import MPSContractionHelper
 from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum
@@ -11,15 +12,17 @@ from qibotn.QiboCircuitToMPS import QiboCircuitToMPS
 def dense_vector_tn(qibo_circ, datatype):
    """Convert qibo circuit to tensornet (TN) format and perform contraction to dense vector."""
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
    return contract(*myconvertor.state_vector_operands())
-def expectation_pauli_tn(qibo_circ, datatype, pauli_string):
+def expectation_pauli_tn(qibo_circ, datatype, pauli_string_pattern):
    """Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string."""
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
    return contract(
        *myconvertor.expectation_operands(
-            pauli_string_gen(qibo_circ.nqubits, pauli_string)
+            pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
        )
    )
@@ -31,54 +34,35 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
    After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
    """
-    from mpi4py import MPI  # this line initializes MPI
+    from mpi4py import MPI
    import socket
    from cuquantum import Network
    # Get the hostname
    # hostname = socket.gethostname()
    root = 0
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
+
    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
    device_id = rank % getDeviceCount()
    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
    # mem_avail = cp.cuda.Device().mem_info[0]
    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
    operands = myconvertor.state_vector_operands()
    # mem_avail = cp.cuda.Device().mem_info[0]
    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
-    # Broadcast the operand data.
+    operands = myconvertor.state_vector_operands()
    # operands = comm.bcast(operands, root)
    # Assign the device for each process.
    device_id = rank % getDeviceCount()
    # dev = cp.cuda.Device(device_id)
    # free_mem, total_mem = dev.mem_info
    # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
    # Create network object.
    network = Network(*operands, options={"device_id": device_id})
    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
    path, info = network.contract_path(
-        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
    )
    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
    # Select the best path from all ranks.
    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
    # if rank == root:
    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
    # Broadcast info from the sender to all other ranks.
    info = comm.bcast(info, sender)
@@ -96,45 +80,30 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
    )
    slices = range(slice_begin, slice_end)
    # print(f"Process {rank} is processing slice range: {slices}.")
    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)
    # print(f"Process {rank} result shape is : {result.shape}.")
    # print(f"Process {rank} result size is : {result.nbytes}.")
    # Sum the partial contribution from each process on root.
    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
    """
    path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
    num_slices = opt_info.num_slices#Andy
    chunk, extra = num_slices // size, num_slices % size#Andy
    slice_begin = rank * chunk + min(rank, extra)#Andy
    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy
    slices = range(slice_begin, slice_end)#Andy
    result = network.contract(slices=slices)
    """
    return result, rank
 def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
-    from mpi4py import MPI  # this line initializes MPI
+    """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through NCCL.
-    import socket
+    The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
    The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
    After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
    """
    from mpi4py import MPI
    from cuquantum import Network
    from cupy.cuda import nccl
    # Get the hostname
    # hostname = socket.gethostname()
    root = 0
    comm_mpi = MPI.COMM_WORLD
    rank = comm_mpi.Get_rank()
    size = comm_mpi.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
+
    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
    device_id = rank % getDeviceCount()
    cp.cuda.Device(device_id).use()
@@ -146,27 +115,18 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
    # mem_avail = cp.cuda.Device().mem_info[0]
    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
    operands = myconvertor.state_vector_operands()
    # mem_avail = cp.cuda.Device().mem_info[0]
    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
    network = Network(*operands)
    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
    path, info = network.contract_path(
-        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
    )
    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
    # Select the best path from all ranks.
    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
    # if rank == root:
    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
    # Broadcast info from the sender to all other ranks.
    info = comm_mpi.bcast(info, sender)
@@ -184,12 +144,8 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
    )
    slices = range(slice_begin, slice_end)
    # print(f"Process {rank} is processing slice range: {slices}.")
    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)
    # print(f"Process {rank} result shape is : {result.shape}.")
    # print(f"Process {rank} result size is : {result.nbytes}.")
    # Sum the partial contribution from each process on root.
    stream_ptr = cp.cuda.get_current_stream().ptr
@@ -206,21 +162,22 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
    return result, rank
-def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
+def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
-    from mpi4py import MPI  # this line initializes MPI
+    """Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through NCCL.
-    import socket
+    The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
    The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system.
    The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
    After pathfinding the optimal path is used in the actual contraction to give an expectation value.
    """
    from mpi4py import MPI
    from cuquantum import Network
    from cupy.cuda import nccl
    # Get the hostname
    # hostname = socket.gethostname()
    root = 0
    comm_mpi = MPI.COMM_WORLD
    rank = comm_mpi.Get_rank()
    size = comm_mpi.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
+
    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
    device_id = rank % getDeviceCount()
    cp.cuda.Device(device_id).use()
@@ -232,30 +189,20 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
    # mem_avail = cp.cuda.Device().mem_info[0]
    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
    operands = myconvertor.expectation_operands(
-        pauli_string_gen(qibo_circ.nqubits, pauli_string)
+        pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
    )
    # mem_avail = cp.cuda.Device().mem_info[0]
    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
    network = Network(*operands)
    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
    path, info = network.contract_path(
-        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
    )
    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
    # Select the best path from all ranks.
    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
    # if rank == root:
    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
    # Broadcast info from the sender to all other ranks.
    info = comm_mpi.bcast(info, sender)
@@ -273,12 +220,8 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
    )
    slices = range(slice_begin, slice_end)
    # print(f"Process {rank} is processing slice range: {slices}.")
    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)
    # print(f"Process {rank} result shape is : {result.shape}.")
    # print(f"Process {rank} result size is : {result.nbytes}.")
    # Sum the partial contribution from each process on root.
    stream_ptr = cp.cuda.get_current_stream().ptr
@@ -295,57 +238,44 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
    return result, rank
-def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
+def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
    """Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through MPI.
    The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
    The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system.
    The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
    After pathfinding the optimal path is used in the actual contraction to give an expectation value.
    """
    from mpi4py import MPI  # this line initializes MPI
    import socket
    from cuquantum import Network
    # Get the hostname
    # hostname = socket.gethostname()
    root = 0
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
+
    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
    device_id = rank % getDeviceCount()
    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
    # mem_avail = cp.cuda.Device().mem_info[0]
    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
    operands = myconvertor.expectation_operands(
        pauli_string_gen(qibo_circ.nqubits, pauli_string)
    )
    # mem_avail = cp.cuda.Device().mem_info[0]
    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
-    # Broadcast the operand data.
+    operands = myconvertor.expectation_operands(
-    # operands = comm.bcast(operands, root)
+        pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
    )
    # Assign the device for each process.
    device_id = rank % getDeviceCount()
    # dev = cp.cuda.Device(device_id)
    # free_mem, total_mem = dev.mem_info
    # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
    # Create network object.
    network = Network(*operands, options={"device_id": device_id})
    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
    path, info = network.contract_path(
-        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
    )
    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
    # Select the best path from all ranks.
    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
    # if rank == root:
    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
    # Broadcast info from the sender to all other ranks.
    info = comm.bcast(info, sender)
@@ -363,12 +293,8 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
    )
    slices = range(slice_begin, slice_end)
    # print(f"Process {rank} is processing slice range: {slices}.")
    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)
    # print(f"Process {rank} result shape is : {result.shape}.")
    # print(f"Process {rank} result size is : {result.nbytes}.")
    # Sum the partial contribution from each process on root.
    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
@@ -377,8 +303,7 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
 def dense_vector_mps(qibo_circ, gate_algo, datatype):
-    """Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector.
+    """Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector."""
    """
    myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype)
    mps_helper = MPSContractionHelper(myconvertor.num_qubits)
@@ -388,7 +313,7 @@ def dense_vector_mps(qibo_circ, gate_algo, datatype):
 def pauli_string_gen(nqubits, pauli_string_pattern):
-    """ Used internally to generate the string based on given pattern and number of qubit.
+    """Used internally to generate the string based on given pattern and number of qubit.
    Example: pattern: "XZ", number of qubit: 7, output = XZXZXZX
    """
    if nqubits <= 0:
--- a/src/qibotn/mps_contraction_helper.py
+++ b/src/qibotn/mps_contraction_helper.py
@@ -1,5 +1,7 @@
 from cuquantum import CircuitToEinsum, contract, contract_path, tensor
 # Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
 class MPSContractionHelper:
    """
@@ -85,7 +87,7 @@ class MPSContractionHelper:
        self, mps_tensors, operator, qubits, options=None, normalize=False
    ):
        """
-        Contract the corresponding tensor network to form the state vector representation of the MPS.
+        Contract the corresponding tensor network to form the expectation of the MPS.
        Args:
            mps_tensors: A list of rank-3 ndarray-like tensor objects.