Tidy up

2024-02-01 14:42:17 +08:00
parent 6f4ffa777a
commit 31fe1ef0c9
4 changed files with 49 additions and 139 deletions
--- a/src/qibotn/MPSUtils.py
+++ b/src/qibotn/MPSUtils.py
@@ -2,6 +2,8 @@ import cupy as cp
 from cuquantum.cutensornet.experimental import contract_decompose
 from cuquantum import contract

+# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
+

 def initial(num_qubits, dtype):
    """
--- a/src/qibotn/QiboCircuitConvertor.py
+++ b/src/qibotn/QiboCircuitConvertor.py
@@ -1,6 +1,8 @@
 import cupy as cp
 import numpy as np

+# Reference: https://github.com/NVIDIA/cuQuantum/tree/main/python/samples/cutensornet/circuit_converter
+

 class QiboCircuitToEinsum:
    """Convert a circuit to a Tensor Network (TN) representation.
@@ -159,9 +161,7 @@ class QiboCircuitToEinsum:
        return gates

    def expectation_operands(self, pauli_string):
-        # assign pauli string to qubit
-        # _get_forward_inverse_metadata()
-        input_bitstring = "0" * self.circuit.nqubits  # Need all qubits!
+        input_bitstring = "0" * self.circuit.nqubits 

        input_operands = self._get_bitstring_tensors(input_bitstring)
        pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string))
@@ -185,8 +185,6 @@ class QiboCircuitToEinsum:

        next_frontier = max(qubits_frontier.values()) + 1

-        # input_mode_labels, input_operands, qubits_frontier, next_frontier, inverse_gates = self._get_forward_inverse_metadata(coned_qubits)
-
        pauli_gates = self.get_pauli_gates(
            pauli_map, dtype=self.dtype, backend=self.backend
        )
@@ -208,18 +206,4 @@ class QiboCircuitToEinsum:

        operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]

-        # expec = contract(*operand_exp_interleave)
-        # print(expec)
-
-        """
-        gate_mode_labels, gate_operands = circ_utils.parse_gates_to_mode_labels_operands(gates, 
-                                                                                         qubits_frontier, 
-                                                                                         next_frontier)
-        
-        mode_labels = input_mode_labels + gate_mode_labels + [[qubits_frontier[ix]] for ix in self.qubits]
-        operands = input_operands + gate_operands + input_operands[:n_qubits]
-
-        output_mode_labels = []
-        expression = circ_utils.convert_mode_labels_to_expression(mode_labels, output_mode_labels)
-        """
        return operand_exp_interleave
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -1,7 +1,5 @@
 from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum
 from cuquantum import contract
-from cuquantum import cutensornet as cutn
-import multiprocessing
 from cupy.cuda.runtime import getDeviceCount
 import cupy as cp

@@ -10,15 +8,17 @@ from qibotn.mps_contraction_helper import MPSContractionHelper


 def dense_vector_tn(qibo_circ, datatype):
+    """Convert qibo circuit to tensornet (TN) format and perform contraction to dense vector."""
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
    return contract(*myconvertor.state_vector_operands())


-def expectation_pauli_tn(qibo_circ, datatype, pauli_string):
+def expectation_pauli_tn(qibo_circ, datatype, pauli_string_pattern):
+    """Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string."""
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
    return contract(
        *myconvertor.expectation_operands(
-            pauli_string_gen(qibo_circ.nqubits, pauli_string)
+            pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
        )
    )

@@ -30,54 +30,35 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
    After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
    """

-    from mpi4py import MPI  # this line initializes MPI
-    import socket
+    from mpi4py import MPI
    from cuquantum import Network

-    # Get the hostname
-    # hostname = socket.gethostname()
-
    root = 0
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+
    device_id = rank % getDeviceCount()

    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
-    operands = myconvertor.state_vector_operands()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)

-    # Broadcast the operand data.
-    # operands = comm.bcast(operands, root)
+    operands = myconvertor.state_vector_operands()

    # Assign the device for each process.
    device_id = rank % getDeviceCount()

-    # dev = cp.cuda.Device(device_id)
-    # free_mem, total_mem = dev.mem_info
-    # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
-
    # Create network object.
    network = Network(*operands, options={"device_id": device_id})

    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
    path, info = network.contract_path(
-        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
    )
-    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")

    # Select the best path from all ranks.
    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)

-    # if rank == root:
-    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
-
    # Broadcast info from the sender to all other ranks.
    info = comm.bcast(info, sender)

@@ -95,45 +76,30 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
    )
    slices = range(slice_begin, slice_end)

-    # print(f"Process {rank} is processing slice range: {slices}.")
-
    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)
-    # print(f"Process {rank} result shape is : {result.shape}.")
-    # print(f"Process {rank} result size is : {result.nbytes}.")

    # Sum the partial contribution from each process on root.
    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)

-    """
-    path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
-
-    num_slices = opt_info.num_slices#Andy
-    chunk, extra = num_slices // size, num_slices % size#Andy
-    slice_begin = rank * chunk + min(rank, extra)#Andy
-    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy
-    slices = range(slice_begin, slice_end)#Andy
-    result = network.contract(slices=slices)
-    """
-
    return result, rank


 def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
-    from mpi4py import MPI  # this line initializes MPI
-    import socket
+    """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through NCCL.
+    The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
+    The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
+    After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
+    """
+    from mpi4py import MPI
    from cuquantum import Network
    from cupy.cuda import nccl

-    # Get the hostname
-    # hostname = socket.gethostname()
-
    root = 0
    comm_mpi = MPI.COMM_WORLD
    rank = comm_mpi.Get_rank()
    size = comm_mpi.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+
    device_id = rank % getDeviceCount()

    cp.cuda.Device(device_id).use()
@@ -145,27 +111,18 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):

    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
    operands = myconvertor.state_vector_operands()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)

    network = Network(*operands)

    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
    path, info = network.contract_path(
-        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
    )

-    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
-
    # Select the best path from all ranks.
    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)

-    # if rank == root:
-    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
-
    # Broadcast info from the sender to all other ranks.
    info = comm_mpi.bcast(info, sender)

@@ -183,12 +140,8 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
    )
    slices = range(slice_begin, slice_end)

-    # print(f"Process {rank} is processing slice range: {slices}.")
-
    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)
-    # print(f"Process {rank} result shape is : {result.shape}.")
-    # print(f"Process {rank} result size is : {result.nbytes}.")

    # Sum the partial contribution from each process on root.
    stream_ptr = cp.cuda.get_current_stream().ptr
@@ -205,21 +158,22 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
    return result, rank


-def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
-    from mpi4py import MPI  # this line initializes MPI
-    import socket
+def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
+    """Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through NCCL.
+    The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
+    The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system.
+    The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
+    After pathfinding the optimal path is used in the actual contraction to give an expectation value.
+    """
+    from mpi4py import MPI
    from cuquantum import Network
    from cupy.cuda import nccl

-    # Get the hostname
-    # hostname = socket.gethostname()
-
    root = 0
    comm_mpi = MPI.COMM_WORLD
    rank = comm_mpi.Get_rank()
    size = comm_mpi.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+
    device_id = rank % getDeviceCount()

    cp.cuda.Device(device_id).use()
@@ -231,30 +185,20 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):

    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
    operands = myconvertor.expectation_operands(
-        pauli_string_gen(qibo_circ.nqubits, pauli_string)
+        pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
    )

-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
-
    network = Network(*operands)

    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
    path, info = network.contract_path(
-        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
    )

-    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
-
    # Select the best path from all ranks.
    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)

-    # if rank == root:
-    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
-
    # Broadcast info from the sender to all other ranks.
    info = comm_mpi.bcast(info, sender)

@@ -272,12 +216,8 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
    )
    slices = range(slice_begin, slice_end)

-    # print(f"Process {rank} is processing slice range: {slices}.")
-
    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)
-    # print(f"Process {rank} result shape is : {result.shape}.")
-    # print(f"Process {rank} result size is : {result.nbytes}.")

    # Sum the partial contribution from each process on root.
    stream_ptr = cp.cuda.get_current_stream().ptr
@@ -294,57 +234,44 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
    return result, rank


-def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
+def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
+    """Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through MPI.
+    The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
+    The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system.
+    The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
+    After pathfinding the optimal path is used in the actual contraction to give an expectation value.
+    """
    from mpi4py import MPI  # this line initializes MPI
-    import socket
    from cuquantum import Network

-    # Get the hostname
-    # hostname = socket.gethostname()
-
    root = 0
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+
    device_id = rank % getDeviceCount()

    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
-    operands = myconvertor.expectation_operands(
-        pauli_string_gen(qibo_circ.nqubits, pauli_string)
-    )
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)

-    # Broadcast the operand data.
-    # operands = comm.bcast(operands, root)
+    operands = myconvertor.expectation_operands(
+        pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
+    )

    # Assign the device for each process.
    device_id = rank % getDeviceCount()

-    # dev = cp.cuda.Device(device_id)
-    # free_mem, total_mem = dev.mem_info
-    # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
-
    # Create network object.
    network = Network(*operands, options={"device_id": device_id})

    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
    path, info = network.contract_path(
-        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
    )
-    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")

    # Select the best path from all ranks.
    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)

-    # if rank == root:
-    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
-
    # Broadcast info from the sender to all other ranks.
    info = comm.bcast(info, sender)

@@ -362,12 +289,8 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
    )
    slices = range(slice_begin, slice_end)

-    # print(f"Process {rank} is processing slice range: {slices}.")
-
    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)
-    # print(f"Process {rank} result shape is : {result.shape}.")
-    # print(f"Process {rank} result size is : {result.nbytes}.")

    # Sum the partial contribution from each process on root.
    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
@@ -376,8 +299,7 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):


 def dense_vector_mps(qibo_circ, gate_algo, datatype):
-    """Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector.
-    """
+    """Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector."""
    myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype)
    mps_helper = MPSContractionHelper(myconvertor.num_qubits)

@@ -387,7 +309,7 @@ def dense_vector_mps(qibo_circ, gate_algo, datatype):


 def pauli_string_gen(nqubits, pauli_string_pattern):
-    """ Used internally to generate the string based on given pattern and number of qubit.
+    """Used internally to generate the string based on given pattern and number of qubit.
    Example: pattern: "XZ", number of qubit: 7, output = XZXZXZX
    """
    if nqubits <= 0:
--- a/src/qibotn/mps_contraction_helper.py
+++ b/src/qibotn/mps_contraction_helper.py
@@ -1,5 +1,7 @@
 from cuquantum import contract, contract_path, CircuitToEinsum, tensor

+# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
+

 class MPSContractionHelper:
    """
@@ -85,7 +87,7 @@ class MPSContractionHelper:
        self, mps_tensors, operator, qubits, options=None, normalize=False
    ):
        """
-        Contract the corresponding tensor network to form the state vector representation of the MPS.
+        Contract the corresponding tensor network to form the expectation of the MPS.

        Args:
            mps_tensors: A list of rank-3 ndarray-like tensor objects.