Merge pull request #19 from qiboteam/multi-node-multi-GPU

Multi node multi gpu
2023-10-17 15:31:51 +08:00
parent cb0a6f0d19 fa0ed842b5
commit c617cf5659
2 changed files with 42 additions and 1 deletions
--- a/src/qibotn/QiboCircuitConvertor.py
+++ b/src/qibotn/QiboCircuitConvertor.py
@@ -94,7 +94,8 @@ class QiboCircuitToEinsum:
            required_shape = self.op_shape_from_qubits(len(gate_qubits))
            self.gate_tensors.append(
                (
-                    cp.asarray(gate.matrix).reshape(required_shape),
+                    cp.asarray(gate.matrix(), dtype=self.dtype).reshape(
                        required_shape),
                    gate_qubits,
                )
            )
--- a/src/qibotn/cutn.py
+++ b/src/qibotn/cutn.py
@@ -1,5 +1,9 @@
 from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum
 from cuquantum import contract
 from cuquantum import cutensornet as cutn
 import multiprocessing
 from cupy.cuda.runtime import getDeviceCount
 import cupy as cp
 from qibotn.QiboCircuitToMPS import QiboCircuitToMPS
 from qibotn.mps_contraction_helper import MPSContractionHelper
@@ -10,6 +14,42 @@ def eval(qibo_circ, datatype):
    return contract(*myconvertor.state_vector_operands())
 def eval_tn_MPI(qibo_circ, datatype, n_samples=8):
    """Contract a qibo circuit as a tensor network (TN) using multiple nodes and GPUs through MPI.

    The circuit is converted to einsum operands by QiboCircuitToEinsum(), after
    which the computation goes through two steps:

    1. Pathfinder: ``n_samples`` candidate contraction paths are sampled and the
       least costly one is selected. The search is multi-threaded.
    2. Execution: the selected path is used for the actual contraction, giving
       a dense vector representation of the TN.

    Args:
        qibo_circ: The qibo circuit handed to QiboCircuitToEinsum().
        datatype: The dtype forwarded to QiboCircuitToEinsum().
        n_samples: Number of samples the pathfinder evaluates. Defaults to 8.

    Returns:
        A tuple ``(result, rank)`` holding the contraction result and the MPI
        rank of the calling process.
    """
    from mpi4py import MPI  # this line initializes MPI

    # Use half of the reported CPUs for the pathfinder, but never fewer than
    # one: on a single-CPU host the integer division alone would yield 0.
    ncpu_threads = max(1, multiprocessing.cpu_count() // 2)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # Map ranks onto the visible GPUs round-robin.
    device_id = rank % getDeviceCount()
    cp.cuda.Device(device_id).use()

    handle = cutn.create()
    try:
        # Bind the MPI communicator to the library handle so that the
        # pathfinder and the contraction are distributed over all ranks.
        cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
        network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")

        # Perform circuit conversion
        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
        operands_interleave = myconvertor.state_vector_operands()

        # The context manager frees the network resources before the handle
        # is destroyed, including when path finding or contraction raises.
        with cutn.Network(*operands_interleave, options=network_opts) as network:
            # Pathfinder: search for the optimal path. The path found is
            # stored on the network object and reused by contract().
            network.contract_path(
                optimize={"samples": n_samples, "threads": ncpu_threads}
            )

            # Execution: contract using the optimal path found previously
            result = network.contract()
    finally:
        # Always release the library handle, even on failure.
        cutn.destroy(handle)

    return result, rank
 def eval_mps(qibo_circ, gate_algo, datatype):
    myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype)
    mps_helper = MPSContractionHelper(myconvertor.num_qubits)