From ec4784d09fe831c91c490a4b5a275c8294992596 Mon Sep 17 00:00:00 2001
From: tankya2 <tankya2@ihpc.a-star.edu.sg>
Date: Fri, 14 Jul 2023 12:13:27 +0800
Subject: [PATCH 01/11] added draft code for multi node

---
 src/qibotn/test_multinode.py | 126 +++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 src/qibotn/test_multinode.py

diff --git a/src/qibotn/test_multinode.py b/src/qibotn/test_multinode.py
new file mode 100644
index 0000000..0570b38
--- /dev/null
+++ b/src/qibotn/test_multinode.py
@@ -0,0 +1,126 @@
+import qibo
+#import qibotn.cutn as cutn
+from cuquantum import cutensornet as cutn
+
+from qibo import gates
+from qibo.models import Circuit, QFT
+import numpy as np
+from mpi4py import MPI  # this line initializes MPI
+import cupy as cp
+from cupy.cuda.runtime import getDeviceCount
+from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum
+import cuquantum
+
+def qibo_qft(nqubits, swaps):
+    circ_qibo = QFT(nqubits, swaps)
+    state_vec = np.array(circ_qibo())
+    return circ_qibo, state_vec
+
+print("QiboTN")
+
+root = 0
+comm = MPI.COMM_WORLD
+rank, size = comm.Get_rank(), comm.Get_size()
+print("Andy: Rank ", rank," size ", size)
+# Assign the device for each process.
+device_id = rank % getDeviceCount()
+cp.cuda.Device(device_id).use()
+
+datatype = 'complex128'
+nqubits = 10
+'''
+qibo_circ = Circuit(nqubits)
+qibo_circ.add(gates.H(0))
+#qibo_circ.add(gates.CZ(3,4))
+qibo_circ.add(gates.CZ(2,4))
+#qibo_circ.add(gates.CNOT(0,4))
+#qibo_circ.add(gates.SWAP(0,4))
+qibo_circ.add(gates.H(2))
+qibo_circ.add(gates.H(4))
+'''
+qibo_circ = QFT(nqubits)
+
+'''
+expr = 'ehl,gj,edhg,bif,d,c,k,iklj,cf,a->ba'
+shapes = [(8, 2, 5), (5, 7), (8, 8, 2, 5), (8, 6, 3), (8,), (6,), (5,), (6, 5, 5, 7), (6, 3), (3,)]
+print("Andy: expr =",expr)
+if rank == root:
+    operands = [cp.random.rand(*shape) for shape in shapes]
+else:
+    operands = [cp.empty(shape) for shape in shapes]
+'''
+
+myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+expr, mode_label, q_frontier, operands = myconvertor.state_vector()
+shapes = [tensor.shape for tensor in operands]
+print("expr ", expr)
+print("Operands ", operands)
+print("Shape", shapes)
+# Set the operand data on root. Since we use the buffer interface APIs offered by mpi4py for communicating array
+#  objects, we can directly use device arrays (cupy.ndarray, for example) as we assume mpi4py is built against
+#  a CUDA-aware MPI.
+if rank != root:
+    operands = [cp.empty(shape,dtype="complex128") for shape in shapes]
+
+'''    
+if rank == root:
+    operands = [cp.random.rand(*shape) for shape in shapes]
+    print("Operands random", operands)
+
+else:
+    operands = [cp.empty(shape) for shape in shapes]
+'''
+
+for operand in operands:
+    print("Is CUPY array? ", cp.get_array_module(operand), " Operand size = ", operand.nbytes)
+
+for operand in operands:
+    comm.Bcast(operand, root)
+
+# Bind the communicator to the library handle
+handle = cutn.create()
+print("Andy cutn.create()")
+print("Andy ", cutn.get_mpi_comm_pointer(comm))
+cutn.distributed_reset_configuration(
+    handle, *cutn.get_mpi_comm_pointer(comm)
+)
+print("Andy cutn.distributed_reset_configuration")
+
+operands_interleave = myconvertor.get_interleave_format( mode_label, q_frontier, operands)
+print("new function interkeave ", operands_interleave)
+print("Ori function interleave", myconvertor.state_vector_operands())
+
+result = cuquantum.contract(*operands_interleave, options={'device_id' : device_id, 'handle': handle})
+#result = cuquantum.contract(expr, *operands, options={'device_id' : device_id, 'handle': handle})
+
+'''
+
+# Create a new GPU buffer for verification
+result_cp = cp.empty_like(result)
+
+# Sum the partial contribution from each process on root, with GPU
+if rank == root:
+    comm.Reduce(sendbuf=MPI.IN_PLACE, recvbuf=result_cp, op=MPI.SUM, root=root)
+else:
+    comm.Reduce(sendbuf=result_cp, recvbuf=None, op=MPI.SUM, root=root)
+'''
+# Check correctness.
+if rank == root:
+    #operands = myconvertor.state_vector_operands()
+    #result_cp = cp.einsum(*operands, optimize=True)
+    #result_cp = np.einsum(*operands, optimize=True)
+    (qibo_circ, result_sv) =  qibo_qft(nqubits, swaps=True)
+    print("Does the cuQuantum parallel contraction result match the cupy.einsum result?", cp.allclose(result.flatten(), result_sv))
+
+
+'''
+result_tn = cutn.eval(qibo_circ, datatype)
+
+qibo.set_backend(backend="qibojit", platform="numpy")
+(qibo_circ, result_sv) = qibo_qft(nqubits, swaps=True)
+#print(result_tn)
+#print(result_sv)
+
+assert np.allclose(
+        result_sv, result_tn.flatten()), "Resulting dense vectors do not match"
+'''
\ No newline at end of file

From 5d65149271aa9c7c0a24596de56f131a0f0351fe Mon Sep 17 00:00:00 2001
From: tankya2 <tankya2@ihpc.a-star.edu.sg>
Date: Fri, 21 Jul 2023 16:47:16 +0800
Subject: [PATCH 02/11] Update

---
 src/qibotn/test_multinode.py | 131 ++++++++++-------------------------
 1 file changed, 35 insertions(+), 96 deletions(-)

diff --git a/src/qibotn/test_multinode.py b/src/qibotn/test_multinode.py
index 0570b38..8b234ec 100644
--- a/src/qibotn/test_multinode.py
+++ b/src/qibotn/test_multinode.py
@@ -1,126 +1,65 @@
-import qibo
-#import qibotn.cutn as cutn
-from cuquantum import cutensornet as cutn
-
-from qibo import gates
-from qibo.models import Circuit, QFT
+import os
+import sys
+from timeit import default_timer as timer
 import numpy as np
-from mpi4py import MPI  # this line initializes MPI
 import cupy as cp
+import cuquantum
+from cuquantum import cutensornet as cutn
+from qibo import gates
+from qibo.models import QFT
+from mpi4py import MPI  # this line initializes MPI
 from cupy.cuda.runtime import getDeviceCount
 from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum
-import cuquantum
 
 def qibo_qft(nqubits, swaps):
     circ_qibo = QFT(nqubits, swaps)
     state_vec = np.array(circ_qibo())
     return circ_qibo, state_vec
 
-print("QiboTN")
+args = sys.argv
+
+if len(args) < 2:
+    print("Usage: python script.py [nqubits] ")
+    sys.exit(1)
+    
+nqubits = int(args[1])
 
 root = 0
 comm = MPI.COMM_WORLD
 rank, size = comm.Get_rank(), comm.Get_size()
-print("Andy: Rank ", rank," size ", size)
-# Assign the device for each process.
 device_id = rank % getDeviceCount()
 cp.cuda.Device(device_id).use()
+#print("Andy: Rank ", rank," size ", size, 'Device count',getDeviceCount())
+
+# Check if the env var is set
+if not "CUTENSORNET_COMM_LIB" in os.environ:
+    raise RuntimeError("need to set CUTENSORNET_COMM_LIB to the path of the MPI wrapper library")
+
+if not os.path.isfile(os.environ["CUTENSORNET_COMM_LIB"]):
+    raise RuntimeError("CUTENSORNET_COMM_LIB does not point to the path of the MPI wrapper library")
 
 datatype = 'complex128'
-nqubits = 10
-'''
-qibo_circ = Circuit(nqubits)
-qibo_circ.add(gates.H(0))
-#qibo_circ.add(gates.CZ(3,4))
-qibo_circ.add(gates.CZ(2,4))
-#qibo_circ.add(gates.CNOT(0,4))
-#qibo_circ.add(gates.SWAP(0,4))
-qibo_circ.add(gates.H(2))
-qibo_circ.add(gates.H(4))
-'''
 qibo_circ = QFT(nqubits)
-
-'''
-expr = 'ehl,gj,edhg,bif,d,c,k,iklj,cf,a->ba'
-shapes = [(8, 2, 5), (5, 7), (8, 8, 2, 5), (8, 6, 3), (8,), (6,), (5,), (6, 5, 5, 7), (6, 3), (3,)]
-print("Andy: expr =",expr)
-if rank == root:
-    operands = [cp.random.rand(*shape) for shape in shapes]
-else:
-    operands = [cp.empty(shape) for shape in shapes]
-'''
-
 myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-expr, mode_label, q_frontier, operands = myconvertor.state_vector()
-shapes = [tensor.shape for tensor in operands]
-print("expr ", expr)
-print("Operands ", operands)
-print("Shape", shapes)
-# Set the operand data on root. Since we use the buffer interface APIs offered by mpi4py for communicating array
-#  objects, we can directly use device arrays (cupy.ndarray, for example) as we assume mpi4py is built against
-#  a CUDA-aware MPI.
-if rank != root:
-    operands = [cp.empty(shape,dtype="complex128") for shape in shapes]
-
-'''    
-if rank == root:
-    operands = [cp.random.rand(*shape) for shape in shapes]
-    print("Operands random", operands)
-
-else:
-    operands = [cp.empty(shape) for shape in shapes]
-'''
-
-for operand in operands:
-    print("Is CUPY array? ", cp.get_array_module(operand), " Operand size = ", operand.nbytes)
-
-for operand in operands:
-    comm.Bcast(operand, root)
 
 # Bind the communicator to the library handle
 handle = cutn.create()
-print("Andy cutn.create()")
-print("Andy ", cutn.get_mpi_comm_pointer(comm))
 cutn.distributed_reset_configuration(
     handle, *cutn.get_mpi_comm_pointer(comm)
 )
-print("Andy cutn.distributed_reset_configuration")
 
-operands_interleave = myconvertor.get_interleave_format( mode_label, q_frontier, operands)
-print("new function interkeave ", operands_interleave)
-print("Ori function interleave", myconvertor.state_vector_operands())
-
-result = cuquantum.contract(*operands_interleave, options={'device_id' : device_id, 'handle': handle})
-#result = cuquantum.contract(expr, *operands, options={'device_id' : device_id, 'handle': handle})
-
-'''
-
-# Create a new GPU buffer for verification
-result_cp = cp.empty_like(result)
-
-# Sum the partial contribution from each process on root, with GPU
 if rank == root:
-    comm.Reduce(sendbuf=MPI.IN_PLACE, recvbuf=result_cp, op=MPI.SUM, root=root)
-else:
-    comm.Reduce(sendbuf=result_cp, recvbuf=None, op=MPI.SUM, root=root)
-'''
+    start = timer()
+    
+result = cuquantum.contract(*myconvertor.state_vector_operands(), options={'device_id' : device_id, 'handle': handle})
+
+if rank == root:
+    end = timer()
+
 # Check correctness.
 if rank == root:
-    #operands = myconvertor.state_vector_operands()
-    #result_cp = cp.einsum(*operands, optimize=True)
-    #result_cp = np.einsum(*operands, optimize=True)
-    (qibo_circ, result_sv) =  qibo_qft(nqubits, swaps=True)
-    print("Does the cuQuantum parallel contraction result match the cupy.einsum result?", cp.allclose(result.flatten(), result_sv))
-
-
-'''
-result_tn = cutn.eval(qibo_circ, datatype)
-
-qibo.set_backend(backend="qibojit", platform="numpy")
-(qibo_circ, result_sv) = qibo_qft(nqubits, swaps=True)
-#print(result_tn)
-#print(result_sv)
-
-assert np.allclose(
-        result_sv, result_tn.flatten()), "Resulting dense vectors do not match"
-'''
\ No newline at end of file
+    #(qibo_circ, result_sv) =  qibo_qft(nqubits, swaps=True)
+    time = end - start
+    #print("Does the cuQuantum parallel contraction result match the cupy.einsum result?", cp.allclose(result.flatten(), result_sv))
+    print("nqubit", nqubits, "time taken = ", time, 's')
+    
\ No newline at end of file

From cef8fb833e3fa0fcd97be54a23c75a12c6c5bc09 Mon Sep 17 00:00:00 2001
From: tankya2 <tankya2@ihpc.a-star.edu.sg>
Date: Wed, 30 Aug 2023 10:39:17 +0800
Subject: [PATCH 03/11] Add eval_tn_mpi function

---
 src/qibotn/cutn.py | 56 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/src/qibotn/cutn.py b/src/qibotn/cutn.py
index e6f3e8c..5790772 100644
--- a/src/qibotn/cutn.py
+++ b/src/qibotn/cutn.py
@@ -1,8 +1,62 @@
-# from qibotn import quimb as qiboquimb
 from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum
 from cuquantum import contract
+from cuquantum import cutensornet as cutn
+from mpi4py import MPI  # this line initializes MPI
+import multiprocessing
+from cupy.cuda.runtime import getDeviceCount
 
 
 def eval(qibo_circ, datatype):
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
     return contract(*myconvertor.state_vector_operands())
+
+
+def eval_tn_MPI(qibo_circ, datatype):
+    
+    ncpu_threads = multiprocessing.cpu_count() // 2
+    n_samples = 8
+    
+    root = 0
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    
+    device_id = rank % getDeviceCount()
+    cp.cuda.Device(device_id).use()
+    
+    handle = cutn.create()
+    cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
+    network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")
+
+    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+    operands_interleave = myconvertor.state_vector_operands()
+    
+    network = cutn.Network(*operands_interleave, options=network_opts)
+    network.contract_path(optimize={'samples': n_samples, 'threads': ncpu_threads}) # Calculate path, info
+    
+    result = network.contract()
+    
+    cutn.destroy(handle)
+    
+    if rank == root:
+        return result, rank
+
+
+if __name__ == "__main__":
+    
+    from qibo.models import QFT 
+    import cupy as cp
+    import numpy as np
+    
+    num_qubits  = 10
+    swaps = True
+    circ_qibo = QFT(num_qubits, swaps)
+    
+    dtype="complex128"
+    sv_mpi, rank = eval_tn_MPI(circ_qibo, dtype)
+    
+    if rank == 0:
+        sv_reference = eval(circ_qibo, dtype)
+        state_vec = np.array(circ_qibo())
+        print(f"State vector difference: {abs(sv_mpi-sv_reference).max():0.3e}")
+        assert cp.allclose(sv_mpi, sv_reference)
+        assert cp.allclose(sv_mpi.flatten(), state_vec)
\ No newline at end of file

From 000c4a1b8e5bf433bcbd2b631ddaf658252942c2 Mon Sep 17 00:00:00 2001
From: tankya2 <tankya2@ihpc.a-star.edu.sg>
Date: Wed, 30 Aug 2023 17:24:34 +0800
Subject: [PATCH 04/11] Remove unused file

---
 src/qibotn/test_multinode.py | 65 ------------------------------------
 1 file changed, 65 deletions(-)
 delete mode 100644 src/qibotn/test_multinode.py

diff --git a/src/qibotn/test_multinode.py b/src/qibotn/test_multinode.py
deleted file mode 100644
index 8b234ec..0000000
--- a/src/qibotn/test_multinode.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import os
-import sys
-from timeit import default_timer as timer
-import numpy as np
-import cupy as cp
-import cuquantum
-from cuquantum import cutensornet as cutn
-from qibo import gates
-from qibo.models import QFT
-from mpi4py import MPI  # this line initializes MPI
-from cupy.cuda.runtime import getDeviceCount
-from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum
-
-def qibo_qft(nqubits, swaps):
-    circ_qibo = QFT(nqubits, swaps)
-    state_vec = np.array(circ_qibo())
-    return circ_qibo, state_vec
-
-args = sys.argv
-
-if len(args) < 2:
-    print("Usage: python script.py [nqubits] ")
-    sys.exit(1)
-    
-nqubits = int(args[1])
-
-root = 0
-comm = MPI.COMM_WORLD
-rank, size = comm.Get_rank(), comm.Get_size()
-device_id = rank % getDeviceCount()
-cp.cuda.Device(device_id).use()
-#print("Andy: Rank ", rank," size ", size, 'Device count',getDeviceCount())
-
-# Check if the env var is set
-if not "CUTENSORNET_COMM_LIB" in os.environ:
-    raise RuntimeError("need to set CUTENSORNET_COMM_LIB to the path of the MPI wrapper library")
-
-if not os.path.isfile(os.environ["CUTENSORNET_COMM_LIB"]):
-    raise RuntimeError("CUTENSORNET_COMM_LIB does not point to the path of the MPI wrapper library")
-
-datatype = 'complex128'
-qibo_circ = QFT(nqubits)
-myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-
-# Bind the communicator to the library handle
-handle = cutn.create()
-cutn.distributed_reset_configuration(
-    handle, *cutn.get_mpi_comm_pointer(comm)
-)
-
-if rank == root:
-    start = timer()
-    
-result = cuquantum.contract(*myconvertor.state_vector_operands(), options={'device_id' : device_id, 'handle': handle})
-
-if rank == root:
-    end = timer()
-
-# Check correctness.
-if rank == root:
-    #(qibo_circ, result_sv) =  qibo_qft(nqubits, swaps=True)
-    time = end - start
-    #print("Does the cuQuantum parallel contraction result match the cupy.einsum result?", cp.allclose(result.flatten(), result_sv))
-    print("nqubit", nqubits, "time taken = ", time, 's')
-    
\ No newline at end of file

From 1c9df2647264f4419fcf31f3a785bf58523e11ed Mon Sep 17 00:00:00 2001
From: tankya2 <tankya2@ihpc.a-star.edu.sg>
Date: Wed, 30 Aug 2023 17:25:04 +0800
Subject: [PATCH 05/11] Update with multi node code

---
 src/qibotn/cutn.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/qibotn/cutn.py b/src/qibotn/cutn.py
index 5790772..7d9984f 100644
--- a/src/qibotn/cutn.py
+++ b/src/qibotn/cutn.py
@@ -19,7 +19,6 @@ def eval_tn_MPI(qibo_circ, datatype):
     root = 0
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
-    
     device_id = rank % getDeviceCount()
     cp.cuda.Device(device_id).use()
     
@@ -31,7 +30,7 @@ def eval_tn_MPI(qibo_circ, datatype):
     operands_interleave = myconvertor.state_vector_operands()
     
     network = cutn.Network(*operands_interleave, options=network_opts)
-    network.contract_path(optimize={'samples': n_samples, 'threads': ncpu_threads}) # Calculate path, info
+    network.contract_path(optimize={'samples': n_samples, 'threads': ncpu_threads}) # Calculate optimal path, returns path and info
     
     result = network.contract()
     

From d12b8ab882d1a629eec60e0c69f49d7f4d3b41c3 Mon Sep 17 00:00:00 2001
From: tankya2 <tankya2@ihpc.a-star.edu.sg>
Date: Wed, 30 Aug 2023 17:26:07 +0800
Subject: [PATCH 06/11] Apply Black formatting

---
 src/qibotn/cutn.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/qibotn/cutn.py b/src/qibotn/cutn.py
index 7d9984f..2fc9079 100644
--- a/src/qibotn/cutn.py
+++ b/src/qibotn/cutn.py
@@ -12,50 +12,50 @@ def eval(qibo_circ, datatype):
 
 
 def eval_tn_MPI(qibo_circ, datatype):
-    
     ncpu_threads = multiprocessing.cpu_count() // 2
     n_samples = 8
-    
+
     root = 0
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     device_id = rank % getDeviceCount()
     cp.cuda.Device(device_id).use()
-    
+
     handle = cutn.create()
     cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
     network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")
 
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
     operands_interleave = myconvertor.state_vector_operands()
-    
+
     network = cutn.Network(*operands_interleave, options=network_opts)
-    network.contract_path(optimize={'samples': n_samples, 'threads': ncpu_threads}) # Calculate optimal path, returns path and info
-    
+    network.contract_path(
+        optimize={"samples": n_samples, "threads": ncpu_threads}
+    )  # Calculate optimal path, returns path and info
+
     result = network.contract()
-    
+
     cutn.destroy(handle)
-    
+
     if rank == root:
         return result, rank
 
 
 if __name__ == "__main__":
-    
-    from qibo.models import QFT 
+    from qibo.models import QFT
     import cupy as cp
     import numpy as np
-    
-    num_qubits  = 10
+
+    num_qubits = 10
     swaps = True
     circ_qibo = QFT(num_qubits, swaps)
-    
-    dtype="complex128"
+
+    dtype = "complex128"
     sv_mpi, rank = eval_tn_MPI(circ_qibo, dtype)
-    
+
     if rank == 0:
         sv_reference = eval(circ_qibo, dtype)
         state_vec = np.array(circ_qibo())
         print(f"State vector difference: {abs(sv_mpi-sv_reference).max():0.3e}")
         assert cp.allclose(sv_mpi, sv_reference)
-        assert cp.allclose(sv_mpi.flatten(), state_vec)
\ No newline at end of file
+        assert cp.allclose(sv_mpi.flatten(), state_vec)

From f59b1b0bc7346bd1aa4d6ee086c06c350f748238 Mon Sep 17 00:00:00 2001
From: tankya2 <tankya2@ihpc.a-star.edu.sg>
Date: Wed, 30 Aug 2023 17:29:08 +0800
Subject: [PATCH 07/11] Update return for all ranks

---
 src/qibotn/cutn.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/qibotn/cutn.py b/src/qibotn/cutn.py
index 2fc9079..9bc1d67 100644
--- a/src/qibotn/cutn.py
+++ b/src/qibotn/cutn.py
@@ -15,7 +15,6 @@ def eval_tn_MPI(qibo_circ, datatype):
     ncpu_threads = multiprocessing.cpu_count() // 2
     n_samples = 8
 
-    root = 0
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     device_id = rank % getDeviceCount()
@@ -37,8 +36,7 @@ def eval_tn_MPI(qibo_circ, datatype):
 
     cutn.destroy(handle)
 
-    if rank == root:
-        return result, rank
+    return result, rank
 
 
 if __name__ == "__main__":

From fc665fcfc57144e4e559aebc0b14068ad6b9c4a9 Mon Sep 17 00:00:00 2001
From: Liwei Yang <yang0345@e.ntu.edu.sg>
Date: Wed, 27 Sep 2023 16:41:52 +0800
Subject: [PATCH 08/11] Fix the compatibility issue with qibo-0.2.0 during
 circuit-to-TN conversion

---
 src/qibotn/QiboCircuitConvertor.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/qibotn/QiboCircuitConvertor.py b/src/qibotn/QiboCircuitConvertor.py
index c30cfb6..ba8212f 100644
--- a/src/qibotn/QiboCircuitConvertor.py
+++ b/src/qibotn/QiboCircuitConvertor.py
@@ -95,7 +95,8 @@ class QiboCircuitToEinsum:
             required_shape = self.op_shape_from_qubits(len(gate_qubits))
             self.gate_tensors.append(
                 (
-                    cp.asarray(gate.matrix).reshape(required_shape),
+                    cp.asarray(gate.matrix(), dtype=self.dtype).reshape(
+                        required_shape),
                     gate_qubits,
                 )
             )

From b2a2bfedf1fd424607219c091345a7b6362ca3eb Mon Sep 17 00:00:00 2001
From: tankya2 <tankya2@ihpc.a-star.edu.sg>
Date: Tue, 3 Oct 2023 14:25:28 +0800
Subject: [PATCH 09/11] Removed main and added cupy import

---
 src/qibotn/cutn.py | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/src/qibotn/cutn.py b/src/qibotn/cutn.py
index 9bc1d67..3d42eb7 100644
--- a/src/qibotn/cutn.py
+++ b/src/qibotn/cutn.py
@@ -4,6 +4,7 @@ from cuquantum import cutensornet as cutn
 from mpi4py import MPI  # this line initializes MPI
 import multiprocessing
 from cupy.cuda.runtime import getDeviceCount
+import cupy as cp
 
 
 def eval(qibo_circ, datatype):
@@ -37,23 +38,3 @@ def eval_tn_MPI(qibo_circ, datatype):
     cutn.destroy(handle)
 
     return result, rank
-
-
-if __name__ == "__main__":
-    from qibo.models import QFT
-    import cupy as cp
-    import numpy as np
-
-    num_qubits = 10
-    swaps = True
-    circ_qibo = QFT(num_qubits, swaps)
-
-    dtype = "complex128"
-    sv_mpi, rank = eval_tn_MPI(circ_qibo, dtype)
-
-    if rank == 0:
-        sv_reference = eval(circ_qibo, dtype)
-        state_vec = np.array(circ_qibo())
-        print(f"State vector difference: {abs(sv_mpi-sv_reference).max():0.3e}")
-        assert cp.allclose(sv_mpi, sv_reference)
-        assert cp.allclose(sv_mpi.flatten(), state_vec)

From 15e90ebcc77096b43b875481e40655c85614bbc4 Mon Sep 17 00:00:00 2001
From: tankya2 <tankya2@ihpc.a-star.edu.sg>
Date: Wed, 4 Oct 2023 11:18:05 +0800
Subject: [PATCH 10/11] Added comments

---
 src/qibotn/cutn.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/qibotn/cutn.py b/src/qibotn/cutn.py
index 3d42eb7..5267bc0 100644
--- a/src/qibotn/cutn.py
+++ b/src/qibotn/cutn.py
@@ -1,7 +1,6 @@
 from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum
 from cuquantum import contract
 from cuquantum import cutensornet as cutn
-from mpi4py import MPI  # this line initializes MPI
 import multiprocessing
 from cupy.cuda.runtime import getDeviceCount
 import cupy as cp
@@ -12,9 +11,16 @@ def eval(qibo_circ, datatype):
     return contract(*myconvertor.state_vector_operands())
 
 
-def eval_tn_MPI(qibo_circ, datatype):
+def eval_tn_MPI(qibo_circ, datatype, n_samples=8):
+    """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through MPI.
+    The conversion is performed by QiboCircuitToEinsum() afterwhich it goes through 2 steps: pathfinder and execution.
+    The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
+    After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
+    """
+
+    from mpi4py import MPI  # this line initializes MPI
+
     ncpu_threads = multiprocessing.cpu_count() // 2
-    n_samples = 8
 
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
@@ -25,14 +31,15 @@ def eval_tn_MPI(qibo_circ, datatype):
     cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
     network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")
 
+    # Perform circuit conversion
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
     operands_interleave = myconvertor.state_vector_operands()
 
+    # Pathfinder: search for the optimal contraction path. The result is stored in the path and info attributes of the network object.
     network = cutn.Network(*operands_interleave, options=network_opts)
-    network.contract_path(
-        optimize={"samples": n_samples, "threads": ncpu_threads}
-    )  # Calculate optimal path, returns path and info
+    network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads})
 
+    # Execution: To execute the contraction using the optimal path found previously
     result = network.contract()
 
     cutn.destroy(handle)

From f97e1f95cc1912152b290acebe0b0816e803fdac Mon Sep 17 00:00:00 2001
From: Liwei Yang <yang0345@e.ntu.edu.sg>
Date: Tue, 17 Oct 2023 10:58:26 +0800
Subject: [PATCH 11/11] Minor typo fix

---
 src/qibotn/cutn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/qibotn/cutn.py b/src/qibotn/cutn.py
index 5267bc0..343e2e3 100644
--- a/src/qibotn/cutn.py
+++ b/src/qibotn/cutn.py
@@ -13,7 +13,7 @@ def eval(qibo_circ, datatype):
 
 def eval_tn_MPI(qibo_circ, datatype, n_samples=8):
     """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through MPI.
-    The conversion is performed by QiboCircuitToEinsum() afterwhich it goes through 2 steps: pathfinder and execution.
+    The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
     The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
     After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
     """