Format with black

2024-01-24 11:47:32 +08:00
parent 0420cbfa52
commit 37212a388d
3 changed files with 277 additions and 217 deletions
--- a/src/qibotn/cutn.py
+++ b/src/qibotn/cutn.py
@@ -13,9 +13,13 @@ def eval(qibo_circ, datatype):
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
    return contract(*myconvertor.state_vector_operands())

+
 def eval_expectation(qibo_circ, datatype):
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    return contract(*myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits)))
+    return contract(
+        *myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
+    )
+

 def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8):
    from mpi4py import MPI  # this line initializes MPI
@@ -23,73 +27,79 @@ def eval_tn_MPI_2(qibo_circ, datatype, n_samples=8):
    from cuquantum import Network

    # Get the hostname
-    #hostname = socket.gethostname()
-    
+    # hostname = socket.gethostname()
+
    root = 0
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
    device_id = rank % getDeviceCount()
-    
-    
+
    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft convetor",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
    operands = myconvertor.state_vector_operands()
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
-    
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
+
    # Broadcast the operand data.
-    #operands = comm.bcast(operands, root)
-        
+    # operands = comm.bcast(operands, root)
+
    # Assign the device for each process.
    device_id = rank % getDeviceCount()
-    
-    #dev = cp.cuda.Device(device_id)
-    #free_mem, total_mem = dev.mem_info
-    #print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
+
+    # dev = cp.cuda.Device(device_id)
+    # free_mem, total_mem = dev.mem_info
+    # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)

    # Create network object.
-    network = Network(*operands, options={'device_id' : device_id})
+    network = Network(*operands, options={"device_id": device_id})

    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
-    path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}})
-    #print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
+    path, info = network.contract_path(
+        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+    )
+    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")

    # Select the best path from all ranks.
    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)

-    #if rank == root:
+    # if rank == root:
    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")

    # Broadcast info from the sender to all other ranks.
    info = comm.bcast(info, sender)

    # Set path and slices.
-    path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices})
+    path, info = network.contract_path(
+        optimize={"path": info.path, "slicing": info.slices}
+    )

    # Calculate this process's share of the slices.
    num_slices = info.num_slices
    chunk, extra = num_slices // size, num_slices % size
    slice_begin = rank * chunk + min(rank, extra)
-    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    slice_end = (
+        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    )
    slices = range(slice_begin, slice_end)

-    #print(f"Process {rank} is processing slice range: {slices}.")
+    # print(f"Process {rank} is processing slice range: {slices}.")

    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)
-    #print(f"Process {rank} result shape is : {result.shape}.")
-    #print(f"Process {rank} result size is : {result.nbytes}.")
+    # print(f"Process {rank} result shape is : {result.shape}.")
+    # print(f"Process {rank} result size is : {result.nbytes}.")

    # Sum the partial contribution from each process on root.
    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
-    
+
    return result, rank

+
 def eval_tn_nccl(qibo_circ, datatype, n_samples=8):
    from mpi4py import MPI  # this line initializes MPI
    import socket
@@ -97,18 +107,18 @@ def eval_tn_nccl(qibo_circ, datatype, n_samples=8):
    from cupy.cuda import nccl

    # Get the hostname
-    #hostname = socket.gethostname()
-    
+    # hostname = socket.gethostname()
+
    root = 0
    comm_mpi = MPI.COMM_WORLD
    rank = comm_mpi.Get_rank()
    size = comm_mpi.Get_size()
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
    device_id = rank % getDeviceCount()
-    
+
    cp.cuda.Device(device_id).use()
-    
+
    # Set up the NCCL communicator.
    nccl_id = nccl.get_unique_id() if rank == root else None
    nccl_id = comm_mpi.bcast(nccl_id, root)
@@ -116,51 +126,66 @@ def eval_tn_nccl(qibo_circ, datatype, n_samples=8):

    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft convetor",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
    operands = myconvertor.state_vector_operands()
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)

    network = Network(*operands)

    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
-    path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}})
+    path, info = network.contract_path(
+        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+    )

-    #print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
+    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")

    # Select the best path from all ranks.
    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)

-    #if rank == root:
+    # if rank == root:
    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")

    # Broadcast info from the sender to all other ranks.
    info = comm_mpi.bcast(info, sender)

    # Set path and slices.
-    path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices})
+    path, info = network.contract_path(
+        optimize={"path": info.path, "slicing": info.slices}
+    )

    # Calculate this process's share of the slices.
    num_slices = info.num_slices
    chunk, extra = num_slices // size, num_slices % size
    slice_begin = rank * chunk + min(rank, extra)
-    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    slice_end = (
+        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    )
    slices = range(slice_begin, slice_end)

-    #print(f"Process {rank} is processing slice range: {slices}.")
+    # print(f"Process {rank} is processing slice range: {slices}.")

    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)
-    #print(f"Process {rank} result shape is : {result.shape}.")
-    #print(f"Process {rank} result size is : {result.nbytes}.")
+    # print(f"Process {rank} result shape is : {result.shape}.")
+    # print(f"Process {rank} result size is : {result.nbytes}.")

    # Sum the partial contribution from each process on root.
    stream_ptr = cp.cuda.get_current_stream().ptr
-    comm_nccl.reduce(result.data.ptr, result.data.ptr, result.size, nccl.NCCL_FLOAT64, nccl.NCCL_SUM, root, stream_ptr)
-    
+    comm_nccl.reduce(
+        result.data.ptr,
+        result.data.ptr,
+        result.size,
+        nccl.NCCL_FLOAT64,
+        nccl.NCCL_SUM,
+        root,
+        stream_ptr,
+    )
+
    return result, rank

+
 def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8):
    from mpi4py import MPI  # this line initializes MPI
    import socket
@@ -168,18 +193,18 @@ def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8):
    from cupy.cuda import nccl

    # Get the hostname
-    #hostname = socket.gethostname()
-    
+    # hostname = socket.gethostname()
+
    root = 0
    comm_mpi = MPI.COMM_WORLD
    rank = comm_mpi.Get_rank()
    size = comm_mpi.Get_size()
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
    device_id = rank % getDeviceCount()
-    
+
    cp.cuda.Device(device_id).use()
-    
+
    # Set up the NCCL communicator.
    nccl_id = nccl.get_unique_id() if rank == root else None
    nccl_id = comm_mpi.bcast(nccl_id, root)
@@ -187,50 +212,64 @@ def eval_tn_nccl_expectation(qibo_circ, datatype, n_samples=8):

    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft convetor",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
    operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))

-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)

    network = Network(*operands)

    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
-    path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}})
+    path, info = network.contract_path(
+        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+    )

-    #print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
+    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")

    # Select the best path from all ranks.
    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)

-    #if rank == root:
+    # if rank == root:
    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")

    # Broadcast info from the sender to all other ranks.
    info = comm_mpi.bcast(info, sender)

    # Set path and slices.
-    path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices})
+    path, info = network.contract_path(
+        optimize={"path": info.path, "slicing": info.slices}
+    )

    # Calculate this process's share of the slices.
    num_slices = info.num_slices
    chunk, extra = num_slices // size, num_slices % size
    slice_begin = rank * chunk + min(rank, extra)
-    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    slice_end = (
+        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    )
    slices = range(slice_begin, slice_end)

-    #print(f"Process {rank} is processing slice range: {slices}.")
+    # print(f"Process {rank} is processing slice range: {slices}.")

    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)
-    #print(f"Process {rank} result shape is : {result.shape}.")
-    #print(f"Process {rank} result size is : {result.nbytes}.")
+    # print(f"Process {rank} result shape is : {result.shape}.")
+    # print(f"Process {rank} result size is : {result.nbytes}.")

    # Sum the partial contribution from each process on root.
    stream_ptr = cp.cuda.get_current_stream().ptr
-    comm_nccl.reduce(result.data.ptr, result.data.ptr, result.size, nccl.NCCL_FLOAT64, nccl.NCCL_SUM, root, stream_ptr)
-    
+    comm_nccl.reduce(
+        result.data.ptr,
+        result.data.ptr,
+        result.size,
+        nccl.NCCL_FLOAT64,
+        nccl.NCCL_SUM,
+        root,
+        stream_ptr,
+    )
+
    return result, rank


@@ -240,128 +279,144 @@ def eval_tn_MPI_2_expectation(qibo_circ, datatype, n_samples=8):
    from cuquantum import Network

    # Get the hostname
-    #hostname = socket.gethostname()
-    
+    # hostname = socket.gethostname()
+
    root = 0
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
    device_id = rank % getDeviceCount()
-    
-    
+
    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft convetor",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
    operands = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
-    
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
+
    # Broadcast the operand data.
-    #operands = comm.bcast(operands, root)
-        
+    # operands = comm.bcast(operands, root)
+
    # Assign the device for each process.
    device_id = rank % getDeviceCount()
-    
-    #dev = cp.cuda.Device(device_id)
-    #free_mem, total_mem = dev.mem_info
-    #print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
+
+    # dev = cp.cuda.Device(device_id)
+    # free_mem, total_mem = dev.mem_info
+    # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)

    # Create network object.
-    network = Network(*operands, options={'device_id' : device_id})
+    network = Network(*operands, options={"device_id": device_id})

    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
-    path, info = network.contract_path(optimize={'samples': 8, 'slicing': {'min_slices': max(32, size)}})
-    #print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
+    path, info = network.contract_path(
+        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+    )
+    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")

    # Select the best path from all ranks.
    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)

-    #if rank == root:
+    # if rank == root:
    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")

    # Broadcast info from the sender to all other ranks.
    info = comm.bcast(info, sender)

    # Set path and slices.
-    path, info = network.contract_path(optimize={'path': info.path, 'slicing': info.slices})
+    path, info = network.contract_path(
+        optimize={"path": info.path, "slicing": info.slices}
+    )

    # Calculate this process's share of the slices.
    num_slices = info.num_slices
    chunk, extra = num_slices // size, num_slices % size
    slice_begin = rank * chunk + min(rank, extra)
-    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    slice_end = (
+        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    )
    slices = range(slice_begin, slice_end)

-    #print(f"Process {rank} is processing slice range: {slices}.")
+    # print(f"Process {rank} is processing slice range: {slices}.")

    # Contract the group of slices the process is responsible for.
    result = network.contract(slices=slices)
-    #print(f"Process {rank} result shape is : {result.shape}.")
-    #print(f"Process {rank} result size is : {result.nbytes}.")
+    # print(f"Process {rank} result shape is : {result.shape}.")
+    # print(f"Process {rank} result size is : {result.nbytes}.")

    # Sum the partial contribution from each process on root.
    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
-    
+
    return result, rank


 def eval_tn_MPI_expectation(qibo_circ, datatype, n_samples=8):
    from mpi4py import MPI  # this line initializes MPI
    import socket
+
    # Get the hostname
-    #hostname = socket.gethostname()
-    
+    # hostname = socket.gethostname()
+
    ncpu_threads = multiprocessing.cpu_count() // 2
-    
+
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
    device_id = rank % getDeviceCount()
    cp.cuda.Device(device_id).use()

    handle = cutn.create()
    network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft network opts",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft network opts",mem_avail, "rank =",rank)
    cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank)
    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    operands_interleave = myconvertor.expectation_operands(PauliStringGen(qibo_circ.nqubits))
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft convetor",mem_avail, "rank =",rank)
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
+    operands_interleave = myconvertor.expectation_operands(
+        PauliStringGen(qibo_circ.nqubits)
+    )
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)

    # Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object.
    network = cutn.Network(*operands_interleave, options=network_opts)
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank)
-    path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft contract path",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank)
+    path, opt_info = network.contract_path(
+        optimize={
+            "samples": n_samples,
+            "threads": ncpu_threads,
+            "slicing": {"min_slices": max(16, size)},
+        }
+    )
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft contract path",mem_avail, "rank =",rank)
    # Execution: To execute the contraction using the optimal path found previously
-    #print("opt_cost",opt_info.opt_cost, "Process =",rank)
+    # print("opt_cost",opt_info.opt_cost, "Process =",rank)

-    
-    num_slices = opt_info.num_slices#Andy
-    chunk, extra = num_slices // size, num_slices % size#Andy
-    slice_begin = rank * chunk + min(rank, extra)#Andy
-    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy
-    slices = range(slice_begin, slice_end)#Andy
+    num_slices = opt_info.num_slices  # Andy
+    chunk, extra = num_slices // size, num_slices % size  # Andy
+    slice_begin = rank * chunk + min(rank, extra)  # Andy
+    slice_end = (
+        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    )  # Andy
+    slices = range(slice_begin, slice_end)  # Andy
    result = network.contract(slices=slices)
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft contract",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft contract",mem_avail, "rank =",rank)
    cutn.destroy(handle)

    return result, rank

+
 def eval_tn_MPI(qibo_circ, datatype, n_samples=8):
    """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through MPI.
    The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
@@ -371,45 +426,52 @@ def eval_tn_MPI(qibo_circ, datatype, n_samples=8):

    from mpi4py import MPI  # this line initializes MPI
    import socket
+
    # Get the hostname
-    #hostname = socket.gethostname()
-    
+    # hostname = socket.gethostname()
+
    ncpu_threads = multiprocessing.cpu_count() // 2
-    
+
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
    device_id = rank % getDeviceCount()
    cp.cuda.Device(device_id).use()

    handle = cutn.create()
    network_opts = cutn.NetworkOptions(handle=handle, blocking="auto")
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft network opts",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft network opts",mem_avail, "rank =",rank)
    cutn.distributed_reset_configuration(handle, *cutn.get_mpi_comm_pointer(comm))
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft distributed reset config",mem_avail, "rank =",rank)
    # Perform circuit conversion
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft convetor",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
    operands_interleave = myconvertor.state_vector_operands()
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)

    # Pathfinder: To search for the optimal path. Optimal path are assigned to path and info attribute of the network object.
    network = cutn.Network(*operands_interleave, options=network_opts)
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank)
-    network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft contract path",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft cutn.Network(*operands_interleave,",mem_avail, "rank =",rank)
+    network.contract_path(
+        optimize={
+            "samples": n_samples,
+            "threads": ncpu_threads,
+            "slicing": {"min_slices": max(16, size)},
+        }
+    )
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft contract path",mem_avail, "rank =",rank)
    # Execution: To execute the contraction using the optimal path found previously
-    #print("opt_cost",opt_info.opt_cost, "Process =",rank)
+    # print("opt_cost",opt_info.opt_cost, "Process =",rank)

-    '''
+    """
    path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})

    num_slices = opt_info.num_slices#Andy
@@ -418,16 +480,16 @@ def eval_tn_MPI(qibo_circ, datatype, n_samples=8):
    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy
    slices = range(slice_begin, slice_end)#Andy
    result = network.contract(slices=slices)
-    '''
+    """
    result = network.contract()

-    #mem_avail = cp.cuda.Device().mem_info[0]
-    #print("Mem avail: aft contract",mem_avail, "rank =",rank)
+    # mem_avail = cp.cuda.Device().mem_info[0]
+    # print("Mem avail: aft contract",mem_avail, "rank =",rank)
    cutn.destroy(handle)

    return result, rank

-  
+
 def eval_mps(qibo_circ, gate_algo, datatype):
    myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype)
    mps_helper = MPSContractionHelper(myconvertor.num_qubits)
@@ -436,18 +498,18 @@ def eval_mps(qibo_circ, gate_algo, datatype):
        myconvertor.mps_tensors, {"handle": myconvertor.handle}
    )

+
 def PauliStringGen(nqubits):
-    
    if nqubits <= 0:
        return "Invalid input. N should be a positive integer."

-    #characters = 'IXYZ'
-    characters = 'XXXZ'
+    # characters = 'IXYZ'
+    characters = "XXXZ"

-    result = ''
+    result = ""

    for i in range(nqubits):
        char_to_add = characters[i % len(characters)]
        result += char_to_add

-    return result
+    return result