diff --git a/src/qibotn/MPSUtils.py b/src/qibotn/MPSUtils.py
index fd1b4c7..4f84f67 100644
--- a/src/qibotn/MPSUtils.py
+++ b/src/qibotn/MPSUtils.py
@@ -2,6 +2,8 @@ import cupy as cp
 from cuquantum.cutensornet.experimental import contract_decompose
 from cuquantum import contract
 
+# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
+
 
 def initial(num_qubits, dtype):
     """
diff --git a/src/qibotn/QiboCircuitConvertor.py b/src/qibotn/QiboCircuitConvertor.py
index d3a0569..c59745b 100644
--- a/src/qibotn/QiboCircuitConvertor.py
+++ b/src/qibotn/QiboCircuitConvertor.py
@@ -1,6 +1,8 @@
 import cupy as cp
 import numpy as np
 
+# Reference: https://github.com/NVIDIA/cuQuantum/tree/main/python/samples/cutensornet/circuit_converter
+
 
 class QiboCircuitToEinsum:
     """Convert a circuit to a Tensor Network (TN) representation.
@@ -159,9 +161,7 @@ class QiboCircuitToEinsum:
         return gates
 
     def expectation_operands(self, pauli_string):
-        # assign pauli string to qubit
-        # _get_forward_inverse_metadata()
-        input_bitstring = "0" * self.circuit.nqubits  # Need all qubits!
+        input_bitstring = "0" * self.circuit.nqubits 
 
         input_operands = self._get_bitstring_tensors(input_bitstring)
         pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string))
@@ -185,8 +185,6 @@ class QiboCircuitToEinsum:
 
         next_frontier = max(qubits_frontier.values()) + 1
 
-        # input_mode_labels, input_operands, qubits_frontier, next_frontier, inverse_gates = self._get_forward_inverse_metadata(coned_qubits)
-
         pauli_gates = self.get_pauli_gates(
             pauli_map, dtype=self.dtype, backend=self.backend
         )
@@ -208,18 +206,4 @@ class QiboCircuitToEinsum:
 
         operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
 
-        # expec = contract(*operand_exp_interleave)
-        # print(expec)
-
-        """
-        gate_mode_labels, gate_operands = circ_utils.parse_gates_to_mode_labels_operands(gates, 
-                                                                                         qubits_frontier, 
-                                                                                         next_frontier)
-        
-        mode_labels = input_mode_labels + gate_mode_labels + [[qubits_frontier[ix]] for ix in self.qubits]
-        operands = input_operands + gate_operands + input_operands[:n_qubits]
-
-        output_mode_labels = []
-        expression = circ_utils.convert_mode_labels_to_expression(mode_labels, output_mode_labels)
-        """
         return operand_exp_interleave
diff --git a/src/qibotn/eval.py b/src/qibotn/eval.py
index a9aeaac..b73418a 100644
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -1,7 +1,5 @@
 from qibotn.QiboCircuitConvertor import QiboCircuitToEinsum
 from cuquantum import contract
-from cuquantum import cutensornet as cutn
-import multiprocessing
 from cupy.cuda.runtime import getDeviceCount
 import cupy as cp
 
@@ -10,15 +8,17 @@ from qibotn.mps_contraction_helper import MPSContractionHelper
 
 
 def dense_vector_tn(qibo_circ, datatype):
+    """Convert qibo circuit to tensornet (TN) format and perform contraction to dense vector."""
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
     return contract(*myconvertor.state_vector_operands())
 
 
-def expectation_pauli_tn(qibo_circ, datatype, pauli_string):
+def expectation_pauli_tn(qibo_circ, datatype, pauli_string_pattern):
+    """Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string."""
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
     return contract(
         *myconvertor.expectation_operands(
-            pauli_string_gen(qibo_circ.nqubits, pauli_string)
+            pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
         )
     )
 
@@ -30,54 +30,35 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
     After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
     """
 
-    from mpi4py import MPI  # this line initializes MPI
-    import socket
+    from mpi4py import MPI
     from cuquantum import Network
 
-    # Get the hostname
-    # hostname = socket.gethostname()
-
     root = 0
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     size = comm.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+
     device_id = rank % getDeviceCount()
 
     # Perform circuit conversion
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
-    operands = myconvertor.state_vector_operands()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
 
-    # Broadcast the operand data.
-    # operands = comm.bcast(operands, root)
+    operands = myconvertor.state_vector_operands()
 
     # Assign the device for each process.
     device_id = rank % getDeviceCount()
 
-    # dev = cp.cuda.Device(device_id)
-    # free_mem, total_mem = dev.mem_info
-    # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
-
     # Create network object.
     network = Network(*operands, options={"device_id": device_id})
 
     # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
     path, info = network.contract_path(
-        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
     )
-    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
 
     # Select the best path from all ranks.
     opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
 
-    # if rank == root:
-    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
-
     # Broadcast info from the sender to all other ranks.
     info = comm.bcast(info, sender)
 
@@ -95,45 +76,30 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
     )
     slices = range(slice_begin, slice_end)
 
-    # print(f"Process {rank} is processing slice range: {slices}.")
-
     # Contract the group of slices the process is responsible for.
     result = network.contract(slices=slices)
-    # print(f"Process {rank} result shape is : {result.shape}.")
-    # print(f"Process {rank} result size is : {result.nbytes}.")
 
     # Sum the partial contribution from each process on root.
     result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
 
-    """
-    path, opt_info = network.contract_path(optimize={"samples": n_samples, "threads": ncpu_threads, 'slicing': {'min_slices': max(16, size)}})
-
-    num_slices = opt_info.num_slices#Andy
-    chunk, extra = num_slices // size, num_slices % size#Andy
-    slice_begin = rank * chunk + min(rank, extra)#Andy
-    slice_end = num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)#Andy
-    slices = range(slice_begin, slice_end)#Andy
-    result = network.contract(slices=slices)
-    """
-
     return result, rank
 
 
 def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
-    from mpi4py import MPI  # this line initializes MPI
-    import socket
+    """Convert qibo circuit to tensornet (TN) format and perform contraction using multi node and multi GPU through NCCL.
+    The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
+    The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
+    After pathfinding the optimal path is used in the actual contraction to give a dense vector representation of the TN.
+    """
+    from mpi4py import MPI
     from cuquantum import Network
     from cupy.cuda import nccl
 
-    # Get the hostname
-    # hostname = socket.gethostname()
-
     root = 0
     comm_mpi = MPI.COMM_WORLD
     rank = comm_mpi.Get_rank()
     size = comm_mpi.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+
     device_id = rank % getDeviceCount()
 
     cp.cuda.Device(device_id).use()
@@ -145,27 +111,18 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
 
     # Perform circuit conversion
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
     operands = myconvertor.state_vector_operands()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
 
     network = Network(*operands)
 
     # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
     path, info = network.contract_path(
-        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
     )
 
-    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
-
     # Select the best path from all ranks.
     opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
 
-    # if rank == root:
-    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
-
     # Broadcast info from the sender to all other ranks.
     info = comm_mpi.bcast(info, sender)
 
@@ -183,12 +140,8 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
     )
     slices = range(slice_begin, slice_end)
 
-    # print(f"Process {rank} is processing slice range: {slices}.")
-
     # Contract the group of slices the process is responsible for.
     result = network.contract(slices=slices)
-    # print(f"Process {rank} result shape is : {result.shape}.")
-    # print(f"Process {rank} result size is : {result.nbytes}.")
 
     # Sum the partial contribution from each process on root.
     stream_ptr = cp.cuda.get_current_stream().ptr
@@ -205,21 +158,22 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
     return result, rank
 
 
-def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
-    from mpi4py import MPI  # this line initializes MPI
-    import socket
+def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
+    """Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through NCCL.
+    The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
+    The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system.
+    The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
+    After pathfinding the optimal path is used in the actual contraction to give an expectation value.
+    """
+    from mpi4py import MPI
     from cuquantum import Network
     from cupy.cuda import nccl
 
-    # Get the hostname
-    # hostname = socket.gethostname()
-
     root = 0
     comm_mpi = MPI.COMM_WORLD
     rank = comm_mpi.Get_rank()
     size = comm_mpi.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+
     device_id = rank % getDeviceCount()
 
     cp.cuda.Device(device_id).use()
@@ -231,30 +185,20 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
 
     # Perform circuit conversion
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
     operands = myconvertor.expectation_operands(
-        pauli_string_gen(qibo_circ.nqubits, pauli_string)
+        pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
     )
 
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
-
     network = Network(*operands)
 
     # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
     path, info = network.contract_path(
-        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
     )
 
-    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
-
     # Select the best path from all ranks.
     opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
 
-    # if rank == root:
-    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
-
     # Broadcast info from the sender to all other ranks.
     info = comm_mpi.bcast(info, sender)
 
@@ -272,12 +216,8 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
     )
     slices = range(slice_begin, slice_end)
 
-    # print(f"Process {rank} is processing slice range: {slices}.")
-
     # Contract the group of slices the process is responsible for.
     result = network.contract(slices=slices)
-    # print(f"Process {rank} result shape is : {result.shape}.")
-    # print(f"Process {rank} result size is : {result.nbytes}.")
 
     # Sum the partial contribution from each process on root.
     stream_ptr = cp.cuda.get_current_stream().ptr
@@ -294,57 +234,44 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string, n_samples=8):
     return result, rank
 
 
-def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
+def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
+    """Convert qibo circuit to tensornet (TN) format and perform contraction to expectation of given Pauli string using multi node and multi GPU through MPI.
+    The conversion is performed by QiboCircuitToEinsum(), after which it goes through 2 steps: pathfinder and execution.
+    The pauli_string_pattern is used to generate the pauli string corresponding to the number of qubits of the system.
+    The pathfinder looks at user defined number of samples (n_samples) iteratively to select the least costly contraction path. This is sped up with multi thread.
+    After pathfinding the optimal path is used in the actual contraction to give an expectation value.
+    """
     from mpi4py import MPI  # this line initializes MPI
-    import socket
     from cuquantum import Network
 
-    # Get the hostname
-    # hostname = socket.gethostname()
-
     root = 0
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     size = comm.Get_size()
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: Start",mem_avail, "rank =",rank, "hostname =",hostname)
+
     device_id = rank % getDeviceCount()
 
     # Perform circuit conversion
     myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft convetor",mem_avail, "rank =",rank)
-    operands = myconvertor.expectation_operands(
-        pauli_string_gen(qibo_circ.nqubits, pauli_string)
-    )
-    # mem_avail = cp.cuda.Device().mem_info[0]
-    # print("Mem avail: aft operand interleave",mem_avail, "rank =",rank)
 
-    # Broadcast the operand data.
-    # operands = comm.bcast(operands, root)
+    operands = myconvertor.expectation_operands(
+        pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
+    )
 
     # Assign the device for each process.
     device_id = rank % getDeviceCount()
 
-    # dev = cp.cuda.Device(device_id)
-    # free_mem, total_mem = dev.mem_info
-    # print("Mem free: ",free_mem, "Total mem: ",total_mem, "rank =",rank)
-
     # Create network object.
     network = Network(*operands, options={"device_id": device_id})
 
     # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
     path, info = network.contract_path(
-        optimize={"samples": 8, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
     )
-    # print(f"Process {rank} has the path with the  FLOP count {info.opt_cost}.")
 
     # Select the best path from all ranks.
     opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
 
-    # if rank == root:
-    #    print(f"Process {sender} has the path with the lowest FLOP count {opt_cost}.")
-
     # Broadcast info from the sender to all other ranks.
     info = comm.bcast(info, sender)
 
@@ -362,12 +289,8 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
     )
     slices = range(slice_begin, slice_end)
 
-    # print(f"Process {rank} is processing slice range: {slices}.")
-
     # Contract the group of slices the process is responsible for.
     result = network.contract(slices=slices)
-    # print(f"Process {rank} result shape is : {result.shape}.")
-    # print(f"Process {rank} result size is : {result.nbytes}.")
 
     # Sum the partial contribution from each process on root.
     result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
@@ -376,8 +299,7 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string, n_samples=8):
 
 
 def dense_vector_mps(qibo_circ, gate_algo, datatype):
-    """Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector.
-    """
+    """Convert qibo circuit to matrix product state (MPS) format and perform contraction to dense vector."""
     myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype)
     mps_helper = MPSContractionHelper(myconvertor.num_qubits)
 
@@ -387,7 +309,7 @@ def dense_vector_mps(qibo_circ, gate_algo, datatype):
 
 
 def pauli_string_gen(nqubits, pauli_string_pattern):
-    """ Used internally to generate the string based on given pattern and number of qubit.
+    """Used internally to generate the string based on given pattern and number of qubit.
     Example: pattern: "XZ", number of qubit: 7, output = XZXZXZX
     """
     if nqubits <= 0:
diff --git a/src/qibotn/mps_contraction_helper.py b/src/qibotn/mps_contraction_helper.py
index ee8e4e4..29d5e25 100644
--- a/src/qibotn/mps_contraction_helper.py
+++ b/src/qibotn/mps_contraction_helper.py
@@ -1,5 +1,7 @@
 from cuquantum import contract, contract_path, CircuitToEinsum, tensor
 
+# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
+
 
 class MPSContractionHelper:
     """
@@ -85,7 +87,7 @@ class MPSContractionHelper:
         self, mps_tensors, operator, qubits, options=None, normalize=False
     ):
         """
-        Contract the corresponding tensor network to form the state vector representation of the MPS.
+        Contract the corresponding tensor network to form the expectation of the MPS.
 
         Args:
             mps_tensors: A list of rank-3 ndarray-like tensor objects.