From ef565eefc4fb97de65f297a0bf442c34f5f5d997 Mon Sep 17 00:00:00 2001
From: tankya2 <tankya2@ihpc.a-star.edu.sg>
Date: Thu, 4 Jul 2024 13:41:10 +0800
Subject: [PATCH] Add configuration and free memory explicitly

---
 src/qibotn/eval.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/qibotn/eval.py b/src/qibotn/eval.py
index 245aa5e..23624aa 100644
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -325,25 +325,29 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_sample
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     size = comm.Get_size()
-
-    device_id = rank % getDeviceCount()
-
-    # Perform circuit conversion
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-
-    operands = myconvertor.expectation_operands(
-        pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
-    )
-
+    
     # Assign the device for each process.
     device_id = rank % getDeviceCount()
+    cp.cuda.Device(device_id).use()
 
+    # Perform circuit conversion on rank 0 only; the operands are broadcast to the other ranks below.
+    if rank==0:
+        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+
+        operands = myconvertor.expectation_operands(
+            pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
+        )
+    else:
+        operands = None
+    
+    operands = comm.bcast(operands, root)
+   
     # Create network object.
     network = Network(*operands, options={"device_id": device_id})
 
     # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
     path, info = network.contract_path(
-        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size),"memory_model":cutn.MemoryModel.CUTENSOR}}
     )
 
     # Select the best path from all ranks.
@@ -371,6 +375,9 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_sample
 
     # Sum the partial contribution from each process on root.
     result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
+    
+    del network
+    mempool.free_all_blocks()
 
     return result, rank