diff --git a/src/qibotn/eval.py b/src/qibotn/eval.py index 245aa5e..23624aa 100644 --- a/src/qibotn/eval.py +++ b/src/qibotn/eval.py @@ -325,25 +325,29 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_sample comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() - - device_id = rank % getDeviceCount() - - # Perform circuit conversion - myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - - operands = myconvertor.expectation_operands( - pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern) - ) - + # Assign the device for each process. device_id = rank % getDeviceCount() + cp.cuda.Device(device_id).use() + # Perform circuit conversion + if rank==0: + myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) + + operands = myconvertor.expectation_operands( + pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern) + ) + else: + operands = None + + operands = comm.bcast(operands, root) + # Create network object. network = Network(*operands, options={"device_id": device_id}) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. path, info = network.contract_path( - optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}} + optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size),"memory_model":cutn.MemoryModel.CUTENSOR}} ) # Select the best path from all ranks. @@ -371,6 +375,9 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_sample # Sum the partial contribution from each process on root. result = comm.reduce(sendobj=result, op=MPI.SUM, root=root) + + del network + mempool.free_all_blocks() return result, rank