first commit

2026-01-18 20:37:50 +08:00
commit fff9f18287
123 changed files with 1385491 additions and 0 deletions

src/ExchangeHalo.cpp Normal file

@@ -0,0 +1,205 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ExchangeHalo.cpp
HPCG routine
*/
// Compile this routine only if running with MPI
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#include "Geometry.hpp"
#include <cstdlib>
#include <mpi.h>
extern p2p_comm_mode_t P2P_Mode;
/*!
Communicates data that is at the border of the part of the domain assigned to this processor.
@param[in] A The known system matrix
@param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
non-local entries updated by other processors
*/
void ExchangeHalo(const SparseMatrix& A, Vector& x)
{
  local_int_t localNumberOfRows = A.localNumberOfRows;
  int num_neighbors = A.numberOfSendNeighbors;
  local_int_t * receiveLength = A.receiveLength;
  local_int_t * sendLength = A.sendLength;
  int * neighbors = A.neighbors;
  double * sendBuffer = A.sendBuffer;
  local_int_t totalToBeSent = A.totalToBeSent;
  local_int_t * elementsToSend = A.elementsToSend;

  double * const xv = x.values;

  int size, rank; // Number of MPI processes, My process ID
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  //
  // first post receives, these are immediate receives
  // Do not wait for result to come, will do that at the
  // wait call below.
  //

  int MPI_MY_TAG = 99;

  MPI_Request * request = new MPI_Request[num_neighbors];

  //
  // Externals are at end of locals
  //
  double * x_external = (double *) xv + localNumberOfRows;

  // Post receives first
  // TODO: Thread this loop
  for (int i = 0; i < num_neighbors; i++) {
    local_int_t n_recv = receiveLength[i];
    MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request+i);
    x_external += n_recv;
  }

  //
  // Fill up send buffer
  //
  // TODO: Thread this loop
  for (local_int_t i=0; i<totalToBeSent; i++) sendBuffer[i] = xv[elementsToSend[i]];

  //
  // Send to each neighbor
  //
  // TODO: Thread this loop
  for (int i = 0; i < num_neighbors; i++) {
    local_int_t n_send = sendLength[i];
    MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
    sendBuffer += n_send;
  }

  //
  // Complete the reads issued above
  //
  MPI_Status status;
  // TODO: Thread this loop
  for (int i = 0; i < num_neighbors; i++) {
    if ( MPI_Wait(request+i, &status) ) {
      std::exit(-1); // TODO: have better error exit
    }
  }

  delete [] request;

  return;
}
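For orientation, the halo exchange above is the communication step that precedes the local sparse matrix-vector product. Below is a minimal sketch of a typical call site, modeled on the reference HPCG ComputeSPMV_ref; the call site and the SparseMatrix field names (nonzerosInRow, matrixValues, mtxIndL) come from upstream HPCG and are assumptions about this port, not part of this commit:

#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#endif
#include "ComputeSPMV_ref.hpp"

int ComputeSPMV_ref(const SparseMatrix & A, Vector & x, Vector & y) {
#ifndef HPCG_NO_MPI
  ExchangeHalo(A, x); // fill the external (ghost) entries of x before the local multiply
#endif
  const double * const xv = x.values;
  double * const yv = y.values;
  for (local_int_t i = 0; i < A.localNumberOfRows; i++) {
    double sum = 0.0;
    for (int j = 0; j < A.nonzerosInRow[i]; j++)
      sum += A.matrixValues[i][j] * xv[A.mtxIndL[i][j]]; // column indices may point past localNumberOfRows, into the halo region just filled
    yv[i] = sum;
  }
  return 0;
}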
/*!
Communicates data that is at the border of the part of the domain assigned to this processor. An optimized version of ExchangeHalo used for the Grace (CPU) path.
@param[in] A The known system matrix
@param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
non-local entries updated by other processors
@param[in] use_ibarrier [Experimental] If 1, call MPI_Ibarrier after the communication is complete. This is a trick to improve MPI_Allreduce performance in DDOT: the barrier is posted once, at the last call to this routine within MG.
*/
void ExchangeHaloCpu(const SparseMatrix& A, Vector& x, int use_ibarrier)
{
    // Extract Matrix pieces
    local_int_t localNumberOfRows = A.localNumberOfRows;
    int num_neighbors = A.numberOfSendNeighbors;
    local_int_t* receiveLength = A.receiveLength;
    local_int_t* sendLength = A.sendLength;
    int* neighbors = A.neighborsPhysical;
    double* sendBuffer = A.sendBuffer;
    local_int_t totalToBeSent = A.totalToBeSent;
    local_int_t* elementsToSend = A.elementsToSend;

    if (P2P_Mode == MPI_CPU)
    {
        double* const xv = x.values;
        double* x_external = (double*) xv + localNumberOfRows;
        int MPI_MY_TAG = 99;
        MPI_Request* request = new MPI_Request[num_neighbors];

        // Post receives first
        for (int i = 0; i < num_neighbors; i++)
        {
            local_int_t n_recv = receiveLength[i];
            MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
            x_external += n_recv;
        }

        for (local_int_t i = 0; i < totalToBeSent; i++)
            sendBuffer[i] = xv[elementsToSend[i]];

        //
        // Send to each neighbor
        //
        for (int i = 0; i < num_neighbors; i++)
        {
            local_int_t n_send = sendLength[i];
            MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
            sendBuffer += n_send;
        }

        //
        // Complete the reads issued above
        //
        MPI_Waitall(num_neighbors, request, MPI_STATUSES_IGNORE);

        // [Experimental] Can improve MPI_Allreduce performance
#if 0
        if (use_ibarrier == 1)
            MPI_Ibarrier(MPI_COMM_WORLD, request);
#endif
        delete[] request;
    }
    else if (P2P_Mode == MPI_CPU_All2allv)
    {
        double* const xv = x.values;
        double* x_external = (double*) xv + localNumberOfRows;

        for (local_int_t i = 0; i < totalToBeSent; i++)
            sendBuffer[i] = xv[elementsToSend[i]];

        MPI_Alltoallv(
            sendBuffer, A.scounts, A.sdispls, MPI_DOUBLE, x_external, A.rcounts, A.rdispls, MPI_DOUBLE, MPI_COMM_WORLD);
    }

    return;
}
#endif
// ifndef HPCG_NO_MPI
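The MPI_CPU_All2allv branch relies on per-rank count/displacement arrays (A.scounts, A.sdispls, A.rcounts, A.rdispls) that are populated elsewhere during setup and do not appear in this file. Below is a hypothetical sketch of how such arrays could be derived from the neighbor lists and lengths used by the point-to-point path; the helper name SetupAlltoallvCounts and the assumption that neighbors are stored in increasing rank order are illustrative, not taken from this commit:

// Hypothetical setup sketch (not from this commit): non-neighbor ranks get
// zero-length segments, so MPI_Alltoallv moves exactly the halo data.
void SetupAlltoallvCounts(SparseMatrix & A, int comm_size) {
    A.scounts = new int[comm_size]();   // value-initialized to 0
    A.rcounts = new int[comm_size]();
    A.sdispls = new int[comm_size]();
    A.rdispls = new int[comm_size]();

    for (int i = 0; i < A.numberOfSendNeighbors; i++) {
        int nbr = A.neighborsPhysical[i];     // physical rank of neighbor i
        A.scounts[nbr] = A.sendLength[i];     // doubles packed for that rank in sendBuffer
        A.rcounts[nbr] = A.receiveLength[i];  // doubles landing in the external part of x
    }

    // Rank-ordered prefix sums; this matches the neighbor-ordered packing of
    // sendBuffer and x_external only if neighborsPhysical[] is sorted by rank
    // (assumed here).
    for (int r = 1; r < comm_size; r++) {
        A.sdispls[r] = A.sdispls[r - 1] + A.scounts[r - 1];
        A.rdispls[r] = A.rdispls[r - 1] + A.rcounts[r - 1];
    }
}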