//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux (maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

/*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*!
 @file ExchangeHalo.cpp

 HPCG routine
 */

// Compile this routine only if running with MPI
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#include "Geometry.hpp"
#include <cstdlib>
#include <mpi.h>
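
// Selects the point-to-point communication path taken by ExchangeHaloCpu
// below (MPI_CPU or MPI_CPU_All2allv); defined elsewhere in the benchmark.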
extern p2p_comm_mode_t P2P_Mode;

/*!
  Communicates data that is at the border of the part of the domain assigned to this processor.

  @param[in] A The known system matrix
  @param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
  non-local entries updated by other processors
 */
void ExchangeHalo(const SparseMatrix& A, Vector& x)
{
  local_int_t localNumberOfRows = A.localNumberOfRows;
  int num_neighbors = A.numberOfSendNeighbors;
  local_int_t * receiveLength = A.receiveLength;
  local_int_t * sendLength = A.sendLength;
  int * neighbors = A.neighbors;
  double * sendBuffer = A.sendBuffer;
  local_int_t totalToBeSent = A.totalToBeSent;
  local_int_t * elementsToSend = A.elementsToSend;

  double * const xv = x.values;

  int size, rank; // Number of MPI processes, my process ID
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  //
  // First post receives. These are non-blocking (immediate) receives:
  // we do not wait for the data to arrive here; that is done at the
  // MPI_Wait calls below.
  //

  int MPI_MY_TAG = 99;
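  // A single tag suffices: each neighbor pair exchanges exactly one message
  // per call, and the exchange completes before the routine returns.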

  MPI_Request * request = new MPI_Request[num_neighbors];

  //
  // Externals are at end of locals
  //
  double * x_external = (double *) xv + localNumberOfRows;

  // Post receives first
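  // (Each neighbor's message lands in the next contiguous segment of the
  // external region, in the order the neighbors are listed.)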
  // TODO: Thread this loop
  for (int i = 0; i < num_neighbors; i++) {
    local_int_t n_recv = receiveLength[i];
    MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request+i);
    x_external += n_recv;
  }

  //
  // Fill up send buffer
  //

  // TODO: Thread this loop
  for (local_int_t i = 0; i < totalToBeSent; i++) sendBuffer[i] = xv[elementsToSend[i]];
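
  // elementsToSend lists, neighbor by neighbor, the local indices whose
  // values must be shipped, so sendBuffer ends up partitioned by neighbor
  // in the same order as the sends below.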

  //
  // Send to each neighbor
  //

  // TODO: Thread this loop
  for (int i = 0; i < num_neighbors; i++) {
    local_int_t n_send = sendLength[i];
    MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
    sendBuffer += n_send;
  }

  //
  // Complete the receives issued above
  //

  MPI_Status status;
  // TODO: Thread this loop
  for (int i = 0; i < num_neighbors; i++) {
    if (MPI_Wait(request+i, &status)) {
      std::exit(-1); // TODO: have better error exit
    }
  }

  delete [] request;

  return;
}

/*!
  Communicates data that is at the border of the part of the domain assigned to this processor.
  This is a more optimized version of ExchangeHalo, used on the Grace (CPU) path.

  @param[in] A The known system matrix
  @param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
  non-local entries updated by other processors
  @param[in] use_ibarrier [Experimental] If 1, post an MPI_Ibarrier after the communication completes. Posting the
  barrier once, at the last call to this routine within MG, can improve the performance of the MPI_Allreduce in DDOT.
  Note the barrier is currently compiled out (see the #if 0 block below).
 */
void ExchangeHaloCpu(const SparseMatrix& A, Vector& x, int use_ibarrier)
{
    // Extract Matrix pieces
    local_int_t localNumberOfRows = A.localNumberOfRows;
    int num_neighbors = A.numberOfSendNeighbors;
    local_int_t* receiveLength = A.receiveLength;
    local_int_t* sendLength = A.sendLength;
    int* neighbors = A.neighborsPhysical;
    double* sendBuffer = A.sendBuffer;
    local_int_t totalToBeSent = A.totalToBeSent;
    local_int_t* elementsToSend = A.elementsToSend;

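    // Two communication paths, selected by P2P_Mode: per-neighbor Irecv/Send
    // (MPI_CPU) or a single collective MPI_Alltoallv (MPI_CPU_All2allv).
    // Note this routine addresses neighbors by physical rank
    // (A.neighborsPhysical) rather than the A.neighbors list used above.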
    if (P2P_Mode == MPI_CPU)
    {
        double* const xv = x.values;
        double* x_external = (double*) xv + localNumberOfRows;
        int MPI_MY_TAG = 99;
        MPI_Request* request = new MPI_Request[num_neighbors];

        // Post receives first
        for (int i = 0; i < num_neighbors; i++)
        {
            local_int_t n_recv = receiveLength[i];
            MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
            x_external += n_recv;
        }

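        // Fill up send buffer (same gather as in ExchangeHalo above)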
        for (local_int_t i = 0; i < totalToBeSent; i++)
            sendBuffer[i] = xv[elementsToSend[i]];

        //
        // Send to each neighbor
        //
        for (int i = 0; i < num_neighbors; i++)
        {
            local_int_t n_send = sendLength[i];
            MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
            sendBuffer += n_send;
        }

        //
        // Complete the receives issued above
        //

        MPI_Waitall(num_neighbors, request, MPI_STATUSES_IGNORE);

        // [Experimental] Can improve MPI_Allreduce performance; currently
        // compiled out, so use_ibarrier has no effect.
#if 0
        if (use_ibarrier == 1)
            MPI_Ibarrier(MPI_COMM_WORLD, request);
#endif

        delete[] request;
    }
    else if (P2P_Mode == MPI_CPU_All2allv)
    {
        double* const xv = x.values;
        double* x_external = (double*) xv + localNumberOfRows;
        for (local_int_t i = 0; i < totalToBeSent; i++)
            sendBuffer[i] = xv[elementsToSend[i]];
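        // One collective exchanges all halos at once: A.scounts/A.sdispls give
        // per-rank element counts and offsets into sendBuffer; A.rcounts/A.rdispls
        // do the same for x_external (counts presumably zero for non-neighbors).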
        MPI_Alltoallv(
            sendBuffer, A.scounts, A.sdispls, MPI_DOUBLE, x_external, A.rcounts, A.rdispls, MPI_DOUBLE, MPI_COMM_WORLD);
    }
    return;
}
#endif // ifndef HPCG_NO_MPI