first commit
This commit is contained in:
241
src/CG.cpp
Normal file
241
src/CG.cpp
Normal file
@@ -0,0 +1,241 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file CG.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#include <fstream>
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "hpcg.hpp"
|
||||
|
||||
#include "CG.hpp"
|
||||
#include "ComputeDotProduct.hpp"
|
||||
#include "ComputeMG.hpp"
|
||||
#include "ComputeSPMV.hpp"
|
||||
#include "ComputeWAXPBY.hpp"
|
||||
#include "mytimer.hpp"
|
||||
#include <iostream>
|
||||
|
||||
#include "CpuKernels.hpp"
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
extern int use_output_file;
|
||||
|
||||
#define TICKD() t0 = mytimer() //!< record current time in 't0'
|
||||
#define TOCKD(t) t += mytimer() - t0 //!< store time difference in 't' using time in 't0'
|
||||
|
||||
/*!
|
||||
Routine to compute an approximate solution to Ax = b
|
||||
|
||||
@param[in] geom The description of the problem's geometry.
|
||||
@param[inout] A The known system matrix
|
||||
@param[inout] data The data structure with all necessary CG vectors preallocated
|
||||
@param[in] b The known right hand side vector
|
||||
@param[inout] x On entry: the initial guess; on exit: the new approximate solution
|
||||
@param[in] max_iter The maximum number of iterations to perform, even if tolerance is not met.
|
||||
@param[in] tolerance The stopping criterion to assert convergence: if norm of residual is <= to tolerance.
|
||||
@param[out] niters The number of iterations actually performed.
|
||||
@param[out] normr The 2-norm of the residual vector after the last iteration.
|
||||
@param[out] normr0 The 2-norm of the residual vector before the first iteration.
|
||||
@param[out] times The 7-element vector of the timing information accumulated during all of the iterations.
|
||||
@param[in] doPreconditioning The flag to indicate whether the preconditioner should be invoked at each iteration.
|
||||
|
||||
@return Returns zero on success and a non-zero value otherwise.
|
||||
|
||||
@see CG_ref()
|
||||
*/
|
||||
int CG(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
    int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag)
{

    double t_begin = mytimer(); // Start timing right away
    normr = 0.0;
    double rtz = 0.0, oldrtz = 0.0, alpha = 0.0, beta = 0.0, pAp = 0.0;
    // Timing accumulators: t1 = dot products, t2 = WAXPBY, t3 = SPMV,
    // t4 = allreduce time (accumulated inside ComputeDotProduct), t5 = preconditioner.
    // t0 is the scratch variable used by the TICKD/TOCKD macros.
    double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0;
    // #ifndef HPCG_NO_MPI
    // double t6 = 0.0;
    // #endif
    local_int_t nrow = A.localNumberOfRows;
    Vector& r = data.r; // Residual vector
    Vector& z = data.z; // Preconditioned residual vector
    Vector& p = data.p; // Direction vector (in MPI mode ncol>=nrow)
    Vector& Ap = data.Ap; // Holds the sparse matrix-vector product A*p

    // NOTE(review): the else below pairs with the inner if (use_output_file) — that is
    // the intended pairing, but braces around the outer if body would make it explicit.
    if (!doPreconditioning && A.geom->rank == 0)
        if (use_output_file)
        {
            HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }
        else
        {
            std::cout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }

    // Print every iteration; the clamps below are no-ops for the constant 1 but keep
    // the pattern used by the reference code where print_freq may be configured.
    int print_freq = 1;
    if (print_freq > 50)
        print_freq = 50;
    if (print_freq < 1)
        print_freq = 1;

    // p is of length ncols, copy x to p for sparse MV operation
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        CopyVectorD2D(x, p); // device-to-device copy
#endif
        // NOTE(review): if rankType == GPU but USE_CUDA is not defined, x is never
        // copied into p; presumably GPU ranks only exist in CUDA builds — confirm.
    }
    else
    {
        CopyVector(x, p);
    }

    TICKD();
    ComputeSPMV(A, p, Ap);
    TOCKD(t3); // Ap = A*p
    TICKD();
    ComputeWAXPBY(nrow, 1.0, b, -1.0, Ap, r, A.isWaxpbyOptimized, A.rankType);
    TOCKD(t2); // r = b - Ax (x stored in p)
    TICKD();
    ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized, A.rankType);
    TOCKD(t1);
    normr = sqrt(normr); // dot product produced r'*r; take the 2-norm

    // 'flag' gates all residual logging for this call (initial and per-iteration).
    if (A.geom->rank == 0 && flag)
        if (use_output_file)
        {
            HPCG_fout << "Initial Residual = " << normr << std::endl;
        }
        else
        {
            std::cout << "Initial Residual = " << normr << std::endl;
        }

    // Record initial residual for convergence testing
    normr0 = normr;

    // Start iterations. The (1.0 + 1.0e-6) factor gives the convergence test a tiny
    // relative slack so a run that lands negligibly above 'tolerance' still stops;
    // note that CG_ref uses the strict test normr / normr0 > tolerance.
    for (int k = 1; k <= max_iter && normr / normr0 * (1.0 + 1.0e-6) > tolerance; k++)
    {
        TICKD();
        if (doPreconditioning)
        {
            ComputeMG(A, r, z); // Apply preconditioner
            if (A.rankType == GPU)
            {
#ifdef USE_CUDA
                // 'stream' is a project-global CUDA stream (declared elsewhere);
                // wait for the asynchronous MG kernels so t5 measures the full
                // preconditioner cost rather than just the launch time.
                cudaStreamSynchronize(stream);
#endif
            }
        }
        else
        {
            if (A.rankType == GPU)
            {
#ifdef USE_CUDA
                CopyVectorD2D(r, z); // copy r to z (no preconditioning)
#endif
            }
            else
            {
                CopyVector(r, z); // copy r to z (no preconditioning)
            }
        }
        TOCKD(t5); // Preconditioner apply time

        if (k == 1)
        {
            // First iteration: p has no history yet, so p = z (done via WAXPBY with beta = 0).
            TICKD();
            ComputeWAXPBY(nrow, 1.0, z, 0.0, z, p, A.isWaxpbyOptimized, A.rankType);
            TOCKD(t2); // Copy Mr to p
            TICKD();
            ComputeDotProduct(nrow, r, z, rtz, t4, A.isDotProductOptimized, A.rankType);
            TOCKD(t1); // rtz = r'*z
        }
        else
        {
            oldrtz = rtz;
            TICKD();
            ComputeDotProduct(nrow, r, z, rtz, t4, A.isDotProductOptimized, A.rankType);
            TOCKD(t1); // rtz = r'*z
            beta = rtz / oldrtz; // standard CG direction-update coefficient
            TICKD();
            ComputeWAXPBY(nrow, 1.0, z, beta, p, p, A.isWaxpbyOptimized, A.rankType);
            TOCKD(t2); // p = beta*p + z
        }
        TICKD();
        ComputeSPMV(A, p, Ap);
        TOCKD(t3); // Ap = A*p
        TICKD();
        ComputeDotProduct(nrow, p, Ap, pAp, t4, A.isDotProductOptimized, A.rankType);
        TOCKD(t1); // alpha = p'*Ap
        alpha = rtz / pAp; // step length along the search direction p

        TICKD();
        ComputeWAXPBY(nrow, 1.0, x, alpha, p, x, A.isWaxpbyOptimized, A.rankType); // x = x + alpha*p
        ComputeWAXPBY(nrow, 1.0, r, -alpha, Ap, r, A.isWaxpbyOptimized, A.rankType);
        TOCKD(t2); // r = r - alpha*Ap
        TICKD();
        ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized, A.rankType);
        TOCKD(t1);

        normr = sqrt(normr);

        // NOTE(review): as above, the else pairs with the inner if (use_output_file).
        if (flag && A.geom->rank == 0 && (k % print_freq == 0 || k == max_iter))
            if (use_output_file)
            {
                HPCG_fout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
            }
            else
            {
                std::cout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
            }

        niters = k;
    }

    // Store times (accumulated into the caller-provided 7-element array)
    times[1] += t1; // dot-product time
    times[2] += t2; // WAXPBY time
    times[3] += t3; // SPMV time
    times[4] += t4; // AllReduce time
    times[5] += t5; // preconditioner apply time
    // #ifndef HPCG_NO_MPI
    // times[6] += t6; // exchange halo time
    // #endif
    times[0] += mytimer() - t_begin; // Total time. All done...
    return 0;
}
|
||||
55
src/CG.hpp
Normal file
55
src/CG.hpp
Normal file
@@ -0,0 +1,55 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef CG_HPP
|
||||
#define CG_HPP
|
||||
|
||||
#include "CGData.hpp"
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
|
||||
int CG(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
|
||||
int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag);
|
||||
|
||||
// this function will compute the Conjugate Gradient iterations.
|
||||
// geom - Domain and processor topology information
|
||||
// A - Matrix
|
||||
// b - constant
|
||||
// x - used for return value
|
||||
// max_iter - how many times we iterate
|
||||
// tolerance - Stopping tolerance for preconditioned iterations.
|
||||
// niters - number of iterations performed
|
||||
// normr - computed residual norm
|
||||
// normr0 - Original residual
|
||||
// times - array of timing information
|
||||
// doPreconditioning - bool to specify whether or not symmetric GS will be applied.
|
||||
|
||||
#endif // CG_HPP
|
||||
84
src/CGData.hpp
Normal file
84
src/CGData.hpp
Normal file
@@ -0,0 +1,84 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file CGData.hpp
|
||||
|
||||
HPCG data structure
|
||||
*/
|
||||
|
||||
#ifndef CGDATA_HPP
|
||||
#define CGDATA_HPP
|
||||
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
|
||||
struct CGData_STRUCT
{
    // NOTE: members are Vector values (not pointers, despite the historical comments).
    // Sizes below follow InitializeSparseCGData: r and Ap are local-row length; z and p
    // are local-column length so they can hold halo entries in MPI runs.
    Vector r;  //!< residual vector (length = local rows)
    Vector z;  //!< preconditioned residual vector (length = local columns)
    Vector p;  //!< search-direction vector (length = local columns)
    Vector Ap; //!< holds the product A*p (length = local rows)
};
typedef struct CGData_STRUCT CGData;
|
||||
|
||||
/*!
|
||||
Constructor for the data structure of CG vectors.
|
||||
|
||||
@param[in] A the data structure that describes the problem matrix and its structure
|
||||
@param[out] data the data structure for CG vectors that will be allocated to get it ready for use in CG iterations
|
||||
*/
|
||||
/*!
  Allocates the four CG work vectors sized to match the given matrix.

  @param[in]  A    problem matrix whose local dimensions and rank type drive the allocation
  @param[out] data CG vector bundle made ready for use in CG iterations
*/
inline void InitializeSparseCGData(SparseMatrix& A, CGData& data)
{
    const local_int_t local_rows = A.localNumberOfRows;
    const local_int_t local_cols = A.localNumberOfColumns;

    // r and Ap only ever hold locally-owned entries, so they are sized by rows.
    InitializeVector(data.r, local_rows, A.rankType);
    InitializeVector(data.Ap, local_rows, A.rankType);

    // z and p are sized by columns (>= rows in MPI mode, to accommodate halo entries).
    // The trailing 'true' is an extra allocation option that takes effect only when
    // the rank type is GPU.
    InitializeVector(data.z, local_cols, A.rankType, true);
    InitializeVector(data.p, local_cols, A.rankType, true);
}
|
||||
|
||||
/*!
|
||||
Destructor for the CG vectors data.
|
||||
|
||||
@param[inout] data the CG vectors data structure whose storage is deallocated
|
||||
*/
|
||||
/*!
  Releases the storage held by all four CG work vectors.

  @param[inout] data CG vector bundle to deallocate; must be re-initialized before reuse
*/
inline void DeleteCGData(CGData& data)
{
    // The vectors are independent, so release them in any order.
    DeleteVector(data.Ap);
    DeleteVector(data.p);
    DeleteVector(data.z);
    DeleteVector(data.r);
}
|
||||
|
||||
#endif // CGDATA_HPP
|
||||
198
src/CG_ref.cpp
Normal file
198
src/CG_ref.cpp
Normal file
@@ -0,0 +1,198 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file CG_ref.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#include <cmath>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include "hpcg.hpp"
|
||||
|
||||
#include "CG_ref.hpp"
|
||||
#include "ComputeDotProduct_ref.hpp"
|
||||
#include "ComputeMG_ref.hpp"
|
||||
#include "ComputeSPMV_ref.hpp"
|
||||
#include "ComputeWAXPBY_ref.hpp"
|
||||
#include "mytimer.hpp"
|
||||
|
||||
extern int use_output_file;
|
||||
|
||||
// Use TICK and TOCK to time a code section in MATLAB-like fashion
|
||||
#define TICK() t0 = mytimer() //!< record current time in 't0'
|
||||
#define TOCK(t) t += mytimer() - t0 //!< store time difference in 't' using time in 't0'
|
||||
|
||||
/*!
|
||||
Reference routine to compute an approximate solution to Ax = b
|
||||
|
||||
@param[inout] A The known system matrix
|
||||
@param[inout] data The data structure with all necessary CG vectors preallocated
|
||||
@param[in] b The known right hand side vector
|
||||
@param[inout] x On entry: the initial guess; on exit: the new approximate solution
|
||||
@param[in] max_iter The maximum number of iterations to perform, even if tolerance is not met.
|
||||
@param[in] tolerance The stopping criterion to assert convergence: if norm of residual is <= to tolerance.
|
||||
@param[out] niters The number of iterations actually performed.
|
||||
@param[out] normr The 2-norm of the residual vector after the last iteration.
|
||||
@param[out] normr0 The 2-norm of the residual vector before the first iteration.
|
||||
@param[out] times The 7-element vector of the timing information accumulated during all of the iterations.
|
||||
@param[in] doPreconditioning The flag to indicate whether the preconditioner should be invoked at each iteration.
|
||||
|
||||
@return Returns zero on success and a non-zero value otherwise.
|
||||
|
||||
@see CG()
|
||||
*/
|
||||
|
||||
int CG_ref(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
|
||||
int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag)
|
||||
{
|
||||
|
||||
double t_begin = mytimer(); // Start timing right away
|
||||
normr = 0.0;
|
||||
double rtz = 0.0, oldrtz = 0.0, alpha = 0.0, beta = 0.0, pAp = 0.0;
|
||||
|
||||
double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0;
|
||||
// #ifndef HPCG_NO_MPI
|
||||
// double t6 = 0.0;
|
||||
// #endif
|
||||
|
||||
local_int_t nrow = A.localNumberOfRows;
|
||||
|
||||
Vector& r = data.r; // Residual vector
|
||||
Vector& z = data.z; // Preconditioned residual vector
|
||||
Vector& p = data.p; // Direction vector (in MPI mode ncol>=nrow)
|
||||
Vector& Ap = data.Ap;
|
||||
|
||||
if (!doPreconditioning && A.geom->rank == 0)
|
||||
if (use_output_file)
|
||||
{
|
||||
HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
|
||||
}
|
||||
|
||||
#if 1
|
||||
// def HPCG_DEBUG
|
||||
int print_freq = 1;
|
||||
if (print_freq > 50)
|
||||
print_freq = 50;
|
||||
if (print_freq < 1)
|
||||
print_freq = 1;
|
||||
#endif
|
||||
// p is of length ncols, copy x to p for sparse MV operation
|
||||
CopyVector(x, p);
|
||||
TICK();
|
||||
ComputeSPMV_ref(A, p, Ap);
|
||||
TOCK(t3); // Ap = A*p
|
||||
TICK();
|
||||
ComputeWAXPBY_ref(nrow, 1.0, b, -1.0, Ap, r);
|
||||
TOCK(t2); // r = b - Ax (x stored in p)
|
||||
TICK();
|
||||
ComputeDotProduct_ref(nrow, r, r, normr, t4);
|
||||
TOCK(t1);
|
||||
normr = sqrt(normr);
|
||||
#if 1
|
||||
// def HPCG_DEBUG
|
||||
if (A.geom->rank == 0 && flag)
|
||||
if (use_output_file)
|
||||
{
|
||||
HPCG_fout << "Initial Residual = " << normr << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "Initial Residual = " << normr << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Record initial residual for convergence testing
|
||||
normr0 = normr;
|
||||
|
||||
// Start iterations
|
||||
|
||||
for (int k = 1; k <= max_iter && normr / normr0 > tolerance; k++)
|
||||
{
|
||||
TICK();
|
||||
if (doPreconditioning)
|
||||
ComputeMG_ref(A, r, z); // Apply preconditioner
|
||||
else
|
||||
ComputeWAXPBY_ref(nrow, 1.0, r, 0.0, r, z); // copy r to z (no preconditioning)
|
||||
TOCK(t5); // Preconditioner apply time
|
||||
|
||||
if (k == 1)
|
||||
{
|
||||
CopyVector(z, p);
|
||||
TOCK(t2); // Copy Mr to p
|
||||
TICK();
|
||||
ComputeDotProduct_ref(nrow, r, z, rtz, t4);
|
||||
TOCK(t1); // rtz = r'*z
|
||||
}
|
||||
else
|
||||
{
|
||||
oldrtz = rtz;
|
||||
TICK();
|
||||
ComputeDotProduct_ref(nrow, r, z, rtz, t4);
|
||||
TOCK(t1); // rtz = r'*z
|
||||
beta = rtz / oldrtz;
|
||||
TICK();
|
||||
ComputeWAXPBY_ref(nrow, 1.0, z, beta, p, p);
|
||||
TOCK(t2); // p = beta*p + z
|
||||
}
|
||||
|
||||
TICK();
|
||||
ComputeSPMV_ref(A, p, Ap);
|
||||
TOCK(t3); // Ap = A*p
|
||||
TICK();
|
||||
ComputeDotProduct_ref(nrow, p, Ap, pAp, t4);
|
||||
TOCK(t1); // alpha = p'*Ap
|
||||
alpha = rtz / pAp;
|
||||
TICK();
|
||||
ComputeWAXPBY_ref(nrow, 1.0, x, alpha, p, x); // x = x + alpha*p
|
||||
ComputeWAXPBY_ref(nrow, 1.0, r, -alpha, Ap, r);
|
||||
TOCK(t2); // r = r - alpha*Ap
|
||||
TICK();
|
||||
ComputeDotProduct_ref(nrow, r, r, normr, t4);
|
||||
TOCK(t1);
|
||||
normr = sqrt(normr);
|
||||
#if 1
|
||||
// def HPCG_DEBUG
|
||||
if (flag && A.geom->rank == 0 && (k % print_freq == 0 || k == max_iter))
|
||||
if (use_output_file)
|
||||
{
|
||||
HPCG_fout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
|
||||
}
|
||||
#endif
|
||||
niters = k;
|
||||
}
|
||||
|
||||
// Store times
|
||||
times[1] += t1; // dot product time
|
||||
times[2] += t2; // WAXPBY time
|
||||
times[3] += t3; // SPMV time
|
||||
times[4] += t4; // AllReduce time
|
||||
times[5] += t5; // preconditioner apply time
|
||||
// #ifndef HPCG_NO_MPI
|
||||
// times[6] += t6; // exchange halo time
|
||||
// #endif
|
||||
times[0] += mytimer() - t_begin; // Total time. All done...
|
||||
return 0;
|
||||
}
|
||||
42
src/CG_ref.hpp
Normal file
42
src/CG_ref.hpp
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef CG_REF_HPP
|
||||
#define CG_REF_HPP
|
||||
|
||||
#include "CGData.hpp"
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
|
||||
// The use of CPU and GPU Sparse Matrix is intended to resolve
|
||||
// the linked list structures for MG coarse levels
|
||||
// There is no change to the reference code
|
||||
|
||||
int CG_ref(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
|
||||
int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag);
|
||||
|
||||
// this function will compute the Conjugate Gradient iterations.
|
||||
// geom - Domain and processor topology information
|
||||
// A - Matrix
|
||||
// b - constant
|
||||
// x - used for return value
|
||||
// max_iter - how many times we iterate
|
||||
// tolerance - Stopping tolerance for preconditioned iterations.
|
||||
// niters - number of iterations performed
|
||||
// normr - computed residual norm
|
||||
// normr0 - Original residual
|
||||
// times - array of timing information
|
||||
// doPreconditioning - bool to specify whether or not symmetric GS will be applied.
|
||||
|
||||
#endif // CG_REF_HPP
|
||||
84
src/CheckAspectRatio.cpp
Normal file
84
src/CheckAspectRatio.cpp
Normal file
@@ -0,0 +1,84 @@
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file CheckAspectRatio.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
#include "hpcg.hpp"
|
||||
|
||||
#include "CheckAspectRatio.hpp"
|
||||
|
||||
extern int use_output_file;
|
||||
|
||||
int CheckAspectRatio(double smallest_ratio, int x, int y, int z, const char* what, bool DoIo)
|
||||
{
|
||||
double current_ratio = std::min(std::min(x, y), z) / double(std::max(std::max(x, y), z));
|
||||
|
||||
if (current_ratio < smallest_ratio)
|
||||
{ // ratio of the smallest to the largest
|
||||
if (DoIo)
|
||||
{
|
||||
if (use_output_file)
|
||||
{
|
||||
HPCG_fout << "The " << what << " sizes (" << x << "," << y << "," << z
|
||||
<< ") are invalid because the ratio min(x,y,z)/max(x,y,z)=" << current_ratio
|
||||
<< " is too small (at least " << smallest_ratio << " is required)." << std::endl;
|
||||
HPCG_fout << "The shape should resemble a 3D cube. Please adjust and try again." << std::endl;
|
||||
HPCG_fout.flush();
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "The " << what << " sizes (" << x << "," << y << "," << z
|
||||
<< ") are invalid because the ratio min(x,y,z)/max(x,y,z)=" << current_ratio
|
||||
<< " is too small (at least " << smallest_ratio << " is required)." << std::endl;
|
||||
std::cout << "The shape should resemble a 3D cube. Please adjust and try again." << std::endl
|
||||
<< std::flush;
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
MPI_Abort(MPI_COMM_WORLD, 127);
|
||||
#endif
|
||||
|
||||
return 127;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
18
src/CheckAspectRatio.hpp
Normal file
18
src/CheckAspectRatio.hpp
Normal file
@@ -0,0 +1,18 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef CHECKASPECTRATIO_HPP
|
||||
#define CHECKASPECTRATIO_HPP
|
||||
extern int CheckAspectRatio(double smallest_ratio, int x, int y, int z, const char* what, bool DoIo);
|
||||
#endif // CHECKASPECTRATIO_HPP
|
||||
192
src/CheckProblem.cpp
Normal file
192
src/CheckProblem.cpp
Normal file
@@ -0,0 +1,192 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file CheckProblem.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG)
|
||||
#include <fstream>
|
||||
using std::endl;
|
||||
#include "hpcg.hpp"
|
||||
#endif
|
||||
#include <cassert>
|
||||
|
||||
#include "CheckProblem.hpp"
|
||||
|
||||
/*!
|
||||
Check the contents of the generated sparse matrix to see if values match expected contents.
|
||||
|
||||
@param[in] A The known system matrix
|
||||
@param[inout] b The newly allocated and generated right hand side vector (if b!=0 on entry)
|
||||
@param[inout] x The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
|
||||
@param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if the xexact!=0
|
||||
non-zero on entry)
|
||||
|
||||
@see GenerateGeometry
|
||||
*/
|
||||
|
||||
void CheckProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{

    // Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0; // global index offsets of this rank's subblock
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;

    local_int_t localNumberOfRows = nx * ny * nz; // This is the size of our subblock
    global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh

    // Raw value arrays; left null when the corresponding vector was not requested,
    // and the matching asserts below are skipped.
    double* bv = 0;
    double* xv = 0;
    double* xexactv = 0;
    if (b != 0)
        bv = b->values; // Only compute exact solution if requested
    if (x != 0)
        xv = x->values; // Only compute exact solution if requested
    if (xexact != 0)
        xexactv = xexact->values; // Only compute exact solution if requested

    local_int_t localNumberOfNonzeros = 0;
    // TODO: This triply nested loop could be flattened or use nested parallelism
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t iz = 0; iz < nz; iz++)
    {
        global_int_t giz = giz0 + iz;
        for (local_int_t iy = 0; iy < ny; iy++)
        {
            global_int_t giy = giy0 + iy;
            for (local_int_t ix = 0; ix < nx; ix++)
            {
                global_int_t gix = gix0 + ix;
                // Local rows are numbered in x-fastest order; the global row uses the
                // same ordering over the full grid. The local->global map must agree.
                local_int_t currentLocalRow = iz * nx * ny + iy * nx + ix;
                global_int_t currentGlobalRow = giz * gnx * gny + giy * gnx + gix;
                assert(A.localToGlobalMap[currentLocalRow] == currentGlobalRow);
#ifdef HPCG_DETAILED_DEBUG
                HPCG_fout << " rank, globalRow, localRow = " << A.geom->rank << " " << currentGlobalRow << " "
                          << A.globalToLocalMap.find(currentGlobalRow)->second << endl;
#endif
                // A row has at most 27 stencil entries, so 'char' cannot overflow.
                char numberOfNonzerosInRow = 0;
                double* currentValuePointer
                    = A.matrixValues[currentLocalRow]; // Pointer to current value in current row
                global_int_t* currentIndexPointerG
                    = A.mtxIndG[currentLocalRow]; // Pointer to current index in current row
                // Walk the 27-point stencil (the 3x3x3 neighborhood of the grid point),
                // skipping neighbors that fall outside the global grid.
                for (int sz = -1; sz <= 1; sz++)
                {
                    if (giz + sz > -1 && giz + sz < gnz)
                    {
                        for (int sy = -1; sy <= 1; sy++)
                        {
                            if (giy + sy > -1 && giy + sy < gny)
                            {
                                for (int sx = -1; sx <= 1; sx++)
                                {
                                    if (gix + sx > -1 && gix + sx < gnx)
                                    {
                                        global_int_t curcol = currentGlobalRow + sz * gnx * gny + sy * gnx + sx;
                                        if (curcol == currentGlobalRow)
                                        {
                                            // Diagonal entry: must be 26.0 and matrixDiagonal
                                            // must point at this exact storage location.
                                            assert(A.matrixDiagonal[currentLocalRow] == currentValuePointer);
                                            assert(*currentValuePointer++ == 26.0);
                                        }
                                        else
                                        {
                                            // Every off-diagonal stencil entry is -1.0.
                                            assert(*currentValuePointer++ == -1.0);
                                        }
                                        assert(*currentIndexPointerG++ == curcol);
                                        numberOfNonzerosInRow++;
                                    } // end x bounds test
                                } // end sx loop
                            } // end y bounds test
                        } // end sy loop
                    } // end z bounds test
                } // end sz loop
                assert(A.nonzerosInRow[currentLocalRow] == numberOfNonzerosInRow);
#ifndef HPCG_NO_OPENMP
#pragma omp critical
#endif
                localNumberOfNonzeros += numberOfNonzerosInRow; // Protect this with an atomic
                // b is defined so that the exact solution is the all-ones vector:
                // b_i = 26 - (number of off-diagonal entries in row i).
                if (b != 0)
                    assert(bv[currentLocalRow] == 26.0 - ((double) (numberOfNonzerosInRow - 1)));
                if (x != 0)
                    assert(xv[currentLocalRow] == 0.0);
                if (xexact != 0)
                    assert(xexactv[currentLocalRow] == 1.0);
            } // end ix loop
        } // end iy loop
    } // end iz loop
#ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
              << endl
              << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
              << " nonzeros." << endl;
#endif

    global_int_t totalNumberOfNonzeros = 0;
#ifndef HPCG_NO_MPI
    // Use MPI's reduce function to sum all nonzeros
#ifdef HPCG_NO_LONG_LONG
    MPI_Allreduce(&localNumberOfNonzeros, &totalNumberOfNonzeros, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
#else
    long long lnnz = localNumberOfNonzeros, gnnz = 0; // convert to 64 bit for MPI call
    MPI_Allreduce(&lnnz, &gnnz, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
    totalNumberOfNonzeros = gnnz; // Copy back
#endif
#else
    totalNumberOfNonzeros = localNumberOfNonzeros;
#endif

    // Cross-check the matrix metadata against the values recomputed here.
    assert(A.totalNumberOfRows == totalNumberOfRows);
    assert(A.totalNumberOfNonzeros == totalNumberOfNonzeros);
    assert(A.localNumberOfRows == localNumberOfRows);
    assert(A.localNumberOfNonzeros == localNumberOfNonzeros);

    return;
}
|
||||
21
src/CheckProblem.hpp
Normal file
21
src/CheckProblem.hpp
Normal file
@@ -0,0 +1,21 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef CHECKPROBLEM_HPP
|
||||
#define CHECKPROBLEM_HPP
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
|
||||
void CheckProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
|
||||
#endif // CHECKPROBLEM_HPP
|
||||
114
src/ComputeDotProduct.cpp
Normal file
114
src/ComputeDotProduct.cpp
Normal file
@@ -0,0 +1,114 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file ComputeDotProduct.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include "mytimer.hpp"
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
#include "ComputeDotProduct.hpp"
|
||||
#include "ComputeDotProduct_ref.hpp"
|
||||
#ifdef USE_CUDA
|
||||
#include "Cuda.hpp"
|
||||
#define CHECK_CUBLAS(x) \
|
||||
do \
|
||||
{ \
|
||||
cublasStatus_t cublasStatus = (x); \
|
||||
if (cublasStatus != CUBLAS_STATUS_SUCCESS) \
|
||||
{ \
|
||||
fprintf(stderr, "CUBLAS: %s = %d at (%s:%d)\n", #x, cublasStatus, __FILE__, __LINE__); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#ifdef USE_GRACE
|
||||
#include "CpuKernels.hpp"
|
||||
#endif
|
||||
|
||||
/*!
|
||||
Routine to compute the dot product of two vectors.
|
||||
|
||||
This routine calls the reference dot-product implementation by default, but
|
||||
can be replaced by a custom routine that is optimized and better suited for
|
||||
the target system.
|
||||
|
||||
@param[in] n the number of vector elements (on this processor)
|
||||
@param[in] x, y the input vectors
|
||||
@param[out] result a pointer to scalar value, on exit will contain the result.
|
||||
@param[out] time_allreduce the time it took to perform the communication between processes
|
||||
@param[out] isOptimized should be set to false if this routine uses the reference implementation (is not optimized);
|
||||
otherwise leave it unchanged
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@see ComputeDotProduct_ref
|
||||
*/
|
||||
|
||||
int ComputeDotProduct(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce,
|
||||
bool& isOptimized, rank_type_t rt)
|
||||
{
|
||||
|
||||
double local_result = 0.0;
|
||||
if (rt == GPU)
|
||||
{
|
||||
#ifdef USE_CUDA
|
||||
cublasStatus_t t = cublasDdot(cublashandle, n, x.values_d, 1, y.values_d, 1, &local_result);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef USE_GRACE
|
||||
// Consider replacing with NVPL BLAS dot product
|
||||
ComputeDotProductCpu(n, x, y, local_result, isOptimized);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
// Use MPI's reduce function to collect all partial sums
|
||||
double t0 = mytimer();
|
||||
double global_result = 0.0;
|
||||
MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
result = global_result;
|
||||
t0 = mytimer() - t0;
|
||||
time_allreduce += t0;
|
||||
#else
|
||||
time_allreduce += 0.0;
|
||||
result = local_result;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
39
src/ComputeDotProduct.hpp
Normal file
39
src/ComputeDotProduct.hpp
Normal file
@@ -0,0 +1,39 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef COMPUTEDOTPRODUCT_HPP
|
||||
#define COMPUTEDOTPRODUCT_HPP
|
||||
#include "Vector.hpp"
|
||||
|
||||
int ComputeDotProduct(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce,
|
||||
bool& isOptimized, rank_type_t rt);
|
||||
|
||||
#endif // COMPUTEDOTPRODUCT_HPP
|
||||
84
src/ComputeDotProduct_ref.cpp
Normal file
84
src/ComputeDotProduct_ref.cpp
Normal file
@@ -0,0 +1,84 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file ComputeDotProduct_ref.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include "mytimer.hpp"
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
#include "ComputeDotProduct_ref.hpp"
|
||||
#include <cassert>
|
||||
|
||||
/*!
|
||||
Routine to compute the dot product of two vectors where:
|
||||
|
||||
This is the reference dot-product implementation. It _CANNOT_ be modified for the
|
||||
purposes of this benchmark.
|
||||
|
||||
@param[in] n the number of vector elements (on this processor)
|
||||
@param[in] x, y the input vectors
|
||||
@param[in] result a pointer to scalar value, on exit will contain result.
|
||||
@param[out] time_allreduce the time it took to perform the communication between processes
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@see ComputeDotProduct
|
||||
*/
|
||||
int ComputeDotProduct_ref(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce)
|
||||
{
|
||||
assert(x.localLength >= n); // Test vector lengths
|
||||
assert(y.localLength >= n);
|
||||
|
||||
double local_result = 0.0;
|
||||
double* xv = x.values;
|
||||
double* yv = y.values;
|
||||
if (yv == xv)
|
||||
{
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#pragma omp parallel for reduction(+ : local_result)
|
||||
#endif
|
||||
for (local_int_t i = 0; i < n; i++)
|
||||
local_result += xv[i] * xv[i];
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#pragma omp parallel for reduction(+ : local_result)
|
||||
#endif
|
||||
for (local_int_t i = 0; i < n; i++)
|
||||
local_result += xv[i] * yv[i];
|
||||
}
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
// Use MPI's reduce function to collect all partial sums
|
||||
double t0 = mytimer();
|
||||
double global_result = 0.0;
|
||||
MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
result = global_result;
|
||||
time_allreduce += mytimer() - t0;
|
||||
#else
|
||||
time_allreduce += 0.0;
|
||||
result = local_result;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
21
src/ComputeDotProduct_ref.hpp
Normal file
21
src/ComputeDotProduct_ref.hpp
Normal file
@@ -0,0 +1,21 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef COMPUTEDOTPRODUCT_REF_HPP
|
||||
#define COMPUTEDOTPRODUCT_REF_HPP
|
||||
#include "Vector.hpp"
|
||||
int ComputeDotProduct_ref(
|
||||
const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce);
|
||||
|
||||
#endif // COMPUTEDOTPRODUCT_REF_HPP
|
||||
96
src/ComputeMG.cpp
Normal file
96
src/ComputeMG.cpp
Normal file
@@ -0,0 +1,96 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file ComputeMG.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#include "ComputeMG.hpp"
|
||||
#include "ComputeProlongation.hpp"
|
||||
#include "ComputeRestriction.hpp"
|
||||
#include "ComputeSYMGS.hpp"
|
||||
#include "CudaKernels.hpp"
|
||||
|
||||
/*!
|
||||
@param[in] A the known system matrix
|
||||
@param[in] r the input vector
|
||||
@param[inout] x On exit contains the result of the multigrid V-cycle with r as the RHS, x is the approximation to Ax =
|
||||
r.
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@see ComputeMG_ref
|
||||
*/
|
||||
|
||||
int ComputeMG(const SparseMatrix& A, const Vector& r, Vector& x)
|
||||
{
|
||||
int ierr = 0;
|
||||
if (A.mgData != 0)
|
||||
{ // Go to next coarse level if defined
|
||||
ComputeSYMGS(A, r, x, 1);
|
||||
if (A.rankType == GPU)
|
||||
{
|
||||
#ifdef USE_CUDA
|
||||
ComputeRestrictionCuda(A, r);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef USE_GRACE
|
||||
ComputeRestriction(A, r);
|
||||
#endif
|
||||
}
|
||||
|
||||
ierr = ComputeMG(*A.Ac, *A.mgData->rc, *A.mgData->xc);
|
||||
|
||||
if (A.rankType == GPU)
|
||||
{
|
||||
#ifdef USE_CUDA
|
||||
ComputeProlongationCuda(A, x);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef USE_GRACE
|
||||
ComputeProlongation(A, x);
|
||||
#endif
|
||||
}
|
||||
|
||||
ComputeSYMGS(A, r, x, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
ComputeSYMGS(A, r, x, 1);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
22
src/ComputeMG.hpp
Normal file
22
src/ComputeMG.hpp
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef COMPUTEMG_HPP
|
||||
#define COMPUTEMG_HPP
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
|
||||
int ComputeMG(const SparseMatrix& A, const Vector& r, Vector& x);
|
||||
|
||||
#endif // COMPUTEMG_HPP
|
||||
81
src/ComputeMG_ref.cpp
Normal file
81
src/ComputeMG_ref.cpp
Normal file
@@ -0,0 +1,81 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file ComputeSYMGS_ref.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#include "ComputeMG_ref.hpp"
|
||||
#include "ComputeProlongation_ref.hpp"
|
||||
#include "ComputeRestriction_ref.hpp"
|
||||
#include "ComputeSPMV_ref.hpp"
|
||||
#include "ComputeSYMGS_ref.hpp"
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
/*!
|
||||
|
||||
@param[in] A the known system matrix
|
||||
@param[in] r the input vector
|
||||
@param[inout] x On exit contains the result of the multigrid V-cycle with r as the RHS, x is the approximation to Ax =
|
||||
r.
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@see ComputeMG
|
||||
*/
|
||||
|
||||
int ComputeMG_ref(const SparseMatrix& A, const Vector& r, Vector& x)
|
||||
{
|
||||
assert(x.localLength == A.localNumberOfColumns); // Make sure x contain space for halo values
|
||||
|
||||
ZeroVector(x); // initialize x to zero
|
||||
|
||||
int ierr = 0;
|
||||
if (A.mgData != 0)
|
||||
{ // Go to next coarse level if defined
|
||||
int numberOfPresmootherSteps = A.mgData->numberOfPresmootherSteps;
|
||||
for (int i = 0; i < numberOfPresmootherSteps; ++i)
|
||||
ierr += ComputeSYMGS_ref(A, r, x);
|
||||
if (ierr != 0)
|
||||
return ierr;
|
||||
ierr = ComputeSPMV_ref(A, x, *A.mgData->Axf);
|
||||
if (ierr != 0)
|
||||
return ierr;
|
||||
// Perform restriction operation using simple injection
|
||||
ierr = ComputeRestriction_ref(A, r);
|
||||
if (ierr != 0)
|
||||
return ierr;
|
||||
ierr = ComputeMG_ref(*A.Ac, *A.mgData->rc, *A.mgData->xc);
|
||||
if (ierr != 0)
|
||||
return ierr;
|
||||
ierr = ComputeProlongation_ref(A, x);
|
||||
if (ierr != 0)
|
||||
return ierr;
|
||||
int numberOfPostsmootherSteps = A.mgData->numberOfPostsmootherSteps;
|
||||
for (int i = 0; i < numberOfPostsmootherSteps; ++i)
|
||||
ierr += ComputeSYMGS_ref(A, r, x);
|
||||
if (ierr != 0)
|
||||
return ierr;
|
||||
}
|
||||
else
|
||||
{
|
||||
ierr = ComputeSYMGS_ref(A, r, x);
|
||||
if (ierr != 0)
|
||||
return ierr;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
26
src/ComputeMG_ref.hpp
Normal file
26
src/ComputeMG_ref.hpp
Normal file
@@ -0,0 +1,26 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef COMPUTEMG_REF_HPP
|
||||
#define COMPUTEMG_REF_HPP
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
|
||||
// The use of CPU and GPU Sparse Matrix is intended to resolve
|
||||
// the linked list structures for MG coarse levels (A->Ac)
|
||||
// There is no change of th erefernce code
|
||||
|
||||
int ComputeMG_ref(const SparseMatrix& A, const Vector& r, Vector& x);
|
||||
|
||||
#endif // COMPUTEMG_REF_HPP
|
||||
175
src/ComputeOptimalShapeXYZ.cpp
Normal file
175
src/ComputeOptimalShapeXYZ.cpp
Normal file
@@ -0,0 +1,175 @@
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
|
||||
#ifdef HPCG_CUBIC_RADICAL_SEARCH
|
||||
#include <algorithm>
|
||||
#endif
|
||||
#include <map>
|
||||
|
||||
#include "ComputeOptimalShapeXYZ.hpp"
|
||||
#include "MixedBaseCounter.hpp"
|
||||
|
||||
#ifdef HPCG_CUBIC_RADICAL_SEARCH
|
||||
static int min3(int a, int b, int c)
|
||||
{
|
||||
return std::min(a, std::min(b, c));
|
||||
}
|
||||
|
||||
static int max3(int a, int b, int c)
|
||||
{
|
||||
return std::max(a, std::max(b, c));
|
||||
}
|
||||
|
||||
static void cubic_radical_search(int n, int& x, int& y, int& z)
|
||||
{
|
||||
double best = 0.0;
|
||||
|
||||
for (int f1 = (int) (pow(n, 1.0 / 3.0) + 0.5); f1 > 0; --f1)
|
||||
if (n % f1 == 0)
|
||||
{
|
||||
int n1 = n / f1;
|
||||
for (int f2 = (int) (pow(n1, 0.5) + 0.5); f2 > 0; --f2)
|
||||
if (n1 % f2 == 0)
|
||||
{
|
||||
int f3 = n1 / f2;
|
||||
double current = (double) min3(f1, f2, f3) / max3(f1, f2, f3);
|
||||
if (current > best)
|
||||
{
|
||||
best = current;
|
||||
x = f1;
|
||||
y = f2;
|
||||
z = f3;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static void ComputePrimeFactors(int n, std::map<int, int>& factors)
|
||||
{
|
||||
int d, sq = int((sqrt(double(n))) + 1L);
|
||||
div_t r;
|
||||
|
||||
// remove 2 as a factor with shifts instead "/" and "%"
|
||||
for (; n > 1 && (n & 1) == 0; n >>= 1)
|
||||
{
|
||||
factors[2]++;
|
||||
}
|
||||
|
||||
// keep removing subsequent odd numbers
|
||||
for (d = 3; d <= sq; d += 2)
|
||||
{
|
||||
while (1)
|
||||
{
|
||||
r = div(n, d);
|
||||
if (r.rem == 0)
|
||||
{
|
||||
factors[d]++;
|
||||
n = r.quot;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (n > 1 || factors.size() == 0) // left with a prime or x==1
|
||||
factors[n]++;
|
||||
}
|
||||
|
||||
static int pow_i(int x, int p)
|
||||
{
|
||||
int v;
|
||||
|
||||
if (0 == x || 1 == x)
|
||||
return x;
|
||||
|
||||
if (p < 0)
|
||||
return 0;
|
||||
|
||||
for (v = 1; p; p >>= 1)
|
||||
{
|
||||
if (1 & p)
|
||||
v *= x;
|
||||
x *= x;
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void ComputeOptimalShapeXYZ(int xyz, int& x, int& y, int& z)
|
||||
{
|
||||
#ifdef HPCG_CUBIC_RADICAL_SEARCH
|
||||
cubic_radical_search(xyz, x, y, z);
|
||||
#else
|
||||
std::map<int, int> factors;
|
||||
|
||||
ComputePrimeFactors(xyz, factors); // factors are sorted: ascending order
|
||||
|
||||
std::map<int, int>::iterator iter = factors.begin();
|
||||
|
||||
// there is at least one prime factor
|
||||
x = (iter++)->first; // cache the first factor, move to the next one
|
||||
|
||||
y = iter != factors.end() ? (iter++)->first : y; // try to cache the second factor in "y"
|
||||
|
||||
if (factors.size() == 1)
|
||||
{ // only a single factor
|
||||
z = pow_i(x, factors[x] / 3);
|
||||
y = pow_i(x, factors[x] / 3 + ((factors[x] % 3) >= 2 ? 1 : 0));
|
||||
x = pow_i(x, factors[x] / 3 + ((factors[x] % 3) >= 1 ? 1 : 0));
|
||||
}
|
||||
else if (factors.size() == 2 && factors[x] == 1 && factors[y] == 1)
|
||||
{ // two distinct prime factors
|
||||
z = 1;
|
||||
}
|
||||
else if (factors.size() == 2 && factors[x] + factors[y] == 3)
|
||||
{ // three prime factors, one repeated
|
||||
z = factors[x] == 2 ? x : y; // test which factor is repeated
|
||||
}
|
||||
else if (factors.size() == 3 && factors[x] == 1 && factors[y] == 1 && iter->second == 1)
|
||||
{ // three distinct and single prime factors
|
||||
z = iter->first;
|
||||
}
|
||||
else
|
||||
{ // 3 or more prime factors so try all possible 3-subsets
|
||||
|
||||
int i, distinct_factors[32 + 1], count_factors[32 + 1];
|
||||
|
||||
i = 0;
|
||||
for (std::map<int, int>::iterator iter = factors.begin(); iter != factors.end(); ++iter, ++i)
|
||||
{
|
||||
distinct_factors[i] = iter->first;
|
||||
count_factors[i] = iter->second;
|
||||
}
|
||||
|
||||
// count total number of prime factors in "c_main" and distribute some factors into "c1"
|
||||
MixedBaseCounter c_main(count_factors, factors.size()), c1(count_factors, factors.size());
|
||||
|
||||
// at the beginning, minimum area is the maximum area
|
||||
double area, min_area = 2.0 * xyz + 1.0;
|
||||
|
||||
for (c1.next(); !c1.is_zero(); c1.next())
|
||||
{
|
||||
MixedBaseCounter c2(c_main, c1); // "c2" gets the factors remaining in "c_main" that "c1" doesn't have
|
||||
for (c2.next(); !c2.is_zero(); c2.next())
|
||||
{
|
||||
int tf1 = c1.product(distinct_factors);
|
||||
int tf2 = c2.product(distinct_factors);
|
||||
int tf3 = xyz / tf1 / tf2; // we derive the third dimension, we don't keep track of the factors it has
|
||||
|
||||
area = tf1 * double(tf2) + tf2 * double(tf3) + tf1 * double(tf3);
|
||||
if (area < min_area)
|
||||
{
|
||||
min_area = area;
|
||||
x = tf1;
|
||||
y = tf2;
|
||||
z = tf3;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
2
src/ComputeOptimalShapeXYZ.hpp
Normal file
2
src/ComputeOptimalShapeXYZ.hpp
Normal file
@@ -0,0 +1,2 @@
|
||||
|
||||
void ComputeOptimalShapeXYZ(int xyz, int& x, int& y, int& z);
|
||||
72
src/ComputeProlongation.cpp
Normal file
72
src/ComputeProlongation.cpp
Normal file
@@ -0,0 +1,72 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file ComputeProlongation.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "ComputeProlongation.hpp"
|
||||
|
||||
/*!
|
||||
Routine to compute the coarse residual vector.
|
||||
|
||||
@param[in] Af - Fine grid sparse matrix object containing pointers to current coarse grid correction and the f2c
|
||||
operator.
|
||||
@param[inout] xf - Fine grid solution vector, update with coarse grid correction.
|
||||
|
||||
Note that the fine grid residual is never explicitly constructed.
|
||||
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
|
||||
|
||||
@return Returns zero on success and a non-zero value otherwise.
|
||||
*/
|
||||
int ComputeProlongation(const SparseMatrix& Af, Vector& xf)
|
||||
{
|
||||
double* xfv = xf.values;
|
||||
double* xcv = Af.mgData->xc->values;
|
||||
local_int_t* f2c = Af.mgData->f2cOperator;
|
||||
local_int_t nc = Af.mgData->rc->localLength;
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
for (local_int_t i = 0; i < nc; ++i)
|
||||
{
|
||||
xfv[Af.f2cPerm[i]] += xcv[i];
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
20
src/ComputeProlongation.hpp
Normal file
20
src/ComputeProlongation.hpp
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef COMPUTEPROLONGATION_HPP
|
||||
#define COMPUTEPROLONGATION_HPP
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
int ComputeProlongation(const SparseMatrix& Af, Vector& xf);
|
||||
#endif // COMPUTEPROLONGATION_HPP
|
||||
55
src/ComputeProlongation_ref.cpp
Normal file
55
src/ComputeProlongation_ref.cpp
Normal file
@@ -0,0 +1,55 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file ComputeProlongation_ref.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "ComputeProlongation_ref.hpp"
|
||||
|
||||
/*!
|
||||
Routine to compute the coarse residual vector.
|
||||
|
||||
@param[in] Af - Fine grid sparse matrix object containing pointers to current coarse grid correction and the f2c
|
||||
operator.
|
||||
@param[inout] xf - Fine grid solution vector, update with coarse grid correction.
|
||||
|
||||
Note that the fine grid residual is never explicitly constructed.
|
||||
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
|
||||
|
||||
@return Returns zero on success and a non-zero value otherwise.
|
||||
*/
|
||||
int ComputeProlongation_ref(const SparseMatrix& Af, Vector& xf)
|
||||
{
|
||||
|
||||
double* xfv = xf.values;
|
||||
double* xcv = Af.mgData->xc->values;
|
||||
local_int_t* f2c = Af.mgData->f2cOperator;
|
||||
local_int_t nc = Af.mgData->rc->localLength;
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
// TODO: Somehow note that this loop can be safely vectorized since f2c has no repeated indices
|
||||
for (local_int_t i = 0; i < nc; ++i)
|
||||
xfv[f2c[i]] += xcv[i]; // This loop is safe to vectorize
|
||||
|
||||
return 0;
|
||||
}
|
||||
20
src/ComputeProlongation_ref.hpp
Normal file
20
src/ComputeProlongation_ref.hpp
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef COMPUTEPROLONGATION_REF_HPP
|
||||
#define COMPUTEPROLONGATION_REF_HPP
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
int ComputeProlongation_ref(const SparseMatrix& Af, Vector& xf);
|
||||
#endif // COMPUTEPROLONGATION_REF_HPP
|
||||
95
src/ComputeResidual.cpp
Normal file
95
src/ComputeResidual.cpp
Normal file
@@ -0,0 +1,95 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file ComputeResidual.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "Vector.hpp"
|
||||
|
||||
#ifdef HPCG_DETAILED_DEBUG
|
||||
#include "hpcg.hpp"
|
||||
#include <fstream>
|
||||
#endif
|
||||
|
||||
#include "ComputeResidual.hpp"
|
||||
#include <cmath> // needed for fabs
|
||||
#ifdef HPCG_DETAILED_DEBUG
|
||||
#include <iostream>
|
||||
#endif
|
||||
|
||||
/*!
|
||||
Routine to compute the inf-norm difference between two vectors where:
|
||||
|
||||
@param[in] n number of vector elements (local to this processor)
|
||||
@param[in] v1, v2 input vectors
|
||||
@param[out] residual pointer to scalar value; on exit, will contain result: inf-norm difference
|
||||
|
||||
@return Returns zero on success and a non-zero value otherwise.
|
||||
*/
|
||||
int ComputeResidual(const local_int_t n, const Vector& v1, const Vector& v2, double& residual)
|
||||
{
|
||||
|
||||
double* v1v = v1.values;
|
||||
double* v2v = v2.values;
|
||||
double local_residual = 0.0;
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#pragma omp parallel shared(local_residual, v1v, v2v)
|
||||
{
|
||||
double threadlocal_residual = 0.0;
|
||||
#pragma omp for
|
||||
for (local_int_t i = 0; i < n; i++)
|
||||
{
|
||||
double diff = std::fabs(v1v[i] - v2v[i]);
|
||||
if (diff > threadlocal_residual)
|
||||
threadlocal_residual = diff;
|
||||
}
|
||||
#pragma omp critical
|
||||
{
|
||||
if (threadlocal_residual > local_residual)
|
||||
local_residual = threadlocal_residual;
|
||||
}
|
||||
}
|
||||
#else // No threading
|
||||
for (local_int_t i = 0; i < n; i++)
|
||||
{
|
||||
double diff = std::fabs(v1v[i] - v2v[i]);
|
||||
if (diff > local_residual)
|
||||
local_residual = diff;
|
||||
#ifdef HPCG_DETAILED_DEBUG
|
||||
HPCG_fout << " Computed, exact, diff = " << v1v[i] << " " << v2v[i] << " " << diff << std::endl;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
// Use MPI's reduce function to collect all partial sums
|
||||
double global_residual = 0;
|
||||
MPI_Allreduce(&local_residual, &global_residual, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
|
||||
residual = global_residual;
|
||||
#else
|
||||
residual = local_residual;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
19
src/ComputeResidual.hpp
Normal file
19
src/ComputeResidual.hpp
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef COMPUTERESIDUAL_HPP
#define COMPUTERESIDUAL_HPP
#include "Vector.hpp"
// Computes the inf-norm of (v1 - v2) over the n local entries; the result is
// returned via 'residual' (reduced across MPI ranks when MPI is enabled).
int ComputeResidual(const local_int_t n, const Vector& v1, const Vector& v2, double& residual);
#endif // COMPUTERESIDUAL_HPP
|
||||
75
src/ComputeRestriction.cpp
Normal file
75
src/ComputeRestriction.cpp
Normal file
@@ -0,0 +1,75 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file ComputeRestriction.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "ComputeRestriction.hpp"
|
||||
|
||||
/*!
|
||||
Routine to compute the coarse residual vector.
|
||||
|
||||
@param[inout] A - Sparse matrix object containing pointers to mgData->Axf, the fine grid matrix-vector product and
|
||||
mgData->rc the coarse residual vector.
|
||||
@param[in] rf - Fine grid RHS.
|
||||
|
||||
|
||||
Note that the fine grid residual is never explicitly constructed.
|
||||
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
|
||||
|
||||
@return Returns zero on success and a non-zero value otherwise.
|
||||
*/
|
||||
int ComputeRestriction(const SparseMatrix& A, const Vector& rf)
|
||||
{
|
||||
|
||||
double* Axfv = A.mgData->Axf->values;
|
||||
double* rfv = rf.values;
|
||||
double* rcv = A.mgData->rc->values;
|
||||
local_int_t* f2c = A.mgData->f2cOperator;
|
||||
local_int_t nc = A.mgData->rc->localLength;
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
for (local_int_t i = 0; i < nc; ++i)
|
||||
{
|
||||
rcv[i] = rfv[A.f2cPerm[i]] - Axfv[A.f2cPerm[i]];
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
20
src/ComputeRestriction.hpp
Normal file
20
src/ComputeRestriction.hpp
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef COMPUTERESTRICTION_HPP
#define COMPUTERESTRICTION_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Restricts the fine-grid residual (rf - A.mgData->Axf) into the coarse
// residual vector A.mgData->rc by injection (optimized, permuted variant).
int ComputeRestriction(const SparseMatrix& A, const Vector& rf);
#endif // COMPUTERESTRICTION_HPP
|
||||
56
src/ComputeRestriction_ref.cpp
Normal file
56
src/ComputeRestriction_ref.cpp
Normal file
@@ -0,0 +1,56 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file ComputeRestriction_ref.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "ComputeRestriction_ref.hpp"
|
||||
|
||||
/*!
|
||||
Routine to compute the coarse residual vector.
|
||||
|
||||
@param[inout] A - Sparse matrix object containing pointers to mgData->Axf, the fine grid matrix-vector product and
|
||||
mgData->rc the coarse residual vector.
|
||||
@param[in] rf - Fine grid RHS.
|
||||
|
||||
|
||||
Note that the fine grid residual is never explicitly constructed.
|
||||
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
|
||||
|
||||
@return Returns zero on success and a non-zero value otherwise.
|
||||
*/
|
||||
// Reference implementation: kept verbatim; only injected fine-grid points are
// visited, so the full fine-grid residual is never formed.
int ComputeRestriction_ref(const SparseMatrix& A, const Vector& rf)
{

    double* Axfv = A.mgData->Axf->values;   // fine-grid matrix-vector product A*xf
    double* rfv = rf.values;                // fine-grid RHS
    double* rcv = A.mgData->rc->values;     // output: coarse residual
    local_int_t* f2c = A.mgData->f2cOperator; // coarse row -> fine row injection map
    local_int_t nc = A.mgData->rc->localLength;

#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    // rc[i] = (rf - A*xf) evaluated at the fine point injected into coarse point i.
    for (local_int_t i = 0; i < nc; ++i)
        rcv[i] = rfv[f2c[i]] - Axfv[f2c[i]];

    return 0;
}
|
||||
20
src/ComputeRestriction_ref.hpp
Normal file
20
src/ComputeRestriction_ref.hpp
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef COMPUTERESTRICTION_REF_HPP
#define COMPUTERESTRICTION_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Reference restriction: injects the fine-grid residual (rf - A.mgData->Axf)
// into the coarse residual vector A.mgData->rc via f2cOperator.
int ComputeRestriction_ref(const SparseMatrix& A, const Vector& rf);
#endif // COMPUTERESTRICTION_REF_HPP
|
||||
111
src/ComputeSPMV.cpp
Normal file
111
src/ComputeSPMV.cpp
Normal file
@@ -0,0 +1,111 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file ComputeSPMV.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#include "ComputeSPMV.hpp"
|
||||
#include "ComputeSPMV_ref.hpp"
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include "ExchangeHalo.hpp"
|
||||
#endif
|
||||
#ifdef USE_CUDA
|
||||
#include "Cuda.hpp"
|
||||
#include "CudaKernels.hpp"
|
||||
#endif
|
||||
|
||||
#include "CpuKernels.hpp"
|
||||
/*!
|
||||
Routine to compute sparse matrix vector product y = Ax where:
|
||||
Precondition: First call exchange_externals to get off-processor values of x
|
||||
|
||||
This routine calls the reference SpMV implementation by default, but
|
||||
can be replaced by a custom, optimized routine suited for
|
||||
the target system.
|
||||
|
||||
@param[in] A the known system matrix
|
||||
@param[in] x the known vector
|
||||
@param[out] y the On exit contains the result: Ax.
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@see ComputeSPMV_ref
|
||||
*/
|
||||
int ComputeSPMV(const SparseMatrix& A, Vector& x, Vector& y)
|
||||
{
|
||||
|
||||
double one = 1.0, zero = 0.0;
|
||||
if (A.rankType == GPU)
|
||||
{
|
||||
// #ifdef USE_CUDA
|
||||
#ifndef HPCG_NO_MPI
|
||||
PackSendBufferCuda(A, x, false, copy_stream);
|
||||
#endif
|
||||
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecY, y.values_d);
|
||||
cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, A.cusparseOpt.matA, A.cusparseOpt.vecX,
|
||||
&zero, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
if (A.totalToBeSent > 0)
|
||||
{
|
||||
ExchangeHaloCuda(A, x, copy_stream);
|
||||
ExtSpMVCuda((SparseMatrix&) A, one, x.values_d + A.localNumberOfRows, y.values_d);
|
||||
}
|
||||
#endif
|
||||
|
||||
cudaStreamSynchronize(stream);
|
||||
// #endif
|
||||
}
|
||||
// else
|
||||
// {
|
||||
// #ifdef USE_GRACE
|
||||
// nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, x.values);
|
||||
// nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, y.values);
|
||||
// nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matA,
|
||||
// A.nvplSparseOpt.vecX, &zero, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
|
||||
// NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvADescr);
|
||||
|
||||
// #ifndef HPCG_NO_MPI
|
||||
// if (A.totalToBeSent > 0)
|
||||
// {
|
||||
// ExchangeHaloCpu(A, x);
|
||||
// ExtSpMVCpu(A, A.localNumberOfRows, 1.0, x.values, y.values);
|
||||
// }
|
||||
// #endif
|
||||
// #endif // USE_GRACE
|
||||
// }
|
||||
return 0;
|
||||
}
|
||||
22
src/ComputeSPMV.hpp
Normal file
22
src/ComputeSPMV.hpp
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef COMPUTESPMV_HPP
#define COMPUTESPMV_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"

// Optimized sparse matrix-vector product y = A*x (handles its own halo exchange).
int ComputeSPMV(const SparseMatrix& A, Vector& x, Vector& y);

#endif // COMPUTESPMV_HPP
|
||||
74
src/ComputeSPMV_ref.cpp
Normal file
74
src/ComputeSPMV_ref.cpp
Normal file
@@ -0,0 +1,74 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file ComputeSPMV_ref.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#include "ComputeSPMV_ref.hpp"
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include "ExchangeHalo.hpp"
|
||||
#endif
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
#include <cassert>
|
||||
|
||||
/*!
|
||||
Routine to compute matrix vector product y = Ax where:
|
||||
Precondition: First call exchange_externals to get off-processor values of x
|
||||
|
||||
This is the reference SPMV implementation. It CANNOT be modified for the
|
||||
purposes of this benchmark.
|
||||
|
||||
@param[in] A the known system matrix
|
||||
@param[in] x the known vector
|
||||
@param[out] y the On exit contains the result: Ax.
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@see ComputeSPMV
|
||||
*/
|
||||
// Reference SPMV — per the benchmark rules this kernel CANNOT be modified;
// comments only.
int ComputeSPMV_ref(const SparseMatrix& A, Vector& x, Vector& y)
{

    assert(x.localLength >= A.localNumberOfColumns); // Test vector lengths
    assert(y.localLength >= A.localNumberOfRows);

#ifndef HPCG_NO_MPI
    // Bring in off-processor values of x before the local product.
    ExchangeHalo(A, x);
#endif
    const double* const xv = x.values;
    double* const yv = y.values;
    const local_int_t nrow = A.localNumberOfRows;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    // Row-wise product over the ELL-like row storage (values + column indices
    // per row, with an explicit per-row nonzero count).
    for (local_int_t i = 0; i < nrow; i++)
    {
        double sum = 0.0;
        const double* const cur_vals = A.matrixValues[i];
        const local_int_t* const cur_inds = A.mtxIndL[i];
        const int cur_nnz = A.nonzerosInRow[i];

        for (int j = 0; j < cur_nnz; j++)
            sum += cur_vals[j] * xv[cur_inds[j]];
        yv[i] = sum;
    }
    return 0;
}
|
||||
22
src/ComputeSPMV_ref.hpp
Normal file
22
src/ComputeSPMV_ref.hpp
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef COMPUTESPMV_REF_HPP
#define COMPUTESPMV_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"

// Reference (unoptimized) sparse matrix-vector product y = A*x.
int ComputeSPMV_ref(const SparseMatrix& A, Vector& x, Vector& y);

#endif // COMPUTESPMV_REF_HPP
|
||||
309
src/ComputeSYMGS.cpp
Normal file
309
src/ComputeSYMGS.cpp
Normal file
@@ -0,0 +1,309 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file ComputeSYMGS.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
#ifdef USE_CUDA
|
||||
#include "Cuda.hpp"
|
||||
#endif
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include "ExchangeHalo.hpp"
|
||||
#endif
|
||||
#include "ComputeSPMV.hpp"
|
||||
#include "ComputeSYMGS.hpp"
|
||||
#include "CpuKernels.hpp"
|
||||
#include "CudaKernels.hpp"
|
||||
|
||||
/*!
|
||||
Routine to compute one step of symmetric Gauss-Seidel:
|
||||
|
||||
Assumption about the structure of matrix A:
|
||||
- Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i]
|
||||
- Entries in row 'i' are ordered such that:
|
||||
- lower triangular terms are stored before the diagonal element.
|
||||
- upper triangular terms are stored after the diagonal element.
|
||||
- No other assumptions are made about entry ordering.
|
||||
|
||||
Symmetric Gauss-Seidel notes:
|
||||
- We use the input vector x as the RHS and start with an initial guess for y of all zeros.
|
||||
- We perform one forward sweep. Since y is initially zero we can ignore the upper triangular terms of A.
|
||||
- We then perform one back sweep.
|
||||
- For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after
|
||||
|
||||
@param[in] A the known system matrix
|
||||
@param[in] r the input vector
|
||||
@param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep
|
||||
with r as the RHS.
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out
|
||||
of sync with other kernels.
|
||||
|
||||
@see ComputeSYMGS_ref
|
||||
*/
|
||||
|
||||
#ifdef USE_CUDA
|
||||
int ComputeSYMGS_Gpu(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
|
||||
{
|
||||
double* tmp_d;
|
||||
if (step == 1 && A.mgData != 0)
|
||||
{
|
||||
tmp_d = (*A.mgData->Axf).values_d;
|
||||
}
|
||||
else
|
||||
{
|
||||
tmp_d = A.tempBuffer;
|
||||
}
|
||||
const local_int_t nrow = A.localNumberOfRows;
|
||||
double alpha = 1.0;
|
||||
cusparseFillMode_t fillmode_l = CUSPARSE_FILL_MODE_LOWER;
|
||||
cusparseFillMode_t fillmode_u = CUSPARSE_FILL_MODE_UPPER;
|
||||
|
||||
if (step == 1)
|
||||
{
|
||||
// TRSV(D+L, r, t)
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecX, r.values_d);
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecY, tmp_d);
|
||||
cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
|
||||
cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
|
||||
A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrL);
|
||||
|
||||
// SPMV(D, t, t)
|
||||
SpmvDiagCuda(nrow, tmp_d, A.diagonal);
|
||||
|
||||
// TRSV(D+U, t, x)
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecX, tmp_d);
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
|
||||
cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
|
||||
cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
|
||||
A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrU);
|
||||
|
||||
if (A.mgData != 0)
|
||||
{
|
||||
#ifndef HPCG_NO_MPI
|
||||
cudaStreamSynchronize(stream);
|
||||
PackSendBufferCuda(A, x, false, copy_stream);
|
||||
#endif
|
||||
|
||||
// SPMV(L, x, t): t = t + L * x
|
||||
double alpha = 1.0;
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecY, (*A.mgData->Axf).values_d);
|
||||
cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matL,
|
||||
A.cusparseOpt.vecX, &alpha, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
if (A.totalToBeSent > 0)
|
||||
{
|
||||
ExchangeHaloCuda(A, x, copy_stream);
|
||||
double one = 1.0, zero = 0.0;
|
||||
ExtSpMVCuda((SparseMatrix&) A, one, x.values_d + A.localNumberOfRows, (*A.mgData->Axf).values_d);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else
|
||||
{ // step == 0
|
||||
#ifndef HPCG_NO_MPI
|
||||
cudaStreamSynchronize(stream);
|
||||
PackSendBufferCuda(A, x, false, copy_stream);
|
||||
#endif
|
||||
|
||||
// SPMV(U, x, t): t = U * x
|
||||
double alpha = 1.0, beta = 0.0;
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecY, (*A.mgData->Axf).values_d);
|
||||
cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matU, A.cusparseOpt.vecX,
|
||||
&beta, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
|
||||
|
||||
// tmp = rv - t
|
||||
AxpbyCuda(nrow, r.values_d, (*A.mgData->Axf).values_d, tmp_d);
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
if (A.totalToBeSent > 0)
|
||||
{
|
||||
// MPI_Ibarrier --> will help improve MPI_Allreduce in dot product
|
||||
ExchangeHaloCuda(A, x, copy_stream, A.level == 0 ? 1 /*call MPI_Ibarrier*/ : 0);
|
||||
double mone = -1.0, zero = 0.0;
|
||||
ExtSpMVCuda((SparseMatrix&) A, mone, x.values_d + A.localNumberOfRows, tmp_d);
|
||||
}
|
||||
#endif
|
||||
|
||||
// TRSV(D+L, r-t, x)
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecX, tmp_d);
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
|
||||
cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
|
||||
cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
|
||||
A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrL);
|
||||
|
||||
// SPMV(D, x, t) t += D*x
|
||||
SpFmaCuda(nrow, x.values_d, A.diagonal, (*A.mgData->Axf).values_d);
|
||||
|
||||
// TRSV(D+U, x, x)
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecX, (*A.mgData->Axf).values_d);
|
||||
cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
|
||||
cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
|
||||
cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
|
||||
A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrU);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_GRACE
|
||||
int ComputeSYMGS_Cpu(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
|
||||
{
|
||||
local_int_t nrow = A.localNumberOfRows;
|
||||
double* temp;
|
||||
if (step == 1 && A.mgData != 0)
|
||||
{
|
||||
temp = (*A.mgData->Axf).values;
|
||||
}
|
||||
else
|
||||
{
|
||||
temp = A.tempBuffer;
|
||||
}
|
||||
double* xv = x.values;
|
||||
double* rv = r.values;
|
||||
double one = 1.0, zero = 0.0;
|
||||
nvpl_sparse_fill_mode_t fillmode_l = NVPL_SPARSE_FILL_MODE_LOWER;
|
||||
nvpl_sparse_fill_mode_t fillmode_u = NVPL_SPARSE_FILL_MODE_UPPER;
|
||||
|
||||
if (step == 1)
|
||||
{
|
||||
// TRSV(L, r, x)
|
||||
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, r.values);
|
||||
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
|
||||
nvpl_sparse_sp_mat_set_attribute(
|
||||
A.nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
|
||||
nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
|
||||
A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
|
||||
A.nvplSparseOpt.spsvDescrL);
|
||||
|
||||
// SPMV(D, x, t) t = D*x
|
||||
SpmvDiagCpu(nrow, A.diagonal, xv, temp);
|
||||
|
||||
// TRSV(U, x, x)
|
||||
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, temp);
|
||||
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
|
||||
nvpl_sparse_sp_mat_set_attribute(
|
||||
A.nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
|
||||
nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
|
||||
A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
|
||||
A.nvplSparseOpt.spsvDescrU);
|
||||
|
||||
if (A.mgData != 0)
|
||||
{
|
||||
// SPMV(L, x, t): t += L*x
|
||||
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, xv);
|
||||
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, temp);
|
||||
nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
|
||||
A.nvplSparseOpt.vecX, &one, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
|
||||
NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvLDescr);
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
ExchangeHaloCpu(A, x);
|
||||
if (A.totalToBeSent > 0)
|
||||
{
|
||||
ExtSpMVCpu(A, nrow, 1.0, xv, temp);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
else if (step == 0)
|
||||
{
|
||||
// SPMV(U, x, t) t = U*x
|
||||
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, xv);
|
||||
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, (*A.mgData->Axf).values);
|
||||
|
||||
nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
|
||||
A.nvplSparseOpt.vecX, &zero, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
|
||||
NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvUDescr);
|
||||
|
||||
// axpy: t = r-t
|
||||
AxpbyCpu(nrow, rv, (*A.mgData->Axf).values, temp);
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
// MPI_Ibarrier --> will help improve MPI_Allreduce in dot product
|
||||
ExchangeHaloCpu(A, x, A.level == 0 ? 1 /*call MPI_Ibarrier*/ : 0);
|
||||
if (A.totalToBeSent > 0)
|
||||
{
|
||||
ExtSpMVCpu(A, nrow, -1.0, xv, temp);
|
||||
}
|
||||
#endif
|
||||
|
||||
// TRSV(L, r-t, x)
|
||||
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, temp);
|
||||
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
|
||||
nvpl_sparse_sp_mat_set_attribute(
|
||||
A.nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
|
||||
nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
|
||||
A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
|
||||
A.nvplSparseOpt.spsvDescrL);
|
||||
|
||||
// SPMV(D, x, t) t += D*x
|
||||
SpFmaCpu(nrow, A.diagonal, xv, (*A.mgData->Axf).values);
|
||||
|
||||
// TRSV(U, x, x)
|
||||
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, (*A.mgData->Axf).values);
|
||||
nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
|
||||
nvpl_sparse_sp_mat_set_attribute(
|
||||
A.nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
|
||||
nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
|
||||
A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
|
||||
A.nvplSparseOpt.spsvDescrU);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif // USE_GRACE
|
||||
|
||||
int ComputeSYMGS(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
|
||||
{
|
||||
if (A.rankType == GPU)
|
||||
{
|
||||
#ifdef USE_CUDA
|
||||
ComputeSYMGS_Gpu(A, r, x, step);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef USE_GRACE
|
||||
ComputeSYMGS_Cpu(A, r, x, step);
|
||||
#endif
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
39
src/ComputeSYMGS.hpp
Normal file
39
src/ComputeSYMGS.hpp
Normal file
@@ -0,0 +1,39 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef COMPUTESYMGS_HPP
#define COMPUTESYMGS_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"

// One symmetric Gauss-Seidel sweep with r as RHS; result in x.
// 'step' selects the backend-specific sweep variant.
int ComputeSYMGS(const SparseMatrix& A, const Vector& r, Vector& x, bool step);

#endif // COMPUTESYMGS_HPP
|
||||
110
src/ComputeSYMGS_ref.cpp
Normal file
110
src/ComputeSYMGS_ref.cpp
Normal file
@@ -0,0 +1,110 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file ComputeSYMGS_ref.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include "ExchangeHalo.hpp"
|
||||
#endif
|
||||
#include "ComputeSYMGS_ref.hpp"
|
||||
#include <cassert>
|
||||
|
||||
/*!
|
||||
Computes one step of symmetric Gauss-Seidel:
|
||||
|
||||
Assumption about the structure of matrix A:
|
||||
- Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i]
|
||||
- Entries in row 'i' are ordered such that:
|
||||
- lower triangular terms are stored before the diagonal element.
|
||||
- upper triangular terms are stored after the diagonal element.
|
||||
- No other assumptions are made about entry ordering.
|
||||
|
||||
Symmetric Gauss-Seidel notes:
|
||||
- We use the input vector x as the RHS and start with an initial guess for y of all zeros.
|
||||
- We perform one forward sweep. x should be initially zero on the first GS sweep, but we do not attempt to exploit
|
||||
this fact.
|
||||
- We then perform one back sweep.
|
||||
- For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after
|
||||
|
||||
@param[in] A the known system matrix
|
||||
@param[in] r the input vector
|
||||
@param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep
|
||||
with r as the RHS.
|
||||
|
||||
|
||||
@warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out
|
||||
of sync with other kernels.
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@see ComputeSYMGS
|
||||
*/
|
||||
// Reference SYMGS — per the benchmark rules this kernel is the reference
// implementation; comments only.
int ComputeSYMGS_ref(const SparseMatrix& A, const Vector& r, Vector& x)
{

    assert(x.localLength == A.localNumberOfColumns); // Make sure x contain space for halo values

#ifndef HPCG_NO_MPI
    // Bring in off-processor values of x before sweeping.
    ExchangeHalo(A, x);
#endif

    const local_int_t nrow = A.localNumberOfRows;
    double** matrixDiagonal = A.matrixDiagonal; // An array of pointers to the diagonal entries A.matrixValues
    const double* const rv = r.values;
    double* const xv = x.values;

    // Forward sweep: rows in increasing order. The row loop is inherently
    // sequential (each row reads previously-updated x entries).
    for (local_int_t i = 0; i < nrow; i++)
    {
        const double* const currentValues = A.matrixValues[i];
        const local_int_t* const currentColIndices = A.mtxIndL[i];
        const int currentNumberOfNonzeros = A.nonzerosInRow[i];
        const double currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value
        double sum = rv[i]; // RHS value

        // Subtract the whole row product, diagonal included ...
        for (int j = 0; j < currentNumberOfNonzeros; j++)
        {
            local_int_t curCol = currentColIndices[j];
            sum -= currentValues[j] * xv[curCol];
        }
        sum += xv[i] * currentDiagonal; // Remove diagonal contribution from previous loop

        xv[i] = sum / currentDiagonal;
    }

    // Now the back sweep.

    for (local_int_t i = nrow - 1; i >= 0; i--)
    {
        const double* const currentValues = A.matrixValues[i];
        const local_int_t* const currentColIndices = A.mtxIndL[i];
        const int currentNumberOfNonzeros = A.nonzerosInRow[i];
        const double currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value
        double sum = rv[i]; // RHS value

        for (int j = 0; j < currentNumberOfNonzeros; j++)
        {
            local_int_t curCol = currentColIndices[j];
            sum -= currentValues[j] * xv[curCol];
        }
        sum += xv[i] * currentDiagonal; // Remove diagonal contribution from previous loop

        xv[i] = sum / currentDiagonal;
    }

    return 0;
}
|
||||
22
src/ComputeSYMGS_ref.hpp
Normal file
22
src/ComputeSYMGS_ref.hpp
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef COMPUTESYMGS_REF_HPP
|
||||
#define COMPUTESYMGS_REF_HPP
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
|
||||
int ComputeSYMGS_ref(const SparseMatrix& A, const Vector& r, Vector& x);
|
||||
|
||||
#endif // COMPUTESYMGS_REF_HPP
|
||||
89
src/ComputeWAXPBY.cpp
Normal file
89
src/ComputeWAXPBY.cpp
Normal file
@@ -0,0 +1,89 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file ComputeWAXPBY.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include "mytimer.hpp"
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
#ifdef USE_CUDA
|
||||
#include "Cuda.hpp"
|
||||
#endif
|
||||
#include "ComputeWAXPBY.hpp"
|
||||
#include "ComputeWAXPBY_ref.hpp"
|
||||
#include "CpuKernels.hpp"
|
||||
#include "CudaKernels.hpp"
|
||||
#include "SparseMatrix.hpp"
|
||||
|
||||
/*!
|
||||
Routine to compute the update of a vector with the sum of two
|
||||
scaled vectors where: w = alpha*x + beta*y
|
||||
|
||||
This routine calls the reference WAXPBY implementation by default, but
|
||||
can be replaced by a custom, optimized routine suited for
|
||||
the target system.
|
||||
|
||||
@param[in] n the number of vector elements (on this processor)
|
||||
@param[in] alpha, beta the scalars applied to x and y respectively.
|
||||
@param[in] x, y the input vectors
|
||||
@param[out] w the output vector
|
||||
@param[out] isOptimized should be set to false if this routine uses the reference implementation (is not optimized);
|
||||
otherwise leave it unchanged
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@see ComputeWAXPBY_ref
|
||||
*/
|
||||
|
||||
int ComputeWAXPBY(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
|
||||
Vector& w, bool& isOptimized, rank_type_t rt)
|
||||
{
|
||||
if (rt == GPU)
|
||||
{
|
||||
#ifdef USE_CUDA
|
||||
ComputeWAXPBYCuda(n, alpha, x, beta, y, w);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef USE_GRACE
|
||||
ComputeWAXPBYCpu(n, alpha, x, beta, y, w, isOptimized);
|
||||
#endif
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
39
src/ComputeWAXPBY.hpp
Normal file
39
src/ComputeWAXPBY.hpp
Normal file
@@ -0,0 +1,39 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef COMPUTEWAXPBY_HPP
|
||||
#define COMPUTEWAXPBY_HPP
|
||||
#include "Vector.hpp"
|
||||
|
||||
int ComputeWAXPBY(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
|
||||
Vector& w, bool& isOptimized, rank_type_t rt);
|
||||
|
||||
#endif // COMPUTEWAXPBY_HPP
|
||||
79
src/ComputeWAXPBY_ref.cpp
Normal file
79
src/ComputeWAXPBY_ref.cpp
Normal file
@@ -0,0 +1,79 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file ComputeWAXPBY_ref.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#include "ComputeWAXPBY_ref.hpp"
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
#include <cassert>
|
||||
/*!
|
||||
Routine to compute the update of a vector with the sum of two
|
||||
scaled vectors where: w = alpha*x + beta*y
|
||||
|
||||
* This is the reference WAXPBY implementation. It CANNOT be modified for the
|
||||
purposes of this benchmark.
|
||||
|
||||
@param[in] n the number of vector elements (on this processor)
|
||||
@param[in] alpha, beta the scalars applied to x and y respectively.
|
||||
@param[in] x, y the input vectors
|
||||
@param[out] w the output vector.
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@see ComputeWAXPBY
|
||||
*/
|
||||
int ComputeWAXPBY_ref(
    const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w)
{

    assert(x.localLength >= n); // Test vector lengths
    assert(y.localLength >= n);

    const double* const xv = x.values;
    const double* const yv = y.values;
    double* const wv = w.values;

    // Three specializations avoid one multiply per element in the common
    // alpha == 1 and beta == 1 cases used by the CG iteration.
    if (alpha == 1.0)
    {
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < n; i++)
            wv[i] = xv[i] + beta * yv[i];
    }
    else if (beta == 1.0)
    {
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < n; i++)
            wv[i] = alpha * xv[i] + yv[i];
    }
    else
    {
        // General case: w = alpha*x + beta*y.
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < n; i++)
            wv[i] = alpha * xv[i] + beta * yv[i];
    }

    return 0;
}
|
||||
20
src/ComputeWAXPBY_ref.hpp
Normal file
20
src/ComputeWAXPBY_ref.hpp
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef COMPUTEWAXPBY_REF_HPP
|
||||
#define COMPUTEWAXPBY_REF_HPP
|
||||
#include "Vector.hpp"
|
||||
int ComputeWAXPBY_ref(
|
||||
const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w);
|
||||
#endif // COMPUTEWAXPBY_REF_HPP
|
||||
1351
src/CpuKernels.cpp
Normal file
1351
src/CpuKernels.cpp
Normal file
File diff suppressed because it is too large
Load Diff
92
src/CpuKernels.hpp
Normal file
92
src/CpuKernels.hpp
Normal file
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef CPUKERNELS_HPP
|
||||
#define CPUKERNELS_HPP
|
||||
|
||||
#ifdef USE_GRACE
|
||||
|
||||
#include <nvpl_sparse.h>
|
||||
extern nvpl_sparse_handle_t nvpl_sparse_handle;
|
||||
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
///////// Deallocate CPU Memory for data structures //
|
||||
void DeleteMatrixCpu(SparseMatrix& A);
|
||||
|
||||
///////// Find the size of CPU reference allocated memory //
|
||||
size_t EstimateCpuRefMem(SparseMatrix& A);
|
||||
|
||||
/*
|
||||
Translation of a 3D point in all directions
|
||||
27 possibilities
|
||||
*/
|
||||
constexpr int tid2indCpu[32][4] = {{-1, -1, -1, 0}, {0, -1, -1, 0}, {1, -1, -1, 0}, {-1, 0, -1, 0}, {0, 0, -1, 0},
|
||||
{1, 0, -1, 0}, {-1, 1, -1, 0}, {0, 1, -1, 0}, {1, 1, -1, 0}, {-1, -1, 0, 0}, {0, -1, 0, 0}, {1, -1, 0, 0},
|
||||
{-1, 0, 0, 0}, {0, 0, 0, 0}, {1, 0, 0, 0}, {-1, 1, 0, 0}, {0, 1, 0, 0}, {1, 1, 0, 0}, {-1, -1, 1, 0}, {0, -1, 1, 0},
|
||||
{1, -1, 1, 0}, {-1, 0, 1, 0}, {0, 0, 1, 0}, {1, 0, 1, 0}, {-1, 1, 1, 0}, {0, 1, 1, 0}, {1, 1, 1, 0}, {0, 0, 0, 0},
|
||||
{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
|
||||
|
||||
// Generate Problem
|
||||
// Inclusive Prefix Sum
|
||||
void PrefixsumCpu(int* x, int N);
|
||||
|
||||
// Optimize Problem
|
||||
size_t AllocateMemCpu(SparseMatrix& A_in);
|
||||
void ColorMatrixCpu(SparseMatrix& A, int* num_colors);
|
||||
void CreateSellPermCpu(SparseMatrix& A);
|
||||
void F2cPermCpu(local_int_t nrow_c, local_int_t* f2c, local_int_t* f2c_perm, local_int_t* perm_f, local_int_t* iperm_c);
|
||||
|
||||
// Permute a vector using coloring buffer
|
||||
void PermVectorCpu(local_int_t* perm, Vector& x, local_int_t length);
|
||||
|
||||
// Test CG
|
||||
void ReplaceMatrixDiagonalCpu(SparseMatrix& A, Vector diagonal);
|
||||
|
||||
// CG Support Kernels
|
||||
// Dot-product Per single rank
|
||||
void ComputeDotProductCpu(const local_int_t n, const Vector& x, const Vector& y, double& result, bool& isOptimized);
|
||||
|
||||
// WAXPBY
|
||||
int ComputeWAXPBYCpu(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
|
||||
Vector& w, bool& isOptimized);
|
||||
// SYMGS
|
||||
void SpmvDiagCpu(local_int_t n, const double* x, double* y, double* z);
|
||||
void AxpbyCpu(local_int_t n, double* x, double* y, double* z);
|
||||
void SpFmaCpu(local_int_t n, const double* x, double* y, double* z);
|
||||
|
||||
// External Matrix SpMV + Scatter
|
||||
void ExtSpMVCpu(const SparseMatrix& A, const local_int_t n, const double alpha, const double* x, double* y);
|
||||
|
||||
#endif // USE_GRACE
|
||||
#endif // CPUKERNELS_HPP
|
||||
87
src/Cuda.hpp
Normal file
87
src/Cuda.hpp
Normal file
@@ -0,0 +1,87 @@
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#ifdef USE_CUDA
|
||||
#include "cublas_v2.h"
|
||||
#include "cuda_runtime_api.h"
|
||||
#include "cusparse.h"
|
||||
#include <cuda.h>
|
||||
#ifdef USE_NCCL
|
||||
#include "nccl.h"
|
||||
#endif
|
||||
#ifdef USE_NVTX
|
||||
#include <nvToolsExt.h>
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
|
||||
extern cusparseHandle_t cusparsehandle;
|
||||
extern cublasHandle_t cublashandle;
|
||||
extern cudaStream_t stream;
|
||||
extern cudaEvent_t copy_done;
|
||||
extern cudaStream_t copy_stream;
|
||||
extern int* ranktoId; // DEV:Compress rank in MPI_WORLD to Neighbors
|
||||
extern int* rankToId_h; // HOST:Compress rank in MPI_WORLD to Neighbors
|
||||
extern int* idToRank_h;
|
||||
extern bool Use_Compression; /*USE CUDA L2 compression*/
|
||||
extern bool Use_Hpcg_Mem_Reduction; /*USE HPCG aggresive memory reduction*/
|
||||
#endif
|
||||
|
||||
#ifdef USE_CUDA
|
||||
#define CHECK_CUDART(x) \
|
||||
do \
|
||||
{ \
|
||||
cudaError_t res = (x); \
|
||||
if (res != cudaSuccess) \
|
||||
{ \
|
||||
char rank_name[1024]; \
|
||||
gethostname(rank_name, 1024); \
|
||||
fprintf(stderr, "CUDART: %s = %d (%s) on %s at (%s:%d)\n", #x, res, cudaGetErrorString(res), rank_name, \
|
||||
__FILE__, __LINE__); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// IF NVTX is needed for profiling, please define USE_NVTX
|
||||
// Then, add PUSH_RANGE and POP_RANGE around the target code block
|
||||
// See, https://developer.nvidia.com/blog/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/
|
||||
// #define USE_NVTX
|
||||
#ifdef USE_NVTX
|
||||
const uint32_t colors[] = {0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff};
|
||||
const int num_colors = sizeof(colors) / sizeof(uint32_t);
|
||||
#define PUSH_RANGE(name, cid) \
|
||||
{ \
|
||||
int color_id = cid; \
|
||||
color_id = color_id % num_colors; \
|
||||
nvtxEventAttributes_t eventAttrib = {0}; \
|
||||
eventAttrib.version = NVTX_VERSION; \
|
||||
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
|
||||
eventAttrib.colorType = NVTX_COLOR_ARGB; \
|
||||
eventAttrib.color = colors[color_id]; \
|
||||
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
|
||||
eventAttrib.message.ascii = name; \
|
||||
nvtxRangePushEx(&eventAttrib); \
|
||||
}
|
||||
#define POP_RANGE nvtxRangePop();
|
||||
#else
|
||||
#define PUSH_RANGE(name, cid) \
|
||||
{ \
|
||||
}
|
||||
#define POP_RANGE
|
||||
#endif
|
||||
#endif
|
||||
2613
src/CudaKernels.cu
Normal file
2613
src/CudaKernels.cu
Normal file
File diff suppressed because it is too large
Load Diff
92
src/CudaKernels.hpp
Normal file
92
src/CudaKernels.hpp
Normal file
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#ifdef USE_CUDA
|
||||
#include "SparseMatrix.hpp"
|
||||
|
||||
///////// L2 Memory Compression Allocation Support Routines //
|
||||
cudaError_t setProp(CUmemAllocationProp* prop);
|
||||
cudaError_t cudaMallocCompressible(void** adr, size_t size);
|
||||
cudaError_t cudaFreeCompressible(void* ptr, size_t size);
|
||||
|
||||
///////// Allocate CUDA Memory for data structures //
|
||||
local_int_t EstimateLUmem(local_int_t n, local_int_t padded_n, local_int_t level);
|
||||
void AllocateMemCuda(SparseMatrix& A_in);
|
||||
void AllocateMemOptCuda(SparseMatrix& A_in);
|
||||
|
||||
///////// Deallocate CUDA Memory for data structures //
|
||||
void DeleteMatrixGpu(SparseMatrix& A);
|
||||
|
||||
///////// Genrerate Problem //
|
||||
void GenerateProblemCuda(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
|
||||
|
||||
// Halo Exchange
|
||||
void SetupHaloCuda(SparseMatrix& A, local_int_t sendbufld, local_int_t* sendlen, local_int_t* sendbuff,
|
||||
local_int_t* tot_to_send, int* nneighs, int* neighs_h, local_int_t* sendlen_h, local_int_t** elem_to_send_d);
|
||||
void ExtToLocMapCuda(
|
||||
local_int_t localNumberOfRows, local_int_t str, local_int_t end, local_int_t* extToLocMap, local_int_t* eltsToRecv);
|
||||
void ExtTolocCuda(local_int_t localNumberOfRows, int neighborId, local_int_t ext_nnz, local_int_t* csr_ext_columns,
|
||||
double* csr_ext_values, local_int_t* ext2csr_offsets, local_int_t* extToLocMap, local_int_t* csrColumns);
|
||||
void PackSendBufferCuda(const SparseMatrix& A, Vector& x, bool cpu_data, cudaStream_t stream1);
|
||||
void ExchangeHaloCuda(const SparseMatrix& A, Vector& x, cudaStream_t stream1, int use_ibarrier = 0);
|
||||
|
||||
// Optimize Problem
|
||||
void SetVectorAscCuda(local_int_t* arr, local_int_t n);
|
||||
void ColorMatrixCuda(double* A_vals, local_int_t* A_col, local_int_t* nnzPerRow, local_int_t rows, local_int_t* color,
|
||||
int* num_colors, int* count_colors, int max_colors, local_int_t* ref2opt, local_int_t* opt2ref, int rank, int nx,
|
||||
int* rowhash);
|
||||
void PermElemToSendCuda(local_int_t totalToBeSent, local_int_t* elementsToSend, local_int_t* perm);
|
||||
void EllPermColumnsValuesCuda(local_int_t localNumberOfRows, local_int_t* nnzPerRow, local_int_t* csrColumns,
|
||||
double* csrValues, local_int_t* permOffsets, local_int_t* permColumns, double* permValues, local_int_t* opt2ref,
|
||||
local_int_t* ref2opt, local_int_t* diagonalIdx, local_int_t* permLOffsets, local_int_t* permUOffsets, bool diag);
|
||||
void TransposeCuda(local_int_t n, local_int_t slice_size, local_int_t* sellCollIndex, double* sellValues);
|
||||
void EllMaxRowLenPerBlockCuda(local_int_t nrow, int sliceSize, local_int_t* sellLPermOffsets,
|
||||
local_int_t* sellUPermOffsets, local_int_t* sellLSliceMrl, local_int_t* sellUSliceMrl);
|
||||
void PrefixsumCuda(local_int_t localNumberOfRows, local_int_t* arr);
|
||||
void MultiplyBySliceSizeCUDA(local_int_t nrow, int slice_size, local_int_t* arr);
|
||||
void CreateAMatrixSliceOffsetsCuda(local_int_t nrow, local_int_t slice_size, local_int_t* arr);
|
||||
void CreateSellLUColumnsValuesCuda(const local_int_t n, int sliceSize, local_int_t* columns, double* values,
|
||||
local_int_t* sellLSliceOffset, local_int_t* sellLColumns, double* sellLValues, local_int_t* sellUSliceOffset,
|
||||
local_int_t* sellUColumns, double* sellUValues, int level);
|
||||
void PermVectorCuda(local_int_t* perm, Vector& x, local_int_t length);
|
||||
void F2cPermCuda(local_int_t nrow_c, local_int_t* f2c, local_int_t* f2cPerm, local_int_t* permF, local_int_t* ipermC);
|
||||
|
||||
// Test CG
|
||||
void ReplaceMatrixDiagonalCuda(SparseMatrix& A, Vector& diagonal);
|
||||
void CopyMatrixDiagonalCuda(SparseMatrix& A, Vector& diagonal);
|
||||
|
||||
// CG Support Kernels
|
||||
// 1. MG
|
||||
void ComputeRestrictionCuda(const SparseMatrix& A, const Vector& r);
|
||||
void ComputeProlongationCuda(const SparseMatrix& A, Vector& x);
|
||||
|
||||
// 2. WAXPBY
|
||||
void ComputeWAXPBYCuda(
|
||||
const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w);
|
||||
|
||||
// 3.SYMGS
|
||||
void SpmvDiagCuda(local_int_t n, double* x, double* d);
|
||||
void AxpbyCuda(local_int_t n, double* x, double* y, double* z);
|
||||
void SpFmaCuda(local_int_t n, double* x, double* y, double* z);
|
||||
|
||||
// 4.External Matrix SpMV + Scatter
|
||||
void ExtSpMVCuda(SparseMatrix& A, double alpha, double* x, double* y);
|
||||
|
||||
// Transfer Problem to CPU
|
||||
size_t CopyDataToHostCuda(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
|
||||
#endif
|
||||
205
src/ExchangeHalo.cpp
Normal file
205
src/ExchangeHalo.cpp
Normal file
@@ -0,0 +1,205 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file ExchangeHalo.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
// Compile this routine only if running with MPI
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include "ExchangeHalo.hpp"
|
||||
#include "Geometry.hpp"
|
||||
#include <cstdlib>
|
||||
#include <mpi.h>
|
||||
|
||||
extern p2p_comm_mode_t P2P_Mode;
|
||||
|
||||
/*!
|
||||
Communicates data that is at the border of the part of the domain assigned to this processor.
|
||||
|
||||
@param[in] A The known system matrix
|
||||
@param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
|
||||
non-local entries updated by other processors
|
||||
*/
|
||||
void ExchangeHalo(const SparseMatrix& A, Vector& x)
{
    // Extract the communication pattern computed at setup time.
    local_int_t localNumberOfRows = A.localNumberOfRows;
    int num_neighbors = A.numberOfSendNeighbors;
    local_int_t * receiveLength = A.receiveLength;  // entries to receive, per neighbor
    local_int_t * sendLength = A.sendLength;        // entries to send, per neighbor
    int * neighbors = A.neighbors;
    double * sendBuffer = A.sendBuffer;
    local_int_t totalToBeSent = A.totalToBeSent;
    local_int_t * elementsToSend = A.elementsToSend; // local indices of boundary values

    double * const xv = x.values;

    int size, rank; // Number of MPI processes, My process ID
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    //
    // first post receives, these are immediate receives
    // Do not wait for result to come, will do that at the
    // wait call below.
    //

    int MPI_MY_TAG = 99;

    MPI_Request * request = new MPI_Request[num_neighbors];

    //
    // Externals are at end of locals
    //
    double * x_external = (double *) xv + localNumberOfRows;

    // Post receives first
    // TODO: Thread this loop
    // Each neighbor's incoming block lands in a contiguous slice of the
    // halo region; x_external advances past each slice.
    for (int i = 0; i < num_neighbors; i++) {
        local_int_t n_recv = receiveLength[i];
        MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request+i);
        x_external += n_recv;
    }

    //
    // Fill up send buffer
    //

    // TODO: Thread this loop
    // Gather the local boundary values into one contiguous buffer.
    for (local_int_t i=0; i<totalToBeSent; i++) sendBuffer[i] = xv[elementsToSend[i]];

    //
    // Send to each neighbor
    //

    // TODO: Thread this loop
    // Blocking sends are safe here because every rank posted its receives above.
    for (int i = 0; i < num_neighbors; i++) {
        local_int_t n_send = sendLength[i];
        MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
        sendBuffer += n_send;
    }

    //
    // Complete the reads issued above
    //

    MPI_Status status;
    // TODO: Thread this loop
    for (int i = 0; i < num_neighbors; i++) {
        if ( MPI_Wait(request+i, &status) ) {
            std::exit(-1); // TODO: have better error exit
        }
    }

    delete [] request;

    return;
}
|
||||
|
||||
/*!
|
||||
Communicates data that is at the border of the part of the domain assigned to this processor. A more optimized version of ExchangeHalo that is used for Grace path.
|
||||
|
||||
@param[in] A The known system matrix
|
||||
@param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
|
||||
non-local entries updated by other processors
|
||||
@param[in] use_ibarrier [Experimental] If 1, call MPI_Ibarrier after the communication is complete. A smart trick to improve MPI_Allreduce in DDOT,
|
||||
by calling MPI_Ibarrier once at the last routine call in MG.
|
||||
*/
|
||||
void ExchangeHaloCpu(const SparseMatrix& A, Vector& x, int use_ibarrier)
|
||||
{
|
||||
// Extract Matrix pieces
|
||||
local_int_t localNumberOfRows = A.localNumberOfRows;
|
||||
int num_neighbors = A.numberOfSendNeighbors;
|
||||
local_int_t* receiveLength = A.receiveLength;
|
||||
local_int_t* sendLength = A.sendLength;
|
||||
int* neighbors = A.neighborsPhysical;
|
||||
double* sendBuffer = A.sendBuffer;
|
||||
local_int_t totalToBeSent = A.totalToBeSent;
|
||||
local_int_t* elementsToSend = A.elementsToSend;
|
||||
|
||||
if (P2P_Mode == MPI_CPU)
|
||||
{
|
||||
double* const xv = x.values;
|
||||
double* x_external = (double*) xv + localNumberOfRows;
|
||||
int MPI_MY_TAG = 99;
|
||||
MPI_Request* request = new MPI_Request[num_neighbors];
|
||||
|
||||
// Post receives first
|
||||
for (int i = 0; i < num_neighbors; i++)
|
||||
{
|
||||
local_int_t n_recv = receiveLength[i];
|
||||
MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
|
||||
x_external += n_recv;
|
||||
}
|
||||
|
||||
for (local_int_t i = 0; i < totalToBeSent; i++)
|
||||
sendBuffer[i] = xv[elementsToSend[i]];
|
||||
|
||||
//
|
||||
// Send to each neighbor
|
||||
//
|
||||
for (int i = 0; i < num_neighbors; i++)
|
||||
{
|
||||
local_int_t n_send = sendLength[i];
|
||||
MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
|
||||
sendBuffer += n_send;
|
||||
}
|
||||
|
||||
//
|
||||
// Complete the reads issued above
|
||||
//
|
||||
|
||||
MPI_Waitall(num_neighbors, request, MPI_STATUSES_IGNORE);
|
||||
|
||||
//[Experimental] Can improve MPI_Allreduce performance
|
||||
#if 0
|
||||
if (use_ibarrier == 1)
|
||||
MPI_Ibarrier(MPI_COMM_WORLD, request);
|
||||
#endif
|
||||
|
||||
delete[] request;
|
||||
}
|
||||
else if (P2P_Mode == MPI_CPU_All2allv)
|
||||
{
|
||||
double* const xv = x.values;
|
||||
double* x_external = (double*) xv + localNumberOfRows;
|
||||
for (local_int_t i = 0; i < totalToBeSent; i++)
|
||||
sendBuffer[i] = xv[elementsToSend[i]];
|
||||
MPI_Alltoallv(
|
||||
sendBuffer, A.scounts, A.sdispls, MPI_DOUBLE, x_external, A.rcounts, A.rdispls, MPI_DOUBLE, MPI_COMM_WORLD);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
// ifndef HPCG_NO_MPI
|
||||
38
src/ExchangeHalo.hpp
Normal file
38
src/ExchangeHalo.hpp
Normal file
@@ -0,0 +1,38 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef EXCHANGEHALO_HPP
|
||||
#define EXCHANGEHALO_HPP
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
void ExchangeHalo(const SparseMatrix& A, Vector& x);
|
||||
void ExchangeHaloCpu(const SparseMatrix& A, Vector& x, int use_ibarrier = 0);
|
||||
#endif // EXCHANGEHALO_HPP
|
||||
158
src/GenerateCoarseProblem.cpp
Normal file
158
src/GenerateCoarseProblem.cpp
Normal file
@@ -0,0 +1,158 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file GenerateProblem.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "GenerateCoarseProblem.hpp"
|
||||
#include "GenerateGeometry.hpp"
|
||||
#include "GenerateProblem.hpp"
|
||||
#include "SetupHalo.hpp"
|
||||
#include <cassert>
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
// Used to find ranks for CPU and GPU programs
|
||||
extern int global_total_ranks;
|
||||
extern int* physical_rank_dims;
|
||||
#endif
|
||||
|
||||
/*!
|
||||
Routine to construct a prolongation/restriction operator for a given fine grid matrix
|
||||
solution (as computed by a direct solver).
|
||||
|
||||
@param[inout] Af - The known system matrix, on output its coarse operator, fine-to-coarse operator and auxiliary
|
||||
vectors will be defined.
|
||||
|
||||
Note that the matrix Af is considered const because the attributes we are modifying are declared as mutable.
|
||||
|
||||
*/
|
||||
|
||||
void GenerateCoarseProblem(const SparseMatrix& Af)
|
||||
{
|
||||
// Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
|
||||
// below may result in global range values.
|
||||
global_int_t nxf = Af.geom->nx;
|
||||
global_int_t nyf = Af.geom->ny;
|
||||
global_int_t nzf = Af.geom->nz;
|
||||
|
||||
local_int_t nxc, nyc, nzc; // Coarse nx, ny, nz
|
||||
assert(nxf % 2 == 0);
|
||||
assert(nyf % 2 == 0);
|
||||
assert(nzf % 2 == 0); // Need fine grid dimensions to be divisible by 2
|
||||
nxc = nxf / 2;
|
||||
nyc = nyf / 2;
|
||||
nzc = nzf / 2;
|
||||
local_int_t* f2cOperator = new local_int_t[Af.localNumberOfRows];
|
||||
|
||||
local_int_t localNumberOfRows = nxc * nyc * nzc; // This is the size of our subblock
|
||||
// If this assert fails, it most likely means that the local_int_t is set to int and should be set to long long
|
||||
assert(localNumberOfRows
|
||||
> 0); // Throw an exception of the number of rows is less than zero (can happen if "int" overflows)
|
||||
|
||||
for (int i = 0; i < 3 * global_total_ranks; i++)
|
||||
physical_rank_dims[i] = physical_rank_dims[i] / 2;
|
||||
|
||||
// Construct the geometry and linear system
|
||||
Geometry* geomc = new Geometry;
|
||||
GenerateGeometry(Af.geom->size, Af.geom->rank, Af.geom->numThreads, nxc, nyc, nzc, Af.geom->npx, Af.geom->npy,
|
||||
Af.geom->npz, Af.geom->different_dim, geomc);
|
||||
Vector* rc = new Vector;
|
||||
Vector* xc = new Vector;
|
||||
Vector* Axf = new Vector;
|
||||
MGData* mgData = new MGData;
|
||||
if (Af.rankType == GPU)
|
||||
{
|
||||
SparseMatrix* Ac = Af.Ac;
|
||||
Ac->rankType = GPU;
|
||||
InitializeSparseMatrix(*Ac, geomc);
|
||||
GenerateProblem(*Ac, 0, 0, 0);
|
||||
SetupHalo(*Ac);
|
||||
InitializeVector(*rc, Ac->localNumberOfRows, Ac->rankType);
|
||||
InitializeVector(*xc, Ac->localNumberOfColumns, Ac->rankType);
|
||||
InitializeVector(*Axf, Af.localNumberOfColumns, Ac->rankType);
|
||||
#ifdef USE_CUDA
|
||||
cudaMemcpy(f2cOperator, Af.gpuAux.f2c, sizeof(local_int_t) * localNumberOfRows, cudaMemcpyDeviceToHost);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
SparseMatrix* Ac = new SparseMatrix;
|
||||
InitializeSparseMatrix(*Ac, geomc);
|
||||
Ac->rankType = CPU;
|
||||
(*Ac).Ac = 0;
|
||||
GenerateProblem(*Ac, 0, 0, 0);
|
||||
SetupHalo(*Ac);
|
||||
InitializeVector(*rc, Ac->localNumberOfRows, Ac->rankType);
|
||||
InitializeVector(*xc, Ac->localNumberOfColumns, Ac->rankType);
|
||||
InitializeVector(*Axf, Af.localNumberOfColumns, Ac->rankType);
|
||||
Af.Ac = Ac;
|
||||
|
||||
// Use a parallel loop to do initial assignment:
|
||||
// distributes the physical placement of arrays of pointers across the memory system
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
for (local_int_t i = 0; i < localNumberOfRows; ++i)
|
||||
{
|
||||
f2cOperator[i] = 0;
|
||||
}
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
for(local_int_t i = 0; i < nzc * nyc * nxc; i++)
|
||||
{
|
||||
local_int_t izc = (i / (nxc * nyc));
|
||||
local_int_t iyc = (i - izc * nxc * nyc) / nxc;
|
||||
local_int_t ixc = i - (izc * nyc + iyc) * nxc;
|
||||
|
||||
local_int_t izf = 2 * izc;
|
||||
local_int_t iyf = 2 * iyc;
|
||||
local_int_t ixf = 2 * ixc;
|
||||
|
||||
local_int_t currentCoarseRow = izc * nxc * nyc + iyc * nxc + ixc;
|
||||
local_int_t currentFineRow = izf * nxf * nyf + iyf * nxf + ixf;
|
||||
f2cOperator[currentCoarseRow] = currentFineRow;
|
||||
}
|
||||
}
|
||||
InitializeMGData(f2cOperator, rc, xc, Axf, *mgData);
|
||||
Af.mgData = mgData;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
19
src/GenerateCoarseProblem.hpp
Normal file
19
src/GenerateCoarseProblem.hpp
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef GENERATECOARSEPROBLEM_HPP
|
||||
#define GENERATECOARSEPROBLEM_HPP
|
||||
#include "SparseMatrix.hpp"
|
||||
void GenerateCoarseProblem(const SparseMatrix& A);
|
||||
#endif // GENERATECOARSEPROBLEM_HPP
|
||||
801
src/GenerateGeometry.cpp
Normal file
801
src/GenerateGeometry.cpp
Normal file
@@ -0,0 +1,801 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file GenerateGeometry.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ComputeOptimalShapeXYZ.hpp"
|
||||
#include "GenerateGeometry.hpp"
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#ifdef HPCG_DEBUG
|
||||
#include "hpcg.hpp"
|
||||
#include <fstream>
|
||||
using std::endl;
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
// Used to find ranks for CPU and GPU programs
|
||||
extern int global_total_ranks;
|
||||
extern int* physical_rank_dims;
|
||||
extern int* logical_rank_to_phys;
|
||||
#endif
|
||||
|
||||
/*!
|
||||
Computes the factorization of the total number of processes into a
|
||||
3-dimensional process grid that is as close as possible to a cube. The
|
||||
quality of the factorization depends on the prime number structure of the
|
||||
total number of processes. It then stores this decompostion together with the
|
||||
parallel parameters of the run in the geometry data structure.
|
||||
|
||||
@param[in] size total number of MPI processes
|
||||
@param[in] rank this process' rank among other MPI processes
|
||||
@param[in] numThreads number of OpenMP threads in this process
|
||||
@param[in] nx, ny, nz number of grid points for each local block in the x, y, and z dimensions, respectively
|
||||
@param[out] geom data structure that will store the above parameters and the factoring of total number of processes
|
||||
into three dimensions
|
||||
*/
|
||||
|
||||
// Level 0 Generation, we need to decide nx, ny, nz based on
|
||||
// G2C ratio and npx, npy, npz
|
||||
// Remap rank IDs to logical IDs to enforce 3D shape correctness when exec_mode is GPUCPU
|
||||
void GenerateGeometry(HPCG_Params& params, Geometry* geom)
|
||||
{
|
||||
int size = params.comm_size, rank = params.comm_rank; // Number of MPI processes, My process ID
|
||||
int nx = params.nx, ny = params.ny, nz = params.nz;
|
||||
int npx = params.npx, npy = params.npy, npz = params.npz;
|
||||
|
||||
// If npx. npy, and npz are not provided by user
|
||||
// find the optimal shape
|
||||
if (npx * npy * npz <= 0 || npx * npy * npz > size)
|
||||
ComputeOptimalShapeXYZ(size, npx, npy, npz);
|
||||
|
||||
// When search_for_same0 is true, finds the next rank that is the same as local
|
||||
// problem size as rank 0. When false, finds the ranks that are not the same as rank 0
|
||||
auto loop_over_ranks = [](int index, int lp, bool search_for_same0) -> int
|
||||
{
|
||||
for (int p = index; p < global_total_ranks; p++)
|
||||
{
|
||||
int nnpx = physical_rank_dims[3 * p];
|
||||
int nnpy = physical_rank_dims[3 * p + 1];
|
||||
int nnpz = physical_rank_dims[3 * p + 2];
|
||||
bool same_zero = false;
|
||||
if (nnpx == physical_rank_dims[0] && nnpy == physical_rank_dims[1] && nnpz == physical_rank_dims[2])
|
||||
same_zero = true;
|
||||
|
||||
if (same_zero == search_for_same0)
|
||||
{
|
||||
logical_rank_to_phys[lp] = p;
|
||||
index = p + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return index;
|
||||
};
|
||||
|
||||
// Here decide and broadcast nx, ny, nz
|
||||
// 1 Check for GPU and CPU execution modes
|
||||
auto user_diff_dim = NONE;
|
||||
if (params.exec_mode == GPUCPU)
|
||||
{
|
||||
// User defined diff direction between GPU and CPU
|
||||
// If user decides that nz should be diff between GPU and CPU
|
||||
// and NPZ is even --> Decide GPU and CPU local size based on
|
||||
// local_problem_def and g2c
|
||||
if (params.diff_dim == Z && (npz & 1) == 0)
|
||||
{
|
||||
user_diff_dim = Z;
|
||||
if (params.local_problem_def == GPU_RATIO)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
nz = nz / params.g2c;
|
||||
}
|
||||
else if (params.local_problem_def == GPU_ABS)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
nz = params.g2c;
|
||||
}
|
||||
else if (params.local_problem_def == GPU_CPU_RATIO)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
nz = nz / params.g2c;
|
||||
if (params.rank_type == GPU)
|
||||
nz = nz - (nz / params.g2c);
|
||||
}
|
||||
else
|
||||
{ /*GPU_CPU_ABS*/
|
||||
if (params.rank_type == CPU)
|
||||
nz = params.g2c;
|
||||
if (params.rank_type == GPU)
|
||||
nz = nz - params.g2c;
|
||||
}
|
||||
}
|
||||
// If user decides that ny should be diff between GPU and CPU
|
||||
// and NPY is even --> Decide GPU and CPU local size based on
|
||||
// local_problem_def and g2c
|
||||
else if (params.diff_dim == Y && (npy & 1) == 0)
|
||||
{
|
||||
user_diff_dim = Y;
|
||||
if (params.local_problem_def == GPU_RATIO)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
ny = ny / params.g2c;
|
||||
}
|
||||
else if (params.local_problem_def == GPU_ABS)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
ny = params.g2c;
|
||||
}
|
||||
else if (params.local_problem_def == GPU_CPU_RATIO)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
ny = ny / params.g2c;
|
||||
if (params.rank_type == GPU)
|
||||
ny = ny - (ny / params.g2c);
|
||||
}
|
||||
else
|
||||
{ /*GPU_CPU_ABS*/
|
||||
if (params.rank_type == CPU)
|
||||
ny = params.g2c;
|
||||
if (params.rank_type == GPU)
|
||||
ny = ny - params.g2c;
|
||||
}
|
||||
}
|
||||
// If user decides that nx should be diff between GPU and CPU
|
||||
// and NPX is even --> Decide GPU and CPU local size based on
|
||||
// local_problem_def and g2c
|
||||
else if (params.diff_dim == X && (npx & 1) == 0)
|
||||
{
|
||||
user_diff_dim = X;
|
||||
if (params.local_problem_def == GPU_RATIO)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
nx = nx / params.g2c;
|
||||
}
|
||||
else if (params.local_problem_def == GPU_ABS)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
nx = params.g2c;
|
||||
}
|
||||
else if (params.local_problem_def == GPU_CPU_RATIO)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
nx = nx / params.g2c;
|
||||
if (params.rank_type == GPU)
|
||||
nx = nx - (nx / params.g2c);
|
||||
}
|
||||
else
|
||||
{ /*GPU_CPU_ABS*/
|
||||
if (params.rank_type == CPU)
|
||||
nx = params.g2c;
|
||||
if (params.rank_type == GPU)
|
||||
nx = nx - params.g2c;
|
||||
}
|
||||
}
|
||||
// Automatic partition direction
|
||||
// When user does not specify the diff dimension
|
||||
if (user_diff_dim == NONE)
|
||||
{ // Did not succeed with user choice
|
||||
if ((npz & 1) == 0)
|
||||
{
|
||||
if (params.local_problem_def == GPU_RATIO)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
nz = nz / params.g2c;
|
||||
}
|
||||
else if (params.local_problem_def == GPU_ABS)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
nz = params.g2c;
|
||||
}
|
||||
else if (params.local_problem_def == GPU_CPU_RATIO)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
nz = nz / params.g2c;
|
||||
if (params.rank_type == GPU)
|
||||
nz = nz - (nz / params.g2c);
|
||||
}
|
||||
else
|
||||
{ /*GPU_CPU_ABS*/
|
||||
if (params.rank_type == CPU)
|
||||
nz = params.g2c;
|
||||
if (params.rank_type == GPU)
|
||||
nz = nz - params.g2c;
|
||||
}
|
||||
}
|
||||
else if ((npy & 1) == 0)
|
||||
{
|
||||
if (params.local_problem_def == GPU_RATIO)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
ny = ny / params.g2c;
|
||||
}
|
||||
else if (params.local_problem_def == GPU_ABS)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
ny = params.g2c;
|
||||
}
|
||||
else if (params.local_problem_def == GPU_CPU_RATIO)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
ny = ny / params.g2c;
|
||||
if (params.rank_type == GPU)
|
||||
ny = ny - (ny / params.g2c);
|
||||
}
|
||||
else
|
||||
{ /*GPU_CPU_ABS*/
|
||||
if (params.rank_type == CPU)
|
||||
ny = params.g2c;
|
||||
if (params.rank_type == GPU)
|
||||
ny = ny - params.g2c;
|
||||
}
|
||||
}
|
||||
else if ((npx & 1) == 0)
|
||||
{
|
||||
if (params.local_problem_def == GPU_RATIO)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
nx = nx / params.g2c;
|
||||
}
|
||||
else if (params.local_problem_def == GPU_ABS)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
nx = params.g2c;
|
||||
}
|
||||
else if (params.local_problem_def == GPU_CPU_RATIO)
|
||||
{
|
||||
if (params.rank_type == CPU)
|
||||
nx = nx / params.g2c;
|
||||
if (params.rank_type == GPU)
|
||||
nx = nx - (nx / params.g2c);
|
||||
}
|
||||
else
|
||||
{ /*GPU_CPU_ABS*/
|
||||
if (params.rank_type == CPU)
|
||||
nx = params.g2c;
|
||||
if (params.rank_type == GPU)
|
||||
nx = nx - params.g2c;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Now let us exchange dimensions
|
||||
int sendBuf[] = {nx, ny, nz};
|
||||
#ifndef HPCG_NO_MPI
|
||||
MPI_Allgather(sendBuf, 3, MPI_INT, physical_rank_dims, 3, MPI_INT, MPI_COMM_WORLD);
|
||||
#endif
|
||||
|
||||
// My logical rank Id
|
||||
int logical_rank;
|
||||
// last physical position for the rank that has the same size as 0
|
||||
int same_as_0_position = 0;
|
||||
// last physical position for the rank that does not have the same size as 0
|
||||
int not_same_as_0_position = 0;
|
||||
auto different_dim = NONE;
|
||||
|
||||
bool all_same = true;
|
||||
int num_ranks_same = 1;
|
||||
int num_ranks_not_same = 0;
|
||||
int x0 = physical_rank_dims[0];
|
||||
int y0 = physical_rank_dims[1];
|
||||
int z0 = physical_rank_dims[2];
|
||||
for (int p = 1; p < global_total_ranks; p++)
|
||||
{
|
||||
int x = physical_rank_dims[3 * p];
|
||||
int y = physical_rank_dims[3 * p + 1];
|
||||
int z = physical_rank_dims[3 * p + 2];
|
||||
if (x != x0 || y != y0 || z != z0)
|
||||
num_ranks_not_same++;
|
||||
else
|
||||
num_ranks_same++;
|
||||
}
|
||||
|
||||
if (num_ranks_not_same > 0)
|
||||
all_same = false;
|
||||
|
||||
if (!all_same)
|
||||
{
|
||||
// try twice: user-based, automatic
|
||||
for (int i = 0; i < 2; i++)
|
||||
{
|
||||
bool z_condition = (i == 0) ? user_diff_dim == Z && (npz & 1) == 0 : (npz & 1) == 0;
|
||||
bool y_condition = (i == 0) ? user_diff_dim == Y && (npy & 1) == 0 : (npy & 1) == 0;
|
||||
bool x_condition = (i == 0) ? user_diff_dim == X && (npx & 1) == 0 : (npx & 1) == 0;
|
||||
// Let us start with Z
|
||||
if (z_condition)
|
||||
{ // Z is even
|
||||
different_dim = Z;
|
||||
bool x_same = true;
|
||||
bool y_same = true;
|
||||
for (int p = 1; p < global_total_ranks; p++)
|
||||
{
|
||||
int x = physical_rank_dims[3 * p];
|
||||
int y = physical_rank_dims[3 * p + 1];
|
||||
assert(x == x0 && y == y0);
|
||||
}
|
||||
}
|
||||
else if (y_condition)
|
||||
{ // Y is even
|
||||
different_dim = Y;
|
||||
bool x_same = true;
|
||||
bool z_same = true;
|
||||
for (int p = 1; p < global_total_ranks; p++)
|
||||
{
|
||||
int x = physical_rank_dims[3 * p];
|
||||
int z = physical_rank_dims[3 * p + 2];
|
||||
assert(x == x0 && z == z0);
|
||||
}
|
||||
}
|
||||
else if (x_condition)
|
||||
{
|
||||
different_dim = X;
|
||||
bool y_same = true;
|
||||
bool z_same = true;
|
||||
for (int p = 1; p < global_total_ranks; p++)
|
||||
{
|
||||
int y = physical_rank_dims[3 * p + 1];
|
||||
int z = physical_rank_dims[3 * p + 2];
|
||||
assert(z == z0 && y == y0);
|
||||
}
|
||||
}
|
||||
|
||||
if (z_condition || y_condition || x_condition)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// When exec_mode is GPUCPU, GPU and CPU ranks can have different dims. Therefore,
|
||||
// we must rearrange the ranks such that the 3D shape is correct.
|
||||
int same_rank_counter = 0;
|
||||
if (different_dim != NONE)
|
||||
{
|
||||
for (int iz = 0; iz < npz; iz++)
|
||||
for (int iy = 0; iy < npy; iy++)
|
||||
for (int ix = 0; ix < npx; ix++)
|
||||
{
|
||||
int logical_position = iz * npy * npx + iy * npx + ix;
|
||||
|
||||
// Different dim is Z
|
||||
// The first NPXxNPY are GPUs, then the next NPXxNPY is CPUs, and so on
|
||||
if (different_dim == Z)
|
||||
{
|
||||
if ((iz & 1) == 0 && same_rank_counter < num_ranks_same)
|
||||
{ // same as 0
|
||||
same_as_0_position = loop_over_ranks(same_as_0_position, logical_position, true);
|
||||
same_rank_counter++;
|
||||
}
|
||||
else
|
||||
{ // Not same as 0
|
||||
not_same_as_0_position = loop_over_ranks(not_same_as_0_position, logical_position, false);
|
||||
}
|
||||
}
|
||||
// Different dim is Y
|
||||
// The first NPXxNPZ are GPUs, then the next NPXxNPZ is CPUs, and so on
|
||||
else if (different_dim == Y)
|
||||
{
|
||||
if ((iy & 1) == 0 && same_rank_counter < num_ranks_same)
|
||||
{ // same as 0
|
||||
same_as_0_position = loop_over_ranks(same_as_0_position, logical_position, true);
|
||||
same_rank_counter++;
|
||||
}
|
||||
else
|
||||
{ // Not same as 0
|
||||
not_same_as_0_position = loop_over_ranks(not_same_as_0_position, logical_position, false);
|
||||
}
|
||||
}
|
||||
// Different dim is X
|
||||
// The first NPYxNPZ are GPUs, then the next NPYxNPZ is CPUs, and so on
|
||||
else if (different_dim == X)
|
||||
{
|
||||
if ((ix & 1) == 0 && same_rank_counter < num_ranks_same)
|
||||
{ // same as 0
|
||||
same_as_0_position = loop_over_ranks(same_as_0_position, logical_position, true);
|
||||
same_rank_counter++;
|
||||
}
|
||||
else
|
||||
{ // Not same as 0
|
||||
not_same_as_0_position = loop_over_ranks(not_same_as_0_position, logical_position, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Keep rank Ids the same if all ranks have the same problem size
|
||||
for (int p = 0; p < global_total_ranks; p++)
|
||||
logical_rank_to_phys[p] = p;
|
||||
}
|
||||
|
||||
for (int p = 0; p < global_total_ranks; p++)
|
||||
{
|
||||
if (rank == logical_rank_to_phys[p])
|
||||
{
|
||||
logical_rank = p;
|
||||
}
|
||||
}
|
||||
|
||||
// Now compute this process's indices in the 3D cube
|
||||
int ipz = logical_rank / (npx * npy);
|
||||
int ipy = (logical_rank - ipz * npx * npy) / npx;
|
||||
int ipx = logical_rank % npx;
|
||||
|
||||
#ifdef HPCG_DEBUG
|
||||
if (rank == 0)
|
||||
HPCG_fout << "size = " << size << endl
|
||||
<< "nx = " << nx << endl
|
||||
<< "ny = " << ny << endl
|
||||
<< "nz = " << nz << endl
|
||||
<< "npx = " << npx << endl
|
||||
<< "npy = " << npy << endl
|
||||
<< "npz = " << npz << endl;
|
||||
|
||||
HPCG_fout << "For rank = " << rank << endl
|
||||
<< "ipx = " << ipx << endl
|
||||
<< "ipy = " << ipy << endl
|
||||
<< "ipz = " << ipz << endl;
|
||||
|
||||
assert(size >= npx * npy * npz);
|
||||
#endif
|
||||
geom->size = size;
|
||||
geom->rank = rank;
|
||||
geom->logical_rank = logical_rank;
|
||||
geom->different_dim = different_dim;
|
||||
geom->numThreads = params.numThreads;
|
||||
geom->nx = nx;
|
||||
geom->ny = ny;
|
||||
geom->nz = nz;
|
||||
geom->npx = npx;
|
||||
geom->npy = npy;
|
||||
geom->npz = npz;
|
||||
geom->ipx = ipx;
|
||||
geom->ipy = ipy;
|
||||
geom->ipz = ipz;
|
||||
|
||||
// These values should be defined to take into account changes in nx, ny, nz values
|
||||
// due to variable local grid sizes
|
||||
global_int_t gnx = 0;
|
||||
global_int_t gny = 0;
|
||||
global_int_t gnz = 0;
|
||||
|
||||
// Find the global NX. NY, and NZ
|
||||
// For diff dims, accumulate sequentially
|
||||
// For similar dims, just multiply rank 3D location by the local dim
|
||||
if (different_dim == X)
|
||||
for (int i = 0; i < npx; i++)
|
||||
{
|
||||
int r = ipz * npx * npy + ipy * npx + i;
|
||||
int p = logical_rank_to_phys[r];
|
||||
gnx += physical_rank_dims[p * 3];
|
||||
}
|
||||
else
|
||||
gnx = npx * nx;
|
||||
|
||||
if (different_dim == Y)
|
||||
for (int i = 0; i < npy; i++)
|
||||
{
|
||||
int r = ipz * npx * npy + i * npx + ipx;
|
||||
int p = logical_rank_to_phys[r];
|
||||
gny += physical_rank_dims[p * 3 + 1];
|
||||
}
|
||||
else
|
||||
gny = npy * ny;
|
||||
|
||||
if (different_dim == Z)
|
||||
for (int i = 0; i < npz; i++)
|
||||
{
|
||||
int r = i * npx * npy + ipy * npx + ipx;
|
||||
int p = logical_rank_to_phys[r];
|
||||
gnz += physical_rank_dims[p * 3 + 2];
|
||||
}
|
||||
else
|
||||
gnz = npz * nz;
|
||||
|
||||
// Here, we find the initial global indices (gix0, giy0, and giz0)
|
||||
// for each rank based on its 3d location in the grid
|
||||
// Also, for the diff dim find the previous and next neighbor IDs
|
||||
// Notice, on the diff dims the previous and next neighbors have
|
||||
// the different dimension!
|
||||
int prev_n = 0;
|
||||
int next_n = 0;
|
||||
global_int_t giz0 = 0;
|
||||
global_int_t gix0 = 0;
|
||||
global_int_t giy0 = 0;
|
||||
if (different_dim == X)
|
||||
{
|
||||
for (int i = 0; i < ipx; i++)
|
||||
{
|
||||
int r = ipz * npx * npy + ipy * npx + i;
|
||||
int p = logical_rank_to_phys[r];
|
||||
gix0 += physical_rank_dims[p * 3];
|
||||
if (i == ipx - 1)
|
||||
{
|
||||
prev_n = physical_rank_dims[p * 3];
|
||||
}
|
||||
}
|
||||
if (ipx + 1 < npx)
|
||||
{
|
||||
int r = ipz * npx * npy + ipy * npx + (ipx + 1);
|
||||
int p = logical_rank_to_phys[r];
|
||||
next_n = physical_rank_dims[p * 3];
|
||||
}
|
||||
}
|
||||
else
|
||||
gix0 = ipx * nx;
|
||||
|
||||
if (different_dim == Y)
|
||||
{
|
||||
for (int i = 0; i < ipy; i++)
|
||||
{
|
||||
int r = ipz * npx * npy + i * npx + ipx;
|
||||
int p = logical_rank_to_phys[r];
|
||||
giy0 += physical_rank_dims[p * 3 + 1];
|
||||
if (i == ipy - 1)
|
||||
{
|
||||
prev_n = physical_rank_dims[p * 3 + 1];
|
||||
}
|
||||
}
|
||||
if (ipy + 1 < npy)
|
||||
{
|
||||
int r = ipz * npx * npy + (ipy + 1) * npx + ipx;
|
||||
int p = logical_rank_to_phys[r];
|
||||
next_n = physical_rank_dims[p * 3 + 1];
|
||||
}
|
||||
}
|
||||
else
|
||||
giy0 = ipy * ny;
|
||||
|
||||
if (different_dim == Z)
|
||||
{
|
||||
for (int i = 0; i < ipz; i++)
|
||||
{
|
||||
int r = i * npx * npy + ipy * npx + ipx;
|
||||
int p = logical_rank_to_phys[r];
|
||||
giz0 += physical_rank_dims[p * 3 + 2];
|
||||
if (i == ipz - 1)
|
||||
{
|
||||
prev_n = physical_rank_dims[p * 3 + 2];
|
||||
}
|
||||
}
|
||||
if (ipz + 1 < npz)
|
||||
{
|
||||
int r = (ipz + 1) * npx * npy + ipy * npx + ipx;
|
||||
int p = logical_rank_to_phys[r];
|
||||
next_n = physical_rank_dims[p * 3 + 2];
|
||||
}
|
||||
}
|
||||
else
|
||||
giz0 = ipz * nz;
|
||||
|
||||
// Keep these values for later
|
||||
geom->gnx = gnx;
|
||||
geom->gny = gny;
|
||||
geom->gnz = gnz;
|
||||
geom->gix0 = gix0;
|
||||
geom->giy0 = giy0;
|
||||
geom->giz0 = giz0;
|
||||
geom->previous_neighbor_dim = prev_n;
|
||||
geom->next_neighbor_dim = next_n;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// Simpler generateion for next/coarse levels
|
||||
// Do not need to find nx, ny, nz for CPU and GPU based on parameters
|
||||
// Do not need to find logical rank IDs
|
||||
void GenerateGeometry(int size, int rank, int numThreads, local_int_t nx, local_int_t ny, local_int_t nz, int npx,
|
||||
int npy, int npz, dim_3d_t different_dim, Geometry* geom)
|
||||
{
|
||||
|
||||
// My logical rank Id
|
||||
int logical_rank;
|
||||
for (int p = 0; p < global_total_ranks; p++)
|
||||
{
|
||||
if (rank == logical_rank_to_phys[p])
|
||||
{
|
||||
logical_rank = p;
|
||||
}
|
||||
}
|
||||
|
||||
// Now compute this process's indices in the 3D cube
|
||||
int ipz = logical_rank / (npx * npy);
|
||||
int ipy = (logical_rank - ipz * npx * npy) / npx;
|
||||
int ipx = logical_rank % npx;
|
||||
|
||||
#ifdef HPCG_DEBUG
|
||||
if (rank == 0)
|
||||
HPCG_fout << "size = " << size << endl
|
||||
<< "nx = " << nx << endl
|
||||
<< "ny = " << ny << endl
|
||||
<< "nz = " << nz << endl
|
||||
<< "npx = " << npx << endl
|
||||
<< "npy = " << npy << endl
|
||||
<< "npz = " << npz << endl;
|
||||
|
||||
HPCG_fout << "For rank = " << rank << endl
|
||||
<< "ipx = " << ipx << endl
|
||||
<< "ipy = " << ipy << endl
|
||||
<< "ipz = " << ipz << endl;
|
||||
|
||||
assert(size >= npx * npy * npz);
|
||||
#endif
|
||||
geom->size = size;
|
||||
geom->rank = rank;
|
||||
geom->logical_rank = logical_rank;
|
||||
geom->different_dim = different_dim;
|
||||
geom->numThreads = numThreads;
|
||||
geom->nx = nx;
|
||||
geom->ny = ny;
|
||||
geom->nz = nz;
|
||||
geom->npx = npx;
|
||||
geom->npy = npy;
|
||||
geom->npz = npz;
|
||||
geom->ipx = ipx;
|
||||
geom->ipy = ipy;
|
||||
geom->ipz = ipz;
|
||||
|
||||
// Find the global NX. NY, and NZ
|
||||
// For diff dims, accumulate sequentially
|
||||
// For similar dims, just multiply rank 3D location by the local dim
|
||||
global_int_t gnx = 0;
|
||||
global_int_t gny = 0;
|
||||
global_int_t gnz = 0;
|
||||
if (different_dim == X)
|
||||
for (int i = 0; i < npx; i++)
|
||||
{
|
||||
int r = ipz * npx * npy + ipy * npx + i;
|
||||
int p = logical_rank_to_phys[r];
|
||||
gnx += physical_rank_dims[p * 3];
|
||||
}
|
||||
else
|
||||
gnx = npx * nx;
|
||||
|
||||
if (different_dim == Y)
|
||||
for (int i = 0; i < npy; i++)
|
||||
{
|
||||
int r = ipz * npx * npy + i * npx + ipx;
|
||||
int p = logical_rank_to_phys[r];
|
||||
gny += physical_rank_dims[p * 3 + 1];
|
||||
}
|
||||
else
|
||||
gny = npy * ny;
|
||||
|
||||
if (different_dim == Z)
|
||||
for (int i = 0; i < npz; i++)
|
||||
{
|
||||
int r = i * npx * npy + ipy * npx + ipx;
|
||||
int p = logical_rank_to_phys[r];
|
||||
gnz += physical_rank_dims[p * 3 + 2];
|
||||
}
|
||||
else
|
||||
gnz = npz * nz;
|
||||
|
||||
// Here, we find the initial global indices (gix0, giy0, and giz0)
|
||||
// for each rank based on its 3d location in the grid
|
||||
// Also, for the diff dim find the previous and next neighbor IDs
|
||||
// Notice, on the diff dims the previous and next neighbors have
|
||||
// the different dimension!
|
||||
int prev_n = 0;
|
||||
int next_n = 0;
|
||||
global_int_t giz0 = 0;
|
||||
global_int_t gix0 = 0;
|
||||
global_int_t giy0 = 0;
|
||||
if (different_dim == X)
|
||||
{
|
||||
for (int i = 0; i < ipx; i++)
|
||||
{
|
||||
int r = ipz * npx * npy + ipy * npx + i;
|
||||
int p = logical_rank_to_phys[r];
|
||||
gix0 += physical_rank_dims[p * 3];
|
||||
if (i == ipx - 1)
|
||||
{
|
||||
prev_n = physical_rank_dims[p * 3];
|
||||
}
|
||||
}
|
||||
if (ipx + 1 < npx)
|
||||
{
|
||||
int r = ipz * npx * npy + ipy * npx + (ipx + 1);
|
||||
int p = logical_rank_to_phys[r];
|
||||
next_n = physical_rank_dims[p * 3];
|
||||
}
|
||||
}
|
||||
else
|
||||
gix0 = ipx * nx;
|
||||
|
||||
if (different_dim == Y)
|
||||
{
|
||||
for (int i = 0; i < ipy; i++)
|
||||
{
|
||||
int r = ipz * npx * npy + i * npx + ipx;
|
||||
int p = logical_rank_to_phys[r];
|
||||
giy0 += physical_rank_dims[p * 3 + 1];
|
||||
if (i == ipy - 1)
|
||||
{
|
||||
prev_n = physical_rank_dims[p * 3 + 1];
|
||||
}
|
||||
}
|
||||
if (ipy + 1 < npy)
|
||||
{
|
||||
int r = ipz * npx * npy + (ipy + 1) * npx + ipx;
|
||||
int p = logical_rank_to_phys[r];
|
||||
next_n = physical_rank_dims[p * 3 + 1];
|
||||
}
|
||||
}
|
||||
else
|
||||
giy0 = ipy * ny;
|
||||
|
||||
if (different_dim == Z)
|
||||
{
|
||||
for (int i = 0; i < ipz; i++)
|
||||
{
|
||||
int r = i * npx * npy + ipy * npx + ipx;
|
||||
int p = logical_rank_to_phys[r];
|
||||
giz0 += physical_rank_dims[p * 3 + 2];
|
||||
if (i == ipz - 1)
|
||||
{
|
||||
prev_n = physical_rank_dims[p * 3 + 2];
|
||||
}
|
||||
}
|
||||
if (ipz + 1 < npz)
|
||||
{
|
||||
int r = (ipz + 1) * npx * npy + ipy * npx + ipx;
|
||||
int p = logical_rank_to_phys[r];
|
||||
next_n = physical_rank_dims[p * 3 + 2];
|
||||
}
|
||||
}
|
||||
else
|
||||
giz0 = ipz * nz;
|
||||
|
||||
// Keep these values for later
|
||||
geom->gnx = gnx;
|
||||
geom->gny = gny;
|
||||
geom->gnz = gnz;
|
||||
geom->gix0 = gix0;
|
||||
geom->giy0 = giy0;
|
||||
geom->giz0 = giz0;
|
||||
geom->previous_neighbor_dim = prev_n;
|
||||
geom->next_neighbor_dim = next_n;
|
||||
|
||||
return;
|
||||
}
|
||||
39
src/GenerateGeometry.hpp
Normal file
39
src/GenerateGeometry.hpp
Normal file
@@ -0,0 +1,39 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef GENERATEGEOMETRY_HPP
|
||||
#define GENERATEGEOMETRY_HPP
|
||||
#include "Geometry.hpp"
|
||||
#include "hpcg.hpp"
|
||||
void GenerateGeometry(HPCG_Params& params, Geometry* geom);
|
||||
void GenerateGeometry(int size, int rank, int numThreads, local_int_t nx, local_int_t ny, local_int_t nz, int npx,
|
||||
int npy, int npz, dim_3d_t partition_by, Geometry* geom);
|
||||
#endif // GENERATEGEOMETRY_HPP
|
||||
404
src/GenerateProblem.cpp
Normal file
404
src/GenerateProblem.cpp
Normal file
@@ -0,0 +1,404 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file GenerateProblem.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "mytimer.hpp"
|
||||
|
||||
#include "GenerateProblem.hpp"
|
||||
#include "GenerateProblem_ref.hpp"
|
||||
#ifdef USE_CUDA
|
||||
#include "Cuda.hpp"
|
||||
#include "CudaKernels.hpp"
|
||||
#endif
|
||||
|
||||
#ifdef USE_GRACE
|
||||
#include "CpuKernels.hpp"
|
||||
#endif
|
||||
|
||||
/*!
|
||||
Routine to generate a sparse matrix, right hand side, initial guess, and exact solution.
|
||||
|
||||
@param[in] A The generated system matrix
|
||||
@param[inout] b The newly allocated and generated right hand side vector (if b!=0 on entry)
|
||||
@param[inout] x The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
|
||||
@param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if the xexact!=0
|
||||
non-zero on entry)
|
||||
|
||||
@see GenerateGeometry
|
||||
*/
|
||||
#ifdef USE_CUDA
|
||||
/*!
  GPU variant of problem generation: allocates b/x/xexact on the device,
  delegates the matrix and vector fill to the CUDA kernels, then records the
  global problem statistics on the matrix object.

  @param[inout] A      The matrix being generated; A.geom must already be set.
  @param[inout] b      Right hand side vector, allocated and filled if non-null.
  @param[inout] x      Initial-guess vector, allocated and zeroed if non-null.
  @param[inout] xexact Exact-solution vector, allocated and filled if non-null.
*/
void GenerateProblem_Gpu(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
    // Local copies of the geometry; global_int_t so the products below cannot
    // overflow a 32-bit int for large global problems.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;

    local_int_t localNumberOfRows = nx * ny * nz;     // size of this rank's subblock
    global_int_t totalNumberOfRows = gnx * gny * gnz; // total grid points in the mesh

    if (b != 0)
        InitializeVector(*b, localNumberOfRows, GPU);
    if (x != 0)
        InitializeVector(*x, localNumberOfRows, GPU);
    if (xexact != 0)
        InitializeVector(*xexact, localNumberOfRows, GPU);

    // The CUDA kernels build the matrix structure/values and fill the vectors;
    // they also set A.localNumberOfNonzeros, which we read back below.
    GenerateProblemCuda(A, b, x, xexact);

    local_int_t localNumberOfNonzeros = A.localNumberOfNonzeros;
    // Closed-form global nonzero count for the 27-point stencil on a
    // gnx x gny x gnz grid: interior rows have 27 entries, face rows 18,
    // edge rows 12, and the 8 corner rows have 8.
    global_int_t totalNumberOfNonzeros = 27LL * ((gnx - 2LL) * (gny - 2LL) * (gnz - 2LL))
        + 18LL
            * (2LL * ((gnx - 2LL) * (gny - 2LL)) + 2LL * ((gnx - 2LL) * (gnz - 2LL))
                + 2LL * ((gny - 2LL) * (gnz - 2LL)))
        + 12LL * (4LL * (gnx - 2LL) + 4LL * (gny - 2LL) + 4LL * (gnz - 2LL)) + 8LL * 8LL;

    A.title = 0;
    A.totalNumberOfRows = totalNumberOfRows;
    A.totalNumberOfNonzeros = totalNumberOfNonzeros;
    A.localNumberOfRows = localNumberOfRows;
    // NOTE(review): columns == rows at this point; halo columns are presumably
    // appended during setup/exchange construction — confirm.
    A.localNumberOfColumns = localNumberOfRows;
    A.localNumberOfNonzeros = localNumberOfNonzeros;
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_GRACE
|
||||
// Neighbor rank to sequential ID and vice versa
|
||||
extern int *rankToId_h, *idToRank_h;
|
||||
// GenerateProblem_Cpu is called 4 times for each level
|
||||
// Sometimes we need to perform actions based on the level (global across the applications)
|
||||
int global_steps = 0;
|
||||
/*!
  CPU (Grace) variant of problem generation for one rank. Builds the local
  27-point stencil matrix in contiguous storage, fills b/x/xexact, and — driven
  by the file-global `global_steps` counter (this routine is called once per
  multigrid level) — allocates, zeroes, and finally compacts the tables that
  map physical neighbor ranks to small sequential IDs.

  @param[inout] A      The matrix being generated; A.geom must already be set.
  @param[inout] b      Right hand side vector, allocated and filled if non-null.
  @param[inout] x      Initial-guess vector, allocated and zeroed if non-null.
  @param[inout] xexact Exact-solution vector, set to all ones if non-null.
*/
void GenerateProblem_Cpu(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
    // Make local copies of geometry information. Use global_int_t since the RHS
    // products in the calculations below may result in global range values.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;
    int npx = A.geom->npx;
    int npy = A.geom->npy;

    local_int_t localNumberOfRows = nx * ny * nz; // This is the size of our subblock
    // If this assert fails, it most likely means that local_int_t is set to int
    // and should be set to long long (int overflow makes the product negative).
    assert(localNumberOfRows > 0);
    local_int_t numberOfNonzerosPerRow
        = 27; // We are approximating a 27-point finite element/volume/difference 3D stencil

    global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
    // If this assert fails, it most likely means that global_int_t is set to int
    // and should be set to long long.
    assert(totalNumberOfRows > 0);

    // First call overall: allocate the neighbor-rank mapping tables.
    // rankToId_h has one slot per rank plus a leading prefix-sum base slot;
    // idToRank_h holds at most the 26 stencil neighbors (27 is a safe bound).
    if (global_steps == 0)
    {
        rankToId_h = new int[A.geom->size + 1];
        idToRank_h = new int[27];
        global_steps++;
    }

    // Allocate arrays that are of length localNumberOfRows
    local_int_t* nonzerosInRow = new local_int_t[localNumberOfRows];
    global_int_t** mtxIndG = new global_int_t*[localNumberOfRows];
    local_int_t** mtxIndL = new local_int_t*[localNumberOfRows];
    double** matrixValues = new double*[localNumberOfRows];
    double** matrixDiagonal = new double*[localNumberOfRows];

    if (b != 0)
        InitializeVector(*b, localNumberOfRows, CPU);
    if (x != 0)
        InitializeVector(*x, localNumberOfRows, CPU);
    if (xexact != 0)
        InitializeVector(*xexact, localNumberOfRows, CPU);
    double* bv = 0;
    double* xv = 0;
    double* xexactv = 0;
    if (b != 0)
        bv = b->values; // Only compute right hand side if requested
    if (x != 0)
        xv = x->values; // Only compute initial guess if requested
    if (xexact != 0)
        xexactv = xexact->values; // Only compute exact solution if requested
    A.localToGlobalMap.resize(localNumberOfRows);

    // Use a parallel loop to do initial assignment:
    // distributes the physical placement of arrays of pointers across the memory system
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
    {
        matrixValues[i] = 0;
        matrixDiagonal[i] = 0;
        mtxIndG[i] = 0;
        mtxIndL[i] = 0;
    }

    // Second call overall: zero the rank-to-ID table before it is marked below.
    if (global_steps == 1)
    {
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < A.geom->size + 1; i++)
        {
            rankToId_h[i] = 0;
        }
        global_steps++;
    }

    // Now allocate the arrays pointed to: one contiguous slab per array, with
    // per-row pointers assigned inside the main loop below.
    mtxIndL[0] = new local_int_t[localNumberOfRows * numberOfNonzerosPerRow];
    matrixValues[0] = new double[localNumberOfRows * numberOfNonzerosPerRow];
    mtxIndG[0] = new global_int_t[localNumberOfRows * numberOfNonzerosPerRow];

    local_int_t localNumberOfNonzeros = 0;
    local_int_t ext_nnz = 0; // nonzeros whose column lives on another rank (halo)
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction(+ : localNumberOfNonzeros) reduction(+ : ext_nnz)
#endif
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        mtxIndL[i] = mtxIndL[0] + i * numberOfNonzerosPerRow;
        matrixValues[i] = matrixValues[0] + i * numberOfNonzerosPerRow;
        mtxIndG[i] = mtxIndG[0] + i * numberOfNonzerosPerRow;

        // Recover local (ix, iy, iz) coordinates from the flat row index,
        // then shift by this rank's base offsets to get global coordinates.
        const local_int_t iz = (i / (nx * ny));
        const local_int_t iy = (i - iz * nx * ny) / nx;
        const local_int_t ix = i - (iz * ny + iy) * nx;
        const global_int_t gix = ix + gix0;
        const global_int_t giy = iy + giy0;
        const global_int_t giz = iz + giz0;

        local_int_t currentLocalRow = i;
        global_int_t currentGlobalRow = gix + giy * gnx + giz * gnx * gny;

        A.localToGlobalMap[currentLocalRow] = currentGlobalRow;

        char numberOfNonzerosInRow = 0;
        double* currentValuePointer = matrixValues[currentLocalRow];
        global_int_t* currentIndexPointerG = mtxIndG[currentLocalRow];
        double* diagonalPointer = nullptr;
        // Go through all the neighbors around a 3D point to decide
        // which one is a halo and which one is local to the rank
        for (int k = 0; k < 27; k++)
        {
            // Neighbor global coordinates (tid2indCpu enumerates the 3x3x3 offsets)
            long long int cgix = gix + tid2indCpu[k][0];
            long long int cgiy = giy + tid2indCpu[k][1];
            long long int cgiz = giz + tid2indCpu[k][2];

            // Is the global 3D point inside the global problem?
            int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;

            if (ok /*Yes this is a valid point globally*/)
            {
                *currentIndexPointerG++ = cgix + cgiy * gnx + cgiz * gnx * gny;
                if (k == 13) // k == 13 is the center of the 3x3x3 stencil: the diagonal entry
                {
                    *currentValuePointer = 26.0;
                    diagonalPointer = currentValuePointer;
                }
                else
                {
                    *currentValuePointer = -1.0;
                }

                // Rank coordinates of the neighbor point in the global domain
                int ipz = cgiz / nz;
                int ipy = cgiy / ny;
                int ipx = cgix / nx;

                // For GPUCPU exec mode, when the CPU and GPU have different dims
                // in a direction, we must find the point's rank manually: not from
                // its local dimension but from its physical location relative to
                // the local problem. Note the halo size is always 1.
                if (A.geom->different_dim == Z)
                {
                    long long int local = cgiz - giz0;
                    if (local >= 0 && local < nz)
                        ipz = A.geom->ipz;
                    else if (local < 0)
                        ipz = A.geom->ipz - 1;
                    else if (local >= nz)
                        ipz = A.geom->ipz + 1;
                }
                else if (A.geom->different_dim == Y)
                {
                    long long int local = cgiy - giy0;
                    if (local >= 0 && local < ny)
                        ipy = A.geom->ipy;
                    else if (local < 0)
                        ipy = A.geom->ipy - 1;
                    else if (local >= ny)
                        ipy = A.geom->ipy + 1;
                }
                else if (A.geom->different_dim == X)
                {
                    long long int local = cgix - gix0;
                    if (local >= 0 && local < nx)
                        ipx = A.geom->ipx;
                    else if (local < 0)
                        ipx = A.geom->ipx - 1;
                    else if (local >= nx)
                        ipx = A.geom->ipx + 1;
                }

                // Flatten the rank coordinates (ranks domain NPX x NPY x NPZ)
                int col_rank = ipx + ipy * npx + ipz * npy * npx;

                // The neighbor point belongs to a different rank than ours
                if (A.geom->logical_rank != col_rank)
                {
                    if (global_steps == 2)
                        rankToId_h[col_rank + 1] = 1; // Mark neighbor; prefix-summed later.
                                                      // Concurrent threads only ever write 1 here,
                                                      // so the data race is benign.
                    ext_nnz++;
                }

                currentValuePointer++;
                numberOfNonzerosInRow++;
            }
        }

        matrixDiagonal[currentLocalRow] = diagonalPointer;
        nonzerosInRow[currentLocalRow] = numberOfNonzerosInRow;
        localNumberOfNonzeros += numberOfNonzerosInRow;
        // b is the row sum, so the exact solution of Ax=b is all ones:
        // 26 (diagonal) minus one per off-diagonal entry.
        if (b != 0)
            bv[currentLocalRow] = 26.0 - ((double) (numberOfNonzerosInRow - 1));
        if (x != 0)
            xv[currentLocalRow] = 0.0;
        if (xexact != 0)
            xexactv[currentLocalRow] = 1.0;
    }

    // Prefix-sum rankToId_h so each marked physical neighbor rank maps to a
    // compact sequential ID (less memory consumption); then build the inverse
    // table idToRank_h. Done once, on the call where the marks were written.
    if (global_steps == 2)
    {
        PrefixsumCpu(rankToId_h + 1, A.geom->size);
        int counter = 1;
        for (int i = 1; i < A.geom->size + 1; i++)
        {
            if (rankToId_h[i] == counter)
            {
                idToRank_h[counter - 1] = i - 1;
                counter++;
            }
        }
        global_steps++;
    }

#ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
              << endl
              << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
              << " nonzeros." << endl;
#endif

    // Closed-form global nonzero count for the 27-point stencil: interior rows
    // have 27 entries, face rows 18, edge rows 12, and the 8 corner rows have 8.
    global_int_t totalNumberOfNonzeros = 27LL * ((gnx - 2LL) * (gny - 2LL) * (gnz - 2LL))
        + 18LL
            * (2LL * ((gnx - 2LL) * (gny - 2LL)) + 2LL * ((gnx - 2LL) * (gnz - 2LL))
                + 2LL * ((gny - 2LL) * (gnz - 2LL)))
        + 12LL * (4LL * (gnx - 2LL) + 4LL * (gny - 2LL) + 4LL * (gnz - 2LL)) + 8LL * 8LL;

    // If this assert fails, it most likely means that global_int_t is set to int
    // and should be set to long long. This assert is usually the first to fail
    // as the problem size increases beyond the 32-bit integer range.
    assert(totalNumberOfNonzeros > 0);

    A.title = 0;
    A.totalNumberOfRows = totalNumberOfRows;
    A.totalNumberOfNonzeros = totalNumberOfNonzeros;
    A.localNumberOfRows = localNumberOfRows;
    A.localNumberOfColumns = localNumberOfRows;
    A.localNumberOfNonzeros = localNumberOfNonzeros;
    A.nonzerosInRow = nonzerosInRow;
    A.mtxIndG = mtxIndG;
    A.mtxIndL = mtxIndL;
    A.matrixValues = matrixValues;
    A.matrixDiagonal = matrixDiagonal;
    A.extNnz = ext_nnz;
}
|
||||
#endif // USE_GRACE
|
||||
|
||||
/*!
  Dispatches problem generation to the backend matching this rank's type.
  Each backend is compiled in only when its toolchain support macro is
  defined, so in a single-backend build the other branch is a no-op.

  @param[inout] A      The matrix being generated.
  @param[inout] b      Right hand side vector (may be null).
  @param[inout] x      Initial-guess vector (may be null).
  @param[inout] xexact Exact-solution vector (may be null).
*/
void GenerateProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
    if (A.rankType != GPU)
    {
        // CPU rank: only the Grace-optimized generator is available.
#ifdef USE_GRACE
        GenerateProblem_Cpu(A, b, x, xexact);
#endif
    }
    else
    {
        // GPU rank: generation happens in CUDA kernels.
#ifdef USE_CUDA
        GenerateProblem_Gpu(A, b, x, xexact);
#endif
    }
}
|
||||
20
src/GenerateProblem.hpp
Normal file
20
src/GenerateProblem.hpp
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef GENERATEPROBLEM_HPP
|
||||
#define GENERATEPROBLEM_HPP
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
void GenerateProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
|
||||
#endif // GENERATEPROBLEM_HPP
|
||||
251
src/GenerateProblem_ref.cpp
Normal file
251
src/GenerateProblem_ref.cpp
Normal file
@@ -0,0 +1,251 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file GenerateProblem_ref.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG)
|
||||
#include <fstream>
|
||||
using std::endl;
|
||||
#include "hpcg.hpp"
|
||||
#endif
|
||||
#include <cassert>
|
||||
|
||||
#include "GenerateProblem_ref.hpp"
|
||||
|
||||
/*!
|
||||
Reference version of GenerateProblem to generate the sparse matrix, right hand side, initial guess, and exact
|
||||
solution.
|
||||
|
||||
@param[in] A The known system matrix
|
||||
@param[inout] b The newly allocated and generated right hand side vector (if b!=0 on entry)
|
||||
@param[inout] x The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
|
||||
@param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if the xexact!=0
|
||||
non-zero on entry)
|
||||
|
||||
@see GenerateGeometry
|
||||
*/
|
||||
|
||||
void GenerateProblem_ref(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{

    // Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;

    local_int_t localNumberOfRows = nx * ny * nz; // This is the size of our subblock
    // If this assert fails, it most likely means that local_int_t is set to int and should be set to long long
    assert(localNumberOfRows
        > 0); // Fails if the number of rows went negative (can happen on int overflow)
    local_int_t numberOfNonzerosPerRow
        = 27; // We are approximating a 27-point finite element/volume/difference 3D stencil

    global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    assert(totalNumberOfRows
        > 0); // Fails if the number of rows went negative (can happen on int overflow)

    // Allocate arrays that are of length localNumberOfRows
    local_int_t* nonzerosInRow = new local_int_t[localNumberOfRows];
    global_int_t** mtxIndG = new global_int_t*[localNumberOfRows];
    local_int_t** mtxIndL = new local_int_t*[localNumberOfRows];
    double** matrixValues = new double*[localNumberOfRows];
    double** matrixDiagonal = new double*[localNumberOfRows];

    if (b != 0)
        InitializeVector(*b, localNumberOfRows, CPU);
    if (x != 0)
        InitializeVector(*x, localNumberOfRows, CPU);
    if (xexact != 0)
        InitializeVector(*xexact, localNumberOfRows, CPU);
    double* bv = 0;
    double* xv = 0;
    double* xexactv = 0;
    if (b != 0)
        bv = b->values; // Only compute right hand side if requested
    if (x != 0)
        xv = x->values; // Only compute initial guess if requested
    if (xexact != 0)
        xexactv = xexact->values; // Only compute exact solution if requested
    A.localToGlobalMap.resize(localNumberOfRows);

    // Use a parallel loop to do initial assignment:
    // distributes the physical placement of arrays of pointers across the memory system
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
    {
        matrixValues[i] = 0;
        matrixDiagonal[i] = 0;
        mtxIndG[i] = 0;
        mtxIndL[i] = 0;
    }

#ifndef HPCG_CONTIGUOUS_ARRAYS
    // Now allocate the arrays pointed to (one small allocation per row)
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
        mtxIndL[i] = new local_int_t[numberOfNonzerosPerRow];
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
        matrixValues[i] = new double[numberOfNonzerosPerRow];
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
        mtxIndG[i] = new global_int_t[numberOfNonzerosPerRow];

#else
    // Now allocate the arrays pointed to (one contiguous slab per array;
    // row i points at offset i * numberOfNonzerosPerRow)
    mtxIndL[0] = new local_int_t[localNumberOfRows * numberOfNonzerosPerRow];
    matrixValues[0] = new double[localNumberOfRows * numberOfNonzerosPerRow];
    mtxIndG[0] = new global_int_t[localNumberOfRows * numberOfNonzerosPerRow];

    for (local_int_t i = 1; i < localNumberOfRows; ++i)
    {
        mtxIndL[i] = mtxIndL[0] + i * numberOfNonzerosPerRow;
        matrixValues[i] = matrixValues[0] + i * numberOfNonzerosPerRow;
        mtxIndG[i] = mtxIndG[0] + i * numberOfNonzerosPerRow;
    }
#endif

    local_int_t localNumberOfNonzeros = 0;
    // TODO: This triply nested loop could be flattened or use nested parallelism
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t iz = 0; iz < nz; iz++)
    {
        global_int_t giz = giz0 + iz;
        for (local_int_t iy = 0; iy < ny; iy++)
        {
            global_int_t giy = giy0 + iy;
            for (local_int_t ix = 0; ix < nx; ix++)
            {
                global_int_t gix = gix0 + ix;
                local_int_t currentLocalRow = iz * nx * ny + iy * nx + ix;
                global_int_t currentGlobalRow = giz * gnx * gny + giy * gnx + gix;
#ifndef HPCG_NO_OPENMP
                // C++ std::map is not threadsafe for writing
#pragma omp critical
#endif
                A.globalToLocalMap[currentGlobalRow] = currentLocalRow;

                A.localToGlobalMap[currentLocalRow] = currentGlobalRow;
#ifdef HPCG_DETAILED_DEBUG
                HPCG_fout << " rank, globalRow, localRow = " << A.geom->rank << " " << currentGlobalRow << " "
                          << A.globalToLocalMap[currentGlobalRow] << endl;
#endif
                char numberOfNonzerosInRow = 0;
                double* currentValuePointer = matrixValues[currentLocalRow]; // Pointer to current value in current row
                global_int_t* currentIndexPointerG
                    = mtxIndG[currentLocalRow]; // Pointer to current index in current row
                // Visit the 3x3x3 stencil neighborhood; entries outside the
                // global grid are skipped, so boundary rows have fewer nonzeros.
                for (int sz = -1; sz <= 1; sz++)
                {
                    if (giz + sz > -1 && giz + sz < gnz)
                    {
                        for (int sy = -1; sy <= 1; sy++)
                        {
                            if (giy + sy > -1 && giy + sy < gny)
                            {
                                for (int sx = -1; sx <= 1; sx++)
                                {
                                    if (gix + sx > -1 && gix + sx < gnx)
                                    {
                                        global_int_t curcol = currentGlobalRow + sz * gnx * gny + sy * gnx + sx;
                                        if (curcol == currentGlobalRow)
                                        {
                                            // Diagonal entry: value 26, remember its address
                                            matrixDiagonal[currentLocalRow] = currentValuePointer;
                                            *currentValuePointer++ = 26.0;
                                        }
                                        else
                                        {
                                            *currentValuePointer++ = -1.0;
                                        }
                                        *currentIndexPointerG++ = curcol;
                                        numberOfNonzerosInRow++;
                                    } // end x bounds test
                                }     // end sx loop
                            }         // end y bounds test
                        }             // end sy loop
                    }                 // end z bounds test
                }                     // end sz loop
                nonzerosInRow[currentLocalRow] = numberOfNonzerosInRow;
#ifndef HPCG_NO_OPENMP
#pragma omp critical
#endif
                localNumberOfNonzeros += numberOfNonzerosInRow; // Protected by the critical section above
                // b is the row sum, making the exact solution of Ax=b all ones
                if (b != 0)
                    bv[currentLocalRow] = 26.0 - ((double) (numberOfNonzerosInRow - 1));
                if (x != 0)
                    xv[currentLocalRow] = 0.0;
                if (xexact != 0)
                    xexactv[currentLocalRow] = 1.0;
            } // end ix loop
        }     // end iy loop
    }         // end iz loop
#ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
              << endl
              << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
              << " nonzeros." << endl;
#endif

    global_int_t totalNumberOfNonzeros = 0;
#ifndef HPCG_NO_MPI
    // Use MPI's reduce function to sum all nonzeros
#ifdef HPCG_NO_LONG_LONG
    MPI_Allreduce(&localNumberOfNonzeros, &totalNumberOfNonzeros, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
#else
    long long lnnz = localNumberOfNonzeros, gnnz = 0; // convert to 64 bit for MPI call
    MPI_Allreduce(&lnnz, &gnnz, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
    totalNumberOfNonzeros = gnnz; // Copy back
#endif
#else
    totalNumberOfNonzeros = localNumberOfNonzeros;
#endif
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    // This assert is usually the first to fail as problem size increases beyond the 32-bit integer range.
    assert(totalNumberOfNonzeros
        > 0); // Fails if the number of nonzeros went negative (can happen on int overflow)

    A.title = 0;
    A.totalNumberOfRows = totalNumberOfRows;
    A.totalNumberOfNonzeros = totalNumberOfNonzeros;
    A.localNumberOfRows = localNumberOfRows;
    A.localNumberOfColumns = localNumberOfRows;
    A.localNumberOfNonzeros = localNumberOfNonzeros;
    A.nonzerosInRow = nonzerosInRow;
    A.mtxIndG = mtxIndG;
    A.mtxIndL = mtxIndL;
    A.matrixValues = matrixValues;
    A.matrixDiagonal = matrixDiagonal;

    return;
}
|
||||
21
src/GenerateProblem_ref.hpp
Normal file
21
src/GenerateProblem_ref.hpp
Normal file
@@ -0,0 +1,21 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef GENERATEPROBLEM_REF_HPP
|
||||
#define GENERATEPROBLEM_REF_HPP
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
|
||||
void GenerateProblem_ref(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
|
||||
#endif // GENERATEPROBLEM_REF_HPP
|
||||
207
src/Geometry.hpp
Normal file
207
src/Geometry.hpp
Normal file
@@ -0,0 +1,207 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file Geometry.hpp
|
||||
|
||||
HPCG data structure for problem geometry
|
||||
*/
|
||||
|
||||
#ifndef GEOMETRY_HPP
|
||||
#define GEOMETRY_HPP
|
||||
|
||||
/*!
|
||||
This defines the type for integers that have local subdomain dimension.
|
||||
|
||||
Define as "long long" when local problem dimension is > 2^31
|
||||
*/
|
||||
// #define INDEX_64
|
||||
|
||||
#ifndef INDEX_64
|
||||
typedef int local_int_t;
|
||||
#else
|
||||
typedef long long local_int_t;
|
||||
#endif
|
||||
|
||||
/*!
|
||||
This defines the type for integers that have global dimension
|
||||
|
||||
Define as "long long" when global problem dimension is > 2^31
|
||||
*/
|
||||
#ifdef HPCG_NO_LONG_LONG
|
||||
typedef int global_int_t;
|
||||
#else
|
||||
typedef long long global_int_t;
|
||||
#endif
|
||||
|
||||
#define HPCG_MAX_ROW_LEN 27 //!< Maximum nonzeros in a row of the 27-point stencil

// Enums

//! Axis selector (also used to mark "no axis" with NONE).
typedef enum
{
    X = 0,
    Y = 1,
    Z = 2,
    NONE = 3
} dim_3d_t;

//! Transport used for point-to-point halo communication.
typedef enum
{
    MPI_CPU,
    MPI_CUDA_AWARE,
    MPI_GPU_All2allv,
    MPI_CPU_All2allv,
    NCCL /*GPUONLY*/
} p2p_comm_mode_t;

//! Kind of compute resource a rank runs on.
typedef enum
{
    CPU,
    GPU
} rank_type_t;

//! Overall execution mode of the benchmark run.
typedef enum
{
    GPUONLY = 0,
    CPUONLY = 1,
    GPUCPU = 2
} exec_mode_t;

//! How the NX/NY/NZ inputs and the g2c split parameter are interpreted.
typedef enum
{
    GPU_RATIO = 0 /*NX, NY, NZ are local to GPU and g2c is a ratio*/,
    GPU_ABS = 1 /*NX, NY, NZ are local to GPU and g2c is absolute dimension size*/,
    GPU_CPU_RATIO = 2 /*NX, NY, NZ are local to GPU+CPU and g2c is ratio*/,
    GPU_CPU_ABS = 3 /*NX, NY, NZ are local to GPU+CPU and g2c is absolute dimension size*/
} local_problem_def_t;
|
||||
|
||||
// This macro should be defined if the global_int_t is not long long
|
||||
// in order to stop complaints from non-C++11 compliant compilers.
|
||||
// #define HPCG_NO_LONG_LONG
|
||||
|
||||
/*!
|
||||
This is a data structure to contain all processor geometry information
|
||||
*/
|
||||
struct Geometry_STRUCT
{
    int size;         //!< Number of MPI processes
    int rank;         //!< This process' rank in the range [0 to size - 1]
    int logical_rank; //!< For heterogeneous setups: this rank's position in the logical
                      //!< npx x npy x npz process grid (compared against the computed
                      //!< column rank during problem generation; may differ from `rank`)
    int numThreads;   //!< This process' number of threads
    local_int_t nx;   //!< Number of x-direction grid points for each local subdomain
    local_int_t ny;   //!< Number of y-direction grid points for each local subdomain
    local_int_t nz;   //!< Number of z-direction grid points for each local subdomain
    int npx;          //!< Number of processors in x-direction
    int npy;          //!< Number of processors in y-direction
    int npz;          //!< Number of processors in z-direction
    int pz;           //!< partition ID of z-dimension process that starts the second region of nz values
    int npartz;       //!< Number of partitions with varying nz values
    int* partz_ids;   //!< Array of partition ids of processor in z-direction where new value of nz starts (valid
                      //!< values are 1 to npz)
    local_int_t* partz_nz; //!< Array of length npartz containing the nz values for each partition
    int ipx;          //!< Current rank's x location in the npx by npy by npz processor grid
    int ipy;          //!< Current rank's y location in the npx by npy by npz processor grid
    int ipz;          //!< Current rank's z location in the npx by npy by npz processor grid
    global_int_t gnx; //!< Global number of x-direction grid points
    global_int_t gny; //!< Global number of y-direction grid points
    global_int_t gnz; //!< Global number of z-direction grid points
    global_int_t gix0; //!< Base global x index for this rank in the npx by npy by npz processor grid
    global_int_t giy0; //!< Base global y index for this rank in the npx by npy by npz processor grid
    global_int_t giz0; //!< Base global z index for this rank in the npx by npy by npz processor grid

    dim_3d_t different_dim;    //!< The dimension that the GPU and CPU rank are partitioned along
                               //!< (NONE when the split does not apply)
    int previous_neighbor_dim; //!< Size along `different_dim` of the previous neighbor's subdomain
                               //!< — set from physical_rank_dims in GenerateGeometry; TODO confirm
    int next_neighbor_dim;     //!< Size along `different_dim` of the next neighbor's subdomain
                               //!< — set from physical_rank_dims in GenerateGeometry; TODO confirm
};
typedef struct Geometry_STRUCT Geometry;
|
||||
|
||||
/*!
|
||||
Returns the rank of the MPI process that is assigned the global row index
|
||||
given as the input argument.
|
||||
|
||||
@param[in] geom The description of the problem's geometry.
|
||||
@param[in] index The global row index
|
||||
|
||||
@return Returns the MPI rank of the process assigned the row
|
||||
*/
|
||||
/*!
  Maps a global matrix row index to the MPI rank that owns it, honoring the
  (possibly non-uniform) partitioning of the z dimension described by
  geom.npartz / partz_ids / partz_nz.
*/
inline int ComputeRankOfMatrixRow(const Geometry& geom, global_int_t index)
{
    global_int_t gnx = geom.gnx;
    global_int_t gny = geom.gny;

    // Decompose the flat row index into global (ix, iy, iz) grid coordinates.
    // Rows are numbered x-fastest, then y, then z.
    global_int_t iz = index / (gny * gnx);
    global_int_t iy = (index - iz * gny * gnx) / gnx;
    global_int_t ix = index % gnx;
    // We now permit varying values for nz for any nx-by-ny plane of MPI processes.
    // npartz is the number of different groups of nx-by-ny groups of processes.
    // partz_ids is an array of length npartz where each value indicates the z process of the last process in the ith
    // nx-by-ny group. partz_nz is an array of length npartz containing the value of nz for the ith group.

    // With no variation, npartz = 1, partz_ids[0] = npz, partz_nz[0] = nz

    int ipz = 0;        // z index of the owning process, accumulated group by group
    int ipartz_ids = 0; // number of z-process slots in the current group
    for (int i = 0; i < geom.npartz; ++i)
    {
        int ipart_nz = geom.partz_nz[i]; // nz used by every process in group i
        // partz_ids holds cumulative last-process ids, so subtracting the
        // previous cumulative value yields this group's process count.
        ipartz_ids = geom.partz_ids[i] - ipartz_ids;
        if (iz <= ipart_nz * ipartz_ids)
        {
            // Row's z coordinate falls inside this group: finish the lookup.
            ipz += iz / ipart_nz;
            break;
        }
        else
        {
            // Skip this whole group, shifting iz into the next group's frame.
            ipz += ipartz_ids;
            iz -= ipart_nz * ipartz_ids;
        }
    }
    // global_int_t ipz = iz/geom.nz;
    int ipy = iy / geom.ny;
    int ipx = ix / geom.nx;
    // Ranks are laid out x-fastest over the npx-by-npy-by-npz process grid.
    int rank = ipx + ipy * geom.npx + ipz * geom.npy * geom.npx;
    return rank;
}
|
||||
|
||||
/*!
|
||||
Destructor for geometry data.
|
||||
|
||||
@param[inout] data the geometry data structure whose storage is deallocated
|
||||
*/
|
||||
/*!
  Destructor-style hook for geometry data.

  @param[inout] geom the geometry data structure whose storage would be deallocated

  Geometry no longer owns heap storage here (the partz_nz / partz_ids
  deallocation was removed), so this is intentionally a no-op kept for API
  symmetry.
*/
inline void DeleteGeometry(Geometry& geom)
{
    (void) geom; // nothing to release; silence unused-parameter warnings
}
|
||||
#endif // GEOMETRY_HPP
|
||||
81
src/MGData.hpp
Normal file
81
src/MGData.hpp
Normal file
@@ -0,0 +1,81 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file MGData.hpp
|
||||
|
||||
HPCG data structure
|
||||
*/
|
||||
|
||||
#ifndef MGDATA_HPP
|
||||
#define MGDATA_HPP
|
||||
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
#include <cassert>
|
||||
|
||||
/*!
  Per-level multigrid data: smoother sweep counts, the fine-to-coarse
  injection operator, and the coarse-level work vectors.  Pointers are
  adopted by InitializeMGData and released by DeleteMGData.
*/
struct MGData_STRUCT
{
    int numberOfPresmootherSteps;  // Call ComputeSYMGS this many times prior to coarsening
    int numberOfPostsmootherSteps; // Call ComputeSYMGS this many times after coarsening
    local_int_t*
        f2cOperator; //!< 1D array containing the fine operator local IDs that will be injected into coarse space.
    Vector* rc;      // coarse grid residual vector
    Vector* xc;      // coarse grid solution vector
    Vector* Axf;     // fine grid residual vector
    /*!
      This is for storing optimized data structures created in OptimizeProblem and
      used inside optimized ComputeSPMV().
    */
    void* optimizationData;
};
typedef struct MGData_STRUCT MGData;
|
||||
|
||||
/*!
|
||||
Constructor for the data structure of CG vectors.
|
||||
|
||||
@param[in] Ac - Fully-formed coarse matrix
|
||||
@param[in] f2cOperator -
|
||||
@param[out] data the data structure for CG vectors that will be allocated to get it ready for use in CG iterations
|
||||
*/
|
||||
/*!
  Constructor for the multigrid data structure.

  @param[in]  f2cOperator caller-allocated injection operator (adopted, not copied)
  @param[in]  rc coarse grid residual vector (adopted)
  @param[in]  xc coarse grid solution vector (adopted)
  @param[in]  Axf fine grid residual vector (adopted)
  @param[out] data the MG data structure readied for use in CG iterations
*/
inline void InitializeMGData(local_int_t* f2cOperator, Vector* rc, Vector* xc, Vector* Axf, MGData& data)
{
    // One pre- and one post-smoother sweep is the reference configuration.
    data.numberOfPresmootherSteps = 1;
    data.numberOfPostsmootherSteps = 1;
    // Adopt the caller-allocated storage; DeleteMGData releases it later.
    data.f2cOperator = f2cOperator; // Space for injection operator
    data.rc = rc;
    data.xc = xc;
    data.Axf = Axf;
}
|
||||
|
||||
/*!
|
||||
Destructor for the CG vectors data.
|
||||
|
||||
@param[inout] data the MG data structure whose storage is deallocated
|
||||
*/
|
||||
/*!
  Destructor for the MG data: releases everything InitializeMGData adopted.

  @param[inout] data the MG data structure whose storage is deallocated
*/
inline void DeleteMGData(MGData& data)
{
    // Injection operator was allocated with new[].
    delete[] data.f2cOperator;
    // For each work vector: free its payload, then the Vector object itself.
    DeleteVector(*data.rc);
    delete data.rc;
    DeleteVector(*data.xc);
    delete data.xc;
    DeleteVector(*data.Axf);
    delete data.Axf;
}
|
||||
|
||||
#endif // MGDATA_HPP
|
||||
66
src/MixedBaseCounter.cpp
Normal file
66
src/MixedBaseCounter.cpp
Normal file
@@ -0,0 +1,66 @@
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "MixedBaseCounter.hpp"
|
||||
|
||||
/// Primary constructor: digit i of the mixed-base counter may count from 0
/// up to counts[i]; only the first `length` entries of `counts` are read.
///
/// Fix: the original loop read counts[0..31] unconditionally, over-reading
/// the caller's array whenever it holds fewer than 32 entries.  We now copy
/// exactly `length` entries and zero-terminate the rest of the fixed-size
/// internal arrays (which preserves the original terminating-zero invariant,
/// including max_counts[length] == 0 and max_counts[32] == 0).
MixedBaseCounter::MixedBaseCounter(int* counts, int length)
{
    this->length = length;

    int i;

    for (i = 0; i < length; ++i)
    {
        this->max_counts[i] = counts[i];
        this->cur_counts[i] = 0;
    }
    // terminate with 0's (arrays are sized 32+1)
    for (; i < 33; ++i)
    {
        this->max_counts[i] = this->cur_counts[i] = 0;
    }
}
|
||||
|
||||
/// "Difference" constructor: each digit's maximum is the remaining capacity
/// of `left` after subtracting the digits `right` has already consumed.
///
/// Fix: the original left max_counts/cur_counts entries past `length`
/// uninitialized, unlike the primary constructor which zero-terminates both
/// arrays; we zero the tail so every instance satisfies the same invariant.
MixedBaseCounter::MixedBaseCounter(MixedBaseCounter& left, MixedBaseCounter& right)
{
    this->length = left.length;
    int i;
    for (i = 0; i < left.length; ++i)
    {
        this->max_counts[i] = left.max_counts[i] - right.cur_counts[i];
        this->cur_counts[i] = 0;
    }
    // terminate with 0's, matching the primary constructor
    for (; i < 33; ++i)
    {
        this->max_counts[i] = this->cur_counts[i] = 0;
    }
}
|
||||
|
||||
void MixedBaseCounter::next()
|
||||
{
|
||||
for (int i = 0; i < this->length; ++i)
|
||||
{
|
||||
this->cur_counts[i]++;
|
||||
if (this->cur_counts[i] > this->max_counts[i])
|
||||
{
|
||||
this->cur_counts[i] = 0;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int MixedBaseCounter::is_zero()
|
||||
{
|
||||
for (int i = 0; i < this->length; ++i)
|
||||
if (this->cur_counts[i])
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
int MixedBaseCounter::product(int* multipliers)
|
||||
{
|
||||
int k = 0, x = 1;
|
||||
|
||||
for (int i = 0; i < this->length; ++i)
|
||||
for (int j = 0; j < this->cur_counts[i]; ++j)
|
||||
{
|
||||
k = 1;
|
||||
x *= multipliers[i];
|
||||
}
|
||||
|
||||
return x * k;
|
||||
}
|
||||
16
src/MixedBaseCounter.hpp
Normal file
16
src/MixedBaseCounter.hpp
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
|
||||
/*!
  A mixed-radix ("mixed-base") odometer over up to 32 digits, used to
  enumerate combinations of prime-factor exponents: digit i counts from 0 to
  max_counts[i], and next() increments with carry into the following digit.
*/
class MixedBaseCounter
{
private:
    int length;             //!< number of prime factor counts (cannot exceed 32 for a 32-bit integer)
    int max_counts[32 + 1]; //!< maximum value for prime factor counts (zero-terminated)
    int cur_counts[32 + 1]; //!< current prime factor counts (zero-terminated)

public:
    //! Initialize from the first `length` digit limits in `counts`; all digits start at 0.
    MixedBaseCounter(int* counts, int length);
    //! Build a counter whose digit limits are left.max_counts minus right.cur_counts.
    MixedBaseCounter(MixedBaseCounter& left, MixedBaseCounter& right);
    //! Advance by one (mixed-radix increment with carry; wraps to all zeros).
    void next();
    //! 1 iff every digit is currently zero, else 0.
    int is_zero();
    //! Product of multipliers[i]^cur_counts[i]; 0 when the counter is all zeros.
    int product(int* multipliers);
};
|
||||
427
src/OptimizeProblem.cpp
Normal file
427
src/OptimizeProblem.cpp
Normal file
@@ -0,0 +1,427 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file OptimizeProblem.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#include "OptimizeProblem.hpp"
|
||||
#include "CpuKernels.hpp"
|
||||
#include "CudaKernels.hpp"
|
||||
#include "Cuda.hpp"
|
||||
#include "WriteProblem.hpp"
|
||||
#include "mytimer.hpp"
|
||||
|
||||
extern bool Use_Hpcg_Mem_Reduction; /*USE HPCG aggresive memory reduction*/
|
||||
|
||||
/*!
|
||||
Optimizes the data structures used for CG iteration to increase the
|
||||
performance of the benchmark version of the preconditioned CG algorithm.
|
||||
|
||||
@param[inout] A The known system matrix, also contains the MG hierarchy in attributes Ac and mgData.
|
||||
@param[inout] data The data structure with all necessary CG vectors preallocated
|
||||
@param[inout] b The known right hand side vector
|
||||
@param[inout] x The solution vector to be computed in future CG iteration
|
||||
@param[inout] xexact The exact solution vector
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@see GenerateGeometry
|
||||
@see GenerateProblem
|
||||
*/
|
||||
|
||||
#ifdef USE_CUDA
|
||||
/*!
  GPU backend of OptimizeProblem: for each of the 4 MG levels, colors the
  matrix, builds permuted sliced-ELL (SELL) storage for A and its strictly
  lower/upper parts, creates the cuSPARSE descriptors and analysis data for
  SpMV/SpSV, and finally permutes the fine-to-coarse operators.
  Returns 0 (GPU-side allocations are not accounted here).
*/
size_t OptimizeProblemGpu(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
{
    // NOTE(review): data and xexact are currently unused by this backend.
    SparseMatrix* A = &A_in;
    local_int_t numberOfMgLevels = 4; // fixed MG hierarchy depth
    local_int_t slice_size = A->slice_size;
    for (int level = 0; level < numberOfMgLevels; ++level)
    {
        const local_int_t nrow = A->localNumberOfRows;
        int totalColors = 8;

        // Initialize perm (ref2opt) and inverse perm (opt2ref) to identity.
        SetVectorAscCuda(A->ref2opt, nrow);
        SetVectorAscCuda(A->opt2ref, nrow);

        // Color the matrix; coloring also produces the reordering permutation.
        int num_colors = 0;
        ColorMatrixCuda(NULL, A->gpuAux.columns, A->gpuAux.nnzPerRow, A->localNumberOfRows, A->gpuAux.color,
            &(num_colors), A->gpuAux.colorCountCpu, 8, A->ref2opt, A->opt2ref, A->geom->rank, A->geom->nx, NULL);
        // NOTE(review): num_colors is computed but the hard-coded 8 is stored —
        // confirm this is intentional (coloring requested exactly 8 colors).
        A->totalColors = totalColors;
        PermElemToSendCuda(A->totalToBeSent, A->gpuAux.elementsToSend, A->ref2opt);

        // Create the permuted (S)ELL representation.  The first slice's worth
        // of storage is skipped (offset) during construction and filled by the
        // transpose step below.
        local_int_t TranslateIndex = slice_size * HPCG_MAX_ROW_LEN;
        local_int_t* translated_ell_col_index = A->sellAPermColumns + TranslateIndex;
        double* translated_ell_values = A->sellAPermValues + TranslateIndex;

        EllPermColumnsValuesCuda(nrow, A->gpuAux.nnzPerRow, A->gpuAux.columns, A->gpuAux.values,
            A->gpuAux.csrAPermOffsets, translated_ell_col_index, translated_ell_values, A->opt2ref, A->ref2opt,
            A->gpuAux.sellADiagonalIdx, A->gpuAux.csrLPermOffsets, A->gpuAux.csrUPermOffsets, false);

        // Column-major blocked/sliced ELLPACK layout.
        TransposeCuda(nrow, slice_size, A->sellAPermColumns, A->sellAPermValues);

        // Per-slice maximum row length for the L and U parts.
        local_int_t num_slices = (nrow + slice_size - 1) / slice_size;
        EllMaxRowLenPerBlockCuda(nrow, slice_size, A->gpuAux.csrLPermOffsets, A->gpuAux.csrUPermOffsets,
            A->sellLSliceMrl, A->sellUSliceMrl);

        // Prefix-sum the per-slice lengths into SELL slice offsets and scale
        // by the slice size.
        PrefixsumCuda(num_slices, A->sellLSliceMrl);
        MultiplyBySliceSizeCUDA(num_slices, slice_size, A->sellLSliceMrl + 1);

        PrefixsumCuda(num_slices, A->sellUSliceMrl);
        MultiplyBySliceSizeCUDA(num_slices, slice_size, A->sellUSliceMrl + 1);

        // Set the general matrix slice_offsets (uniform HPCG_MAX_ROW_LEN).
        CreateAMatrixSliceOffsetsCuda(num_slices + 1, A->slice_size, A->sellASliceMrl);

        // Split the permuted SELL matrix into its strictly lower/upper parts.
        CreateSellLUColumnsValuesCuda(nrow, slice_size, A->sellAPermColumns, A->sellAPermValues, A->sellLSliceMrl,
            A->sellLPermColumns, A->sellLPermValues, A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, level);

        local_int_t sell_slices = (nrow + slice_size - 1) / slice_size;
        // Off-diagonal local nonzeros split evenly between L and U.
        const local_int_t half_nnz = (A->localNumberOfNonzeros - nrow - A->extNnz) / 2;

        // Fetch the total SELL nnz for L and U (last slice offset) to host.
        // NOTE(review): these values are consumed immediately after
        // cudaMemcpyAsync with no stream synchronization visible here — this
        // relies on the D2H copy into pageable host memory behaving
        // synchronously; confirm a cudaStreamSynchronize isn't required.
        local_int_t sell_l_nnz = 0;
        cudaMemcpyAsync(
            &sell_l_nnz, &(A->sellLSliceMrl[sell_slices]), sizeof(local_int_t), cudaMemcpyDeviceToHost, stream);

        local_int_t sell_u_nnz = 0;
        cudaMemcpyAsync(
            &sell_u_nnz, &(A->sellUSliceMrl[sell_slices]), sizeof(local_int_t), cudaMemcpyDeviceToHost, stream);

        auto INDEX_TYPE = CUSPARSE_INDEX_32I;
#ifdef INDEX_64 // In src/Geometry
        INDEX_TYPE = CUSPARSE_INDEX_64I;
#endif
        // cuSPARSE sliced-ELL descriptors for L, U, and the full matrix A.
        cusparseCreateSlicedEll(&(A->cusparseOpt.matL), nrow, nrow, half_nnz, sell_l_nnz, slice_size,
            A->sellLSliceMrl, A->sellLPermColumns, A->sellLPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
            CUDA_R_64F);

        cusparseCreateSlicedEll(&(A->cusparseOpt.matU), nrow, nrow, half_nnz, sell_u_nnz, slice_size,
            A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
            CUDA_R_64F);

        local_int_t sell_nnz = sell_slices * slice_size * HPCG_MAX_ROW_LEN;
        cusparseCreateSlicedEll(&(A->cusparseOpt.matA), nrow, nrow, A->localNumberOfNonzeros, sell_nnz, slice_size,
            A->sellASliceMrl, A->sellAPermColumns, A->sellAPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
            CUDA_R_64F);

        double alpha = 1.0, beta = 0.0;
        size_t e_buf_size = 0;
        size_t l_buf_size = 0, u_buf_size = 0, i_buf_size = 0, max_buf_size = 0;
        // Throw-away dense-vector descriptors used only for buffer sizing.
        cusparseDnVecDescr_t dummy1, dummy2;
        cusparseCreateDnVec(&dummy1, nrow, x.values_d, CUDA_R_64F);
        cusparseCreateDnVec(&dummy2, nrow, b.values_d, CUDA_R_64F);
        cusparseCreateDnVec(&(A->cusparseOpt.vecX), nrow, x.values_d, CUDA_R_64F);
        cusparseCreateDnVec(&(A->cusparseOpt.vecY), nrow, b.values_d, CUDA_R_64F);
        max_buf_size = e_buf_size;

        // SpMV buffer sizing for L, U, and A; keep the largest requirement.
        cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL, dummy1,
            &beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &l_buf_size);
        cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU, dummy1,
            &beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &u_buf_size);
        cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matA, dummy1,
            &beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &i_buf_size);

        max_buf_size = std::max(std::max(i_buf_size, e_buf_size), std::max(u_buf_size, l_buf_size));

        // Triangular-solve (SpSV) setup for the SYMGS sweeps.
        size_t buffer_size_sv_l, buffer_size_sv_u;
        cusparseFillMode_t fillmode_l = CUSPARSE_FILL_MODE_LOWER;
        cusparseFillMode_t fillmode_u = CUSPARSE_FILL_MODE_UPPER;
        cusparseDiagType_t diagtype = CUSPARSE_DIAG_TYPE_NON_UNIT;

        cusparseSpSV_createDescr(&A->cusparseOpt.spsvDescrL);
        cusparseSpSV_createDescr(&A->cusparseOpt.spsvDescrU);
        cusparseSpMatSetAttribute(A->cusparseOpt.matL, CUSPARSE_SPMAT_DIAG_TYPE, &(diagtype), sizeof(diagtype));
        cusparseSpMatSetAttribute(A->cusparseOpt.matL, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));

        // Lower solve: allocate a private work buffer only when the
        // aggressive memory-reduction path cannot be used.
        if (!Use_Hpcg_Mem_Reduction || (nrow % 8 != 0))
        {
            cusparseSpSV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL,
                A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT,
                A->cusparseOpt.spsvDescrL, &buffer_size_sv_l);
            cudaMalloc(&A->bufferSvL, buffer_size_sv_l);
        }
        cusparseSpSV_analysis(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL,
            A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A->cusparseOpt.spsvDescrL,
            A->bufferSvL);
        // The SELL parts store strictly L/U; inject the diagonal separately.
        cusparseSpSV_updateMatrix(
            cusparsehandle, A->cusparseOpt.spsvDescrL, A->diagonal, CUSPARSE_SPSV_UPDATE_DIAGONAL);

        cusparseSpMatSetAttribute(A->cusparseOpt.matU, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));

        // Upper solve: same pattern as the lower solve above.
        if (!Use_Hpcg_Mem_Reduction || (nrow % 8 != 0))
        {
            cusparseSpSV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU,
                A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT,
                A->cusparseOpt.spsvDescrU, &buffer_size_sv_u);
            cudaMalloc(&A->bufferSvU, buffer_size_sv_u);
        }
        cusparseSpSV_analysis(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU,
            A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A->cusparseOpt.spsvDescrU,
            A->bufferSvU);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A->cusparseOpt.spsvDescrU, A->diagonal, CUSPARSE_SPSV_UPDATE_DIAGONAL);

        // One shared SpMV buffer sized for the largest requirement.
        if (max_buf_size > 0)
            cudaMalloc(&(A->bufferMvA), max_buf_size);

        cusparseDestroyDnVec(dummy1);
        cusparseDestroyDnVec(dummy2);
        // //////////////////////////////////////////////////////////////////////////
        A = A->Ac; // descend to the next-coarser level
    }

    // Second pass: permute each level's fine-to-coarse injection operator so
    // it maps between the reordered row spaces of the two levels.
    A = &A_in;
    for (int level = 1; level < numberOfMgLevels; ++level)
    {
        const local_int_t nrow_c = A->Ac->localNumberOfRows;
        const local_int_t nrow_f = A->localNumberOfRows; // NOTE(review): currently unused
        F2cPermCuda(nrow_c, A->gpuAux.f2c, A->f2cPerm, A->ref2opt, A->Ac->opt2ref);
        A = A->Ac;
    }

    return 0;
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_GRACE
|
||||
/*!
  CPU (Grace / NVPL Sparse) backend of OptimizeProblem: for each of the 4 MG
  levels, colors and reorders the matrix into sliced-ELL form, builds NVPL
  Sparse descriptors and SpMV/SpSV analysis data, then permutes the
  fine-to-coarse operators.  Returns the number of bytes retained.
*/
size_t OptimizeProblemCpu(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
{
    // NOTE(review): data and xexact are currently unused by this backend.
    // Initialize data structures; mem accumulates retained allocation bytes.
    size_t mem = AllocateMemCpu(A_in);

    SparseMatrix* A = &A_in;
    local_int_t numberOfMgLevels = 4; // fixed MG hierarchy depth
    local_int_t slice_size = A->slice_size;
    for (int level = 0; level < numberOfMgLevels; ++level)
    {
        // Color the matrix.
        int num_colors;
        ColorMatrixCpu(*A, &num_colors);
        A->totalColors = num_colors;

        // Compute the first row index of each color (exclusive prefix sum of
        // the per-color row counts).
        A->cpuAux.firstRowOfColor[0] = 0;
        for (int c = 1; c < A->totalColors; c++)
        {
            A->cpuAux.firstRowOfColor[c] = A->cpuAux.firstRowOfColor[c - 1] + A->cpuAux.nRowsWithColor[c - 1];
        }

        // Reorder the matrix into permuted sliced-ELL storage.
        CreateSellPermCpu(*A);

#ifndef HPCG_NO_MPI
        // Translate row IDs that will be sent to neighbours into the
        // reordered (optimized) numbering.
#pragma omp parallel for
        for (local_int_t i = 0; i < A->totalToBeSent; i++)
        {
            local_int_t orig = A->elementsToSend[i];
            A->elementsToSend[i] = A->ref2opt[orig];
        }
#endif

        local_int_t numberOfNonzerosPerRow = HPCG_MAX_ROW_LEN;
        local_int_t nrow = A->localNumberOfRows;
        // Off-diagonal local nonzeros split evenly between L and U.
        local_int_t half_nnz = (A->localNumberOfNonzeros - nrow - A->extNnz) / 2;
        local_int_t num_slices = (nrow + slice_size - 1) / slice_size;
        // Total SELL nnz for L/U is the final slice offset.
        local_int_t sell_l_nnz = A->sellLSliceMrl[num_slices];
        local_int_t sell_u_nnz = A->sellUSliceMrl[num_slices];
        local_int_t sell_nnz = num_slices * slice_size * numberOfNonzerosPerRow;

        auto INDEX_TYPE = NVPL_SPARSE_INDEX_32I;
#ifdef INDEX_64 // In src/Geometry
        INDEX_TYPE = NVPL_SPARSE_INDEX_64I;
#endif

        // NVPL Sparse sliced-ELL descriptors for L, U, and the full matrix A.
        nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matL), nrow, nrow, half_nnz, sell_l_nnz, slice_size,
            A->sellLSliceMrl, A->sellLPermColumns, A->sellLPermValues, INDEX_TYPE, INDEX_TYPE,
            NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);

        nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matU), nrow, nrow, half_nnz, sell_u_nnz, slice_size,
            A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, INDEX_TYPE, INDEX_TYPE,
            NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);

        nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matA), nrow, nrow, A->localNumberOfNonzeros, sell_nnz,
            slice_size, A->sellASliceMrl, A->sellAPermColumns, A->sellAPermValues, INDEX_TYPE, INDEX_TYPE,
            NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);

        double alpha = 1.0, beta = 0.0;
        size_t e_buf_size = 0;
        size_t l_buf_size = 0, u_buf_size = 0, i_buf_size = 0, max_buf_size = 0;
        nvpl_sparse_create_dn_vec(&(A->nvplSparseOpt.vecX), nrow, x.values, NVPL_SPARSE_R_64F);
        nvpl_sparse_create_dn_vec(&(A->nvplSparseOpt.vecY), nrow, b.values, NVPL_SPARSE_R_64F);
        max_buf_size = e_buf_size;

        // SpMV descriptor creation and buffer sizing.
        // Lower
        nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvLDescr);
        nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
            A->nvplSparseOpt.matL, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
            NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvLDescr, &l_buf_size);
        // Upper
        nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvUDescr);
        nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
            A->nvplSparseOpt.matU, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
            NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvUDescr, &u_buf_size);
        // L+D+U
        nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvADescr);
        nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
            A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
            NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvADescr, &i_buf_size);

        max_buf_size = std::max(std::max(i_buf_size, e_buf_size), std::max(u_buf_size, l_buf_size));
        // NOTE(review): max_buf_size is computed but no shared MV buffer is
        // allocated in this backend — confirm intentional.

        // Triangular-solve (SpSV) setup for the SYMGS sweeps.
        size_t buffer_size_sv_l, buffer_size_sv_u;
        nvpl_sparse_fill_mode_t fillmode_l = NVPL_SPARSE_FILL_MODE_LOWER;
        nvpl_sparse_fill_mode_t fillmode_u = NVPL_SPARSE_FILL_MODE_UPPER;
        nvpl_sparse_diag_type_t diagtype = NVPL_SPARSE_DIAG_TYPE_NON_UNIT;

        nvpl_sparse_spsv_create_descr(&A->nvplSparseOpt.spsvDescrL);
        nvpl_sparse_spsv_create_descr(&A->nvplSparseOpt.spsvDescrU);
        nvpl_sparse_sp_mat_set_attribute(
            A->nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_DIAG_TYPE, &(diagtype), sizeof(diagtype));
        nvpl_sparse_sp_mat_set_attribute(
            A->nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));

        // Snapshot the (unpermuted) diagonal for the SpSV diagonal update.
        Vector origDiagA;
        InitializeVector(origDiagA, A->localNumberOfRows, CPU);
        CopyMatrixDiagonal(*A, origDiagA);

        // Lower solve.  When memory reduction is off (or the row count is not
        // a multiple of 8), analyze the full matrix viewed as lower
        // triangular; otherwise pass strictly L and then update the diagonal.
        if (!Use_Hpcg_Mem_Reduction || A->localNumberOfRows % 8 != 0)
        {
            nvpl_sparse_sp_mat_set_attribute(
                A->nvplSparseOpt.matA, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
            nvpl_sparse_spsv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, &buffer_size_sv_l);

            A->bufferSvL = new char[buffer_size_sv_l];
            mem += buffer_size_sv_l;
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, A->bufferSvL);
        }
        else
        {
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matL, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, A->bufferSvL);
            nvpl_sparse_spsv_update_matrix(
                nvpl_sparse_handle, A->nvplSparseOpt.spsvDescrL, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        }

        // Upper solve: pass strictly U, and then update the diagonal.
        nvpl_sparse_sp_mat_set_attribute(
            A->nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        if (!Use_Hpcg_Mem_Reduction || A->localNumberOfRows % 8 != 0)
        {
            nvpl_sparse_sp_mat_set_attribute(
                A->nvplSparseOpt.matA, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
            nvpl_sparse_spsv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, &buffer_size_sv_u);
            A->bufferSvU = new char[buffer_size_sv_u];
            mem += buffer_size_sv_u;
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, A->bufferSvU);
        }
        else
        {
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matU, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, A->bufferSvU);
            nvpl_sparse_spsv_update_matrix(
                nvpl_sparse_handle, A->nvplSparseOpt.spsvDescrU, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        }

        DeleteVector(origDiagA);
        //////////////////////////////////////////////////////////////////////////////////////////////////////////

        A = A->Ac; // descend to the next-coarser level
    }
    A = &A_in;

    // Second pass: permute each level's fine-to-coarse injection operator so
    // it maps between the reordered row spaces of the two levels.
    for (int level = 1; level < numberOfMgLevels; level++)
    {
        local_int_t nrow_c = A->Ac->localNumberOfRows;
        local_int_t nrow_f = A->localNumberOfRows; // NOTE(review): currently unused
        // Permute space injector operator
        F2cPermCpu(nrow_c, A->mgData->f2cOperator, A->f2cPerm, A->ref2opt, A->Ac->opt2ref);
        A = A->Ac;
    }

    return mem;
}
|
||||
#endif // USE_GRACE
|
||||
|
||||
/*!
  Optimizes the data structures used for CG iteration, dispatching to the
  backend that matches this rank's execution target.

  @param[inout] A_in    The known system matrix (with MG hierarchy in Ac/mgData)
  @param[inout] data    Preallocated CG vectors
  @param[inout] b       The known right hand side vector
  @param[inout] x       The solution vector computed in future CG iterations
  @param[inout] xexact  The exact solution vector

  @return bytes retained by the optimization, or 0 if no backend is compiled in
*/
size_t OptimizeProblem(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
{
    size_t result = 0;
    switch (A_in.rankType)
    {
    case GPU:
#ifdef USE_CUDA
        result = OptimizeProblemGpu(A_in, data, b, x, xexact);
#endif
        break;
    default: // CPU ranks
#ifdef USE_GRACE
        result = OptimizeProblemCpu(A_in, data, b, x, xexact);
#endif
        break;
    }
    return result;
}
|
||||
|
||||
// Helper function (see OptimizeProblem.hpp for details)
|
||||
// Helper function (see OptimizeProblem.hpp for details).
// Retained-bytes accounting is reported through OptimizeProblem's size_t
// return value instead, so this hook deliberately reports nothing.
double OptimizeProblemMemoryUse(const SparseMatrix& A)
{
    (void) A; // unused
    return 0.0;
}
|
||||
30
src/OptimizeProblem.hpp
Normal file
30
src/OptimizeProblem.hpp
Normal file
@@ -0,0 +1,30 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef OPTIMIZEPROBLEM_HPP
|
||||
#define OPTIMIZEPROBLEM_HPP
|
||||
|
||||
#include "CGData.hpp"
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
|
||||
size_t OptimizeProblem(SparseMatrix& A, CGData& data, Vector& b, Vector& x, Vector& xexact);
|
||||
|
||||
// This helper function should be implemented in a non-trivial way if OptimizeProblem is non-trivial
|
||||
// It should return as type double, the total number of bytes allocated and retained after calling OptimizeProblem.
|
||||
// This value will be used to report Gbytes used in ReportResults (the value returned will be divided by 1000000000.0).
|
||||
|
||||
double OptimizeProblemMemoryUse(const SparseMatrix& A);
|
||||
|
||||
#endif // OPTIMIZEPROBLEM_HPP
|
||||
176
src/OutputFile.cpp
Normal file
176
src/OutputFile.cpp
Normal file
@@ -0,0 +1,176 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "OutputFile.hpp"
|
||||
|
||||
using std::string;
|
||||
using std::stringstream;
|
||||
using std::list;
|
||||
using std::ofstream;
|
||||
|
||||
extern int use_output_file;
|
||||
|
||||
// Root-node constructor: records the benchmark name and version (used to
// build the output file name) and fixes the formatting conventions used
// when the report is generated.
OutputFile::OutputFile(const string& name_arg, const string& version_arg)
    : name(name_arg), version(version_arg), eol("\n"), keySeparator("::")
{
}
|
||||
|
||||
// Descendant-node constructor: such nodes carry only a key/value pair, so
// only the formatting conventions are initialized here.
OutputFile::OutputFile(void) : eol("\n"), keySeparator("::")
{
}
|
||||
|
||||
// Destructor: every descendant node was heap-allocated by allocKeyVal() and
// is owned by this node, so release the whole subtree (recursively, via the
// descendants' own destructors).
OutputFile::~OutputFile()
{
    while (!descendants.empty())
    {
        delete descendants.front();
        descendants.pop_front();
    }
}
|
||||
|
||||
void OutputFile::add(const string& key_arg, const string& value_arg)
|
||||
{
|
||||
descendants.push_back(allocKeyVal(key_arg, value_arg));
|
||||
}
|
||||
|
||||
void OutputFile::add(const string& key_arg, double value_arg)
|
||||
{
|
||||
stringstream ss;
|
||||
ss << value_arg;
|
||||
descendants.push_back(allocKeyVal(key_arg, ss.str()));
|
||||
}
|
||||
|
||||
void OutputFile::add(const string& key_arg, int value_arg)
|
||||
{
|
||||
stringstream ss;
|
||||
ss << value_arg;
|
||||
descendants.push_back(allocKeyVal(key_arg, ss.str()));
|
||||
}
|
||||
|
||||
#ifndef HPCG_NO_LONG_LONG
|
||||
|
||||
void OutputFile::add(const string& key_arg, long long value_arg)
|
||||
{
|
||||
stringstream ss;
|
||||
ss << value_arg;
|
||||
descendants.push_back(allocKeyVal(key_arg, ss.str()));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void OutputFile::add(const string& key_arg, size_t value_arg)
|
||||
{
|
||||
stringstream ss;
|
||||
ss << value_arg;
|
||||
descendants.push_back(allocKeyVal(key_arg, ss.str()));
|
||||
}
|
||||
|
||||
void OutputFile::setKeyValue(const string& key_arg, const string& value_arg)
|
||||
{
|
||||
key = key_arg;
|
||||
value = value_arg;
|
||||
}
|
||||
|
||||
// Linear search of the direct descendants for the first node whose key
// matches key_arg. Returns 0 (null) when no such node exists; callers are
// expected to check the result before dereferencing.
OutputFile* OutputFile::get(const string& key_arg)
{
    list<OutputFile*>::iterator it = descendants.begin();
    while (it != descendants.end())
    {
        OutputFile* node = *it;
        if (node->key == key_arg)
            return node;
        ++it;
    }
    return 0;
}
|
||||
|
||||
// Emit "prefix + key = value" for this node, then recurse into each child
// with the extended prefix so nested keys print as key1::key2::...=value.
string OutputFile::generateRecursive(string prefix)
{
    string out = prefix + key + "=" + value + eol;

    // Children share a common prefix; compute it once before the loop.
    string childPrefix = prefix + key + keySeparator;
    for (list<OutputFile*>::iterator it = descendants.begin(); it != descendants.end(); ++it)
        out += (*it)->generateRecursive(childPrefix);

    return out;
}
|
||||
|
||||
/*!
  Generate the full report string from the stored key-value hierarchy and
  either write it to a timestamped file (when the global use_output_file
  flag is set) or print it to standard output.

  The file name has the form "<name>_<version>_<YYYY-MM-DD_hh-mm-ss>.txt".

  @return the generated report text (also returned when it was written to a file)
*/
string OutputFile::generate(void)
{
    string result = name + "\nversion=" + version + eol;

    for (list<OutputFile*>::iterator it = descendants.begin(); it != descendants.end(); ++it)
    {
        result += (*it)->generateRecursive("");
    }

    // Build a timestamp for the output file name.
    time_t rawtime;
    time(&rawtime);
    tm* ptm = localtime(&rawtime);
    char sdate[64];
    // use tm_mon+1 because tm_mon is 0 .. 11 instead of 1 .. 12;
    // snprintf (rather than sprintf) guarantees sdate cannot be overrun even
    // for out-of-range struct tm field values.
    snprintf(sdate, sizeof(sdate), "%04d-%02d-%02d_%02d-%02d-%02d", ptm->tm_year + 1900, ptm->tm_mon + 1,
             ptm->tm_mday, ptm->tm_hour, ptm->tm_min, ptm->tm_sec);

    string filename = name + "_" + version + "_";
    filename += string(sdate) + ".txt";

    if (use_output_file)
    {
        ofstream myfile(filename.c_str());
        myfile << result;
        myfile.close();
    }
    else
    {
        std::cout << result << std::flush;
    }

    return result;
}
|
||||
|
||||
// Factory: heap-allocate a descendant node holding the given key/value
// pair. Ownership passes to the caller; nodes stored in `descendants` are
// released by ~OutputFile().
OutputFile* OutputFile::allocKeyVal(const std::string& key_arg, const std::string& value_arg)
{
    OutputFile* node = new OutputFile();
    node->setKeyValue(key_arg, value_arg);
    return node;
}
|
||||
161
src/OutputFile.hpp
Normal file
161
src/OutputFile.hpp
Normal file
@@ -0,0 +1,161 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file Output_File.hpp
|
||||
|
||||
HPCG output file classes
|
||||
*/
|
||||
|
||||
#ifndef OUTPUTFILE_HPP
|
||||
#define OUTPUTFILE_HPP
|
||||
|
||||
#include <list>
|
||||
#include <string>
|
||||
|
||||
//! The OutputFile class for the uniform collecting and reporting of performance data for HPCG
|
||||
|
||||
/*!
|
||||
|
||||
The OutputFile class facilitates easy collecting and reporting of
|
||||
key-value-formatted data that can be then registered with the HPCG results
|
||||
collection website. The keys may have hierarchy key1::key2::key3=val with
|
||||
double colon :: as a separator. A sample output may look like this (note how
|
||||
"major" and "micro" keys repeat with different ancestor keys):
|
||||
|
||||
\code
|
||||
|
||||
version=3.2.1alpha
|
||||
version::major=3
|
||||
version::minor=2
|
||||
version::micro=1
|
||||
version::release=alpha
|
||||
axis=xyz
|
||||
axis::major=x
|
||||
axis::minor=y
|
||||
|
||||
\endcode
|
||||
|
||||
*/
|
||||
class OutputFile
{
protected:
    std::list<OutputFile*> descendants; //!< descendant elements; owned by this node and deleted by the destructor
    std::string name;                   //!< name of the benchmark (root node only)
    std::string version;                //!< version of the benchmark (root node only)
    std::string key;                    //!< the key under which the element is stored
    std::string value;                  //!< the value of the stored element
    std::string eol;                    //!< end-of-line character sequence in the output file
    std::string keySeparator;           //!< character sequence to separate keys in the output file

    //! Recursively generate output string from descendant list, and their descendants and so on
    std::string generateRecursive(std::string prefix);

public:
    //! Allocate a descendant node holding the given key/value pair; the caller takes ownership
    static OutputFile* allocKeyVal(const std::string& key, const std::string& value);

    //! Constructor: accepts name and version as strings that are used to create a file name for printing results.
    /*!
      This constructor accepts and name and version number for the benchmark that
      are used to form a file name information for results that are generated by
      the generate() method.
      \param name (in) string containing name of the benchmark
      \param version (in) string containing the version of the benchmark
    */
    OutputFile(const std::string& name, const std::string& version);

    //! Default constructor: no-arguments accepted, should be used for descendant nodes
    /*!
      This no-argument constructor can be used for descendant nodes to provide
      key1::key2::key3=val output. Unlike the root node, descendant nodes do not
      have name and version but only store key-value pairs.
    */
    OutputFile(void);

    //! Destructor: deletes all descendant nodes added via add()
    ~OutputFile();

    //! Create and add a descendant element with value of type "string"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.

      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, const std::string& value);

    //! Create and add a descendant element with value of type "double"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.

      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, double value);

    //! Create and add a descendant element with value of type "int"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.

      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, int value);

#ifndef HPCG_NO_LONG_LONG
    //! Create and add a descendant element with value of type "long long"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.

      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, long long value);
#endif

    //! Create and add a descendant element with value of type "size_t"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.

      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, size_t value);

    //! Key-Value setter method
    /*!
      Set the key and the value of this element.

      @param[in] key The key that identifies this element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void setKeyValue(const std::string& key, const std::string& value);

    //! Get the element in the list with the given key or return NULL if not found
    OutputFile* get(const std::string& key);

    //! Generate output string with results based on the stored key-value hierarchy
    //! (writes to a timestamped file or stdout depending on the use_output_file flag)
    std::string generate(void);
};
|
||||
|
||||
#endif // OUTPUTFILE_HPP
|
||||
79
src/ReadHpcgDat.cpp
Normal file
79
src/ReadHpcgDat.cpp
Normal file
@@ -0,0 +1,79 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#include "ReadHpcgDat.hpp"
|
||||
|
||||
// Consume characters from `stream` up to and including the next line
// terminator ('\n' or '\r'); a Windows-style "\r\n" pair is consumed as a
// single terminator. Returns the last character consumed, or EOF.
static int SkipUntilEol(FILE* stream)
{
    int ch = fgetc(stream);
    while (ch != EOF && ch != '\n' && ch != '\r')
        ch = fgetc(stream);

    if (ch == '\r')
    {
        // On Windows, '\r' may be followed by '\n': swallow the pair as one
        // terminator; otherwise push the lookahead character back.
        int lookahead = fgetc(stream);
        if (lookahead == '\n' || lookahead == EOF)
            ch = lookahead;
        else
            ungetc(lookahead, stream);
    }

    return ch;
}
|
||||
|
||||
/*!
  Reads the HPCG input file (hpcg.dat format). The first two lines are
  free-text header lines; the third line holds the local grid dimensions
  (nx ny nz); the fourth line the run duration in seconds; the fifth line
  optionally holds the process grid dimensions (npx npy npz).

  @param[out] localDimensions     array of three ints; any missing or invalid
                                  value (< 16) is forced to 16
  @param[out] secondsPerRun       run duration in seconds; pass 0 (null) to skip
                                  reading it; missing/negative values default to
                                  1800 (30 minutes)
  @param[out] localProcDimensions array of three ints; 0 means "not specified"
                                  and is resolved later by the caller
  @param[in]  filename            path of the input file to open

  @return 0 on success, -1 if the file cannot be opened
*/
int ReadHpcgDat(int* localDimensions, int* secondsPerRun, int* localProcDimensions, char* filename)
{
    FILE* hpcgStream = fopen(filename, "r");

    if (!hpcgStream)
    {
        printf("Cannot open input file: %s\n", filename);
        return -1;
    }

    SkipUntilEol(hpcgStream); // skip the first header line

    SkipUntilEol(hpcgStream); // skip the second header line

    // Third line: local grid dimensions nx ny nz (each forced to >= 16)
    for (int i = 0; i < 3; ++i)
        if (fscanf(hpcgStream, "%d", localDimensions + i) != 1 || localDimensions[i] < 16)
            localDimensions[i] = 16;

    SkipUntilEol(hpcgStream); // skip the rest of the dimensions line

    if (secondsPerRun != 0)
    { // Only read number of seconds if the pointer is non-zero
        if (fscanf(hpcgStream, "%d", secondsPerRun) != 1 || secondsPerRun[0] < 0)
            secondsPerRun[0] = 30 * 60; // default: 30 minutes
    }

    SkipUntilEol(hpcgStream); // skip the rest of the run-duration line

    // Optional fifth line: process grid dimensions npx npy npz
    for (int i = 0; i < 3; ++i)
        // the user didn't specify (or values are invalid) process dimensions
        if (fscanf(hpcgStream, "%d", localProcDimensions + i) != 1 || localProcDimensions[i] < 1)
            localProcDimensions[i] = 0; // value 0 means: "not specified" and it will be fixed later

    fclose(hpcgStream);

    return 0;
}
|
||||
20
src/ReadHpcgDat.hpp
Normal file
20
src/ReadHpcgDat.hpp
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef READHPCGDAT_HPP
|
||||
#define READHPCGDAT_HPP
|
||||
|
||||
int ReadHpcgDat(int* localDimensions, int* secondsPerRun, int* localProcDimensions, char* filename);
|
||||
|
||||
#endif // READHPCGDAT_HPP
|
||||
512
src/ReportResults.cpp
Normal file
512
src/ReportResults.cpp
Normal file
@@ -0,0 +1,512 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file ReportResults.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
#include "OptimizeProblem.hpp"
|
||||
#include "OutputFile.hpp"
|
||||
#include "ReportResults.hpp"
|
||||
#include <vector>
|
||||
|
||||
#ifdef HPCG_DEBUG
|
||||
#include <fstream>
|
||||
using std::endl;
|
||||
|
||||
#include "hpcg.hpp"
|
||||
#endif
|
||||
|
||||
extern int use_output_file;
|
||||
|
||||
/*!
|
||||
Creates a YAML file and writes the information about the HPCG run, its results, and validity.
|
||||
|
||||
@param[in] geom The description of the problem's geometry.
|
||||
@param[in] A The known system matrix
|
||||
@param[in] numberOfMgLevels Number of levels in multigrid V cycle
|
||||
@param[in] numberOfCgSets Number of CG runs performed
|
||||
@param[in] niters Number of preconditioned CG iterations performed to lower the residual below a threshold
|
||||
@param[in] times Vector of cumulative timings for each of the phases of a preconditioned CG iteration
|
||||
@param[in] testcg_data the data structure with the results of the CG-correctness test including pass/fail
|
||||
information
|
||||
@param[in] testsymmetry_data the data structure with the results of the CG symmetry test including pass/fail
|
||||
information
|
||||
@param[in] testnorms_data the data structure with the results of the CG norm test including pass/fail information
|
||||
@param[in] global_failure indicates whether a failure occurred during the correctness tests of CG
|
||||
|
||||
@see YAML_Doc
|
||||
*/
|
||||
|
||||
void ReportResults(const SparseMatrix& A, int numberOfMgLevels, int numberOfCgSets, int refMaxIters, int optMaxIters,
|
||||
double times[], const TestCGData& testcg_data, const TestSymmetryData& testsymmetry_data,
|
||||
const TestNormsData& testnorms_data, int global_failure, bool quickPath)
|
||||
{
|
||||
|
||||
double minOfficialTime = 1800; // Any official benchmark result must run at least this many seconds
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
double t4 = times[4];
|
||||
double t4min = 0.0;
|
||||
double t4max = 0.0;
|
||||
double t4avg = 0.0;
|
||||
MPI_Allreduce(&t4, &t4min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&t4, &t4max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&t4, &t4avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
t4avg = t4avg / ((double) A.geom->size);
|
||||
#endif
|
||||
|
||||
if (A.geom->rank == 0)
|
||||
{ // Only PE 0 needs to compute and report timing results
|
||||
|
||||
// TODO: Put the FLOP count, Memory BW and Memory Usage models into separate functions
|
||||
|
||||
// ======================== FLOP count model =======================================
|
||||
|
||||
double fNumberOfCgSets = numberOfCgSets;
|
||||
double fniters = fNumberOfCgSets * (double) optMaxIters;
|
||||
double fnrow = A.totalNumberOfRows;
|
||||
double fnnz = A.totalNumberOfNonzeros;
|
||||
|
||||
// Op counts come from implementation of CG in CG.cpp (include 1 extra for the CG preamble ops)
|
||||
double fnops_ddot = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow; // 3 ddots with nrow adds and nrow mults
|
||||
double fnops_waxpby
|
||||
= (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow; // 3 WAXPBYs with nrow adds and nrow mults
|
||||
double fnops_sparsemv = (fniters + fNumberOfCgSets) * 2.0 * fnnz; // 1 SpMV with nnz adds and nnz mults
|
||||
// Op counts from the multigrid preconditioners
|
||||
double fnops_precond = 0.0;
|
||||
const SparseMatrix* Af = &A;
|
||||
for (int i = 1; i < numberOfMgLevels; ++i)
|
||||
{
|
||||
double fnnz_Af = Af->totalNumberOfNonzeros;
|
||||
double fnumberOfPresmootherSteps = Af->mgData->numberOfPresmootherSteps;
|
||||
double fnumberOfPostsmootherSteps = Af->mgData->numberOfPostsmootherSteps;
|
||||
fnops_precond += fnumberOfPresmootherSteps * fniters * 4.0 * fnnz_Af; // number of presmoother flops
|
||||
fnops_precond += fniters * 2.0 * fnnz_Af; // cost of fine grid residual calculation
|
||||
fnops_precond += fnumberOfPostsmootherSteps * fniters * 4.0 * fnnz_Af; // number of postsmoother flops
|
||||
Af = Af->Ac; // Go to next coarse level
|
||||
}
|
||||
|
||||
fnops_precond
|
||||
+= fniters * 4.0 * ((double) Af->totalNumberOfNonzeros); // One symmetric GS sweep at the coarsest level
|
||||
double fnops = fnops_ddot + fnops_waxpby + fnops_sparsemv + fnops_precond;
|
||||
double frefnops = fnops * ((double) refMaxIters) / ((double) optMaxIters);
|
||||
|
||||
// ======================== Memory bandwidth model =======================================
|
||||
|
||||
// Read/Write counts come from implementation of CG in CG.cpp (include 1 extra for the CG preamble ops)
|
||||
double fnreads_ddot
|
||||
= (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow * sizeof(double); // 3 ddots with 2 nrow reads
|
||||
double fnwrites_ddot = (3.0 * fniters + fNumberOfCgSets) * sizeof(double); // 3 ddots with 1 write
|
||||
double fnreads_waxpby = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow
|
||||
* sizeof(double); // 3 WAXPBYs with nrow adds and nrow mults
|
||||
double fnwrites_waxpby
|
||||
= (3.0 * fniters + fNumberOfCgSets) * fnrow * sizeof(double); // 3 WAXPBYs with nrow adds and nrow mults
|
||||
double fnreads_sparsemv = (fniters + fNumberOfCgSets)
|
||||
* (fnnz * (sizeof(double) + sizeof(local_int_t))
|
||||
+ fnrow * sizeof(double)); // 1 SpMV with nnz reads of values, nnz reads indices,
|
||||
// plus nrow reads of x
|
||||
double fnwrites_sparsemv = (fniters + fNumberOfCgSets) * fnrow * sizeof(double); // 1 SpMV nrow writes
|
||||
// Op counts from the multigrid preconditioners
|
||||
double fnreads_precond = 0.0;
|
||||
double fnwrites_precond = 0.0;
|
||||
Af = &A;
|
||||
for (int i = 1; i < numberOfMgLevels; ++i)
|
||||
{
|
||||
double fnnz_Af = Af->totalNumberOfNonzeros;
|
||||
double fnrow_Af = Af->totalNumberOfRows;
|
||||
double fnumberOfPresmootherSteps = Af->mgData->numberOfPresmootherSteps;
|
||||
double fnumberOfPostsmootherSteps = Af->mgData->numberOfPostsmootherSteps;
|
||||
fnreads_precond += fnumberOfPresmootherSteps * fniters
|
||||
* (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t))
|
||||
+ fnrow_Af * sizeof(double)); // number of presmoother reads
|
||||
fnwrites_precond
|
||||
+= fnumberOfPresmootherSteps * fniters * fnrow_Af * sizeof(double); // number of presmoother writes
|
||||
fnreads_precond += fniters
|
||||
* (fnnz_Af * (sizeof(double) + sizeof(local_int_t))
|
||||
+ fnrow_Af * sizeof(double)); // Number of reads for fine grid residual calculation
|
||||
fnwrites_precond
|
||||
+= fniters * fnnz_Af * sizeof(double); // Number of writes for fine grid residual calculation
|
||||
fnreads_precond += fnumberOfPostsmootherSteps * fniters
|
||||
* (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t))
|
||||
+ fnrow_Af * sizeof(double)); // number of postsmoother reads
|
||||
fnwrites_precond
|
||||
+= fnumberOfPostsmootherSteps * fniters * fnnz_Af * sizeof(double); // number of postsmoother writes
|
||||
Af = Af->Ac; // Go to next coarse level
|
||||
}
|
||||
|
||||
double fnnz_Af = Af->totalNumberOfNonzeros;
|
||||
double fnrow_Af = Af->totalNumberOfRows;
|
||||
fnreads_precond
|
||||
+= fniters * (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t)) + fnrow_Af * sizeof(double));
|
||||
; // One symmetric GS sweep at the coarsest level
|
||||
fnwrites_precond += fniters * fnrow_Af * sizeof(double); // One symmetric GS sweep at the coarsest level
|
||||
double fnreads = fnreads_ddot + fnreads_waxpby + fnreads_sparsemv + fnreads_precond;
|
||||
double fnwrites = fnwrites_ddot + fnwrites_waxpby + fnwrites_sparsemv + fnwrites_precond;
|
||||
double frefnreads = fnreads * ((double) refMaxIters) / ((double) optMaxIters);
|
||||
double frefnwrites = fnwrites * ((double) refMaxIters) / ((double) optMaxIters);
|
||||
|
||||
// ======================== Memory usage model =======================================
|
||||
|
||||
// Data in GenerateProblem_ref
|
||||
|
||||
double numberOfNonzerosPerRow
|
||||
= 27.0; // We are approximating a 27-point finite element/volume/difference 3D stencil
|
||||
double size = ((double) A.geom->size); // Needed for estimating size of halo
|
||||
|
||||
double fnbytes = ((double) sizeof(Geometry)); // Geometry struct in main.cpp
|
||||
fnbytes += ((double) sizeof(double) * fNumberOfCgSets); // testnorms_data in main.cpp
|
||||
|
||||
// Model for GenerateProblem_ref.cpp
|
||||
fnbytes += fnrow * sizeof(char); // array nonzerosInRow
|
||||
fnbytes += fnrow * ((double) sizeof(global_int_t*)); // mtxIndG
|
||||
fnbytes += fnrow * ((double) sizeof(local_int_t*)); // mtxIndL
|
||||
fnbytes += fnrow * ((double) sizeof(double*)); // matrixValues
|
||||
fnbytes += fnrow * ((double) sizeof(double*)); // matrixDiagonal
|
||||
fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(local_int_t)); // mtxIndL[1..nrows]
|
||||
fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(double)); // matrixValues[1..nrows]
|
||||
fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(global_int_t)); // mtxIndG[1..nrows]
|
||||
fnbytes += fnrow * ((double) 3 * sizeof(double)); // x, b, xexact
|
||||
|
||||
// Model for CGData.hpp
|
||||
double fncol = ((global_int_t) A.localNumberOfColumns)
|
||||
* size; // Estimate of the global number of columns using the value from rank 0
|
||||
fnbytes += fnrow * ((double) 2 * sizeof(double)); // r, Ap
|
||||
fnbytes += fncol * ((double) 2 * sizeof(double)); // z, p
|
||||
|
||||
std::vector<double> fnbytesPerLevel(numberOfMgLevels); // Count byte usage per level (level 0 is main CG level)
|
||||
fnbytesPerLevel[0] = fnbytes;
|
||||
|
||||
// Benchmarker-provided model for OptimizeProblem.cpp
|
||||
double fnbytes_OptimizedProblem = OptimizeProblemMemoryUse(A);
|
||||
fnbytes += fnbytes_OptimizedProblem;
|
||||
|
||||
Af = A.Ac;
|
||||
for (int i = 1; i < numberOfMgLevels; ++i)
|
||||
{
|
||||
double fnrow_Af = Af->totalNumberOfRows;
|
||||
double fncol_Af = ((global_int_t) Af->localNumberOfColumns)
|
||||
* size; // Estimate of the global number of columns using the value from rank 0
|
||||
double fnbytes_Af = 0.0;
|
||||
// Model for GenerateCoarseProblem.cpp
|
||||
fnbytes_Af += fnrow_Af * ((double) sizeof(local_int_t)); // f2cOperator
|
||||
fnbytes_Af += fnrow_Af * ((double) sizeof(double)); // rc
|
||||
fnbytes_Af += 2.0 * fncol_Af
|
||||
* ((double) sizeof(double)); // xc, Axf are estimated based on the size of these arrays on rank 0
|
||||
fnbytes_Af += ((double) (sizeof(Geometry) + sizeof(SparseMatrix) + 3 * sizeof(Vector)
|
||||
+ sizeof(MGData))); // Account for structs geomc, Ac, rc, xc, Axf - (minor)
|
||||
|
||||
// Model for GenerateProblem.cpp (called within GenerateCoarseProblem.cpp)
|
||||
fnbytes_Af += fnrow_Af * sizeof(char); // array nonzerosInRow
|
||||
fnbytes_Af += fnrow_Af * ((double) sizeof(global_int_t*)); // mtxIndG
|
||||
fnbytes_Af += fnrow_Af * ((double) sizeof(local_int_t*)); // mtxIndL
|
||||
fnbytes_Af += fnrow_Af * ((double) sizeof(double*)); // matrixValues
|
||||
fnbytes_Af += fnrow_Af * ((double) sizeof(double*)); // matrixDiagonal
|
||||
fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(local_int_t)); // mtxIndL[1..nrows]
|
||||
fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(double)); // matrixValues[1..nrows]
|
||||
fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(global_int_t)); // mtxIndG[1..nrows]
|
||||
|
||||
// Model for SetupHalo_ref.cpp
|
||||
#ifndef HPCG_NO_MPI
|
||||
fnbytes_Af += ((double) sizeof(double) * Af->totalToBeSent); // sendBuffer
|
||||
fnbytes_Af += ((double) sizeof(local_int_t) * Af->totalToBeSent); // elementsToSend
|
||||
fnbytes_Af += ((double) sizeof(int) * Af->numberOfSendNeighbors); // neighbors
|
||||
fnbytes_Af += ((double) sizeof(local_int_t) * Af->numberOfSendNeighbors); // receiveLength, sendLength
|
||||
#endif
|
||||
fnbytesPerLevel[i] = fnbytes_Af;
|
||||
fnbytes += fnbytes_Af; // Running sum
|
||||
Af = Af->Ac; // Go to next coarse level
|
||||
}
|
||||
|
||||
assert(Af == 0); // Make sure we got to the lowest grid level
|
||||
|
||||
// Count number of bytes used per equation
|
||||
double fnbytesPerEquation = fnbytes / fnrow;
|
||||
|
||||
// Instantiate YAML document
|
||||
OutputFile doc("HPCG-Benchmark", "3.1");
|
||||
doc.add("Release date", "March 28, 2019");
|
||||
|
||||
doc.add("Machine Summary", "");
|
||||
doc.get("Machine Summary")->add("Distributed Processes", A.geom->size);
|
||||
doc.get("Machine Summary")->add("Threads per processes", A.geom->numThreads);
|
||||
|
||||
doc.add("Global Problem Dimensions", "");
|
||||
doc.get("Global Problem Dimensions")->add("Global nx", A.geom->gnx);
|
||||
doc.get("Global Problem Dimensions")->add("Global ny", A.geom->gny);
|
||||
doc.get("Global Problem Dimensions")->add("Global nz", A.geom->gnz);
|
||||
|
||||
doc.add("Processor Dimensions", "");
|
||||
doc.get("Processor Dimensions")->add("npx", A.geom->npx);
|
||||
doc.get("Processor Dimensions")->add("npy", A.geom->npy);
|
||||
doc.get("Processor Dimensions")->add("npz", A.geom->npz);
|
||||
|
||||
doc.add("Local Domain Dimensions", "");
|
||||
doc.get("Local Domain Dimensions")->add("nx", A.geom->nx);
|
||||
doc.get("Local Domain Dimensions")->add("ny", A.geom->ny);
|
||||
|
||||
doc.add("########## Problem Summary ##########", "");
|
||||
|
||||
doc.add("Setup Information", "");
|
||||
doc.get("Setup Information")->add("Setup Time", times[9]);
|
||||
|
||||
doc.add("Linear System Information", "");
|
||||
doc.get("Linear System Information")->add("Number of Equations", A.totalNumberOfRows);
|
||||
doc.get("Linear System Information")->add("Number of Nonzero Terms", A.totalNumberOfNonzeros);
|
||||
|
||||
doc.add("Multigrid Information", "");
|
||||
doc.get("Multigrid Information")->add("Number of coarse grid levels", numberOfMgLevels - 1);
|
||||
Af = &A;
|
||||
doc.get("Multigrid Information")->add("Coarse Grids", "");
|
||||
for (int i = 1; i < numberOfMgLevels; ++i)
|
||||
{
|
||||
doc.get("Multigrid Information")->get("Coarse Grids")->add("Grid Level", i);
|
||||
doc.get("Multigrid Information")
|
||||
->get("Coarse Grids")
|
||||
->add("Number of Equations", Af->Ac->totalNumberOfRows);
|
||||
doc.get("Multigrid Information")
|
||||
->get("Coarse Grids")
|
||||
->add("Number of Nonzero Terms", Af->Ac->totalNumberOfNonzeros);
|
||||
doc.get("Multigrid Information")
|
||||
->get("Coarse Grids")
|
||||
->add("Number of Presmoother Steps", Af->mgData->numberOfPresmootherSteps);
|
||||
doc.get("Multigrid Information")
|
||||
->get("Coarse Grids")
|
||||
->add("Number of Postsmoother Steps", Af->mgData->numberOfPostsmootherSteps);
|
||||
Af = Af->Ac;
|
||||
}
|
||||
|
||||
doc.add("########## Memory Use Summary ##########", "");
|
||||
|
||||
doc.add("Memory Use Information", "");
|
||||
doc.get("Memory Use Information")->add("Total memory used for data (Gbytes)", fnbytes / 1000000000.0);
|
||||
doc.get("Memory Use Information")
|
||||
->add("Memory used for OptimizeProblem data (Gbytes)", fnbytes_OptimizedProblem / 1000000000.0);
|
||||
doc.get("Memory Use Information")
|
||||
->add("Bytes per equation (Total memory / Number of Equations)", fnbytesPerEquation);
|
||||
|
||||
doc.get("Memory Use Information")
|
||||
->add("Memory used for linear system and CG (Gbytes)", fnbytesPerLevel[0] / 1000000000.0);
|
||||
|
||||
doc.get("Memory Use Information")->add("Coarse Grids", "");
|
||||
for (int i = 1; i < numberOfMgLevels; ++i)
|
||||
{
|
||||
doc.get("Memory Use Information")->get("Coarse Grids")->add("Grid Level", i);
|
||||
doc.get("Memory Use Information")
|
||||
->get("Coarse Grids")
|
||||
->add("Memory used", fnbytesPerLevel[i] / 1000000000.0);
|
||||
}
|
||||
|
||||
doc.add("########## V&V Testing Summary ##########", "");
|
||||
doc.add("Spectral Convergence Tests", "");
|
||||
if (testcg_data.count_fail == 0)
|
||||
doc.get("Spectral Convergence Tests")->add("Result", "PASSED");
|
||||
else
|
||||
doc.get("Spectral Convergence Tests")->add("Result", "FAILED");
|
||||
doc.get("Spectral Convergence Tests")->add("Unpreconditioned", "");
|
||||
doc.get("Spectral Convergence Tests")
|
||||
->get("Unpreconditioned")
|
||||
->add("Maximum iteration count", testcg_data.niters_max_no_prec);
|
||||
doc.get("Spectral Convergence Tests")
|
||||
->get("Unpreconditioned")
|
||||
->add("Expected iteration count", testcg_data.expected_niters_no_prec);
|
||||
doc.get("Spectral Convergence Tests")->add("Preconditioned", "");
|
||||
doc.get("Spectral Convergence Tests")
|
||||
->get("Preconditioned")
|
||||
->add("Maximum iteration count", testcg_data.niters_max_prec);
|
||||
doc.get("Spectral Convergence Tests")
|
||||
->get("Preconditioned")
|
||||
->add("Expected iteration count", testcg_data.expected_niters_prec);
|
||||
|
||||
const char DepartureFromSymmetry[] = "Departure from Symmetry |x'Ay-y'Ax|/(2*||x||*||A||*||y||)/epsilon";
|
||||
doc.add(DepartureFromSymmetry, "");
|
||||
if (testsymmetry_data.count_fail == 0)
|
||||
doc.get(DepartureFromSymmetry)->add("Result", "PASSED");
|
||||
else
|
||||
doc.get(DepartureFromSymmetry)->add("Result", "FAILED");
|
||||
doc.get(DepartureFromSymmetry)->add("Departure for SpMV", testsymmetry_data.depsym_spmv);
|
||||
doc.get(DepartureFromSymmetry)->add("Departure for MG", testsymmetry_data.depsym_mg);
|
||||
|
||||
doc.add("########## Iterations Summary ##########", "");
|
||||
doc.add("Iteration Count Information", "");
|
||||
if (!global_failure)
|
||||
doc.get("Iteration Count Information")->add("Result", "PASSED");
|
||||
else
|
||||
doc.get("Iteration Count Information")->add("Result", "FAILED");
|
||||
doc.get("Iteration Count Information")->add("Reference CG iterations per set", refMaxIters);
|
||||
doc.get("Iteration Count Information")->add("Optimized CG iterations per set", optMaxIters);
|
||||
doc.get("Iteration Count Information")
|
||||
->add("Total number of reference iterations", refMaxIters * numberOfCgSets);
|
||||
doc.get("Iteration Count Information")
|
||||
->add("Total number of optimized iterations", optMaxIters * numberOfCgSets);
|
||||
|
||||
doc.add("########## Reproducibility Summary ##########", "");
|
||||
doc.add("Reproducibility Information", "");
|
||||
if (testnorms_data.pass)
|
||||
doc.get("Reproducibility Information")->add("Result", "PASSED");
|
||||
else
|
||||
doc.get("Reproducibility Information")->add("Result", "FAILED");
|
||||
doc.get("Reproducibility Information")->add("Scaled residual mean", testnorms_data.mean);
|
||||
doc.get("Reproducibility Information")->add("Scaled residual variance", testnorms_data.variance);
|
||||
|
||||
doc.add("########## Performance Summary (times in sec) ##########", "");
|
||||
|
||||
doc.add("Benchmark Time Summary", "");
|
||||
doc.get("Benchmark Time Summary")->add("Optimization phase", times[7]);
|
||||
doc.get("Benchmark Time Summary")->add("DDOT", times[1]);
|
||||
doc.get("Benchmark Time Summary")->add("WAXPBY", times[2]);
|
||||
doc.get("Benchmark Time Summary")->add("SpMV", times[3]);
|
||||
doc.get("Benchmark Time Summary")->add("MG", times[5]);
|
||||
doc.get("Benchmark Time Summary")->add("Total", times[0]);
|
||||
|
||||
doc.add("Floating Point Operations Summary", "");
|
||||
doc.get("Floating Point Operations Summary")->add("Raw DDOT", fnops_ddot);
|
||||
doc.get("Floating Point Operations Summary")->add("Raw WAXPBY", fnops_waxpby);
|
||||
doc.get("Floating Point Operations Summary")->add("Raw SpMV", fnops_sparsemv);
|
||||
doc.get("Floating Point Operations Summary")->add("Raw MG", fnops_precond);
|
||||
doc.get("Floating Point Operations Summary")->add("Total", fnops);
|
||||
doc.get("Floating Point Operations Summary")->add("Total with convergence overhead", frefnops);
|
||||
|
||||
doc.add("GB/s Summary", "");
|
||||
doc.get("GB/s Summary")->add("Raw Read B/W", fnreads / times[0] / 1.0E9);
|
||||
doc.get("GB/s Summary")->add("Raw Write B/W", fnwrites / times[0] / 1.0E9);
|
||||
doc.get("GB/s Summary")->add("Raw Total B/W", (fnreads + fnwrites) / (times[0]) / 1.0E9);
|
||||
doc.get("GB/s Summary")
|
||||
->add("Total with convergence and optimization phase overhead",
|
||||
(frefnreads + frefnwrites) / (times[0] + fNumberOfCgSets * (times[7] / 10.0 + times[9] / 10.0))
|
||||
/ 1.0E9);
|
||||
|
||||
doc.add("GFLOP/s Summary", "");
|
||||
doc.get("GFLOP/s Summary")->add("Raw DDOT", fnops_ddot / times[1] / 1.0E9);
|
||||
doc.get("GFLOP/s Summary")->add("Raw WAXPBY", fnops_waxpby / times[2] / 1.0E9);
|
||||
doc.get("GFLOP/s Summary")->add("Raw SpMV", fnops_sparsemv / (times[3]) / 1.0E9);
|
||||
doc.get("GFLOP/s Summary")->add("Raw MG", fnops_precond / (times[5]) / 1.0E9);
|
||||
doc.get("GFLOP/s Summary")->add("Raw Total", fnops / times[0] / 1.0E9);
|
||||
doc.get("GFLOP/s Summary")->add("Total with convergence overhead", frefnops / times[0] / 1.0E9);
|
||||
// This final GFLOP/s rating includes the overhead of problem setup and optimizing the data structures vs ten
|
||||
// sets of 50 iterations of CG
|
||||
double totalGflops = frefnops / (times[0] + fNumberOfCgSets * (times[7] / 10.0 + times[9] / 10.0)) / 1.0E9;
|
||||
double totalGflops24 = frefnops / (times[0] + fNumberOfCgSets * times[7] / 10.0) / 1.0E9;
|
||||
doc.get("GFLOP/s Summary")->add("Total with convergence and optimization phase overhead", totalGflops);
|
||||
|
||||
doc.add("User Optimization Overheads", "");
|
||||
doc.get("User Optimization Overheads")->add("Optimization phase time (sec)", (times[7]));
|
||||
doc.get("User Optimization Overheads")
|
||||
->add("Optimization phase time vs reference SpMV+MG time", times[7] / times[8]);
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
doc.add("DDOT Timing Variations", "");
|
||||
doc.get("DDOT Timing Variations")->add("Min DDOT MPI_Allreduce time", t4min);
|
||||
doc.get("DDOT Timing Variations")->add("Max DDOT MPI_Allreduce time", t4max);
|
||||
doc.get("DDOT Timing Variations")->add("Avg DDOT MPI_Allreduce time", t4avg);
|
||||
|
||||
// doc.get("Sparse Operations Overheads")->add("Halo exchange time (sec)", (times[6]));
|
||||
// doc.get("Sparse Operations Overheads")->add("Halo exchange as percentage of SpMV time",
|
||||
// (times[6])/totalSparseMVTime*100.0);
|
||||
#endif
|
||||
doc.add("Final Summary", "");
|
||||
bool isValidRun = (testcg_data.count_fail == 0) && (testsymmetry_data.count_fail == 0) && (testnorms_data.pass)
|
||||
&& (!global_failure);
|
||||
if (isValidRun)
|
||||
{
|
||||
doc.get("Final Summary")->add("HPCG result is VALID with a GFLOP/s rating of", totalGflops);
|
||||
doc.get("Final Summary")->add("HPCG 2.4 rating for historical reasons is", totalGflops24);
|
||||
if (!A.isDotProductOptimized)
|
||||
{
|
||||
doc.get("Final Summary")
|
||||
->add("Reference version of ComputeDotProduct used",
|
||||
"Performance results are most likely suboptimal");
|
||||
}
|
||||
if (!A.isSpmvOptimized)
|
||||
{
|
||||
doc.get("Final Summary")
|
||||
->add("Reference version of ComputeSPMV used", "Performance results are most likely suboptimal");
|
||||
}
|
||||
if (!A.isMgOptimized)
|
||||
{
|
||||
if (A.geom->numThreads > 1)
|
||||
doc.get("Final Summary")
|
||||
->add("Reference version of ComputeMG used and number of threads greater than 1",
|
||||
"Performance results are severely suboptimal");
|
||||
else // numThreads ==1
|
||||
doc.get("Final Summary")
|
||||
->add("Reference version of ComputeMG used", "Performance results are most likely suboptimal");
|
||||
}
|
||||
if (!A.isWaxpbyOptimized)
|
||||
{
|
||||
doc.get("Final Summary")
|
||||
->add("Reference version of ComputeWAXPBY used", "Performance results are most likely suboptimal");
|
||||
}
|
||||
if (times[0] >= minOfficialTime)
|
||||
{
|
||||
doc.get("Final Summary")
|
||||
->add("Please upload results from the YAML file contents to", "http://hpcg-benchmark.org");
|
||||
}
|
||||
else
|
||||
{
|
||||
doc.get("Final Summary")->add("Results are valid but execution time (sec) is", times[0]);
|
||||
if (quickPath)
|
||||
{
|
||||
doc.get("Final Summary")
|
||||
->add("You have selected the QuickPath option",
|
||||
"Results are official for legacy installed systems with confirmation from the HPCG "
|
||||
"Benchmark leaders.");
|
||||
doc.get("Final Summary")
|
||||
->add("After confirmation please upload results from the YAML file contents to",
|
||||
"http://hpcg-benchmark.org");
|
||||
}
|
||||
else
|
||||
{
|
||||
doc.get("Final Summary")
|
||||
->add("Official results execution time (sec) must be at least", minOfficialTime);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
doc.get("Final Summary")->add("HPCG result is", "INVALID.");
|
||||
doc.get("Final Summary")
|
||||
->add("Please review the YAML file contents", "You may NOT submit these results for consideration.");
|
||||
}
|
||||
|
||||
std::string yaml = doc.generate();
|
||||
#ifdef HPCG_DEBUG
|
||||
HPCG_fout << yaml;
|
||||
#endif
|
||||
}
|
||||
return;
|
||||
}
|
||||
26
src/ReportResults.hpp
Normal file
26
src/ReportResults.hpp
Normal file
@@ -0,0 +1,26 @@
|
||||
|
||||
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

#ifndef REPORTRESULTS_HPP
#define REPORTRESULTS_HPP
#include "SparseMatrix.hpp"
#include "TestCG.hpp"
#include "TestNorms.hpp"
#include "TestSymmetry.hpp"

/*!
  Assembles the final benchmark report (memory use, V&V test outcomes,
  timing breakdown, bandwidth and GFLOP/s summaries) as a YAML document.

  @param[in] A                  The optimized system matrix (also carries the
                                "isXxxOptimized" flags reported in the summary)
  @param[in] numberOfMgLevels   Number of multigrid levels (level 0 = fine grid)
  @param[in] numberOfCgSets     Number of CG solve sets that were timed
  @param[in] refMaxIters        Reference CG iterations per set
  @param[in] optMaxIters        Optimized CG iterations per set
  @param[in] times              Timing array (times[0] = total, times[7] =
                                optimization phase; see CG timing conventions)
  @param[in] testcg_data        Spectral convergence test results
  @param[in] testsymmetry_data  Symmetry test results
  @param[in] testnorms_data     Reproducibility (residual norm) test results
  @param[in] global_failure     Nonzero if any rank reported a failure
  @param[in] quickPath          True if the run used the QuickPath option
*/
void ReportResults(const SparseMatrix& A, int numberOfMgLevels, int numberOfCgSets, int refMaxIters, int optMaxIters,
    double times[], const TestCGData& testcg_data, const TestSymmetryData& testsymmetry_data,
    const TestNormsData& testnorms_data, int global_failure, bool quickPath);

#endif // REPORTRESULTS_HPP
|
||||
729
src/SetupHalo.cpp
Normal file
729
src/SetupHalo.cpp
Normal file
@@ -0,0 +1,729 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file SetupHalo.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include <map>
|
||||
#include <mpi.h>
|
||||
#include <set>
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "SetupHalo.hpp"
|
||||
#include "SetupHalo_ref.hpp"
|
||||
|
||||
#ifdef USE_CUDA
|
||||
#include "Cuda.hpp"
|
||||
#include "CudaKernels.hpp"
|
||||
#endif
|
||||
|
||||
#ifdef USE_GRACE
|
||||
#include "CpuKernels.hpp"
|
||||
#endif
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
// Used to find ranks for CPU and GPU programs
|
||||
extern int global_total_ranks;
|
||||
extern int* physical_rank_dims;
|
||||
extern int* logical_rank_to_phys;
|
||||
extern int* rankToId_h;
|
||||
extern int* idToRank_h;
|
||||
extern p2p_comm_mode_t P2P_Mode;
|
||||
#endif
|
||||
|
||||
/*!
|
||||
  Prepares system matrix data structure and creates data necessary
|
||||
for communication of boundary values of this process.
|
||||
|
||||
@param[inout] A The known system matrix
|
||||
|
||||
@see ExchangeHalo
|
||||
*/
|
||||
#ifdef USE_CUDA
|
||||
void SetupHalo_Gpu(SparseMatrix& A)
|
||||
{
|
||||
global_int_t nx = A.geom->nx;
|
||||
global_int_t ny = A.geom->ny;
|
||||
global_int_t nz = A.geom->nz;
|
||||
global_int_t gnx = A.geom->gnx;
|
||||
global_int_t gny = A.geom->gny;
|
||||
global_int_t gnz = A.geom->gnz;
|
||||
global_int_t gix0 = A.geom->gix0;
|
||||
global_int_t giy0 = A.geom->giy0;
|
||||
global_int_t giz0 = A.geom->giz0;
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
local_int_t localNumberOfRows = A.localNumberOfRows;
|
||||
|
||||
local_int_t* send_buffer_d;
|
||||
local_int_t sendbufld
|
||||
= std::max(std::max(A.geom->nx * A.geom->ny, A.geom->nx * A.geom->nz), A.geom->ny * A.geom->nz);
|
||||
int* neighbors = new int[27];
|
||||
int* neighborsPhysical = new int[27];
|
||||
|
||||
CHECK_CUDART(cudaMalloc((void**) &(send_buffer_d), 27 * sendbufld * sizeof(local_int_t)));
|
||||
local_int_t* sendLength = new local_int_t[27];
|
||||
|
||||
local_int_t totalToBeSent = 0;
|
||||
int neiCount = 0;
|
||||
int numberOfExternalValues = 0;
|
||||
|
||||
local_int_t* sendcounts2 = new local_int_t[27];
|
||||
local_int_t* receiveLength = new local_int_t[27];
|
||||
memset(sendcounts2, 0, sizeof(local_int_t) * (27));
|
||||
|
||||
local_int_t* sendcounts_d = NULL;
|
||||
local_int_t* elementsToSendGpu;
|
||||
|
||||
cudaMalloc(&sendcounts_d, sizeof(local_int_t) * (27));
|
||||
cudaMemsetAsync(sendcounts_d, 0, sizeof(local_int_t) * (27), stream);
|
||||
|
||||
// Finds elements to send and neighbors
|
||||
SetupHaloCuda(A, sendbufld, sendcounts_d, send_buffer_d, &totalToBeSent, &neiCount, neighbors, sendLength,
|
||||
&elementsToSendGpu);
|
||||
|
||||
local_int_t* elementsToSend = new local_int_t[totalToBeSent];
|
||||
double* sendBuffer = nullptr;
|
||||
if (totalToBeSent > 0)
|
||||
{
|
||||
cudaMemcpyAsync(
|
||||
elementsToSend, elementsToSendGpu, sizeof(local_int_t) * totalToBeSent, cudaMemcpyDeviceToHost, stream);
|
||||
|
||||
local_int_t* sendcounts = (local_int_t*) malloc(sizeof(local_int_t) * (A.geom->size + 1));
|
||||
memset(sendcounts, 0, sizeof(local_int_t) * (A.geom->size + 1));
|
||||
|
||||
local_int_t *eltsToRecv_d = NULL, *extToLocMap = NULL;
|
||||
|
||||
sendcounts[0] = 0;
|
||||
for (int i = 0; i < neiCount; i++)
|
||||
{
|
||||
receiveLength[i] = sendLength[i];
|
||||
sendcounts[i + 1] = sendcounts[i] + sendLength[i];
|
||||
int neighborId = neighbors[i];
|
||||
neighborsPhysical[i] = logical_rank_to_phys[neighborId];
|
||||
}
|
||||
CHECK_CUDART(cudaMalloc(&extToLocMap, sizeof(local_int_t) * localNumberOfRows));
|
||||
CHECK_CUDART(cudaMalloc(&eltsToRecv_d, sizeof(local_int_t) * totalToBeSent));
|
||||
|
||||
CHECK_CUDART(cudaMallocHost(&(sendBuffer), sizeof(double) * totalToBeSent));
|
||||
CHECK_CUDART(cudaMalloc(&(A.gpuAux.sendBuffer), sizeof(double) * totalToBeSent));
|
||||
|
||||
local_int_t* eltsToRecv = new local_int_t[totalToBeSent];
|
||||
|
||||
// Exchange elements to send with neighbors
|
||||
auto INDEX_TYPE = MPI_INT;
|
||||
#ifdef INDEX_64 // In src/Geometry
|
||||
INDEX_TYPE = MPI_LONG;
|
||||
#endif
|
||||
|
||||
MPI_Status status;
|
||||
int MPI_MY_TAG = 93;
|
||||
MPI_Request* request = new MPI_Request[neiCount];
|
||||
cudaStreamSynchronize(stream);
|
||||
|
||||
local_int_t* recv_ptr = eltsToRecv;
|
||||
for (int i = 0; i < neiCount; i++)
|
||||
{
|
||||
auto n_recv = sendLength[i];
|
||||
MPI_Irecv(recv_ptr, n_recv, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
|
||||
recv_ptr += n_recv;
|
||||
}
|
||||
|
||||
local_int_t* elts_ptr = elementsToSend;
|
||||
for (int i = 0; i < neiCount; i++)
|
||||
{
|
||||
auto n_send = sendLength[i];
|
||||
MPI_Send(elts_ptr, n_send, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD);
|
||||
elts_ptr += n_send;
|
||||
}
|
||||
for (int i = 0; i < neiCount; i++)
|
||||
{
|
||||
MPI_Wait(request + i, &status);
|
||||
}
|
||||
delete[] request;
|
||||
|
||||
cudaMemcpyAsync(
|
||||
eltsToRecv_d, eltsToRecv, sizeof(local_int_t) * (totalToBeSent), cudaMemcpyHostToDevice, stream);
|
||||
|
||||
// Add the sorted indices from neighbors. For each neighbor, add its indices sequentially
|
||||
// before the next neighbor's indices. Tje indices will be adjusted to be
|
||||
// localNumberOfRows + its sequential location
|
||||
for (int neighborCount = 0; neighborCount < neiCount; ++neighborCount)
|
||||
{
|
||||
int neighborId = neighbors[neighborCount];
|
||||
cudaMemsetAsync(extToLocMap, 0, sizeof(local_int_t) * localNumberOfRows, stream);
|
||||
local_int_t str = sendcounts[neighborCount];
|
||||
local_int_t end = sendcounts[neighborCount + 1];
|
||||
ExtToLocMapCuda(localNumberOfRows, str, end, extToLocMap, eltsToRecv_d);
|
||||
ExtTolocCuda(localNumberOfRows, neighborId, A.extNnz, A.csrExtColumns, A.csrExtValues,
|
||||
A.gpuAux.ext2csrOffsets, extToLocMap, A.gpuAux.columns);
|
||||
}
|
||||
|
||||
CHECK_CUDART(cudaFree(sendcounts_d));
|
||||
CHECK_CUDART(cudaFree(extToLocMap));
|
||||
CHECK_CUDART(cudaFree(eltsToRecv_d));
|
||||
|
||||
// For P2P Alltoallv communication
|
||||
if (P2P_Mode == MPI_GPU_All2allv || P2P_Mode == MPI_CPU_All2allv)
|
||||
{
|
||||
int* sdispls = new int[A.geom->size];
|
||||
int* rdispls = new int[A.geom->size];
|
||||
int* scounts = new int[A.geom->size];
|
||||
int* rcounts = new int[A.geom->size];
|
||||
int tmp_s = 0, tmp_r = 0;
|
||||
|
||||
if (sdispls == NULL || rdispls == NULL || scounts == NULL || rcounts == NULL)
|
||||
return;
|
||||
|
||||
for (local_int_t i = 0; i < A.geom->size; i++)
|
||||
{
|
||||
scounts[i] = 0;
|
||||
rcounts[i] = 0;
|
||||
sdispls[i] = 0;
|
||||
rdispls[i] = 0;
|
||||
}
|
||||
|
||||
for (local_int_t i = 0; i < neiCount; i++)
|
||||
{
|
||||
local_int_t root = neighborsPhysical[i];
|
||||
scounts[root] = sendLength[i];
|
||||
rcounts[root] = receiveLength[i];
|
||||
sdispls[root] = tmp_s;
|
||||
tmp_s += sendLength[i];
|
||||
rdispls[root] = tmp_r;
|
||||
tmp_r += receiveLength[i];
|
||||
}
|
||||
|
||||
A.scounts = scounts;
|
||||
A.rcounts = rcounts;
|
||||
A.sdispls = sdispls;
|
||||
A.rdispls = rdispls;
|
||||
}
|
||||
}
|
||||
|
||||
// Store contents in our matrix struct
|
||||
A.numberOfExternalValues = totalToBeSent;
|
||||
A.localNumberOfColumns = A.localNumberOfRows + A.numberOfExternalValues;
|
||||
A.numberOfSendNeighbors = neiCount;
|
||||
A.totalToBeSent = totalToBeSent;
|
||||
A.elementsToSend = elementsToSend;
|
||||
A.gpuAux.elementsToSend = elementsToSendGpu;
|
||||
A.neighbors = neighbors;
|
||||
A.neighborsPhysical = neighborsPhysical;
|
||||
A.receiveLength = receiveLength;
|
||||
A.sendLength = sendLength;
|
||||
A.sendBuffer = sendBuffer;
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_GRACE
|
||||
void SetupHalo_Cpu(SparseMatrix& A)
|
||||
{
|
||||
// Extract Matrix pieces
|
||||
global_int_t nx = A.geom->nx;
|
||||
global_int_t ny = A.geom->ny;
|
||||
global_int_t nz = A.geom->nz;
|
||||
global_int_t gnx = A.geom->gnx;
|
||||
global_int_t gny = A.geom->gny;
|
||||
global_int_t gnz = A.geom->gnz;
|
||||
global_int_t gix0 = A.geom->gix0;
|
||||
global_int_t giy0 = A.geom->giy0;
|
||||
global_int_t giz0 = A.geom->giz0;
|
||||
int npx = A.geom->npx;
|
||||
int npy = A.geom->npy;
|
||||
|
||||
local_int_t localNumberOfRows = A.localNumberOfRows;
|
||||
local_int_t* nonzerosInRow = A.nonzerosInRow;
|
||||
global_int_t** mtxIndG = A.mtxIndG;
|
||||
local_int_t** mtxIndL = A.mtxIndL;
|
||||
|
||||
#ifdef HPCG_NO_MPI // In the non-MPI case we simply copy global indices to local index storage
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
for (local_int_t i = 0; i < localNumberOfRows; i++)
|
||||
{
|
||||
int cur_nnz = nonzerosInRow[i];
|
||||
for (int j = 0; j < cur_nnz; j++)
|
||||
mtxIndL[i][j] = mtxIndG[i][j];
|
||||
}
|
||||
|
||||
#else // Run this section if compiling for MPI
|
||||
|
||||
// Scan global IDs of the nonzeros in the matrix. Determine if the column ID matches a row ID. If not:
|
||||
// 1) We call the ComputeRankOfMatrixRow function, which tells us the rank of the processor owning the row ID.
|
||||
// We need to receive this value of the x vector during the halo exchange.
|
||||
// 2) We record our row ID since we know that the other processor will need this value from us, due to symmetry.
|
||||
std::map<local_int_t, std::map<global_int_t, local_int_t>> externalToLocalMap;
|
||||
local_int_t* extTemp = new local_int_t[localNumberOfRows];
|
||||
|
||||
// Okay Let us git rid of the map
|
||||
local_int_t sendbufld
|
||||
= std::max(std::max(A.geom->nx * A.geom->ny, A.geom->nx * A.geom->nz), A.geom->ny * A.geom->nz);
|
||||
local_int_t* send_buffer = new local_int_t[27 * sendbufld];
|
||||
char* has_external = new char[localNumberOfRows];
|
||||
local_int_t* sendcounter = new local_int_t[27];
|
||||
for (local_int_t i = 0; i < 27; i++)
|
||||
sendcounter[i] = 0;
|
||||
|
||||
// Goes through all local rows, for each local point
|
||||
// find its 27 3D neighbors (including the point itself).
|
||||
// For each neibor decide if it is on a different rank (halo) or local
|
||||
// If external, add to the send buffer
|
||||
// If local, create the local matrix
|
||||
#pragma omp parallel for
|
||||
for (local_int_t i = 0; i < localNumberOfRows; i++)
|
||||
{
|
||||
const local_int_t iz = (i / (nx * ny));
|
||||
const local_int_t iy = (i - iz * nx * ny) / nx;
|
||||
const local_int_t ix = i - (iz * ny + iy) * nx;
|
||||
const global_int_t gix = ix + gix0;
|
||||
const global_int_t giy = iy + giy0;
|
||||
const global_int_t giz = iz + giz0;
|
||||
global_int_t curcol;
|
||||
|
||||
int nnz_c = 0;
|
||||
bool rank_set[27];
|
||||
for (int j = 0; j < 27; j++)
|
||||
{
|
||||
rank_set[j] = false;
|
||||
}
|
||||
has_external[i] = 0;
|
||||
for (int k = 0; k < 27; k++)
|
||||
{
|
||||
long long int cgix = gix + tid2indCpu[k][0];
|
||||
long long int cgiy = giy + tid2indCpu[k][1];
|
||||
long long int cgiz = giz + tid2indCpu[k][2];
|
||||
int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
|
||||
if (ok)
|
||||
{
|
||||
int ipz = cgiz / nz;
|
||||
int ipy = cgiy / ny;
|
||||
int ipx = cgix / nx;
|
||||
|
||||
// For GPUCPU exec mode, find the 3D rank coordinates.
|
||||
// For diff dim between CPU and GPU, we cannot
|
||||
// just divide on the local dim to find ipx/ipy/ipz
|
||||
// We must find it manually based on neighbor 3d coordinates
|
||||
// Note the halo size is always 1
|
||||
if (A.geom->different_dim == Z)
|
||||
{
|
||||
long long int local = cgiz - giz0;
|
||||
if (local >= 0 && local < nz)
|
||||
ipz = A.geom->ipz;
|
||||
else if (local < 0)
|
||||
ipz = A.geom->ipz - 1;
|
||||
else if (local >= nz)
|
||||
ipz = A.geom->ipz + 1;
|
||||
}
|
||||
else if (A.geom->different_dim == Y)
|
||||
{
|
||||
long long int local = cgiy - giy0;
|
||||
if (local >= 0 && local < ny)
|
||||
ipy = A.geom->ipy;
|
||||
else if (local < 0)
|
||||
ipy = A.geom->ipy - 1;
|
||||
else if (local >= ny)
|
||||
ipy = A.geom->ipy + 1;
|
||||
}
|
||||
else if (A.geom->different_dim == X)
|
||||
{
|
||||
long long int local = cgix - gix0;
|
||||
if (local >= 0 && local < nx)
|
||||
ipx = A.geom->ipx;
|
||||
else if (local < 0)
|
||||
ipx = A.geom->ipx - 1;
|
||||
else if (local >= nx)
|
||||
ipx = A.geom->ipx + 1;
|
||||
}
|
||||
|
||||
// Global rank Id
|
||||
int col_rank = ipx + ipy * npx + ipz * npy * npx;
|
||||
|
||||
// The neighbor point rank is diff than the current point rank
|
||||
if (A.geom->logical_rank != col_rank)
|
||||
{
|
||||
has_external[i] = 1;
|
||||
int rankId = rankToId_h[col_rank];
|
||||
local_int_t* p = &(sendcounter[rankId]);
|
||||
// Add the halo point atomically to send_buffer
|
||||
// For all the cols in a row that has the same rank,
|
||||
// we add the row once to the rank buffer
|
||||
if (!rank_set[rankId])
|
||||
{
|
||||
rank_set[rankId] = true;
|
||||
local_int_t t;
|
||||
#pragma omp atomic capture
|
||||
{
|
||||
t = *p;
|
||||
*p += 1;
|
||||
}
|
||||
send_buffer[rankId * sendbufld + t] = i;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// local neighbor, add it to the local matrix
|
||||
local_int_t zi = cgiz - giz0;
|
||||
local_int_t yi = cgiy - giy0;
|
||||
local_int_t xi = cgix - gix0;
|
||||
local_int_t lcol = zi * ny * nx + yi * nx + xi;
|
||||
mtxIndL[i][nnz_c] = lcol;
|
||||
}
|
||||
nnz_c++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Now external data structures
|
||||
// 1 Create elements to send buffer (Sort the indicies for each neighbor)
|
||||
local_int_t totalToBeSent = 0;
|
||||
local_int_t* sendcounts = new local_int_t[A.geom->size + 1];
|
||||
sendcounts[0] = 0;
|
||||
int neighborCount = 0;
|
||||
#pragma omp parallel for
|
||||
for (local_int_t i = 0; i < 27; i++)
|
||||
{
|
||||
if (sendcounter[i] > 0)
|
||||
{
|
||||
std::sort(send_buffer + i * sendbufld, send_buffer + i * sendbufld + sendcounter[i]);
|
||||
}
|
||||
}
|
||||
for (local_int_t i = 0; i < 27; i++)
|
||||
{
|
||||
if (sendcounter[i] > 0)
|
||||
{
|
||||
totalToBeSent += sendcounter[i];
|
||||
sendcounts[neighborCount + 1] = sendcounts[neighborCount] + sendcounter[i];
|
||||
neighborCount++;
|
||||
}
|
||||
}
|
||||
|
||||
// 2 Now find neighbor Ids, neighbor physical Ids (see GenerateGeometry), and elemets to send
|
||||
local_int_t sendEntryCount = 0;
|
||||
local_int_t* receiveLength = new local_int_t[neighborCount];
|
||||
local_int_t* sendLength = new local_int_t[neighborCount];
|
||||
// Build the arrays and lists needed by the ExchangeHalo function.
|
||||
double* sendBuffer = new double[totalToBeSent];
|
||||
int* neighbors = new int[neighborCount];
|
||||
int* neighborsPhysical = new int[neighborCount];
|
||||
local_int_t* elementsToSend = new local_int_t[totalToBeSent];
|
||||
|
||||
neighborCount = 0;
|
||||
for (local_int_t i = 0; i < 27; i++)
|
||||
{
|
||||
if (sendcounter[i] > 0)
|
||||
{
|
||||
int neighborId = idToRank_h[i]; // logical Id
|
||||
int phys_neiId = logical_rank_to_phys[neighborId];
|
||||
|
||||
neighbors[neighborCount] = neighborId; // store rank ID of current neighbor
|
||||
neighborsPhysical[neighborCount] = phys_neiId;
|
||||
receiveLength[neighborCount] = sendcounter[i];
|
||||
sendLength[neighborCount] = sendcounter[i];
|
||||
|
||||
for (int j = 0; j < sendcounter[i]; j++)
|
||||
{
|
||||
elementsToSend[sendEntryCount] = send_buffer[i * sendbufld + j];
|
||||
sendEntryCount++;
|
||||
}
|
||||
neighborCount++;
|
||||
}
|
||||
}
|
||||
|
||||
delete[] send_buffer;
|
||||
delete[] sendcounter;
|
||||
|
||||
// Exchange elements to send wit other neighbors
|
||||
auto INDEX_TYPE = MPI_INT;
|
||||
#ifdef INDEX_64 // In src/Geometry
|
||||
INDEX_TYPE = MPI_LONG;
|
||||
#endif
|
||||
MPI_Status status;
|
||||
int MPI_MY_TAG = 93;
|
||||
MPI_Request* request = new MPI_Request[neighborCount];
|
||||
local_int_t* eltsToRecv = new local_int_t[totalToBeSent];
|
||||
local_int_t* recv_ptr = eltsToRecv;
|
||||
for (int i = 0; i < neighborCount; i++)
|
||||
{
|
||||
int n_recv = sendLength[i];
|
||||
MPI_Irecv(recv_ptr, n_recv, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
|
||||
recv_ptr += n_recv;
|
||||
}
|
||||
|
||||
local_int_t* elts_ptr = elementsToSend;
|
||||
for (int i = 0; i < neighborCount; i++)
|
||||
{
|
||||
local_int_t n_send = sendLength[i];
|
||||
MPI_Send(elts_ptr, n_send, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD);
|
||||
elts_ptr += n_send;
|
||||
}
|
||||
for (int i = 0; i < neighborCount; i++)
|
||||
{
|
||||
MPI_Wait(request + i, &status);
|
||||
}
|
||||
delete[] request;
|
||||
|
||||
// Create a map to be used in the optimization step
|
||||
// Any external column index will be given a sequntail Id
|
||||
// after the number of rows (Will be used to access x vector)
|
||||
int prev_dim = 0;
|
||||
for (int nc = 0; nc < neighborCount; ++nc)
|
||||
{
|
||||
int neighborId = neighbors[nc];
|
||||
int phys_neiId = neighborsPhysical[nc];
|
||||
local_int_t str = sendcounts[nc];
|
||||
local_int_t end = sendcounts[nc + 1];
|
||||
for (int j = str; j < end; j++)
|
||||
{
|
||||
const local_int_t col = eltsToRecv[j];
|
||||
externalToLocalMap[neighborId][col] = localNumberOfRows + j;
|
||||
}
|
||||
}
|
||||
|
||||
delete[] eltsToRecv;
|
||||
delete[] sendcounts;
|
||||
|
||||
if (totalToBeSent > 0)
|
||||
{
|
||||
// Last step sort all external IDs per rank Id, elements of neighbor 0 first, then 1, and so on
|
||||
#pragma omp parallel for
|
||||
for (local_int_t i = 0; i < localNumberOfRows; i++)
|
||||
{
|
||||
int nnz_ext = 0;
|
||||
if (has_external[i] == 1)
|
||||
{
|
||||
|
||||
const local_int_t iz = (i / (nx * ny));
|
||||
const local_int_t iy = (i - iz * nx * ny) / nx;
|
||||
const local_int_t ix = i - (iz * ny + iy) * nx;
|
||||
const global_int_t gix = ix + gix0;
|
||||
const global_int_t giy = iy + giy0;
|
||||
const global_int_t giz = iz + giz0;
|
||||
int nnz_c = 0;
|
||||
|
||||
for (int k = 0; k < 27; k++)
|
||||
{
|
||||
long long int cgix = gix + tid2indCpu[k][0];
|
||||
long long int cgiy = giy + tid2indCpu[k][1];
|
||||
long long int cgiz = giz + tid2indCpu[k][2];
|
||||
|
||||
local_int_t zi = (cgiz) % nz;
|
||||
local_int_t yi = (cgiy) % ny;
|
||||
local_int_t xi = (cgix) % nx;
|
||||
int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
|
||||
int ipz = cgiz / nz;
|
||||
int ipy = cgiy / ny;
|
||||
int ipx = cgix / nx;
|
||||
|
||||
// The indices sent by the neighbor uses the neighbor's nx, ny, and nz which can
|
||||
// be deffirent than the current neighbor's dims. Thus, based on neighor location
|
||||
// and the diffrent_dim we adjust the indices if needed.
|
||||
// Also, the ipx, ipy, and ipz must be updated accordingly
|
||||
global_int_t new_nx = A.geom->nx;
|
||||
global_int_t new_ny = A.geom->ny;
|
||||
|
||||
if (A.geom->different_dim == Z)
|
||||
{
|
||||
long long int local = cgiz - giz0;
|
||||
if (local >= 0 && local < nz)
|
||||
{
|
||||
ipz = A.geom->ipz;
|
||||
zi = local;
|
||||
}
|
||||
else if (local < 0)
|
||||
{
|
||||
ipz = A.geom->ipz - 1;
|
||||
zi = A.geom->previous_neighbor_dim - 1;
|
||||
}
|
||||
else if (local >= nz)
|
||||
{
|
||||
ipz = A.geom->ipz + 1;
|
||||
zi = 0;
|
||||
}
|
||||
}
|
||||
else if (A.geom->different_dim == Y)
|
||||
{
|
||||
long long int local = cgiy - giy0;
|
||||
if (local >= 0 && local < ny)
|
||||
{
|
||||
ipy = A.geom->ipy;
|
||||
yi = local;
|
||||
}
|
||||
else if (local < 0)
|
||||
{
|
||||
ipy = A.geom->ipy - 1;
|
||||
yi = A.geom->previous_neighbor_dim - 1;
|
||||
new_ny = A.geom->previous_neighbor_dim;
|
||||
}
|
||||
else if (local >= ny)
|
||||
{
|
||||
ipy = A.geom->ipy + 1;
|
||||
yi = 0;
|
||||
new_ny = A.geom->next_neighbor_dim;
|
||||
}
|
||||
}
|
||||
else if (A.geom->different_dim == X)
|
||||
{
|
||||
long long int local = cgix - gix0;
|
||||
if (local >= 0 && local < nx)
|
||||
{
|
||||
ipx = A.geom->ipx;
|
||||
xi = local;
|
||||
}
|
||||
else if (local < 0)
|
||||
{
|
||||
ipx = A.geom->ipx - 1;
|
||||
xi = A.geom->previous_neighbor_dim - 1;
|
||||
new_nx = A.geom->previous_neighbor_dim;
|
||||
}
|
||||
else if (local >= nx)
|
||||
{
|
||||
ipx = A.geom->ipx + 1;
|
||||
xi = 0;
|
||||
new_nx = A.geom->next_neighbor_dim;
|
||||
}
|
||||
}
|
||||
local_int_t lcol = zi * new_ny * new_nx + yi * new_nx + xi;
|
||||
int row_rank = ipx + ipy * npx + ipz * npy * npx;
|
||||
|
||||
if (ok)
|
||||
{
|
||||
if (externalToLocalMap.find(row_rank) != externalToLocalMap.end())
|
||||
{
|
||||
mtxIndL[i][nnz_c] = externalToLocalMap[row_rank][lcol];
|
||||
nnz_ext++;
|
||||
}
|
||||
nnz_c++;
|
||||
}
|
||||
}
|
||||
}
|
||||
extTemp[i] = nnz_ext;
|
||||
}
|
||||
}
|
||||
|
||||
if (P2P_Mode == MPI_CPU_All2allv)
|
||||
{
|
||||
int* sdispls = new int[A.geom->size];
|
||||
int* rdispls = new int[A.geom->size];
|
||||
int* scounts = new int[A.geom->size];
|
||||
int* rcounts = new int[A.geom->size];
|
||||
int tmp_s = 0, tmp_r = 0;
|
||||
|
||||
if (sdispls == NULL || rdispls == NULL || scounts == NULL || rcounts == NULL)
|
||||
return;
|
||||
|
||||
for (local_int_t i = 0; i < A.geom->size; i++)
|
||||
{
|
||||
scounts[i] = 0;
|
||||
rcounts[i] = 0;
|
||||
sdispls[i] = 0;
|
||||
rdispls[i] = 0;
|
||||
}
|
||||
|
||||
for (local_int_t i = 0; i < neighborCount; i++)
|
||||
{
|
||||
local_int_t root = neighborsPhysical[i];
|
||||
scounts[root] = sendLength[i];
|
||||
rcounts[root] = receiveLength[i];
|
||||
sdispls[root] = tmp_s;
|
||||
tmp_s += sendLength[i];
|
||||
rdispls[root] = tmp_r;
|
||||
tmp_r += receiveLength[i];
|
||||
}
|
||||
A.scounts = scounts;
|
||||
A.rcounts = rcounts;
|
||||
A.sdispls = sdispls;
|
||||
A.rdispls = rdispls;
|
||||
}
|
||||
|
||||
delete[] has_external;
|
||||
|
||||
// Store contents in our matrix struct
|
||||
A.numberOfExternalValues = totalToBeSent;
|
||||
A.localNumberOfColumns = A.localNumberOfRows + A.numberOfExternalValues;
|
||||
A.numberOfSendNeighbors = neighborCount;
|
||||
A.totalToBeSent = totalToBeSent;
|
||||
A.elementsToSend = elementsToSend;
|
||||
A.neighbors = neighbors;
|
||||
A.neighborsPhysical = neighborsPhysical;
|
||||
A.receiveLength = receiveLength;
|
||||
A.sendLength = sendLength;
|
||||
A.sendBuffer = sendBuffer;
|
||||
A.cpuAux.tempIndex = extTemp;
|
||||
|
||||
#ifdef HPCG_DETAILED_DEBUG
|
||||
HPCG_fout << " For rank " << A.geom->rank << " of " << A.geom->size
|
||||
<< ", number of neighbors = " << A.numberOfSendNeighbors << endl;
|
||||
for (int i = 0; i < A.numberOfSendNeighbors; i++)
|
||||
{
|
||||
HPCG_fout << " rank " << A.geom->rank << " neighbor " << neighbors[i]
|
||||
<< " send/recv length = " << sendLength[i] << "/" << receiveLength[i] << endl;
|
||||
for (local_int_t j = 0; j < sendLength[i]; ++j)
|
||||
HPCG_fout << " rank " << A.geom->rank << " elementsToSend[" << j << "] = " << elementsToSend[j]
|
||||
<< endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
// ifdef HPCG_NO_MPI
|
||||
|
||||
return;
|
||||
}
|
||||
#endif // USE_GRACE
|
||||
|
||||
/*!
  Top-level halo-setup dispatcher: selects the device-specific implementation
  based on this rank's type.

  @param[inout] A The known system matrix; halo/communication data is attached to it
                  by the selected implementation.
*/
void SetupHalo(SparseMatrix& A)
{
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        // GPU rank: CUDA implementation. Silently a no-op if built without CUDA.
        SetupHalo_Gpu(A);
#endif
    }
    else
    {
#ifdef USE_GRACE
        // CPU (Grace) rank: NVPL implementation. Silently a no-op if built without USE_GRACE.
        SetupHalo_Cpu(A);
#endif
    }
}
|
||||
21
src/SetupHalo.hpp
Normal file
21
src/SetupHalo.hpp
Normal file
@@ -0,0 +1,21 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef SETUPHALO_HPP
|
||||
#define SETUPHALO_HPP
|
||||
#include "SparseMatrix.hpp"
|
||||
|
||||
void SetupHalo(SparseMatrix& A);
|
||||
|
||||
#endif // SETUPHALO_HPP
|
||||
212
src/SetupHalo_ref.cpp
Normal file
212
src/SetupHalo_ref.cpp
Normal file
@@ -0,0 +1,212 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file SetupHalo_ref.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include <map>
|
||||
#include <mpi.h>
|
||||
#include <set>
|
||||
#endif
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#ifdef HPCG_DETAILED_DEBUG
|
||||
#include <fstream>
|
||||
using std::endl;
|
||||
#include "hpcg.hpp"
|
||||
#include <cassert>
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#include "SetupHalo_ref.hpp"
|
||||
#include "mytimer.hpp"
|
||||
|
||||
extern int use_output_file;
|
||||
|
||||
/*!
|
||||
Reference version of SetupHalo that prepares system matrix data structure and creates data necessary
|
||||
for communication of boundary values of this process.
|
||||
|
||||
@param[inout] A The known system matrix
|
||||
|
||||
@see ExchangeHalo
|
||||
*/
|
||||
void SetupHalo_ref(SparseMatrix& A)
{

    // Extract Matrix pieces

    local_int_t localNumberOfRows = A.localNumberOfRows;
    local_int_t* nonzerosInRow = A.nonzerosInRow;
    global_int_t** mtxIndG = A.mtxIndG;
    local_int_t** mtxIndL = A.mtxIndL;

#ifdef HPCG_NO_MPI // In the non-MPI case we simply copy global indices to local index storage
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        int cur_nnz = nonzerosInRow[i];
        for (int j = 0; j < cur_nnz; j++)
            mtxIndL[i][j] = mtxIndG[i][j];
    }

#else // Run this section if compiling for MPI

    // Scan global IDs of the nonzeros in the matrix. Determine if the column ID matches a row ID. If not:
    // 1) We call the ComputeRankOfMatrixRow function, which tells us the rank of the processor owning the row ID.
    //    We need to receive this value of the x vector during the halo exchange.
    // 2) We record our row ID since we know that the other processor will need this value from us, due to symmetry.

    std::map<int, std::set<global_int_t>> sendList, receiveList;
    typedef std::map<int, std::set<global_int_t>>::iterator map_iter;
    typedef std::set<global_int_t>::iterator set_iter;
    std::map<global_int_t, local_int_t> externalToLocalMap;

    // TODO: With proper critical and atomic regions, this loop could be threaded, but not attempting it at this time
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        global_int_t currentGlobalRow = A.localToGlobalMap[i];
        for (int j = 0; j < nonzerosInRow[i]; j++)
        {
            global_int_t curIndex = mtxIndG[i][j];
            int rankIdOfColumnEntry = ComputeRankOfMatrixRow(*(A.geom), curIndex);
#ifdef HPCG_DETAILED_DEBUG
            HPCG_fout << "rank, row , col, globalToLocalMap[col] = " << A.geom->rank << " " << currentGlobalRow << " "
                      << curIndex << " " << A.globalToLocalMap[curIndex] << endl;
#endif
            if (A.geom->rank != rankIdOfColumnEntry)
            { // If column index is not a row index, then it comes from another processor
                receiveList[rankIdOfColumnEntry].insert(curIndex);
                sendList[rankIdOfColumnEntry].insert(
                    currentGlobalRow); // Matrix symmetry means we know the neighbor process wants my value
            }
        }
    }

    // Count number of matrix entries to send and receive
    local_int_t totalToBeSent = 0;
    for (map_iter curNeighbor = sendList.begin(); curNeighbor != sendList.end(); ++curNeighbor)
    {
        totalToBeSent += (curNeighbor->second).size();
    }
    local_int_t totalToBeReceived = 0;
    for (map_iter curNeighbor = receiveList.begin(); curNeighbor != receiveList.end(); ++curNeighbor)
    {
        totalToBeReceived += (curNeighbor->second).size();
    }

#ifdef HPCG_DETAILED_DEBUG
    // These are all attributes that should be true, due to symmetry
    HPCG_fout << "totalToBeSent = " << totalToBeSent << " totalToBeReceived = " << totalToBeReceived << endl;
    assert(totalToBeSent == totalToBeReceived);    // Number of sent entry should equal number of received
    assert(sendList.size() == receiveList.size()); // Number of send-to neighbors should equal number of receive-from
    // Each receive-from neighbor should be a send-to neighbor, and send the same number of entries
    for (map_iter curNeighbor = receiveList.begin(); curNeighbor != receiveList.end(); ++curNeighbor)
    {
        assert(sendList.find(curNeighbor->first) != sendList.end());
        assert(sendList[curNeighbor->first].size() == receiveList[curNeighbor->first].size());
    }
#endif

    // Build the arrays and lists needed by the ExchangeHalo function.
    double* sendBuffer = new double[totalToBeSent];
    local_int_t* elementsToSend = new local_int_t[totalToBeSent];
    int* neighbors = new int[sendList.size()];
    local_int_t* receiveLength = new local_int_t[receiveList.size()];
    local_int_t* sendLength = new local_int_t[sendList.size()];
    int neighborCount = 0;
    local_int_t receiveEntryCount = 0;
    local_int_t sendEntryCount = 0;
    for (map_iter curNeighbor = receiveList.begin(); curNeighbor != receiveList.end(); ++curNeighbor, ++neighborCount)
    {
        int neighborId = curNeighbor->first;   // rank of current neighbor we are processing
        neighbors[neighborCount] = neighborId; // store rank ID of current neighbor
        receiveLength[neighborCount] = receiveList[neighborId].size();
        sendLength[neighborCount] = sendList[neighborId].size(); // Get count of sends/receives
        for (set_iter i = receiveList[neighborId].begin(); i != receiveList[neighborId].end(); ++i, ++receiveEntryCount)
        {
            externalToLocalMap[*i]
                = localNumberOfRows + receiveEntryCount; // The remote columns are indexed at end of internals
        }
        for (set_iter i = sendList[neighborId].begin(); i != sendList[neighborId].end(); ++i, ++sendEntryCount)
        {
            elementsToSend[sendEntryCount] = A.globalToLocalMap[*i]; // store local ids of entry to send
        }
    }

    // Convert matrix indices to local IDs
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        for (int j = 0; j < nonzerosInRow[i]; j++)
        {
            global_int_t curIndex = mtxIndG[i][j];
            int rankIdOfColumnEntry = ComputeRankOfMatrixRow(*(A.geom), curIndex);
            if (A.geom->rank == rankIdOfColumnEntry)
            { // My column index, so convert to local index
                mtxIndL[i][j] = A.globalToLocalMap[curIndex];
            }
            else
            { // If column index is not a row index, then it comes from another processor
                mtxIndL[i][j] = externalToLocalMap[curIndex];
            }
        }
    }

    // Store contents in our matrix struct
    A.numberOfExternalValues = externalToLocalMap.size();
    A.localNumberOfColumns = A.localNumberOfRows + A.numberOfExternalValues;
    A.numberOfSendNeighbors = sendList.size();
    A.totalToBeSent = totalToBeSent;
    A.elementsToSend = elementsToSend;
    A.neighbors = neighbors;
    A.receiveLength = receiveLength;
    A.sendLength = sendLength;
    A.sendBuffer = sendBuffer;

#ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << " For rank " << A.geom->rank << " of " << A.geom->size
              << ", number of neighbors = " << A.numberOfSendNeighbors << endl;
    for (int i = 0; i < A.numberOfSendNeighbors; i++)
    {
        HPCG_fout << " rank " << A.geom->rank << " neighbor " << neighbors[i]
                  << " send/recv length = " << sendLength[i] << "/" << receiveLength[i] << endl;
        for (local_int_t j = 0; j < sendLength[i]; ++j)
            HPCG_fout << " rank " << A.geom->rank << " elementsToSend[" << j << "] = " << elementsToSend[j]
                      << endl;
    }
#endif

#endif
    // ifdef HPCG_NO_MPI

    return;
}
|
||||
21
src/SetupHalo_ref.hpp
Normal file
21
src/SetupHalo_ref.hpp
Normal file
@@ -0,0 +1,21 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef SETUPHALO_REF_HPP
|
||||
#define SETUPHALO_REF_HPP
|
||||
#include "SparseMatrix.hpp"
|
||||
|
||||
void SetupHalo_ref(SparseMatrix& A);
|
||||
|
||||
#endif // SETUPHALO_REF_HPP
|
||||
306
src/SparseMatrix.hpp
Normal file
306
src/SparseMatrix.hpp
Normal file
@@ -0,0 +1,306 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file SparseMatrix.hpp
|
||||
|
||||
HPCG data structures for the sparse matrix
|
||||
*/
|
||||
|
||||
#ifndef SPARSEMATRIX_HPP
|
||||
#define SPARSEMATRIX_HPP
|
||||
|
||||
#ifdef USE_CUDA
|
||||
#include <cuda.h>
|
||||
#include <cusparse.h>
|
||||
#endif
|
||||
|
||||
#ifdef USE_GRACE
|
||||
#include <nvpl_sparse.h>
|
||||
#endif
|
||||
|
||||
#include "Cuda.hpp"
|
||||
#include "Geometry.hpp"
|
||||
#include "MGData.hpp"
|
||||
#include "Vector.hpp"
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
|
||||
extern bool Use_Hpcg_Mem_Reduction;
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
extern p2p_comm_mode_t P2P_Mode;
|
||||
#endif
|
||||
|
||||
#if __cplusplus < 201103L
|
||||
// for C++03
|
||||
#include <map>
|
||||
typedef std::map<global_int_t, local_int_t> GlobalToLocalMap;
|
||||
#else
|
||||
// for C++11 or greater
|
||||
#include <unordered_map>
|
||||
using GlobalToLocalMap = std::unordered_map<global_int_t, local_int_t>;
|
||||
#endif
|
||||
|
||||
#ifdef USE_CUDA
|
||||
// Per-matrix cuSPARSE descriptors, created once and reused across iterations.
// NOTE(review): matL/matU presumably hold the lower/upper triangular parts used
// by the SpSV-based smoother (their spsv descriptors are updated with new
// diagonals in TestCG.cpp) — confirm against the setup code.
struct CUSPARSE_STRUCT
{
    // Dense-vector descriptors for the input/output of generic SpMV/SpSV calls.
    cusparseDnVecDescr_t vecX;
    cusparseDnVecDescr_t vecY;
    // Sparse-matrix descriptors: the full matrix and its triangular parts.
    cusparseSpMatDescr_t matA;
    cusparseSpMatDescr_t matL;
    cusparseSpMatDescr_t matU;

    // CUSPARSE SpSV (triangular-solve) descriptors for L and U.
    cusparseSpSVDescr_t spsvDescrL, spsvDescrU;
};
|
||||
|
||||
// Device-side auxiliary data for the CUDA code path. All pointers are raw and
// owned by the setup/teardown routines (not visible in this chunk); lifetimes
// and memory space (device vs. host) should be confirmed against those routines.
struct GPU_AUX_STRUCT
{
    // Uncolored row related info
    local_int_t* nnzPerRow;
    local_int_t* columns;
    double* values;
    local_int_t* csrAPermOffsets;
    local_int_t* csrLPermOffsets;
    local_int_t* csrUPermOffsets;
    local_int_t* diagonalIdx;

    // Sliced EllPACK Aux
    local_int_t* sellADiagonalIdx;

    // Auxiliary data
    // NOTE(review): presumably the fine-to-coarse restriction map — confirm.
    local_int_t* f2c;

    local_int_t* color;
    int* colorCountCpu; // per-color row counts kept on the host

    // MULTI-GPU Aux data
    local_int_t* map;
    local_int_t* ext2csrOffsets;
    local_int_t* elementsToSend;
    global_int_t* localToGlobalMap;
    local_int_t compressNumberOfRows;
    double* sendBuffer;
};
|
||||
#endif
|
||||
|
||||
#ifdef USE_GRACE
|
||||
// Per-matrix NVPL Sparse descriptors for the CPU (Grace) code path — the
// direct analogue of CUSPARSE_STRUCT. The spsv descriptors for L/U are updated
// in place when the matrix diagonal changes (see TestCG.cpp).
struct NVPL_SPARSE_STRUCT
{
    // Dense-vector descriptors for SpMV/SpSV inputs and outputs.
    nvpl_sparse_dn_vec_descr_t vecX;
    nvpl_sparse_dn_vec_descr_t vecY;

    // Sparse-matrix descriptors: triangular parts and the full matrix.
    nvpl_sparse_sp_mat_descr_t matL;
    nvpl_sparse_sp_mat_descr_t matU;
    nvpl_sparse_sp_mat_descr_t matA;

    // Triangular-solve and mat-vec operation descriptors.
    nvpl_sparse_spsv_descr_t spsvDescrL, spsvDescrU;
    nvpl_sparse_spmv_descr_t spmvADescr, spmvLDescr, spmvUDescr;
};
|
||||
|
||||
// Host-side auxiliary data for the CPU (Grace) code path.
struct CPU_AUX_STRUCT
{
    // Auxiliary data
    // Coloring info as number of colors and where each color starts
    // Also keep information on how many consecutive rows share the same color
    // This assumes matrix reordering (rows with same color are packed)
    local_int_t* color;
    local_int_t* firstRowOfColor;
    local_int_t* nRowsWithColor;
    // NOTE(review): filled from extTemp in SetupHalo_Cpu, which records per-row
    // counts of nonzeros that map to external (halo) columns — confirm consumers.
    local_int_t* tempIndex;
};
|
||||
#endif
|
||||
|
||||
struct SparseMatrix_STRUCT
|
||||
{
|
||||
rank_type_t rankType;
|
||||
int level;
|
||||
char* title; //!< name of the sparse matrix
|
||||
Geometry* geom; //!< geometry associated with this matrix
|
||||
global_int_t totalNumberOfRows; //!< total number of matrix rows across all processes
|
||||
global_int_t totalNumberOfNonzeros; //!< total number of matrix nonzeros across all processes
|
||||
local_int_t localNumberOfRows; //!< number of rows local to this process
|
||||
local_int_t localNumberOfColumns; //!< number of columns local to this process
|
||||
local_int_t localNumberOfNonzeros; //!< number of nonzeros local to this process
|
||||
local_int_t* nonzerosInRow; //!< The number of nonzeros in a row will always be 27 or fewer
|
||||
global_int_t** mtxIndG; //!< matrix indices as global values
|
||||
local_int_t** mtxIndL; //!< matrix indices as local values
|
||||
double** matrixValues; //!< values of matrix entries
|
||||
double** matrixDiagonal; //!< values of matrix diagonal entries
|
||||
GlobalToLocalMap globalToLocalMap; //!< global-to-local mapping
|
||||
std::vector<global_int_t> localToGlobalMap; //!< local-to-global mapping
|
||||
mutable bool isDotProductOptimized;
|
||||
mutable bool isSpmvOptimized;
|
||||
mutable bool isMgOptimized;
|
||||
mutable bool isWaxpbyOptimized;
|
||||
|
||||
mutable MGData* mgData; // Pointer to the coarse level data for this fine matrix
|
||||
void* optimizationData; // pointer that can be used to store implementation-specific data
|
||||
|
||||
local_int_t totalToBeSent; //!< total number of entries to be sent
|
||||
local_int_t slice_size;
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
local_int_t numberOfExternalValues; //!< number of entries that are external to this process
|
||||
int numberOfSendNeighbors; //!< number of neighboring processes that will be send local data
|
||||
local_int_t* elementsToSend; //!< elements to send to neighboring processes
|
||||
int* neighbors; //!< neighboring processes
|
||||
int* neighborsPhysical;
|
||||
local_int_t* receiveLength; //!< lenghts of messages received from neighboring processes
|
||||
local_int_t* sendLength; //!< lenghts of messages sent to neighboring processes
|
||||
double* sendBuffer; //!< send buffer for non-blocking sends
|
||||
local_int_t extNnz;
|
||||
#endif
|
||||
|
||||
// Optmization Data common between CPU and GPU
|
||||
// Coloring permutations
|
||||
local_int_t totalColors;
|
||||
local_int_t* ref2opt;
|
||||
local_int_t* opt2ref;
|
||||
local_int_t* f2cPerm;
|
||||
|
||||
// Sliced EllPACK
|
||||
local_int_t *sellASliceMrl, *sellLSliceMrl, *sellUSliceMrl;
|
||||
local_int_t *sellAPermColumns, *sellLPermColumns, *sellUPermColumns;
|
||||
double *sellAPermValues, *sellLPermValues, *sellUPermValues;
|
||||
double* diagonal;
|
||||
|
||||
char* bufferSvL = nullptr;
|
||||
char* bufferSvU = nullptr;
|
||||
char* bufferMvA = nullptr;
|
||||
char* bufferMvL = nullptr;
|
||||
char* bufferMvU = nullptr;
|
||||
|
||||
// MULTI-GPU data
|
||||
local_int_t* csrExtOffsets;
|
||||
local_int_t* csrExtColumns;
|
||||
double* csrExtValues;
|
||||
double* tempBuffer;
|
||||
|
||||
// When MPI_All2allv is used for P2P communication
|
||||
int* scounts;
|
||||
int* rcounts;
|
||||
int* sdispls;
|
||||
int* rdispls;
|
||||
|
||||
#ifdef USE_CUDA
|
||||
CUSPARSE_STRUCT cusparseOpt;
|
||||
GPU_AUX_STRUCT gpuAux;
|
||||
#endif
|
||||
|
||||
// #ifdef USE_GRACE
|
||||
// NVPL_SPARSE_STRUCT nvplSparseOpt;
|
||||
// CPU_AUX_STRUCT cpuAux;
|
||||
// #endif
|
||||
|
||||
mutable struct SparseMatrix_STRUCT* Ac; // Coarse grid matrix
|
||||
};
|
||||
|
||||
typedef struct SparseMatrix_STRUCT SparseMatrix;
|
||||
|
||||
/*!
|
||||
Initializes the known system matrix data structure members to 0.
|
||||
|
||||
@param[in] A the known system matrix
|
||||
*/
|
||||
inline void InitializeSparseMatrix(SparseMatrix& A, Geometry* geom)
{
    A.title = 0;
    A.geom = geom;
    A.level = 0;
    A.totalNumberOfRows = 0;
    A.totalNumberOfNonzeros = 0;
    A.localNumberOfRows = 0;
    A.localNumberOfColumns = 0;
    A.localNumberOfNonzeros = 0;
    A.nonzerosInRow = 0;
    A.mtxIndG = 0;
    A.mtxIndL = 0;
    A.matrixValues = 0;
    A.matrixDiagonal = 0;

    // Optimization is ON by default. The code that switches it OFF is in the
    // functions that are meant to be optimized.
    A.isDotProductOptimized = true;
    A.isSpmvOptimized = true;
    A.isMgOptimized = true;
    A.isWaxpbyOptimized = true;

    A.totalToBeSent = 0;
    A.slice_size = 0;

#ifndef HPCG_NO_MPI
    A.numberOfExternalValues = 0;
    A.numberOfSendNeighbors = 0;
    A.elementsToSend = 0;
    A.neighbors = 0;
    A.neighborsPhysical = 0;
    A.receiveLength = 0;
    A.sendLength = 0;
    A.sendBuffer = 0;
    A.extNnz = 0;
#endif

    // Zero the remaining raw pointers so teardown code and conditionally-taken
    // paths (e.g. the MPI_All2allv arrays) never observe indeterminate values.
    A.optimizationData = 0;
    A.totalColors = 0;
    A.ref2opt = 0;
    A.opt2ref = 0;
    A.f2cPerm = 0;
    A.diagonal = 0;
    A.sellASliceMrl = A.sellLSliceMrl = A.sellUSliceMrl = 0;
    A.sellAPermColumns = A.sellLPermColumns = A.sellUPermColumns = 0;
    A.sellAPermValues = A.sellLPermValues = A.sellUPermValues = 0;
    A.csrExtOffsets = 0;
    A.csrExtColumns = 0;
    A.csrExtValues = 0;
    A.tempBuffer = 0;
    A.scounts = 0;
    A.rcounts = 0;
    A.sdispls = 0;
    A.rdispls = 0;

    A.mgData = 0; // Fine-to-coarse grid transfer initially not defined.
    A.Ac = 0;     // Coarse grid matrix initially not defined.

    return;
}
|
||||
|
||||
/*!
|
||||
Copy values from matrix diagonal into user-provided vector.
|
||||
|
||||
@param[in] A the known system matrix.
|
||||
@param[inout] diagonal Vector of diagonal values (must be allocated before call to this function).
|
||||
*/
|
||||
inline void CopyMatrixDiagonal(SparseMatrix& A, Vector& diagonal)
{
    // The output vector must have exactly one slot per local row.
    assert(A.localNumberOfRows == diagonal.localLength);
    double** diagEntryPtrs = A.matrixDiagonal; // per-row pointers into matrixValues
    double* out = diagonal.values;
    const local_int_t numRows = A.localNumberOfRows;
    for (local_int_t row = 0; row < numRows; ++row)
        out[row] = *diagEntryPtrs[row];
}
|
||||
|
||||
/*!
|
||||
Replace specified matrix diagonal value.
|
||||
|
||||
@param[inout] A The system matrix.
|
||||
@param[in] diagonal Vector of diagonal values that will replace existing matrix diagonal values.
|
||||
*/
|
||||
inline void ReplaceMatrixDiagonal(SparseMatrix& A, Vector& diagonal)
{
    // The replacement vector must have exactly one value per local row.
    assert(A.localNumberOfRows == diagonal.localLength);
    double** diagEntryPtrs = A.matrixDiagonal; // per-row pointers into matrixValues
    const double* src = diagonal.values;
    const local_int_t numRows = A.localNumberOfRows;
    for (local_int_t row = 0; row < numRows; ++row)
        *diagEntryPtrs[row] = src[row];
}
|
||||
#endif // SPARSEMATRIX_HPP
|
||||
243
src/TestCG.cpp
Normal file
243
src/TestCG.cpp
Normal file
@@ -0,0 +1,243 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file TestCG.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
// Changelog
|
||||
//
|
||||
// Version 0.4
|
||||
// - Added timing of setup time for sparse MV
|
||||
// - Corrected percentages reported for sparse MV with overhead
|
||||
//
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
using std::endl;
|
||||
#include "hpcg.hpp"
|
||||
#include <vector>
|
||||
|
||||
#include "CG.hpp"
|
||||
#include "CG_ref.hpp"
|
||||
#include "TestCG.hpp"
|
||||
#include "CpuKernels.hpp"
|
||||
#include "CudaKernels.hpp"
|
||||
|
||||
extern int use_output_file;
|
||||
|
||||
/*!
|
||||
Test the correctness of the Preconditined CG implementation by using a system matrix with a dominant diagonal.
|
||||
|
||||
@param[in] geom The description of the problem's geometry.
|
||||
@param[in] A The known system matrix
|
||||
@param[in] data the data structure with all necessary CG vectors preallocated
|
||||
@param[in] b The known right hand side vector
|
||||
@param[inout] x On entry: the initial guess; on exit: the new approximate solution
|
||||
@param[out] testcg_data the data structure with the results of the test including pass/fail information
|
||||
|
||||
@return Returns zero on success and a non-zero value otherwise.
|
||||
|
||||
@see CG()
|
||||
*/
|
||||
|
||||
int TestCG(SparseMatrix& A, CGData& data, Vector& b, Vector& x, TestCGData& testcg_data)
{
    // Use this array for collecting timing information
    std::vector<double> times(8, 0.0);
    // Temporary storage for holding original diagonal and RHS
    Vector origDiagA, exaggeratedDiagA, origB;
    InitializeVector(origDiagA, A.localNumberOfRows, A.rankType);
    InitializeVector(exaggeratedDiagA, A.localNumberOfRows, A.rankType);
    InitializeVector(origB, A.localNumberOfRows, A.rankType);
    // Snapshot the current diagonal; the CUDA path additionally snapshots the device copy.
    CopyMatrixDiagonal(A, origDiagA);
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        CopyMatrixDiagonalCuda(A, origDiagA);
#endif
    }
    CopyVector(origDiagA, exaggeratedDiagA);
    CopyVector(b, origB);

    // Modify the matrix diagonal to greatly exaggerate diagonal values.
    // CG should converge in about 10 iterations for this problem, regardless of problem size
    for (local_int_t i = 0; i < A.localNumberOfRows; ++i)
    {
        global_int_t globalRowID = A.localToGlobalMap[i];
        if (globalRowID < 9)
        {
            // First nine global rows get distinct scale factors (2e6 .. 10e6);
            // b is scaled identically so the exact solution is unchanged.
            double scale = (globalRowID + 2) * 1.0e6;
            ScaleVectorValue(exaggeratedDiagA, i, scale);
            ScaleVectorValue(b, i, scale);
        }
        else
        {
            ScaleVectorValue(exaggeratedDiagA, i, 1.0e6);
            ScaleVectorValue(b, i, 1.0e6);
        }
    }

    // Reference Matrix
    ReplaceMatrixDiagonal(A, exaggeratedDiagA);

    // Push the exaggerated diagonal into the optimized data structures:
    // permute into the optimized row ordering, then refresh the triangular-solve
    // descriptors so the smoother sees the new diagonal.
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        CopyVectorH2D(exaggeratedDiagA);
        PermVectorCuda(A.opt2ref, b, A.localNumberOfRows);
        PermVectorCuda(A.opt2ref, exaggeratedDiagA, A.localNumberOfRows);
        ReplaceMatrixDiagonalCuda(A, exaggeratedDiagA);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrL, exaggeratedDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrU, exaggeratedDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    else
    {
#ifdef USE_GRACE
        PermVectorCpu(A.opt2ref, b, A.localNumberOfRows);
        PermVectorCpu(A.opt2ref, exaggeratedDiagA, A.localNumberOfRows);
        ReplaceMatrixDiagonalCpu(A, exaggeratedDiagA);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrL, exaggeratedDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrU, exaggeratedDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }

    ////////////////////////////////

    int niters = 0;
    double normr = 0.0;
    double normr0 = 0.0;
    int maxIters = 50;
    int numberOfCgCalls = 2;
    double tolerance = 1.0e-12; // Set tolerance to reasonable value for grossly scaled diagonal terms
    testcg_data.expected_niters_no_prec
        = 12; // For the unpreconditioned CG call, we should take about 10 iterations, permit 12
    testcg_data.expected_niters_prec = 2; // For the preconditioned case, we should take about 1 iteration, permit 2
    testcg_data.niters_max_no_prec = 0;
    testcg_data.niters_max_prec = 0;
    // NOTE(review): count_pass/count_fail are only incremented here; presumably
    // the caller zero-initializes testcg_data — confirm.
    for (int k = 0; k < 2; ++k)
    { // This loop tests both unpreconditioned and preconditioned runs
        int expected_niters = testcg_data.expected_niters_no_prec;
        if (k == 1)
            expected_niters = testcg_data.expected_niters_prec;
        for (int i = 0; i < numberOfCgCalls; ++i)
        {
            ZeroVector(x); // Zero out x
            // k==1 enables preconditioning (last-but-one argument of CG).
            int ierr = CG(A, data, b, x, maxIters, tolerance, niters, normr, normr0, &times[0], k == 1, 0);
            if (ierr)
                if (use_output_file)
                {
                    HPCG_fout << "Error in call to CG: " << ierr << ".\n" << endl;
                }
                else
                {
                    std::cout << "Error in call to CG: " << ierr << ".\n" << endl;
                }
            if (niters <= expected_niters)
            {
                ++testcg_data.count_pass;
            }
            else
            {
                ++testcg_data.count_fail;
            }
            if (k == 0 && niters > testcg_data.niters_max_no_prec)
                testcg_data.niters_max_no_prec = niters; // Keep track of largest iter count
            if (k == 1 && niters > testcg_data.niters_max_prec)
                testcg_data.niters_max_prec = niters; // Same for preconditioned run
            if (A.geom->rank == 0)
            {
                if (use_output_file)
                {
                    HPCG_fout << "Call [" << i << "] Number of Iterations [" << niters << "] Scaled Residual ["
                              << normr / normr0 << "]" << endl;
                }
                else
                {
                    std::cout << "Call [" << i << "] Number of Iterations [" << niters << "] Scaled Residual ["
                              << normr / normr0 << "]" << endl;
                }
                if (niters > expected_niters)
                    if (use_output_file)
                    {
                        HPCG_fout << " Expected " << expected_niters << " iterations. Performed " << niters << "."
                                  << endl;
                    }
                    else
                    {
                        std::cout << " Expected " << expected_niters << " iterations. Performed " << niters << "."
                                  << endl;
                    }
            }
        }
    }

    // Restore matrix diagonal and RHS
    ReplaceMatrixDiagonal(A, origDiagA);

    // Undo the diagonal change in the optimized structures as well.
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        ReplaceMatrixDiagonalCuda(A, origDiagA);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrL, origDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrU, origDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    else
    {
#ifdef USE_GRACE
        ReplaceMatrixDiagonalCpu(A, origDiagA);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrL, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrU, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }

    // Restoring b also undoes the opt2ref permutation applied above.
    CopyVector(origB, b);
    // Delete vectors
    DeleteVector(origDiagA);
    DeleteVector(exaggeratedDiagA);
    DeleteVector(origB);
    testcg_data.normr = normr;

    return 0;
}
|
||||
45
src/TestCG.hpp
Normal file
45
src/TestCG.hpp
Normal file
@@ -0,0 +1,45 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file TestCG.hpp
|
||||
|
||||
HPCG data structure
|
||||
*/
|
||||
|
||||
#ifndef TESTCG_HPP
|
||||
#define TESTCG_HPP
|
||||
|
||||
#include "CGData.hpp"
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "Vector.hpp"
|
||||
#include "hpcg.hpp"
|
||||
|
||||
// Pass/fail bookkeeping for the validation CG runs performed by TestCG().
struct TestCGData_STRUCT
{
    int count_pass; //!< number of successful tests
    int count_fail; //!< number of failed tests
    int expected_niters_no_prec; //!< expected number of test CG iterations without preconditioning with diagonally
                                 //!< dominant matrix (~12)
    int expected_niters_prec; //!< expected number of test CG iterations with preconditioning and with diagonally
                              //!< dominant matrix (~1-2)
    int niters_max_no_prec; //!< maximum number of test CG iterations without preconditioner
    int niters_max_prec; //!< maximum number of test CG iterations with preconditioner
    double normr; //!< residual norm achieved during test CG iterations
};
typedef struct TestCGData_STRUCT TestCGData;
|
||||
|
||||
extern int TestCG(SparseMatrix& A, CGData& data, Vector& b, Vector& x, TestCGData& testcg_data);
|
||||
|
||||
#endif // TESTCG_HPP
|
||||
49
src/TestNorms.cpp
Normal file
49
src/TestNorms.cpp
Normal file
@@ -0,0 +1,49 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file TestNorms.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#include "TestNorms.hpp"
|
||||
#include <cmath>
|
||||
|
||||
/*!
|
||||
Computes the mean and standard deviation of the array of norm results.
|
||||
|
||||
@param[in] testnorms_data data structure with the results of norm test
|
||||
|
||||
@return Returns 0 upon success or non-zero otherwise
|
||||
*/
|
||||
int TestNorms(TestNormsData& testnorms_data)
{
    // Guard against an empty/invalid sample set: the divisions below would
    // otherwise produce NaN. Per the documented contract, return non-zero.
    if (testnorms_data.samples <= 0 || testnorms_data.values == 0)
    {
        testnorms_data.mean = 0.0;
        testnorms_data.variance = 0.0;
        testnorms_data.pass = false;
        return 1;
    }

    // Accumulate deltas relative to the first sample (reduces cancellation
    // error when all samples are close together), then recover the mean.
    double mean_delta = 0.0;
    for (int i = 0; i < testnorms_data.samples; ++i)
        mean_delta += (testnorms_data.values[i] - testnorms_data.values[0]);
    double mean = testnorms_data.values[0] + mean_delta / (double) testnorms_data.samples;
    testnorms_data.mean = mean;

    // Compute (population) variance of the samples about the mean
    double sumdiff = 0.0;
    for (int i = 0; i < testnorms_data.samples; ++i)
        sumdiff += (testnorms_data.values[i] - mean) * (testnorms_data.values[i] - mean);
    testnorms_data.variance = sumdiff / (double) testnorms_data.samples;

    // Determine if variation is sufficiently small to declare success
    testnorms_data.pass = (testnorms_data.variance < 1.0e-6);

    return 0;
}
|
||||
36
src/TestNorms.hpp
Normal file
36
src/TestNorms.hpp
Normal file
@@ -0,0 +1,36 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file TestNorms.hpp
|
||||
|
||||
HPCG data structure
|
||||
*/
|
||||
|
||||
#ifndef TESTNORMS_HPP
|
||||
#define TESTNORMS_HPP
|
||||
|
||||
// Residual-norm statistics gathered across benchmark runs; filled in by TestNorms().
struct TestNormsData_STRUCT
{
    double* values;  //!< sample values (array of 'samples' entries; allocated by the caller)
    double mean;     //!< mean of all samples
    double variance; //!< variance of the samples about the mean
    int samples;     //!< number of samples
    bool pass;       //!< pass/fail indicator
};
typedef struct TestNormsData_STRUCT TestNormsData;
|
||||
|
||||
extern int TestNorms(TestNormsData& testnorms_data);
|
||||
|
||||
#endif // TESTNORMS_HPP
|
||||
298
src/TestSymmetry.cpp
Normal file
298
src/TestSymmetry.cpp
Normal file
@@ -0,0 +1,298 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file TestSymmetry.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
// The MPI include must be first for Windows platforms
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
#include <cfloat>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
using std::endl;
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
#include "hpcg.hpp"
|
||||
|
||||
#include "ComputeDotProduct.hpp"
|
||||
#include "ComputeMG.hpp"
|
||||
#include "ComputeResidual.hpp"
|
||||
#include "ComputeSPMV.hpp"
|
||||
#include "CpuKernels.hpp"
|
||||
#include "CudaKernels.hpp"
|
||||
#include "Geometry.hpp"
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "TestSymmetry.hpp"
|
||||
|
||||
extern int use_output_file;
|
||||
/*!
|
||||
Tests symmetry-preserving properties of the sparse matrix vector multiply and multi-grid routines.
|
||||
|
||||
@param[in] geom The description of the problem's geometry.
|
||||
@param[in] A The known system matrix
|
||||
@param[in] b The known right hand side vector
|
||||
@param[in] xexact The exact solution vector
|
||||
@param[inout] testsymmetry_data The data structure with the results of the CG symmetry test including pass/fail
|
||||
information
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@see ComputeDotProduct
|
||||
@see ComputeDotProduct_ref
|
||||
@see ComputeSPMV
|
||||
@see ComputeSPMV_ref
|
||||
@see ComputeMG
|
||||
@see ComputeMG_ref
|
||||
*/
|
||||
int TestSymmetry(SparseMatrix& A, Vector& b, Vector& xexact, TestSymmetryData& testsymmetry_data)
{

    local_int_t nrow = A.localNumberOfRows;
    local_int_t ncol = A.localNumberOfColumns;
    // Work vectors are sized to the column space so halo (ghost) entries are
    // available to the SPMV and MG kernels.
    Vector x_ncol, y_ncol, z_ncol;
    InitializeVector(x_ncol, ncol, A.rankType);
    InitializeVector(y_ncol, ncol, A.rankType);
    InitializeVector(z_ncol, ncol, A.rankType);

    double t4 = 0.0; // Needed for dot-product call, otherwise unused
    testsymmetry_data.count_fail = 0;

    // Test symmetry of matrix
    // First load vectors with random values
    FillRandomVector(x_ncol);
    FillRandomVector(y_ncol);

    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        // Random fill happens on the host; push both vectors to the device
        // before the device-side kernels consume them.
        CopyVectorH2D(y_ncol);
        CopyVectorH2D(x_ncol);
#endif
    }
    int ierr;

    double xNorm2, yNorm2;
    // Scaling constant used in the departure-from-symmetry denominator.
    // NOTE(review): presumably a bound related to the 27-point stencil
    // (27 nonzeros per row, diagonal 26) — confirm against the benchmark spec.
    double ANorm = 2 * 26.0;

    // Next, compute x'*A*y
    // NOTE(review): the return value of this norm computation is ignored,
    // unlike the checked calls below; yNorm2 = y'*y (squared norm).
    ComputeDotProduct(nrow, y_ncol, y_ncol, yNorm2, t4, A.isDotProductOptimized, A.rankType);
    ierr = ComputeSPMV(A, y_ncol, z_ncol); // z_nrow = A*y_overlap
    // NOTE(review): in every "if (ierr)" below, the "else" binds to the inner
    // "if (use_output_file)" (dangling else) — i.e. the error message goes to
    // the log file or to stdout, and only when an error actually occurred.
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to SpMV: " << ierr << ".\n" << endl;
        }
    double xtAy = 0.0;
    ierr = ComputeDotProduct(nrow, x_ncol, z_ncol, xtAy, t4, A.isDotProductOptimized, A.rankType); // x'*A*y
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to dot: " << ierr << ".\n" << endl;
        }

    // Next, compute y'*A*x
    ComputeDotProduct(nrow, x_ncol, x_ncol, xNorm2, t4, A.isDotProductOptimized, A.rankType);
    ierr = ComputeSPMV(A, x_ncol, z_ncol); // b_computed = A*x_overlap
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to SpMV: " << ierr << ".\n" << endl;
        }
    double ytAx = 0.0;
    ierr = ComputeDotProduct(nrow, y_ncol, z_ncol, ytAx, t4, A.isDotProductOptimized, A.rankType); // y'*A*x
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to dot: " << ierr << ".\n" << endl;
        }

    // For a symmetric A, x'*A*y == y'*A*x; scale the difference by the norms
    // and machine epsilon so that values <= 1 mean "within roundoff".
    testsymmetry_data.depsym_spmv = std::fabs((long double) (xtAy - ytAx))
        / ((xNorm2 * ANorm * yNorm2 + yNorm2 * ANorm * xNorm2) * (DBL_EPSILON));
    if (testsymmetry_data.depsym_spmv > 1.0)
        ++testsymmetry_data.count_fail; // If the difference is > 1, count it wrong
    if (A.geom->rank == 0)
        if (use_output_file)
        {
            HPCG_fout << "Departure from symmetry (scaled) for SpMV abs(x'*A*y - y'*A*x) = "
                      << testsymmetry_data.depsym_spmv << endl;
        }
        else
        {
            std::cout << "Departure from symmetry (scaled) for SpMV abs(x'*A*y - y'*A*x) = "
                      << testsymmetry_data.depsym_spmv << endl;
        }

    // Test symmetry of multi-grid
    // Compute x'*Minv*y
    ierr = ComputeMG(A, y_ncol, z_ncol); // z_ncol = Minv*y_ncol
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to MG: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to MG: " << ierr << ".\n" << endl;
        }
    double xtMinvy = 0.0;
    ierr = ComputeDotProduct(nrow, x_ncol, z_ncol, xtMinvy, t4, A.isDotProductOptimized, A.rankType); // x'*Minv*y
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
    // Next, compute z'*Minv*x
    ierr = ComputeMG(A, x_ncol, z_ncol); // z_ncol = Minv*x_ncol
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to MG: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to MG: " << ierr << ".\n" << endl;
        }
    double ytMinvx = 0.0;
    ierr = ComputeDotProduct(nrow, y_ncol, z_ncol, ytMinvx, t4, A.isDotProductOptimized, A.rankType); // y'*Minv*x
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
    // Same scaled symmetry measure for the preconditioner Minv.
    testsymmetry_data.depsym_mg = std::fabs((long double) (xtMinvy - ytMinvx))
        / ((xNorm2 * ANorm * yNorm2 + yNorm2 * ANorm * xNorm2) * (DBL_EPSILON));
    if (testsymmetry_data.depsym_mg > 1.0)
        ++testsymmetry_data.count_fail; // If the difference is > 1, count it wrong
    if (A.geom->rank == 0)
        if (use_output_file)
        {
            HPCG_fout << "Departure from symmetry (scaled) for MG abs(x'*Minv*y - y'*Minv*x) = "
                      << testsymmetry_data.depsym_mg << endl;
        }
        else
        {
            std::cout << "Departure from symmetry (scaled) for MG abs(x'*Minv*y - y'*Minv*x) = "
                      << testsymmetry_data.depsym_mg << endl;
        }

    CopyVector(xexact, x_ncol); // Copy exact answer into overlap vector

    // Sanity check: A*xexact should reproduce b; report the residual twice
    // to confirm the SPMV result is repeatable.
    int numberOfCalls = 2;
    double residual = 0.0;
    for (int i = 0; i < numberOfCalls; ++i)
    {
        if (A.rankType == GPU)
        {
#ifdef USE_CUDA
            CopyVectorH2D(x_ncol);
#endif
        }

        ierr = ComputeSPMV(A, x_ncol, z_ncol); // b_computed = A*x_overlap

        if (A.rankType == GPU)
        {
#ifdef USE_CUDA
            // Undo the optimized-problem permutation so the result can be
            // compared entry-wise against b on the host.
            PermVectorCuda(A.ref2opt, z_ncol, nrow);
            CopyVectorD2H(z_ncol);
#endif
        }
        else
        {
#ifdef USE_GRACE
            PermVectorCpu(A.ref2opt, z_ncol, nrow);
#endif
        }

        if (ierr)
            if (use_output_file)
            {
                HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl;
            }
            else
            {
                std::cout << "Error in call to SpMV: " << ierr << ".\n" << endl;
            }
        if ((ierr = ComputeResidual(A.localNumberOfRows, b, z_ncol, residual)))
            if (use_output_file)
            {
                HPCG_fout << "Error in call to compute_residual: " << ierr << ".\n" << endl;
            }
            else
            {
                std::cout << "Error in call to compute_residual: " << ierr << ".\n" << endl;
            }
        if (A.geom->rank == 0)
            if (use_output_file)
            {
                HPCG_fout << "SpMV call [" << i << "] Residual [" << residual << "]" << endl;
            }
            else
            {
                std::cout << "SpMV call [" << i << "] Residual [" << residual << "]" << endl;
            }
    }
    DeleteVector(x_ncol);
    DeleteVector(y_ncol);
    DeleteVector(z_ncol);

    return 0;
}
|
||||
38
src/TestSymmetry.hpp
Normal file
38
src/TestSymmetry.hpp
Normal file
@@ -0,0 +1,38 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file TestSymmetry.hpp
|
||||
|
||||
HPCG data structures for symmetry testing
|
||||
*/
|
||||
|
||||
#ifndef TESTSYMMETRY_HPP
|
||||
#define TESTSYMMETRY_HPP
|
||||
|
||||
#include "CGData.hpp"
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "hpcg.hpp"
|
||||
|
||||
// Results of the SpMV/MG symmetry checks performed by TestSymmetry().
struct TestSymmetryData_STRUCT
{
    double depsym_spmv; //!< departure from symmetry for the SPMV kernel (scaled; > 1 counts as a failure)
    double depsym_mg;   //!< departure from symmetry for the MG kernel (scaled; > 1 counts as a failure)
    int count_fail;     //!< number of failures in the symmetry tests
};
typedef struct TestSymmetryData_STRUCT TestSymmetryData;
|
||||
|
||||
extern int TestSymmetry(SparseMatrix& A, Vector& b, Vector& xexact, TestSymmetryData& testsymmetry_data);
|
||||
|
||||
#endif // TESTSYMMETRY_HPP
|
||||
240
src/Vector.hpp
Normal file
240
src/Vector.hpp
Normal file
@@ -0,0 +1,240 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file Vector.hpp
|
||||
|
||||
HPCG data structures for dense vectors
|
||||
*/
|
||||
|
||||
#ifndef VECTOR_HPP
|
||||
#define VECTOR_HPP
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <cuda_runtime.h>
|
||||
#include <omp.h>
|
||||
#include <vector>
|
||||
|
||||
#include "Geometry.hpp"
|
||||
|
||||
// A local dense vector, optionally mirrored on a CUDA device for GPU ranks.
struct Vector_STRUCT
{
    rank_type_t rt;          //!< type of the owning rank (compared against GPU elsewhere); set by InitializeVector
    local_int_t localLength; //!< length of local portion of the vector
    bool isCudaHost;         //!< true when 'values' was allocated as pinned memory via cudaMallocHost
    double* values;          //!< array of values (host side)
    /*!
      This is for storing optimized data structures created in OptimizeProblem and
      used inside optimized ComputeSPMV().
    */
    void* optimizationData;
#ifdef USE_CUDA
    double* values_d = nullptr; //!< device-side mirror of 'values' (allocated for GPU ranks only)
#endif

    bool initialized = false; //!< set by InitializeVector; asserted by routines that require an allocated vector
};
typedef struct Vector_STRUCT Vector;
|
||||
|
||||
/*!
  Initializes input vector.

  Allocates the host array — pinned (cudaMallocHost) when requested for GPU
  ranks in USE_CUDA builds, plain heap otherwise — and, for GPU ranks, a
  device-side mirror.

  @param[in] v           the vector to initialize
  @param[in] localLength Length of local portion of input vector
  @param[in] rt          rank type owning the vector (GPU ranks also get a device buffer)
  @param[in] isCudaHost  when true (with rt == GPU and USE_CUDA), use pinned host memory
*/

inline void InitializeVector(Vector& v, local_int_t localLength, rank_type_t rt, bool isCudaHost = false)
{
    v.localLength = localLength;
    v.isCudaHost = isCudaHost;
    v.rt = rt;

#ifdef USE_CUDA
    if (v.rt == GPU && v.isCudaHost)
        cudaMallocHost(&(v.values), sizeof(double) * localLength);
    else
#endif
        // Note the preprocessor trick: without USE_CUDA the if/else above
        // disappears and this allocation always runs; with USE_CUDA it is the
        // else-branch (non-pinned) allocation.
        v.values = new double[localLength];

    v.optimizationData = 0;
#ifdef USE_CUDA
    if (v.rt == GPU)
        cudaMalloc((void**) &(v.values_d), sizeof(double) * localLength);
#endif
    v.initialized = true;
    return;
}
|
||||
|
||||
/*!
  Fill the input vector with zero values.

  Zeroes the host array and, for GPU ranks, the device-side mirror as well.

  @param[inout] v - On entrance v is initialized, on exit all its values are zero.
*/

inline void ZeroVector(Vector& v)
{

    assert(v.initialized);

    local_int_t localLength = v.localLength;
    double* vv = v.values;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localLength; ++i)
        vv[i] = 0.0;
#ifdef USE_CUDA
    // Keep the device-side copy consistent with the zeroed host array.
    if (v.rt == GPU)
    {
        cudaMemset(v.values_d, 0, sizeof(double) * localLength);
    }
#endif
    return;
}
|
||||
/*!
|
||||
Multiply (scale) a specific vector entry by a given value.
|
||||
|
||||
@param[inout] v Vector to be modified
|
||||
@param[in] index Local index of entry to scale
|
||||
@param[in] value Value to scale by
|
||||
*/
|
||||
inline void ScaleVectorValue(Vector& v, local_int_t index, double value)
|
||||
{
|
||||
assert(index >= 0 && index < v.localLength);
|
||||
double* vv = v.values;
|
||||
vv[index] *= value;
|
||||
return;
|
||||
}
|
||||
/*!
  Fill the input vector with pseudo-random values in [1, 2].

  Only the host-side array is filled; callers push the data to the device
  explicitly (CopyVectorH2D) when needed.

  @param[in] v
*/
inline void FillRandomVector(Vector& v)
{
    local_int_t localLength = v.localLength;
    double* vv = v.values;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    // NOTE(review): rand() is not guaranteed thread-safe, so calling it from an
    // OpenMP parallel loop may race inside the libc RNG and yields
    // non-deterministic content across runs/thread counts — presumably
    // acceptable here since only "some random values" are required.
    for (local_int_t i = 0; i < localLength; ++i)
        vv[i] = rand() / (double) (RAND_MAX) + 1.0;
    return;
}
|
||||
|
||||
/*!
  Copy input vector to output vector.

  Copies min(v.localLength, w.localLength) entries of the host arrays and,
  when both vectors belong to GPU ranks, mirrors the copy on the device.

  @param[in] v Input vector
  @param[in] w Output vector
*/
inline void CopyVector(const Vector& v, Vector& w)
{
    // Copy only the overlapping prefix so vectors of different length are tolerated.
    local_int_t len = std::min(v.localLength, w.localLength);
    assert(v.initialized && w.initialized);
    double* vv = v.values;
    double* wv = w.values;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < len; ++i)
        wv[i] = vv[i];
#ifdef USE_CUDA
    if (v.rt == GPU && w.rt == GPU)
    {
        // Keep the device-side copies consistent with the host-side copy above.
        cudaMemcpy(w.values_d, v.values_d, sizeof(double) * len, cudaMemcpyDeviceToDevice);
    }
#endif
    return;
}
|
||||
|
||||
#ifdef USE_CUDA
|
||||
inline void CopyVectorD2H(const Vector& v)
|
||||
{
|
||||
local_int_t localLength = v.localLength;
|
||||
cudaMemcpy(v.values, v.values_d, sizeof(double) * localLength, cudaMemcpyDeviceToHost);
|
||||
return;
|
||||
}
|
||||
|
||||
inline void CopyVectorD2D(const Vector& v, Vector& w)
|
||||
{
|
||||
local_int_t localLength = v.localLength;
|
||||
cudaMemcpy(w.values_d, v.values_d, sizeof(double) * localLength, cudaMemcpyDeviceToDevice);
|
||||
return;
|
||||
}
|
||||
|
||||
inline void CopyVectorH2D(const Vector& v)
|
||||
{
|
||||
local_int_t localLength = v.localLength;
|
||||
cudaMemcpy(v.values_d, v.values, sizeof(double) * localLength, cudaMemcpyHostToDevice);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
inline void CopyAndReorderVector(const Vector& v, Vector& w, local_int_t* perm)
|
||||
{
|
||||
local_int_t localLength = v.localLength;
|
||||
assert(w.localLength >= localLength);
|
||||
double* vv = v.values;
|
||||
double* wv = w.values;
|
||||
local_int_t i;
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#pragma omp parallel for
|
||||
#endif
|
||||
for (i = 0; i < localLength; ++i)
|
||||
{
|
||||
wv[i] = vv[perm[i]];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/*!
|
||||
Deallocates the members of the data structure of the known system matrix provided they are not 0.
|
||||
|
||||
@param[in] A the known system matrix
|
||||
*/
|
||||
inline void DeleteVector(Vector& v)
|
||||
{
|
||||
if (v.isCudaHost)
|
||||
cudaFreeHost(v.values);
|
||||
else
|
||||
{
|
||||
delete[] v.values;
|
||||
}
|
||||
v.localLength = 0;
|
||||
#ifdef USE_CUDA
|
||||
if (v.values_d)
|
||||
cudaFree(v.values_d);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
#endif // VECTOR_HPP
|
||||
98
src/WriteProblem.cpp
Normal file
98
src/WriteProblem.cpp
Normal file
@@ -0,0 +1,98 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file WriteProblem.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#include "WriteProblem.hpp"
|
||||
#include <cstdio>
|
||||
|
||||
/*!
|
||||
Routine to dump:
|
||||
- matrix in row, col, val format for analysis with MATLAB
|
||||
- x, xexact, b as simple arrays of numbers.
|
||||
|
||||
Writes to A.dat, x.dat, xexact.dat and b.dat, respectivly.
|
||||
|
||||
NOTE: THIS CODE ONLY WORKS ON SINGLE PROCESSOR RUNS
|
||||
|
||||
Read into MATLAB using:
|
||||
|
||||
load A.dat
|
||||
A=spconvert(A);
|
||||
load x.dat
|
||||
load xexact.dat
|
||||
load b.dat
|
||||
|
||||
@param[in] geom The description of the problem's geometry.
|
||||
@param[in] A The known system matrix
|
||||
@param[in] b The known right hand side vector
|
||||
@param[in] x The solution vector computed by CG iteration
|
||||
@param[in] xexact Generated exact solution
|
||||
|
||||
@return Returns with -1 if used with more than one MPI process. Returns with 0 otherwise.
|
||||
|
||||
@see GenerateProblem
|
||||
*/
|
||||
int WriteProblem(const Geometry& geom, const SparseMatrix& A, const Vector b, const Vector x, const Vector xexact)
{

    if (geom.size != 1)
        return -1; // TODO Only works on one processor. Need better error handler
    const global_int_t nrow = A.totalNumberOfRows;

    // Open all four output files up front; fail atomically if any open fails.
    FILE *fA = 0, *fx = 0, *fxexact = 0, *fb = 0;
    fA = fopen("A.dat", "w");
    fx = fopen("x.dat", "w");
    fxexact = fopen("xexact.dat", "w");
    fb = fopen("b.dat", "w");

    if (!fA || !fx || !fxexact || !fb)
    {
        // Close whichever files did open before bailing out.
        if (fb)
            fclose(fb);
        if (fxexact)
            fclose(fxexact);
        if (fx)
            fclose(fx);
        if (fA)
            fclose(fA);
        return -1;
    }

    for (global_int_t i = 0; i < nrow; i++)
    {
        const double* const currentRowValues = A.matrixValues[i];
        const global_int_t* const currentRowIndices = A.mtxIndG[i];
        const int currentNumberOfNonzeros = A.nonzerosInRow[i];
        // Matrix rows are written 1-based as "row col value" (spconvert format).
        // The format string must match global_int_t's width, hence the #ifdef.
        for (int j = 0; j < currentNumberOfNonzeros; j++)
#ifdef HPCG_NO_LONG_LONG
            fprintf(fA, " %d %d %22.16e\n", i + 1, (global_int_t) (currentRowIndices[j] + 1), currentRowValues[j]);
#else
            fprintf(fA, " %lld %lld %22.16e\n", i + 1, (global_int_t) (currentRowIndices[j] + 1), currentRowValues[j]);
#endif
        // The vectors are written one value per line, full double precision.
        fprintf(fx, "%22.16e\n", x.values[i]);
        fprintf(fxexact, "%22.16e\n", xexact.values[i]);
        fprintf(fb, "%22.16e\n", b.values[i]);
    }

    fclose(fA);
    fclose(fx);
    fclose(fxexact);
    fclose(fb);
    return 0;
}
|
||||
22
src/WriteProblem.hpp
Normal file
22
src/WriteProblem.hpp
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef WRITEPROBLEM_HPP
|
||||
#define WRITEPROBLEM_HPP
|
||||
#include "Geometry.hpp"
|
||||
#include "SparseMatrix.hpp"
|
||||
#include <string>
|
||||
|
||||
int WriteProblem(const Geometry& geom, const SparseMatrix& A, const Vector b, const Vector x, const Vector xexact);
|
||||
#endif // WRITEPROBLEM_HPP
|
||||
107
src/YAML_Doc.cpp
Normal file
107
src/YAML_Doc.cpp
Normal file
@@ -0,0 +1,107 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "YAML_Doc.hpp"
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
using namespace std;
|
||||
|
||||
/*!
|
||||
Sets the application name and version which will become part of the YAML doc.
|
||||
|
||||
@param[in] miniApp_Name application name
|
||||
@param[in] miniApp_Version application name
|
||||
@param[in] destination_Directory destination directory for the YAML document
|
||||
@param[in] destination_FileName file name for the YAML document
|
||||
*/
|
||||
YAML_Doc::YAML_Doc(const std::string& miniApp_Name, const std::string& miniApp_Version,
|
||||
const std::string& destination_Directory, const std::string& destination_FileName)
|
||||
{
|
||||
miniAppName = miniApp_Name;
|
||||
miniAppVersion = miniApp_Version;
|
||||
destinationDirectory = destination_Directory;
|
||||
destinationFileName = destination_FileName;
|
||||
}
|
||||
|
||||
// inherits the destructor from YAML_Element
|
||||
YAML_Doc::~YAML_Doc(void) {}
|
||||
|
||||
/*!
|
||||
Generates YAML from the elements of the document and saves it to a file.
|
||||
|
||||
@return returns the complete YAML document as a string
|
||||
*/
|
||||
string YAML_Doc::generateYAML()
|
||||
{
|
||||
string yaml;
|
||||
|
||||
yaml = yaml + miniAppName + " version: " + miniAppVersion + "\n";
|
||||
|
||||
for (size_t i = 0; i < children.size(); i++)
|
||||
{
|
||||
yaml = yaml + children[i]->printYAML("");
|
||||
}
|
||||
|
||||
time_t rawtime;
|
||||
tm* ptm;
|
||||
time(&rawtime);
|
||||
ptm = localtime(&rawtime);
|
||||
char sdate[64];
|
||||
// use tm_mon+1 because tm_mon is 0 .. 11 instead of 1 .. 12
|
||||
sprintf(sdate, "%04d.%02d.%02d.%02d.%02d.%02d", ptm->tm_year + 1900, ptm->tm_mon + 1, ptm->tm_mday, ptm->tm_hour,
|
||||
ptm->tm_min, ptm->tm_sec);
|
||||
|
||||
string filename;
|
||||
if (destinationFileName == "")
|
||||
filename = miniAppName + "-" + miniAppVersion + "_";
|
||||
else
|
||||
filename = destinationFileName;
|
||||
filename = filename + string(sdate) + ".yaml";
|
||||
if (destinationDirectory != "" && destinationDirectory != ".")
|
||||
{
|
||||
string mkdir_cmd = "mkdir " + destinationDirectory;
|
||||
int result = system(mkdir_cmd.c_str());
|
||||
assert(result == 0);
|
||||
filename = destinationDirectory + "/" + destinationFileName;
|
||||
}
|
||||
else
|
||||
filename = "./" + filename;
|
||||
|
||||
ofstream myfile;
|
||||
myfile.open(filename.c_str());
|
||||
myfile << yaml;
|
||||
myfile.close();
|
||||
return yaml;
|
||||
}
|
||||
117
src/YAML_Doc.hpp
Normal file
117
src/YAML_Doc.hpp
Normal file
@@ -0,0 +1,117 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file YAML_Doc.hpp
|
||||
|
||||
HPCG YAML classes
|
||||
*/
|
||||
|
||||
// Changelog
|
||||
//
|
||||
// Version 0.1
|
||||
// - Initial version.
|
||||
//
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef YAML_DOC_HPP
|
||||
#define YAML_DOC_HPP
|
||||
#include "YAML_Element.hpp"
|
||||
#include <string>
|
||||
|
||||
//! The YAML_Doc class for the uniform collecting and reporting of performance data for HPCG
|
||||
|
||||
/*!
|
||||
|
||||
The YAML_Doc class works in conjunction with the YAML_Element class to facilitate easy collecting and reporting of
|
||||
YAML-formatted data that can be then registered with the HPCG results collection website.
|
||||
|
||||
\code
|
||||
|
||||
//EXAMPLE CODE FOR GENERATING YAML
|
||||
|
||||
YAML_Doc doc("hpcg","0.1");
|
||||
doc.add("final_residual",1.4523e-13);
|
||||
doc.add("time","4.893");
|
||||
|
||||
//note: the following line will remove the data (4.890) associated with "time"
|
||||
doc.get("time")->add("total",4.243);
|
||||
|
||||
//note: the following line will likewise remove the data (1.243) associated with "time"
|
||||
doc.get("time")->get("total")->add("time",2.457);
|
||||
doc.get("time")->get("total")->add("flops",4.88e5);
|
||||
doc.get("time")->add("ddot",1.243);
|
||||
doc.get("time")->add("sparsemv","");
|
||||
doc.get("time")->get("sparsemv")->add("time",0.3445);
|
||||
doc.get("time")->get("sparsemv")->add("overhead","");
|
||||
doc.get("time")->get("sparsemv")->get("overhead")->add("time",0.0123);
|
||||
doc.get("time")->get("sparsemv")->get("overhead")->add("percentage",0.034);
|
||||
cout << doc.generateYAML() << endl;
|
||||
return 0;
|
||||
|
||||
\endcode
|
||||
|
||||
Below is the output generated by the above code:
|
||||
|
||||
\verbatim
|
||||
|
||||
final_residual: 1.4523e-13
|
||||
time:
|
||||
total:
|
||||
time: 2.457
|
||||
flops: 4.88e5
|
||||
ddot: 1.243
|
||||
sparsemv:
|
||||
time: 0.3445
|
||||
overhead:
|
||||
time: 0.0123
|
||||
percentage: 0.034
|
||||
|
||||
\endverbatim
|
||||
|
||||
\note {No value is allowed to be attached to a key that has children. If children are added to a key, the value is
|
||||
simply set to "".}
|
||||
|
||||
*/
|
||||
//! Collects key/value performance data and reports it as a YAML document (to stdout and a .yaml file).
class YAML_Doc : public YAML_Element
{
public:
    //! Constructor: accepts mini-application name and version as strings, optionally accepts directory and file name
    //! for printing results.
    /*!
      The sole constructor for this class accepts a name and version number for the mini-application as well as
      optional directory and file name information for results that are generated by the generateYAML() method.
      \param miniApp_Name (in) string containing the name of the mini-application
      \param miniApp_Version (in) string containing the version of the mini-application
      \param destination_Directory (in, optional) path of the directory where the results file will be stored,
             relative to the current working directory. If this value is not supplied, the results file is stored
             in the current working directory. If the directory does not exist it will be created.
      \param destination_FileName (in, optional) root name of the results file. A suffix of ".yaml" is automatically
             appended. If no file name is specified the filename is constructed by concatenating
             miniAppName + miniAppVersion + ".yaml".
    */
    YAML_Doc(const std::string& miniApp_Name, const std::string& miniApp_Version,
        const std::string& destination_Directory = "", const std::string& destination_FileName = "");
    //! Destructor
    ~YAML_Doc();
    //! Generate YAML results to standard out and to a file using the specified directory and filename, using the
    //! current directory and miniAppName + miniAppVersion + ".yaml" by default.
    std::string generateYAML();

protected:
    std::string miniAppName;          //!< the name of the application that generated the YAML output
    std::string miniAppVersion;       //!< the version of the application that generated the YAML output
    std::string destinationDirectory; //!< the destination directory for the generated YAML output
    std::string destinationFileName;  //!< the filename for the generated YAML output
};
|
||||
#endif // YAML_DOC_HPP
|
||||
220
src/YAML_Element.cpp
Normal file
220
src/YAML_Element.cpp
Normal file
@@ -0,0 +1,220 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file YAML_Element.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
#include "YAML_Element.hpp"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
using namespace std;
|
||||
//! Construct an element that holds the given key/value pair and no children.
YAML_Element::YAML_Element(const std::string& key_arg, const std::string& value_arg)
    : key(key_arg)
    , value(value_arg)
{
}
|
||||
|
||||
/*!
  Destructor: releases every child element owned by this node.

  Children are heap-allocated by the add() overloads and owned exclusively
  by their parent, so they are deleted (recursively) here.
*/
YAML_Element::~YAML_Element()
{
    for (YAML_Element* child : children)
        delete child;
    children.clear();
}
|
||||
|
||||
/*!
|
||||
Add an element to the vector
|
||||
QUESTION: if an element is not added because the key already exists,
|
||||
will this lead to memory leakage?
|
||||
|
||||
@param[in] key_arg The key under which the element is stored
|
||||
@param[in] value_arg The value of the element
|
||||
|
||||
@return Returns the added element
|
||||
*/
|
||||
YAML_Element* YAML_Element::add(const std::string& key_arg, double value_arg)
|
||||
{
|
||||
this->value = "";
|
||||
string converted_value = convert_double_to_string(value_arg);
|
||||
YAML_Element* element = new YAML_Element(key_arg, converted_value);
|
||||
children.push_back(element);
|
||||
return element;
|
||||
}
|
||||
|
||||
/*!
|
||||
Add an element to the vector
|
||||
|
||||
@param[in] key_arg The key under which the element is stored
|
||||
@param[in] value_arg The value of the element
|
||||
|
||||
@return Returns the added element
|
||||
*/
|
||||
YAML_Element* YAML_Element::add(const std::string& key_arg, int value_arg)
|
||||
{
|
||||
this->value = "";
|
||||
string converted_value = convert_int_to_string(value_arg);
|
||||
YAML_Element* element = new YAML_Element(key_arg, converted_value);
|
||||
children.push_back(element);
|
||||
return element;
|
||||
}
|
||||
|
||||
#ifndef HPCG_NO_LONG_LONG
|
||||
|
||||
/*!
|
||||
Add an element to the vector
|
||||
|
||||
@param[in] key_arg The key under which the element is stored
|
||||
@param[in] value_arg The value of the element
|
||||
|
||||
@return Returns the added element
|
||||
*/
|
||||
YAML_Element* YAML_Element::add(const std::string& key_arg, long long value_arg)
|
||||
{
|
||||
this->value = "";
|
||||
string converted_value = convert_long_long_to_string(value_arg);
|
||||
YAML_Element* element = new YAML_Element(key_arg, converted_value);
|
||||
children.push_back(element);
|
||||
return element;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*!
|
||||
Add an element to the vector
|
||||
|
||||
@param[in] key_arg The key under which the element is stored
|
||||
@param[in] value_arg The value of the element
|
||||
|
||||
@return Returns the added element
|
||||
*/
|
||||
YAML_Element* YAML_Element::add(const std::string& key_arg, size_t value_arg)
|
||||
{
|
||||
this->value = "";
|
||||
string converted_value = convert_size_t_to_string(value_arg);
|
||||
YAML_Element* element = new YAML_Element(key_arg, converted_value);
|
||||
children.push_back(element);
|
||||
return element;
|
||||
}
|
||||
|
||||
/*!
|
||||
Add an element to the vector
|
||||
|
||||
@param[in] key_arg The key under which the element is stored
|
||||
@param[in] value_arg The value of the element
|
||||
|
||||
@return Returns the added element
|
||||
*/
|
||||
YAML_Element* YAML_Element::add(const std::string& key_arg, const std::string& value_arg)
|
||||
{
|
||||
this->value = "";
|
||||
YAML_Element* element = new YAML_Element(key_arg, value_arg);
|
||||
children.push_back(element);
|
||||
return element;
|
||||
}
|
||||
|
||||
/*!
|
||||
Returns the pointer to the YAML_Element for the given key.
|
||||
@param[in] key_arg The key under which the element was stored
|
||||
|
||||
@return If found, returns the element, otherwise returns NULL
|
||||
*/
|
||||
YAML_Element* YAML_Element::get(const std::string& key_arg)
|
||||
{
|
||||
for (size_t i = 0; i < children.size(); i++)
|
||||
{
|
||||
if (children[i]->getKey() == key_arg)
|
||||
{
|
||||
return children[i];
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*!
|
||||
Prints a line of a YAML document. Correct YAML depends on
|
||||
correct spacing; the parameter space should be the proper
|
||||
amount of space for the parent element
|
||||
|
||||
@param[in] space spacing inserted at the beginning of the line
|
||||
|
||||
@return Returns a single line of the YAML document without the leading white space
|
||||
*/
|
||||
string YAML_Element::printYAML(std::string space)
|
||||
{
|
||||
string yaml_line = space + key + ": " + value + "\n";
|
||||
for (int i = 0; i < 2; i++)
|
||||
space = space + " ";
|
||||
for (size_t i = 0; i < children.size(); i++)
|
||||
{
|
||||
yaml_line = yaml_line + children[i]->printYAML(space);
|
||||
}
|
||||
return yaml_line;
|
||||
}
|
||||
|
||||
/*!
|
||||
Converts a double precision value to a string.
|
||||
|
||||
@param[in] value_arg The value to be converted.
|
||||
*/
|
||||
string YAML_Element::convert_double_to_string(double value_arg)
|
||||
{
|
||||
stringstream strm;
|
||||
strm << value_arg;
|
||||
return strm.str();
|
||||
}
|
||||
|
||||
/*!
|
||||
Converts a integer value to a string.
|
||||
|
||||
@param[in] value_arg The value to be converted.
|
||||
*/
|
||||
string YAML_Element::convert_int_to_string(int value_arg)
|
||||
{
|
||||
stringstream strm;
|
||||
strm << value_arg;
|
||||
return strm.str();
|
||||
}
|
||||
|
||||
#ifndef HPCG_NO_LONG_LONG
|
||||
|
||||
/*!
|
||||
Converts a "long long" integer value to a string.
|
||||
|
||||
@param[in] value_arg The value to be converted.
|
||||
*/
|
||||
string YAML_Element::convert_long_long_to_string(long long value_arg)
|
||||
{
|
||||
stringstream strm;
|
||||
strm << value_arg;
|
||||
return strm.str();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*!
|
||||
Converts a "size_t" integer value to a string.
|
||||
|
||||
@param[in] value_arg The value to be converted.
|
||||
*/
|
||||
string YAML_Element::convert_size_t_to_string(size_t value_arg)
|
||||
{
|
||||
stringstream strm;
|
||||
strm << value_arg;
|
||||
return strm.str();
|
||||
}
|
||||
87
src/YAML_Element.hpp
Normal file
87
src/YAML_Element.hpp
Normal file
@@ -0,0 +1,87 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*!
|
||||
@file YAML_Element.hpp
|
||||
|
||||
HPCG data structures for YAML output
|
||||
*/
|
||||
|
||||
// Changelog
|
||||
//
|
||||
// Version 0.1
|
||||
// - Initial version.
|
||||
//
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef YAML_ELEMENT_HPP
|
||||
#define YAML_ELEMENT_HPP
|
||||
#include "Geometry.hpp"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
//! HPCG YAML_Element class, from the HPCG YAML_Element class for registering key-value pairs of performance data
|
||||
|
||||
/*!
|
||||
HPCG generates a collection of performance data for each run of the executable. YAML_Element, and
|
||||
the related YAML_Doc class, provide a uniform facility for gathering and reporting this data using the YAML text
|
||||
format.
|
||||
*/
|
||||
//! HPCG YAML_Element class, for registering key-value pairs of performance data
/*!
  A YAML_Element is one node of the YAML report tree: it stores a key, an
  optional scalar value, and an owned list of child elements.

  Ownership: children are heap-allocated by the add() overloads and deleted
  by the destructor. NOTE(review): the class declares no copy constructor or
  copy assignment, so copying an element would double-delete its children —
  do not copy instances of this class.
*/
class YAML_Element
{
public:
    //! Default constructor: creates an element with empty key and value.
    YAML_Element()
    {
        key = "";
        value = "";
    }
    //! Construct with a known key-value pair.
    YAML_Element(const std::string& key_arg, const std::string& value_arg);
    //! Destructor: deletes all child elements owned by this node.
    ~YAML_Element();
    //! Key accessor method.
    std::string getKey()
    {
        return key;
    }
    //! Add a child element to the list associated with this element, value of type double.
    YAML_Element* add(const std::string& key_arg, double value_arg);
    //! Add a child element to the list associated with this element, value of type int.
    YAML_Element* add(const std::string& key_arg, int value_arg);
#ifndef HPCG_NO_LONG_LONG
    //! Add a child element to the list associated with this element, value of type long long.
    YAML_Element* add(const std::string& key_arg, long long value_arg);
#endif
    //! Add a child element to the list associated with this element, value of type size_t.
    YAML_Element* add(const std::string& key_arg, size_t value_arg);
    //! Add a child element to the list associated with this element, value of type string.
    YAML_Element* add(const std::string& key_arg, const std::string& value_arg);
    //! Get the first direct child stored under the given key (no recursion); null when absent.
    YAML_Element* get(const std::string& key_arg);
    //! Render this element and its subtree as YAML, using 'space' as the inherited indentation.
    std::string printYAML(std::string space);

protected:
    std::string key;                     //!< the key under which the element is stored
    std::string value;                   //!< the value of the stored element (reset to "" once children are added)
    std::vector<YAML_Element*> children; //!< owned child elements of this element

private:
    // Textual conversions used by the numeric add() overloads.
    std::string convert_double_to_string(double value_arg);
    std::string convert_int_to_string(int value_arg);
#ifndef HPCG_NO_LONG_LONG
    std::string convert_long_long_to_string(long long value_arg);
#endif
    std::string convert_size_t_to_string(size_t value_arg);
};
|
||||
#endif // YAML_ELEMENT_HPP
|
||||
49
src/finalize.cpp
Normal file
49
src/finalize.cpp
Normal file
@@ -0,0 +1,49 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <fstream>
|
||||
|
||||
#include "hpcg.hpp"
|
||||
|
||||
extern int use_output_file;
|
||||
/*!
|
||||
Closes the I/O stream used for logging information throughout the HPCG run.
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@see HPCG_Init
|
||||
*/
|
||||
int HPCG_Finalize(void)
{
    // HPCG_fout is the global logging stream (see src/init.cpp); it is only
    // opened when file output was requested via HPCG_USE_OUTPUT_FILE, so it
    // is only closed in that case.
    if (use_output_file)
        HPCG_fout.close();
    return 0; // always reports success
}
|
||||
150
src/hpcg.hpp
Normal file
150
src/hpcg.hpp
Normal file
@@ -0,0 +1,150 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file hpcg.hpp
|
||||
|
||||
HPCG data structures and functions
|
||||
*/
|
||||
|
||||
/*
|
||||
Hitory:
|
||||
*05.28.2023: HPC-Benchmark 23.5 release
|
||||
*/
|
||||
|
||||
#ifndef HPCG_HPP
|
||||
#define HPCG_HPP
|
||||
|
||||
#include "Geometry.hpp"
|
||||
#include <fstream>
|
||||
|
||||
#ifndef USE_CUDA
|
||||
#if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64) || defined(__amd64__) || defined(__amd64) \
|
||||
|| defined(_M_X64)
|
||||
#define USE_CUDA
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef USE_CUDA
|
||||
#include "Cuda.hpp"
|
||||
#endif
|
||||
|
||||
#define XSTR(s) STR(s)
|
||||
#define STR(s) #s
|
||||
|
||||
#define EMPTY_MACRO_ 1
|
||||
#define CHECK_EMPTY_MACRO_(x) EMPTY_MACRO_##x
|
||||
#define CHECK_EMPTY_MACRO(x) CHECK_EMPTY_MACRO_(x)
|
||||
|
||||
#ifndef make_HPCG_VER_MAJOR
|
||||
#define HPCG_VER_MAJOR 24
|
||||
#elif CHECK_EMPTY_MACRO(make_HPCG_VER_MAJOR) == 1
|
||||
#define HPCG_VER_MAJOR 24
|
||||
#else
|
||||
#define HPCG_VER_MAJOR make_HPCG_VER_MAJOR
|
||||
#endif
|
||||
|
||||
#ifndef make_HPCG_VER_MINOR
|
||||
#define HPCG_VER_MINOR 09
|
||||
#elif CHECK_EMPTY_MACRO(make_HPCG_VER_MINOR) == 1
|
||||
#define HPCG_VER_MINOR 09
|
||||
#else
|
||||
#define HPCG_VER_MINOR make_HPCG_VER_MINOR
|
||||
#endif
|
||||
|
||||
#ifndef make_HPCG_VER_PATCH
|
||||
#define HPCG_VER_PATCH 0
|
||||
#elif CHECK_EMPTY_MACRO(make_HPCG_VER_PATCH) == 1
|
||||
#define HPCG_VER_PATCH 0
|
||||
#else
|
||||
#define HPCG_VER_PATCH make_HPCG_VER_PATCH
|
||||
#endif
|
||||
|
||||
#ifndef make_HPCG_VER_BUILD
|
||||
#define HPCG_VER_BUILD 0
|
||||
#elif CHECK_EMPTY_MACRO(make_HPCG_VER_BUILD) == 1
|
||||
#define HPCG_VER_BUILD 0
|
||||
#else
|
||||
#define HPCG_VER_BUILD make_HPCG_VER_BUILD
|
||||
#endif
|
||||
|
||||
#define HPCG_VERSION (HPCG_VER_MAJOR * 1000 + HPCG_VER_MINOR * 100 + HPCG_VER_PATCH)
|
||||
|
||||
#define VER_HEADER \
|
||||
"HPCG-NVIDIA " XSTR(HPCG_VER_MAJOR)"." XSTR(HPCG_VER_MINOR) "." XSTR(HPCG_VER_PATCH) " -- NVIDIA accelerated HPCG benchmark -- NVIDIA\n"
|
||||
|
||||
#define HPCG_LINE_MAX 256
|
||||
|
||||
extern std::ofstream HPCG_fout;
|
||||
|
||||
// Refer to src/init.cpp for possible user-defined values
|
||||
// Refer to src/init.cpp for possible user-defined values.
// NOTE(review): the original comments on nx/ny/nz and npx/npy/npz were swapped;
// HPCG_Init forces nx/ny/nz up to at least 16 grid points per subdomain, while
// npx/npy/npz come from the process-grid options (--npx/--npy/--npz).
struct HPCG_Params_STRUCT
{
    int comm_size;  //!< Number of MPI processes in MPI_COMM_WORLD
    int comm_rank;  //!< This process' MPI rank in the range [0 to comm_size - 1]
    int numThreads; //!< This process' number of threads
    local_int_t nx; //!< Number of x-direction grid points for each local subdomain
    local_int_t ny; //!< Number of y-direction grid points for each local subdomain
    local_int_t nz; //!< Number of z-direction grid points for each local subdomain
    int runningTime; //!< Number of seconds to run the timed portion of the benchmark
    int npx; //!< Number of processes in the x-direction of the 3D process grid
    int npy; //!< Number of processes in the y-direction of the 3D process grid
    int npz; //!< Number of processes in the z-direction of the 3D process grid
    int pz;  //!< Partition in the z processor dimension, default is npz
    local_int_t zl; //!< nz for processors in the z dimension with value less than pz
    local_int_t zu; //!< nz for processors in the z dimension with value greater than pz
    bool benchmark_mode;         //!< Skips running the reference code
    bool use_l2compression;      //!< Activates GPU L2 compression
    bool use_hpcg_mem_reduction; //!< Not passed as parameter. Set in main to true. Activates aggressive memory
                                 //!< reduction optimizations
    rank_type_t rank_type;       //!< Not passed as parameter. GPU or CPU
    p2p_comm_mode_t p2_mode;     //!< One of 4 methods to do p2p comm in MV and MG, refer to Geometry.hpp
    exec_mode_t exec_mode = GPUONLY; //!< Three modes supported: GPUONLY, CPUONLY, GPUCPU
    int g2c;                     //!< Related to the GPU/CPU local problem definition
    dim_3d_t diff_dim;           //!< Specifies the dim that is different for the CPU and GPU ranks
    local_problem_def_t local_problem_def; //!< Specifies how nx, ny, nz, and g2c are interpreted (4 possibilities)
    bool cpu_allowed_to_print;   //!< Not passed as parameter. Marks the single CPU rank (opposite to GPU rank)
                                 //!< that is allowed to print
    bool use_output_file;        //!< Mirrors the global variable of the same name defined in src/init.cpp and used
                                 //!< throughout the files
    local_int_t gpu_slice_size;  //!< Set from the --gss option; slice size used by GPU ranks — see src/init.cpp
    local_int_t cpu_slice_size;  //!< Set from the --css option; slice size used by CPU ranks — see src/init.cpp
};
|
||||
/*!
|
||||
HPCG_Params is a shorthand for HPCG_Params_STRUCT
|
||||
*/
|
||||
typedef HPCG_Params_STRUCT HPCG_Params;
|
||||
|
||||
extern void InitializeRanks(HPCG_Params& params);
|
||||
extern int HPCG_Init(int* argc_p, char*** argv_p, HPCG_Params& params);
|
||||
extern int HPCG_Finalize(void);
|
||||
|
||||
#endif // HPCG_HPP
|
||||
444
src/init.cpp
Normal file
444
src/init.cpp
Normal file
@@ -0,0 +1,444 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
#ifndef HPCG_NO_OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
const char* NULLDEVICE = "nul";
|
||||
#else
|
||||
const char* NULLDEVICE = "/dev/null";
|
||||
#endif
|
||||
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include "hpcg.hpp"
|
||||
|
||||
#include "ReadHpcgDat.hpp"
|
||||
|
||||
int use_output_file = 0;
|
||||
std::ofstream HPCG_fout; //!< output file stream for logging activities during HPCG run
|
||||
#if defined(USE_CUDA) && defined(USE_NCCL)
|
||||
ncclComm_t Nccl_Comm;
|
||||
#endif
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
char host_name[MPI_MAX_PROCESSOR_NAME];
|
||||
char pro_name[MPI_MAX_PROCESSOR_NAME];
|
||||
MPI_Comm proComm;
|
||||
int global_rank = 0;
|
||||
int global_total_ranks = 0;
|
||||
int program_rank = 0;
|
||||
int program_total_ranks = 0;
|
||||
int* physical_rank_dims;
|
||||
int* logical_rank_to_phys;
|
||||
int* physical_rank_dims_d;
|
||||
int* logical_rank_to_phys_d;
|
||||
#else
|
||||
char host_name[1000];
|
||||
char pro_name[1000];
|
||||
#endif
|
||||
|
||||
/*!
  Tests whether string 's' begins with 'prefix'.

  @param[in] s      the string to examine
  @param[in] prefix the prefix to look for

  @return 1 when 's' starts with 'prefix' (always true for an empty prefix), 0 otherwise
*/
static int startswith(const char* s, const char* prefix)
{
    const size_t prefix_len = strlen(prefix);
    return strncmp(s, prefix, prefix_len) == 0 ? 1 : 0;
}
|
||||
|
||||
/*!
  qsort-compatible comparator for NUL-terminated character arrays.

  @param[in] a pointer to the first string
  @param[in] b pointer to the second string

  @return negative, zero, or positive, as strcmp of the two strings
*/
int stringCmp(const void* a, const void* b)
{
    const char* lhs = static_cast<const char*>(a);
    const char* rhs = static_cast<const char*>(b);
    return strcmp(lhs, rhs);
}
|
||||
|
||||
/*!
|
||||
Initializes an HPCG run by obtaining problem parameters (from a file or
|
||||
command line) and then broadcasts them to all nodes. It also initializes
|
||||
login I/O streams that are used throughout the HPCG run. Only MPI rank 0
|
||||
performs I/O operations.
|
||||
|
||||
The function assumes that MPI has already been initialized for MPI runs.
|
||||
|
||||
@param[in] argc_p the pointer to the "argc" parameter passed to the main() function
|
||||
@param[in] argv_p the pointer to the "argv" parameter passed to the main() function
|
||||
@param[out] params the reference to the data structures that is filled the basic parameters of the run
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@see HPCG_Finalize
|
||||
*/
|
||||
|
||||
void InitializeRanks(HPCG_Params& params)
{
    // Tables of fixed-width name strings, one slot per global rank.
    char(*host_names)[MPI_MAX_PROCESSOR_NAME];
    char(*program_names)[MPI_MAX_PROCESSOR_NAME];
    MPI_Comm nodeComm;
    int n, namelen, color, local_procs;
    size_t bytes;

    int deviceCount; // NOTE(review): only assigned under USE_CUDA, but read in the
                     // GPUCPU branch below — a non-CUDA GPUCPU build would divide
                     // by an uninitialized value; confirm that combination is unsupported.
    int local_rank = 0;

    // 1) Global rank/size across all processes (CPU and GPU binaries combined).
    MPI_Comm_rank(MPI_COMM_WORLD, &global_rank);        // Global rank for CPU and GPU
    MPI_Comm_size(MPI_COMM_WORLD, &global_total_ranks); // Global number of ranks for CPU and GPU

    // Global mapping tables; allocated here, filled elsewhere (not freed in this function).
    physical_rank_dims = new int[3 * global_total_ranks];
    logical_rank_to_phys = new int[global_total_ranks];

    bytes = global_total_ranks * sizeof(char[MPI_MAX_PROCESSOR_NAME]);

    // 2) Color ranks by program identity (__FILE__ is the identity token), so that
    //    when more than one binary is executed under a single launch (e.g. one for
    //    CPU and one for GPU) each binary gets its own communicator, proComm.
    program_names = (char(*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes);
    strcpy(program_names[global_rank], __FILE__);
    // All-gather the names: one broadcast per rank fills every slot everywhere.
    for (n = 0; n < global_total_ranks; n++)
    {
        MPI_Bcast(&(program_names[n]), MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD);
    }
    // Sort so equal names are adjacent; 'color' then counts the distinct names
    // that sort before ours, giving every binary a unique split color.
    qsort(program_names, global_total_ranks, sizeof(char[MPI_MAX_PROCESSOR_NAME]), stringCmp);

    color = 0;
    for (n = 0; n < global_total_ranks; n++)
    {
        if (n > 0 && strcmp(program_names[n - 1], program_names[n]))
            color++;
        if (strcmp(__FILE__, program_names[n]) == 0)
            break;
    }

    MPI_Comm_split(MPI_COMM_WORLD, color, 0, &proComm);
    MPI_Comm_rank(proComm, &program_rank);
    MPI_Comm_size(proComm, &program_total_ranks);
    free(program_names);

    // 3) Same coloring trick on host names, within proComm, to obtain a per-node
    //    communicator; its rank (local_rank) is the node-local index used below
    //    for GPU device selection.
    MPI_Get_processor_name(host_name, &namelen); // Host name
    host_names = (char(*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes);
    strcpy(host_names[global_rank], host_name);

    for (n = 0; n < global_total_ranks; n++)
    {
        MPI_Bcast(&(host_names[n]), MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD);
    }

    qsort(host_names, global_total_ranks, sizeof(char[MPI_MAX_PROCESSOR_NAME]), stringCmp);

    color = 0;
    for (n = 0; n < global_total_ranks; n++)
    {
        if (n > 0 && strcmp(host_names[n - 1], host_names[n]))
            color++;
        if (strcmp(host_name, host_names[n]) == 0)
            break;
    }

    MPI_Comm_split(proComm, color, 0, &nodeComm);
    MPI_Comm_rank(nodeComm, &local_rank);
    MPI_Comm_size(nodeComm, &local_procs);

    free(host_names);
#ifdef USE_CUDA
    cudaGetDeviceCount(&deviceCount);
#endif

    // 4) Figure out the rank type, based on the execution mode (params.exec_mode).
    if (params.exec_mode == CPUONLY)
    {
        params.rank_type = CPU;
    }
    else if (params.exec_mode == GPUONLY)
    {
        params.rank_type = GPU;
#ifdef USE_CUDA
        cudaGetDeviceCount(&deviceCount);
        // Round-robin the node-local ranks over the visible devices.
        cudaSetDevice(local_rank % deviceCount);

        // Touch pinned memory once so the first real cudaMallocHost is not
        // charged with the one-time setup cost.
        double* t;
        cudaMallocHost((void**) (&(t)), sizeof(double));
        cudaFreeHost(t);

        if (params.p2_mode == NCCL)
        {
#ifdef USE_NCCL
            // Standard NCCL bootstrap: rank 0 creates the unique id and
            // broadcasts it, then every rank joins the communicator.
            ncclUniqueId id;
            if (global_rank == 0)
                ncclGetUniqueId(&id);
            MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
            ncclCommInitRank(&Nccl_Comm, global_total_ranks, id, global_rank);
#endif // USE_NCCL
        }

#endif // USE_CUDA
    }
    else /*CPUGPU*/
    {
        // Here we assume that a node has the same number of GPU and CPU ranks.
        // This design is rigid but it is difficult to assign ranks automatically
        // to GPUs and CPUs otherwise.
        params.cpu_allowed_to_print = false; // Enabled below for exactly one CPU rank
        int ranks_for_numa = local_procs / deviceCount;
        if (ranks_for_numa == 1)
        {
            // One rank per device means no rank is left over to act as a CPU rank.
            if (global_rank == 0)
                printf("Warning: All Ranks will be Assigned to GPUs, check the total number of ranks\n");
        }
        if (local_rank % ranks_for_numa == 0)
        {
            // First rank of each per-device group drives the GPU.
            params.rank_type = GPU;
#ifdef USE_CUDA
            cudaSetDevice(local_rank / ranks_for_numa);
            // Touch pinned memory (see note in the GPUONLY branch).
            double* t;
            cudaMallocHost((void**) (&(t)), sizeof(double));
            cudaFreeHost(t);
#endif
        }
        else
        {
            params.rank_type = CPU;
            // local_rank 1 on the first node (color == 0) is the single CPU rank
            // allowed to print.
            if (local_rank == 1 && color == 0)
            {
                params.cpu_allowed_to_print = true;
            }
        }
    }

    MPI_Barrier(MPI_COMM_WORLD);
}
|
||||
|
||||
int HPCG_Init(int* argc_p, char*** argv_p, HPCG_Params& params)
|
||||
{
|
||||
int argc = *argc_p;
|
||||
char** argv = *argv_p;
|
||||
char fname[80];
|
||||
int i, j, *iparams;
|
||||
char cparams[][9] = {"--nx=", "--ny=", "--nz=", "--rt=", "--npx=", "--npy=", "--npz=", "--b=", "--l2cmp=", "--mr=",
|
||||
"--exm=", "--g2c=", "--ddm=", "--lpm=", "--p2p=", "--of=", "--gss=", "--css="};
|
||||
time_t rawtime;
|
||||
tm* ptm;
|
||||
const int nparams = (sizeof cparams) / (sizeof cparams[0]);
|
||||
bool broadcastParams = false; // Make true if parameters read from file.
|
||||
|
||||
const char* name = "HPCG_USE_OUTPUT_FILE";
|
||||
char* value;
|
||||
value = getenv(name);
|
||||
if (value != NULL)
|
||||
{
|
||||
use_output_file = atoi(value);
|
||||
}
|
||||
iparams = (int*) malloc(sizeof(int) * nparams);
|
||||
|
||||
// Initialize iparams
|
||||
for (i = 0; i < nparams; ++i)
|
||||
iparams[i] = 0;
|
||||
|
||||
/* for sequential and some MPI implementations it's OK to read first three args */
|
||||
for (i = 0; i < nparams; ++i)
|
||||
if (argc <= i + 1 || sscanf(argv[i + 1], "%d", iparams + i) != 1 || iparams[i] < 11)
|
||||
iparams[i] = 0;
|
||||
|
||||
/* for some MPI environments, command line arguments may get complicated so we need a prefix */
|
||||
for (i = 1; i <= argc && argv[i]; ++i)
|
||||
for (j = 0; j < nparams; ++j)
|
||||
if (startswith(argv[i], cparams[j]))
|
||||
if (sscanf(argv[i] + strlen(cparams[j]), "%d", iparams + j) != 1)
|
||||
iparams[j] = 0;
|
||||
|
||||
// Check if --rt was specified on the command line
|
||||
int* rt = iparams + 3; // Assume runtime was not specified and will be read from the hpcg.dat file
|
||||
if (iparams[3])
|
||||
rt = 0; // If --rt was specified, we already have the runtime, so don't read it from file
|
||||
if (!iparams[0] && !iparams[1] && !iparams[2])
|
||||
{ /* no geometry arguments on the command line */
|
||||
char HPCG_DAT_FILE[HPCG_LINE_MAX];
|
||||
if (argc > 1)
|
||||
{
|
||||
strcpy(HPCG_DAT_FILE, argv[1]);
|
||||
}
|
||||
else
|
||||
{
|
||||
strcpy(HPCG_DAT_FILE, "./hpcg.dat");
|
||||
}
|
||||
if (ReadHpcgDat(iparams, rt, iparams + 7, HPCG_DAT_FILE) == -1)
|
||||
{
|
||||
printf("No input data. Possible options:\n");
|
||||
fflush(0);
|
||||
printf("\t1) Specify path to input file: ./xhpcg <path to *.dat file>\n");
|
||||
printf("\t2) Copy hpcg.dat to the run directory\n");
|
||||
printf("\t3) Use command line parameters: ./xhpcg --nx <x> --ny <y> --nz <z> --rt <t>\n");
|
||||
exit(-1);
|
||||
}
|
||||
broadcastParams = true;
|
||||
}
|
||||
|
||||
// Check for small or unspecified nx, ny, nz values
|
||||
// If any dimension is less than 16, make it the max over the other two dimensions, or 16, whichever is largest
|
||||
for (i = 0; i < 3; ++i)
|
||||
{
|
||||
if (iparams[i] < 16)
|
||||
for (j = 1; j <= 2; ++j)
|
||||
if (iparams[(i + j) % 3] > iparams[i])
|
||||
iparams[i] = iparams[(i + j) % 3];
|
||||
if (iparams[i] < 16)
|
||||
iparams[i] = 16;
|
||||
}
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, ¶ms.comm_rank);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, ¶ms.comm_size);
|
||||
#else
|
||||
params.comm_rank = 0;
|
||||
params.comm_size = 1;
|
||||
#endif
|
||||
|
||||
// Broadcast values of iparams to all MPI processes
|
||||
#ifndef HPCG_NO_MPI
|
||||
if (broadcastParams)
|
||||
{
|
||||
MPI_Bcast(iparams, nparams, MPI_INT, 0, MPI_COMM_WORLD);
|
||||
}
|
||||
#endif
|
||||
|
||||
params.nx = iparams[0];
|
||||
params.ny = iparams[1];
|
||||
params.nz = iparams[2];
|
||||
|
||||
params.runningTime = iparams[3];
|
||||
|
||||
params.npx = iparams[4];
|
||||
params.npy = iparams[5];
|
||||
params.npz = iparams[6];
|
||||
|
||||
params.benchmark_mode = iparams[7] > 0;
|
||||
params.use_l2compression = iparams[8] > 0;
|
||||
params.use_hpcg_mem_reduction = iparams[9] > 0;
|
||||
|
||||
/* 0: CPU only | 1: GPU only | 2: GPUCPU */
|
||||
params.exec_mode = iparams[10] == 2 ? GPUCPU : (iparams[10] == 1 ? CPUONLY : GPUONLY);
|
||||
params.g2c = iparams[11] == 0 ? 1 : iparams[11];
|
||||
|
||||
/* 0: NONE | 1: X | 1: Y | 2: Z */
|
||||
params.diff_dim = iparams[12] == 3 ? Z : (iparams[12] == 2 ? Y : (iparams[12] == 1 ? X : NONE));
|
||||
|
||||
// GPU_RATIO=0/*NX, NY, NZ are local to GPU and g2c is a ratio*/
|
||||
// GPU_ABS=1/*NX, NY, NZ are local to GPU and g2c is absolute dimension size*/,
|
||||
// GPU_CPU_RATIO=2/*NX, NY, NZ are local to GPU+CPU and g2c is ratio*/,
|
||||
// GPU_CPU_ABS=3/*NX, NY, NZ are local to GPU+CPU and g2c is absolute dimension size*/
|
||||
if (iparams[13] == 1)
|
||||
params.local_problem_def = GPU_ABS;
|
||||
else if (iparams[13] == 2)
|
||||
params.local_problem_def = GPU_CPU_RATIO;
|
||||
else if (iparams[13] == 3)
|
||||
params.local_problem_def = GPU_CPU_ABS;
|
||||
else
|
||||
params.local_problem_def = GPU_RATIO;
|
||||
|
||||
// P2P Communication method
|
||||
if (iparams[14] == 1)
|
||||
params.p2_mode = MPI_CPU_All2allv;
|
||||
else if (iparams[14] == 2)
|
||||
params.p2_mode = MPI_CUDA_AWARE;
|
||||
else if (iparams[14] == 3)
|
||||
params.p2_mode = MPI_GPU_All2allv;
|
||||
else if (iparams[14] == 4)
|
||||
params.p2_mode = NCCL;
|
||||
else
|
||||
params.p2_mode = MPI_CPU;
|
||||
|
||||
if (iparams[15] == 1)
|
||||
{
|
||||
params.use_output_file = 1;
|
||||
use_output_file = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
params.use_output_file = 0;
|
||||
use_output_file = 0;
|
||||
}
|
||||
|
||||
// --gss
|
||||
params.gpu_slice_size = iparams[16] > 0 ? iparams[16] : 4096;
|
||||
|
||||
// --css
|
||||
params.cpu_slice_size = iparams[17] > 0 ? iparams[17] : 8;
|
||||
|
||||
if (params.comm_rank == 0)
|
||||
{
|
||||
printf("%s", VER_HEADER);
|
||||
}
|
||||
|
||||
#ifdef HPCG_NO_OPENMP
|
||||
params.numThreads = 1;
|
||||
#else
|
||||
#pragma omp parallel
|
||||
#pragma omp single
|
||||
params.numThreads = omp_get_num_threads();
|
||||
#endif
|
||||
|
||||
time(&rawtime);
|
||||
ptm = localtime(&rawtime);
|
||||
sprintf(fname, "hpcg%04d%02d%02dT%02d%02d%02d.txt", 1900 + ptm->tm_year, ptm->tm_mon + 1, ptm->tm_mday,
|
||||
ptm->tm_hour, ptm->tm_min, ptm->tm_sec);
|
||||
|
||||
if (use_output_file)
|
||||
{
|
||||
if (0 == params.comm_rank)
|
||||
{
|
||||
HPCG_fout.open(fname);
|
||||
}
|
||||
else
|
||||
{
|
||||
#if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG)
|
||||
sprintf(fname, "hpcg%04d%02d%02dT%02d%02d%02d_%d.txt", 1900 + ptm->tm_year, ptm->tm_mon + 1, ptm->tm_mday,
|
||||
ptm->tm_hour, ptm->tm_min, ptm->tm_sec, params.comm_rank);
|
||||
HPCG_fout.open(fname);
|
||||
#else
|
||||
HPCG_fout.open(NULLDEVICE);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
free(iparams);
|
||||
|
||||
return 0;
|
||||
}
|
||||
878
src/main.cpp
Normal file
878
src/main.cpp
Normal file
@@ -0,0 +1,878 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file main.cpp
|
||||
|
||||
- All enums are in Geometry.hpp
|
||||
- Supports GPU-only, Grace-only, and GPU-Grace. GPU and Grace are different MPI ranks.
|
||||
- The dimensions of GPU rank and CPU rank can only differ in one dimension (nx, ny, or nz).
|
||||
- Parameters are explained in bin/RUNNING-*
|
||||
*/
|
||||
|
||||
// Main routine of a program that calls the HPCG conjugate gradient
|
||||
// solver to solve the problem, and then prints results.
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#ifdef USE_GRACE
|
||||
#include <nvpl_sparse.h>
|
||||
#endif
|
||||
|
||||
#include "CG.hpp"
|
||||
#include "CGData.hpp"
|
||||
#include "CG_ref.hpp"
|
||||
#include "CheckAspectRatio.hpp"
|
||||
#include "CheckProblem.hpp"
|
||||
#include "ComputeMG_ref.hpp"
|
||||
#include "ComputeResidual.hpp"
|
||||
#include "ComputeSPMV_ref.hpp"
|
||||
#include "CpuKernels.hpp"
|
||||
#include "CudaKernels.hpp"
|
||||
#include "ExchangeHalo.hpp"
|
||||
#include "GenerateCoarseProblem.hpp"
|
||||
#include "GenerateGeometry.hpp"
|
||||
#include "GenerateProblem.hpp"
|
||||
#include "Geometry.hpp"
|
||||
#include "OptimizeProblem.hpp"
|
||||
#include "ReportResults.hpp"
|
||||
#include "SetupHalo.hpp"
|
||||
#include "SparseMatrix.hpp"
|
||||
#include "TestCG.hpp"
|
||||
#include "TestNorms.hpp"
|
||||
#include "TestSymmetry.hpp"
|
||||
#include "Vector.hpp"
|
||||
#include "WriteProblem.hpp"
|
||||
#include "hpcg.hpp"
|
||||
#include "mytimer.hpp"
|
||||
|
||||
#ifdef HPCG_DETAILED_DEBUG
|
||||
using std::cin;
|
||||
#endif
|
||||
using std::endl;
|
||||
|
||||
// Prints in a file or terminal
|
||||
extern int use_output_file;
|
||||
|
||||
#ifdef USE_CUDA
|
||||
cusparseHandle_t cusparsehandle;
|
||||
cublasHandle_t cublashandle;
|
||||
cudaStream_t stream;
|
||||
cudaEvent_t copy_done;
|
||||
cudaStream_t copy_stream;
|
||||
int* ranktoId;
|
||||
#endif
|
||||
|
||||
#ifdef USE_GRACE
|
||||
nvpl_sparse_handle_t nvpl_sparse_handle;
|
||||
#endif
|
||||
|
||||
// The communication mode used to send point-to-point messages
|
||||
#ifndef HPCG_NO_MPI
|
||||
p2p_comm_mode_t P2P_Mode;
|
||||
#endif
|
||||
|
||||
// USE CUDA L2 compression
|
||||
bool Use_Compression;
|
||||
|
||||
// USE HPCG aggresive memory reduction
|
||||
bool Use_Hpcg_Mem_Reduction;
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
// Used to find ranks for CPU and GPU programs
|
||||
int* rankToId_h;
|
||||
int* idToRank_h;
|
||||
extern int* physical_rank_dims;
|
||||
extern int* logical_rank_to_phys;
|
||||
#endif
|
||||
|
||||
/*!
|
||||
Main driver program: Construct synthetic problem, run V&V tests, compute benchmark parameters, run benchmark, report
|
||||
results.
|
||||
|
||||
@param[in] argc Standard argument count. Should equal 1 (no arguments passed in) or 4 (nx, ny, nz passed in)
|
||||
@param[in] argv Standard argument array. If argc==1, argv is unused. If argc==4, argv[1], argv[2], argv[3] will be
|
||||
interpreted as nx, ny, nz, resp.
|
||||
|
||||
@return Returns zero on success and a non-zero value otherwise.
|
||||
|
||||
*/
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
#ifndef HPCG_NO_MPI
|
||||
MPI_Init(&argc, &argv);
|
||||
#endif
|
||||
|
||||
// Here I read all the parameters, including the execution mode (CPUONLY, GPUONLY, GPUCPU)
|
||||
HPCG_Params params;
|
||||
HPCG_Init(&argc, &argv, params);
|
||||
bool quickPath = (params.runningTime == 0);
|
||||
int size = params.comm_size, rank = params.comm_rank; // Number of MPI processes, My process ID
|
||||
|
||||
bool benchmark_mode = params.benchmark_mode;
|
||||
Use_Compression = params.use_l2compression;
|
||||
Use_Hpcg_Mem_Reduction = true; // params.use_hpcg_mem_reduction;
|
||||
P2P_Mode = params.p2_mode;
|
||||
|
||||
if (rank == 0)
|
||||
{
|
||||
printf("Build v0.6.0 \n");
|
||||
|
||||
#ifdef HPCG_ENG_VERSION
|
||||
printf("\n%s%s\n", "========================================", "========================================");
|
||||
#ifdef HPCG_COMMIT_HASH
|
||||
printf("Engineering version of HPCG-NVIDIA. Results cannot be shared with third parties\nCommit: %s\n",
|
||||
XSTR(HPCG_COMMIT_HASH));
|
||||
#else
|
||||
printf("Engineering version of HPCG-NVIDIA. Results cannot be shared with third parties\nCommit:\n");
|
||||
#endif
|
||||
printf("%s%s\n", "========================================", "========================================");
|
||||
#endif
|
||||
printf("\nStart of application (%s) ...\n",
|
||||
params.exec_mode == GPUONLY ? "GPU-Only"
|
||||
: params.exec_mode == CPUONLY ? "Grace-Only"
|
||||
: "GPU+Grace");
|
||||
|
||||
if (benchmark_mode)
|
||||
printf(" | Benchmark Mode !!!! CPU reference code is not performed \n");
|
||||
|
||||
if (params.exec_mode == GPUONLY || params.exec_mode == GPUCPU)
|
||||
if (Use_Compression)
|
||||
printf(
|
||||
" | L2 compression is activated !!!! Currently, it is not legal to submit HPCG results with L2 "
|
||||
"compression\n");
|
||||
#ifdef INDEX_64
|
||||
printf(" | Using INT64 Indexing \n");
|
||||
#endif
|
||||
}
|
||||
|
||||
// Check P2P comm mode
|
||||
// if (params.exec_mode == CPUONLY || params.exec_mode == GPUCPU)
|
||||
// {
|
||||
// #ifndef USE_GRACE
|
||||
// if (rank == 0)
|
||||
// printf(
|
||||
// "Error: HPCG was not compiled for Grace execution. USE --exm=0 for GPU-only execution or add "
|
||||
// "-DUSE_GRACE. Exiting ...\n");
|
||||
// #ifndef HPCG_NO_MPI
|
||||
// MPI_Finalize();
|
||||
// #endif
|
||||
// return 0;
|
||||
// #endif // USE_GRACE
|
||||
|
||||
bool invalid = false;
|
||||
if (P2P_Mode == NCCL)
|
||||
{
|
||||
if (rank == 0)
|
||||
printf("Invalid P2P communication mode (NCCL) for CPUs, Exiting ...\n");
|
||||
invalid = true;
|
||||
}
|
||||
if (P2P_Mode == MPI_GPU_All2allv)
|
||||
{
|
||||
if (rank == 0)
|
||||
printf("Invalid P2P communication mode (MPI GPU All2allv) for CPUs, Exiting ...\n");
|
||||
invalid = true;
|
||||
}
|
||||
if (P2P_Mode == MPI_CUDA_AWARE)
|
||||
{
|
||||
if (rank == 0)
|
||||
printf("Invalid P2P communication mode (CUDA-Aware MPI) for CPUs, Exiting ...\n");
|
||||
invalid = true;
|
||||
}
|
||||
if (invalid)
|
||||
{
|
||||
#ifndef HPCG_NO_MPI
|
||||
MPI_Finalize();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef USE_NCCL
|
||||
if (params.exec_mode == GPUONLY)
|
||||
{
|
||||
if (rank == 0)
|
||||
printf(
|
||||
"Error: HPCG was not compiled with NCCL. USE --exm=1 for Grace-only execution or add -DUSE_NCCL. "
|
||||
"Exiting ...\n");
|
||||
#ifndef HPCG_NO_MPI
|
||||
MPI_Finalize();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
#endif // USE_NCCL
|
||||
|
||||
// Check whether total number of ranks == npx*npy*npz
|
||||
auto rank_grid_size = params.npx * params.npy * params.npz;
|
||||
if (rank_grid_size > 0 && size != rank_grid_size)
|
||||
{
|
||||
if (rank == 0)
|
||||
printf("Error: Total Number of ranks != npx*npy*npz. Exiting ...\n");
|
||||
#ifndef HPCG_NO_MPI
|
||||
MPI_Finalize();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifndef USE_CUDA
|
||||
if (params.exec_mode != CPUONLY)
|
||||
{
|
||||
if (rank == 0)
|
||||
printf(
|
||||
"Error: HPCG was not compiled for GPU execution. USE --exm=1 for Grace-only execution or add "
|
||||
"-DUSE_CUDA. Exiting ...\n");
|
||||
#ifndef HPCG_NO_MPI
|
||||
MPI_Finalize();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Here, we decide the rank type
|
||||
// assign a rank to GPU and CPU
|
||||
InitializeRanks(params);
|
||||
|
||||
// Check if QuickPath option is enabled.
|
||||
// If the running time is set to zero, we minimize all paths through the program
|
||||
#ifdef HPCG_DETAILED_DEBUG
|
||||
if (size < 100 && rank == 0)
|
||||
HPCG_fout << "Process " << rank << " of " << size << " is alive with " << params.numThreads << " threads."
|
||||
<< endl;
|
||||
if (rank == 0)
|
||||
{
|
||||
char c;
|
||||
std::cout << "Press key to continue" << std::endl;
|
||||
std::cin.get(c);
|
||||
}
|
||||
#ifndef HPCG_NO_MPI
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/////////////////////////
|
||||
// Problem setup Phase //
|
||||
/////////////////////////
|
||||
#ifdef HPCG_DEBUG
|
||||
double t1 = mytimer();
|
||||
#endif
|
||||
|
||||
// Construct the geometry and linear system
|
||||
Geometry* geom = new Geometry;
|
||||
GenerateGeometry(params, geom);
|
||||
int ierr = CheckAspectRatio(0.125, geom->nx, geom->ny, geom->nz, "local problem", rank == 0);
|
||||
if (ierr)
|
||||
return ierr;
|
||||
|
||||
ierr = CheckAspectRatio(0.125, geom->npx, geom->npy, geom->npz, "process grid", rank == 0);
|
||||
if (ierr)
|
||||
return ierr;
|
||||
|
||||
// Sync All Ranks
|
||||
#ifndef HPCG_NO_MPI
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
#endif
|
||||
|
||||
// Test Library versions for cuSPARSE or NVPL Sparse
|
||||
// The two library versions have to be tested in
|
||||
// GPU or Grace ranks
|
||||
int cusparseMajor = 0, cusparseMinor = 0;
|
||||
if (params.exec_mode == GPUONLY || params.exec_mode == GPUCPU)
|
||||
{
|
||||
#ifdef USE_CUDA
|
||||
// Cusparse Version
|
||||
cusparseGetProperty(MAJOR_VERSION, &cusparseMajor);
|
||||
cusparseGetProperty(MINOR_VERSION, &cusparseMinor);
|
||||
|
||||
if (cusparseMajor < 12 || (cusparseMajor == 12 && cusparseMinor < 2))
|
||||
{
|
||||
if (rank == 0)
|
||||
printf("cuSPARSE version must be 12.2 or higher (found v%d.%d) \n", cusparseMajor, cusparseMinor);
|
||||
#ifndef HPCG_NO_MPI
|
||||
MPI_Finalize();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
int nvspMajor = 0, nvspMinor = 0, nvspPatch = 0, nvspVersion = 0;
|
||||
// if (params.exec_mode == CPUONLY || params.exec_mode == GPUCPU)
|
||||
// {
|
||||
// #ifdef USE_GRACE
|
||||
// // NVPL Sparse Version
|
||||
// nvpl_sparse_create(&(nvpl_sparse_handle));
|
||||
// nvpl_sparse_get_version(nvpl_sparse_handle, &nvspVersion);
|
||||
// nvspMajor = nvspVersion / 1000;
|
||||
// nvspMinor = (nvspVersion % 1000) / 100;
|
||||
// nvspPatch = nvspVersion % 100;
|
||||
// if (nvspMajor < 0 || (nvspMajor == 0 && nvspMinor < 2))
|
||||
// {
|
||||
// if (rank == 0)
|
||||
// printf("NVPL Sparse version must be 0.2 or higher (found v%d.%d) \n", nvspMajor, nvspMinor);
|
||||
// #ifndef HPCG_NO_MPI
|
||||
// MPI_Finalize();
|
||||
// #endif
|
||||
// return 0;
|
||||
// }
|
||||
// #endif // USE_GRACE
|
||||
// }
|
||||
|
||||
SparseMatrix A;
|
||||
Vector x_overlap, b_computed;
|
||||
Vector b, x, xexact;
|
||||
std::vector<double> times(10, 0.0);
|
||||
CGData data;
|
||||
InitializeSparseMatrix(A, geom);
|
||||
size_t cpuRefMemory = 0;
|
||||
int numberOfMgLevels = 4; // Number of levels including first
|
||||
SparseMatrix* curLevelMatrix = &A;
|
||||
if (params.rank_type == GPU)
|
||||
{
|
||||
A.rankType = GPU;
|
||||
A.slice_size = params.gpu_slice_size;
|
||||
cublasCreate(&(cublashandle));
|
||||
cusparseCreate(&(cusparsehandle));
|
||||
cudaStreamCreate(&(stream));
|
||||
cudaStreamCreate(&(copy_stream));
|
||||
cusparseSetStream(cusparsehandle, stream);
|
||||
cublasSetStream(cublashandle, stream);
|
||||
cusparseSetPointerMode(cusparsehandle, CUSPARSE_POINTER_MODE_HOST);
|
||||
cublasSetPointerMode(cublashandle, CUBLAS_POINTER_MODE_HOST);
|
||||
cudaEventCreate(©_done);
|
||||
|
||||
// Allocate GPU related data
|
||||
AllocateMemCuda(A);
|
||||
|
||||
double setup_time = mytimer();
|
||||
GenerateProblem(A, &b, &x, &xexact);
|
||||
SetupHalo(A);
|
||||
for (int level = 1; level < numberOfMgLevels; ++level)
|
||||
{
|
||||
GenerateCoarseProblem(*curLevelMatrix);
|
||||
curLevelMatrix = curLevelMatrix->Ac; // Make the just-constructed coarse grid the next level
|
||||
}
|
||||
setup_time = mytimer() - setup_time; // Capture total time of setup
|
||||
delete[] physical_rank_dims;
|
||||
delete[] logical_rank_to_phys;
|
||||
times[9] = setup_time; // Save it for reporting
|
||||
|
||||
// Copy data from Device to Host.
|
||||
// Note: exclude this from setup_time, since it is needed only for reference calls.
|
||||
cpuRefMemory = CopyDataToHostCuda(A, &b, &x, &xexact);
|
||||
|
||||
// Allocate the GPU data for optimized data structures
|
||||
AllocateMemOptCuda(A);
|
||||
}
|
||||
// else
|
||||
// {
|
||||
// #ifdef USE_GRACE
|
||||
// A.rankType = CPU;
|
||||
// A.slice_size = params.cpu_slice_size;
|
||||
// // Use this array for collecting timing information
|
||||
// double setup_time = mytimer();
|
||||
// GenerateProblem(A, &b, &x, &xexact);
|
||||
// SetupHalo(A);
|
||||
// for (int level = 1; level < numberOfMgLevels; ++level)
|
||||
// {
|
||||
// GenerateCoarseProblem(*curLevelMatrix);
|
||||
// curLevelMatrix = curLevelMatrix->Ac; // Make the just-constructed coarse grid the next level
|
||||
// }
|
||||
|
||||
// // These global buffers only needed for problem setup
|
||||
// delete[] rankToId_h;
|
||||
// delete[] idToRank_h;
|
||||
// delete[] physical_rank_dims;
|
||||
// delete[] logical_rank_to_phys;
|
||||
|
||||
// setup_time = mytimer() - setup_time; // Capture total time of setup
|
||||
// times[9] = setup_time; // Save it for reporting
|
||||
// #endif // USE_GRACE
|
||||
// }
|
||||
|
||||
curLevelMatrix = &A;
|
||||
Vector* curb = &b;
|
||||
Vector* curx = &x;
|
||||
Vector* curxexact = &xexact;
|
||||
for (int level = 0; level < numberOfMgLevels; ++level)
|
||||
{
|
||||
// Doesn't work for GPU or GPUCPU cases
|
||||
// Data would need to be transferred between CPU and GPU, which is not feasible
|
||||
if (params.exec_mode == CPUONLY)
|
||||
{
|
||||
CheckProblem(*curLevelMatrix, curb, curx, curxexact);
|
||||
//Delete mtxIndG since it is not needed anymore
|
||||
delete [] curLevelMatrix->mtxIndG[0];
|
||||
}
|
||||
curLevelMatrix = curLevelMatrix->Ac; // Make the nextcoarse grid the next level
|
||||
curb = 0; // No vectors after the top level
|
||||
curx = 0;
|
||||
curxexact = 0;
|
||||
}
|
||||
|
||||
InitializeSparseCGData(A, data);
|
||||
|
||||
////////////////////////////////////
|
||||
// Reference SpMV+MG Timing Phase //
|
||||
////////////////////////////////////
|
||||
// Call Reference SpMV and MG. Compute Optimization time as ratio of times in these routines
|
||||
local_int_t nrow = A.localNumberOfRows;
|
||||
local_int_t ncol = A.localNumberOfColumns;
|
||||
InitializeVector(x_overlap, ncol, A.rankType); // Overlapped copy of x vector
|
||||
InitializeVector(b_computed, nrow, A.rankType); // Computed RHS vector
|
||||
|
||||
// Record execution time of reference SpMV and MG kernels for reporting times
|
||||
// First load vector with random values
|
||||
FillRandomVector(x_overlap);
|
||||
|
||||
int numberOfCalls = 10;
|
||||
if (quickPath)
|
||||
numberOfCalls = 1; // QuickPath means we do on one call of each block of repetitive code
|
||||
if (!benchmark_mode)
|
||||
{
|
||||
double t_begin = mytimer();
|
||||
for (int i = 0; i < numberOfCalls; ++i)
|
||||
{
|
||||
ierr = ComputeSPMV_ref(A, x_overlap, b_computed); // b_computed = A*x_overlap
|
||||
if (ierr)
|
||||
if (use_output_file)
|
||||
{
|
||||
HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "Error in call to SpMV: " << ierr << ".\n" << endl;
|
||||
}
|
||||
ierr = ComputeMG_ref(A, b_computed, x_overlap); // b_computed = Minv*y_overlap
|
||||
if (ierr)
|
||||
if (use_output_file)
|
||||
{
|
||||
HPCG_fout << "Error in call to MG: " << ierr << ".\n" << endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "Error in call to MG: " << ierr << ".\n" << endl;
|
||||
}
|
||||
}
|
||||
times[8] = (mytimer() - t_begin) / ((double) numberOfCalls); // Total time divided by number of calls.
|
||||
#ifdef HPCG_DEBUG
|
||||
if (rank == 0)
|
||||
HPCG_fout << "Total SpMV+MG timing phase execution time in main (sec) = " << mytimer() - t1 << endl;
|
||||
#endif
|
||||
}
|
||||
|
||||
///////////////////////////////
|
||||
// Reference CG Timing Phase //
|
||||
///////////////////////////////
|
||||
|
||||
#ifdef HPCG_DEBUG
|
||||
t1 = mytimer();
|
||||
#endif
|
||||
int global_failure = 0; // assume all is well: no failures
|
||||
|
||||
int niters = 0;
|
||||
int totalNiters_ref = 0;
|
||||
double normr = 1.0;
|
||||
double normr0 = 1.0;
|
||||
int refMaxIters = 50;
|
||||
numberOfCalls = 1; // Only need to run the residual reduction analysis once
|
||||
|
||||
// Compute the residual reduction for the natural ordering and reference kernels
|
||||
std::vector<double> ref_times(9, 0.0);
|
||||
double tolerance = 0.0; // Set tolerance to zero to make all runs do maxIters iterations
|
||||
int err_count = 0;
|
||||
double refTolerance = 0.0055;
|
||||
if (!benchmark_mode)
|
||||
{
|
||||
for (int i = 0; i < numberOfCalls; ++i)
|
||||
{
|
||||
ZeroVector(x);
|
||||
ierr = CG_ref(A, data, b, x, refMaxIters, tolerance, niters, normr, normr0, &ref_times[0], true,
|
||||
i == 0); // TODO: TRUE
|
||||
if (ierr)
|
||||
++err_count; // count the number of errors in CG
|
||||
totalNiters_ref += niters;
|
||||
}
|
||||
if (rank == 0 && err_count)
|
||||
if (use_output_file)
|
||||
{
|
||||
HPCG_fout << err_count << " error(s) in call(s) to reference CG." << endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << err_count << " error(s) in call(s) to reference CG." << endl;
|
||||
}
|
||||
refTolerance = normr / normr0;
|
||||
}
|
||||
|
||||
if (params.exec_mode == GPUONLY || params.exec_mode == GPUCPU)
|
||||
{
|
||||
#ifdef USE_CUDA
|
||||
if (cusparseMajor < 12 || (cusparseMajor == 12 && cusparseMinor < 5))
|
||||
{
|
||||
// Test for the coarsest matrix
|
||||
if(A.localNumberOfRows/(8 * 8 * 8) < A.slice_size) {
|
||||
if (rank == 0)
|
||||
printf("cuSPARSE version must be 12.5 or higher (found v%d.%d) to allow a GPU slice size (%lld) larger than the matrix number of rows (%lld). Use --gss to set GPU slice size \n",
|
||||
cusparseMajor, cusparseMinor, (long long)A.slice_size, (long long)(A.localNumberOfRows/(8*8*8)));
|
||||
#ifndef HPCG_NO_MPI
|
||||
MPI_Finalize();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Call user-tunable set up function.
|
||||
double t7 = mytimer();
|
||||
size_t opt_mem = OptimizeProblem(A, data, b, x, xexact);
|
||||
t7 = mytimer() - t7;
|
||||
times[7] = t7;
|
||||
#ifdef HPCG_DEBUG
|
||||
if (rank == 0)
|
||||
std::cout << "Total problem optimize in main (sec) = " << t7 << endl;
|
||||
#endif
|
||||
|
||||
if (params.rank_type == GPU)
|
||||
{
|
||||
#ifdef USE_CUDA
|
||||
int dev;
|
||||
cudaDeviceProp props;
|
||||
CHECK_CUDART(cudaGetDevice(&dev));
|
||||
CHECK_CUDART(cudaGetDeviceProperties(&props, dev));
|
||||
size_t free_bytes, total_bytes;
|
||||
CHECK_CUDART(cudaMemGetInfo(&free_bytes, &total_bytes));
|
||||
|
||||
//Find the number of SMS
|
||||
int numSMS = props.multiProcessorCount;
|
||||
|
||||
if (rank == 0)
|
||||
printf(
|
||||
"GPU Rank Info:\n"
|
||||
" | cuSPARSE version %d.%d\n%s"
|
||||
" | Reference CPU memory = %.2f MB\n"
|
||||
" | GPU Name: '%s'\n"
|
||||
" | Number of SMs: %d\n"
|
||||
" | GPU Memory Use: %ld MB / %ld MB\n"
|
||||
" | Process Grid: %dx%dx%d\n"
|
||||
" | Local Domain: %dx%dx%d\n"
|
||||
" | Number of CPU Threads: %d\n"
|
||||
" | Slice Size: %lld\n",
|
||||
cusparseMajor, cusparseMinor, Use_Compression ? " | L2 compression is activated\n" : "",
|
||||
cpuRefMemory / 1024.0 / 1024.0, props.name, numSMS, (total_bytes - free_bytes) >> 20, total_bytes >> 20,
|
||||
A.geom->npx, A.geom->npy, A.geom->npz, (int)A.geom->nx, (int)A.geom->ny, (int)A.geom->nz, params.numThreads, (long long)A.slice_size);
|
||||
CHECK_CUDART(cudaDeviceSynchronize());
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef USE_GRACE
|
||||
cpuRefMemory = EstimateCpuRefMem(A);
|
||||
if (rank == 0 || (params.exec_mode == GPUCPU && params.cpu_allowed_to_print))
|
||||
printf(
|
||||
"CPU Rank Info:\n"
|
||||
" | NVPL Sparse version %d.%d.%d\n"
|
||||
" | Reference CPU memory = %.2f MB\n"
|
||||
" | Optimization Memory Use: %.2f MB\n"
|
||||
" | Process Grid: %dx%dx%d\n"
|
||||
" | Local Domain: %dx%dx%d\n"
|
||||
" | Number of CPU Threads: %d\n"
|
||||
" | Slice Size: %d\n",
|
||||
nvspMajor, nvspMinor, nvspPatch, cpuRefMemory / 1024.0 / 1024.0, opt_mem / 1024.0 / 1024.0, A.geom->npx,
|
||||
A.geom->npy, A.geom->npz, A.geom->nx, A.geom->ny, A.geom->nz, params.numThreads, A.slice_size);
|
||||
#endif // USE_GRACE
|
||||
}
|
||||
|
||||
#ifdef HPCG_DETAILED_DEBUG
|
||||
if (geom->size == 1)
|
||||
WriteProblem(*geom, A, b, x, xexact);
|
||||
#endif
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
//////////////////////////////
|
||||
// Validation Testing Phase //
|
||||
//////////////////////////////
|
||||
#ifdef HPCG_DEBUG
|
||||
t1 = mytimer();
|
||||
#endif
|
||||
TestCGData testcg_data;
|
||||
testcg_data.count_pass = testcg_data.count_fail = 0;
|
||||
TestCG(A, data, b, x, testcg_data);
|
||||
|
||||
TestSymmetryData testsymmetry_data;
|
||||
TestSymmetry(A, b, xexact, testsymmetry_data);
|
||||
|
||||
#ifdef HPCG_DEBUG
|
||||
if (rank == 0)
|
||||
HPCG_fout << "Total validation (TestCG and TestSymmetry) execution time in main (sec) = " << mytimer() - t1
|
||||
<< endl;
|
||||
#endif
|
||||
|
||||
//////////////////////////////
|
||||
// Optimized CG Setup Phase //
|
||||
//////////////////////////////
|
||||
|
||||
// Need to permute the b vector
|
||||
if (A.rankType == GPU)
|
||||
{
|
||||
#ifdef USE_CUDA
|
||||
PermVectorCuda(A.opt2ref, b, A.localNumberOfRows);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef USE_GRACE
|
||||
PermVectorCpu(A.opt2ref, b, A.localNumberOfRows);
|
||||
#endif
|
||||
}
|
||||
|
||||
niters = 0;
|
||||
normr = 0.0;
|
||||
normr0 = 0.0;
|
||||
err_count = 0;
|
||||
int tolerance_failures = 0;
|
||||
|
||||
int optMaxIters = 10 * refMaxIters;
|
||||
int optNiters = refMaxIters;
|
||||
double opt_worst_time = 0.0;
|
||||
double opt_best_time = 9999999.0;
|
||||
|
||||
std::vector<double> bleh_times(9, 0.0);
|
||||
ZeroVector(x); // start x at all zeros
|
||||
ierr = CG(A, data, b, x, optMaxIters, refTolerance, niters, normr, normr0, &bleh_times[0], true, 1);
|
||||
std::vector<double> opt_times(9, 0.0);
|
||||
numberOfCalls = 1;
|
||||
|
||||
// Compute the residual reduction and residual count for the user ordering and optimized kernels.
|
||||
for (int i = 0; i < numberOfCalls; ++i)
|
||||
{
|
||||
ZeroVector(x); // start x at all zeros
|
||||
double last_cummulative_time = opt_times[0];
|
||||
ierr = CG(A, data, b, x, optMaxIters, refTolerance, niters, normr, normr0, &opt_times[0], true, 0); // TODO:
|
||||
// TRUE
|
||||
if (ierr)
|
||||
++err_count; // count the number of errors in CG
|
||||
if (normr / normr0 > refTolerance)
|
||||
++tolerance_failures; // the number of failures to reduce residual
|
||||
|
||||
// pick the largest number of iterations to guarantee convergence
|
||||
if (niters > optNiters)
|
||||
optNiters = niters;
|
||||
|
||||
double current_time = opt_times[0] - last_cummulative_time;
|
||||
if (current_time > opt_worst_time)
|
||||
opt_worst_time = current_time;
|
||||
if (current_time < opt_best_time)
|
||||
opt_best_time = current_time;
|
||||
}
|
||||
|
||||
#ifndef HPCG_NO_MPI
|
||||
// Get the absolute worst time across all MPI ranks (time in CG can be different)
|
||||
double local_opt_worst_time = opt_worst_time;
|
||||
MPI_Allreduce(&local_opt_worst_time, &opt_worst_time, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
|
||||
#endif
|
||||
|
||||
if (rank == 0 && err_count)
|
||||
if (use_output_file)
|
||||
{
|
||||
HPCG_fout << err_count << " error(s) in call(s) to optimized CG." << endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << err_count << " error(s) in call(s) to optimized CG." << endl;
|
||||
}
|
||||
if (tolerance_failures)
|
||||
{
|
||||
global_failure = 1;
|
||||
if (rank == 0)
|
||||
if (use_output_file)
|
||||
{
|
||||
HPCG_fout << "Failed to reduce the residual " << tolerance_failures << " times." << endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "Failed to reduce the residual " << tolerance_failures << " times." << endl;
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////
|
||||
// Optimized CG Timing Phase //
|
||||
///////////////////////////////
|
||||
|
||||
// Here we finally run the benchmark phase
|
||||
// The variable total_runtime is the target benchmark execution time in seconds
|
||||
|
||||
double total_runtime = params.runningTime;
|
||||
int numberOfCgSets = int(total_runtime / opt_worst_time) + 1; // Run at least once, account for rounding
|
||||
|
||||
#ifdef HPCG_DEBUG
|
||||
if (rank == 0)
|
||||
{
|
||||
HPCG_fout << "Projected running time: " << total_runtime << " seconds" << endl;
|
||||
HPCG_fout << "Number of CG sets: " << numberOfCgSets << endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* This is the timed run for a specified amount of time. */
|
||||
|
||||
optMaxIters = optNiters;
|
||||
double optTolerance = 0.0; // Force optMaxIters iterations
|
||||
TestNormsData testnorms_data;
|
||||
testnorms_data.samples = numberOfCgSets;
|
||||
testnorms_data.values = new double[numberOfCgSets];
|
||||
|
||||
#ifndef HPCG_NOMPI
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
#endif
|
||||
|
||||
for (int i = 0; i < numberOfCgSets; ++i)
|
||||
{
|
||||
ZeroVector(x); // Zero out x
|
||||
ierr = CG(A, data, b, x, optMaxIters, optTolerance, niters, normr, normr0, ×[0], true, 0); // TODO: TRUE
|
||||
if (ierr)
|
||||
if (use_output_file)
|
||||
{
|
||||
HPCG_fout << "Error in call to CG: " << ierr << ".\n" << endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "Error in call to CG: " << ierr << ".\n" << endl;
|
||||
}
|
||||
if (rank == 0)
|
||||
if (use_output_file)
|
||||
{
|
||||
HPCG_fout << "Call [" << i << "] Scaled Residual [" << normr / normr0 << "]" << endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "Call [" << i << "] Scaled Residual [" << normr / normr0 << "]" << endl;
|
||||
}
|
||||
testnorms_data.values[i] = normr / normr0; // Record scaled residual from this run
|
||||
}
|
||||
|
||||
if (params.rank_type == GPU)
|
||||
{
|
||||
#ifdef USE_CUDA
|
||||
PermVectorCuda(A.ref2opt, x, A.localNumberOfRows);
|
||||
CopyVectorD2H(x);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef USE_GRACE
|
||||
// Reorder vector
|
||||
Vector xOrdered;
|
||||
InitializeVector(xOrdered, x.localLength, A.rankType);
|
||||
CopyVector(x, xOrdered);
|
||||
CopyAndReorderVector(xOrdered, x, A.ref2opt);
|
||||
DeleteVector(xOrdered);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Compute difference between known exact solution and computed solution
|
||||
// All processors are needed here.
|
||||
#ifdef HPCG_DEBUG
|
||||
double residual = 0;
|
||||
ierr = ComputeResidual(A.localNumberOfRows, x, xexact, residual);
|
||||
if (ierr)
|
||||
HPCG_fout << "Error in call to compute_residual: " << ierr << ".\n" << endl;
|
||||
if (rank == 0)
|
||||
HPCG_fout << "Difference between computed and exact = " << residual << ".\n" << endl;
|
||||
#endif
|
||||
|
||||
// Test Norm Results
|
||||
ierr = TestNorms(testnorms_data);
|
||||
|
||||
//////////////////
|
||||
// Report Results //
|
||||
//////////////////
|
||||
|
||||
// Report results to YAML file
|
||||
ReportResults(A, numberOfMgLevels, numberOfCgSets, refMaxIters, optMaxIters, ×[0], testcg_data,
|
||||
testsymmetry_data, testnorms_data, global_failure, quickPath);
|
||||
|
||||
if (params.rank_type == GPU)
|
||||
{
|
||||
#ifdef USE_CUDA
|
||||
DeleteMatrixGpu(A); // This delete will recursively delete all coarse grid data
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef USE_GRACE
|
||||
DeleteMatrixCpu(A); // This delete will recursively delete all coarse grid data
|
||||
#endif
|
||||
}
|
||||
|
||||
DeleteCGData(data);
|
||||
DeleteVector(x);
|
||||
DeleteVector(b);
|
||||
DeleteVector(xexact);
|
||||
DeleteVector(x_overlap);
|
||||
DeleteVector(b_computed);
|
||||
delete[] testnorms_data.values;
|
||||
|
||||
// Clean cuSPARSE data
|
||||
if (params.rank_type == GPU)
|
||||
{
|
||||
#ifdef USE_CUDA
|
||||
cublasDestroy(cublashandle);
|
||||
cusparseDestroy(cusparsehandle);
|
||||
cudaStreamDestroy(stream);
|
||||
cudaStreamDestroy(copy_stream);
|
||||
cudaEventDestroy(copy_done);
|
||||
#endif
|
||||
}
|
||||
|
||||
// We create the handle even in GPU ranks tp find library version
|
||||
if (params.exec_mode == CPUONLY || params.exec_mode == GPUCPU)
|
||||
{
|
||||
#ifdef USE_GRACE
|
||||
nvpl_sparse_destroy(nvpl_sparse_handle);
|
||||
#endif
|
||||
}
|
||||
|
||||
HPCG_Finalize();
|
||||
|
||||
// Finish up
|
||||
#ifndef HPCG_NO_MPI
|
||||
MPI_Finalize();
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
59
src/mytimer.cpp
Normal file
59
src/mytimer.cpp
Normal file
@@ -0,0 +1,59 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Function to return time in seconds.
|
||||
// If compiled with no flags, return CPU time (user and system).
|
||||
// If compiled with -DWALL, returns elapsed time.
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef HPCG_NO_MPI
#include <mpi.h>

/*!
  Returns the current wall-clock time in seconds, as reported by MPI.
 */
double mytimer(void)
{
    return MPI_Wtime();
}

#elif !defined(HPCG_NO_OPENMP)

// Compiled with HPCG_NO_MPI defined but OpenMP enabled: use the OpenMP
// wall-clock timer instead.
#include <omp.h>

/*!
  Returns the current wall-clock time in seconds, as reported by OpenMP.
 */
double mytimer(void)
{
    return omp_get_wtime();
}

#else

// Neither MPI nor OpenMP is available: fall back to gettimeofday(),
// reporting time elapsed since the first call to this function.
#include <cstdlib>
#include <sys/resource.h>
#include <sys/time.h>

/*!
  Returns elapsed wall-clock time in seconds since the first invocation.
  The very first call records a reference point and returns 0.0; every
  subsequent call returns the offset from that reference.
 */
double mytimer(void)
{
    static long base_sec = 0; // reference point, seconds (0 == not yet set)
    static long base_usec;    // reference point, microseconds (static => zero-initialized)
    struct timeval tv;
    gettimeofday(&tv, NULL);
    if (!base_sec)
    {
        // First call: remember "now" and report zero elapsed time.
        base_sec = tv.tv_sec;
        base_usec = tv.tv_usec;
        return 0.0;
    }
    return double(tv.tv_sec - base_sec) + (tv.tv_usec - base_usec) / 1000000.0;
}

#endif
|
||||
18
src/mytimer.hpp
Normal file
18
src/mytimer.hpp
Normal file
@@ -0,0 +1,18 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
#ifndef MYTIMER_HPP
#define MYTIMER_HPP
/*!
  Returns the current time in seconds; used for all benchmark timing.
  The timer backend (MPI_Wtime, omp_get_wtime, or gettimeofday) is
  selected at compile time in mytimer.cpp.
 */
double mytimer(void);
#endif // MYTIMER_HPP
|
||||
Reference in New Issue
Block a user