first commit

This commit is contained in:
2026-01-18 20:37:50 +08:00
commit fff9f18287
123 changed files with 1385491 additions and 0 deletions

241
src/CG.cpp Normal file
View File

@@ -0,0 +1,241 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file CG.cpp
HPCG routine
*/
#include <fstream>
#include <cmath>
#include "hpcg.hpp"
#include "CG.hpp"
#include "ComputeDotProduct.hpp"
#include "ComputeMG.hpp"
#include "ComputeSPMV.hpp"
#include "ComputeWAXPBY.hpp"
#include "mytimer.hpp"
#include <iostream>
#include "CpuKernels.hpp"
#include <mpi.h>
extern int use_output_file;
#define TICKD() t0 = mytimer() //!< record current time in 't0'
#define TOCKD(t) t += mytimer() - t0 //!< store time difference in 't' using time in 't0'
/*!
Routine to compute an approximate solution to Ax = b
@param[in] geom The description of the problem's geometry.
@param[inout] A The known system matrix
@param[inout] data The data structure with all necessary CG vectors preallocated
@param[in] b The known right hand side vector
@param[inout] x On entry: the initial guess; on exit: the new approximate solution
@param[in] max_iter The maximum number of iterations to perform, even if tolerance is not met.
@param[in] tolerance The stopping criterion to assert convergence: if norm of residual is <= to tolerance.
@param[out] niters The number of iterations actually performed.
@param[out] normr The 2-norm of the residual vector after the last iteration.
@param[out] normr0 The 2-norm of the residual vector before the first iteration.
@param[out] times The 7-element vector of the timing information accumulated during all of the iterations.
@param[in] doPreconditioning The flag to indicate whether the preconditioner should be invoked at each iteration.
@return Returns zero on success and a non-zero value otherwise.
@see CG_ref()
*/
int CG(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
    int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag)
{
    double t_begin = mytimer(); // Start timing right away
    normr = 0.0;
    double rtz = 0.0, oldrtz = 0.0, alpha = 0.0, beta = 0.0, pAp = 0.0;
    double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0;
    // #ifndef HPCG_NO_MPI
    //     double t6 = 0.0;
    // #endif
    local_int_t nrow = A.localNumberOfRows;
    Vector& r = data.r;   // Residual vector
    Vector& z = data.z;   // Preconditioned residual vector
    Vector& p = data.p;   // Direction vector (in MPI mode ncol>=nrow)
    Vector& Ap = data.Ap; // Work vector holding A*p each iteration

    // Braces added around the outer 'if' bodies below: the original code relied
    // on dangling-else binding (the 'else' attaches to the nearest unmatched
    // 'if'), which is correct but ambiguous to readers and is flagged by
    // -Wdangling-else. Behavior is unchanged.
    if (!doPreconditioning && A.geom->rank == 0)
    {
        if (use_output_file)
        {
            HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }
        else
        {
            std::cout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }
    }

    // Residual print frequency; the clamps are no-ops for the fixed value 1 but
    // are kept so the constant can be tuned without reintroducing them.
    int print_freq = 1;
    if (print_freq > 50)
        print_freq = 50;
    if (print_freq < 1)
        print_freq = 1;

    // p is of length ncols, copy x to p for sparse MV operation
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        CopyVectorD2D(x, p);
#endif
    }
    else
    {
        CopyVector(x, p);
    }
    TICKD();
    ComputeSPMV(A, p, Ap);
    TOCKD(t3); // Ap = A*p
    TICKD();
    ComputeWAXPBY(nrow, 1.0, b, -1.0, Ap, r, A.isWaxpbyOptimized, A.rankType);
    TOCKD(t2); // r = b - Ax (x stored in p)
    TICKD();
    ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized, A.rankType);
    TOCKD(t1);
    normr = sqrt(normr);
    if (A.geom->rank == 0 && flag)
    {
        if (use_output_file)
        {
            HPCG_fout << "Initial Residual = " << normr << std::endl;
        }
        else
        {
            std::cout << "Initial Residual = " << normr << std::endl;
        }
    }
    // Record initial residual for convergence testing
    normr0 = normr;

    // Start iterations. The (1.0 + 1.0e-6) slack lets the loop stop once the
    // scaled residual is within rounding distance of the tolerance.
    for (int k = 1; k <= max_iter && normr / normr0 * (1.0 + 1.0e-6) > tolerance; k++)
    {
        TICKD();
        if (doPreconditioning)
        {
            ComputeMG(A, r, z); // Apply preconditioner
            if (A.rankType == GPU)
            {
#ifdef USE_CUDA
                // Make sure the asynchronous MG kernels have finished before
                // the preconditioner timer is stopped.
                cudaStreamSynchronize(stream);
#endif
            }
        }
        else
        {
            if (A.rankType == GPU)
            {
#ifdef USE_CUDA
                CopyVectorD2D(r, z); // copy r to z (no preconditioning)
#endif
            }
            else
            {
                CopyVector(r, z); // copy r to z (no preconditioning)
            }
        }
        TOCKD(t5); // Preconditioner apply time
        if (k == 1)
        {
            TICKD();
            ComputeWAXPBY(nrow, 1.0, z, 0.0, z, p, A.isWaxpbyOptimized, A.rankType);
            TOCKD(t2); // Copy Mr to p
            TICKD();
            ComputeDotProduct(nrow, r, z, rtz, t4, A.isDotProductOptimized, A.rankType);
            TOCKD(t1); // rtz = r'*z
        }
        else
        {
            oldrtz = rtz;
            TICKD();
            ComputeDotProduct(nrow, r, z, rtz, t4, A.isDotProductOptimized, A.rankType);
            TOCKD(t1); // rtz = r'*z
            beta = rtz / oldrtz;
            TICKD();
            ComputeWAXPBY(nrow, 1.0, z, beta, p, p, A.isWaxpbyOptimized, A.rankType);
            TOCKD(t2); // p = beta*p + z
        }
        TICKD();
        ComputeSPMV(A, p, Ap);
        TOCKD(t3); // Ap = A*p
        TICKD();
        ComputeDotProduct(nrow, p, Ap, pAp, t4, A.isDotProductOptimized, A.rankType);
        TOCKD(t1); // alpha = p'*Ap
        alpha = rtz / pAp;
        TICKD();
        ComputeWAXPBY(nrow, 1.0, x, alpha, p, x, A.isWaxpbyOptimized, A.rankType); // x = x + alpha*p
        ComputeWAXPBY(nrow, 1.0, r, -alpha, Ap, r, A.isWaxpbyOptimized, A.rankType);
        TOCKD(t2); // r = r - alpha*Ap
        TICKD();
        ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized, A.rankType);
        TOCKD(t1);
        normr = sqrt(normr);
        if (flag && A.geom->rank == 0 && (k % print_freq == 0 || k == max_iter))
        {
            if (use_output_file)
            {
                HPCG_fout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
            }
            else
            {
                std::cout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
            }
        }
        niters = k;
    }

    // Store times
    times[1] += t1; // dot-product time
    times[2] += t2; // WAXPBY time
    times[3] += t3; // SPMV time
    times[4] += t4; // AllReduce time
    times[5] += t5; // preconditioner apply time
    // #ifndef HPCG_NO_MPI
    //     times[6] += t6; // exchange halo time
    // #endif
    times[0] += mytimer() - t_begin; // Total time. All done...
    return 0;
}

55
src/CG.hpp Normal file
View File

@@ -0,0 +1,55 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CG_HPP
#define CG_HPP
#include "CGData.hpp"
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int CG(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
    int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag);
// Optimized Conjugate Gradient solver (implemented in CG.cpp).
// A - matrix; also carries the domain geometry / processor topology via A.geom
// data - preallocated CG work vectors
// b - right-hand-side vector (constant)
// x - on entry the initial guess; on exit the approximate solution
// max_iter - maximum number of iterations to perform
// tolerance - stopping tolerance on the scaled residual norm
// niters - number of iterations actually performed
// normr - computed residual norm after the last iteration
// normr0 - residual norm before the first iteration
// times - array of accumulated timing information
// doPreconditioning - whether the MG preconditioner (symmetric GS smoothing) is applied
// flag - when non-zero, residual progress is printed each iteration
#endif // CG_HPP

84
src/CGData.hpp Normal file
View File

@@ -0,0 +1,84 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file CGData.hpp
HPCG data structure
*/
#ifndef CGDATA_HPP
#define CGDATA_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Bundle of work vectors used by the CG iteration (see InitializeSparseCGData
// for the sizes: r and Ap are row-length; z and p are column-length so they can
// hold halo entries in MPI mode).
struct CGData_STRUCT
{
    Vector r;  //!< residual vector (a Vector value, not a pointer)
    Vector z;  //!< preconditioned residual vector
    Vector p;  //!< direction vector
    Vector Ap; //!< Krylov work vector holding A*p
};
typedef struct CGData_STRUCT CGData;
/*!
Constructor for the data structure of CG vectors.
@param[in] A the data structure that describes the problem matrix and its structure
@param[out] data the data structure for CG vectors that will be allocated to get it ready for use in CG iterations
*/
/*!
  Allocates the four CG work vectors for the given problem matrix.
  r and Ap use the local row count; z and p use the local column count so they
  can carry halo entries for the sparse matrix-vector product in MPI mode.
  @param[in]  A    matrix providing local sizes and the rank type
  @param[out] data CG vector bundle to be allocated
*/
inline void InitializeSparseCGData(SparseMatrix& A, CGData& data)
{
    const local_int_t numRows = A.localNumberOfRows;
    const local_int_t numCols = A.localNumberOfColumns;
    InitializeVector(data.r, numRows, A.rankType);
    InitializeVector(data.z, numCols, A.rankType, true /*Only when rank type is GPU*/);
    InitializeVector(data.p, numCols, A.rankType, true);
    InitializeVector(data.Ap, numRows, A.rankType);
}
/*!
Destructor for the CG vectors data.
@param[inout] data the CG vectors data structure whose storage is deallocated
*/
/*!
  Releases the storage held by the four CG work vectors.
  @param[inout] data the CG vector bundle whose storage is deallocated
*/
inline void DeleteCGData(CGData& data)
{
    // Each vector is torn down independently in allocation order.
    DeleteVector(data.r);
    DeleteVector(data.z);
    DeleteVector(data.p);
    DeleteVector(data.Ap);
}
#endif // CGDATA_HPP

198
src/CG_ref.cpp Normal file
View File

@@ -0,0 +1,198 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file CG_ref.cpp
HPCG routine
*/
#include <cmath>
#include <fstream>
#include <iostream>
#include "hpcg.hpp"
#include "CG_ref.hpp"
#include "ComputeDotProduct_ref.hpp"
#include "ComputeMG_ref.hpp"
#include "ComputeSPMV_ref.hpp"
#include "ComputeWAXPBY_ref.hpp"
#include "mytimer.hpp"
extern int use_output_file;
// Use TICK and TOCK to time a code section in MATLAB-like fashion
#define TICK() t0 = mytimer() //!< record current time in 't0'
#define TOCK(t) t += mytimer() - t0 //!< store time difference in 't' using time in 't0'
/*!
Reference routine to compute an approximate solution to Ax = b
@param[inout] A The known system matrix
@param[inout] data The data structure with all necessary CG vectors preallocated
@param[in] b The known right hand side vector
@param[inout] x On entry: the initial guess; on exit: the new approximate solution
@param[in] max_iter The maximum number of iterations to perform, even if tolerance is not met.
@param[in] tolerance The stopping criterion to assert convergence: if norm of residual is <= to tolerance.
@param[out] niters The number of iterations actually performed.
@param[out] normr The 2-norm of the residual vector after the last iteration.
@param[out] normr0 The 2-norm of the residual vector before the first iteration.
@param[out] times The 7-element vector of the timing information accumulated during all of the iterations.
@param[in] doPreconditioning The flag to indicate whether the preconditioner should be invoked at each iteration.
@return Returns zero on success and a non-zero value otherwise.
@see CG()
*/
int CG_ref(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
    int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag)
{
    // NOTE: this is the reference CG implementation used to validate the
    // optimized CG(); its numerical steps are intentionally left untouched.
    double t_begin = mytimer(); // Start timing right away
    normr = 0.0;
    double rtz = 0.0, oldrtz = 0.0, alpha = 0.0, beta = 0.0, pAp = 0.0;
    double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0;
    // #ifndef HPCG_NO_MPI
    //     double t6 = 0.0;
    // #endif
    local_int_t nrow = A.localNumberOfRows;
    Vector& r = data.r;   // Residual vector
    Vector& z = data.z;   // Preconditioned residual vector
    Vector& p = data.p;   // Direction vector (in MPI mode ncol>=nrow)
    Vector& Ap = data.Ap; // Work vector holding A*p
    // Dangling-else: the 'else' below binds to the inner 'if (use_output_file)';
    // the warning is emitted only on rank 0 when preconditioning is disabled.
    if (!doPreconditioning && A.geom->rank == 0)
        if (use_output_file)
        {
            HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }
        else
        {
            std::cout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }
#if 1
    // def HPCG_DEBUG  (print path is compiled in unconditionally here)
    int print_freq = 1;
    if (print_freq > 50)
        print_freq = 50;
    if (print_freq < 1)
        print_freq = 1;
#endif
    // p is of length ncols, copy x to p for sparse MV operation
    CopyVector(x, p);
    TICK();
    ComputeSPMV_ref(A, p, Ap);
    TOCK(t3); // Ap = A*p
    TICK();
    ComputeWAXPBY_ref(nrow, 1.0, b, -1.0, Ap, r);
    TOCK(t2); // r = b - Ax (x stored in p)
    TICK();
    ComputeDotProduct_ref(nrow, r, r, normr, t4);
    TOCK(t1);
    normr = sqrt(normr);
#if 1
    // def HPCG_DEBUG
    if (A.geom->rank == 0 && flag)
        if (use_output_file)
        {
            HPCG_fout << "Initial Residual = " << normr << std::endl;
        }
        else
        {
            std::cout << "Initial Residual = " << normr << std::endl;
        }
#endif
    // Record initial residual for convergence testing
    normr0 = normr;
    // Start iterations
    for (int k = 1; k <= max_iter && normr / normr0 > tolerance; k++)
    {
        TICK();
        if (doPreconditioning)
            ComputeMG_ref(A, r, z); // Apply preconditioner
        else
            ComputeWAXPBY_ref(nrow, 1.0, r, 0.0, r, z); // copy r to z (no preconditioning)
        TOCK(t5);                                       // Preconditioner apply time
        if (k == 1)
        {
            // NOTE(review): there is no TICK() before this copy, so t2 also
            // absorbs the interval measured since the TICK above the
            // preconditioner branch -- left as-is to preserve reference timing.
            CopyVector(z, p);
            TOCK(t2); // Copy Mr to p
            TICK();
            ComputeDotProduct_ref(nrow, r, z, rtz, t4);
            TOCK(t1); // rtz = r'*z
        }
        else
        {
            oldrtz = rtz;
            TICK();
            ComputeDotProduct_ref(nrow, r, z, rtz, t4);
            TOCK(t1); // rtz = r'*z
            beta = rtz / oldrtz;
            TICK();
            ComputeWAXPBY_ref(nrow, 1.0, z, beta, p, p);
            TOCK(t2); // p = beta*p + z
        }
        TICK();
        ComputeSPMV_ref(A, p, Ap);
        TOCK(t3); // Ap = A*p
        TICK();
        ComputeDotProduct_ref(nrow, p, Ap, pAp, t4);
        TOCK(t1); // alpha = p'*Ap
        alpha = rtz / pAp;
        TICK();
        ComputeWAXPBY_ref(nrow, 1.0, x, alpha, p, x); // x = x + alpha*p
        ComputeWAXPBY_ref(nrow, 1.0, r, -alpha, Ap, r);
        TOCK(t2); // r = r - alpha*Ap
        TICK();
        ComputeDotProduct_ref(nrow, r, r, normr, t4);
        TOCK(t1);
        normr = sqrt(normr);
#if 1
        // def HPCG_DEBUG
        if (flag && A.geom->rank == 0 && (k % print_freq == 0 || k == max_iter))
            if (use_output_file)
            {
                HPCG_fout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
            }
            else
            {
                std::cout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
            }
#endif
        niters = k;
    }
    // Store times
    times[1] += t1; // dot product time
    times[2] += t2; // WAXPBY time
    times[3] += t3; // SPMV time
    times[4] += t4; // AllReduce time
    times[5] += t5; // preconditioner apply time
    // #ifndef HPCG_NO_MPI
    //     times[6] += t6; // exchange halo time
    // #endif
    times[0] += mytimer() - t_begin; // Total time. All done...
    return 0;
}

42
src/CG_ref.hpp Normal file
View File

@@ -0,0 +1,42 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef CG_REF_HPP
#define CG_REF_HPP
#include "CGData.hpp"
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// The use of CPU and GPU Sparse Matrix is intended to resolve
// the linked list structures for MG coarse levels.
// There is no change to the reference code itself.
int CG_ref(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
    int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag);
// Reference Conjugate Gradient solver (implemented in CG_ref.cpp).
// A - matrix; also carries the domain geometry / processor topology via A.geom
// b - right-hand-side vector (constant)
// x - on entry the initial guess; on exit the approximate solution
// max_iter - maximum number of iterations to perform
// tolerance - stopping tolerance for preconditioned iterations
// niters - number of iterations performed
// normr - computed residual norm
// normr0 - original residual norm
// times - array of timing information
// doPreconditioning - whether or not symmetric GS preconditioning will be applied
// flag - when non-zero, residual progress is printed each iteration
#endif // CG_REF_HPP

84
src/CheckAspectRatio.cpp Normal file
View File

@@ -0,0 +1,84 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file CheckAspectRatio.cpp
HPCG routine
*/
#include <algorithm>
#include <iostream>
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#include "hpcg.hpp"
#include "CheckAspectRatio.hpp"
extern int use_output_file;
/*!
  Verifies that the (x,y,z) extents of a grid are acceptably cube-like.
  @param[in] smallest_ratio minimum allowed value of min(x,y,z)/max(x,y,z)
  @param[in] x, y, z        the dimensions to check
  @param[in] what           label used in the diagnostic message
  @param[in] DoIo           whether this rank should print the diagnostic
  @return 0 when the ratio is acceptable; 127 otherwise (after MPI_Abort when
          MPI is enabled)
*/
int CheckAspectRatio(double smallest_ratio, int x, int y, int z, const char* what, bool DoIo)
{
    const int smallest_dim = std::min(std::min(x, y), z);
    const int largest_dim = std::max(std::max(x, y), z);
    const double current_ratio = smallest_dim / double(largest_dim);
    // Ratio of the smallest to the largest dimension must meet the threshold.
    if (current_ratio >= smallest_ratio)
        return 0;
    if (DoIo)
    {
        // Route the diagnostic to the run's log file or to stdout.
        std::ostream& os
            = use_output_file ? static_cast<std::ostream&>(HPCG_fout) : static_cast<std::ostream&>(std::cout);
        os << "The " << what << " sizes (" << x << "," << y << "," << z
           << ") are invalid because the ratio min(x,y,z)/max(x,y,z)=" << current_ratio
           << " is too small (at least " << smallest_ratio << " is required)." << std::endl;
        os << "The shape should resemble a 3D cube. Please adjust and try again." << std::endl;
        os.flush();
    }
#ifndef HPCG_NO_MPI
    MPI_Abort(MPI_COMM_WORLD, 127);
#endif
    return 127;
}

18
src/CheckAspectRatio.hpp Normal file
View File

@@ -0,0 +1,18 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef CHECKASPECTRATIO_HPP
#define CHECKASPECTRATIO_HPP
// Returns 0 when min(x,y,z)/max(x,y,z) >= smallest_ratio; otherwise prints a
// diagnostic (when DoIo is true), aborts MPI when enabled, and returns 127.
extern int CheckAspectRatio(double smallest_ratio, int x, int y, int z, const char* what, bool DoIo);
#endif // CHECKASPECTRATIO_HPP

192
src/CheckProblem.cpp Normal file
View File

@@ -0,0 +1,192 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file CheckProblem.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG)
#include <fstream>
using std::endl;
#include "hpcg.hpp"
#endif
#include <cassert>
#include "CheckProblem.hpp"
/*!
Check the contents of the generated sparse matrix to see if values match expected contents.
@param[in] A The known system matrix
@param[inout] b The newly allocated and generated right hand side vector (if b!=0 on entry)
@param[inout] x The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
@param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if the xexact!=0
non-zero on entry)
@see GenerateGeometry
*/
void CheckProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
    // Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;
    local_int_t localNumberOfRows = nx * ny * nz;     // This is the size of our subblock
    global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
    double* bv = 0;
    double* xv = 0;
    double* xexactv = 0;
    if (b != 0)
        bv = b->values; // Only compute exact solution if requested
    if (x != 0)
        xv = x->values; // Only compute exact solution if requested
    if (xexact != 0)
        xexactv = xexact->values; // Only compute exact solution if requested
    local_int_t localNumberOfNonzeros = 0;
    // TODO: This triply nested loop could be flattened or use nested parallelism
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t iz = 0; iz < nz; iz++)
    {
        global_int_t giz = giz0 + iz;
        for (local_int_t iy = 0; iy < ny; iy++)
        {
            global_int_t giy = giy0 + iy;
            for (local_int_t ix = 0; ix < nx; ix++)
            {
                global_int_t gix = gix0 + ix;
                local_int_t currentLocalRow = iz * nx * ny + iy * nx + ix;
                global_int_t currentGlobalRow = giz * gnx * gny + giy * gnx + gix;
                assert(A.localToGlobalMap[currentLocalRow] == currentGlobalRow);
#ifdef HPCG_DETAILED_DEBUG
                HPCG_fout << " rank, globalRow, localRow = " << A.geom->rank << " " << currentGlobalRow << " "
                          << A.globalToLocalMap.find(currentGlobalRow)->second << endl;
#endif
                // A 27-point stencil row has at most 27 entries, so 'char' is
                // wide enough for this per-row counter.
                char numberOfNonzerosInRow = 0;
                double* currentValuePointer
                    = A.matrixValues[currentLocalRow]; // Pointer to current value in current row
                global_int_t* currentIndexPointerG
                    = A.mtxIndG[currentLocalRow]; // Pointer to current index in current row
                // Walk the 3x3x3 stencil neighborhood of this grid point,
                // skipping neighbors that fall outside the global grid.
                for (int sz = -1; sz <= 1; sz++)
                {
                    if (giz + sz > -1 && giz + sz < gnz)
                    {
                        for (int sy = -1; sy <= 1; sy++)
                        {
                            if (giy + sy > -1 && giy + sy < gny)
                            {
                                for (int sx = -1; sx <= 1; sx++)
                                {
                                    if (gix + sx > -1 && gix + sx < gnx)
                                    {
                                        global_int_t curcol = currentGlobalRow + sz * gnx * gny + sy * gnx + sx;
                                        if (curcol == currentGlobalRow)
                                        {
                                            // Diagonal entry: value 26, and the stored
                                            // diagonal pointer must reference this slot.
                                            assert(A.matrixDiagonal[currentLocalRow] == currentValuePointer);
                                            assert(*currentValuePointer++ == 26.0);
                                        }
                                        else
                                        {
                                            // Off-diagonal stencil entries are all -1.
                                            assert(*currentValuePointer++ == -1.0);
                                        }
                                        assert(*currentIndexPointerG++ == curcol);
                                        numberOfNonzerosInRow++;
                                    } // end x bounds test
                                } // end sx loop
                            } // end y bounds test
                        } // end sy loop
                    } // end z bounds test
                } // end sz loop
                assert(A.nonzerosInRow[currentLocalRow] == numberOfNonzerosInRow);
#ifndef HPCG_NO_OPENMP
#pragma omp critical
#endif
                localNumberOfNonzeros += numberOfNonzerosInRow; // Protect this with an atomic
                if (b != 0)
                    assert(bv[currentLocalRow] == 26.0 - ((double) (numberOfNonzerosInRow - 1)));
                if (x != 0)
                    assert(xv[currentLocalRow] == 0.0);
                if (xexact != 0)
                    assert(xexactv[currentLocalRow] == 1.0);
            } // end ix loop
        } // end iy loop
    } // end iz loop
#ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
              << endl
              << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
              << " nonzeros." << endl;
#endif
    global_int_t totalNumberOfNonzeros = 0;
#ifndef HPCG_NO_MPI
    // Use MPI's reduce function to sum all nonzeros
#ifdef HPCG_NO_LONG_LONG
    MPI_Allreduce(&localNumberOfNonzeros, &totalNumberOfNonzeros, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
#else
    long long lnnz = localNumberOfNonzeros, gnnz = 0; // convert to 64 bit for MPI call
    MPI_Allreduce(&lnnz, &gnnz, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
    totalNumberOfNonzeros = gnnz; // Copy back
#endif
#else
    totalNumberOfNonzeros = localNumberOfNonzeros;
#endif
    // The regenerated counts must agree with the totals recorded in A.
    assert(A.totalNumberOfRows == totalNumberOfRows);
    assert(A.totalNumberOfNonzeros == totalNumberOfNonzeros);
    assert(A.localNumberOfRows == localNumberOfRows);
    assert(A.localNumberOfNonzeros == localNumberOfNonzeros);
    return;
}

21
src/CheckProblem.hpp Normal file
View File

@@ -0,0 +1,21 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef CHECKPROBLEM_HPP
#define CHECKPROBLEM_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Asserts that the generated 27-point stencil matrix A -- and, when non-null,
// the vectors b, x, and xexact -- match the expected synthetic problem contents.
void CheckProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
#endif // CHECKPROBLEM_HPP

114
src/ComputeDotProduct.cpp Normal file
View File

@@ -0,0 +1,114 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeDotProduct.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include "mytimer.hpp"
#include <mpi.h>
#endif
#include "ComputeDotProduct.hpp"
#include "ComputeDotProduct_ref.hpp"
#ifdef USE_CUDA
#include "Cuda.hpp"
#define CHECK_CUBLAS(x) \
do \
{ \
cublasStatus_t cublasStatus = (x); \
if (cublasStatus != CUBLAS_STATUS_SUCCESS) \
{ \
fprintf(stderr, "CUBLAS: %s = %d at (%s:%d)\n", #x, cublasStatus, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
#endif
#ifdef USE_GRACE
#include "CpuKernels.hpp"
#endif
/*!
Routine to compute the dot product of two vectors.
This routine calls the reference dot-product implementation by default, but
can be replaced by a custom routine that is optimized and better suited for
the target system.
@param[in] n the number of vector elements (on this processor)
@param[in] x, y the input vectors
@param[out] result a pointer to scalar value, on exit will contain the result.
@param[out] time_allreduce the time it took to perform the communication between processes
@param[out] isOptimized should be set to false if this routine uses the reference implementation (is not optimized);
otherwise leave it unchanged
@return returns 0 upon success and non-zero otherwise
@see ComputeDotProduct_ref
*/
int ComputeDotProduct(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce,
    bool& isOptimized, rank_type_t rt)
{
    // Partial sum over this rank's n local entries.
    double local_result = 0.0;
    if (rt == GPU)
    {
#ifdef USE_CUDA
        // Check the cuBLAS status instead of dropping it into an unused local:
        // the original stored it in 'cublasStatus_t t' and never inspected it,
        // silently ignoring failures (the file defines CHECK_CUBLAS for this).
        CHECK_CUBLAS(cublasDdot(cublashandle, n, x.values_d, 1, y.values_d, 1, &local_result));
#endif
    }
    else
    {
#ifdef USE_GRACE
        // Consider replacing with NVPL BLAS dot product
        ComputeDotProductCpu(n, x, y, local_result, isOptimized);
#endif
    }
#ifndef HPCG_NO_MPI
    // Use MPI's reduce function to collect all partial sums
    double t0 = mytimer();
    double global_result = 0.0;
    MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    result = global_result;
    t0 = mytimer() - t0;
    time_allreduce += t0; // accumulate communication time for reporting
#else
    time_allreduce += 0.0;
    result = local_result;
#endif
    return 0;
}

39
src/ComputeDotProduct.hpp Normal file
View File

@@ -0,0 +1,39 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef COMPUTEDOTPRODUCT_HPP
#define COMPUTEDOTPRODUCT_HPP
#include "Vector.hpp"
// Optimized dot product of x and y over the local n entries; in MPI builds the
// partial sums are combined with an all-reduce whose cost is accumulated into
// time_allreduce. rt selects the CPU or GPU code path (see ComputeDotProduct.cpp).
int ComputeDotProduct(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce,
    bool& isOptimized, rank_type_t rt);
#endif // COMPUTEDOTPRODUCT_HPP

View File

@@ -0,0 +1,84 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeDotProduct_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include "mytimer.hpp"
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeDotProduct_ref.hpp"
#include <cassert>
/*!
Routine to compute the dot product of two vectors where:
This is the reference dot-product implementation. It _CANNOT_ be modified for the
purposes of this benchmark.
@param[in] n the number of vector elements (on this processor)
@param[in] x, y the input vectors
@param[in] result a pointer to scalar value, on exit will contain result.
@param[out] time_allreduce the time it took to perform the communication between processes
@return returns 0 upon success and non-zero otherwise
@see ComputeDotProduct
*/
int ComputeDotProduct_ref(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce)
{
assert(x.localLength >= n); // Test vector lengths
assert(y.localLength >= n);
double local_result = 0.0;
double* xv = x.values;
double* yv = y.values;
// Special-case x == y (a norm computation): one load per element instead of two.
if (yv == xv)
{
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction(+ : local_result)
#endif
for (local_int_t i = 0; i < n; i++)
local_result += xv[i] * xv[i];
}
else
{
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction(+ : local_result)
#endif
for (local_int_t i = 0; i < n; i++)
local_result += xv[i] * yv[i];
}
#ifndef HPCG_NO_MPI
// Use MPI's reduce function to collect all partial sums
double t0 = mytimer();
double global_result = 0.0;
MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
result = global_result;
// Only the communication is charged to time_allreduce (timer brackets the Allreduce).
time_allreduce += mytimer() - t0;
#else
time_allreduce += 0.0; // keep the accumulator well-defined in the serial build
result = local_result;
#endif
return 0;
}

View File

@@ -0,0 +1,21 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEDOTPRODUCT_REF_HPP
#define COMPUTEDOTPRODUCT_REF_HPP
#include "Vector.hpp"
// Reference (non-optimized) dot product; implementation in ComputeDotProduct_ref.cpp.
int ComputeDotProduct_ref(
const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce);
#endif // COMPUTEDOTPRODUCT_REF_HPP

96
src/ComputeMG.cpp Normal file
View File

@@ -0,0 +1,96 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeMG.cpp
HPCG routine
*/
#include "ComputeMG.hpp"
#include "ComputeProlongation.hpp"
#include "ComputeRestriction.hpp"
#include "ComputeSYMGS.hpp"
#include "CudaKernels.hpp"
/*!
@param[in] A the known system matrix
@param[in] r the input vector
@param[inout] x On exit contains the result of the multigrid V-cycle with r as the RHS, x is the approximation to Ax =
r.
@return returns 0 upon success and non-zero otherwise
@see ComputeMG_ref
*/
int ComputeMG(const SparseMatrix& A, const Vector& r, Vector& x)
{
    int ierr = 0;
    if (A.mgData != 0)
    { // Go to next coarse level if defined
        // Pre-smoothing sweep (step flag = 1).
        ComputeSYMGS(A, r, x, 1);
        // Restrict the fine-grid residual to the coarse grid on whichever
        // device owns this rank's data.
        if (A.rankType == GPU)
        {
#ifdef USE_CUDA
            ComputeRestrictionCuda(A, r);
#endif
        }
        else
        {
#ifdef USE_GRACE
            ComputeRestriction(A, r);
#endif
        }
        // Recursive coarse-grid solve. Fix: the original stored the return
        // code in 'ierr' but never examined it, silently swallowing failures
        // from coarser levels; propagate it instead.
        ierr = ComputeMG(*A.Ac, *A.mgData->rc, *A.mgData->xc);
        if (ierr != 0)
            return ierr;
        // Prolongate the coarse-grid correction back into x on this level.
        if (A.rankType == GPU)
        {
#ifdef USE_CUDA
            ComputeProlongationCuda(A, x);
#endif
        }
        else
        {
#ifdef USE_GRACE
            ComputeProlongation(A, x);
#endif
        }
        // Post-smoothing sweep (step flag = 0).
        ComputeSYMGS(A, r, x, 0);
    }
    else
    {
        // Coarsest level: smoothing only, no further recursion.
        ComputeSYMGS(A, r, x, 1);
    }
    return 0;
}

22
src/ComputeMG.hpp Normal file
View File

@@ -0,0 +1,22 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEMG_HPP
#define COMPUTEMG_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Optimized multigrid V-cycle preconditioner; implementation in ComputeMG.cpp.
int ComputeMG(const SparseMatrix& A, const Vector& r, Vector& x);
#endif // COMPUTEMG_HPP

81
src/ComputeMG_ref.cpp Normal file
View File

@@ -0,0 +1,81 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeSYMGS_ref.cpp
HPCG routine
*/
#include "ComputeMG_ref.hpp"
#include "ComputeProlongation_ref.hpp"
#include "ComputeRestriction_ref.hpp"
#include "ComputeSPMV_ref.hpp"
#include "ComputeSYMGS_ref.hpp"
#include <cassert>
#include <iostream>
/*!
@param[in] A the known system matrix
@param[in] r the input vector
@param[inout] x On exit contains the result of the multigrid V-cycle with r as the RHS, x is the approximation to Ax =
r.
@return returns 0 upon success and non-zero otherwise
@see ComputeMG
*/
int ComputeMG_ref(const SparseMatrix& A, const Vector& r, Vector& x)
{
assert(x.localLength == A.localNumberOfColumns); // Make sure x contain space for halo values
ZeroVector(x); // initialize x to zero
int ierr = 0;
if (A.mgData != 0)
{ // Go to next coarse level if defined
// Pre-smoothing sweeps on this level.
int numberOfPresmootherSteps = A.mgData->numberOfPresmootherSteps;
for (int i = 0; i < numberOfPresmootherSteps; ++i)
ierr += ComputeSYMGS_ref(A, r, x);
if (ierr != 0)
return ierr;
// Axf = A*x is needed to form the fine-grid residual used by the restriction.
ierr = ComputeSPMV_ref(A, x, *A.mgData->Axf);
if (ierr != 0)
return ierr;
// Perform restriction operation using simple injection
ierr = ComputeRestriction_ref(A, r);
if (ierr != 0)
return ierr;
// Recursively apply the V-cycle to the coarse problem.
ierr = ComputeMG_ref(*A.Ac, *A.mgData->rc, *A.mgData->xc);
if (ierr != 0)
return ierr;
// Add the coarse-grid correction back into x.
ierr = ComputeProlongation_ref(A, x);
if (ierr != 0)
return ierr;
// Post-smoothing sweeps on this level.
int numberOfPostsmootherSteps = A.mgData->numberOfPostsmootherSteps;
for (int i = 0; i < numberOfPostsmootherSteps; ++i)
ierr += ComputeSYMGS_ref(A, r, x);
if (ierr != 0)
return ierr;
}
else
{
// Coarsest level: a single symmetric Gauss-Seidel sweep.
ierr = ComputeSYMGS_ref(A, r, x);
if (ierr != 0)
return ierr;
}
return 0;
}

26
src/ComputeMG_ref.hpp Normal file
View File

@@ -0,0 +1,26 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEMG_REF_HPP
#define COMPUTEMG_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// The use of CPU and GPU Sparse Matrix is intended to resolve
// the linked list structures for MG coarse levels (A->Ac).
// There is no change to the reference code.
int ComputeMG_ref(const SparseMatrix& A, const Vector& r, Vector& x);
#endif // COMPUTEMG_REF_HPP

View File

@@ -0,0 +1,175 @@
#include <cmath>
#include <cstdlib>
#ifdef HPCG_CUBIC_RADICAL_SEARCH
#include <algorithm>
#endif
#include <map>
#include "ComputeOptimalShapeXYZ.hpp"
#include "MixedBaseCounter.hpp"
#ifdef HPCG_CUBIC_RADICAL_SEARCH
// Smallest of three ints.
static int min3(int a, int b, int c)
{
    return std::min({a, b, c});
}

// Largest of three ints.
static int max3(int a, int b, int c)
{
    return std::max({a, b, c});
}

// Exhaustively factor n into f1*f2*f3, keeping the triple whose min/max
// ratio is closest to 1 (the most cube-like box). Outputs are written only
// when a candidate factorization is found.
static void cubic_radical_search(int n, int& x, int& y, int& z)
{
    double best_ratio = 0.0;
    for (int f1 = (int) (pow(n, 1.0 / 3.0) + 0.5); f1 > 0; --f1)
    {
        if (n % f1 != 0)
            continue;
        const int rest = n / f1;
        for (int f2 = (int) (pow(rest, 0.5) + 0.5); f2 > 0; --f2)
        {
            if (rest % f2 != 0)
                continue;
            const int f3 = rest / f2;
            const double ratio = (double) min3(f1, f2, f3) / max3(f1, f2, f3);
            if (ratio > best_ratio)
            {
                best_ratio = ratio;
                x = f1;
                y = f2;
                z = f3;
            }
        }
    }
}
#else
// Accumulate the prime factorization of n into 'factors' (prime -> multiplicity).
// For n == 1 (or when no factor was recorded) a single entry factors[n] is added,
// matching the original behavior.
static void ComputePrimeFactors(int n, std::map<int, int>& factors)
{
    // Trial-division bound, computed once from the original n.
    const int limit = int(sqrt(double(n)) + 1L);
    // Strip all factors of two first (shift instead of divide).
    while (n > 1 && (n & 1) == 0)
    {
        ++factors[2];
        n >>= 1;
    }
    // Trial-divide by successive odd candidates.
    for (int d = 3; d <= limit; d += 2)
    {
        while (n % d == 0)
        {
            ++factors[d];
            n /= d;
        }
    }
    // Leftover n > 1 is prime; also record n itself when nothing was found.
    if (n > 1 || factors.empty())
        ++factors[n];
}
// Integer power x^p by binary exponentiation.
// Special cases (kept from the original): 0 and 1 are returned unchanged for
// any exponent, and a negative exponent yields 0.
static int pow_i(int x, int p)
{
    if (x == 0 || x == 1)
        return x; // fixed points of exponentiation
    if (p < 0)
        return 0; // integer result of x^(-p) truncates to zero
    int result = 1;
    int base = x;
    for (int e = p; e != 0; e >>= 1)
    {
        if (e & 1)
            result *= base;
        base *= base;
    }
    return result;
}
#endif
// Split the total rank count 'xyz' into a 3-D decomposition x*y*z == xyz that
// is as balanced (cube-like) as possible, based on the prime factorization.
void ComputeOptimalShapeXYZ(int xyz, int& x, int& y, int& z)
{
#ifdef HPCG_CUBIC_RADICAL_SEARCH
cubic_radical_search(xyz, x, y, z);
#else
std::map<int, int> factors;
ComputePrimeFactors(xyz, factors); // factors are sorted: ascending order
std::map<int, int>::iterator iter = factors.begin();
// there is at least one prime factor
x = (iter++)->first; // cache the first factor, move to the next one
y = iter != factors.end() ? (iter++)->first : y; // try to cache the second factor in "y"
if (factors.size() == 1)
{ // only a single factor p: distribute p^k as evenly as possible over x, y, z
z = pow_i(x, factors[x] / 3);
y = pow_i(x, factors[x] / 3 + ((factors[x] % 3) >= 2 ? 1 : 0));
x = pow_i(x, factors[x] / 3 + ((factors[x] % 3) >= 1 ? 1 : 0));
}
else if (factors.size() == 2 && factors[x] == 1 && factors[y] == 1)
{ // two distinct prime factors
z = 1;
}
else if (factors.size() == 2 && factors[x] + factors[y] == 3)
{ // three prime factors, one repeated
z = factors[x] == 2 ? x : y; // test which factor is repeated
}
else if (factors.size() == 3 && factors[x] == 1 && factors[y] == 1 && iter->second == 1)
{ // three distinct and single prime factors
z = iter->first;
}
else
{ // 3 or more prime factors so try all possible 3-subsets
// NOTE: arrays are fixed at 33 entries -- assumes at most 32 distinct
// prime factors, which any 'int' satisfies.
int i, distinct_factors[32 + 1], count_factors[32 + 1];
i = 0;
for (std::map<int, int>::iterator iter = factors.begin(); iter != factors.end(); ++iter, ++i)
{
distinct_factors[i] = iter->first;
count_factors[i] = iter->second;
}
// count total number of prime factors in "c_main" and distribute some factors into "c1"
MixedBaseCounter c_main(count_factors, factors.size()), c1(count_factors, factors.size());
// at the beginning, minimum area is the maximum area
double area, min_area = 2.0 * xyz + 1.0;
// Enumerate all ways to split the multiset of prime factors between the
// first two dimensions; minimize the surface area of the resulting box.
for (c1.next(); !c1.is_zero(); c1.next())
{
MixedBaseCounter c2(c_main, c1); // "c2" gets the factors remaining in "c_main" that "c1" doesn't have
for (c2.next(); !c2.is_zero(); c2.next())
{
int tf1 = c1.product(distinct_factors);
int tf2 = c2.product(distinct_factors);
int tf3 = xyz / tf1 / tf2; // we derive the third dimension, we don't keep track of the factors it has
area = tf1 * double(tf2) + tf2 * double(tf3) + tf1 * double(tf3);
if (area < min_area)
{
min_area = area;
x = tf1;
y = tf2;
z = tf3;
}
}
}
}
#endif
}

View File

@@ -0,0 +1,2 @@
// Factor 'xyz' into dimensions x*y*z that are as close to a cube as possible.
void ComputeOptimalShapeXYZ(int xyz, int& x, int& y, int& z);

View File

@@ -0,0 +1,72 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeProlongation.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeProlongation.hpp"
/*!
Routine to compute the coarse residual vector.
@param[in] Af - Fine grid sparse matrix object containing pointers to current coarse grid correction and the f2c
operator.
@param[inout] xf - Fine grid solution vector, update with coarse grid correction.
Note that the fine grid residual is never explicitly constructed.
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeProlongation(const SparseMatrix& Af, Vector& xf)
{
    double* xfv = xf.values;                       // fine-grid solution (updated in place)
    double* xcv = Af.mgData->xc->values;           // coarse-grid correction
    // Fix: removed the unused local 'f2c' (= Af.mgData->f2cOperator); this
    // optimized path indexes through the permuted map Af.f2cPerm instead.
    local_int_t nc = Af.mgData->rc->localLength;   // number of coarse-grid points
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < nc; ++i)
    {
        // Scatter-add each coarse value into its injected fine-grid point.
        xfv[Af.f2cPerm[i]] += xcv[i];
    }
    return 0;
}

View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEPROLONGATION_HPP
#define COMPUTEPROLONGATION_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Optimized prolongation: adds the coarse-grid correction into the fine-grid xf.
int ComputeProlongation(const SparseMatrix& Af, Vector& xf);
#endif // COMPUTEPROLONGATION_HPP

View File

@@ -0,0 +1,55 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeProlongation_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeProlongation_ref.hpp"
/*!
Routine to compute the coarse residual vector.
@param[in] Af - Fine grid sparse matrix object containing pointers to current coarse grid correction and the f2c
operator.
@param[inout] xf - Fine grid solution vector, update with coarse grid correction.
Note that the fine grid residual is never explicitly constructed.
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeProlongation_ref(const SparseMatrix& Af, Vector& xf)
{
double* xfv = xf.values; // fine-grid solution (updated in place)
double* xcv = Af.mgData->xc->values; // coarse-grid correction
local_int_t* f2c = Af.mgData->f2cOperator; // injection map: coarse index -> fine index
local_int_t nc = Af.mgData->rc->localLength; // number of coarse-grid points
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
// TODO: Somehow note that this loop can be safely vectorized since f2c has no repeated indices
for (local_int_t i = 0; i < nc; ++i)
xfv[f2c[i]] += xcv[i]; // This loop is safe to vectorize
return 0;
}

View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEPROLONGATION_REF_HPP
#define COMPUTEPROLONGATION_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Reference prolongation; implementation in ComputeProlongation_ref.cpp.
int ComputeProlongation_ref(const SparseMatrix& Af, Vector& xf);
#endif // COMPUTEPROLONGATION_REF_HPP

95
src/ComputeResidual.cpp Normal file
View File

@@ -0,0 +1,95 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeResidual.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "Vector.hpp"
#ifdef HPCG_DETAILED_DEBUG
#include "hpcg.hpp"
#include <fstream>
#endif
#include "ComputeResidual.hpp"
#include <cmath> // needed for fabs
#ifdef HPCG_DETAILED_DEBUG
#include <iostream>
#endif
/*!
Routine to compute the inf-norm difference between two vectors where:
@param[in] n number of vector elements (local to this processor)
@param[in] v1, v2 input vectors
@param[out] residual pointer to scalar value; on exit, will contain result: inf-norm difference
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeResidual(const local_int_t n, const Vector& v1, const Vector& v2, double& residual)
{
double* v1v = v1.values;
double* v2v = v2.values;
double local_residual = 0.0;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel shared(local_residual, v1v, v2v)
{
// Each thread tracks its own maximum; merged under a critical section below.
double threadlocal_residual = 0.0;
#pragma omp for
for (local_int_t i = 0; i < n; i++)
{
double diff = std::fabs(v1v[i] - v2v[i]);
if (diff > threadlocal_residual)
threadlocal_residual = diff;
}
#pragma omp critical
{
if (threadlocal_residual > local_residual)
local_residual = threadlocal_residual;
}
}
#else // No threading
for (local_int_t i = 0; i < n; i++)
{
double diff = std::fabs(v1v[i] - v2v[i]);
if (diff > local_residual)
local_residual = diff;
#ifdef HPCG_DETAILED_DEBUG
HPCG_fout << " Computed, exact, diff = " << v1v[i] << " " << v2v[i] << " " << diff << std::endl;
#endif
}
#endif
#ifndef HPCG_NO_MPI
// Use MPI's reduce function to collect all partial maxima (MPI_MAX -- this is
// an inf-norm, not a sum).
double global_residual = 0;
MPI_Allreduce(&local_residual, &global_residual, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
residual = global_residual;
#else
residual = local_residual;
#endif
return 0;
}

19
src/ComputeResidual.hpp Normal file
View File

@@ -0,0 +1,19 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTERESIDUAL_HPP
#define COMPUTERESIDUAL_HPP
#include "Vector.hpp"
// Inf-norm of (v1 - v2), reduced across all ranks; see ComputeResidual.cpp.
int ComputeResidual(const local_int_t n, const Vector& v1, const Vector& v2, double& residual);
#endif // COMPUTERESIDUAL_HPP

View File

@@ -0,0 +1,75 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeRestriction.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeRestriction.hpp"
/*!
Routine to compute the coarse residual vector.
@param[inout] A - Sparse matrix object containing pointers to mgData->Axf, the fine grid matrix-vector product and
mgData->rc the coarse residual vector.
@param[in] rf - Fine grid RHS.
Note that the fine grid residual is never explicitly constructed.
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeRestriction(const SparseMatrix& A, const Vector& rf)
{
    double* Axfv = A.mgData->Axf->values;          // fine-grid A*x product
    double* rfv = rf.values;                       // fine-grid RHS
    double* rcv = A.mgData->rc->values;            // coarse residual (output)
    // Fix: removed the unused local 'f2c' (= A.mgData->f2cOperator); this
    // optimized path indexes through the permuted map A.f2cPerm instead.
    local_int_t nc = A.mgData->rc->localLength;    // number of coarse-grid points
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < nc; ++i)
    {
        // Inject the fine-grid residual r - A*x at each selected fine point.
        rcv[i] = rfv[A.f2cPerm[i]] - Axfv[A.f2cPerm[i]];
    }
    return 0;
}

View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTERESTRICTION_HPP
#define COMPUTERESTRICTION_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Optimized restriction: forms the coarse residual from rf and mgData->Axf.
int ComputeRestriction(const SparseMatrix& A, const Vector& rf);
#endif // COMPUTERESTRICTION_HPP

View File

@@ -0,0 +1,56 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeRestriction_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeRestriction_ref.hpp"
/*!
Routine to compute the coarse residual vector.
@param[inout] A - Sparse matrix object containing pointers to mgData->Axf, the fine grid matrix-vector product and
mgData->rc the coarse residual vector.
@param[in] rf - Fine grid RHS.
Note that the fine grid residual is never explicitly constructed.
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeRestriction_ref(const SparseMatrix& A, const Vector& rf)
{
double* Axfv = A.mgData->Axf->values; // fine-grid A*x product
double* rfv = rf.values; // fine-grid RHS
double* rcv = A.mgData->rc->values; // coarse residual (output)
local_int_t* f2c = A.mgData->f2cOperator; // injection map: coarse index -> fine index
local_int_t nc = A.mgData->rc->localLength; // number of coarse-grid points
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < nc; ++i)
rcv[i] = rfv[f2c[i]] - Axfv[f2c[i]]; // inject residual r - A*x at selected fine points
return 0;
}

View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTERESTRICTION_REF_HPP
#define COMPUTERESTRICTION_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Reference restriction; implementation in ComputeRestriction_ref.cpp.
int ComputeRestriction_ref(const SparseMatrix& A, const Vector& rf);
#endif // COMPUTERESTRICTION_REF_HPP

111
src/ComputeSPMV.cpp Normal file
View File

@@ -0,0 +1,111 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeSPMV.cpp
HPCG routine
*/
#include "ComputeSPMV.hpp"
#include "ComputeSPMV_ref.hpp"
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#endif
#ifdef USE_CUDA
#include "Cuda.hpp"
#include "CudaKernels.hpp"
#endif
#include "CpuKernels.hpp"
/*!
Routine to compute sparse matrix vector product y = Ax where:
Precondition: First call exchange_externals to get off-processor values of x
This routine calls the reference SpMV implementation by default, but
can be replaced by a custom, optimized routine suited for
the target system.
@param[in] A the known system matrix
@param[in] x the known vector
@param[out] y the On exit contains the result: Ax.
@return returns 0 upon success and non-zero otherwise
@see ComputeSPMV_ref
*/
int ComputeSPMV(const SparseMatrix& A, Vector& x, Vector& y)
{
double one = 1.0, zero = 0.0;
if (A.rankType == GPU)
{
// NOTE(review): the USE_CUDA guard below is commented out, so this CUDA-only
// path compiles unconditionally for GPU ranks -- confirm this is intended.
// #ifdef USE_CUDA
#ifndef HPCG_NO_MPI
// Stage this rank's boundary values into the send buffer before the local SpMV.
PackSendBufferCuda(A, x, false, copy_stream);
#endif
cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
cusparseDnVecSetValues(A.cusparseOpt.vecY, y.values_d);
// y = 1.0 * A * x + 0.0 * y over the local columns.
cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, A.cusparseOpt.matA, A.cusparseOpt.vecX,
&zero, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
#ifndef HPCG_NO_MPI
if (A.totalToBeSent > 0)
{
// Complete the halo exchange, then fold in the external-column contributions.
ExchangeHaloCuda(A, x, copy_stream);
ExtSpMVCuda((SparseMatrix&) A, one, x.values_d + A.localNumberOfRows, y.values_d);
}
#endif
cudaStreamSynchronize(stream);
// #endif
}
// NOTE(review): the CPU (USE_GRACE) branch below is commented out, so non-GPU
// ranks currently leave y untouched -- verify the CPU path is handled elsewhere
// before relying on this function for CPU ranks.
// else
// {
// #ifdef USE_GRACE
// nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, x.values);
// nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, y.values);
// nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matA,
// A.nvplSparseOpt.vecX, &zero, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
// NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvADescr);
// #ifndef HPCG_NO_MPI
// if (A.totalToBeSent > 0)
// {
// ExchangeHaloCpu(A, x);
// ExtSpMVCpu(A, A.localNumberOfRows, 1.0, x.values, y.values);
// }
// #endif
// #endif // USE_GRACE
// }
return 0;
}

22
src/ComputeSPMV.hpp Normal file
View File

@@ -0,0 +1,22 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTESPMV_HPP
#define COMPUTESPMV_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Optimized sparse matrix-vector product y = A*x; implementation in ComputeSPMV.cpp.
int ComputeSPMV(const SparseMatrix& A, Vector& x, Vector& y);
#endif // COMPUTESPMV_HPP

74
src/ComputeSPMV_ref.cpp Normal file
View File

@@ -0,0 +1,74 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeSPMV_ref.cpp
HPCG routine
*/
#include "ComputeSPMV_ref.hpp"
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include <cassert>
/*!
Routine to compute matrix vector product y = Ax where:
Precondition: First call exchange_externals to get off-processor values of x
This is the reference SPMV implementation. It CANNOT be modified for the
purposes of this benchmark.
@param[in] A the known system matrix
@param[in] x the known vector
@param[out] y the On exit contains the result: Ax.
@return returns 0 upon success and non-zero otherwise
@see ComputeSPMV
*/
int ComputeSPMV_ref(const SparseMatrix& A, Vector& x, Vector& y)
{
assert(x.localLength >= A.localNumberOfColumns); // Test vector lengths
assert(y.localLength >= A.localNumberOfRows);
#ifndef HPCG_NO_MPI
// Bring in off-processor (halo) values of x before the local computation.
ExchangeHalo(A, x);
#endif
const double* const xv = x.values;
double* const yv = y.values;
const local_int_t nrow = A.localNumberOfRows;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < nrow; i++)
{
// Row-wise traversal: yv[i] = sum over this row's nonzeros of A(i,j)*x(j).
double sum = 0.0;
const double* const cur_vals = A.matrixValues[i];
const local_int_t* const cur_inds = A.mtxIndL[i];
const int cur_nnz = A.nonzerosInRow[i];
for (int j = 0; j < cur_nnz; j++)
sum += cur_vals[j] * xv[cur_inds[j]];
yv[i] = sum;
}
return 0;
}

22
src/ComputeSPMV_ref.hpp Normal file
View File

@@ -0,0 +1,22 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTESPMV_REF_HPP
#define COMPUTESPMV_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Reference sparse matrix-vector product; implementation in ComputeSPMV_ref.cpp.
int ComputeSPMV_ref(const SparseMatrix& A, Vector& x, Vector& y);
#endif // COMPUTESPMV_REF_HPP

309
src/ComputeSYMGS.cpp Normal file
View File

@@ -0,0 +1,309 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeSYMGS.cpp
HPCG routine
*/
#ifdef USE_CUDA
#include "Cuda.hpp"
#endif
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#endif
#include "ComputeSPMV.hpp"
#include "ComputeSYMGS.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
/*!
Routine to compute one step of symmetric Gauss-Seidel:
Assumption about the structure of matrix A:
- Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i]
- Entries in row 'i' are ordered such that:
- lower triangular terms are stored before the diagonal element.
- upper triangular terms are stored after the diagonal element.
- No other assumptions are made about entry ordering.
Symmetric Gauss-Seidel notes:
- We use the input vector x as the RHS and start with an initial guess for y of all zeros.
- We perform one forward sweep. Since y is initially zero we can ignore the upper triangular terms of A.
- We then perform one back sweep.
- For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after
@param[in] A the known system matrix
@param[in] r the input vector
@param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep
with r as the RHS.
@return returns 0 upon success and non-zero otherwise
@warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out
of sync with other kernels.
@see ComputeSYMGS_ref
*/
#ifdef USE_CUDA
/*!
  GPU (cuSPARSE) implementation of one half of a symmetric Gauss-Seidel sweep.

  The phase is selected by 'step':
  - step == 1 (pre-smoothing): solves (D+L) t = r via SpSV, scales t by the
    diagonal, then solves (D+U) x = t. If a coarse level exists
    (A.mgData != 0), it additionally accumulates Axf += L*x (plus halo
    contributions from neighbor ranks) so the residual needed by the
    restriction is available afterwards.
  - step == 0 (post-smoothing): forms t = r - U*x (including halo
    contributions), solves (D+L) y = t, applies the diagonal into Axf, and
    finishes with the (D+U) solve back into x.

  @param[in]    A    the known system matrix; device-side cuSPARSE descriptors
                     and buffers must already be set up
  @param[in]    r    the input (RHS) vector; device values in r.values_d
  @param[inout] x    on entry the current iterate, on exit the smoothed iterate
  @param[in]    step selects pre- (1) or post- (0) smoothing phase
  @return 0 upon success
*/
int ComputeSYMGS_Gpu(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
{
    // Scratch vector: reuse the coarse-grid work vector Axf when available in
    // the pre-smoothing phase, otherwise fall back to the matrix temp buffer.
    double* tmp_d;
    if (step == 1 && A.mgData != 0)
    {
        tmp_d = (*A.mgData->Axf).values_d;
    }
    else
    {
        tmp_d = A.tempBuffer;
    }
    const local_int_t nrow = A.localNumberOfRows;
    double alpha = 1.0; // unit scale used by every SpSV/SpMV below
    cusparseFillMode_t fillmode_l = CUSPARSE_FILL_MODE_LOWER;
    cusparseFillMode_t fillmode_u = CUSPARSE_FILL_MODE_UPPER;
    if (step == 1)
    {
        // TRSV(D+L, r, t)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, r.values_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, tmp_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrL);
        // SPMV(D, t, t)
        SpmvDiagCuda(nrow, tmp_d, A.diagonal);
        // TRSV(D+U, t, x)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, tmp_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrU);
        if (A.mgData != 0)
        {
#ifndef HPCG_NO_MPI
            // Overlap: pack the halo of x on a side stream while the local
            // SpMV below runs on the main stream.
            cudaStreamSynchronize(stream);
            PackSendBufferCuda(A, x, false, copy_stream);
#endif
            // SPMV(L, x, t): t = t + L * x
            // (uses the outer unit 'alpha' for both scale factors; the former
            // inner re-declaration of alpha was a redundant shadow)
            cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
            cusparseDnVecSetValues(A.cusparseOpt.vecY, (*A.mgData->Axf).values_d);
            cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matL,
                A.cusparseOpt.vecX, &alpha, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
#ifndef HPCG_NO_MPI
            if (A.totalToBeSent > 0)
            {
                ExchangeHaloCuda(A, x, copy_stream);
                // Add contributions of the just-received halo entries of x.
                double one = 1.0;
                ExtSpMVCuda((SparseMatrix&) A, one, x.values_d + A.localNumberOfRows, (*A.mgData->Axf).values_d);
            }
#endif
        }
    }
    else
    { // step == 0
#ifndef HPCG_NO_MPI
        cudaStreamSynchronize(stream);
        PackSendBufferCuda(A, x, false, copy_stream);
#endif
        // SPMV(U, x, t): t = U * x
        double beta = 0.0; // overwrite Axf (the former local 'alpha' shadowed the outer one)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, (*A.mgData->Axf).values_d);
        cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matU, A.cusparseOpt.vecX,
            &beta, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
        // tmp = rv - t
        AxpbyCuda(nrow, r.values_d, (*A.mgData->Axf).values_d, tmp_d);
#ifndef HPCG_NO_MPI
        if (A.totalToBeSent > 0)
        {
            // MPI_Ibarrier --> will help improve MPI_Allreduce in dot product
            ExchangeHaloCuda(A, x, copy_stream, A.level == 0 ? 1 /*call MPI_Ibarrier*/ : 0);
            // Subtract halo contributions of U*x from tmp.
            double mone = -1.0;
            ExtSpMVCuda((SparseMatrix&) A, mone, x.values_d + A.localNumberOfRows, tmp_d);
        }
#endif
        // TRSV(D+L, r-t, x)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, tmp_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrL);
        // SPMV(D, x, t) t += D*x
        SpFmaCuda(nrow, x.values_d, A.diagonal, (*A.mgData->Axf).values_d);
        // TRSV(D+U, x, x)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, (*A.mgData->Axf).values_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrU);
    }
    return 0;
}
#endif
#ifdef USE_GRACE
// CPU (NVPL Sparse) implementation of one half of a symmetric Gauss-Seidel
// sweep, mirroring the structure of the CUDA path above:
//   step == 1: TRSV(L) -> diagonal scale -> TRSV(U); if a coarse level
//              exists, additionally accumulate Axf (+ halo contributions)
//              for the subsequent restriction.
//   step == 0: t = r - U*x (+ halo), TRSV(L), diagonal FMA, TRSV(U).
// NOTE(review): the call sequence reuses the shared vecX/vecY descriptors
// in a fixed order; do not reorder these calls.
int ComputeSYMGS_Cpu(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
{
    local_int_t nrow = A.localNumberOfRows;
    // Scratch vector: reuse the coarse-grid work vector Axf when available in
    // the pre-smoothing phase, otherwise use the matrix-owned temp buffer.
    double* temp;
    if (step == 1 && A.mgData != 0)
    {
        temp = (*A.mgData->Axf).values;
    }
    else
    {
        temp = A.tempBuffer;
    }
    double* xv = x.values;
    double* rv = r.values;
    double one = 1.0, zero = 0.0;
    nvpl_sparse_fill_mode_t fillmode_l = NVPL_SPARSE_FILL_MODE_LOWER;
    nvpl_sparse_fill_mode_t fillmode_u = NVPL_SPARSE_FILL_MODE_UPPER;
    if (step == 1)
    {
        // TRSV(L, r, x)
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, r.values);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
            A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
            A.nvplSparseOpt.spsvDescrL);
        // SPMV(D, x, t) t = D*x
        SpmvDiagCpu(nrow, A.diagonal, xv, temp);
        // TRSV(U, x, x)
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, temp);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
            A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
            A.nvplSparseOpt.spsvDescrU);
        if (A.mgData != 0)
        {
            // SPMV(L, x, t): t += L*x  (accumulate into Axf for restriction)
            nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, xv);
            nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, temp);
            nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
                A.nvplSparseOpt.vecX, &one, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvLDescr);
#ifndef HPCG_NO_MPI
            // Refresh halo entries of x, then add their contributions.
            ExchangeHaloCpu(A, x);
            if (A.totalToBeSent > 0)
            {
                ExtSpMVCpu(A, nrow, 1.0, xv, temp);
            }
#endif
        }
    }
    else if (step == 0)
    {
        // SPMV(U, x, t) t = U*x
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, xv);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, (*A.mgData->Axf).values);
        nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
            A.nvplSparseOpt.vecX, &zero, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
            NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvUDescr);
        // axpy: t = r-t
        AxpbyCpu(nrow, rv, (*A.mgData->Axf).values, temp);
#ifndef HPCG_NO_MPI
        // MPI_Ibarrier --> will help improve MPI_Allreduce in dot product
        ExchangeHaloCpu(A, x, A.level == 0 ? 1 /*call MPI_Ibarrier*/ : 0);
        if (A.totalToBeSent > 0)
        {
            // Subtract halo contributions of U*x from tmp.
            ExtSpMVCpu(A, nrow, -1.0, xv, temp);
        }
#endif
        // TRSV(L, r-t, x)
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, temp);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
            A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
            A.nvplSparseOpt.spsvDescrL);
        // SPMV(D, x, t) t += D*x
        SpFmaCpu(nrow, A.diagonal, xv, (*A.mgData->Axf).values);
        // TRSV(U, x, x)
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, (*A.mgData->Axf).values);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
            A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
            A.nvplSparseOpt.spsvDescrU);
    }
    return 0;
}
#endif // USE_GRACE
/*!
  Dispatches one symmetric Gauss-Seidel smoothing step to the GPU (cuSPARSE)
  or CPU (NVPL Sparse) implementation according to the rank type.

  @param[in]    A    the known system matrix
  @param[in]    r    the input (RHS) vector
  @param[inout] x    on entry the current iterate; on exit the smoothed iterate
  @param[in]    step selects the pre- (1) or post- (0) smoothing phase
  @return the status code of the selected implementation (0 upon success);
          0 when no implementation is compiled in for this rank type
*/
int ComputeSYMGS(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
{
    int ierr = 0; // stays 0 if the matching backend was not compiled in
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        ierr = ComputeSYMGS_Gpu(A, r, x, step);
#endif
    }
    else
    {
#ifdef USE_GRACE
        ierr = ComputeSYMGS_Cpu(A, r, x, step);
#endif
    }
    // Propagate the backend's status; previously it was silently discarded.
    return ierr;
}

39
src/ComputeSYMGS.hpp Normal file
View File

@@ -0,0 +1,39 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef COMPUTESYMGS_HPP
#define COMPUTESYMGS_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int ComputeSYMGS(const SparseMatrix& A, const Vector& r, Vector& x, bool step);
#endif // COMPUTESYMGS_HPP

110
src/ComputeSYMGS_ref.cpp Normal file
View File

@@ -0,0 +1,110 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeSYMGS_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#endif
#include "ComputeSYMGS_ref.hpp"
#include <cassert>
/*!
Computes one step of symmetric Gauss-Seidel:
Assumption about the structure of matrix A:
- Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i]
- Entries in row 'i' are ordered such that:
- lower triangular terms are stored before the diagonal element.
- upper triangular terms are stored after the diagonal element.
- No other assumptions are made about entry ordering.
Symmetric Gauss-Seidel notes:
- We use the input vector x as the RHS and start with an initial guess for y of all zeros.
- We perform one forward sweep. x should be initially zero on the first GS sweep, but we do not attempt to exploit
this fact.
- We then perform one back sweep.
- For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after
@param[in] A the known system matrix
@param[in] r the input vector
@param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep
with r as the RHS.
@warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out
of sync with other kernels.
@return returns 0 upon success and non-zero otherwise
@see ComputeSYMGS
*/
// Reference symmetric Gauss-Seidel sweep (forward then backward); kept
// unoptimized on purpose — it is the correctness baseline the optimized
// kernels are validated against.
int ComputeSYMGS_ref(const SparseMatrix& A, const Vector& r, Vector& x)
{
    assert(x.localLength == A.localNumberOfColumns); // Make sure x contains space for halo values
#ifndef HPCG_NO_MPI
    ExchangeHalo(A, x); // refresh off-rank (halo) entries of x before smoothing
#endif
    const local_int_t nrow = A.localNumberOfRows;
    double** matrixDiagonal = A.matrixDiagonal; // An array of pointers to the diagonal entries A.matrixValues
    const double* const rv = r.values;
    double* const xv = x.values;
    // Forward sweep: rows are updated in increasing order, so each row sees
    // the values of xv already updated earlier in this same sweep.
    for (local_int_t i = 0; i < nrow; i++)
    {
        const double* const currentValues = A.matrixValues[i];
        const local_int_t* const currentColIndices = A.mtxIndL[i];
        const int currentNumberOfNonzeros = A.nonzerosInRow[i];
        const double currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value
        double sum = rv[i];                                  // RHS value
        // Subtract the full row dot product, diagonal included ...
        for (int j = 0; j < currentNumberOfNonzeros; j++)
        {
            local_int_t curCol = currentColIndices[j];
            sum -= currentValues[j] * xv[curCol];
        }
        sum += xv[i] * currentDiagonal; // Remove diagonal contribution from previous loop
        xv[i] = sum / currentDiagonal;
    }
    // Now the back sweep: same update, rows in decreasing order.
    for (local_int_t i = nrow - 1; i >= 0; i--)
    {
        const double* const currentValues = A.matrixValues[i];
        const local_int_t* const currentColIndices = A.mtxIndL[i];
        const int currentNumberOfNonzeros = A.nonzerosInRow[i];
        const double currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value
        double sum = rv[i];                                  // RHS value
        for (int j = 0; j < currentNumberOfNonzeros; j++)
        {
            local_int_t curCol = currentColIndices[j];
            sum -= currentValues[j] * xv[curCol];
        }
        sum += xv[i] * currentDiagonal; // Remove diagonal contribution from previous loop
        xv[i] = sum / currentDiagonal;
    }
    return 0;
}

22
src/ComputeSYMGS_ref.hpp Normal file
View File

@@ -0,0 +1,22 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTESYMGS_REF_HPP
#define COMPUTESYMGS_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int ComputeSYMGS_ref(const SparseMatrix& A, const Vector& r, Vector& x);
#endif // COMPUTESYMGS_REF_HPP

89
src/ComputeWAXPBY.cpp Normal file
View File

@@ -0,0 +1,89 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeWAXPBY.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include "mytimer.hpp"
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#ifdef USE_CUDA
#include "Cuda.hpp"
#endif
#include "ComputeWAXPBY.hpp"
#include "ComputeWAXPBY_ref.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
#include "SparseMatrix.hpp"
/*!
Routine to compute the update of a vector with the sum of two
scaled vectors where: w = alpha*x + beta*y
This routine calls the reference WAXPBY implementation by default, but
can be replaced by a custom, optimized routine suited for
the target system.
@param[in] n the number of vector elements (on this processor)
@param[in] alpha, beta the scalars applied to x and y respectively.
@param[in] x, y the input vectors
@param[out] w the output vector
@param[out] isOptimized should be set to false if this routine uses the reference implementation (is not optimized);
otherwise leave it unchanged
@return returns 0 upon success and non-zero otherwise
@see ComputeWAXPBY_ref
*/
/*!
  Computes w = alpha*x + beta*y, dispatching to the CUDA or Grace (CPU)
  kernel according to the rank type 'rt'.

  @param[in]  n            number of local vector elements
  @param[in]  alpha, beta  scalars applied to x and y respectively
  @param[in]  x, y         input vectors
  @param[out] w            output vector
  @param[out] isOptimized  forwarded to the CPU kernel (left untouched on GPU)
  @param[in]  rt           rank type selecting the backend
  @return 0 upon success
*/
int ComputeWAXPBY(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
    Vector& w, bool& isOptimized, rank_type_t rt)
{
    const bool runsOnGpu = (rt == GPU);
    if (!runsOnGpu)
    {
#ifdef USE_GRACE
        ComputeWAXPBYCpu(n, alpha, x, beta, y, w, isOptimized);
#endif
    }
    else
    {
#ifdef USE_CUDA
        ComputeWAXPBYCuda(n, alpha, x, beta, y, w);
#endif
    }
    return 0;
}

39
src/ComputeWAXPBY.hpp Normal file
View File

@@ -0,0 +1,39 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef COMPUTEWAXPBY_HPP
#define COMPUTEWAXPBY_HPP
#include "Vector.hpp"
int ComputeWAXPBY(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
Vector& w, bool& isOptimized, rank_type_t rt);
#endif // COMPUTEWAXPBY_HPP

79
src/ComputeWAXPBY_ref.cpp Normal file
View File

@@ -0,0 +1,79 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeWAXPBY_ref.cpp
HPCG routine
*/
#include "ComputeWAXPBY_ref.hpp"
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include <cassert>
/*!
Routine to compute the update of a vector with the sum of two
scaled vectors where: w = alpha*x + beta*y
This is the reference WAXPBY implementation. It CANNOT be modified for the
purposes of this benchmark.
@param[in] n the number of vector elements (on this processor)
@param[in] alpha, beta the scalars applied to x and y respectively.
@param[in] x, y the input vectors
@param[out] w the output vector.
@return returns 0 upon success and non-zero otherwise
@see ComputeWAXPBY
*/
// Reference w = alpha*x + beta*y. Per the HPCG rules this routine must not
// be modified; the alpha==1 / beta==1 special cases merely skip one multiply
// per element.
int ComputeWAXPBY_ref(
    const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w)
{
    assert(x.localLength >= n); // Test vector lengths
    assert(y.localLength >= n);
    const double* const xv = x.values;
    const double* const yv = y.values;
    double* const wv = w.values;
    if (alpha == 1.0)
    {
        // w = x + beta*y
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < n; i++)
            wv[i] = xv[i] + beta * yv[i];
    }
    else if (beta == 1.0)
    {
        // w = alpha*x + y
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < n; i++)
            wv[i] = alpha * xv[i] + yv[i];
    }
    else
    {
        // General case: w = alpha*x + beta*y
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < n; i++)
            wv[i] = alpha * xv[i] + beta * yv[i];
    }
    return 0;
}

20
src/ComputeWAXPBY_ref.hpp Normal file
View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEWAXPBY_REF_HPP
#define COMPUTEWAXPBY_REF_HPP
#include "Vector.hpp"
int ComputeWAXPBY_ref(
const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w);
#endif // COMPUTEWAXPBY_REF_HPP

1351
src/CpuKernels.cpp Normal file

File diff suppressed because it is too large Load Diff

92
src/CpuKernels.hpp Normal file
View File

@@ -0,0 +1,92 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CPUKERNELS_HPP
#define CPUKERNELS_HPP
#ifdef USE_GRACE
#include <nvpl_sparse.h>
extern nvpl_sparse_handle_t nvpl_sparse_handle;
#include "SparseMatrix.hpp"
#include "Vector.hpp"
#include <algorithm>
#include <random>
#include <vector>
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
///////// Deallocate CPU Memory for data structures //
void DeleteMatrixCpu(SparseMatrix& A);
///////// Find the size of CPU reference allocated memory //
size_t EstimateCpuRefMem(SparseMatrix& A);
/*
Translation of a 3D point in all directions
27 possibilities
*/
constexpr int tid2indCpu[32][4] = {{-1, -1, -1, 0}, {0, -1, -1, 0}, {1, -1, -1, 0}, {-1, 0, -1, 0}, {0, 0, -1, 0},
{1, 0, -1, 0}, {-1, 1, -1, 0}, {0, 1, -1, 0}, {1, 1, -1, 0}, {-1, -1, 0, 0}, {0, -1, 0, 0}, {1, -1, 0, 0},
{-1, 0, 0, 0}, {0, 0, 0, 0}, {1, 0, 0, 0}, {-1, 1, 0, 0}, {0, 1, 0, 0}, {1, 1, 0, 0}, {-1, -1, 1, 0}, {0, -1, 1, 0},
{1, -1, 1, 0}, {-1, 0, 1, 0}, {0, 0, 1, 0}, {1, 0, 1, 0}, {-1, 1, 1, 0}, {0, 1, 1, 0}, {1, 1, 1, 0}, {0, 0, 0, 0},
{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
// Generate Problem
// Inclusive Prefix Sum
void PrefixsumCpu(int* x, int N);
// Optimize Problem
size_t AllocateMemCpu(SparseMatrix& A_in);
void ColorMatrixCpu(SparseMatrix& A, int* num_colors);
void CreateSellPermCpu(SparseMatrix& A);
void F2cPermCpu(local_int_t nrow_c, local_int_t* f2c, local_int_t* f2c_perm, local_int_t* perm_f, local_int_t* iperm_c);
// Permute a vector using coloring buffer
void PermVectorCpu(local_int_t* perm, Vector& x, local_int_t length);
// Test CG
void ReplaceMatrixDiagonalCpu(SparseMatrix& A, Vector diagonal);
// CG Support Kernels
// Dot-product Per single rank
void ComputeDotProductCpu(const local_int_t n, const Vector& x, const Vector& y, double& result, bool& isOptimized);
// WAXPBY
int ComputeWAXPBYCpu(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
Vector& w, bool& isOptimized);
// SYMGS
void SpmvDiagCpu(local_int_t n, const double* x, double* y, double* z);
void AxpbyCpu(local_int_t n, double* x, double* y, double* z);
void SpFmaCpu(local_int_t n, const double* x, double* y, double* z);
// External Matrix SpMV + Scatter
void ExtSpMVCpu(const SparseMatrix& A, const local_int_t n, const double alpha, const double* x, double* y);
#endif // USE_GRACE
#endif // CPUKERNELS_HPP

87
src/Cuda.hpp Normal file
View File

@@ -0,0 +1,87 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef USE_CUDA
#include "cublas_v2.h"
#include "cuda_runtime_api.h"
#include "cusparse.h"
#include <cuda.h>
#ifdef USE_NCCL
#include "nccl.h"
#endif
#ifdef USE_NVTX
#include <nvToolsExt.h>
#endif
#include <unistd.h>
extern cusparseHandle_t cusparsehandle;
extern cublasHandle_t cublashandle;
extern cudaStream_t stream;
extern cudaEvent_t copy_done;
extern cudaStream_t copy_stream;
extern int* ranktoId; // DEV:Compress rank in MPI_WORLD to Neighbors
extern int* rankToId_h; // HOST:Compress rank in MPI_WORLD to Neighbors
extern int* idToRank_h;
extern bool Use_Compression; /*USE CUDA L2 compression*/
extern bool Use_Hpcg_Mem_Reduction; /*USE HPCG aggresive memory reduction*/
#endif
#ifdef USE_CUDA
#define CHECK_CUDART(x) \
do \
{ \
cudaError_t res = (x); \
if (res != cudaSuccess) \
{ \
char rank_name[1024]; \
gethostname(rank_name, 1024); \
fprintf(stderr, "CUDART: %s = %d (%s) on %s at (%s:%d)\n", #x, res, cudaGetErrorString(res), rank_name, \
__FILE__, __LINE__); \
exit(1); \
} \
} while (0)
// IF NVTX is needed for profiling, please define USE_NVTX
// Then, add PUSH_RANGE and POP_RANGE around the target code block
// See, https://developer.nvidia.com/blog/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/
// #define USE_NVTX
#ifdef USE_NVTX
const uint32_t colors[] = {0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff};
const int num_colors = sizeof(colors) / sizeof(uint32_t);
#define PUSH_RANGE(name, cid) \
{ \
int color_id = cid; \
color_id = color_id % num_colors; \
nvtxEventAttributes_t eventAttrib = {0}; \
eventAttrib.version = NVTX_VERSION; \
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
eventAttrib.colorType = NVTX_COLOR_ARGB; \
eventAttrib.color = colors[color_id]; \
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
eventAttrib.message.ascii = name; \
nvtxRangePushEx(&eventAttrib); \
}
#define POP_RANGE nvtxRangePop();
#else
#define PUSH_RANGE(name, cid) \
{ \
}
#define POP_RANGE
#endif
#endif

2613
src/CudaKernels.cu Normal file

File diff suppressed because it is too large Load Diff

92
src/CudaKernels.hpp Normal file
View File

@@ -0,0 +1,92 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef USE_CUDA
#include "SparseMatrix.hpp"
///////// L2 Memory Compression Allocation Support Routines //
cudaError_t setProp(CUmemAllocationProp* prop);
cudaError_t cudaMallocCompressible(void** adr, size_t size);
cudaError_t cudaFreeCompressible(void* ptr, size_t size);
///////// Allocate CUDA Memory for data structures //
local_int_t EstimateLUmem(local_int_t n, local_int_t padded_n, local_int_t level);
void AllocateMemCuda(SparseMatrix& A_in);
void AllocateMemOptCuda(SparseMatrix& A_in);
///////// Deallocate CUDA Memory for data structures //
void DeleteMatrixGpu(SparseMatrix& A);
///////// Genrerate Problem //
void GenerateProblemCuda(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
// Halo Exchange
void SetupHaloCuda(SparseMatrix& A, local_int_t sendbufld, local_int_t* sendlen, local_int_t* sendbuff,
local_int_t* tot_to_send, int* nneighs, int* neighs_h, local_int_t* sendlen_h, local_int_t** elem_to_send_d);
void ExtToLocMapCuda(
local_int_t localNumberOfRows, local_int_t str, local_int_t end, local_int_t* extToLocMap, local_int_t* eltsToRecv);
void ExtTolocCuda(local_int_t localNumberOfRows, int neighborId, local_int_t ext_nnz, local_int_t* csr_ext_columns,
double* csr_ext_values, local_int_t* ext2csr_offsets, local_int_t* extToLocMap, local_int_t* csrColumns);
void PackSendBufferCuda(const SparseMatrix& A, Vector& x, bool cpu_data, cudaStream_t stream1);
void ExchangeHaloCuda(const SparseMatrix& A, Vector& x, cudaStream_t stream1, int use_ibarrier = 0);
// Optimize Problem
void SetVectorAscCuda(local_int_t* arr, local_int_t n);
void ColorMatrixCuda(double* A_vals, local_int_t* A_col, local_int_t* nnzPerRow, local_int_t rows, local_int_t* color,
int* num_colors, int* count_colors, int max_colors, local_int_t* ref2opt, local_int_t* opt2ref, int rank, int nx,
int* rowhash);
void PermElemToSendCuda(local_int_t totalToBeSent, local_int_t* elementsToSend, local_int_t* perm);
void EllPermColumnsValuesCuda(local_int_t localNumberOfRows, local_int_t* nnzPerRow, local_int_t* csrColumns,
double* csrValues, local_int_t* permOffsets, local_int_t* permColumns, double* permValues, local_int_t* opt2ref,
local_int_t* ref2opt, local_int_t* diagonalIdx, local_int_t* permLOffsets, local_int_t* permUOffsets, bool diag);
void TransposeCuda(local_int_t n, local_int_t slice_size, local_int_t* sellCollIndex, double* sellValues);
void EllMaxRowLenPerBlockCuda(local_int_t nrow, int sliceSize, local_int_t* sellLPermOffsets,
local_int_t* sellUPermOffsets, local_int_t* sellLSliceMrl, local_int_t* sellUSliceMrl);
void PrefixsumCuda(local_int_t localNumberOfRows, local_int_t* arr);
void MultiplyBySliceSizeCUDA(local_int_t nrow, int slice_size, local_int_t* arr);
void CreateAMatrixSliceOffsetsCuda(local_int_t nrow, local_int_t slice_size, local_int_t* arr);
void CreateSellLUColumnsValuesCuda(const local_int_t n, int sliceSize, local_int_t* columns, double* values,
local_int_t* sellLSliceOffset, local_int_t* sellLColumns, double* sellLValues, local_int_t* sellUSliceOffset,
local_int_t* sellUColumns, double* sellUValues, int level);
void PermVectorCuda(local_int_t* perm, Vector& x, local_int_t length);
void F2cPermCuda(local_int_t nrow_c, local_int_t* f2c, local_int_t* f2cPerm, local_int_t* permF, local_int_t* ipermC);
// Test CG
void ReplaceMatrixDiagonalCuda(SparseMatrix& A, Vector& diagonal);
void CopyMatrixDiagonalCuda(SparseMatrix& A, Vector& diagonal);
// CG Support Kernels
// 1. MG
void ComputeRestrictionCuda(const SparseMatrix& A, const Vector& r);
void ComputeProlongationCuda(const SparseMatrix& A, Vector& x);
// 2. WAXPBY
void ComputeWAXPBYCuda(
const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w);
// 3.SYMGS
void SpmvDiagCuda(local_int_t n, double* x, double* d);
void AxpbyCuda(local_int_t n, double* x, double* y, double* z);
void SpFmaCuda(local_int_t n, double* x, double* y, double* z);
// 4.External Matrix SpMV + Scatter
void ExtSpMVCuda(SparseMatrix& A, double alpha, double* x, double* y);
// Transfer Problem to CPU
size_t CopyDataToHostCuda(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
#endif

205
src/ExchangeHalo.cpp Normal file
View File

@@ -0,0 +1,205 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ExchangeHalo.cpp
HPCG routine
*/
// Compile this routine only if running with MPI
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#include "Geometry.hpp"
#include <cstdlib>
#include <mpi.h>
extern p2p_comm_mode_t P2P_Mode;
/*!
Communicates data that is at the border of the part of the domain assigned to this processor.
@param[in] A The known system matrix
@param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
non-local entries updated by other processors
*/
void ExchangeHalo(const SparseMatrix& A, Vector& x)
{
local_int_t localNumberOfRows = A.localNumberOfRows;
int num_neighbors = A.numberOfSendNeighbors;
local_int_t * receiveLength = A.receiveLength;
local_int_t * sendLength = A.sendLength;
int * neighbors = A.neighbors;
double * sendBuffer = A.sendBuffer;
local_int_t totalToBeSent = A.totalToBeSent;
local_int_t * elementsToSend = A.elementsToSend;
double * const xv = x.values;
int size, rank; // Number of MPI processes, My process ID
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//
// first post receives, these are immediate receives
// Do not wait for result to come, will do that at the
// wait call below.
//
int MPI_MY_TAG = 99;
MPI_Request * request = new MPI_Request[num_neighbors];
//
// Externals are at end of locals
//
double * x_external = (double *) xv + localNumberOfRows;
// Post receives first
// TODO: Thread this loop
for (int i = 0; i < num_neighbors; i++) {
local_int_t n_recv = receiveLength[i];
MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request+i);
x_external += n_recv;
}
//
// Fill up send buffer
//
// TODO: Thread this loop
for (local_int_t i=0; i<totalToBeSent; i++) sendBuffer[i] = xv[elementsToSend[i]];
//
// Send to each neighbor
//
// TODO: Thread this loop
for (int i = 0; i < num_neighbors; i++) {
local_int_t n_send = sendLength[i];
MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
sendBuffer += n_send;
}
//
// Complete the reads issued above
//
MPI_Status status;
// TODO: Thread this loop
for (int i = 0; i < num_neighbors; i++) {
if ( MPI_Wait(request+i, &status) ) {
std::exit(-1); // TODO: have better error exit
}
}
delete [] request;
return;
}
/*!
Communicates data that is at the border of the part of the domain assigned to this processor. A more optimized version of ExchangeHalo that is used for the Grace path.
@param[in] A The known system matrix
@param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
non-local entries updated by other processors
@param[in] use_ibarrier [Experimental] If 1, call MPI_Ibarrier after the communication is complete. A smart trick to improve MPI_Allreduce in DDOT,
by calling MPI_Ibarrier once at the last routine call in MG.
*/
void ExchangeHaloCpu(const SparseMatrix& A, Vector& x, int use_ibarrier)
{
// Extract Matrix pieces
local_int_t localNumberOfRows = A.localNumberOfRows;
int num_neighbors = A.numberOfSendNeighbors;
local_int_t* receiveLength = A.receiveLength;
local_int_t* sendLength = A.sendLength;
int* neighbors = A.neighborsPhysical;
double* sendBuffer = A.sendBuffer;
local_int_t totalToBeSent = A.totalToBeSent;
local_int_t* elementsToSend = A.elementsToSend;
if (P2P_Mode == MPI_CPU)
{
double* const xv = x.values;
double* x_external = (double*) xv + localNumberOfRows;
int MPI_MY_TAG = 99;
MPI_Request* request = new MPI_Request[num_neighbors];
// Post receives first
for (int i = 0; i < num_neighbors; i++)
{
local_int_t n_recv = receiveLength[i];
MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
x_external += n_recv;
}
for (local_int_t i = 0; i < totalToBeSent; i++)
sendBuffer[i] = xv[elementsToSend[i]];
//
// Send to each neighbor
//
for (int i = 0; i < num_neighbors; i++)
{
local_int_t n_send = sendLength[i];
MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
sendBuffer += n_send;
}
//
// Complete the reads issued above
//
MPI_Waitall(num_neighbors, request, MPI_STATUSES_IGNORE);
//[Experimental] Can improve MPI_Allreduce performance
#if 0
if (use_ibarrier == 1)
MPI_Ibarrier(MPI_COMM_WORLD, request);
#endif
delete[] request;
}
else if (P2P_Mode == MPI_CPU_All2allv)
{
double* const xv = x.values;
double* x_external = (double*) xv + localNumberOfRows;
for (local_int_t i = 0; i < totalToBeSent; i++)
sendBuffer[i] = xv[elementsToSend[i]];
MPI_Alltoallv(
sendBuffer, A.scounts, A.sdispls, MPI_DOUBLE, x_external, A.rcounts, A.rdispls, MPI_DOUBLE, MPI_COMM_WORLD);
}
return;
}
#endif
// ifndef HPCG_NO_MPI

38
src/ExchangeHalo.hpp Normal file
View File

@@ -0,0 +1,38 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef EXCHANGEHALO_HPP
#define EXCHANGEHALO_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Reference halo exchange: updates the non-local (ghost) entries of x using
// the communication pattern stored in A (Irecv / pack / Send / Wait).
void ExchangeHalo(const SparseMatrix& A, Vector& x);
// Optimized CPU-path halo exchange; dispatches on the global P2P_Mode
// (point-to-point or MPI_Alltoallv). use_ibarrier is experimental — the
// corresponding MPI_Ibarrier call is currently compiled out.
void ExchangeHaloCpu(const SparseMatrix& A, Vector& x, int use_ibarrier = 0);
#endif // EXCHANGEHALO_HPP

View File

@@ -0,0 +1,158 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file GenerateProblem.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "GenerateCoarseProblem.hpp"
#include "GenerateGeometry.hpp"
#include "GenerateProblem.hpp"
#include "SetupHalo.hpp"
#include <cassert>
#ifndef HPCG_NO_MPI
// Used to find ranks for CPU and GPU programs
extern int global_total_ranks;
extern int* physical_rank_dims;
#endif
/*!
Routine to construct a prolongation/restriction operator for a given fine grid matrix
solution (as computed by a direct solver).
@param[inout] Af - The known system matrix, on output its coarse operator, fine-to-coarse operator and auxiliary
vectors will be defined.
Note that the matrix Af is considered const because the attributes we are modifying are declared as mutable.
*/
void GenerateCoarseProblem(const SparseMatrix& Af)
{
    // Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nxf = Af.geom->nx;
    global_int_t nyf = Af.geom->ny;
    global_int_t nzf = Af.geom->nz;
    local_int_t nxc, nyc, nzc; // Coarse nx, ny, nz
    assert(nxf % 2 == 0);
    assert(nyf % 2 == 0);
    assert(nzf % 2 == 0); // Need fine grid dimensions to be divisible by 2
    nxc = nxf / 2;
    nyc = nyf / 2;
    nzc = nzf / 2;
    // Fine-to-coarse map; ownership is handed to InitializeMGData below
    // (presumably freed with the MG data — TODO confirm).
    local_int_t* f2cOperator = new local_int_t[Af.localNumberOfRows];
    local_int_t localNumberOfRows = nxc * nyc * nzc; // This is the size of our subblock
    // If this assert fails, it most likely means that local_int_t is set to int and should be set to long long
    assert(localNumberOfRows
        > 0); // Throw an exception if the number of rows is less than zero (can happen if "int" overflows)
    // Halve every rank's recorded dimensions so the global table matches the
    // coarse level before GenerateGeometry reads it. NOTE(review): this
    // mutates shared global state; it must run exactly once per level.
    for (int i = 0; i < 3 * global_total_ranks; i++)
        physical_rank_dims[i] = physical_rank_dims[i] / 2;
    // Construct the geometry and linear system
    Geometry* geomc = new Geometry;
    GenerateGeometry(Af.geom->size, Af.geom->rank, Af.geom->numThreads, nxc, nyc, nzc, Af.geom->npx, Af.geom->npy,
        Af.geom->npz, Af.geom->different_dim, geomc);
    Vector* rc = new Vector;   // coarse residual
    Vector* xc = new Vector;   // coarse solution/correction
    Vector* Axf = new Vector;  // A*x on the fine level (sized for fine columns)
    MGData* mgData = new MGData;
    if (Af.rankType == GPU)
    {
        // GPU path: the coarse matrix object already exists on Af; the
        // fine-to-coarse map was built on the device and is copied back here.
        SparseMatrix* Ac = Af.Ac;
        Ac->rankType = GPU;
        InitializeSparseMatrix(*Ac, geomc);
        GenerateProblem(*Ac, 0, 0, 0);
        SetupHalo(*Ac);
        InitializeVector(*rc, Ac->localNumberOfRows, Ac->rankType);
        InitializeVector(*xc, Ac->localNumberOfColumns, Ac->rankType);
        InitializeVector(*Axf, Af.localNumberOfColumns, Ac->rankType);
#ifdef USE_CUDA
        cudaMemcpy(f2cOperator, Af.gpuAux.f2c, sizeof(local_int_t) * localNumberOfRows, cudaMemcpyDeviceToHost);
#endif
    }
    else
    {
        // CPU path: allocate the coarse matrix and compute f2c on the host.
        SparseMatrix* Ac = new SparseMatrix;
        InitializeSparseMatrix(*Ac, geomc);
        Ac->rankType = CPU;
        (*Ac).Ac = 0;
        GenerateProblem(*Ac, 0, 0, 0);
        SetupHalo(*Ac);
        InitializeVector(*rc, Ac->localNumberOfRows, Ac->rankType);
        InitializeVector(*xc, Ac->localNumberOfColumns, Ac->rankType);
        InitializeVector(*Axf, Af.localNumberOfColumns, Ac->rankType);
        Af.Ac = Ac; // legal: Ac is a mutable member of the const Af
        // Use a parallel loop to do initial assignment:
        // distributes the physical placement of arrays of pointers across the memory system
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < localNumberOfRows; ++i)
        {
            f2cOperator[i] = 0;
        }
        // Each coarse row maps to the fine row at twice its (ix,iy,iz) index.
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < nzc * nyc * nxc; i++)
        {
            local_int_t izc = (i / (nxc * nyc));
            local_int_t iyc = (i - izc * nxc * nyc) / nxc;
            local_int_t ixc = i - (izc * nyc + iyc) * nxc;
            local_int_t izf = 2 * izc;
            local_int_t iyf = 2 * iyc;
            local_int_t ixf = 2 * ixc;
            local_int_t currentCoarseRow = izc * nxc * nyc + iyc * nxc + ixc;
            local_int_t currentFineRow = izf * nxf * nyf + iyf * nxf + ixf;
            f2cOperator[currentCoarseRow] = currentFineRow;
        }
    }
    InitializeMGData(f2cOperator, rc, xc, Axf, *mgData);
    Af.mgData = mgData; // mgData is a mutable member of the const Af
    return;
}

View File

@@ -0,0 +1,19 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef GENERATECOARSEPROBLEM_HPP
#define GENERATECOARSEPROBLEM_HPP
#include "SparseMatrix.hpp"
void GenerateCoarseProblem(const SparseMatrix& A);
#endif // GENERATECOARSEPROBLEM_HPP

801
src/GenerateGeometry.cpp Normal file
View File

@@ -0,0 +1,801 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file GenerateGeometry.cpp
HPCG routine
*/
#include <cassert>
#include <cmath>
#include <cstdlib>
#include "ComputeOptimalShapeXYZ.hpp"
#include "GenerateGeometry.hpp"
#include <cstdio>
#ifdef HPCG_DEBUG
#include "hpcg.hpp"
#include <fstream>
using std::endl;
#endif
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_MPI
// Used to find ranks for CPU and GPU programs
extern int global_total_ranks;
extern int* physical_rank_dims;
extern int* logical_rank_to_phys;
#endif
/*!
Computes the factorization of the total number of processes into a
3-dimensional process grid that is as close as possible to a cube. The
quality of the factorization depends on the prime number structure of the
total number of processes. It then stores this decomposition together with the
parallel parameters of the run in the geometry data structure.
@param[in] size total number of MPI processes
@param[in] rank this process' rank among other MPI processes
@param[in] numThreads number of OpenMP threads in this process
@param[in] nx, ny, nz number of grid points for each local block in the x, y, and z dimensions, respectively
@param[out] geom data structure that will store the above parameters and the factoring of total number of processes
into three dimensions
*/
// Level 0 Generation, we need to decide nx, ny, nz based on
// G2C ratio and npx, npy, npz
// Remap rank IDs to logical IDs to enforce 3D shape correctness when exec_mode is GPUCPU
void GenerateGeometry(HPCG_Params& params, Geometry* geom)
{
    int size = params.comm_size, rank = params.comm_rank; // Number of MPI processes, My process ID
    int nx = params.nx, ny = params.ny, nz = params.nz;
    int npx = params.npx, npy = params.npy, npz = params.npz;
    // If npx, npy, and npz are not provided by the user, find the optimal shape
    if (npx * npy * npz <= 0 || npx * npy * npz > size)
        ComputeOptimalShapeXYZ(size, npx, npy, npz);
    // When search_for_same0 is true, finds the next rank whose local problem size
    // matches rank 0's. When false, finds the next rank whose size differs from
    // rank 0's. Records the match in logical_rank_to_phys[lp] and returns the
    // physical index to resume the next search from.
    auto loop_over_ranks = [](int index, int lp, bool search_for_same0) -> int
    {
        for (int p = index; p < global_total_ranks; p++)
        {
            int nnpx = physical_rank_dims[3 * p];
            int nnpy = physical_rank_dims[3 * p + 1];
            int nnpz = physical_rank_dims[3 * p + 2];
            bool same_zero = false;
            if (nnpx == physical_rank_dims[0] && nnpy == physical_rank_dims[1] && nnpz == physical_rank_dims[2])
                same_zero = true;
            if (same_zero == search_for_same0)
            {
                logical_rank_to_phys[lp] = p;
                index = p + 1;
                break;
            }
        }
        return index;
    };
    // Shrink one local dimension according to the requested GPU/CPU work split
    // (local_problem_def and g2c). Only the branch matching this rank's type
    // fires. This logic was previously copy-pasted for each of X, Y, Z in both
    // the user-selected and the automatic paths.
    auto apply_g2c_split = [&params](int& dim)
    {
        if (params.local_problem_def == GPU_RATIO)
        {
            if (params.rank_type == CPU)
                dim = dim / params.g2c;
        }
        else if (params.local_problem_def == GPU_ABS)
        {
            if (params.rank_type == CPU)
                dim = params.g2c;
        }
        else if (params.local_problem_def == GPU_CPU_RATIO)
        {
            if (params.rank_type == CPU)
                dim = dim / params.g2c;
            if (params.rank_type == GPU)
                dim = dim - (dim / params.g2c);
        }
        else
        { /*GPU_CPU_ABS*/
            if (params.rank_type == CPU)
                dim = params.g2c;
            if (params.rank_type == GPU)
                dim = dim - params.g2c;
        }
    };
    // Here decide and broadcast nx, ny, nz
    // 1 Check for GPU and CPU execution modes
    auto user_diff_dim = NONE;
    if (params.exec_mode == GPUCPU)
    {
        // Honor the user's preferred split dimension first; a dimension is only
        // usable when the process count along it is even.
        if (params.diff_dim == Z && (npz & 1) == 0)
        {
            user_diff_dim = Z;
            apply_g2c_split(nz);
        }
        else if (params.diff_dim == Y && (npy & 1) == 0)
        {
            user_diff_dim = Y;
            apply_g2c_split(ny);
        }
        else if (params.diff_dim == X && (npx & 1) == 0)
        {
            user_diff_dim = X;
            apply_g2c_split(nx);
        }
        // Automatic partition direction, used when the user did not specify a
        // usable diff dimension: prefer Z, then Y, then X.
        if (user_diff_dim == NONE)
        {
            if ((npz & 1) == 0)
                apply_g2c_split(nz);
            else if ((npy & 1) == 0)
                apply_g2c_split(ny);
            else if ((npx & 1) == 0)
                apply_g2c_split(nx);
        }
    }
    // Now let us exchange dimensions
    int sendBuf[] = {nx, ny, nz};
#ifndef HPCG_NO_MPI
    MPI_Allgather(sendBuf, 3, MPI_INT, physical_rank_dims, 3, MPI_INT, MPI_COMM_WORLD);
#endif
    // My logical rank Id. Initialized to 0 so it is well-defined even if the
    // lookup below finds no match (it was previously read uninitialized).
    int logical_rank = 0;
    // last physical position for the rank that has the same size as 0
    int same_as_0_position = 0;
    // last physical position for the rank that does not have the same size as 0
    int not_same_as_0_position = 0;
    auto different_dim = NONE;
    bool all_same = true;
    int num_ranks_same = 1;
    int num_ranks_not_same = 0;
    int x0 = physical_rank_dims[0];
    int y0 = physical_rank_dims[1];
    int z0 = physical_rank_dims[2];
    // Count how many ranks share rank 0's local dimensions.
    for (int p = 1; p < global_total_ranks; p++)
    {
        int x = physical_rank_dims[3 * p];
        int y = physical_rank_dims[3 * p + 1];
        int z = physical_rank_dims[3 * p + 2];
        if (x != x0 || y != y0 || z != z0)
            num_ranks_not_same++;
        else
            num_ranks_same++;
    }
    if (num_ranks_not_same > 0)
        all_same = false;
    if (!all_same)
    {
        // Determine which dimension actually differs across ranks.
        // Try twice: user-based first, then automatic.
        for (int i = 0; i < 2; i++)
        {
            bool z_condition = (i == 0) ? user_diff_dim == Z && (npz & 1) == 0 : (npz & 1) == 0;
            bool y_condition = (i == 0) ? user_diff_dim == Y && (npy & 1) == 0 : (npy & 1) == 0;
            bool x_condition = (i == 0) ? user_diff_dim == X && (npx & 1) == 0 : (npx & 1) == 0;
            if (z_condition)
            { // Z is even; all ranks must agree on X and Y
                different_dim = Z;
                for (int p = 1; p < global_total_ranks; p++)
                    assert(physical_rank_dims[3 * p] == x0 && physical_rank_dims[3 * p + 1] == y0);
            }
            else if (y_condition)
            { // Y is even; all ranks must agree on X and Z
                different_dim = Y;
                for (int p = 1; p < global_total_ranks; p++)
                    assert(physical_rank_dims[3 * p] == x0 && physical_rank_dims[3 * p + 2] == z0);
            }
            else if (x_condition)
            { // X is even; all ranks must agree on Y and Z
                different_dim = X;
                for (int p = 1; p < global_total_ranks; p++)
                    assert(physical_rank_dims[3 * p + 2] == z0 && physical_rank_dims[3 * p + 1] == y0);
            }
            if (z_condition || y_condition || x_condition)
                break;
        }
    }
    // When exec_mode is GPUCPU, GPU and CPU ranks can have different dims. Therefore,
    // we must rearrange the ranks such that the 3D shape is correct: ranks whose size
    // matches rank 0 occupy the even layers along the split dimension, the others the
    // odd layers (until the same-sized ranks run out).
    int same_rank_counter = 0;
    if (different_dim != NONE)
    {
        for (int iz = 0; iz < npz; iz++)
            for (int iy = 0; iy < npy; iy++)
                for (int ix = 0; ix < npx; ix++)
                {
                    int logical_position = iz * npy * npx + iy * npx + ix;
                    // Index along the split dimension; its parity picks the layer.
                    int idx_along_diff = (different_dim == Z) ? iz : (different_dim == Y) ? iy : ix;
                    if ((idx_along_diff & 1) == 0 && same_rank_counter < num_ranks_same)
                    { // same as 0
                        same_as_0_position = loop_over_ranks(same_as_0_position, logical_position, true);
                        same_rank_counter++;
                    }
                    else
                    { // Not same as 0
                        not_same_as_0_position = loop_over_ranks(not_same_as_0_position, logical_position, false);
                    }
                }
    }
    else
    {
        // Keep rank Ids the same if all ranks have the same problem size
        for (int p = 0; p < global_total_ranks; p++)
            logical_rank_to_phys[p] = p;
    }
    // Translate my physical rank into its logical id.
    for (int p = 0; p < global_total_ranks; p++)
    {
        if (rank == logical_rank_to_phys[p])
        {
            logical_rank = p;
        }
    }
    // Now compute this process's indices in the 3D cube
    int ipz = logical_rank / (npx * npy);
    int ipy = (logical_rank - ipz * npx * npy) / npx;
    int ipx = logical_rank % npx;
#ifdef HPCG_DEBUG
    if (rank == 0)
        HPCG_fout << "size = " << size << endl
                  << "nx = " << nx << endl
                  << "ny = " << ny << endl
                  << "nz = " << nz << endl
                  << "npx = " << npx << endl
                  << "npy = " << npy << endl
                  << "npz = " << npz << endl;
    HPCG_fout << "For rank = " << rank << endl
              << "ipx = " << ipx << endl
              << "ipy = " << ipy << endl
              << "ipz = " << ipz << endl;
    assert(size >= npx * npy * npz);
#endif
    geom->size = size;
    geom->rank = rank;
    geom->logical_rank = logical_rank;
    geom->different_dim = different_dim;
    geom->numThreads = params.numThreads;
    geom->nx = nx;
    geom->ny = ny;
    geom->nz = nz;
    geom->npx = npx;
    geom->npy = npy;
    geom->npz = npz;
    geom->ipx = ipx;
    geom->ipy = ipy;
    geom->ipz = ipz;
    // These values should be defined to take into account changes in nx, ny, nz values
    // due to variable local grid sizes
    global_int_t gnx = 0;
    global_int_t gny = 0;
    global_int_t gnz = 0;
    // Find the global NX, NY, and NZ.
    // For the diff dim, accumulate the per-rank sizes along one pencil;
    // for uniform dims, just multiply the process count by the local dim.
    if (different_dim == X)
        for (int i = 0; i < npx; i++)
        {
            int r = ipz * npx * npy + ipy * npx + i;
            int p = logical_rank_to_phys[r];
            gnx += physical_rank_dims[p * 3];
        }
    else
        gnx = npx * nx;
    if (different_dim == Y)
        for (int i = 0; i < npy; i++)
        {
            int r = ipz * npx * npy + i * npx + ipx;
            int p = logical_rank_to_phys[r];
            gny += physical_rank_dims[p * 3 + 1];
        }
    else
        gny = npy * ny;
    if (different_dim == Z)
        for (int i = 0; i < npz; i++)
        {
            int r = i * npx * npy + ipy * npx + ipx;
            int p = logical_rank_to_phys[r];
            gnz += physical_rank_dims[p * 3 + 2];
        }
    else
        gnz = npz * nz;
    // Here, we find the initial global indices (gix0, giy0, and giz0)
    // for each rank based on its 3d location in the grid.
    // Also, for the diff dim find the previous and next neighbor sizes.
    // Notice, on the diff dim the previous and next neighbors have
    // a different local dimension!
    int prev_n = 0;
    int next_n = 0;
    global_int_t giz0 = 0;
    global_int_t gix0 = 0;
    global_int_t giy0 = 0;
    if (different_dim == X)
    {
        for (int i = 0; i < ipx; i++)
        {
            int r = ipz * npx * npy + ipy * npx + i;
            int p = logical_rank_to_phys[r];
            gix0 += physical_rank_dims[p * 3];
            if (i == ipx - 1)
            {
                prev_n = physical_rank_dims[p * 3];
            }
        }
        if (ipx + 1 < npx)
        {
            int r = ipz * npx * npy + ipy * npx + (ipx + 1);
            int p = logical_rank_to_phys[r];
            next_n = physical_rank_dims[p * 3];
        }
    }
    else
        gix0 = ipx * nx;
    if (different_dim == Y)
    {
        for (int i = 0; i < ipy; i++)
        {
            int r = ipz * npx * npy + i * npx + ipx;
            int p = logical_rank_to_phys[r];
            giy0 += physical_rank_dims[p * 3 + 1];
            if (i == ipy - 1)
            {
                prev_n = physical_rank_dims[p * 3 + 1];
            }
        }
        if (ipy + 1 < npy)
        {
            int r = ipz * npx * npy + (ipy + 1) * npx + ipx;
            int p = logical_rank_to_phys[r];
            next_n = physical_rank_dims[p * 3 + 1];
        }
    }
    else
        giy0 = ipy * ny;
    if (different_dim == Z)
    {
        for (int i = 0; i < ipz; i++)
        {
            int r = i * npx * npy + ipy * npx + ipx;
            int p = logical_rank_to_phys[r];
            giz0 += physical_rank_dims[p * 3 + 2];
            if (i == ipz - 1)
            {
                prev_n = physical_rank_dims[p * 3 + 2];
            }
        }
        if (ipz + 1 < npz)
        {
            int r = (ipz + 1) * npx * npy + ipy * npx + ipx;
            int p = logical_rank_to_phys[r];
            next_n = physical_rank_dims[p * 3 + 2];
        }
    }
    else
        giz0 = ipz * nz;
    // Keep these values for later
    geom->gnx = gnx;
    geom->gny = gny;
    geom->gnz = gnz;
    geom->gix0 = gix0;
    geom->giy0 = giy0;
    geom->giz0 = giz0;
    geom->previous_neighbor_dim = prev_n;
    geom->next_neighbor_dim = next_n;
    return;
}
// Simpler generation for the next/coarse levels
// Do not need to find nx, ny, nz for CPU and GPU based on parameters
// Do not need to find logical rank IDs
void GenerateGeometry(int size, int rank, int numThreads, local_int_t nx, local_int_t ny, local_int_t nz, int npx,
int npy, int npz, dim_3d_t different_dim, Geometry* geom)
{
// My logical rank Id
int logical_rank;
for (int p = 0; p < global_total_ranks; p++)
{
if (rank == logical_rank_to_phys[p])
{
logical_rank = p;
}
}
// Now compute this process's indices in the 3D cube
int ipz = logical_rank / (npx * npy);
int ipy = (logical_rank - ipz * npx * npy) / npx;
int ipx = logical_rank % npx;
#ifdef HPCG_DEBUG
if (rank == 0)
HPCG_fout << "size = " << size << endl
<< "nx = " << nx << endl
<< "ny = " << ny << endl
<< "nz = " << nz << endl
<< "npx = " << npx << endl
<< "npy = " << npy << endl
<< "npz = " << npz << endl;
HPCG_fout << "For rank = " << rank << endl
<< "ipx = " << ipx << endl
<< "ipy = " << ipy << endl
<< "ipz = " << ipz << endl;
assert(size >= npx * npy * npz);
#endif
geom->size = size;
geom->rank = rank;
geom->logical_rank = logical_rank;
geom->different_dim = different_dim;
geom->numThreads = numThreads;
geom->nx = nx;
geom->ny = ny;
geom->nz = nz;
geom->npx = npx;
geom->npy = npy;
geom->npz = npz;
geom->ipx = ipx;
geom->ipy = ipy;
geom->ipz = ipz;
// Find the global NX. NY, and NZ
// For diff dims, accumulate sequentially
// For similar dims, just multiply rank 3D location by the local dim
global_int_t gnx = 0;
global_int_t gny = 0;
global_int_t gnz = 0;
if (different_dim == X)
for (int i = 0; i < npx; i++)
{
int r = ipz * npx * npy + ipy * npx + i;
int p = logical_rank_to_phys[r];
gnx += physical_rank_dims[p * 3];
}
else
gnx = npx * nx;
if (different_dim == Y)
for (int i = 0; i < npy; i++)
{
int r = ipz * npx * npy + i * npx + ipx;
int p = logical_rank_to_phys[r];
gny += physical_rank_dims[p * 3 + 1];
}
else
gny = npy * ny;
if (different_dim == Z)
for (int i = 0; i < npz; i++)
{
int r = i * npx * npy + ipy * npx + ipx;
int p = logical_rank_to_phys[r];
gnz += physical_rank_dims[p * 3 + 2];
}
else
gnz = npz * nz;
// Here, we find the initial global indices (gix0, giy0, and giz0)
// for each rank based on its 3d location in the grid
// Also, for the diff dim find the previous and next neighbor IDs
// Notice, on the diff dims the previous and next neighbors have
// the different dimension!
int prev_n = 0;
int next_n = 0;
global_int_t giz0 = 0;
global_int_t gix0 = 0;
global_int_t giy0 = 0;
if (different_dim == X)
{
for (int i = 0; i < ipx; i++)
{
int r = ipz * npx * npy + ipy * npx + i;
int p = logical_rank_to_phys[r];
gix0 += physical_rank_dims[p * 3];
if (i == ipx - 1)
{
prev_n = physical_rank_dims[p * 3];
}
}
if (ipx + 1 < npx)
{
int r = ipz * npx * npy + ipy * npx + (ipx + 1);
int p = logical_rank_to_phys[r];
next_n = physical_rank_dims[p * 3];
}
}
else
gix0 = ipx * nx;
if (different_dim == Y)
{
for (int i = 0; i < ipy; i++)
{
int r = ipz * npx * npy + i * npx + ipx;
int p = logical_rank_to_phys[r];
giy0 += physical_rank_dims[p * 3 + 1];
if (i == ipy - 1)
{
prev_n = physical_rank_dims[p * 3 + 1];
}
}
if (ipy + 1 < npy)
{
int r = ipz * npx * npy + (ipy + 1) * npx + ipx;
int p = logical_rank_to_phys[r];
next_n = physical_rank_dims[p * 3 + 1];
}
}
else
giy0 = ipy * ny;
if (different_dim == Z)
{
for (int i = 0; i < ipz; i++)
{
int r = i * npx * npy + ipy * npx + ipx;
int p = logical_rank_to_phys[r];
giz0 += physical_rank_dims[p * 3 + 2];
if (i == ipz - 1)
{
prev_n = physical_rank_dims[p * 3 + 2];
}
}
if (ipz + 1 < npz)
{
int r = (ipz + 1) * npx * npy + ipy * npx + ipx;
int p = logical_rank_to_phys[r];
next_n = physical_rank_dims[p * 3 + 2];
}
}
else
giz0 = ipz * nz;
// Keep these values for later
geom->gnx = gnx;
geom->gny = gny;
geom->gnz = gnz;
geom->gix0 = gix0;
geom->giy0 = giy0;
geom->giz0 = giz0;
geom->previous_neighbor_dim = prev_n;
geom->next_neighbor_dim = next_n;
return;
}

39
src/GenerateGeometry.hpp Normal file
View File

@@ -0,0 +1,39 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef GENERATEGEOMETRY_HPP
#define GENERATEGEOMETRY_HPP
#include "Geometry.hpp"
#include "hpcg.hpp"
void GenerateGeometry(HPCG_Params& params, Geometry* geom);
void GenerateGeometry(int size, int rank, int numThreads, local_int_t nx, local_int_t ny, local_int_t nz, int npx,
int npy, int npz, dim_3d_t partition_by, Geometry* geom);
#endif // GENERATEGEOMETRY_HPP

404
src/GenerateProblem.cpp Normal file
View File

@@ -0,0 +1,404 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file GenerateProblem.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "mytimer.hpp"
#include "GenerateProblem.hpp"
#include "GenerateProblem_ref.hpp"
#ifdef USE_CUDA
#include "Cuda.hpp"
#include "CudaKernels.hpp"
#endif
#ifdef USE_GRACE
#include "CpuKernels.hpp"
#endif
/*!
Routine to generate a sparse matrix, right hand side, initial guess, and exact solution.
@param[in] A The generated system matrix
@param[inout] b The newly allocated and generated right hand side vector (if b!=0 on entry)
@param[inout] x The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
@param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if the xexact!=0
non-zero on entry)
@see GenerateGeometry
*/
#ifdef USE_CUDA
/// GPU-side problem generation: allocates device vectors (when requested)
/// and delegates matrix construction to GenerateProblemCuda, then fills in
/// the global/local size bookkeeping on the matrix.
/// Interface matches GenerateProblem_Cpu; b/x/xexact may be null.
void GenerateProblem_Gpu(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
    // Local copies of geometry information. Use global_int_t since the
    // products below (gnx*gny*gnz) may exceed the local integer range.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;

    local_int_t localNumberOfRows = nx * ny * nz;     // size of this rank's subblock
    global_int_t totalNumberOfRows = gnx * gny * gnz; // total grid points in the mesh

    // Allocate GPU-resident vectors only when the caller asked for them.
    if (b != 0)
        InitializeVector(*b, localNumberOfRows, GPU);
    if (x != 0)
        InitializeVector(*x, localNumberOfRows, GPU);
    if (xexact != 0)
        InitializeVector(*xexact, localNumberOfRows, GPU);

    // Builds the sparse matrix (and vectors) on the device; sets
    // A.localNumberOfNonzeros as a side effect.
    GenerateProblemCuda(A, b, x, xexact);

    // Closed-form global nonzero count for the 27-point stencil: interior
    // points contribute 27, face points 18, edge points 12, the 8 corners 8.
    global_int_t totalNumberOfNonzeros = 27LL * ((gnx - 2LL) * (gny - 2LL) * (gnz - 2LL))
        + 18LL
            * (2LL * ((gnx - 2LL) * (gny - 2LL)) + 2LL * ((gnx - 2LL) * (gnz - 2LL))
                + 2LL * ((gny - 2LL) * (gnz - 2LL)))
        + 12LL * (4LL * (gnx - 2LL) + 4LL * (gny - 2LL) + 4LL * (gnz - 2LL)) + 8LL * 8LL;

    A.title = 0;
    A.totalNumberOfRows = totalNumberOfRows;
    A.totalNumberOfNonzeros = totalNumberOfNonzeros;
    A.localNumberOfRows = localNumberOfRows;
    A.localNumberOfColumns = localNumberOfRows;
    // A.localNumberOfNonzeros was already set by GenerateProblemCuda; the
    // former copy-out/copy-back of that field was a no-op and was removed.
    return;
}
#endif
#ifdef USE_GRACE
// Neighbor rank to sequential ID and vice versa
extern int *rankToId_h, *idToRank_h;
// GenerateProblem_Cpu is called 4 times for each level
// Sometimes we need to perform actions based on the level (global across the applications)
int global_steps = 0;
/// CPU-side problem generation for the 27-point stencil.
/// Builds the local rows of A (values, global column indices, diagonal
/// pointers), the optional vectors b/x/xexact, and — on the first few calls
/// (tracked by 'global_steps') — the neighbor-rank <-> sequential-ID maps
/// rankToId_h / idToRank_h used later by the halo exchange.
void GenerateProblem_Cpu(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
    // Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;
    int npx = A.geom->npx;
    int npy = A.geom->npy;
    local_int_t localNumberOfRows = nx * ny * nz; // This is the size of our subblock
    // If this assert fails, it most likely means that the local_int_t is set to int and should be set to long long
    assert(localNumberOfRows
        > 0); // Throw an exception of the number of rows is less than zero (can happen if int overflow)
    local_int_t numberOfNonzerosPerRow
        = 27; // We are approximating a 27-point finite element/volume/difference 3D stencil
    global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    assert(totalNumberOfRows
        > 0); // Throw an exception of the number of rows is less than zero (can happen if int overflow)
    // First call overall: allocate the rank <-> sequential-ID maps once.
    if (global_steps == 0)
    {
        rankToId_h = new int[A.geom->size + 1];
        idToRank_h = new int[27];
        global_steps++;
    }
    // Allocate arrays that are of length localNumberOfRows
    local_int_t* nonzerosInRow = new local_int_t[localNumberOfRows];
    global_int_t** mtxIndG = new global_int_t*[localNumberOfRows];
    local_int_t** mtxIndL = new local_int_t*[localNumberOfRows];
    double** matrixValues = new double*[localNumberOfRows];
    double** matrixDiagonal = new double*[localNumberOfRows];
    if (b != 0)
        InitializeVector(*b, localNumberOfRows, CPU);
    if (x != 0)
        InitializeVector(*x, localNumberOfRows, CPU);
    if (xexact != 0)
        InitializeVector(*xexact, localNumberOfRows, CPU);
    double* bv = 0;
    double* xv = 0;
    double* xexactv = 0;
    if (b != 0)
        bv = b->values; // Only compute exact solution if requested
    if (x != 0)
        xv = x->values; // Only compute exact solution if requested
    if (xexact != 0)
        xexactv = xexact->values; // Only compute exact solution if requested
    A.localToGlobalMap.resize(localNumberOfRows);
    // Use a parallel loop to do initial assignment:
    // distributes the physical placement of arrays of pointers across the memory system
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
    {
        matrixValues[i] = 0;
        matrixDiagonal[i] = 0;
        mtxIndG[i] = 0;
        mtxIndL[i] = 0;
    }
    // Second call: zero the rank map before rows mark their halo neighbors.
    if (global_steps == 1)
    {
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < A.geom->size + 1; i++)
        {
            rankToId_h[i] = 0;
        }
        global_steps++;
    }
    // Now allocate the arrays pointed to (one contiguous slab per array)
    mtxIndL[0] = new local_int_t[localNumberOfRows * numberOfNonzerosPerRow];
    matrixValues[0] = new double[localNumberOfRows * numberOfNonzerosPerRow];
    mtxIndG[0] = new global_int_t[localNumberOfRows * numberOfNonzerosPerRow];
    local_int_t localNumberOfNonzeros = 0;
    local_int_t ext_nnz = 0; // nonzeros whose column lives on another rank
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction(+ : localNumberOfNonzeros) reduction(+ : ext_nnz)
#endif
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        mtxIndL[i] = mtxIndL[0] + i * numberOfNonzerosPerRow;
        matrixValues[i] = matrixValues[0] + i * numberOfNonzerosPerRow;
        mtxIndG[i] = mtxIndG[0] + i * numberOfNonzerosPerRow;
        // Decompose the flat local row index into (ix, iy, iz), x fastest.
        const local_int_t iz = (i / (nx * ny));
        const local_int_t iy = (i - iz * nx * ny) / nx;
        const local_int_t ix = i - (iz * ny + iy) * nx;
        const global_int_t gix = ix + gix0;
        const global_int_t giy = iy + giy0;
        const global_int_t giz = iz + giz0;
        local_int_t currentLocalRow = i;
        global_int_t currentGlobalRow = gix + giy * gnx + giz * gnx * gny;
        A.localToGlobalMap[currentLocalRow] = currentGlobalRow;
        char numberOfNonzerosInRow = 0; // at most 27, fits in a char
        double* currentValuePointer = matrixValues[currentLocalRow];
        global_int_t* currentIndexPointerG = mtxIndG[currentLocalRow];
        double* diagonalPointer = nullptr;
        // Go through all the neighbors around a 3D point to decide
        // which one is a halo and which one is local to the rank
        for (int k = 0; k < 27; k++)
        {
            // Neighbor global Ids
            long long int cgix = gix + tid2indCpu[k][0];
            long long int cgiy = giy + tid2indCpu[k][1];
            long long int cgiz = giz + tid2indCpu[k][2];
            // Is the global 3D point inside the global problem?
            int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
            if (ok /*Yes this a valid point globally*/)
            {
                *currentIndexPointerG++ = cgix + cgiy * gnx + cgiz * gnx * gny;
                if (k == 13)
                {
                    // k == 13 is the center of the 3x3x3 stencil: the diagonal.
                    *currentValuePointer = 26.0;
                    diagonalPointer = currentValuePointer;
                }
                else
                {
                    *currentValuePointer = -1.0;
                }
                // Rank Id in the global domain
                int ipz = cgiz / nz;
                int ipy = cgiy / ny;
                int ipx = cgix / nx;
                // For GPUCPU exec mode, when the CPU and GPU have diff dims in a direction,
                // we need to find the point rank manually, not based on its local dimension
                // but based on its physical location to the local problem
                // Note the halo size is always 1
                if (A.geom->different_dim == Z)
                {
                    long long int local = cgiz - giz0;
                    if (local >= 0 && local < nz)
                        ipz = A.geom->ipz;
                    else if (local < 0)
                        ipz = A.geom->ipz - 1;
                    else if (local >= nz)
                        ipz = A.geom->ipz + 1;
                }
                else if (A.geom->different_dim == Y)
                {
                    long long int local = cgiy - giy0;
                    if (local >= 0 && local < ny)
                        ipy = A.geom->ipy;
                    else if (local < 0)
                        ipy = A.geom->ipy - 1;
                    else if (local >= ny)
                        ipy = A.geom->ipy + 1;
                }
                else if (A.geom->different_dim == X)
                {
                    long long int local = cgix - gix0;
                    if (local >= 0 && local < nx)
                        ipx = A.geom->ipx;
                    else if (local < 0)
                        ipx = A.geom->ipx - 1;
                    else if (local >= nx)
                        ipx = A.geom->ipx + 1;
                }
                // Now, after find the point rank from the location
                // in the 3D grid (ranks domain NPXxNPYxNPZ)
                int col_rank = ipx + ipy * npx + ipz * npy * npx;
                // The neighbor point rank is diff than the current point rank
                if (A.geom->logical_rank != col_rank)
                {
                    if (global_steps == 2)
                        rankToId_h[col_rank + 1] = 1; // To find its sequential Id (will be prefix summed later)
                    ext_nnz++;
                }
                currentValuePointer++;
                numberOfNonzerosInRow++;
            }
        }
        matrixDiagonal[currentLocalRow] = diagonalPointer;
        nonzerosInRow[currentLocalRow] = numberOfNonzerosInRow;
        localNumberOfNonzeros += numberOfNonzerosInRow;
        if (b != 0)
            bv[currentLocalRow] = 26.0 - ((double) (numberOfNonzerosInRow - 1));
        if (x != 0)
            xv[currentLocalRow] = 0.0;
        if (xexact != 0)
            xexactv[currentLocalRow] = 1.0;
    }
    // Prefixsum to RakToId
    // Map physical neighbor ranks to sequential IDs
    // less memory consumption
    if (global_steps == 2)
    {
        PrefixsumCpu(rankToId_h + 1, A.geom->size);
        int counter = 1;
        for (int i = 1; i < A.geom->size + 1; i++)
        {
            if (rankToId_h[i] == counter)
            {
                idToRank_h[counter - 1] = i - 1;
                counter++;
            }
        }
        global_steps++;
    }
#ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
              << endl
              << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
              << " nonzeros." << endl;
#endif
    // Closed-form global nonzero count for the 27-point stencil: interior
    // points contribute 27, face points 18, edge points 12, the 8 corners 8.
    global_int_t totalNumberOfNonzeros = 27LL * ((gnx - 2LL) * (gny - 2LL) * (gnz - 2LL))
        + 18LL
            * (2LL * ((gnx - 2LL) * (gny - 2LL)) + 2LL * ((gnx - 2LL) * (gnz - 2LL))
                + 2LL * ((gny - 2LL) * (gnz - 2LL)))
        + 12LL * (4LL * (gnx - 2LL) + 4LL * (gny - 2LL) + 4LL * (gnz - 2LL)) + 8LL * 8LL;
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    // This assert is usually the first to fail as problem size increases beyond the 32-bit integer range.
    assert(totalNumberOfNonzeros
        > 0); // Throw an exception of the number of nonzeros is less than zero (can happen if int overflow)
    A.title = 0;
    A.totalNumberOfRows = totalNumberOfRows;
    A.totalNumberOfNonzeros = totalNumberOfNonzeros;
    A.localNumberOfRows = localNumberOfRows;
    A.localNumberOfColumns = localNumberOfRows;
    A.localNumberOfNonzeros = localNumberOfNonzeros;
    A.nonzerosInRow = nonzerosInRow;
    A.mtxIndG = mtxIndG;
    A.mtxIndL = mtxIndL;
    A.matrixValues = matrixValues;
    A.matrixDiagonal = matrixDiagonal;
    A.extNnz = ext_nnz;
    return;
}
#endif // USE_GRACE
/// Entry point for problem generation: dispatches to the GPU or CPU
/// implementation based on the matrix's rank type. b/x/xexact may be null,
/// in which case the corresponding vector is not generated.
void GenerateProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
    const bool onGpu = (A.rankType == GPU);
    if (onGpu)
    {
#ifdef USE_CUDA
        GenerateProblem_Gpu(A, b, x, xexact);
#endif
    }
    if (!onGpu)
    {
#ifdef USE_GRACE
        GenerateProblem_Cpu(A, b, x, xexact);
#endif
    }
}

20
src/GenerateProblem.hpp Normal file
View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef GENERATEPROBLEM_HPP
#define GENERATEPROBLEM_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Generates the sparse matrix A and, when the pointers are non-null, the
// right-hand side b, initial guess x, and exact solution xexact. Dispatches
// to a GPU or CPU implementation based on A.rankType.
void GenerateProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
#endif // GENERATEPROBLEM_HPP

251
src/GenerateProblem_ref.cpp Normal file
View File

@@ -0,0 +1,251 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file GenerateProblem_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG)
#include <fstream>
using std::endl;
#include "hpcg.hpp"
#endif
#include <cassert>
#include "GenerateProblem_ref.hpp"
/*!
Reference version of GenerateProblem to generate the sparse matrix, right hand side, initial guess, and exact
solution.
@param[in] A The known system matrix
@param[inout] b The newly allocated and generated right hand side vector (if b!=0 on entry)
@param[inout] x The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
@param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if the xexact!=0
non-zero on entry)
@see GenerateGeometry
*/
void GenerateProblem_ref(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
    // Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;
    local_int_t localNumberOfRows = nx * ny * nz; // This is the size of our subblock
    // If this assert fails, it most likely means that the local_int_t is set to int and should be set to long long
    assert(localNumberOfRows
        > 0); // Throw an exception of the number of rows is less than zero (can happen if int overflow)
    local_int_t numberOfNonzerosPerRow
        = 27; // We are approximating a 27-point finite element/volume/difference 3D stencil
    global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    assert(totalNumberOfRows
        > 0); // Throw an exception of the number of rows is less than zero (can happen if int overflow)
    // Allocate arrays that are of length localNumberOfRows
    local_int_t* nonzerosInRow = new local_int_t[localNumberOfRows];
    global_int_t** mtxIndG = new global_int_t*[localNumberOfRows]; // per-row global column indices
    local_int_t** mtxIndL = new local_int_t*[localNumberOfRows];   // per-row local column indices (filled later)
    double** matrixValues = new double*[localNumberOfRows];
    double** matrixDiagonal = new double*[localNumberOfRows];      // pointer into each row's diagonal entry
    if (b != 0)
        InitializeVector(*b, localNumberOfRows, CPU);
    if (x != 0)
        InitializeVector(*x, localNumberOfRows, CPU);
    if (xexact != 0)
        InitializeVector(*xexact, localNumberOfRows, CPU);
    double* bv = 0;
    double* xv = 0;
    double* xexactv = 0;
    if (b != 0)
        bv = b->values; // Only compute exact solution if requested
    if (x != 0)
        xv = x->values; // Only compute exact solution if requested
    if (xexact != 0)
        xexactv = xexact->values; // Only compute exact solution if requested
    A.localToGlobalMap.resize(localNumberOfRows);
    // Use a parallel loop to do initial assignment:
    // distributes the physical placement of arrays of pointers across the memory system
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
    {
        matrixValues[i] = 0;
        matrixDiagonal[i] = 0;
        mtxIndG[i] = 0;
        mtxIndL[i] = 0;
    }
#ifndef HPCG_CONTIGUOUS_ARRAYS
    // Now allocate the arrays pointed to (one allocation per row)
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
        mtxIndL[i] = new local_int_t[numberOfNonzerosPerRow];
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
        matrixValues[i] = new double[numberOfNonzerosPerRow];
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
        mtxIndG[i] = new global_int_t[numberOfNonzerosPerRow];
#else
    // Now allocate the arrays pointed to (single contiguous slab per array)
    mtxIndL[0] = new local_int_t[localNumberOfRows * numberOfNonzerosPerRow];
    matrixValues[0] = new double[localNumberOfRows * numberOfNonzerosPerRow];
    mtxIndG[0] = new global_int_t[localNumberOfRows * numberOfNonzerosPerRow];
    for (local_int_t i = 1; i < localNumberOfRows; ++i)
    {
        mtxIndL[i] = mtxIndL[0] + i * numberOfNonzerosPerRow;
        matrixValues[i] = matrixValues[0] + i * numberOfNonzerosPerRow;
        mtxIndG[i] = mtxIndG[0] + i * numberOfNonzerosPerRow;
    }
#endif
    local_int_t localNumberOfNonzeros = 0;
    // Each local (ix, iy, iz) grid point owns one matrix row; the 3x3x3
    // neighborhood that stays inside the global box forms its nonzeros.
    // TODO: This triply nested loop could be flattened or use nested parallelism
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t iz = 0; iz < nz; iz++)
    {
        global_int_t giz = giz0 + iz;
        for (local_int_t iy = 0; iy < ny; iy++)
        {
            global_int_t giy = giy0 + iy;
            for (local_int_t ix = 0; ix < nx; ix++)
            {
                global_int_t gix = gix0 + ix;
                local_int_t currentLocalRow = iz * nx * ny + iy * nx + ix;
                global_int_t currentGlobalRow = giz * gnx * gny + giy * gnx + gix;
#ifndef HPCG_NO_OPENMP
                // C++ std::map is not threadsafe for writing
#pragma omp critical
#endif
                A.globalToLocalMap[currentGlobalRow] = currentLocalRow;
                A.localToGlobalMap[currentLocalRow] = currentGlobalRow;
#ifdef HPCG_DETAILED_DEBUG
                HPCG_fout << " rank, globalRow, localRow = " << A.geom->rank << " " << currentGlobalRow << " "
                          << A.globalToLocalMap[currentGlobalRow] << endl;
#endif
                char numberOfNonzerosInRow = 0; // at most 27, fits in a char
                double* currentValuePointer = matrixValues[currentLocalRow]; // Pointer to current value in current row
                global_int_t* currentIndexPointerG
                    = mtxIndG[currentLocalRow]; // Pointer to current index in current row
                for (int sz = -1; sz <= 1; sz++)
                {
                    if (giz + sz > -1 && giz + sz < gnz)
                    {
                        for (int sy = -1; sy <= 1; sy++)
                        {
                            if (giy + sy > -1 && giy + sy < gny)
                            {
                                for (int sx = -1; sx <= 1; sx++)
                                {
                                    if (gix + sx > -1 && gix + sx < gnx)
                                    {
                                        global_int_t curcol = currentGlobalRow + sz * gnx * gny + sy * gnx + sx;
                                        if (curcol == currentGlobalRow)
                                        {
                                            // (sx, sy, sz) == (0, 0, 0): the diagonal entry.
                                            matrixDiagonal[currentLocalRow] = currentValuePointer;
                                            *currentValuePointer++ = 26.0;
                                        }
                                        else
                                        {
                                            *currentValuePointer++ = -1.0;
                                        }
                                        *currentIndexPointerG++ = curcol;
                                        numberOfNonzerosInRow++;
                                    } // end x bounds test
                                }     // end sx loop
                            }         // end y bounds test
                        }             // end sy loop
                    }                 // end z bounds test
                }                     // end sz loop
                nonzerosInRow[currentLocalRow] = numberOfNonzerosInRow;
#ifndef HPCG_NO_OPENMP
#pragma omp critical
#endif
                localNumberOfNonzeros += numberOfNonzerosInRow; // Protect this with an atomic
                if (b != 0)
                    bv[currentLocalRow] = 26.0 - ((double) (numberOfNonzerosInRow - 1));
                if (x != 0)
                    xv[currentLocalRow] = 0.0;
                if (xexact != 0)
                    xexactv[currentLocalRow] = 1.0;
            } // end ix loop
        }     // end iy loop
    }         // end iz loop
#ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
              << endl
              << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
              << " nonzeros." << endl;
#endif
    global_int_t totalNumberOfNonzeros = 0;
#ifndef HPCG_NO_MPI
    // Use MPI's reduce function to sum all nonzeros
#ifdef HPCG_NO_LONG_LONG
    MPI_Allreduce(&localNumberOfNonzeros, &totalNumberOfNonzeros, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
#else
    long long lnnz = localNumberOfNonzeros, gnnz = 0; // convert to 64 bit for MPI call
    MPI_Allreduce(&lnnz, &gnnz, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
    totalNumberOfNonzeros = gnnz; // Copy back
#endif
#else
    totalNumberOfNonzeros = localNumberOfNonzeros;
#endif
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    // This assert is usually the first to fail as problem size increases beyond the 32-bit integer range.
    assert(totalNumberOfNonzeros
        > 0); // Throw an exception of the number of nonzeros is less than zero (can happen if int overflow)
    A.title = 0;
    A.totalNumberOfRows = totalNumberOfRows;
    A.totalNumberOfNonzeros = totalNumberOfNonzeros;
    A.localNumberOfRows = localNumberOfRows;
    A.localNumberOfColumns = localNumberOfRows;
    A.localNumberOfNonzeros = localNumberOfNonzeros;
    A.nonzerosInRow = nonzerosInRow;
    A.mtxIndG = mtxIndG;
    A.mtxIndL = mtxIndL;
    A.matrixValues = matrixValues;
    A.matrixDiagonal = matrixDiagonal;
    return;
}

View File

@@ -0,0 +1,21 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef GENERATEPROBLEM_REF_HPP
#define GENERATEPROBLEM_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Reference (CPU, unoptimized) generator for the matrix, right-hand side,
// initial guess, and exact solution; see GenerateProblem_ref.cpp.
void GenerateProblem_ref(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
#endif // GENERATEPROBLEM_REF_HPP

207
src/Geometry.hpp Normal file
View File

@@ -0,0 +1,207 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file Geometry.hpp
HPCG data structure for problem geometry
*/
#ifndef GEOMETRY_HPP
#define GEOMETRY_HPP
/*!
This defines the type for integers that have local subdomain dimension.
Define as "long long" when local problem dimension is > 2^31
*/
// #define INDEX_64
#ifndef INDEX_64
typedef int local_int_t;
#else
typedef long long local_int_t;
#endif
/*!
This defines the type for integers that have global dimension
Define as "long long" when global problem dimension is > 2^31
*/
#ifdef HPCG_NO_LONG_LONG
typedef int global_int_t;
#else
typedef long long global_int_t;
#endif
#define HPCG_MAX_ROW_LEN 27
// Enums
// Selector for one of the three spatial axes; NONE means no axis selected.
typedef enum
{
    X = 0,
    Y = 1,
    Z = 2,
    NONE = 3
} dim_3d_t;
// Point-to-point communication transport used for the halo exchange.
typedef enum
{
    MPI_CPU,
    MPI_CUDA_AWARE,
    MPI_GPU_All2allv,
    MPI_CPU_All2allv,
    NCCL /*GPUONLY*/
} p2p_comm_mode_t;
// Whether a rank's local problem is executed on the CPU or the GPU.
typedef enum
{
    CPU,
    GPU
} rank_type_t;
// Overall execution mode of the benchmark run.
typedef enum
{
    GPUONLY = 0,
    CPUONLY = 1,
    GPUCPU = 2
} exec_mode_t;
// How the NX/NY/NZ inputs and the g2c parameter define the local problem split
// between GPU and CPU (ratio vs. absolute size, GPU-only vs. combined dims).
typedef enum
{
    GPU_RATIO = 0 /*NX, NY, NZ are local to GPU and g2c is a ratio*/,
    GPU_ABS = 1 /*NX, NY, NZ are local to GPU and g2c is absolute dimension size*/,
    GPU_CPU_RATIO = 2 /*NX, NY, NZ are local to GPU+CPU and g2c is ratio*/,
    GPU_CPU_ABS = 3 /*NX, NY, NZ are local to GPU+CPU and g2c is absolute dimension size*/
} local_problem_def_t;
// This macro should be defined if the global_int_t is not long long
// in order to stop complaints from non-C++11 compliant compilers.
// #define HPCG_NO_LONG_LONG
/*!
This is a data structure to contain all processor geometry information
*/
struct Geometry_STRUCT
{
    int size;         //!< Number of MPI processes
    int rank;         //!< This process' rank in the range [0 to size - 1]
    int logical_rank; //!< Rank used for ownership tests in the heterogeneous (GPU+CPU) setup
                      //!< NOTE(review): compared against the computed column rank in
                      //!< GenerateProblem_Cpu; presumably a renumbering of 'rank' — confirm
                      //!< against GenerateGeometry.
    int numThreads;   //!< This process' number of threads
    local_int_t nx;   //!< Number of x-direction grid points for each local subdomain
    local_int_t ny;   //!< Number of y-direction grid points for each local subdomain
    local_int_t nz;   //!< Number of z-direction grid points for each local subdomain
    int npx;          //!< Number of processors in x-direction
    int npy;          //!< Number of processors in y-direction
    int npz;          //!< Number of processors in z-direction
    int pz;           //!< partition ID of z-dimension process that starts the second region of nz values
    int npartz;       //!< Number of partitions with varying nz values
    int* partz_ids;   //!< Array of partition ids of processor in z-direction where new value of nz starts (valid values
                      //!< are 1 to npz)
    local_int_t* partz_nz; //!< Array of length npartz containing the nz values for each partition
    int ipx;               //!< Current rank's x location in the npx by npy by npz processor grid
    int ipy;               //!< Current rank's y location in the npx by npy by npz processor grid
    int ipz;               //!< Current rank's z location in the npx by npy by npz processor grid
    global_int_t gnx;      //!< Global number of x-direction grid points
    global_int_t gny;      //!< Global number of y-direction grid points
    global_int_t gnz;      //!< Global number of z-direction grid points
    global_int_t gix0;     //!< Base global x index for this rank in the npx by npy by npz processor grid
    global_int_t giy0;     //!< Base global y index for this rank in the npx by npy by npz processor grid
    global_int_t giz0;     //!< Base global z index for this rank in the npx by npy by npz processor grid
    dim_3d_t different_dim; //!< The dimension that the GPU and CPU rank are partitioned along
    int previous_neighbor_dim; //!< NOTE(review): set from 'prev_n' in GenerateGeometry; appears to describe
                               //!< the previous neighbor along the partitioned axis — confirm.
    int next_neighbor_dim;     //!< NOTE(review): set from 'next_n' in GenerateGeometry; appears to describe
                               //!< the next neighbor along the partitioned axis — confirm.
};
typedef struct Geometry_STRUCT Geometry;
/*!
Returns the rank of the MPI process that is assigned the global row index
given as the input argument.
@param[in] geom The description of the problem's geometry.
@param[in] index The global row index
@return Returns the MPI rank of the process assigned the row
*/
inline int ComputeRankOfMatrixRow(const Geometry& geom, global_int_t index)
{
    global_int_t gnx = geom.gnx;
    global_int_t gny = geom.gny;
    // Recover the global (ix, iy, iz) grid coordinates of the row
    // (rows are numbered x-fastest: index = ix + iy*gnx + iz*gnx*gny).
    global_int_t iz = index / (gny * gnx);
    global_int_t iy = (index - iz * gny * gnx) / gnx;
    global_int_t ix = index % gnx;
    // We now permit varying values for nz for any nx-by-ny plane of MPI processes.
    // npartz is the number of different groups of nx-by-ny groups of processes.
    // partz_ids is an array of length npartz where each value indicates the z process of the last process in the ith
    // nx-by-ny group. partz_nz is an array of length npartz containing the value of nz for the ith group.
    // With no variation, npartz = 1, partz_ids[0] = npz, partz_nz[0] = nz
    int ipz = 0;
    int ipartz_ids = 0;
    for (int i = 0; i < geom.npartz; ++i)
    {
        int ipart_nz = geom.partz_nz[i];
        // Number of z-process planes in this group (partz_ids entries are cumulative).
        ipartz_ids = geom.partz_ids[i] - ipartz_ids;
        if (iz <= ipart_nz * ipartz_ids)
        {
            // Row lies within this group: finish the z-process coordinate and stop.
            ipz += iz / ipart_nz;
            break;
        }
        else
        {
            // Row lies past this group: skip its z-planes and keep searching.
            ipz += ipartz_ids;
            iz -= ipart_nz * ipartz_ids;
        }
    }
    // global_int_t ipz = iz/geom.nz;
    int ipy = iy / geom.ny;
    int ipx = ix / geom.nx;
    // Flatten the (ipx, ipy, ipz) processor coordinates into the owning rank.
    int rank = ipx + ipy * geom.npx + ipz * geom.npy * geom.npx;
    return rank;
}
/*!
Destructor for geometry data.
@param[inout] data the geometry data structure whose storage is deallocated
*/
inline void DeleteGeometry(Geometry& geom)
{
    // Intentionally a no-op: partz_nz / partz_ids are no longer
    // heap-allocated here, so there is nothing to release.
    (void) geom;
}
#endif // GEOMETRY_HPP

81
src/MGData.hpp Normal file
View File

@@ -0,0 +1,81 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file MGData.hpp
HPCG data structure
*/
#ifndef MGDATA_HPP
#define MGDATA_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
#include <cassert>
// Per-level multigrid data: smoother settings, the fine-to-coarse injection
// operator, and the coarse-grid work vectors.
struct MGData_STRUCT
{
    int numberOfPresmootherSteps;  // Call ComputeSYMGS this many times prior to coarsening
    int numberOfPostsmootherSteps; // Call ComputeSYMGS this many times after coarsening
    local_int_t*
        f2cOperator; //!< 1D array containing the fine operator local IDs that will be injected into coarse space.
    Vector* rc;      // coarse grid residual vector
    Vector* xc;      // coarse grid solution vector
    Vector* Axf;     // fine grid residual vector
    /*!
      This is for storing optimized data structures created in OptimizeProblem and
      used inside optimized ComputeSPMV().
    */
    void* optimizationData;
};
typedef struct MGData_STRUCT MGData;
/*!
Constructor for the data structure of CG vectors.
@param[in] Ac - Fully-formed coarse matrix
@param[in] f2cOperator -
@param[out] data the data structure for CG vectors that will be allocated to get it ready for use in CG iterations
*/
inline void InitializeMGData(local_int_t* f2cOperator, Vector* rc, Vector* xc, Vector* Axf, MGData& data)
{
    // Wire up the injection operator and the coarse-grid work vectors.
    data.f2cOperator = f2cOperator; // Space for injection operator
    data.rc = rc;
    data.xc = xc;
    data.Axf = Axf;
    // Default smoother schedule: one sweep before and one after coarsening.
    data.numberOfPresmootherSteps = 1;
    data.numberOfPostsmootherSteps = 1;
}
/*!
Destructor for the CG vectors data.
@param[inout] data the MG data structure whose storage is deallocated
*/
inline void DeleteMGData(MGData& data)
{
    // The injection operator is a plain array.
    delete[] data.f2cOperator;
    // Release vector contents first, then the Vector objects themselves,
    // in the same order as their contents were released (Axf, rc, xc).
    Vector* work[] = {data.Axf, data.rc, data.xc};
    for (Vector* v : work)
        DeleteVector(*v);
    for (Vector* v : work)
        delete v;
}
#endif // MGDATA_HPP

66
src/MixedBaseCounter.cpp Normal file
View File

@@ -0,0 +1,66 @@
#include <map>
#include "MixedBaseCounter.hpp"
// Initializes the mixed-radix counter: digit i may count up to counts[i];
// all current digits start at zero.
// Fix: the original loop copied counts[i] for i in [0, 32) regardless of
// 'length', reading past the end of the caller's array when length < 32.
// Only 'length' entries are copied now; the remainder (including the
// terminating slot at index 'length' and index 32) is zero-filled, which the
// original guaranteed only for indices 'length' and 32.
MixedBaseCounter::MixedBaseCounter(int* counts, int length)
{
    this->length = length;
    int i;
    for (i = 0; i < length; ++i)
    {
        this->max_counts[i] = counts[i];
        this->cur_counts[i] = 0;
    }
    // terminate with 0's (digits beyond 'length' are never touched by
    // next/is_zero/product, so zeroing them is safe and removes the garbage
    // the old code left there)
    for (; i <= 32; ++i)
        this->max_counts[i] = this->cur_counts[i] = 0;
}
// Builds a "difference" counter: each digit's maximum is left's maximum
// reduced by right's current count; all current digits start at zero.
MixedBaseCounter::MixedBaseCounter(MixedBaseCounter& left, MixedBaseCounter& right)
{
    this->length = left.length;
    for (int d = 0; d < left.length; ++d)
    {
        this->max_counts[d] = left.max_counts[d] - right.cur_counts[d];
        this->cur_counts[d] = 0;
    }
}
// Advances the counter by one: bump the lowest digit and ripple the carry
// upward while digits overflow their per-digit maximum.
void MixedBaseCounter::next()
{
    for (int d = 0; d < this->length; ++d)
    {
        if (++this->cur_counts[d] <= this->max_counts[d])
            return;                  // no carry needed; done
        this->cur_counts[d] = 0;     // overflow: reset this digit, carry on
    }
}
// Returns 1 when every digit is currently zero (i.e. the counter has
// wrapped back to its starting state), otherwise 0.
int MixedBaseCounter::is_zero()
{
    for (int d = 0; d < this->length; ++d)
        if (this->cur_counts[d] != 0)
            return 0;
    return 1;
}
// Multiplies multipliers[d] into the result cur_counts[d] times for every
// digit. Returns 0 when all digits are zero (no factor was applied),
// matching the original's k-flag semantics.
int MixedBaseCounter::product(int* multipliers)
{
    int applied = 0; // becomes 1 once any factor is multiplied in
    int result = 1;
    for (int d = 0; d < this->length; ++d)
        for (int rep = 0; rep < this->cur_counts[d]; ++rep)
        {
            applied = 1;
            result *= multipliers[d];
        }
    return result * applied;
}

16
src/MixedBaseCounter.hpp Normal file
View File

@@ -0,0 +1,16 @@
// Mixed-radix counter over per-digit maxima; the field comments indicate the
// digits track prime-factor counts (used when enumerating factor combinations).
class MixedBaseCounter
{
private:
    int length;             //!< number of prime factor counts (cannot exceed 32 for a 32-bit integer)
    int max_counts[32 + 1]; //!< maximum value for prime factor counts
    int cur_counts[32 + 1]; //!< current prime factor counts
public:
    // Initializes digit maxima from 'counts' with all current digits at zero.
    MixedBaseCounter(int* counts, int length);
    // Difference counter: maxima are left.max_counts - right.cur_counts.
    MixedBaseCounter(MixedBaseCounter& left, MixedBaseCounter& right);
    // Advances to the next combination (increment with carry).
    void next();
    // Returns 1 when every digit is zero, else 0.
    int is_zero();
    // Product of multipliers[i] taken cur_counts[i] times; 0 when all digits are zero.
    int product(int* multipliers);
};

427
src/OptimizeProblem.cpp Normal file
View File

@@ -0,0 +1,427 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file OptimizeProblem.cpp
HPCG routine
*/
#include "OptimizeProblem.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
#include "Cuda.hpp"
#include "WriteProblem.hpp"
#include "mytimer.hpp"
extern bool Use_Hpcg_Mem_Reduction; /*USE HPCG aggresive memory reduction*/
/*!
Optimizes the data structures used for CG iteration to increase the
performance of the benchmark version of the preconditioned CG algorithm.
@param[inout] A The known system matrix, also contains the MG hierarchy in attributes Ac and mgData.
@param[inout] data The data structure with all necessary CG vectors preallocated
@param[inout] b The known right hand side vector
@param[inout] x The solution vector to be computed in future CG iteration
@param[inout] xexact The exact solution vector
@return returns 0 upon success and non-zero otherwise
@see GenerateGeometry
@see GenerateProblem
*/
#ifdef USE_CUDA
size_t OptimizeProblemGpu(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
{
    // Build the GPU-side optimized data structures for every level of the MG
    // hierarchy: row coloring, ref<->opt permutations, sliced-ELLPACK (SELL)
    // storage for A and its strictly-lower/upper triangles, and the cuSPARSE
    // SpMV/SpSV descriptors plus their work buffers used during the timed run.
    // Returns 0 (GPU memory accounting is reported elsewhere).
    // NOTE(review): data and xexact are currently unused here; x and b only
    // lend their device pointers to the dense-vector descriptors.
    SparseMatrix* A = &A_in;
    local_int_t numberOfMgLevels = 4; // fixed-depth V-cycle in this implementation
    local_int_t slice_size = A->slice_size;
    for (int level = 0; level < numberOfMgLevels; ++level)
    {
        const local_int_t nrow = A->localNumberOfRows;
        int totalColors = 8;
        // Initialize perm (ref2opt) and iperm (opt2ref) to the identity
        SetVectorAscCuda(A->ref2opt, nrow);
        SetVectorAscCuda(A->opt2ref, nrow);
        // Color the matrix
        int num_colors = 0;
        ColorMatrixCuda(NULL, A->gpuAux.columns, A->gpuAux.nnzPerRow, A->localNumberOfRows, A->gpuAux.color,
            &(num_colors), A->gpuAux.colorCountCpu, 8, A->ref2opt, A->opt2ref, A->geom->rank, A->geom->nx, NULL);
        // NOTE(review): the fixed value 8 is stored rather than the num_colors
        // returned by ColorMatrixCuda -- presumably the coloring always yields
        // exactly 8 colors for this stencil; confirm before changing.
        A->totalColors = totalColors;
        PermElemToSendCuda(A->totalToBeSent, A->gpuAux.elementsToSend, A->ref2opt);
        // Create (S)ELL
        local_int_t TranslateIndex = slice_size * HPCG_MAX_ROW_LEN;
        local_int_t* translated_ell_col_index = A->sellAPermColumns + TranslateIndex;
        double* translated_ell_values = A->sellAPermValues + TranslateIndex;
        EllPermColumnsValuesCuda(nrow, A->gpuAux.nnzPerRow, A->gpuAux.columns, A->gpuAux.values,
            A->gpuAux.csrAPermOffsets, translated_ell_col_index, translated_ell_values, A->opt2ref, A->ref2opt,
            A->gpuAux.sellADiagonalIdx, A->gpuAux.csrLPermOffsets, A->gpuAux.csrUPermOffsets, false);
        // Column-major blocked/sliced ELLPACK
        TransposeCuda(nrow, slice_size, A->sellAPermColumns, A->sellAPermValues);
        // Per-slice max row length
        local_int_t num_slices = (nrow + slice_size - 1) / slice_size;
        EllMaxRowLenPerBlockCuda(nrow, slice_size, A->gpuAux.csrLPermOffsets, A->gpuAux.csrUPermOffsets,
            A->sellLSliceMrl, A->sellUSliceMrl);
        // Prefix-sum the per-slice lengths into SELL slice offsets
        PrefixsumCuda(num_slices, A->sellLSliceMrl);
        MultiplyBySliceSizeCUDA(num_slices, slice_size, A->sellLSliceMrl + 1);
        PrefixsumCuda(num_slices, A->sellUSliceMrl);
        MultiplyBySliceSizeCUDA(num_slices, slice_size, A->sellUSliceMrl + 1);
        // Set the general matrix slice_offsets
        CreateAMatrixSliceOffsetsCuda(num_slices + 1, A->slice_size, A->sellASliceMrl);
        // Lower/Upper ELL variant parts
        CreateSellLUColumnsValuesCuda(nrow, slice_size, A->sellAPermColumns, A->sellAPermValues, A->sellLSliceMrl,
            A->sellLPermColumns, A->sellLPermValues, A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, level);
        local_int_t sell_slices = num_slices; // same slice count as computed above
        // Strictly-lower (== strictly-upper) nonzeros: total minus diagonal
        // (nrow) and halo (extNnz) entries, halved by symmetry.
        const local_int_t half_nnz = (A->localNumberOfNonzeros - nrow - A->extNnz) / 2;
        local_int_t sell_l_nnz = 0;
        cudaMemcpyAsync(
            &sell_l_nnz, &(A->sellLSliceMrl[sell_slices]), sizeof(local_int_t), cudaMemcpyDeviceToHost, stream);
        local_int_t sell_u_nnz = 0;
        cudaMemcpyAsync(
            &sell_u_nnz, &(A->sellUSliceMrl[sell_slices]), sizeof(local_int_t), cudaMemcpyDeviceToHost, stream);
        // FIX: ensure both counts have landed on the host before they are
        // consumed below; relying on cudaMemcpyAsync's implicit host
        // synchronization for pageable destinations is fragile.
        cudaStreamSynchronize(stream);
        auto INDEX_TYPE = CUSPARSE_INDEX_32I;
#ifdef INDEX_64 // In src/Geometry
        INDEX_TYPE = CUSPARSE_INDEX_64I;
#endif
        cusparseCreateSlicedEll(&(A->cusparseOpt.matL), nrow, nrow, half_nnz, sell_l_nnz, slice_size,
            A->sellLSliceMrl, A->sellLPermColumns, A->sellLPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
            CUDA_R_64F);
        cusparseCreateSlicedEll(&(A->cusparseOpt.matU), nrow, nrow, half_nnz, sell_u_nnz, slice_size,
            A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
            CUDA_R_64F);
        local_int_t sell_nnz = sell_slices * slice_size * HPCG_MAX_ROW_LEN;
        cusparseCreateSlicedEll(&(A->cusparseOpt.matA), nrow, nrow, A->localNumberOfNonzeros, sell_nnz, slice_size,
            A->sellASliceMrl, A->sellAPermColumns, A->sellAPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
            CUDA_R_64F);
        double alpha = 1.0, beta = 0.0;
        size_t e_buf_size = 0;
        size_t l_buf_size = 0, u_buf_size = 0, i_buf_size = 0, max_buf_size = 0;
        cusparseDnVecDescr_t dummy1, dummy2;
        cusparseCreateDnVec(&dummy1, nrow, x.values_d, CUDA_R_64F);
        cusparseCreateDnVec(&dummy2, nrow, b.values_d, CUDA_R_64F);
        cusparseCreateDnVec(&(A->cusparseOpt.vecX), nrow, x.values_d, CUDA_R_64F);
        cusparseCreateDnVec(&(A->cusparseOpt.vecY), nrow, b.values_d, CUDA_R_64F);
        max_buf_size = e_buf_size;
        // Work-buffer sizes for SpMV with L, U and the full matrix A
        cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL, dummy1,
            &beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &l_buf_size);
        cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU, dummy1,
            &beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &u_buf_size);
        cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matA, dummy1,
            &beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &i_buf_size);
        max_buf_size = std::max(std::max(i_buf_size, e_buf_size), std::max(u_buf_size, l_buf_size));
        // Triangular-solve (SpSV) setup
        size_t buffer_size_sv_l, buffer_size_sv_u;
        cusparseFillMode_t fillmode_l = CUSPARSE_FILL_MODE_LOWER;
        cusparseFillMode_t fillmode_u = CUSPARSE_FILL_MODE_UPPER;
        cusparseDiagType_t diagtype = CUSPARSE_DIAG_TYPE_NON_UNIT;
        cusparseSpSV_createDescr(&A->cusparseOpt.spsvDescrL);
        cusparseSpSV_createDescr(&A->cusparseOpt.spsvDescrU);
        cusparseSpMatSetAttribute(A->cusparseOpt.matL, CUSPARSE_SPMAT_DIAG_TYPE, &(diagtype), sizeof(diagtype));
        cusparseSpMatSetAttribute(A->cusparseOpt.matL, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        // Without aggressive memory reduction (or when nrow is not a multiple
        // of 8) the lower solve needs its own scratch buffer.
        if (!Use_Hpcg_Mem_Reduction || (nrow % 8 != 0))
        {
            cusparseSpSV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL,
                A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT,
                A->cusparseOpt.spsvDescrL, &buffer_size_sv_l);
            cudaMalloc(&A->bufferSvL, buffer_size_sv_l);
        }
        cusparseSpSV_analysis(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL,
            A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A->cusparseOpt.spsvDescrL,
            A->bufferSvL);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A->cusparseOpt.spsvDescrL, A->diagonal, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        cusparseSpMatSetAttribute(A->cusparseOpt.matU, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        if (!Use_Hpcg_Mem_Reduction || (nrow % 8 != 0))
        {
            cusparseSpSV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU,
                A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT,
                A->cusparseOpt.spsvDescrU, &buffer_size_sv_u);
            cudaMalloc(&A->bufferSvU, buffer_size_sv_u);
        }
        cusparseSpSV_analysis(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU,
            A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A->cusparseOpt.spsvDescrU,
            A->bufferSvU);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A->cusparseOpt.spsvDescrU, A->diagonal, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        if (max_buf_size > 0)
            cudaMalloc(&(A->bufferMvA), max_buf_size);
        cusparseDestroyDnVec(dummy1);
        cusparseDestroyDnVec(dummy2);
        A = A->Ac; // descend to the next-coarser level
    }
    A = &A_in;
    for (int level = 1; level < numberOfMgLevels; ++level)
    {
        // Permute the fine-to-coarse injection operator into the optimized
        // orderings of both the fine and coarse levels.
        const local_int_t nrow_c = A->Ac->localNumberOfRows;
        F2cPermCuda(nrow_c, A->gpuAux.f2c, A->f2cPerm, A->ref2opt, A->Ac->opt2ref);
        A = A->Ac;
    }
    return 0;
}
#endif
#ifdef USE_GRACE
size_t OptimizeProblemCpu(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
{
    // Build the CPU-side (NVPL Sparse) optimized data structures for every MG
    // level: matrix coloring, permutation to color order, sliced-ELLPACK
    // storage, and the SpMV/SpSV descriptors used during the timed run.
    // Returns the number of bytes retained by these structures.
    // NOTE(review): data and xexact are unused here; x and b only lend their
    // host pointers to the dense-vector descriptors.
    // Initialize data structures
    size_t mem = AllocateMemCpu(A_in);
    SparseMatrix* A = &A_in;
    local_int_t numberOfMgLevels = 4; // fixed-depth V-cycle in this implementation
    local_int_t slice_size = A->slice_size;
    for (int level = 0; level < numberOfMgLevels; ++level)
    {
        // Color the matrix
        int num_colors;
        ColorMatrixCpu(*A, &num_colors);
        A->totalColors = num_colors;
        // Compute when each color starts (running sum of rows per color)
        A->cpuAux.firstRowOfColor[0] = 0;
        for (int c = 1; c < A->totalColors; c++)
        {
            A->cpuAux.firstRowOfColor[c] = A->cpuAux.firstRowOfColor[c - 1] + A->cpuAux.nRowsWithColor[c - 1];
        }
        // Reorder the matrix
        CreateSellPermCpu(*A);
#ifndef HPCG_NO_MPI
        // Translate row IDs that will be sent to neighbours into the permuted
        // (color-ordered) numbering
#pragma omp parallel for
        for (local_int_t i = 0; i < A->totalToBeSent; i++)
        {
            local_int_t orig = A->elementsToSend[i];
            A->elementsToSend[i] = A->ref2opt[orig];
        }
#endif
        local_int_t numberOfNonzerosPerRow = HPCG_MAX_ROW_LEN;
        local_int_t nrow = A->localNumberOfRows;
        // Strictly-lower (== strictly-upper) nonzeros: total minus diagonal
        // (nrow) and halo (extNnz) entries, halved by symmetry.
        local_int_t half_nnz = (A->localNumberOfNonzeros - nrow - A->extNnz) / 2;
        local_int_t num_slices = (nrow + slice_size - 1) / slice_size;
        local_int_t sell_l_nnz = A->sellLSliceMrl[num_slices];
        local_int_t sell_u_nnz = A->sellUSliceMrl[num_slices];
        local_int_t sell_nnz = num_slices * slice_size * numberOfNonzerosPerRow;
        auto INDEX_TYPE = NVPL_SPARSE_INDEX_32I;
#ifdef INDEX_64 // In src/Geometry
        INDEX_TYPE = NVPL_SPARSE_INDEX_64I;
#endif
        // Sliced-ELL descriptors for the strictly-lower part, strictly-upper
        // part, and the full matrix.
        nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matL), nrow, nrow, half_nnz, sell_l_nnz, slice_size,
            A->sellLSliceMrl, A->sellLPermColumns, A->sellLPermValues, INDEX_TYPE, INDEX_TYPE,
            NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);
        nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matU), nrow, nrow, half_nnz, sell_u_nnz, slice_size,
            A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, INDEX_TYPE, INDEX_TYPE,
            NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);
        nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matA), nrow, nrow, A->localNumberOfNonzeros, sell_nnz,
            slice_size, A->sellASliceMrl, A->sellAPermColumns, A->sellAPermValues, INDEX_TYPE, INDEX_TYPE,
            NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);
        double alpha = 1.0, beta = 0.0;
        size_t e_buf_size = 0;
        size_t l_buf_size = 0, u_buf_size = 0, i_buf_size = 0, max_buf_size = 0;
        nvpl_sparse_create_dn_vec(&(A->nvplSparseOpt.vecX), nrow, x.values, NVPL_SPARSE_R_64F);
        nvpl_sparse_create_dn_vec(&(A->nvplSparseOpt.vecY), nrow, b.values, NVPL_SPARSE_R_64F);
        max_buf_size = e_buf_size;
        // SpMV buffer sizing
        // Lower
        nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvLDescr);
        nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
            A->nvplSparseOpt.matL, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
            NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvLDescr, &l_buf_size);
        // Upper
        nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvUDescr);
        nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
            A->nvplSparseOpt.matU, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
            NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvUDescr, &u_buf_size);
        // L+D+U (full matrix)
        nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvADescr);
        nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
            A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
            NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvADescr, &i_buf_size);
        max_buf_size = std::max(std::max(i_buf_size, e_buf_size), std::max(u_buf_size, l_buf_size));
        // Triangular-solve (SpSV) setup
        // Lower
        size_t buffer_size_sv_l, buffer_size_sv_u;
        nvpl_sparse_fill_mode_t fillmode_l = NVPL_SPARSE_FILL_MODE_LOWER;
        nvpl_sparse_fill_mode_t fillmode_u = NVPL_SPARSE_FILL_MODE_UPPER;
        nvpl_sparse_diag_type_t diagtype = NVPL_SPARSE_DIAG_TYPE_NON_UNIT;
        nvpl_sparse_spsv_create_descr(&A->nvplSparseOpt.spsvDescrL);
        nvpl_sparse_spsv_create_descr(&A->nvplSparseOpt.spsvDescrU);
        nvpl_sparse_sp_mat_set_attribute(
            A->nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_DIAG_TYPE, &(diagtype), sizeof(diagtype));
        nvpl_sparse_sp_mat_set_attribute(
            A->nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        // Keep the unpermuted diagonal so spsv_update_matrix can restore it
        // after the strictly-triangular analysis passes below.
        Vector origDiagA;
        InitializeVector(origDiagA, A->localNumberOfRows, CPU);
        CopyMatrixDiagonal(*A, origDiagA);
        // Pass strictly L, and then update the diagonal
        if (!Use_Hpcg_Mem_Reduction || A->localNumberOfRows % 8 != 0)
        {
            // General path: analyze the full matrix restricted to its lower
            // triangle and allocate a dedicated scratch buffer.
            nvpl_sparse_sp_mat_set_attribute(
                A->nvplSparseOpt.matA, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
            nvpl_sparse_spsv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, &buffer_size_sv_l);
            A->bufferSvL = new char[buffer_size_sv_l];
            mem += buffer_size_sv_l;
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, A->bufferSvL);
        }
        else
        {
            // Memory-reduced path: analyze the strict-L matrix directly.
            // NOTE(review): A->bufferSvL is used here without a visible
            // allocation in this branch -- presumably AllocateMemCpu(A_in)
            // provided it; confirm.
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matL, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, A->bufferSvL);
            nvpl_sparse_spsv_update_matrix(
                nvpl_sparse_handle, A->nvplSparseOpt.spsvDescrL, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        }
        // Pass strictly U, and then update diagonal
        nvpl_sparse_sp_mat_set_attribute(
            A->nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        if (!Use_Hpcg_Mem_Reduction || A->localNumberOfRows % 8 != 0)
        {
            nvpl_sparse_sp_mat_set_attribute(
                A->nvplSparseOpt.matA, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
            nvpl_sparse_spsv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, &buffer_size_sv_u);
            A->bufferSvU = new char[buffer_size_sv_u];
            mem += buffer_size_sv_u;
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, A->bufferSvU);
        }
        else
        {
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matU, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, A->bufferSvU);
            nvpl_sparse_spsv_update_matrix(
                nvpl_sparse_handle, A->nvplSparseOpt.spsvDescrU, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        }
        DeleteVector(origDiagA);
        //////////////////////////////////////////////////////////////////////////////////////////////////////////
        A = A->Ac; // descend to the next-coarser level
    }
    A = &A_in;
    for (int level = 1; level < numberOfMgLevels; level++)
    {
        local_int_t nrow_c = A->Ac->localNumberOfRows;
        local_int_t nrow_f = A->localNumberOfRows;
        // Permute space injector operator
        F2cPermCpu(nrow_c, A->mgData->f2cOperator, A->f2cPerm, A->ref2opt, A->Ac->opt2ref);
        A = A->Ac;
    }
    return mem;
}
#endif // USE_GRACE
size_t OptimizeProblem(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
{
    // Dispatch to the backend matching this rank's device type. A build
    // without the corresponding backend leaves the problem untouched and
    // reports zero retained bytes.
    size_t retainedBytes = 0;
    const bool isGpuRank = (A_in.rankType == GPU);
    if (isGpuRank)
    {
#ifdef USE_CUDA
        retainedBytes = OptimizeProblemGpu(A_in, data, b, x, xexact);
#endif
    }
    else
    {
#ifdef USE_GRACE
        retainedBytes = OptimizeProblemCpu(A_in, data, b, x, xexact);
#endif
    }
    return retainedBytes;
}
// Helper function (see OptimizeProblem.hpp for details)
double OptimizeProblemMemoryUse(const SparseMatrix& A)
{
    // No additional retained bytes are attributed through this hook;
    // the backend optimizers account for their own allocations.
    static_cast<void>(A); // unreferenced by design
    return 0.0;
}

30
src/OptimizeProblem.hpp Normal file
View File

@@ -0,0 +1,30 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef OPTIMIZEPROBLEM_HPP
#define OPTIMIZEPROBLEM_HPP
#include "CGData.hpp"
#include "SparseMatrix.hpp"
#include "Vector.hpp"
size_t OptimizeProblem(SparseMatrix& A, CGData& data, Vector& b, Vector& x, Vector& xexact);
// This helper function should be implemented in a non-trivial way if OptimizeProblem is non-trivial
// It should return as type double, the total number of bytes allocated and retained after calling OptimizeProblem.
// This value will be used to report Gbytes used in ReportResults (the value returned will be divided by 1000000000.0).
double OptimizeProblemMemoryUse(const SparseMatrix& A);
#endif // OPTIMIZEPROBLEM_HPP

176
src/OutputFile.cpp Normal file
View File

@@ -0,0 +1,176 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <fstream>
#include <iostream>
#include <list>
#include <sstream>
#include <string>
#include "OutputFile.hpp"
using std::string;
using std::stringstream;
using std::list;
using std::ofstream;
extern int use_output_file;
// Root-node constructor: the name/version pair becomes the report header and
// is also used to build the output file name in generate().
OutputFile::OutputFile(const string& name_arg, const string& version_arg)
    : name(name_arg)
    , version(version_arg)
    , eol("\n")
    , keySeparator("::")
{
}
// Descendant-node constructor: key/value are filled in later via
// setKeyValue(); name/version stay empty (only the root uses them).
OutputFile::OutputFile(void)
    : eol("\n")
    , keySeparator("::")
{
}
OutputFile::~OutputFile()
{
    // The tree owns its children: deleting the root recursively releases
    // every descendant through their own destructors.
    for (OutputFile* child : descendants)
        delete child;
}
// Append a key/value leaf (string value) to this node's list of descendants.
void OutputFile::add(const string& key_arg, const string& value_arg)
{
    descendants.push_back(allocKeyVal(key_arg, value_arg));
}
void OutputFile::add(const string& key_arg, double value_arg)
{
stringstream ss;
ss << value_arg;
descendants.push_back(allocKeyVal(key_arg, ss.str()));
}
void OutputFile::add(const string& key_arg, int value_arg)
{
    // std::to_string yields the same text as stream insertion for integers.
    descendants.push_back(allocKeyVal(key_arg, std::to_string(value_arg)));
}
#ifndef HPCG_NO_LONG_LONG
void OutputFile::add(const string& key_arg, long long value_arg)
{
    // std::to_string yields the same text as stream insertion for integers.
    descendants.push_back(allocKeyVal(key_arg, std::to_string(value_arg)));
}
#endif
void OutputFile::add(const string& key_arg, size_t value_arg)
{
    // std::to_string yields the same text as stream insertion for integers.
    descendants.push_back(allocKeyVal(key_arg, std::to_string(value_arg)));
}
// Set this node's own key/value pair (used on descendant nodes; the root
// prints name/version instead).
void OutputFile::setKeyValue(const string& key_arg, const string& value_arg)
{
    key = key_arg;
    value = value_arg;
}
OutputFile* OutputFile::get(const string& key_arg)
{
    // Linear scan over direct children; first match wins, null when absent.
    for (OutputFile* child : descendants)
    {
        if (child->key == key_arg)
            return child;
    }
    return nullptr;
}
string OutputFile::generateRecursive(string prefix)
{
    // Emit this node's "prefix+key=value" line, then recurse with the key
    // appended to the prefix so children print fully-qualified keys.
    string out = prefix + key + "=" + value + eol;
    const string childPrefix = prefix + key + keySeparator;
    for (OutputFile* child : descendants)
        out += child->generateRecursive(childPrefix);
    return out;
}
string OutputFile::generate(void)
{
    // Render the full report: a header line with the benchmark name/version,
    // then every key=value line of the hierarchy in insertion order. The
    // result is written to a timestamped .txt file (or stdout, depending on
    // use_output_file) and also returned to the caller.
    string result = name + "\nversion=" + version + eol;
    for (list<OutputFile*>::iterator it = descendants.begin(); it != descendants.end(); ++it)
    {
        result += (*it)->generateRecursive("");
    }
    // Timestamp for the output file name (YYYY-MM-DD_HH-MM-SS).
    time_t rawtime;
    time(&rawtime);
    tm* ptm = localtime(&rawtime);
    char sdate[64] = "unknown-date";
    if (ptm != NULL)
    {
        // use tm_mon+1 because tm_mon is 0 .. 11 instead of 1 .. 12
        // FIX: snprintf (not sprintf) bounds the write to the buffer, and the
        // localtime result is checked before being dereferenced.
        snprintf(sdate, sizeof(sdate), "%04d-%02d-%02d_%02d-%02d-%02d", ptm->tm_year + 1900, ptm->tm_mon + 1,
            ptm->tm_mday, ptm->tm_hour, ptm->tm_min, ptm->tm_sec);
    }
    string filename = name + "_" + version + "_";
    filename += string(sdate) + ".txt";
    if (use_output_file)
    {
        ofstream myfile(filename.c_str());
        myfile << result;
        myfile.close();
    }
    else
    {
        std::cout << result << std::flush;
    }
    return result;
}
OutputFile* OutputFile::allocKeyVal(const std::string& key_arg, const std::string& value_arg)
{
    // Factory for leaf nodes; ownership passes to the caller (normally a
    // parent's descendants list, released in ~OutputFile).
    OutputFile* node = new OutputFile();
    node->setKeyValue(key_arg, value_arg);
    return node;
}

161
src/OutputFile.hpp Normal file
View File

@@ -0,0 +1,161 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file Output_File.hpp
HPCG output file classes
*/
#ifndef OUTPUTFILE_HPP
#define OUTPUTFILE_HPP
#include <list>
#include <string>
//! The OutputFile class for the uniform collecting and reporting of performance data for HPCG
/*!
The OutputFile class facilitates easy collecting and reporting of
key-value-formatted data that can be then registered with the HPCG results
collection website. The keys may have hierarchy key1::key2::key3=val with
double colon :: as a separator. A sample output may look like this (note how
"major" and "micro" keys repeat with different ancestor keys):
\code
version=3.2.1alpha
version::major=3
version::minor=2
version::micro=1
version::release=alpha
axis=xyz
axis::major=x
axis::minor=y
\endcode
*/
class OutputFile
{
protected:
    std::list<OutputFile*> descendants; //!< descendant elements (owned; deleted recursively by the destructor)
    std::string name;                   //!< name of the benchmark
    std::string version;                //!< version of the benchmark
    std::string key;                    //!< the key under which the element is stored
    std::string value;                  //!< the value of the stored element
    std::string eol;                    //!< end-of-line character sequence in the output file
    std::string keySeparator;           //!< character sequence to separate keys in the output file
    //! Recursively generate output string from descendant list, and their descendants and so on
    std::string generateRecursive(std::string prefix);

public:
    //! Allocate a new key/value leaf node; the caller assumes ownership
    //! (typically by appending it to a parent's descendant list).
    static OutputFile* allocKeyVal(const std::string& key, const std::string& value);
    //! Constructor: accepts name and version as strings that are used to create a file name for printing results.
    /*!
      This constructor accepts and name and version number for the benchmark that
      are used to form a file name information for results that are generated by
      the generate() method.
      \param name (in) string containing name of the benchmark
      \param version (in) string containing the version of the benchmark
    */
    OutputFile(const std::string& name, const std::string& version);
    //! Default constructor: no-arguments accepted, should be used for descendant nodes
    /*!
      This no-argument constructor can be used for descendant nodes to provide
      key1::key2::key3=val output. Unlike the root node, descendant nodes do not
      have name and version but only store key-value pairs.
    */
    OutputFile(void);
    ~OutputFile();
    //! Create and add a descendant element with value of type "string"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.
      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, const std::string& value);
    //! Create and add a descendant element with value of type "double"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.
      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, double value);
    //! Create and add a descendant element with value of type "int"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.
      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, int value);
#ifndef HPCG_NO_LONG_LONG
    //! Create and add a descendant element with value of type "long long"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.
      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, long long value);
#endif
    //! Create and add a descendant element with value of type "size_t"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.
      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, size_t value);
    //! Key-Value setter method
    /*!
      Set the key and the value of this element.
      @param[in] key The key that identifies this element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void setKeyValue(const std::string& key, const std::string& value);
    //! Get the element in the list with the given key or return NULL if not found
    OutputFile* get(const std::string& key);
    //! Generate output string with results based on the stored key-value hierarchy
    //! (also writes it to a timestamped file or to stdout, per use_output_file)
    std::string generate(void);
};
#endif // OUTPUTFILE_HPP

79
src/ReadHpcgDat.cpp Normal file
View File

@@ -0,0 +1,79 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#include <cstdio>
#include "ReadHpcgDat.hpp"
// Consume characters until end-of-line (or EOF). A Windows "\r\n" pair is
// consumed as a single line ending. Returns the terminating character
// ('\n', '\r', or EOF).
static int SkipUntilEol(FILE* stream)
{
    int ch = fgetc(stream);
    while (ch != EOF && ch != '\n' && ch != '\r')
        ch = fgetc(stream);
    if (ch == '\r')
    { // on Windows, \r might be followed by \n
        int peek = fgetc(stream);
        if (peek == '\n' || peek == EOF)
            ch = peek;
        else
            ungetc(peek, stream);
    }
    return ch;
}
// Parse hpcg.dat: two title lines, then local grid dimensions (min 16 each),
// then run time in seconds (optional; default 30 minutes), then the process
// grid dimensions (0 = unspecified, resolved later).
// Returns 0 on success, -1 if the file cannot be opened.
int ReadHpcgDat(int* localDimensions, int* secondsPerRun, int* localProcDimensions, char* filename)
{
    FILE* hpcgStream = fopen(filename, "r");
    if (hpcgStream == NULL)
    {
        printf("Cannot open input file: %s\n", filename);
        return -1;
    }
    SkipUntilEol(hpcgStream); // skip the first title line
    SkipUntilEol(hpcgStream); // skip the second title line
    // Third line: nx ny nz, each clamped up to the minimum of 16
    for (int d = 0; d < 3; d++)
    {
        int dim;
        if (fscanf(hpcgStream, "%d", &dim) == 1 && dim >= 16)
            localDimensions[d] = dim;
        else
            localDimensions[d] = 16;
    }
    SkipUntilEol(hpcgStream); // skip the rest of the dimensions line
    if (secondsPerRun != 0)
    { // Only read number of seconds if the pointer is non-zero
        int seconds;
        if (fscanf(hpcgStream, "%d", &seconds) == 1 && seconds >= 0)
            *secondsPerRun = seconds;
        else
            *secondsPerRun = 30 * 60; // default: 30 minutes
    }
    SkipUntilEol(hpcgStream); // skip the rest of the run-time line
    // Fifth line: process grid dimensions; invalid or missing values mean
    // "not specified" and are fixed up later.
    for (int d = 0; d < 3; d++)
    {
        int procDim;
        if (fscanf(hpcgStream, "%d", &procDim) == 1 && procDim >= 1)
            localProcDimensions[d] = procDim;
        else
            localProcDimensions[d] = 0;
    }
    fclose(hpcgStream);
    return 0;
}

20
src/ReadHpcgDat.hpp Normal file
View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef READHPCGDAT_HPP
#define READHPCGDAT_HPP
int ReadHpcgDat(int* localDimensions, int* secondsPerRun, int* localProcDimensions, char* filename);
#endif // READHPCGDAT_HPP

512
src/ReportResults.cpp Normal file
View File

@@ -0,0 +1,512 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ReportResults.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#include "OptimizeProblem.hpp"
#include "OutputFile.hpp"
#include "ReportResults.hpp"
#include <vector>
#ifdef HPCG_DEBUG
#include <fstream>
using std::endl;
#include "hpcg.hpp"
#endif
extern int use_output_file;
/*!
Creates a YAML file and writes the information about the HPCG run, its results, and validity.
@param[in] geom The description of the problem's geometry.
@param[in] A The known system matrix
@param[in] numberOfMgLevels Number of levels in multigrid V cycle
@param[in] numberOfCgSets Number of CG runs performed
@param[in] niters Number of preconditioned CG iterations performed to lower the residual below a threshold
@param[in] times Vector of cumulative timings for each of the phases of a preconditioned CG iteration
@param[in] testcg_data the data structure with the results of the CG-correctness test including pass/fail
information
@param[in] testsymmetry_data the data structure with the results of the CG symmetry test including pass/fail
information
@param[in] testnorms_data the data structure with the results of the CG norm test including pass/fail information
@param[in] global_failure indicates whether a failure occurred during the correctness tests of CG
@see YAML_Doc
*/
void ReportResults(const SparseMatrix& A, int numberOfMgLevels, int numberOfCgSets, int refMaxIters, int optMaxIters,
    double times[], const TestCGData& testcg_data, const TestSymmetryData& testsymmetry_data,
    const TestNormsData& testnorms_data, int global_failure, bool quickPath)
{
    const double minOfficialTime = 1800; // Any official benchmark result must run at least this many seconds
#ifndef HPCG_NO_MPI
    // Reduce the DDOT MPI_Allreduce time (times[4]) across all ranks so rank 0
    // can report its min/max/avg variation.
    double t4 = times[4];
    double t4min = 0.0;
    double t4max = 0.0;
    double t4avg = 0.0;
    MPI_Allreduce(&t4, &t4min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
    MPI_Allreduce(&t4, &t4max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    MPI_Allreduce(&t4, &t4avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    t4avg = t4avg / ((double) A.geom->size);
#endif
    if (A.geom->rank == 0)
    { // Only PE 0 needs to compute and report timing results
        // TODO: Put the FLOP count, Memory BW and Memory Usage models into separate functions
        // ======================== FLOP count model =======================================
        double fNumberOfCgSets = numberOfCgSets;
        double fniters = fNumberOfCgSets * (double) optMaxIters;
        double fnrow = A.totalNumberOfRows;
        double fnnz = A.totalNumberOfNonzeros;
        // Op counts come from implementation of CG in CG.cpp (include 1 extra for the CG preamble ops)
        double fnops_ddot = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow; // 3 ddots with nrow adds and nrow mults
        double fnops_waxpby
            = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow; // 3 WAXPBYs with nrow adds and nrow mults
        double fnops_sparsemv = (fniters + fNumberOfCgSets) * 2.0 * fnnz; // 1 SpMV with nnz adds and nnz mults
        // Op counts from the multigrid preconditioners
        double fnops_precond = 0.0;
        const SparseMatrix* Af = &A;
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            double fnnz_Af = Af->totalNumberOfNonzeros;
            double fnumberOfPresmootherSteps = Af->mgData->numberOfPresmootherSteps;
            double fnumberOfPostsmootherSteps = Af->mgData->numberOfPostsmootherSteps;
            fnops_precond += fnumberOfPresmootherSteps * fniters * 4.0 * fnnz_Af; // number of presmoother flops
            fnops_precond += fniters * 2.0 * fnnz_Af; // cost of fine grid residual calculation
            fnops_precond += fnumberOfPostsmootherSteps * fniters * 4.0 * fnnz_Af; // number of postsmoother flops
            Af = Af->Ac; // Go to next coarse level
        }
        fnops_precond
            += fniters * 4.0 * ((double) Af->totalNumberOfNonzeros); // One symmetric GS sweep at the coarsest level
        double fnops = fnops_ddot + fnops_waxpby + fnops_sparsemv + fnops_precond;
        // Scale op count to the reference iteration count to charge for any extra optimized iterations
        double frefnops = fnops * ((double) refMaxIters) / ((double) optMaxIters);
        // ======================== Memory bandwidth model =======================================
        // Read/Write counts come from implementation of CG in CG.cpp (include 1 extra for the CG preamble ops)
        double fnreads_ddot
            = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow * sizeof(double); // 3 ddots with 2 nrow reads
        double fnwrites_ddot = (3.0 * fniters + fNumberOfCgSets) * sizeof(double); // 3 ddots with 1 write
        double fnreads_waxpby = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow
            * sizeof(double); // 3 WAXPBYs with nrow adds and nrow mults
        double fnwrites_waxpby
            = (3.0 * fniters + fNumberOfCgSets) * fnrow * sizeof(double); // 3 WAXPBYs with nrow adds and nrow mults
        double fnreads_sparsemv = (fniters + fNumberOfCgSets)
            * (fnnz * (sizeof(double) + sizeof(local_int_t))
                + fnrow * sizeof(double)); // 1 SpMV with nnz reads of values, nnz reads indices,
                                           // plus nrow reads of x
        double fnwrites_sparsemv = (fniters + fNumberOfCgSets) * fnrow * sizeof(double); // 1 SpMV nrow writes
        // Byte counts from the multigrid preconditioners
        double fnreads_precond = 0.0;
        double fnwrites_precond = 0.0;
        Af = &A;
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            double fnnz_Af = Af->totalNumberOfNonzeros;
            double fnrow_Af = Af->totalNumberOfRows;
            double fnumberOfPresmootherSteps = Af->mgData->numberOfPresmootherSteps;
            double fnumberOfPostsmootherSteps = Af->mgData->numberOfPostsmootherSteps;
            fnreads_precond += fnumberOfPresmootherSteps * fniters
                * (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t))
                    + fnrow_Af * sizeof(double)); // number of presmoother reads
            fnwrites_precond
                += fnumberOfPresmootherSteps * fniters * fnrow_Af * sizeof(double); // number of presmoother writes
            fnreads_precond += fniters
                * (fnnz_Af * (sizeof(double) + sizeof(local_int_t))
                    + fnrow_Af * sizeof(double)); // Number of reads for fine grid residual calculation
            fnwrites_precond
                += fniters * fnnz_Af * sizeof(double); // Number of writes for fine grid residual calculation
            fnreads_precond += fnumberOfPostsmootherSteps * fniters
                * (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t))
                    + fnrow_Af * sizeof(double)); // number of postsmoother reads
            fnwrites_precond
                += fnumberOfPostsmootherSteps * fniters * fnnz_Af * sizeof(double); // number of postsmoother writes
            Af = Af->Ac; // Go to next coarse level
        }
        double fnnz_Af = Af->totalNumberOfNonzeros;
        double fnrow_Af = Af->totalNumberOfRows;
        fnreads_precond += fniters
            * (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t))
                + fnrow_Af * sizeof(double)); // One symmetric GS sweep at the coarsest level
        fnwrites_precond += fniters * fnrow_Af * sizeof(double); // One symmetric GS sweep at the coarsest level
        double fnreads = fnreads_ddot + fnreads_waxpby + fnreads_sparsemv + fnreads_precond;
        double fnwrites = fnwrites_ddot + fnwrites_waxpby + fnwrites_sparsemv + fnwrites_precond;
        double frefnreads = fnreads * ((double) refMaxIters) / ((double) optMaxIters);
        double frefnwrites = fnwrites * ((double) refMaxIters) / ((double) optMaxIters);
        // ======================== Memory usage model =======================================
        // Data in GenerateProblem_ref
        double numberOfNonzerosPerRow
            = 27.0; // We are approximating a 27-point finite element/volume/difference 3D stencil
        double size = ((double) A.geom->size); // Needed for estimating size of halo
        double fnbytes = ((double) sizeof(Geometry)); // Geometry struct in main.cpp
        fnbytes += ((double) sizeof(double) * fNumberOfCgSets); // testnorms_data in main.cpp
        // Model for GenerateProblem_ref.cpp
        fnbytes += fnrow * sizeof(char); // array nonzerosInRow
        fnbytes += fnrow * ((double) sizeof(global_int_t*)); // mtxIndG
        fnbytes += fnrow * ((double) sizeof(local_int_t*)); // mtxIndL
        fnbytes += fnrow * ((double) sizeof(double*)); // matrixValues
        fnbytes += fnrow * ((double) sizeof(double*)); // matrixDiagonal
        fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(local_int_t)); // mtxIndL[1..nrows]
        fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(double)); // matrixValues[1..nrows]
        fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(global_int_t)); // mtxIndG[1..nrows]
        fnbytes += fnrow * ((double) 3 * sizeof(double)); // x, b, xexact
        // Model for CGData.hpp
        double fncol = ((global_int_t) A.localNumberOfColumns)
            * size; // Estimate of the global number of columns using the value from rank 0
        fnbytes += fnrow * ((double) 2 * sizeof(double)); // r, Ap
        fnbytes += fncol * ((double) 2 * sizeof(double)); // z, p
        std::vector<double> fnbytesPerLevel(numberOfMgLevels); // Count byte usage per level (level 0 is main CG level)
        fnbytesPerLevel[0] = fnbytes;
        // Benchmarker-provided model for OptimizeProblem.cpp
        double fnbytes_OptimizedProblem = OptimizeProblemMemoryUse(A);
        fnbytes += fnbytes_OptimizedProblem;
        Af = A.Ac;
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            double fnrow_Af = Af->totalNumberOfRows;
            double fncol_Af = ((global_int_t) Af->localNumberOfColumns)
                * size; // Estimate of the global number of columns using the value from rank 0
            double fnbytes_Af = 0.0;
            // Model for GenerateCoarseProblem.cpp
            fnbytes_Af += fnrow_Af * ((double) sizeof(local_int_t)); // f2cOperator
            fnbytes_Af += fnrow_Af * ((double) sizeof(double)); // rc
            fnbytes_Af += 2.0 * fncol_Af
                * ((double) sizeof(double)); // xc, Axf are estimated based on the size of these arrays on rank 0
            fnbytes_Af += ((double) (sizeof(Geometry) + sizeof(SparseMatrix) + 3 * sizeof(Vector)
                + sizeof(MGData))); // Account for structs geomc, Ac, rc, xc, Axf - (minor)
            // Model for GenerateProblem.cpp (called within GenerateCoarseProblem.cpp)
            fnbytes_Af += fnrow_Af * sizeof(char); // array nonzerosInRow
            fnbytes_Af += fnrow_Af * ((double) sizeof(global_int_t*)); // mtxIndG
            fnbytes_Af += fnrow_Af * ((double) sizeof(local_int_t*)); // mtxIndL
            fnbytes_Af += fnrow_Af * ((double) sizeof(double*)); // matrixValues
            fnbytes_Af += fnrow_Af * ((double) sizeof(double*)); // matrixDiagonal
            fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(local_int_t)); // mtxIndL[1..nrows]
            fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(double)); // matrixValues[1..nrows]
            fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(global_int_t)); // mtxIndG[1..nrows]
            // Model for SetupHalo_ref.cpp
#ifndef HPCG_NO_MPI
            fnbytes_Af += ((double) sizeof(double) * Af->totalToBeSent); // sendBuffer
            fnbytes_Af += ((double) sizeof(local_int_t) * Af->totalToBeSent); // elementsToSend
            fnbytes_Af += ((double) sizeof(int) * Af->numberOfSendNeighbors); // neighbors
            fnbytes_Af += ((double) sizeof(local_int_t) * Af->numberOfSendNeighbors); // receiveLength, sendLength
#endif
            fnbytesPerLevel[i] = fnbytes_Af;
            fnbytes += fnbytes_Af; // Running sum
            Af = Af->Ac; // Go to next coarse level
        }
        assert(Af == 0); // Make sure we got to the lowest grid level
        // Count number of bytes used per equation
        double fnbytesPerEquation = fnbytes / fnrow;
        // Instantiate YAML document
        OutputFile doc("HPCG-Benchmark", "3.1");
        doc.add("Release date", "March 28, 2019");
        doc.add("Machine Summary", "");
        doc.get("Machine Summary")->add("Distributed Processes", A.geom->size);
        doc.get("Machine Summary")->add("Threads per processes", A.geom->numThreads);
        doc.add("Global Problem Dimensions", "");
        doc.get("Global Problem Dimensions")->add("Global nx", A.geom->gnx);
        doc.get("Global Problem Dimensions")->add("Global ny", A.geom->gny);
        doc.get("Global Problem Dimensions")->add("Global nz", A.geom->gnz);
        doc.add("Processor Dimensions", "");
        doc.get("Processor Dimensions")->add("npx", A.geom->npx);
        doc.get("Processor Dimensions")->add("npy", A.geom->npy);
        doc.get("Processor Dimensions")->add("npz", A.geom->npz);
        doc.add("Local Domain Dimensions", "");
        doc.get("Local Domain Dimensions")->add("nx", A.geom->nx);
        doc.get("Local Domain Dimensions")->add("ny", A.geom->ny);
        doc.get("Local Domain Dimensions")->add("nz", A.geom->nz); // was missing; report all three local dims
        doc.add("########## Problem Summary  ##########", "");
        doc.add("Setup Information", "");
        doc.get("Setup Information")->add("Setup Time", times[9]);
        doc.add("Linear System Information", "");
        doc.get("Linear System Information")->add("Number of Equations", A.totalNumberOfRows);
        doc.get("Linear System Information")->add("Number of Nonzero Terms", A.totalNumberOfNonzeros);
        doc.add("Multigrid Information", "");
        doc.get("Multigrid Information")->add("Number of coarse grid levels", numberOfMgLevels - 1);
        Af = &A;
        doc.get("Multigrid Information")->add("Coarse Grids", "");
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            doc.get("Multigrid Information")->get("Coarse Grids")->add("Grid Level", i);
            doc.get("Multigrid Information")
                ->get("Coarse Grids")
                ->add("Number of Equations", Af->Ac->totalNumberOfRows);
            doc.get("Multigrid Information")
                ->get("Coarse Grids")
                ->add("Number of Nonzero Terms", Af->Ac->totalNumberOfNonzeros);
            doc.get("Multigrid Information")
                ->get("Coarse Grids")
                ->add("Number of Presmoother Steps", Af->mgData->numberOfPresmootherSteps);
            doc.get("Multigrid Information")
                ->get("Coarse Grids")
                ->add("Number of Postsmoother Steps", Af->mgData->numberOfPostsmootherSteps);
            Af = Af->Ac;
        }
        doc.add("########## Memory Use Summary  ##########", "");
        doc.add("Memory Use Information", "");
        doc.get("Memory Use Information")->add("Total memory used for data (Gbytes)", fnbytes / 1000000000.0);
        doc.get("Memory Use Information")
            ->add("Memory used for OptimizeProblem data (Gbytes)", fnbytes_OptimizedProblem / 1000000000.0);
        doc.get("Memory Use Information")
            ->add("Bytes per equation (Total memory / Number of Equations)", fnbytesPerEquation);
        doc.get("Memory Use Information")
            ->add("Memory used for linear system and CG (Gbytes)", fnbytesPerLevel[0] / 1000000000.0);
        doc.get("Memory Use Information")->add("Coarse Grids", "");
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            doc.get("Memory Use Information")->get("Coarse Grids")->add("Grid Level", i);
            doc.get("Memory Use Information")
                ->get("Coarse Grids")
                ->add("Memory used", fnbytesPerLevel[i] / 1000000000.0);
        }
        doc.add("########## V&V Testing Summary  ##########", "");
        doc.add("Spectral Convergence Tests", "");
        if (testcg_data.count_fail == 0)
            doc.get("Spectral Convergence Tests")->add("Result", "PASSED");
        else
            doc.get("Spectral Convergence Tests")->add("Result", "FAILED");
        doc.get("Spectral Convergence Tests")->add("Unpreconditioned", "");
        doc.get("Spectral Convergence Tests")
            ->get("Unpreconditioned")
            ->add("Maximum iteration count", testcg_data.niters_max_no_prec);
        doc.get("Spectral Convergence Tests")
            ->get("Unpreconditioned")
            ->add("Expected iteration count", testcg_data.expected_niters_no_prec);
        doc.get("Spectral Convergence Tests")->add("Preconditioned", "");
        doc.get("Spectral Convergence Tests")
            ->get("Preconditioned")
            ->add("Maximum iteration count", testcg_data.niters_max_prec);
        doc.get("Spectral Convergence Tests")
            ->get("Preconditioned")
            ->add("Expected iteration count", testcg_data.expected_niters_prec);
        const char DepartureFromSymmetry[] = "Departure from Symmetry |x'Ay-y'Ax|/(2*||x||*||A||*||y||)/epsilon";
        doc.add(DepartureFromSymmetry, "");
        if (testsymmetry_data.count_fail == 0)
            doc.get(DepartureFromSymmetry)->add("Result", "PASSED");
        else
            doc.get(DepartureFromSymmetry)->add("Result", "FAILED");
        doc.get(DepartureFromSymmetry)->add("Departure for SpMV", testsymmetry_data.depsym_spmv);
        doc.get(DepartureFromSymmetry)->add("Departure for MG", testsymmetry_data.depsym_mg);
        doc.add("########## Iterations Summary  ##########", "");
        doc.add("Iteration Count Information", "");
        if (!global_failure)
            doc.get("Iteration Count Information")->add("Result", "PASSED");
        else
            doc.get("Iteration Count Information")->add("Result", "FAILED");
        doc.get("Iteration Count Information")->add("Reference CG iterations per set", refMaxIters);
        doc.get("Iteration Count Information")->add("Optimized CG iterations per set", optMaxIters);
        doc.get("Iteration Count Information")
            ->add("Total number of reference iterations", refMaxIters * numberOfCgSets);
        doc.get("Iteration Count Information")
            ->add("Total number of optimized iterations", optMaxIters * numberOfCgSets);
        doc.add("########## Reproducibility Summary  ##########", "");
        doc.add("Reproducibility Information", "");
        if (testnorms_data.pass)
            doc.get("Reproducibility Information")->add("Result", "PASSED");
        else
            doc.get("Reproducibility Information")->add("Result", "FAILED");
        doc.get("Reproducibility Information")->add("Scaled residual mean", testnorms_data.mean);
        doc.get("Reproducibility Information")->add("Scaled residual variance", testnorms_data.variance);
        doc.add("########## Performance Summary (times in sec) ##########", "");
        doc.add("Benchmark Time Summary", "");
        doc.get("Benchmark Time Summary")->add("Optimization phase", times[7]);
        doc.get("Benchmark Time Summary")->add("DDOT", times[1]);
        doc.get("Benchmark Time Summary")->add("WAXPBY", times[2]);
        doc.get("Benchmark Time Summary")->add("SpMV", times[3]);
        doc.get("Benchmark Time Summary")->add("MG", times[5]);
        doc.get("Benchmark Time Summary")->add("Total", times[0]);
        doc.add("Floating Point Operations Summary", "");
        doc.get("Floating Point Operations Summary")->add("Raw DDOT", fnops_ddot);
        doc.get("Floating Point Operations Summary")->add("Raw WAXPBY", fnops_waxpby);
        doc.get("Floating Point Operations Summary")->add("Raw SpMV", fnops_sparsemv);
        doc.get("Floating Point Operations Summary")->add("Raw MG", fnops_precond);
        doc.get("Floating Point Operations Summary")->add("Total", fnops);
        doc.get("Floating Point Operations Summary")->add("Total with convergence overhead", frefnops);
        doc.add("GB/s Summary", "");
        doc.get("GB/s Summary")->add("Raw Read B/W", fnreads / times[0] / 1.0E9);
        doc.get("GB/s Summary")->add("Raw Write B/W", fnwrites / times[0] / 1.0E9);
        doc.get("GB/s Summary")->add("Raw Total B/W", (fnreads + fnwrites) / (times[0]) / 1.0E9);
        doc.get("GB/s Summary")
            ->add("Total with convergence and optimization phase overhead",
                (frefnreads + frefnwrites) / (times[0] + fNumberOfCgSets * (times[7] / 10.0 + times[9] / 10.0))
                    / 1.0E9);
        doc.add("GFLOP/s Summary", "");
        doc.get("GFLOP/s Summary")->add("Raw DDOT", fnops_ddot / times[1] / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Raw WAXPBY", fnops_waxpby / times[2] / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Raw SpMV", fnops_sparsemv / (times[3]) / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Raw MG", fnops_precond / (times[5]) / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Raw Total", fnops / times[0] / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Total with convergence overhead", frefnops / times[0] / 1.0E9);
        // This final GFLOP/s rating includes the overhead of problem setup and optimizing the data structures vs ten
        // sets of 50 iterations of CG
        double totalGflops = frefnops / (times[0] + fNumberOfCgSets * (times[7] / 10.0 + times[9] / 10.0)) / 1.0E9;
        double totalGflops24 = frefnops / (times[0] + fNumberOfCgSets * times[7] / 10.0) / 1.0E9;
        doc.get("GFLOP/s Summary")->add("Total with convergence and optimization phase overhead", totalGflops);
        doc.add("User Optimization Overheads", "");
        doc.get("User Optimization Overheads")->add("Optimization phase time (sec)", (times[7]));
        doc.get("User Optimization Overheads")
            ->add("Optimization phase time vs reference SpMV+MG time", times[7] / times[8]);
#ifndef HPCG_NO_MPI
        doc.add("DDOT Timing Variations", "");
        doc.get("DDOT Timing Variations")->add("Min DDOT MPI_Allreduce time", t4min);
        doc.get("DDOT Timing Variations")->add("Max DDOT MPI_Allreduce time", t4max);
        doc.get("DDOT Timing Variations")->add("Avg DDOT MPI_Allreduce time", t4avg);
        // doc.get("Sparse Operations Overheads")->add("Halo exchange time (sec)", (times[6]));
        // doc.get("Sparse Operations Overheads")->add("Halo exchange as percentage of SpMV time",
        // (times[6])/totalSparseMVTime*100.0);
#endif
        doc.add("Final Summary", "");
        // A run is valid only if every correctness/reproducibility test passed.
        bool isValidRun = (testcg_data.count_fail == 0) && (testsymmetry_data.count_fail == 0) && (testnorms_data.pass)
            && (!global_failure);
        if (isValidRun)
        {
            doc.get("Final Summary")->add("HPCG result is VALID with a GFLOP/s rating of", totalGflops);
            doc.get("Final Summary")->add("HPCG 2.4 rating for historical reasons is", totalGflops24);
            if (!A.isDotProductOptimized)
            {
                doc.get("Final Summary")
                    ->add("Reference version of ComputeDotProduct used",
                        "Performance results are most likely suboptimal");
            }
            if (!A.isSpmvOptimized)
            {
                doc.get("Final Summary")
                    ->add("Reference version of ComputeSPMV used", "Performance results are most likely suboptimal");
            }
            if (!A.isMgOptimized)
            {
                if (A.geom->numThreads > 1)
                    doc.get("Final Summary")
                        ->add("Reference version of ComputeMG used and number of threads greater than 1",
                            "Performance results are severely suboptimal");
                else // numThreads ==1
                    doc.get("Final Summary")
                        ->add("Reference version of ComputeMG used", "Performance results are most likely suboptimal");
            }
            if (!A.isWaxpbyOptimized)
            {
                doc.get("Final Summary")
                    ->add("Reference version of ComputeWAXPBY used", "Performance results are most likely suboptimal");
            }
            if (times[0] >= minOfficialTime)
            {
                doc.get("Final Summary")
                    ->add("Please upload results from the YAML file contents to", "http://hpcg-benchmark.org");
            }
            else
            {
                doc.get("Final Summary")->add("Results are valid but execution time (sec) is", times[0]);
                if (quickPath)
                {
                    doc.get("Final Summary")
                        ->add("You have selected the QuickPath option",
                            "Results are official for legacy installed systems with confirmation from the HPCG "
                            "Benchmark leaders.");
                    doc.get("Final Summary")
                        ->add("After confirmation please upload results from the YAML file contents to",
                            "http://hpcg-benchmark.org");
                }
                else
                {
                    doc.get("Final Summary")
                        ->add("Official results execution time (sec) must be at least", minOfficialTime);
                }
            }
        }
        else
        {
            doc.get("Final Summary")->add("HPCG result is", "INVALID.");
            doc.get("Final Summary")
                ->add("Please review the YAML file contents", "You may NOT submit these results for consideration.");
        }
        std::string yaml = doc.generate();
#ifdef HPCG_DEBUG
        HPCG_fout << yaml;
#endif
    }
    return;
}

26
src/ReportResults.hpp Normal file
View File

@@ -0,0 +1,26 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef REPORTRESULTS_HPP
#define REPORTRESULTS_HPP
#include "SparseMatrix.hpp"
#include "TestCG.hpp"
#include "TestNorms.hpp"
#include "TestSymmetry.hpp"
// Writes the YAML report describing the HPCG run: problem geometry, FLOP and
// memory models, validation/reproducibility outcomes, and final GFLOP/s rating.
// Only rank 0 produces output; see ReportResults.cpp for full parameter docs.
void ReportResults(const SparseMatrix& A, int numberOfMgLevels, int numberOfCgSets, int refMaxIters, int optMaxIters,
    double times[], const TestCGData& testcg_data, const TestSymmetryData& testsymmetry_data,
    const TestNormsData& testnorms_data, int global_failure, bool quickPath);
#endif // REPORTRESULTS_HPP

729
src/SetupHalo.cpp Normal file
View File

@@ -0,0 +1,729 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file SetupHalo.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <map>
#include <mpi.h>
#include <set>
#endif
#include <algorithm>
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "SetupHalo.hpp"
#include "SetupHalo_ref.hpp"
#ifdef USE_CUDA
#include "Cuda.hpp"
#include "CudaKernels.hpp"
#endif
#ifdef USE_GRACE
#include "CpuKernels.hpp"
#endif
#ifndef HPCG_NO_MPI
// Used to find ranks for CPU and GPU programs
extern int global_total_ranks;
extern int* physical_rank_dims;
extern int* logical_rank_to_phys;
extern int* rankToId_h;
extern int* idToRank_h;
extern p2p_comm_mode_t P2P_Mode;
#endif
/*!
  Prepares the system matrix data structure and creates the data necessary
  for communication of boundary values of this process.
@param[inout] A The known system matrix
@see ExchangeHalo
*/
#ifdef USE_CUDA
// Builds the halo-exchange data structures for the GPU execution path and
// stores them in A (neighbors, elementsToSend, send/receive lengths, buffers).
// Boundary rows and neighbor ranks are found by CUDA kernels (SetupHaloCuda);
// external column indices in the device matrix are then remapped so that each
// neighbor's entries occupy a contiguous range starting at localNumberOfRows.
void SetupHalo_Gpu(SparseMatrix& A)
{
    // Local/global grid dimensions and offsets.
    // NOTE(review): none of these locals are referenced below — presumably kept
    // for symmetry with SetupHalo_Cpu; candidates for removal.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;
#ifndef HPCG_NO_MPI
    local_int_t localNumberOfRows = A.localNumberOfRows;
    local_int_t* send_buffer_d;
    // Per-neighbor stride in the device send buffer: the largest face of the
    // local subdomain bounds how many rows can be sent to any one of the up to
    // 27 neighbors (including self slot).
    local_int_t sendbufld
        = std::max(std::max(A.geom->nx * A.geom->ny, A.geom->nx * A.geom->nz), A.geom->ny * A.geom->nz);
    int* neighbors = new int[27];
    int* neighborsPhysical = new int[27];
    CHECK_CUDART(cudaMalloc((void**) &(send_buffer_d), 27 * sendbufld * sizeof(local_int_t)));
    local_int_t* sendLength = new local_int_t[27];
    local_int_t totalToBeSent = 0;
    int neiCount = 0;
    int numberOfExternalValues = 0;
    // NOTE(review): sendcounts2 is allocated and zeroed but never read or freed
    // in this function — appears to be dead code and a small leak; confirm.
    local_int_t* sendcounts2 = new local_int_t[27];
    local_int_t* receiveLength = new local_int_t[27];
    memset(sendcounts2, 0, sizeof(local_int_t) * (27));
    local_int_t* sendcounts_d = NULL;
    local_int_t* elementsToSendGpu;
    // NOTE(review): unlike the other device allocations, these two calls are
    // not wrapped in CHECK_CUDART; errors here would go unnoticed.
    cudaMalloc(&sendcounts_d, sizeof(local_int_t) * (27));
    cudaMemsetAsync(sendcounts_d, 0, sizeof(local_int_t) * (27), stream);
    // Finds elements to send and neighbors
    SetupHaloCuda(A, sendbufld, sendcounts_d, send_buffer_d, &totalToBeSent, &neiCount, neighbors, sendLength,
        &elementsToSendGpu);
    // Host copy of the row indices this rank must send during halo exchange.
    local_int_t* elementsToSend = new local_int_t[totalToBeSent];
    double* sendBuffer = nullptr;
    if (totalToBeSent > 0)
    {
        cudaMemcpyAsync(
            elementsToSend, elementsToSendGpu, sizeof(local_int_t) * totalToBeSent, cudaMemcpyDeviceToHost, stream);
        // Prefix sums of per-neighbor send lengths: sendcounts[i]..sendcounts[i+1]
        // delimits neighbor i's slice of elementsToSend / eltsToRecv.
        // NOTE(review): sendcounts is malloc'd but never freed — leak; confirm.
        local_int_t* sendcounts = (local_int_t*) malloc(sizeof(local_int_t) * (A.geom->size + 1));
        memset(sendcounts, 0, sizeof(local_int_t) * (A.geom->size + 1));
        local_int_t *eltsToRecv_d = NULL, *extToLocMap = NULL;
        sendcounts[0] = 0;
        for (int i = 0; i < neiCount; i++)
        {
            // Halo exchange is symmetric: we receive exactly as many values as we send.
            receiveLength[i] = sendLength[i];
            sendcounts[i + 1] = sendcounts[i] + sendLength[i];
            int neighborId = neighbors[i];
            neighborsPhysical[i] = logical_rank_to_phys[neighborId];
        }
        CHECK_CUDART(cudaMalloc(&extToLocMap, sizeof(local_int_t) * localNumberOfRows));
        CHECK_CUDART(cudaMalloc(&eltsToRecv_d, sizeof(local_int_t) * totalToBeSent));
        CHECK_CUDART(cudaMallocHost(&(sendBuffer), sizeof(double) * totalToBeSent));
        CHECK_CUDART(cudaMalloc(&(A.gpuAux.sendBuffer), sizeof(double) * totalToBeSent));
        // NOTE(review): eltsToRecv is never deleted after use — leak; confirm.
        local_int_t* eltsToRecv = new local_int_t[totalToBeSent];
        // Exchange elements to send with neighbors
        auto INDEX_TYPE = MPI_INT;
#ifdef INDEX_64 // In src/Geometry
        INDEX_TYPE = MPI_LONG;
#endif
        MPI_Status status;
        int MPI_MY_TAG = 93;
        MPI_Request* request = new MPI_Request[neiCount];
        // Ensure the device->host copy of elementsToSend has completed before
        // handing the buffer to MPI_Send below.
        cudaStreamSynchronize(stream);
        local_int_t* recv_ptr = eltsToRecv;
        for (int i = 0; i < neiCount; i++)
        {
            auto n_recv = sendLength[i];
            MPI_Irecv(recv_ptr, n_recv, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
            recv_ptr += n_recv;
        }
        local_int_t* elts_ptr = elementsToSend;
        for (int i = 0; i < neiCount; i++)
        {
            auto n_send = sendLength[i];
            MPI_Send(elts_ptr, n_send, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD);
            elts_ptr += n_send;
        }
        for (int i = 0; i < neiCount; i++)
        {
            MPI_Wait(request + i, &status);
        }
        delete[] request;
        cudaMemcpyAsync(
            eltsToRecv_d, eltsToRecv, sizeof(local_int_t) * (totalToBeSent), cudaMemcpyHostToDevice, stream);
        // Add the sorted indices from neighbors. For each neighbor, add its indices sequentially
        // before the next neighbor's indices. The indices will be adjusted to be
        // localNumberOfRows + its sequential location
        for (int neighborCount = 0; neighborCount < neiCount; ++neighborCount)
        {
            int neighborId = neighbors[neighborCount];
            cudaMemsetAsync(extToLocMap, 0, sizeof(local_int_t) * localNumberOfRows, stream);
            local_int_t str = sendcounts[neighborCount];
            local_int_t end = sendcounts[neighborCount + 1];
            ExtToLocMapCuda(localNumberOfRows, str, end, extToLocMap, eltsToRecv_d);
            ExtTolocCuda(localNumberOfRows, neighborId, A.extNnz, A.csrExtColumns, A.csrExtValues,
                A.gpuAux.ext2csrOffsets, extToLocMap, A.gpuAux.columns);
        }
        CHECK_CUDART(cudaFree(sendcounts_d));
        CHECK_CUDART(cudaFree(extToLocMap));
        CHECK_CUDART(cudaFree(eltsToRecv_d));
        // NOTE(review): when totalToBeSent == 0 this branch is skipped and
        // sendcounts_d is never freed; send_buffer_d is never freed on any path
        // (unless SetupHaloCuda takes ownership) — verify against CudaKernels.
        // For P2P Alltoallv communication
        if (P2P_Mode == MPI_GPU_All2allv || P2P_Mode == MPI_CPU_All2allv)
        {
            // Build the counts/displacements arrays MPI_Alltoallv expects,
            // indexed by physical rank; zero for non-neighbors.
            int* sdispls = new int[A.geom->size];
            int* rdispls = new int[A.geom->size];
            int* scounts = new int[A.geom->size];
            int* rcounts = new int[A.geom->size];
            int tmp_s = 0, tmp_r = 0;
            if (sdispls == NULL || rdispls == NULL || scounts == NULL || rcounts == NULL)
                return;
            for (local_int_t i = 0; i < A.geom->size; i++)
            {
                scounts[i] = 0;
                rcounts[i] = 0;
                sdispls[i] = 0;
                rdispls[i] = 0;
            }
            for (local_int_t i = 0; i < neiCount; i++)
            {
                local_int_t root = neighborsPhysical[i];
                scounts[root] = sendLength[i];
                rcounts[root] = receiveLength[i];
                sdispls[root] = tmp_s;
                tmp_s += sendLength[i];
                rdispls[root] = tmp_r;
                tmp_r += receiveLength[i];
            }
            A.scounts = scounts;
            A.rcounts = rcounts;
            A.sdispls = sdispls;
            A.rdispls = rdispls;
        }
    }
    // Store contents in our matrix struct; A takes ownership of these arrays.
    A.numberOfExternalValues = totalToBeSent;
    A.localNumberOfColumns = A.localNumberOfRows + A.numberOfExternalValues;
    A.numberOfSendNeighbors = neiCount;
    A.totalToBeSent = totalToBeSent;
    A.elementsToSend = elementsToSend;
    A.gpuAux.elementsToSend = elementsToSendGpu;
    A.neighbors = neighbors;
    A.neighborsPhysical = neighborsPhysical;
    A.receiveLength = receiveLength;
    A.sendLength = sendLength;
    A.sendBuffer = sendBuffer;
#endif
    return;
}
#endif
#ifdef USE_GRACE
void SetupHalo_Cpu(SparseMatrix& A)
{
// Extract Matrix pieces
global_int_t nx = A.geom->nx;
global_int_t ny = A.geom->ny;
global_int_t nz = A.geom->nz;
global_int_t gnx = A.geom->gnx;
global_int_t gny = A.geom->gny;
global_int_t gnz = A.geom->gnz;
global_int_t gix0 = A.geom->gix0;
global_int_t giy0 = A.geom->giy0;
global_int_t giz0 = A.geom->giz0;
int npx = A.geom->npx;
int npy = A.geom->npy;
local_int_t localNumberOfRows = A.localNumberOfRows;
local_int_t* nonzerosInRow = A.nonzerosInRow;
global_int_t** mtxIndG = A.mtxIndG;
local_int_t** mtxIndL = A.mtxIndL;
#ifdef HPCG_NO_MPI // In the non-MPI case we simply copy global indices to local index storage
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < localNumberOfRows; i++)
{
int cur_nnz = nonzerosInRow[i];
for (int j = 0; j < cur_nnz; j++)
mtxIndL[i][j] = mtxIndG[i][j];
}
#else // Run this section if compiling for MPI
// Scan global IDs of the nonzeros in the matrix. Determine if the column ID matches a row ID. If not:
// 1) We call the ComputeRankOfMatrixRow function, which tells us the rank of the processor owning the row ID.
// We need to receive this value of the x vector during the halo exchange.
// 2) We record our row ID since we know that the other processor will need this value from us, due to symmetry.
std::map<local_int_t, std::map<global_int_t, local_int_t>> externalToLocalMap;
local_int_t* extTemp = new local_int_t[localNumberOfRows];
// Okay Let us git rid of the map
local_int_t sendbufld
= std::max(std::max(A.geom->nx * A.geom->ny, A.geom->nx * A.geom->nz), A.geom->ny * A.geom->nz);
local_int_t* send_buffer = new local_int_t[27 * sendbufld];
char* has_external = new char[localNumberOfRows];
local_int_t* sendcounter = new local_int_t[27];
for (local_int_t i = 0; i < 27; i++)
sendcounter[i] = 0;
// Goes through all local rows, for each local point
// find its 27 3D neighbors (including the point itself).
// For each neibor decide if it is on a different rank (halo) or local
// If external, add to the send buffer
// If local, create the local matrix
#pragma omp parallel for
for (local_int_t i = 0; i < localNumberOfRows; i++)
{
const local_int_t iz = (i / (nx * ny));
const local_int_t iy = (i - iz * nx * ny) / nx;
const local_int_t ix = i - (iz * ny + iy) * nx;
const global_int_t gix = ix + gix0;
const global_int_t giy = iy + giy0;
const global_int_t giz = iz + giz0;
global_int_t curcol;
int nnz_c = 0;
bool rank_set[27];
for (int j = 0; j < 27; j++)
{
rank_set[j] = false;
}
has_external[i] = 0;
for (int k = 0; k < 27; k++)
{
long long int cgix = gix + tid2indCpu[k][0];
long long int cgiy = giy + tid2indCpu[k][1];
long long int cgiz = giz + tid2indCpu[k][2];
int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
if (ok)
{
int ipz = cgiz / nz;
int ipy = cgiy / ny;
int ipx = cgix / nx;
// For GPUCPU exec mode, find the 3D rank coordinates.
// For diff dim between CPU and GPU, we cannot
// just divide on the local dim to find ipx/ipy/ipz
// We must find it manually based on neighbor 3d coordinates
// Note the halo size is always 1
if (A.geom->different_dim == Z)
{
long long int local = cgiz - giz0;
if (local >= 0 && local < nz)
ipz = A.geom->ipz;
else if (local < 0)
ipz = A.geom->ipz - 1;
else if (local >= nz)
ipz = A.geom->ipz + 1;
}
else if (A.geom->different_dim == Y)
{
long long int local = cgiy - giy0;
if (local >= 0 && local < ny)
ipy = A.geom->ipy;
else if (local < 0)
ipy = A.geom->ipy - 1;
else if (local >= ny)
ipy = A.geom->ipy + 1;
}
else if (A.geom->different_dim == X)
{
long long int local = cgix - gix0;
if (local >= 0 && local < nx)
ipx = A.geom->ipx;
else if (local < 0)
ipx = A.geom->ipx - 1;
else if (local >= nx)
ipx = A.geom->ipx + 1;
}
// Global rank Id
int col_rank = ipx + ipy * npx + ipz * npy * npx;
// The neighbor point rank is diff than the current point rank
if (A.geom->logical_rank != col_rank)
{
has_external[i] = 1;
int rankId = rankToId_h[col_rank];
local_int_t* p = &(sendcounter[rankId]);
// Add the halo point atomically to send_buffer
// For all the cols in a row that has the same rank,
// we add the row once to the rank buffer
if (!rank_set[rankId])
{
rank_set[rankId] = true;
local_int_t t;
#pragma omp atomic capture
{
t = *p;
*p += 1;
}
send_buffer[rankId * sendbufld + t] = i;
}
}
else
{
// local neighbor, add it to the local matrix
local_int_t zi = cgiz - giz0;
local_int_t yi = cgiy - giy0;
local_int_t xi = cgix - gix0;
local_int_t lcol = zi * ny * nx + yi * nx + xi;
mtxIndL[i][nnz_c] = lcol;
}
nnz_c++;
}
}
}
// Now external data structures
// 1 Create elements to send buffer (Sort the indicies for each neighbor)
local_int_t totalToBeSent = 0;
local_int_t* sendcounts = new local_int_t[A.geom->size + 1];
sendcounts[0] = 0;
int neighborCount = 0;
#pragma omp parallel for
for (local_int_t i = 0; i < 27; i++)
{
if (sendcounter[i] > 0)
{
std::sort(send_buffer + i * sendbufld, send_buffer + i * sendbufld + sendcounter[i]);
}
}
for (local_int_t i = 0; i < 27; i++)
{
if (sendcounter[i] > 0)
{
totalToBeSent += sendcounter[i];
sendcounts[neighborCount + 1] = sendcounts[neighborCount] + sendcounter[i];
neighborCount++;
}
}
// 2 Now find neighbor Ids, neighbor physical Ids (see GenerateGeometry), and elemets to send
local_int_t sendEntryCount = 0;
local_int_t* receiveLength = new local_int_t[neighborCount];
local_int_t* sendLength = new local_int_t[neighborCount];
// Build the arrays and lists needed by the ExchangeHalo function.
double* sendBuffer = new double[totalToBeSent];
int* neighbors = new int[neighborCount];
int* neighborsPhysical = new int[neighborCount];
local_int_t* elementsToSend = new local_int_t[totalToBeSent];
neighborCount = 0;
for (local_int_t i = 0; i < 27; i++)
{
if (sendcounter[i] > 0)
{
int neighborId = idToRank_h[i]; // logical Id
int phys_neiId = logical_rank_to_phys[neighborId];
neighbors[neighborCount] = neighborId; // store rank ID of current neighbor
neighborsPhysical[neighborCount] = phys_neiId;
receiveLength[neighborCount] = sendcounter[i];
sendLength[neighborCount] = sendcounter[i];
for (int j = 0; j < sendcounter[i]; j++)
{
elementsToSend[sendEntryCount] = send_buffer[i * sendbufld + j];
sendEntryCount++;
}
neighborCount++;
}
}
delete[] send_buffer;
delete[] sendcounter;
// Exchange elements to send wit other neighbors
auto INDEX_TYPE = MPI_INT;
#ifdef INDEX_64 // In src/Geometry
INDEX_TYPE = MPI_LONG;
#endif
MPI_Status status;
int MPI_MY_TAG = 93;
MPI_Request* request = new MPI_Request[neighborCount];
local_int_t* eltsToRecv = new local_int_t[totalToBeSent];
local_int_t* recv_ptr = eltsToRecv;
for (int i = 0; i < neighborCount; i++)
{
int n_recv = sendLength[i];
MPI_Irecv(recv_ptr, n_recv, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
recv_ptr += n_recv;
}
local_int_t* elts_ptr = elementsToSend;
for (int i = 0; i < neighborCount; i++)
{
local_int_t n_send = sendLength[i];
MPI_Send(elts_ptr, n_send, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD);
elts_ptr += n_send;
}
for (int i = 0; i < neighborCount; i++)
{
MPI_Wait(request + i, &status);
}
delete[] request;
// Create a map to be used in the optimization step
// Any external column index will be given a sequntail Id
// after the number of rows (Will be used to access x vector)
int prev_dim = 0;
for (int nc = 0; nc < neighborCount; ++nc)
{
int neighborId = neighbors[nc];
int phys_neiId = neighborsPhysical[nc];
local_int_t str = sendcounts[nc];
local_int_t end = sendcounts[nc + 1];
for (int j = str; j < end; j++)
{
const local_int_t col = eltsToRecv[j];
externalToLocalMap[neighborId][col] = localNumberOfRows + j;
}
}
delete[] eltsToRecv;
delete[] sendcounts;
if (totalToBeSent > 0)
{
// Last step sort all external IDs per rank Id, elements of neighbor 0 first, then 1, and so on
#pragma omp parallel for
for (local_int_t i = 0; i < localNumberOfRows; i++)
{
int nnz_ext = 0;
if (has_external[i] == 1)
{
const local_int_t iz = (i / (nx * ny));
const local_int_t iy = (i - iz * nx * ny) / nx;
const local_int_t ix = i - (iz * ny + iy) * nx;
const global_int_t gix = ix + gix0;
const global_int_t giy = iy + giy0;
const global_int_t giz = iz + giz0;
int nnz_c = 0;
for (int k = 0; k < 27; k++)
{
long long int cgix = gix + tid2indCpu[k][0];
long long int cgiy = giy + tid2indCpu[k][1];
long long int cgiz = giz + tid2indCpu[k][2];
local_int_t zi = (cgiz) % nz;
local_int_t yi = (cgiy) % ny;
local_int_t xi = (cgix) % nx;
int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
int ipz = cgiz / nz;
int ipy = cgiy / ny;
int ipx = cgix / nx;
// The indices sent by the neighbor uses the neighbor's nx, ny, and nz which can
// be deffirent than the current neighbor's dims. Thus, based on neighor location
// and the diffrent_dim we adjust the indices if needed.
// Also, the ipx, ipy, and ipz must be updated accordingly
global_int_t new_nx = A.geom->nx;
global_int_t new_ny = A.geom->ny;
if (A.geom->different_dim == Z)
{
long long int local = cgiz - giz0;
if (local >= 0 && local < nz)
{
ipz = A.geom->ipz;
zi = local;
}
else if (local < 0)
{
ipz = A.geom->ipz - 1;
zi = A.geom->previous_neighbor_dim - 1;
}
else if (local >= nz)
{
ipz = A.geom->ipz + 1;
zi = 0;
}
}
else if (A.geom->different_dim == Y)
{
long long int local = cgiy - giy0;
if (local >= 0 && local < ny)
{
ipy = A.geom->ipy;
yi = local;
}
else if (local < 0)
{
ipy = A.geom->ipy - 1;
yi = A.geom->previous_neighbor_dim - 1;
new_ny = A.geom->previous_neighbor_dim;
}
else if (local >= ny)
{
ipy = A.geom->ipy + 1;
yi = 0;
new_ny = A.geom->next_neighbor_dim;
}
}
else if (A.geom->different_dim == X)
{
long long int local = cgix - gix0;
if (local >= 0 && local < nx)
{
ipx = A.geom->ipx;
xi = local;
}
else if (local < 0)
{
ipx = A.geom->ipx - 1;
xi = A.geom->previous_neighbor_dim - 1;
new_nx = A.geom->previous_neighbor_dim;
}
else if (local >= nx)
{
ipx = A.geom->ipx + 1;
xi = 0;
new_nx = A.geom->next_neighbor_dim;
}
}
local_int_t lcol = zi * new_ny * new_nx + yi * new_nx + xi;
int row_rank = ipx + ipy * npx + ipz * npy * npx;
if (ok)
{
if (externalToLocalMap.find(row_rank) != externalToLocalMap.end())
{
mtxIndL[i][nnz_c] = externalToLocalMap[row_rank][lcol];
nnz_ext++;
}
nnz_c++;
}
}
}
extTemp[i] = nnz_ext;
}
}
if (P2P_Mode == MPI_CPU_All2allv)
{
int* sdispls = new int[A.geom->size];
int* rdispls = new int[A.geom->size];
int* scounts = new int[A.geom->size];
int* rcounts = new int[A.geom->size];
int tmp_s = 0, tmp_r = 0;
if (sdispls == NULL || rdispls == NULL || scounts == NULL || rcounts == NULL)
return;
for (local_int_t i = 0; i < A.geom->size; i++)
{
scounts[i] = 0;
rcounts[i] = 0;
sdispls[i] = 0;
rdispls[i] = 0;
}
for (local_int_t i = 0; i < neighborCount; i++)
{
local_int_t root = neighborsPhysical[i];
scounts[root] = sendLength[i];
rcounts[root] = receiveLength[i];
sdispls[root] = tmp_s;
tmp_s += sendLength[i];
rdispls[root] = tmp_r;
tmp_r += receiveLength[i];
}
A.scounts = scounts;
A.rcounts = rcounts;
A.sdispls = sdispls;
A.rdispls = rdispls;
}
delete[] has_external;
// Store contents in our matrix struct
A.numberOfExternalValues = totalToBeSent;
A.localNumberOfColumns = A.localNumberOfRows + A.numberOfExternalValues;
A.numberOfSendNeighbors = neighborCount;
A.totalToBeSent = totalToBeSent;
A.elementsToSend = elementsToSend;
A.neighbors = neighbors;
A.neighborsPhysical = neighborsPhysical;
A.receiveLength = receiveLength;
A.sendLength = sendLength;
A.sendBuffer = sendBuffer;
A.cpuAux.tempIndex = extTemp;
#ifdef HPCG_DETAILED_DEBUG
HPCG_fout << " For rank " << A.geom->rank << " of " << A.geom->size
<< ", number of neighbors = " << A.numberOfSendNeighbors << endl;
for (int i = 0; i < A.numberOfSendNeighbors; i++)
{
HPCG_fout << " rank " << A.geom->rank << " neighbor " << neighbors[i]
<< " send/recv length = " << sendLength[i] << "/" << receiveLength[i] << endl;
for (local_int_t j = 0; j < sendLength[i]; ++j)
HPCG_fout << " rank " << A.geom->rank << " elementsToSend[" << j << "] = " << elementsToSend[j]
<< endl;
}
#endif
#endif
// ifdef HPCG_NO_MPI
return;
}
#endif // USE_GRACE
// Entry point for halo setup: forwards to the GPU or CPU implementation
// depending on the rank type this process was assigned. When the matching
// backend is not compiled in, the call is a no-op.
void SetupHalo(SparseMatrix& A)
{
    const bool gpuRank = (A.rankType == GPU);
    if (!gpuRank)
    {
#ifdef USE_GRACE
        SetupHalo_Cpu(A);
#endif
        return;
    }
#ifdef USE_CUDA
    SetupHalo_Gpu(A);
#endif
}

21
src/SetupHalo.hpp Normal file
View File

@@ -0,0 +1,21 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef SETUPHALO_HPP
#define SETUPHALO_HPP
#include "SparseMatrix.hpp"
void SetupHalo(SparseMatrix& A);
#endif // SETUPHALO_HPP

212
src/SetupHalo_ref.cpp Normal file
View File

@@ -0,0 +1,212 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file SetupHalo_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <map>
#include <mpi.h>
#include <set>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#ifdef HPCG_DETAILED_DEBUG
#include <fstream>
using std::endl;
#include "hpcg.hpp"
#include <cassert>
#endif
#include <cstdio>
#include "SetupHalo_ref.hpp"
#include "mytimer.hpp"
extern int use_output_file;
/*!
Reference version of SetupHalo that prepares system matrix data structure and creates data necessary
for communication of boundary values of this process.
@param[inout] A The known system matrix
@see ExchangeHalo
*/
void SetupHalo_ref(SparseMatrix& A)
{
    // Extract Matrix pieces
    local_int_t localNumberOfRows = A.localNumberOfRows;
    local_int_t* nonzerosInRow = A.nonzerosInRow;
    global_int_t** mtxIndG = A.mtxIndG;
    local_int_t** mtxIndL = A.mtxIndL;
#ifdef HPCG_NO_MPI // In the non-MPI case we simply copy global indices to local index storage
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        int cur_nnz = nonzerosInRow[i];
        for (int j = 0; j < cur_nnz; j++)
            mtxIndL[i][j] = mtxIndG[i][j];
    }
#else // Run this section if compiling for MPI
    // Scan global IDs of the nonzeros in the matrix. Determine if the column ID matches a row ID. If not:
    // 1) We call the ComputeRankOfMatrixRow function, which tells us the rank of the processor owning the row ID.
    //    We need to receive this value of the x vector during the halo exchange.
    // 2) We record our row ID since we know that the other processor will need this value from us, due to symmetry.
    std::map<int, std::set<global_int_t>> sendList, receiveList;
    typedef std::map<int, std::set<global_int_t>>::iterator map_iter;
    typedef std::set<global_int_t>::iterator set_iter;
    std::map<global_int_t, local_int_t> externalToLocalMap;
    // TODO: With proper critical and atomic regions, this loop could be threaded, but not attempting it at this time
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        global_int_t currentGlobalRow = A.localToGlobalMap[i];
        for (int j = 0; j < nonzerosInRow[i]; j++)
        {
            global_int_t curIndex = mtxIndG[i][j];
            int rankIdOfColumnEntry = ComputeRankOfMatrixRow(*(A.geom), curIndex);
#ifdef HPCG_DETAILED_DEBUG
            HPCG_fout << "rank, row , col, globalToLocalMap[col] = " << A.geom->rank << " " << currentGlobalRow << " "
                      << curIndex << " " << A.globalToLocalMap[curIndex] << endl;
#endif
            if (A.geom->rank != rankIdOfColumnEntry)
            { // If column index is not a row index, then it comes from another processor
                receiveList[rankIdOfColumnEntry].insert(curIndex);
                sendList[rankIdOfColumnEntry].insert(
                    currentGlobalRow); // Matrix symmetry means we know the neighbor process wants my value
            }
        }
    }
    // Count number of matrix entries to send and receive
    local_int_t totalToBeSent = 0;
    for (map_iter curNeighbor = sendList.begin(); curNeighbor != sendList.end(); ++curNeighbor)
    {
        totalToBeSent += (curNeighbor->second).size();
    }
    local_int_t totalToBeReceived = 0;
    for (map_iter curNeighbor = receiveList.begin(); curNeighbor != receiveList.end(); ++curNeighbor)
    {
        totalToBeReceived += (curNeighbor->second).size();
    }
#ifdef HPCG_DETAILED_DEBUG
    // These are all attributes that should be true, due to symmetry
    HPCG_fout << "totalToBeSent = " << totalToBeSent << " totalToBeReceived = " << totalToBeReceived << endl;
    assert(totalToBeSent == totalToBeReceived); // Number of sent entry should equal number of received
    assert(sendList.size() == receiveList.size()); // Number of send-to neighbors should equal number of receive-from
    // Each receive-from neighbor should be a send-to neighbor, and send the same number of entries
    for (map_iter curNeighbor = receiveList.begin(); curNeighbor != receiveList.end(); ++curNeighbor)
    {
        assert(sendList.find(curNeighbor->first) != sendList.end());
        assert(sendList[curNeighbor->first].size() == receiveList[curNeighbor->first].size());
    }
#endif
    // Build the arrays and lists needed by the ExchangeHalo function.
    double* sendBuffer = new double[totalToBeSent];
    local_int_t* elementsToSend = new local_int_t[totalToBeSent];
    int* neighbors = new int[sendList.size()];
    local_int_t* receiveLength = new local_int_t[receiveList.size()];
    local_int_t* sendLength = new local_int_t[sendList.size()];
    int neighborCount = 0;
    local_int_t receiveEntryCount = 0;
    local_int_t sendEntryCount = 0;
    for (map_iter curNeighbor = receiveList.begin(); curNeighbor != receiveList.end(); ++curNeighbor, ++neighborCount)
    {
        int neighborId = curNeighbor->first; // rank of current neighbor we are processing
        neighbors[neighborCount] = neighborId; // store rank ID of current neighbor
        receiveLength[neighborCount] = receiveList[neighborId].size();
        sendLength[neighborCount] = sendList[neighborId].size(); // Get count if sends/receives
        for (set_iter i = receiveList[neighborId].begin(); i != receiveList[neighborId].end(); ++i, ++receiveEntryCount)
        {
            externalToLocalMap[*i]
                = localNumberOfRows + receiveEntryCount; // The remote columns are indexed at end of internals
        }
        for (set_iter i = sendList[neighborId].begin(); i != sendList[neighborId].end(); ++i, ++sendEntryCount)
        {
            // if (geom.rank==1) HPCG_fout << "*i, globalToLocalMap[*i], sendEntryCount = " << *i << " " <<
            // A.globalToLocalMap[*i] << " " << sendEntryCount << endl;
            elementsToSend[sendEntryCount] = A.globalToLocalMap[*i]; // store local ids of entry to send
        }
    }
#if 1
    // Convert matrix indices to local IDs.
    // NOTE: operator[] lookups below never insert: every owned column index is a
    // local row (present in globalToLocalMap) and every external index was
    // inserted into externalToLocalMap in the serial loop above, so the
    // parallel loop only performs read-only accesses.
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        for (int j = 0; j < nonzerosInRow[i]; j++)
        {
            global_int_t curIndex = mtxIndG[i][j];
            int rankIdOfColumnEntry = ComputeRankOfMatrixRow(*(A.geom), curIndex);
            if (A.geom->rank == rankIdOfColumnEntry)
            { // My column index, so convert to local index
                mtxIndL[i][j] = A.globalToLocalMap[curIndex];
            }
            else
            { // If column index is not a row index, then it comes from another processor
                mtxIndL[i][j] = externalToLocalMap[curIndex];
            }
        }
    }
#endif
    // Store contents in our matrix struct.
    // (A leftover debug printf of localNumberOfRows/numberOfExternalValues was
    // removed here: it polluted stdout on every rank and used "%d" for
    // local_int_t, which is undefined behavior when INDEX_64 is enabled.)
    A.numberOfExternalValues = externalToLocalMap.size();
    A.localNumberOfColumns = A.localNumberOfRows + A.numberOfExternalValues;
    A.numberOfSendNeighbors = sendList.size();
    A.totalToBeSent = totalToBeSent;
    A.elementsToSend = elementsToSend;
    A.neighbors = neighbors;
    A.receiveLength = receiveLength;
    A.sendLength = sendLength;
    A.sendBuffer = sendBuffer;
#ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << " For rank " << A.geom->rank << " of " << A.geom->size
              << ", number of neighbors = " << A.numberOfSendNeighbors << endl;
    for (int i = 0; i < A.numberOfSendNeighbors; i++)
    {
        HPCG_fout << " rank " << A.geom->rank << " neighbor " << neighbors[i]
                  << " send/recv length = " << sendLength[i] << "/" << receiveLength[i] << endl;
        for (local_int_t j = 0; j < sendLength[i]; ++j)
            HPCG_fout << " rank " << A.geom->rank << " elementsToSend[" << j << "] = " << elementsToSend[j]
                      << endl;
    }
#endif
#endif
    // ifdef HPCG_NO_MPI
    return;
}

21
src/SetupHalo_ref.hpp Normal file
View File

@@ -0,0 +1,21 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef SETUPHALO_REF_HPP
#define SETUPHALO_REF_HPP
#include "SparseMatrix.hpp"
void SetupHalo_ref(SparseMatrix& A);
#endif // SETUPHALO_REF_HPP

306
src/SparseMatrix.hpp Normal file
View File

@@ -0,0 +1,306 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file SparseMatrix.hpp
HPCG data structures for the sparse matrix
*/
#ifndef SPARSEMATRIX_HPP
#define SPARSEMATRIX_HPP
#ifdef USE_CUDA
#include <cuda.h>
#include <cusparse.h>
#endif
#ifdef USE_GRACE
#include <nvpl_sparse.h>
#endif
#include "Cuda.hpp"
#include "Geometry.hpp"
#include "MGData.hpp"
#include "Vector.hpp"
#include <cassert>
#include <vector>
extern bool Use_Hpcg_Mem_Reduction;
#ifndef HPCG_NO_MPI
extern p2p_comm_mode_t P2P_Mode;
#endif
#if __cplusplus < 201103L
// for C++03
#include <map>
typedef std::map<global_int_t, local_int_t> GlobalToLocalMap;
#else
// for C++11 or greater
#include <unordered_map>
using GlobalToLocalMap = std::unordered_map<global_int_t, local_int_t>;
#endif
#ifdef USE_CUDA
// Opaque cuSPARSE descriptors kept per matrix so they can be created once
// during optimization and reused across iterations.
struct CUSPARSE_STRUCT
{
    cusparseDnVecDescr_t vecX; // dense vector descriptor (input side)
    cusparseDnVecDescr_t vecY; // dense vector descriptor (output side)
    cusparseSpMatDescr_t matA; // sparse matrix descriptor for A
    cusparseSpMatDescr_t matL; // sparse matrix descriptor (presumably lower factor) — used with SpSV
    cusparseSpMatDescr_t matU; // sparse matrix descriptor (presumably upper factor) — used with SpSV
    // CUSPARSE SpSV
    cusparseSpSVDescr_t spsvDescrL, spsvDescrU; // triangular-solve descriptors (see cusparseSpSV_updateMatrix in TestCG)
};
// Auxiliary device-side arrays used by the GPU implementation.
// NOTE(review): pointer lifetimes/ownership are managed elsewhere in the
// project; none of them are documented here — confirm against the allocators.
struct GPU_AUX_STRUCT
{
    // Uncolored row related info
    local_int_t* nnzPerRow;
    local_int_t* columns;
    double* values;
    local_int_t* csrAPermOffsets;
    local_int_t* csrLPermOffsets;
    local_int_t* csrUPermOffsets;
    local_int_t* diagonalIdx;
    // Sliced EllPACK Aux
    local_int_t* sellADiagonalIdx;
    // Auxiliary data
    local_int_t* f2c; // fine-to-coarse mapping (presumably; see MG usage)
    local_int_t* color;
    int* colorCountCpu;
    // MULTI-GPU Aux data
    local_int_t* map;
    local_int_t* ext2csrOffsets;
    local_int_t* elementsToSend;
    global_int_t* localToGlobalMap;
    local_int_t compressNumberOfRows;
    double* sendBuffer;
};
#endif
#ifdef USE_GRACE
// Opaque NVPL Sparse descriptors (CPU/Grace backend), mirroring the role of
// CUSPARSE_STRUCT on the GPU side: created once, reused every iteration.
struct NVPL_SPARSE_STRUCT
{
    nvpl_sparse_dn_vec_descr_t vecX; // dense vector descriptor (input side)
    nvpl_sparse_dn_vec_descr_t vecY; // dense vector descriptor (output side)
    nvpl_sparse_sp_mat_descr_t matL; // sparse matrix descriptor (presumably lower factor)
    nvpl_sparse_sp_mat_descr_t matU; // sparse matrix descriptor (presumably upper factor)
    nvpl_sparse_sp_mat_descr_t matA; // sparse matrix descriptor for A
    nvpl_sparse_spsv_descr_t spsvDescrL, spsvDescrU; // triangular-solve descriptors (see nvpl_sparse_spsv_update_matrix in TestCG)
    nvpl_sparse_spmv_descr_t spmvADescr, spmvLDescr, spmvUDescr; // SpMV descriptors
};
struct CPU_AUX_STRUCT
{
    // Auxiliary data
    // Coloring info as number of colors and where each color starts
    // Also keep information on how many consecutive rows share the same color
    // This assumes matrix reordering (rows with same color are packed)
    local_int_t* color;           // per-row color
    local_int_t* firstRowOfColor; // first (reordered) row index of each color
    local_int_t* nRowsWithColor;  // number of consecutive rows carrying each color
    local_int_t* tempIndex;       // scratch per-row counts (set from SetupHalo_Cpu's extTemp)
};
#endif
struct SparseMatrix_STRUCT
{
rank_type_t rankType;
int level;
char* title; //!< name of the sparse matrix
Geometry* geom; //!< geometry associated with this matrix
global_int_t totalNumberOfRows; //!< total number of matrix rows across all processes
global_int_t totalNumberOfNonzeros; //!< total number of matrix nonzeros across all processes
local_int_t localNumberOfRows; //!< number of rows local to this process
local_int_t localNumberOfColumns; //!< number of columns local to this process
local_int_t localNumberOfNonzeros; //!< number of nonzeros local to this process
local_int_t* nonzerosInRow; //!< The number of nonzeros in a row will always be 27 or fewer
global_int_t** mtxIndG; //!< matrix indices as global values
local_int_t** mtxIndL; //!< matrix indices as local values
double** matrixValues; //!< values of matrix entries
double** matrixDiagonal; //!< values of matrix diagonal entries
GlobalToLocalMap globalToLocalMap; //!< global-to-local mapping
std::vector<global_int_t> localToGlobalMap; //!< local-to-global mapping
mutable bool isDotProductOptimized;
mutable bool isSpmvOptimized;
mutable bool isMgOptimized;
mutable bool isWaxpbyOptimized;
mutable MGData* mgData; // Pointer to the coarse level data for this fine matrix
void* optimizationData; // pointer that can be used to store implementation-specific data
local_int_t totalToBeSent; //!< total number of entries to be sent
local_int_t slice_size;
#ifndef HPCG_NO_MPI
local_int_t numberOfExternalValues; //!< number of entries that are external to this process
int numberOfSendNeighbors; //!< number of neighboring processes that will be send local data
local_int_t* elementsToSend; //!< elements to send to neighboring processes
int* neighbors; //!< neighboring processes
int* neighborsPhysical;
local_int_t* receiveLength; //!< lenghts of messages received from neighboring processes
local_int_t* sendLength; //!< lenghts of messages sent to neighboring processes
double* sendBuffer; //!< send buffer for non-blocking sends
local_int_t extNnz;
#endif
// Optmization Data common between CPU and GPU
// Coloring permutations
local_int_t totalColors;
local_int_t* ref2opt;
local_int_t* opt2ref;
local_int_t* f2cPerm;
// Sliced EllPACK
local_int_t *sellASliceMrl, *sellLSliceMrl, *sellUSliceMrl;
local_int_t *sellAPermColumns, *sellLPermColumns, *sellUPermColumns;
double *sellAPermValues, *sellLPermValues, *sellUPermValues;
double* diagonal;
char* bufferSvL = nullptr;
char* bufferSvU = nullptr;
char* bufferMvA = nullptr;
char* bufferMvL = nullptr;
char* bufferMvU = nullptr;
// MULTI-GPU data
local_int_t* csrExtOffsets;
local_int_t* csrExtColumns;
double* csrExtValues;
double* tempBuffer;
// When MPI_All2allv is used for P2P communication
int* scounts;
int* rcounts;
int* sdispls;
int* rdispls;
#ifdef USE_CUDA
CUSPARSE_STRUCT cusparseOpt;
GPU_AUX_STRUCT gpuAux;
#endif
// #ifdef USE_GRACE
// NVPL_SPARSE_STRUCT nvplSparseOpt;
// CPU_AUX_STRUCT cpuAux;
// #endif
mutable struct SparseMatrix_STRUCT* Ac; // Coarse grid matrix
};
typedef struct SparseMatrix_STRUCT SparseMatrix;
/*!
Initializes the known system matrix data structure members to 0.
@param[in] A the known system matrix
*/
/*!
  Initializes the known system matrix data structure members to 0.

  @param[inout] A    the known system matrix
  @param[in]    geom geometry associated with this matrix (stored, not copied)
*/
inline void InitializeSparseMatrix(SparseMatrix& A, Geometry* geom)
{
    A.title = 0;
    A.geom = geom;
    A.totalNumberOfRows = 0;
    A.totalNumberOfNonzeros = 0;
    A.localNumberOfRows = 0;
    A.localNumberOfColumns = 0;
    A.localNumberOfNonzeros = 0;
    A.nonzerosInRow = 0;
    A.mtxIndG = 0;
    A.mtxIndL = 0;
    A.matrixValues = 0;
    A.matrixDiagonal = 0;
    // Optimization is ON by default. The code that switches it OFF is in the
    // functions that are meant to be optimized.
    A.isDotProductOptimized = true;
    A.isSpmvOptimized = true;
    A.isMgOptimized = true;
    A.isWaxpbyOptimized = true;
    // totalToBeSent exists in both MPI and non-MPI builds, so it is zeroed
    // unconditionally here (the redundant second assignment inside the MPI
    // block was removed).
    A.totalToBeSent = 0;
#ifndef HPCG_NO_MPI
    A.numberOfExternalValues = 0;
    A.numberOfSendNeighbors = 0;
    A.elementsToSend = 0;
    A.neighbors = 0;
    A.neighborsPhysical = 0;
    A.receiveLength = 0;
    A.sendLength = 0;
    A.sendBuffer = 0;
#endif
    A.mgData = 0; // Fine-to-coarse grid transfer initially not defined.
    return;
}
/*!
Copy values from matrix diagonal into user-provided vector.
@param[in] A the known system matrix.
@param[inout] diagonal Vector of diagonal values (must be allocated before call to this function).
*/
inline void CopyMatrixDiagonal(SparseMatrix& A, Vector& diagonal)
{
    // The destination vector must match the local row count exactly.
    assert(A.localNumberOfRows == diagonal.localLength);
    const local_int_t nrows = A.localNumberOfRows;
    for (local_int_t row = 0; row < nrows; ++row)
    {
        // matrixDiagonal holds a pointer per row into the row's values array.
        diagonal.values[row] = *(A.matrixDiagonal[row]);
    }
}
/*!
Replace specified matrix diagonal value.
@param[inout] A The system matrix.
@param[in] diagonal Vector of diagonal values that will replace existing matrix diagonal values.
*/
inline void ReplaceMatrixDiagonal(SparseMatrix& A, Vector& diagonal)
{
    // The source vector must match the local row count exactly.
    assert(A.localNumberOfRows == diagonal.localLength);
    const local_int_t nrows = A.localNumberOfRows;
    const double* src = diagonal.values;
    for (local_int_t row = 0; row < nrows; ++row)
    {
        // Write through the per-row diagonal pointer into the matrix storage.
        *(A.matrixDiagonal[row]) = src[row];
    }
}
#endif // SPARSEMATRIX_HPP

243
src/TestCG.cpp Normal file
View File

@@ -0,0 +1,243 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file TestCG.cpp
HPCG routine
*/
// Changelog
//
// Version 0.4
// - Added timing of setup time for sparse MV
// - Corrected percentages reported for sparse MV with overhead
//
/////////////////////////////////////////////////////////////////////////
#include <fstream>
#include <iostream>
using std::endl;
#include "hpcg.hpp"
#include <vector>
#include "CG.hpp"
#include "CG_ref.hpp"
#include "TestCG.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
extern int use_output_file;
/*!
Test the correctness of the Preconditined CG implementation by using a system matrix with a dominant diagonal.
@param[in] geom The description of the problem's geometry.
@param[in] A The known system matrix
@param[in] data the data structure with all necessary CG vectors preallocated
@param[in] b The known right hand side vector
@param[inout] x On entry: the initial guess; on exit: the new approximate solution
@param[out] testcg_data the data structure with the results of the test including pass/fail information
@return Returns zero on success and a non-zero value otherwise.
@see CG()
*/
int TestCG(SparseMatrix& A, CGData& data, Vector& b, Vector& x, TestCGData& testcg_data)
{
    // Use this array for collecting timing information
    std::vector<double> times(8, 0.0);
    // Temporary storage for holding original diagonal and RHS
    Vector origDiagA, exaggeratedDiagA, origB;
    InitializeVector(origDiagA, A.localNumberOfRows, A.rankType);
    InitializeVector(exaggeratedDiagA, A.localNumberOfRows, A.rankType);
    InitializeVector(origB, A.localNumberOfRows, A.rankType);
    // Snapshot the diagonal (and, on GPU ranks, the device-side copy) and the
    // RHS so both can be restored after the test runs.
    CopyMatrixDiagonal(A, origDiagA);
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        CopyMatrixDiagonalCuda(A, origDiagA);
#endif
    }
    CopyVector(origDiagA, exaggeratedDiagA);
    CopyVector(b, origB);
    // Modify the matrix diagonal to greatly exaggerate diagonal values.
    // CG should converge in about 10 iterations for this problem, regardless of problem size
    for (local_int_t i = 0; i < A.localNumberOfRows; ++i)
    {
        global_int_t globalRowID = A.localToGlobalMap[i];
        if (globalRowID < 9)
        {
            // First nine global rows get distinct scale factors 2e6..10e6.
            double scale = (globalRowID + 2) * 1.0e6;
            ScaleVectorValue(exaggeratedDiagA, i, scale);
            ScaleVectorValue(b, i, scale);
        }
        else
        {
            ScaleVectorValue(exaggeratedDiagA, i, 1.0e6);
            ScaleVectorValue(b, i, 1.0e6);
        }
    }
    // Reference Matrix
    ReplaceMatrixDiagonal(A, exaggeratedDiagA);
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        // Push the exaggerated diagonal to the device, permute b/diagonal into
        // the optimized (reordered) layout, then refresh the SpSV descriptors
        // that cache the diagonal.
        CopyVectorH2D(exaggeratedDiagA);
        PermVectorCuda(A.opt2ref, b, A.localNumberOfRows);
        PermVectorCuda(A.opt2ref, exaggeratedDiagA, A.localNumberOfRows);
        ReplaceMatrixDiagonalCuda(A, exaggeratedDiagA);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrL, exaggeratedDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrU, exaggeratedDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    else
    {
#ifdef USE_GRACE
        // Same sequence for the CPU (NVPL Sparse) backend.
        PermVectorCpu(A.opt2ref, b, A.localNumberOfRows);
        PermVectorCpu(A.opt2ref, exaggeratedDiagA, A.localNumberOfRows);
        ReplaceMatrixDiagonalCpu(A, exaggeratedDiagA);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrL, exaggeratedDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrU, exaggeratedDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    ////////////////////////////////
    int niters = 0;
    double normr = 0.0;
    double normr0 = 0.0;
    int maxIters = 50;
    int numberOfCgCalls = 2;
    double tolerance = 1.0e-12; // Set tolerance to reasonable value for grossly scaled diagonal terms
    testcg_data.expected_niters_no_prec
        = 12; // For the unpreconditioned CG call, we should take about 10 iterations, permit 12
    testcg_data.expected_niters_prec = 2; // For the preconditioned case, we should take about 1 iteration, permit 2
    testcg_data.niters_max_no_prec = 0;
    testcg_data.niters_max_prec = 0;
    for (int k = 0; k < 2; ++k)
    { // This loop tests both unpreconditioned (k==0) and preconditioned (k==1) runs
        int expected_niters = testcg_data.expected_niters_no_prec;
        if (k == 1)
            expected_niters = testcg_data.expected_niters_prec;
        for (int i = 0; i < numberOfCgCalls; ++i)
        {
            ZeroVector(x); // Zero out x
            // The (k == 1) argument enables the preconditioner inside CG.
            int ierr = CG(A, data, b, x, maxIters, tolerance, niters, normr, normr0, &times[0], k == 1, 0);
            if (ierr)
                // Error is reported but not fatal: the pass/fail bookkeeping below still runs.
                if (use_output_file)
                {
                    HPCG_fout << "Error in call to CG: " << ierr << ".\n" << endl;
                }
                else
                {
                    std::cout << "Error in call to CG: " << ierr << ".\n" << endl;
                }
            if (niters <= expected_niters)
            {
                ++testcg_data.count_pass;
            }
            else
            {
                ++testcg_data.count_fail;
            }
            if (k == 0 && niters > testcg_data.niters_max_no_prec)
                testcg_data.niters_max_no_prec = niters; // Keep track of largest iter count
            if (k == 1 && niters > testcg_data.niters_max_prec)
                testcg_data.niters_max_prec = niters; // Same for preconditioned run
            if (A.geom->rank == 0)
            {
                if (use_output_file)
                {
                    HPCG_fout << "Call [" << i << "] Number of Iterations [" << niters << "] Scaled Residual ["
                              << normr / normr0 << "]" << endl;
                }
                else
                {
                    std::cout << "Call [" << i << "] Number of Iterations [" << niters << "] Scaled Residual ["
                              << normr / normr0 << "]" << endl;
                }
                if (niters > expected_niters)
                    if (use_output_file)
                    {
                        HPCG_fout << " Expected " << expected_niters << " iterations. Performed " << niters << "."
                                  << endl;
                    }
                    else
                    {
                        std::cout << " Expected " << expected_niters << " iterations. Performed " << niters << "."
                                  << endl;
                    }
            }
        }
    }
    // Restore matrix diagonal and RHS
    ReplaceMatrixDiagonal(A, origDiagA);
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        // Restore the device diagonal and refresh the cached SpSV diagonals.
        ReplaceMatrixDiagonalCuda(A, origDiagA);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrL, origDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrU, origDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    else
    {
#ifdef USE_GRACE
        ReplaceMatrixDiagonalCpu(A, origDiagA);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrL, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrU, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    CopyVector(origB, b);
    // Delete vectors
    DeleteVector(origDiagA);
    DeleteVector(exaggeratedDiagA);
    DeleteVector(origB);
    testcg_data.normr = normr;
    return 0;
}

45
src/TestCG.hpp Normal file
View File

@@ -0,0 +1,45 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file TestCG.hpp
HPCG data structure
*/
#ifndef TESTCG_HPP
#define TESTCG_HPP
#include "CGData.hpp"
#include "SparseMatrix.hpp"
#include "Vector.hpp"
#include "hpcg.hpp"
struct TestCGData_STRUCT
{
    int count_pass;              //!< number of successful tests
    int count_fail;              //!< number of failed tests
    int expected_niters_no_prec; //!< expected number of test CG iterations without preconditioning with diagonally
                                 //!< dominant matrix (~12)
    int expected_niters_prec;    //!< expected number of test CG iterations with preconditioning and with diagonally
                                 //!< dominant matrix (~1-2)
    int niters_max_no_prec;      //!< maximum observed test CG iteration count without preconditioner (k==0 runs)
    int niters_max_prec;         //!< maximum observed test CG iteration count with preconditioner (k==1 runs)
    double normr;                //!< residual norm achieved during test CG iterations
};
typedef struct TestCGData_STRUCT TestCGData;
extern int TestCG(SparseMatrix& A, CGData& data, Vector& b, Vector& x, TestCGData& testcg_data);
#endif // TESTCG_HPP

49
src/TestNorms.cpp Normal file
View File

@@ -0,0 +1,49 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file TestNorms.cpp
HPCG routine
*/
#include "TestNorms.hpp"
#include <cmath>
/*!
Computes the mean and standard deviation of the array of norm results.
@param[in] testnorms_data data structure with the results of norm test
@return Returns 0 upon success or non-zero otherwise
*/
int TestNorms(TestNormsData& testnorms_data)
{
    const int nsamples = testnorms_data.samples;
    const double* vals = testnorms_data.values;
    // Mean: accumulate deviations from the first sample, then shift back.
    // This reduces cancellation when all samples are close to each other.
    double delta_sum = 0.0;
    for (int k = 0; k < nsamples; ++k)
        delta_sum += (vals[k] - vals[0]);
    const double mean = vals[0] + delta_sum / (double) nsamples;
    testnorms_data.mean = mean;
    // Population variance of the samples about the mean.
    double sq_sum = 0.0;
    for (int k = 0; k < nsamples; ++k)
        sq_sum += (vals[k] - mean) * (vals[k] - mean);
    testnorms_data.variance = sq_sum / (double) nsamples;
    // Runs pass when the norms agree to within a tight variance tolerance.
    testnorms_data.pass = (testnorms_data.variance < 1.0e-6);
    return 0;
}

36
src/TestNorms.hpp Normal file
View File

@@ -0,0 +1,36 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file TestNorms.hpp
HPCG data structure
*/
#ifndef TESTNORMS_HPP
#define TESTNORMS_HPP
struct TestNormsData_STRUCT
{
    double* values;  //!< sample values (one residual norm per benchmark run)
    double mean;     //!< mean of all samples
    double variance; //!< variance of the samples about the mean
    int samples;     //!< number of samples
    bool pass;       //!< pass/fail indicator (true when variance < 1.0e-6)
};
typedef struct TestNormsData_STRUCT TestNormsData;
extern int TestNorms(TestNormsData& testnorms_data);
#endif // TESTNORMS_HPP

298
src/TestSymmetry.cpp Normal file
View File

@@ -0,0 +1,298 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file TestSymmetry.cpp
HPCG routine
*/
// The MPI include must be first for Windows platforms
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#include <cfloat>
#include <fstream>
#include <iostream>
using std::endl;
#include <cmath>
#include <vector>
#include "hpcg.hpp"
#include "ComputeDotProduct.hpp"
#include "ComputeMG.hpp"
#include "ComputeResidual.hpp"
#include "ComputeSPMV.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
#include "Geometry.hpp"
#include "SparseMatrix.hpp"
#include "TestSymmetry.hpp"
extern int use_output_file;
/*!
Tests symmetry-preserving properties of the sparse matrix vector multiply and multi-grid routines.
@param[in] geom The description of the problem's geometry.
@param[in] A The known system matrix
@param[in] b The known right hand side vector
@param[in] xexact The exact solution vector
@param[inout] testsymmetry_data The data structure with the results of the CG symmetry test including pass/fail
information
@return returns 0 upon success and non-zero otherwise
@see ComputeDotProduct
@see ComputeDotProduct_ref
@see ComputeSPMV
@see ComputeSPMV_ref
@see ComputeMG
@see ComputeMG_ref
*/
int TestSymmetry(SparseMatrix& A, Vector& b, Vector& xexact, TestSymmetryData& testsymmetry_data)
{
    local_int_t nrow = A.localNumberOfRows;
    local_int_t ncol = A.localNumberOfColumns;
    // Work vectors are sized to the column count so halo entries fit for SpMV.
    Vector x_ncol, y_ncol, z_ncol;
    InitializeVector(x_ncol, ncol, A.rankType);
    InitializeVector(y_ncol, ncol, A.rankType);
    InitializeVector(z_ncol, ncol, A.rankType);
    double t4 = 0.0; // Needed for dot-product call, otherwise unused
    testsymmetry_data.count_fail = 0;
    // Test symmetry of matrix
    // First load vectors with random values
    FillRandomVector(x_ncol);
    FillRandomVector(y_ncol);
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        // Random values were produced on the host; mirror them to the device.
        CopyVectorH2D(y_ncol);
        CopyVectorH2D(x_ncol);
#endif
    }
    int ierr;
    double xNorm2, yNorm2;
    double ANorm = 2 * 26.0; // presumably a bound on ||A|| for the 27-point operator — confirm
    // Next, compute x'*A*y
    ComputeDotProduct(nrow, y_ncol, y_ncol, yNorm2, t4, A.isDotProductOptimized, A.rankType);
    ierr = ComputeSPMV(A, y_ncol, z_ncol); // z_nrow = A*y_overlap
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to SpMV: " << ierr << ".\n" << endl;
        }
    double xtAy = 0.0;
    ierr = ComputeDotProduct(nrow, x_ncol, z_ncol, xtAy, t4, A.isDotProductOptimized, A.rankType); // x'*A*y
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
    // Next, compute y'*A*x
    ComputeDotProduct(nrow, x_ncol, x_ncol, xNorm2, t4, A.isDotProductOptimized, A.rankType);
    ierr = ComputeSPMV(A, x_ncol, z_ncol); // b_computed = A*x_overlap
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to SpMV: " << ierr << ".\n" << endl;
        }
    double ytAx = 0.0;
    ierr = ComputeDotProduct(nrow, y_ncol, z_ncol, ytAx, t4, A.isDotProductOptimized, A.rankType); // y'*A*x
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
    // Scaled departure from symmetry: |x'Ay - y'Ax| divided by a rounding-error
    // bound for the two bilinear forms (in units of machine epsilon).
    testsymmetry_data.depsym_spmv = std::fabs((long double) (xtAy - ytAx))
        / ((xNorm2 * ANorm * yNorm2 + yNorm2 * ANorm * xNorm2) * (DBL_EPSILON));
    if (testsymmetry_data.depsym_spmv > 1.0)
        ++testsymmetry_data.count_fail; // If the difference is > 1, count it wrong
    if (A.geom->rank == 0)
        if (use_output_file)
        {
            HPCG_fout << "Departure from symmetry (scaled) for SpMV abs(x'*A*y - y'*A*x) = "
                      << testsymmetry_data.depsym_spmv << endl;
        }
        else
        {
            std::cout << "Departure from symmetry (scaled) for SpMV abs(x'*A*y - y'*A*x) = "
                      << testsymmetry_data.depsym_spmv << endl;
        }
    // Test symmetry of multi-grid
    // Compute x'*Minv*y
    ierr = ComputeMG(A, y_ncol, z_ncol); // z_ncol = Minv*y_ncol
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to MG: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to MG: " << ierr << ".\n" << endl;
        }
    double xtMinvy = 0.0;
    ierr = ComputeDotProduct(nrow, x_ncol, z_ncol, xtMinvy, t4, A.isDotProductOptimized, A.rankType); // x'*Minv*y
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
    // Next, compute z'*Minv*x
    ierr = ComputeMG(A, x_ncol, z_ncol); // z_ncol = Minv*x_ncol
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to MG: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to MG: " << ierr << ".\n" << endl;
        }
    double ytMinvx = 0.0;
    ierr = ComputeDotProduct(nrow, y_ncol, z_ncol, ytMinvx, t4, A.isDotProductOptimized, A.rankType); // y'*Minv*x
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
    // Same scaled symmetry measure, now for the multigrid preconditioner.
    testsymmetry_data.depsym_mg = std::fabs((long double) (xtMinvy - ytMinvx))
        / ((xNorm2 * ANorm * yNorm2 + yNorm2 * ANorm * xNorm2) * (DBL_EPSILON));
    if (testsymmetry_data.depsym_mg > 1.0)
        ++testsymmetry_data.count_fail; // If the difference is > 1, count it wrong
    if (A.geom->rank == 0)
        if (use_output_file)
        {
            HPCG_fout << "Departure from symmetry (scaled) for MG abs(x'*Minv*y - y'*Minv*x) = "
                      << testsymmetry_data.depsym_mg << endl;
        }
        else
        {
            std::cout << "Departure from symmetry (scaled) for MG abs(x'*Minv*y - y'*Minv*x) = "
                      << testsymmetry_data.depsym_mg << endl;
        }
    CopyVector(xexact, x_ncol); // Copy exact answer into overlap vector
    int numberOfCalls = 2;
    double residual = 0.0;
    for (int i = 0; i < numberOfCalls; ++i)
    {
        if (A.rankType == GPU)
        {
#ifdef USE_CUDA
            CopyVectorH2D(x_ncol);
#endif
        }
        ierr = ComputeSPMV(A, x_ncol, z_ncol); // b_computed = A*x_overlap
        // Undo the optimized-kernel row ordering (ref2opt) so the result can be
        // compared against the reference-ordered right-hand side b below.
        if (A.rankType == GPU)
        {
#ifdef USE_CUDA
            PermVectorCuda(A.ref2opt, z_ncol, nrow);
            CopyVectorD2H(z_ncol);
#endif
        }
        else
        {
#ifdef USE_GRACE
            PermVectorCpu(A.ref2opt, z_ncol, nrow);
#endif
        }
        if (ierr)
            if (use_output_file)
            {
                HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl;
            }
            else
            {
                std::cout << "Error in call to SpMV: " << ierr << ".\n" << endl;
            }
        if ((ierr = ComputeResidual(A.localNumberOfRows, b, z_ncol, residual)))
            if (use_output_file)
            {
                HPCG_fout << "Error in call to compute_residual: " << ierr << ".\n" << endl;
            }
            else
            {
                std::cout << "Error in call to compute_residual: " << ierr << ".\n" << endl;
            }
        if (A.geom->rank == 0)
            if (use_output_file)
            {
                HPCG_fout << "SpMV call [" << i << "] Residual [" << residual << "]" << endl;
            }
            else
            {
                std::cout << "SpMV call [" << i << "] Residual [" << residual << "]" << endl;
            }
    }
    // Release work vectors (device buffers are freed too when present).
    DeleteVector(x_ncol);
    DeleteVector(y_ncol);
    DeleteVector(z_ncol);
    return 0;
}

38
src/TestSymmetry.hpp Normal file
View File

@@ -0,0 +1,38 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file TestSymmetry.hpp
HPCG data structures for symmetry testing
*/
#ifndef TESTSYMMETRY_HPP
#define TESTSYMMETRY_HPP
#include "CGData.hpp"
#include "SparseMatrix.hpp"
#include "hpcg.hpp"
// Results of the SpMV/MG symmetry checks (filled in by TestSymmetry).
struct TestSymmetryData_STRUCT
{
    double depsym_spmv; //!< scaled departure from symmetry for the SPMV kernel
    double depsym_mg;   //!< scaled departure from symmetry for the MG kernel
    int count_fail;     //!< number of failures in the symmetry tests
};
typedef struct TestSymmetryData_STRUCT TestSymmetryData;
extern int TestSymmetry(SparseMatrix& A, Vector& b, Vector& xexact, TestSymmetryData& testsymmetry_data);
#endif // TESTSYMMETRY_HPP

240
src/Vector.hpp Normal file
View File

@@ -0,0 +1,240 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file Vector.hpp
HPCG data structures for dense vectors
*/
#ifndef VECTOR_HPP
#define VECTOR_HPP
#include <cassert>
#include <cstdlib>
#include <cuda_runtime.h>
#include <omp.h>
#include <vector>
#include "Geometry.hpp"
struct Vector_STRUCT
{
    rank_type_t rt;          //!< rank type (GPU or CPU) this vector was initialized for
    local_int_t localLength; //!< length of local portion of the vector
    bool isCudaHost;         //!< true if host storage was allocated with cudaMallocHost (pinned)
    double* values;          //!< array of values (host side)
    /*!
      This is for storing optimized data structures created in OptimizeProblem and
      used inside optimized ComputeSPMV().
    */
    void* optimizationData;
#ifdef USE_CUDA
    double* values_d = nullptr; //!< device-side copy of the values (allocated for GPU ranks only)
#endif
    bool initialized = false; //!< set by InitializeVector; asserted by ZeroVector/CopyVector
};
typedef struct Vector_STRUCT Vector;
/*!
Initializes input vector.
@param[in] v
@param[in] localLength Length of local portion of input vector
*/
inline void InitializeVector(Vector& v, local_int_t localLength, rank_type_t rt, bool isCudaHost = false)
{
    v.localLength = localLength;
    v.isCudaHost = isCudaHost;
    v.rt = rt;
#ifdef USE_CUDA
    // GPU ranks may request pinned (page-locked) host memory for faster H2D/D2H transfers.
    if (v.rt == GPU && v.isCudaHost)
        cudaMallocHost(&(v.values), sizeof(double) * localLength);
    else
#endif
        v.values = new double[localLength]; // plain heap storage otherwise
    v.optimizationData = 0;
#ifdef USE_CUDA
    // GPU ranks also carry a device-side buffer of the same length.
    if (v.rt == GPU)
        cudaMalloc((void**) &(v.values_d), sizeof(double) * localLength);
#endif
    v.initialized = true;
    return;
}
/*!
Fill the input vector with zero values.
@param[inout] v - On entrance v is initialized, on exit all its values are zero.
*/
inline void ZeroVector(Vector& v)
{
    assert(v.initialized);
    local_int_t localLength = v.localLength;
    double* vv = v.values;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localLength; ++i)
        vv[i] = 0.0;
#ifdef USE_CUDA
    // Keep the device copy consistent with the zeroed host values.
    if (v.rt == GPU)
    {
        cudaMemset(v.values_d, 0, sizeof(double) * localLength);
    }
#endif
    return;
}
/*!
Multiply (scale) a specific vector entry by a given value.
@param[inout] v Vector to be modified
@param[in] index Local index of entry to scale
@param[in] value Value to scale by
*/
// Scale one host-side entry of v in place; index must be a valid local index.
inline void ScaleVectorValue(Vector& v, local_int_t index, double value)
{
    assert(index >= 0 && index < v.localLength);
    v.values[index] *= value;
    return;
}
/*!
Fill the input vector with pseudo-random values.
@param[in] v
*/
inline void FillRandomVector(Vector& v)
{
    local_int_t localLength = v.localLength;
    double* vv = v.values;
    // NOTE(review): rand() uses hidden shared state and is not required to be
    // thread-safe; under OpenMP the generated sequence is nondeterministic
    // across runs — confirm this is acceptable for the benchmark.
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localLength; ++i)
        vv[i] = rand() / (double) (RAND_MAX) + 1.0; // values in [1, 2]
    return;
}
/*!
Copy input vector to output vector.
@param[in] v Input vector
@param[in] w Output vector
*/
inline void CopyVector(const Vector& v, Vector& w)
{
    // Copy only the overlapping prefix when the two lengths differ.
    local_int_t len = std::min(v.localLength, w.localLength);
    assert(v.initialized && w.initialized);
    double* vv = v.values;
    double* wv = w.values;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < len; ++i)
        wv[i] = vv[i];
#ifdef USE_CUDA
    // Mirror the copy on the device only when both vectors live on the GPU.
    if (v.rt == GPU && w.rt == GPU)
    {
        cudaMemcpy(w.values_d, v.values_d, sizeof(double) * len, cudaMemcpyDeviceToDevice);
    }
#endif
    return;
}
#ifdef USE_CUDA
// Synchronize v's host array from its device buffer (device -> host).
inline void CopyVectorD2H(const Vector& v)
{
    local_int_t localLength = v.localLength;
    cudaMemcpy(v.values, v.values_d, sizeof(double) * localLength, cudaMemcpyDeviceToHost);
    return;
}
// Device-to-device copy of v's buffer into w's buffer.
// NOTE(review): length is taken from v only — assumes w.values_d holds at
// least v.localLength doubles; confirm against callers.
inline void CopyVectorD2D(const Vector& v, Vector& w)
{
    local_int_t localLength = v.localLength;
    cudaMemcpy(w.values_d, v.values_d, sizeof(double) * localLength, cudaMemcpyDeviceToDevice);
    return;
}
// Synchronize v's device buffer from its host array (host -> device).
inline void CopyVectorH2D(const Vector& v)
{
    local_int_t localLength = v.localLength;
    cudaMemcpy(v.values_d, v.values, sizeof(double) * localLength, cudaMemcpyHostToDevice);
    return;
}
#endif
inline void CopyAndReorderVector(const Vector& v, Vector& w, local_int_t* perm)
{
local_int_t localLength = v.localLength;
assert(w.localLength >= localLength);
double* vv = v.values;
double* wv = w.values;
local_int_t i;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < localLength; ++i)
{
wv[i] = vv[perm[i]];
}
return;
}
/*!
Deallocates the members of the data structure of the known system matrix provided they are not 0.
@param[in] A the known system matrix
*/
/*!
  Deallocates the storage owned by a vector and resets it to an empty,
  uninitialized state so accidental reuse or a second DeleteVector call
  cannot double-free or dereference stale pointers.
  @param[inout] v the vector to tear down
*/
inline void DeleteVector(Vector& v)
{
    // Release host storage with the allocator that created it (see InitializeVector).
    if (v.isCudaHost)
        cudaFreeHost(v.values);
    else
    {
        delete[] v.values;
    }
    v.values = nullptr; // guard against dangling pointer / double free
    v.localLength = 0;
#ifdef USE_CUDA
    // Device buffer exists only for GPU-rank vectors.
    if (v.values_d)
    {
        cudaFree(v.values_d);
        v.values_d = nullptr;
    }
#endif
    v.initialized = false; // so ZeroVector/CopyVector asserts catch use-after-delete
    return;
}
#endif // VECTOR_HPP

98
src/WriteProblem.cpp Normal file
View File

@@ -0,0 +1,98 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file WriteProblem.cpp
HPCG routine
*/
#include "WriteProblem.hpp"
#include <cstdio>
/*!
Routine to dump:
- matrix in row, col, val format for analysis with MATLAB
- x, xexact, b as simple arrays of numbers.
Writes to A.dat, x.dat, xexact.dat and b.dat, respectivly.
NOTE: THIS CODE ONLY WORKS ON SINGLE PROCESSOR RUNS
Read into MATLAB using:
load A.dat
A=spconvert(A);
load x.dat
load xexact.dat
load b.dat
@param[in] geom The description of the problem's geometry.
@param[in] A The known system matrix
@param[in] b The known right hand side vector
@param[in] x The solution vector computed by CG iteration
@param[in] xexact Generated exact solution
@return Returns with -1 if used with more than one MPI process. Returns with 0 otherwise.
@see GenerateProblem
*/
int WriteProblem(const Geometry& geom, const SparseMatrix& A, const Vector b, const Vector x, const Vector xexact)
{
    if (geom.size != 1)
        return -1; // TODO Only works on one processor. Need better error handler
    const global_int_t nrow = A.totalNumberOfRows;
    FILE *fA = 0, *fx = 0, *fxexact = 0, *fb = 0;
    fA = fopen("A.dat", "w");
    fx = fopen("x.dat", "w");
    fxexact = fopen("xexact.dat", "w");
    fb = fopen("b.dat", "w");
    // If any file failed to open, close those that did open and bail out.
    if (!fA || !fx || !fxexact || !fb)
    {
        if (fb)
            fclose(fb);
        if (fxexact)
            fclose(fxexact);
        if (fx)
            fclose(fx);
        if (fA)
            fclose(fA);
        return -1;
    }
    for (global_int_t i = 0; i < nrow; i++)
    {
        const double* const currentRowValues = A.matrixValues[i];
        const global_int_t* const currentRowIndices = A.mtxIndG[i];
        const int currentNumberOfNonzeros = A.nonzerosInRow[i];
        // Matrix entries are written 1-based (row col value) for MATLAB's spconvert.
        for (int j = 0; j < currentNumberOfNonzeros; j++)
#ifdef HPCG_NO_LONG_LONG
            fprintf(fA, " %d %d %22.16e\n", i + 1, (global_int_t) (currentRowIndices[j] + 1), currentRowValues[j]);
#else
            fprintf(fA, " %lld %lld %22.16e\n", i + 1, (global_int_t) (currentRowIndices[j] + 1), currentRowValues[j]);
#endif
        fprintf(fx, "%22.16e\n", x.values[i]);
        fprintf(fxexact, "%22.16e\n", xexact.values[i]);
        fprintf(fb, "%22.16e\n", b.values[i]);
    }
    fclose(fA);
    fclose(fx);
    fclose(fxexact);
    fclose(fb);
    return 0;
}

22
src/WriteProblem.hpp Normal file
View File

@@ -0,0 +1,22 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef WRITEPROBLEM_HPP
#define WRITEPROBLEM_HPP
#include "Geometry.hpp"
#include "SparseMatrix.hpp"
#include <string>
int WriteProblem(const Geometry& geom, const SparseMatrix& A, const Vector b, const Vector x, const Vector xexact);
#endif // WRITEPROBLEM_HPP

107
src/YAML_Doc.cpp Normal file
View File

@@ -0,0 +1,107 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "YAML_Doc.hpp"
#include <cassert>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
using namespace std;
/*!
Sets the application name and version which will become part of the YAML doc.
@param[in] miniApp_Name application name
@param[in] miniApp_Version application name
@param[in] destination_Directory destination directory for the YAML document
@param[in] destination_FileName file name for the YAML document
*/
// Constructor: records the application identity and the optional output
// destination; initializes members directly instead of default-constructing
// then assigning (declaration order matches YAML_Doc.hpp).
YAML_Doc::YAML_Doc(const std::string& miniApp_Name, const std::string& miniApp_Version,
    const std::string& destination_Directory, const std::string& destination_FileName)
    : miniAppName(miniApp_Name)
    , miniAppVersion(miniApp_Version)
    , destinationDirectory(destination_Directory)
    , destinationFileName(destination_FileName)
{
}
// Destructor: nothing to do here — the YAML_Element base destructor deletes
// all child elements.
YAML_Doc::~YAML_Doc(void) {}
/*!
Generates YAML from the elements of the document and saves it to a file.
@return returns the complete YAML document as a string
*/
string YAML_Doc::generateYAML()
{
    string yaml;
    // Document header: "<app> version: <version>" followed by every element tree.
    yaml = yaml + miniAppName + " version: " + miniAppVersion + "\n";
    for (size_t i = 0; i < children.size(); i++)
    {
        yaml = yaml + children[i]->printYAML("");
    }
    // Build a timestamp used to make the default file name unique per run.
    time_t rawtime;
    tm* ptm;
    time(&rawtime);
    ptm = localtime(&rawtime); // NOTE(review): localtime returns a shared static buffer — confirm single-threaded use
    char sdate[64];
    // use tm_mon+1 because tm_mon is 0 .. 11 instead of 1 .. 12
    sprintf(sdate, "%04d.%02d.%02d.%02d.%02d.%02d", ptm->tm_year + 1900, ptm->tm_mon + 1, ptm->tm_mday, ptm->tm_hour,
        ptm->tm_min, ptm->tm_sec);
    string filename;
    if (destinationFileName == "")
        filename = miniAppName + "-" + miniAppVersion + "_";
    else
        filename = destinationFileName;
    filename = filename + string(sdate) + ".yaml";
    if (destinationDirectory != "" && destinationDirectory != ".")
    {
        // NOTE(review): the directory path is passed to the shell unescaped, and the
        // assert fires when mkdir returns nonzero (e.g. directory already exists) —
        // confirm this is the intended behavior.
        string mkdir_cmd = "mkdir " + destinationDirectory;
        int result = system(mkdir_cmd.c_str());
        assert(result == 0);
        // NOTE(review): this discards the timestamped name (and ".yaml" suffix)
        // built above and uses the bare destinationFileName — verify intended.
        filename = destinationDirectory + "/" + destinationFileName;
    }
    else
        filename = "./" + filename;
    ofstream myfile;
    myfile.open(filename.c_str());
    myfile << yaml;
    myfile.close();
    return yaml;
}

117
src/YAML_Doc.hpp Normal file
View File

@@ -0,0 +1,117 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file YAML_Doc.hpp
HPCG YAML classes
*/
// Changelog
//
// Version 0.1
// - Initial version.
//
/////////////////////////////////////////////////////////////////////////
#ifndef YAML_DOC_HPP
#define YAML_DOC_HPP
#include "YAML_Element.hpp"
#include <string>
//! The YAML_Doc class for the uniform collecting and reporting of performance data for HPCG
/*!
The YAML_Doc class works in conjunction with the YAML_Element class to facilitate easy collecting and reporting of
YAML-formatted data that can be then registered with the HPCG results collection website.
\code
//EXAMPLE CODE FOR GENERATING YAML
YAML_Doc doc("hpcg","0.1");
doc.add("final_residual",1.4523e-13);
doc.add("time","4.893");
//note: the following line will remove the data (4.890) associated with "time"
doc.get("time")->add("total",4.243);
//note: the following line will likewise remove the data (1.243) associated with "time"
doc.get("time")->get("total")->add("time",2.457);
doc.get("time")->get("total")->add("flops",4.88e5);
doc.get("time")->add("ddot",1.243);
doc.get("time")->add("sparsemv","");
doc.get("time")->get("sparsemv")->add("time",0.3445);
doc.get("time")->get("sparsemv")->add("overhead","");
doc.get("time")->get("sparsemv")->get("overhead")->add("time",0.0123);
doc.get("time")->get("sparsemv")->get("overhead")->add("percentage",0.034);
cout << doc.generateYAML() << endl;
return 0;
\endcode
Below is the output generated by the above code:
\verbatim
final_residual: 1.4523e-13
time:
total:
time: 2.457
flops: 4.88e5
ddot: 1.243
sparsemv:
time: 0.3445
overhead:
time: 0.0123
percentage: 0.034
\endverbatim
\note {No value is allowed to be attached to a key that has children. If children are added to a key, the value is
simply set to "".}
*/
class YAML_Doc : public YAML_Element
{
public:
//! Constructor: accepts mini-application name and version as strings, optionally accepts directory and file name
//! for printing results.
/*!
The sole constructor for this class accepts and name and version number for the mini-application as well as
optional directory and file name information for results that are generated by the generateYAML() method. \param
miniApp_Name (in) string containing name of the mini-application \param miniApp_Version (in) string containing the
version of the mini-application \param destination_Directory (in, optional) path of directory where results file
will be stored, relative to current working directory. If this value is not supplied, the results file will be
stored in the current working directory. If the directory does not exist it will be created. \param
destination_FileName (in, optional) root name of the results file. A suffix of ".yaml" will be automatically
appended. If no file name is specified the filename will be constructed by concatenating the miniAppName +
miniAppVersion + ".yaml" strings.
*/
YAML_Doc(const std::string& miniApp_Name, const std::string& miniApp_Version,
const std::string& destination_Directory = "", const std::string& destination_FileName = "");
//! Destructor
~YAML_Doc();
//! Generate YAML results to standard out and to a file using specified directory and filename, using current
//! directory and miniAppName + miniAppVersion + ".yaml" by default
std::string generateYAML();
protected:
std::string miniAppName; //!< the name of the application that generated the YAML output
std::string miniAppVersion; //!< the version of the application that generated the YAML output
std::string destinationDirectory; //!< the destination directory for the generated the YAML output
std::string destinationFileName; //!< the filename for the generated the YAML output
};
#endif // YAML_DOC_HPP

220
src/YAML_Element.cpp Normal file
View File

@@ -0,0 +1,220 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file YAML_Element.cpp
HPCG routine
*/
#include "YAML_Element.hpp"
#include <fstream>
#include <iostream>
#include <sstream>
using namespace std;
// Constructor: stores the key/value pair directly via the member-initializer
// list instead of default-constructing the strings and then assigning.
YAML_Element::YAML_Element(const std::string& key_arg, const std::string& value_arg)
    : key(key_arg)
    , value(value_arg)
{
}
// Destructor: this element owns its children, so release them all.
YAML_Element::~YAML_Element()
{
    for (YAML_Element* child : children)
    {
        delete child;
    }
    children.clear();
}
/*!
Add an element to the vector
QUESTION: if an element is not added because the key already exists,
will this lead to memory leakage?
@param[in] key_arg The key under which the element is stored
@param[in] value_arg The value of the element
@return Returns the added element
*/
YAML_Element* YAML_Element::add(const std::string& key_arg, double value_arg)
{
    // A key with children carries no scalar value of its own.
    this->value = "";
    YAML_Element* child = new YAML_Element(key_arg, convert_double_to_string(value_arg));
    children.push_back(child);
    return child;
}
/*!
Add an element to the vector
@param[in] key_arg The key under which the element is stored
@param[in] value_arg The value of the element
@return Returns the added element
*/
YAML_Element* YAML_Element::add(const std::string& key_arg, int value_arg)
{
    // A key with children carries no scalar value of its own.
    this->value = "";
    YAML_Element* child = new YAML_Element(key_arg, convert_int_to_string(value_arg));
    children.push_back(child);
    return child;
}
#ifndef HPCG_NO_LONG_LONG
/*!
Add an element to the vector
@param[in] key_arg The key under which the element is stored
@param[in] value_arg The value of the element
@return Returns the added element
*/
YAML_Element* YAML_Element::add(const std::string& key_arg, long long value_arg)
{
    // A key with children carries no scalar value of its own.
    this->value = "";
    YAML_Element* child = new YAML_Element(key_arg, convert_long_long_to_string(value_arg));
    children.push_back(child);
    return child;
}
#endif
/*!
Add an element to the vector
@param[in] key_arg The key under which the element is stored
@param[in] value_arg The value of the element
@return Returns the added element
*/
YAML_Element* YAML_Element::add(const std::string& key_arg, size_t value_arg)
{
    // A key with children carries no scalar value of its own.
    this->value = "";
    YAML_Element* child = new YAML_Element(key_arg, convert_size_t_to_string(value_arg));
    children.push_back(child);
    return child;
}
/*!
Add an element to the vector
@param[in] key_arg The key under which the element is stored
@param[in] value_arg The value of the element
@return Returns the added element
*/
YAML_Element* YAML_Element::add(const std::string& key_arg, const std::string& value_arg)
{
    // A key with children carries no scalar value of its own.
    this->value = "";
    YAML_Element* child = new YAML_Element(key_arg, value_arg);
    children.push_back(child);
    return child;
}
/*!
Returns the pointer to the YAML_Element for the given key.
@param[in] key_arg The key under which the element was stored
@return If found, returns the element, otherwise returns NULL
*/
YAML_Element* YAML_Element::get(const std::string& key_arg)
{
    // Linear scan over direct children; first match wins.
    for (YAML_Element* child : children)
    {
        if (child->getKey() == key_arg)
            return child;
    }
    return 0; // not found
}
/*!
Prints a line of a YAML document. Correct YAML depends on
correct spacing; the parameter space should be the proper
amount of space for the parent element
@param[in] space spacing inserted at the beginning of the line
@return Returns a single line of the YAML document without the leading white space
*/
string YAML_Element::printYAML(std::string space)
{
    // This element's own line, then each child indented two extra spaces.
    string result = space + key + ": " + value + "\n";
    const string child_space = space + "  ";
    for (size_t c = 0; c < children.size(); c++)
    {
        result = result + children[c]->printYAML(child_space);
    }
    return result;
}
/*!
  Converts a double precision value to a string using default stream formatting.
  @param[in] value_arg The value to be converted.
  @return The textual representation of value_arg.
*/
string YAML_Element::convert_double_to_string(double value_arg)
{
    std::ostringstream text;
    text << value_arg;
    return text.str();
}
/*!
  Converts an integer value to a string using default stream formatting.
  @param[in] value_arg The value to be converted.
  @return The textual representation of value_arg.
*/
string YAML_Element::convert_int_to_string(int value_arg)
{
    std::ostringstream text;
    text << value_arg;
    return text.str();
}
#ifndef HPCG_NO_LONG_LONG
/*!
  Converts a "long long" integer value to a string using default stream formatting.
  @param[in] value_arg The value to be converted.
  @return The textual representation of value_arg.
*/
string YAML_Element::convert_long_long_to_string(long long value_arg)
{
    std::ostringstream text;
    text << value_arg;
    return text.str();
}
#endif
/*!
  Converts a "size_t" integer value to a string using default stream formatting.
  @param[in] value_arg The value to be converted.
  @return The textual representation of value_arg.
*/
string YAML_Element::convert_size_t_to_string(size_t value_arg)
{
    std::ostringstream text;
    text << value_arg;
    return text.str();
}

87
src/YAML_Element.hpp Normal file
View File

@@ -0,0 +1,87 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file YAML_Element.hpp
HPCG data structures for YAML output
*/
// Changelog
//
// Version 0.1
// - Initial version.
//
/////////////////////////////////////////////////////////////////////////
#ifndef YAML_ELEMENT_HPP
#define YAML_ELEMENT_HPP
#include "Geometry.hpp"
#include <string>
#include <vector>
//! HPCG YAML_Element class, from the HPCG YAML_Element class for registering key-value pairs of performance data
/*!
HPCG generates a collection of performance data for each run of the executable. YAML_Element, and
the related YAML_Doc class, provide a uniform facility for gathering and reporting this data using the YAML text
format.
*/
class YAML_Element
{
public:
    //! Default constructor: creates an element with empty key and value.
    YAML_Element()
    {
        key = "";
        value = "";
    }
    //! Construct with known key-value pair
    YAML_Element(const std::string& key_arg, const std::string& value_arg);
    //! Destructor
    // NOTE(review): children are raw pointers allocated with new in add();
    // presumably the out-of-line destructor releases them - confirm in YAML_Element.cpp
    ~YAML_Element();
    //! Key accessor method
    std::string getKey()
    {
        return key;
    }
    //! Add a child element to an element list associated with this element, value of type double
    YAML_Element* add(const std::string& key_arg, double value_arg);
    //! Add a child element to an element list associated with this element, value of type int
    YAML_Element* add(const std::string& key_arg, int value_arg);
#ifndef HPCG_NO_LONG_LONG
    //! Add a child element to an element list associated with this element, value of type long long
    YAML_Element* add(const std::string& key_arg, long long value_arg);
#endif
    //! Add a child element to an element list associated with this element, value of type size_t
    YAML_Element* add(const std::string& key_arg, size_t value_arg);
    //! Add a child element to an element list associated with this element, value of type string
    YAML_Element* add(const std::string& key_arg, const std::string& value_arg);
    //! Get the element in the list with the given key; returns NULL when the key is absent
    YAML_Element* get(const std::string& key_arg);
    //! Render this element and, recursively, its children as YAML text;
    //! 'space' is the indentation prefix for this element's own line
    std::string printYAML(std::string space);
protected:
    std::string key;                     //!< the key under which the element is stored
    std::string value;                   //!< the value of the stored element
    std::vector<YAML_Element*> children; //!< children elements of this element
private:
    // Helpers converting each supported value type to its textual form
    std::string convert_double_to_string(double value_arg);
    std::string convert_int_to_string(int value_arg);
#ifndef HPCG_NO_LONG_LONG
    std::string convert_long_long_to_string(long long value_arg);
#endif
    std::string convert_size_t_to_string(size_t value_arg);
};
#endif // YAML_ELEMENT_HPP

49
src/finalize.cpp Normal file
View File

@@ -0,0 +1,49 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <fstream>
#include "hpcg.hpp"
extern int use_output_file;
/*!
Closes the I/O stream used for logging information throughout the HPCG run.
@return returns 0 upon success and non-zero otherwise
@see HPCG_Init
*/
int HPCG_Finalize(void)
{
    // Nothing to release when logging went to stdout.
    if (!use_output_file)
    {
        return 0;
    }
    HPCG_fout.close();
    return 0;
}

150
src/hpcg.hpp Normal file
View File

@@ -0,0 +1,150 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file hpcg.hpp
HPCG data structures and functions
*/
/*
History:
*05.28.2023: HPC-Benchmark 23.5 release
*/
#ifndef HPCG_HPP
#define HPCG_HPP
#include "Geometry.hpp"
#include <fstream>
#ifndef USE_CUDA
#if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64) || defined(__amd64__) || defined(__amd64) \
|| defined(_M_X64)
#define USE_CUDA
#endif
#endif
#ifdef USE_CUDA
#include "Cuda.hpp"
#endif
#define XSTR(s) STR(s)
#define STR(s) #s
#define EMPTY_MACRO_ 1
#define CHECK_EMPTY_MACRO_(x) EMPTY_MACRO_##x
#define CHECK_EMPTY_MACRO(x) CHECK_EMPTY_MACRO_(x)
#ifndef make_HPCG_VER_MAJOR
#define HPCG_VER_MAJOR 24
#elif CHECK_EMPTY_MACRO(make_HPCG_VER_MAJOR) == 1
#define HPCG_VER_MAJOR 24
#else
#define HPCG_VER_MAJOR make_HPCG_VER_MAJOR
#endif
#ifndef make_HPCG_VER_MINOR
#define HPCG_VER_MINOR 09
#elif CHECK_EMPTY_MACRO(make_HPCG_VER_MINOR) == 1
#define HPCG_VER_MINOR 09
#else
#define HPCG_VER_MINOR make_HPCG_VER_MINOR
#endif
#ifndef make_HPCG_VER_PATCH
#define HPCG_VER_PATCH 0
#elif CHECK_EMPTY_MACRO(make_HPCG_VER_PATCH) == 1
#define HPCG_VER_PATCH 0
#else
#define HPCG_VER_PATCH make_HPCG_VER_PATCH
#endif
#ifndef make_HPCG_VER_BUILD
#define HPCG_VER_BUILD 0
#elif CHECK_EMPTY_MACRO(make_HPCG_VER_BUILD) == 1
#define HPCG_VER_BUILD 0
#else
#define HPCG_VER_BUILD make_HPCG_VER_BUILD
#endif
#define HPCG_VERSION (HPCG_VER_MAJOR * 1000 + HPCG_VER_MINOR * 100 + HPCG_VER_PATCH)
#define VER_HEADER \
"HPCG-NVIDIA " XSTR(HPCG_VER_MAJOR)"." XSTR(HPCG_VER_MINOR) "." XSTR(HPCG_VER_PATCH) " -- NVIDIA accelerated HPCG benchmark -- NVIDIA\n"
#define HPCG_LINE_MAX 256
extern std::ofstream HPCG_fout;
// Refer to src/init.cpp for possible user-defined values
struct HPCG_Params_STRUCT
{
    int comm_size;   //!< Number of MPI processes in MPI_COMM_WORLD
    int comm_rank;   //!< This process' MPI rank in the range [0 to comm_size - 1]
    int numThreads;  //!< This process' number of threads
    local_int_t nx;  //!< Number of x-direction grid points for each local subdomain (forced to >= 16 in init.cpp)
    local_int_t ny;  //!< Number of y-direction grid points for each local subdomain
    local_int_t nz;  //!< Number of z-direction grid points for each local subdomain
    int runningTime; //!< Number of seconds to run the timed portion of the benchmark
    int npx;         //!< Number of processes in x-direction of 3D process grid (npx*npy*npz must equal comm_size)
    int npy;         //!< Number of processes in y-direction of 3D process grid
    int npz;         //!< Number of processes in z-direction of 3D process grid
    int pz;          //!< Partition in the z processor dimension, default is npz
    local_int_t zl;  //!< nz for processors in the z dimension with value less than pz
    local_int_t zu;  //!< nz for processors in the z dimension with value greater than pz
    bool benchmark_mode;         // !< Skips running reference code
    bool use_l2compression;      // !< Activates GPU L2 Compression
    bool use_hpcg_mem_reduction; // !< Not passed as parameter. Set in main to true. Activates aggressive memory
                                 // reduction optimizations
    rank_type_t rank_type;       // !< Not passed as parameter. GPU or CPU; filled in by InitializeRanks()
    p2p_comm_mode_t p2_mode;     // !< We have 4 methods to do p2p comm in MV and MG, refer to Geometry.hpp
    exec_mode_t exec_mode = GPUONLY; // !< Three modes supported: GPUONLY, CPUONLY, GPUCPU.
    int g2c;                     // !< Related to GPU/CPU local problem definition
    dim_3d_t diff_dim;           // !< Specifies the dim that is different for the CPU and GPU ranks
    local_problem_def_t local_problem_def; // !< Specifies how nx, ny, nz, and g2c are interpreted (4 possibilities)
    bool cpu_allowed_to_print; // !< Not passed as parameter. Specifies the CPU rank (opposite to GPU rank) that is
                               // allowed to print
    bool use_output_file; // !< There is a global variable with the same name defined in src/init.cpp and used
                          // throughout the files; HPCG_Init keeps the two in sync
    local_int_t gpu_slice_size; //!< Slice size for GPU ranks (--gss; defaults to 4096 in init.cpp)
    local_int_t cpu_slice_size; //!< Slice size for CPU ranks (--css; defaults to 8 in init.cpp)
};
/*!
HPCG_Params is a shorthand for HPCG_Params_STRUCT
*/
typedef HPCG_Params_STRUCT HPCG_Params;
extern void InitializeRanks(HPCG_Params& params);
extern int HPCG_Init(int* argc_p, char*** argv_p, HPCG_Params& params);
extern int HPCG_Finalize(void);
#endif // HPCG_HPP

444
src/init.cpp Normal file
View File

@@ -0,0 +1,444 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#ifdef _WIN32
const char* NULLDEVICE = "nul";
#else
const char* NULLDEVICE = "/dev/null";
#endif
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include "hpcg.hpp"
#include "ReadHpcgDat.hpp"
int use_output_file = 0;
std::ofstream HPCG_fout; //!< output file stream for logging activities during HPCG run
#if defined(USE_CUDA) && defined(USE_NCCL)
ncclComm_t Nccl_Comm;
#endif
#ifndef HPCG_NO_MPI
char host_name[MPI_MAX_PROCESSOR_NAME];
char pro_name[MPI_MAX_PROCESSOR_NAME];
MPI_Comm proComm;
int global_rank = 0;
int global_total_ranks = 0;
int program_rank = 0;
int program_total_ranks = 0;
int* physical_rank_dims;
int* logical_rank_to_phys;
int* physical_rank_dims_d;
int* logical_rank_to_phys_d;
#else
char host_name[1000];
char pro_name[1000];
#endif
// Returns 1 when 's' begins with 'prefix' (an empty prefix always matches), 0 otherwise.
static int startswith(const char* s, const char* prefix)
{
    return strncmp(s, prefix, strlen(prefix)) == 0 ? 1 : 0;
}
// qsort-compatible comparator: treats both arguments as C strings and
// orders them with strcmp.
int stringCmp(const void* a, const void* b)
{
    const char* lhs = static_cast<const char*>(a);
    const char* rhs = static_cast<const char*>(b);
    return strcmp(lhs, rhs);
}
/*!
Initializes an HPCG run by obtaining problem parameters (from a file or
command line) and then broadcasts them to all nodes. It also initializes
logging I/O streams that are used throughout the HPCG run. Only MPI rank 0
performs I/O operations.
The function assumes that MPI has already been initialized for MPI runs.
@param[in] argc_p the pointer to the "argc" parameter passed to the main() function
@param[in] argv_p the pointer to the "argv" parameter passed to the main() function
@param[out] params the reference to the data structures that is filled the basic parameters of the run
@return returns 0 upon success and non-zero otherwise
@see HPCG_Finalize
*/
/*!
  Assigns a role (GPU rank or CPU rank) to this process and sets up the
  communicators that role needs.

  Ranks are grouped twice: first by program name (supporting launches that mix
  separate CPU and GPU binaries) into proComm, then by host name into a
  per-node communicator that yields a node-local rank. The node-local rank
  selects the CUDA device for GPU ranks and, in mixed GPU+CPU mode, decides
  which ranks drive GPUs.

  @param[inout] params On entry exec_mode and p2_mode must be set; on exit
                       rank_type (and cpu_allowed_to_print in GPUCPU mode) are filled in.
*/
void InitializeRanks(HPCG_Params& params)
{
    char(*host_names)[MPI_MAX_PROCESSOR_NAME];
    char(*program_names)[MPI_MAX_PROCESSOR_NAME];
    MPI_Comm nodeComm;
    int n, namelen, color, local_procs;
    size_t bytes;
    int deviceCount; // NOTE(review): only assigned under USE_CUDA but read in the
                     // GPUCPU branch below - uninitialized in non-CUDA builds; confirm
                     // GPUCPU mode implies USE_CUDA
    int local_rank = 0;
    // 1) Find global rank / total rank count across all processes (CPU and GPU)
    MPI_Comm_rank(MPI_COMM_WORLD, &global_rank);        // Global rank for CPU and GPU
    MPI_Comm_size(MPI_COMM_WORLD, &global_total_ranks); // Global Number of ranks for CPU and GPU
    physical_rank_dims = new int[3 * global_total_ranks];
    logical_rank_to_phys = new int[global_total_ranks];
    bytes = global_total_ranks * sizeof(char[MPI_MAX_PROCESSOR_NAME]);
    // Color ranks by program name (if more than one binary executed, e.g., one for CPU and one for GPU)
    program_names = (char(*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes);
    // __FILE__ acts as the program identifier; it differs between binaries built
    // from different sources
    strcpy(program_names[global_rank], __FILE__);
    // Every rank broadcasts its program name so all ranks hold the full table
    for (n = 0; n < global_total_ranks; n++)
    {
        MPI_Bcast(&(program_names[n]), MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD);
    }
    qsort(program_names, global_total_ranks, sizeof(char[MPI_MAX_PROCESSOR_NAME]), stringCmp);
    // color = index of this rank's program name among the distinct sorted names
    color = 0;
    for (n = 0; n < global_total_ranks; n++)
    {
        if (n > 0 && strcmp(program_names[n - 1], program_names[n]))
            color++;
        if (strcmp(__FILE__, program_names[n]) == 0)
            break;
    }
    // proComm groups all ranks running the same binary
    MPI_Comm_split(MPI_COMM_WORLD, color, 0, &proComm);
    MPI_Comm_rank(proComm, &program_rank);
    MPI_Comm_size(proComm, &program_total_ranks);
    free(program_names);
    // Repeat the same coloring, now by host name, to build a per-node communicator
    MPI_Get_processor_name(host_name, &namelen); // Host name
    host_names = (char(*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes);
    strcpy(host_names[global_rank], host_name);
    for (n = 0; n < global_total_ranks; n++)
    {
        MPI_Bcast(&(host_names[n]), MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD);
    }
    qsort(host_names, global_total_ranks, sizeof(char[MPI_MAX_PROCESSOR_NAME]), stringCmp);
    color = 0;
    for (n = 0; n < global_total_ranks; n++)
    {
        if (n > 0 && strcmp(host_names[n - 1], host_names[n]))
            color++;
        if (strcmp(host_name, host_names[n]) == 0)
            break;
    }
    MPI_Comm_split(proComm, color, 0, &nodeComm);
    MPI_Comm_rank(nodeComm, &local_rank);  // Rank within this node
    MPI_Comm_size(nodeComm, &local_procs); // Number of ranks on this node
    free(host_names);
#ifdef USE_CUDA
    cudaGetDeviceCount(&deviceCount);
#endif
    // Figure out the rank type, based on execution mode (params.exec_mode)
    if (params.exec_mode == CPUONLY)
    {
        params.rank_type = CPU;
    }
    else if (params.exec_mode == GPUONLY)
    {
        params.rank_type = GPU;
#ifdef USE_CUDA
        cudaGetDeviceCount(&deviceCount);
        // Round-robin node-local ranks over the visible devices
        cudaSetDevice(local_rank % deviceCount);
        // Touch Pinned Memory
        double* t;
        cudaMallocHost((void**) (&(t)), sizeof(double));
        cudaFreeHost(t);
        if (params.p2_mode == NCCL)
        {
#ifdef USE_NCCL
            // Rank 0 creates the NCCL unique id and shares it so every rank
            // joins the same NCCL communicator
            ncclUniqueId id;
            if (global_rank == 0)
                ncclGetUniqueId(&id);
            MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
            ncclCommInitRank(&Nccl_Comm, global_total_ranks, id, global_rank);
#endif // USE_NCCL
        }
#endif // USE_CUDA
    }
    else /*CPUGPU*/
    {
        // Here we assume that a node has the same number of GPU and CPU ranks
        // This design is rigid but it is difficult to assign ranks automatically
        // to GPUs and CPUs otherwise
        params.cpu_allowed_to_print = false; // Enable printing for the first CPU rank only
        int ranks_for_numa = local_procs / deviceCount;
        if (ranks_for_numa == 1)
        {
            if (global_rank == 0)
                printf("Warning: All Ranks will be Assigned to GPUs, check the total number of ranks\n");
        }
        // The first rank of each group of 'ranks_for_numa' node-local ranks drives a GPU
        if (local_rank % ranks_for_numa == 0)
        {
            params.rank_type = GPU;
#ifdef USE_CUDA
            cudaSetDevice(local_rank / ranks_for_numa);
            // Touch Pinned Memory
            double* t;
            cudaMallocHost((void**) (&(t)), sizeof(double));
            cudaFreeHost(t);
#endif
        }
        else
        {
            params.rank_type = CPU;
            // Exactly one CPU rank (second node-local rank on the first node) may print
            if (local_rank == 1 && color == 0)
            {
                params.cpu_allowed_to_print = true;
            }
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
}
/*!
  Parses run parameters (positional args, --key= args, or hpcg.dat), fills
  'params', and opens the optional log output stream. See the file-level
  comment above InitializeRanks for the full contract.
*/
int HPCG_Init(int* argc_p, char*** argv_p, HPCG_Params& params)
{
    int argc = *argc_p;
    char** argv = *argv_p;
    char fname[80];
    int i, j, *iparams;
    // Recognized option prefixes; index k in cparams corresponds to iparams[k]
    char cparams[][9] = {"--nx=", "--ny=", "--nz=", "--rt=", "--npx=", "--npy=", "--npz=", "--b=", "--l2cmp=", "--mr=",
        "--exm=", "--g2c=", "--ddm=", "--lpm=", "--p2p=", "--of=", "--gss=", "--css="};
    time_t rawtime;
    tm* ptm;
    const int nparams = (sizeof cparams) / (sizeof cparams[0]);
    bool broadcastParams = false; // Make true if parameters read from file.
    // Environment variable may pre-select file logging (can be overridden by --of below)
    const char* name = "HPCG_USE_OUTPUT_FILE";
    char* value;
    value = getenv(name);
    if (value != NULL)
    {
        use_output_file = atoi(value);
    }
    iparams = (int*) malloc(sizeof(int) * nparams);
    // Initialize iparams
    for (i = 0; i < nparams; ++i)
        iparams[i] = 0;
    /* for sequential and some MPI implementations it's OK to read first three args */
    // Positional form; values below 11 are rejected as invalid
    for (i = 0; i < nparams; ++i)
        if (argc <= i + 1 || sscanf(argv[i + 1], "%d", iparams + i) != 1 || iparams[i] < 11)
            iparams[i] = 0;
    /* for some MPI environments, command line arguments may get complicated so we need a prefix */
    // argv[argc] is NULL per the C standard, which terminates this scan safely
    for (i = 1; i <= argc && argv[i]; ++i)
        for (j = 0; j < nparams; ++j)
            if (startswith(argv[i], cparams[j]))
                if (sscanf(argv[i] + strlen(cparams[j]), "%d", iparams + j) != 1)
                    iparams[j] = 0;
    // Check if --rt was specified on the command line
    int* rt = iparams + 3; // Assume runtime was not specified and will be read from the hpcg.dat file
    if (iparams[3])
        rt = 0; // If --rt was specified, we already have the runtime, so don't read it from file
    if (!iparams[0] && !iparams[1] && !iparams[2])
    { /* no geometry arguments on the command line */
        char HPCG_DAT_FILE[HPCG_LINE_MAX];
        // argv[1] (if present) is interpreted as a path to the .dat file
        if (argc > 1)
        {
            strcpy(HPCG_DAT_FILE, argv[1]);
        }
        else
        {
            strcpy(HPCG_DAT_FILE, "./hpcg.dat");
        }
        if (ReadHpcgDat(iparams, rt, iparams + 7, HPCG_DAT_FILE) == -1)
        {
            printf("No input data. Possible options:\n");
            fflush(0);
            printf("\t1) Specify path to input file: ./xhpcg <path to *.dat file>\n");
            printf("\t2) Copy hpcg.dat to the run directory\n");
            printf("\t3) Use command line parameters: ./xhpcg --nx <x> --ny <y> --nz <z> --rt <t>\n");
            exit(-1);
        }
        broadcastParams = true;
    }
    // Check for small or unspecified nx, ny, nz values
    // If any dimension is less than 16, make it the max over the other two dimensions, or 16, whichever is largest
    for (i = 0; i < 3; ++i)
    {
        if (iparams[i] < 16)
            for (j = 1; j <= 2; ++j)
                if (iparams[(i + j) % 3] > iparams[i])
                    iparams[i] = iparams[(i + j) % 3];
        if (iparams[i] < 16)
            iparams[i] = 16;
    }
#ifndef HPCG_NO_MPI
    MPI_Comm_rank(MPI_COMM_WORLD, &params.comm_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &params.comm_size);
#else
    params.comm_rank = 0;
    params.comm_size = 1;
#endif
    // Broadcast values of iparams to all MPI processes
#ifndef HPCG_NO_MPI
    if (broadcastParams)
    {
        MPI_Bcast(iparams, nparams, MPI_INT, 0, MPI_COMM_WORLD);
    }
#endif
    params.nx = iparams[0];
    params.ny = iparams[1];
    params.nz = iparams[2];
    params.runningTime = iparams[3];
    params.npx = iparams[4];
    params.npy = iparams[5];
    params.npz = iparams[6];
    params.benchmark_mode = iparams[7] > 0;
    params.use_l2compression = iparams[8] > 0;
    params.use_hpcg_mem_reduction = iparams[9] > 0;
    /* --exm: 1 selects CPUONLY, 2 selects GPUCPU, any other value (incl. the 0 default) selects GPUONLY */
    // NOTE(review): docs elsewhere may describe --exm as 0:CPU / 1:GPU; the code
    // below maps 1 to CPUONLY - confirm against bin/RUNNING-*
    params.exec_mode = iparams[10] == 2 ? GPUCPU : (iparams[10] == 1 ? CPUONLY : GPUONLY);
    params.g2c = iparams[11] == 0 ? 1 : iparams[11]; // 0 means "unspecified": default to 1
    /* --ddm: 0: NONE | 1: X | 2: Y | 3: Z */
    params.diff_dim = iparams[12] == 3 ? Z : (iparams[12] == 2 ? Y : (iparams[12] == 1 ? X : NONE));
    // GPU_RATIO=0/*NX, NY, NZ are local to GPU and g2c is a ratio*/
    // GPU_ABS=1/*NX, NY, NZ are local to GPU and g2c is absolute dimension size*/,
    // GPU_CPU_RATIO=2/*NX, NY, NZ are local to GPU+CPU and g2c is ratio*/,
    // GPU_CPU_ABS=3/*NX, NY, NZ are local to GPU+CPU and g2c is absolute dimension size*/
    if (iparams[13] == 1)
        params.local_problem_def = GPU_ABS;
    else if (iparams[13] == 2)
        params.local_problem_def = GPU_CPU_RATIO;
    else if (iparams[13] == 3)
        params.local_problem_def = GPU_CPU_ABS;
    else
        params.local_problem_def = GPU_RATIO;
    // P2P Communication method
    if (iparams[14] == 1)
        params.p2_mode = MPI_CPU_All2allv;
    else if (iparams[14] == 2)
        params.p2_mode = MPI_CUDA_AWARE;
    else if (iparams[14] == 3)
        params.p2_mode = MPI_GPU_All2allv;
    else if (iparams[14] == 4)
        params.p2_mode = NCCL;
    else
        params.p2_mode = MPI_CPU;
    // --of: mirror the choice into both the params struct and the file-scope global
    if (iparams[15] == 1)
    {
        params.use_output_file = 1;
        use_output_file = 1;
    }
    else
    {
        // NOTE(review): this overrides HPCG_USE_OUTPUT_FILE (read above) whenever
        // --of is absent or not 1 - confirm that precedence is intended
        params.use_output_file = 0;
        use_output_file = 0;
    }
    // --gss
    params.gpu_slice_size = iparams[16] > 0 ? iparams[16] : 4096;
    // --css
    params.cpu_slice_size = iparams[17] > 0 ? iparams[17] : 8;
    if (params.comm_rank == 0)
    {
        printf("%s", VER_HEADER);
    }
#ifdef HPCG_NO_OPENMP
    params.numThreads = 1;
#else
#pragma omp parallel
#pragma omp single
    params.numThreads = omp_get_num_threads();
#endif
    // Build a timestamped log file name, e.g. hpcg20240101T120000.txt
    time(&rawtime);
    ptm = localtime(&rawtime);
    sprintf(fname, "hpcg%04d%02d%02dT%02d%02d%02d.txt", 1900 + ptm->tm_year, ptm->tm_mon + 1, ptm->tm_mday,
        ptm->tm_hour, ptm->tm_min, ptm->tm_sec);
    if (use_output_file)
    {
        if (0 == params.comm_rank)
        {
            HPCG_fout.open(fname);
        }
        else
        {
#if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG)
            // Debug builds: every rank logs into its own rank-suffixed file
            sprintf(fname, "hpcg%04d%02d%02dT%02d%02d%02d_%d.txt", 1900 + ptm->tm_year, ptm->tm_mon + 1, ptm->tm_mday,
                ptm->tm_hour, ptm->tm_min, ptm->tm_sec, params.comm_rank);
            HPCG_fout.open(fname);
#else
            // Non-root ranks discard their log output
            HPCG_fout.open(NULLDEVICE);
#endif
        }
    }
    free(iparams);
    return 0;
}

878
src/main.cpp Normal file
View File

@@ -0,0 +1,878 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file main.cpp
- All enums are in Geometry.hpp
- Supports GPU-only, Grace-only, and GPU-Grace. GPU and Grace are different MPI ranks.
- The dimensions of GPU rank and CPU rank can only differ in one dimension (nx, ny, or nz).
- Parameters are explained in bin/RUNNING-*
*/
// Main routine of a program that calls the HPCG conjugate gradient
// solver to solve the problem, and then prints results.
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>
#ifdef USE_GRACE
#include <nvpl_sparse.h>
#endif
#include "CG.hpp"
#include "CGData.hpp"
#include "CG_ref.hpp"
#include "CheckAspectRatio.hpp"
#include "CheckProblem.hpp"
#include "ComputeMG_ref.hpp"
#include "ComputeResidual.hpp"
#include "ComputeSPMV_ref.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
#include "ExchangeHalo.hpp"
#include "GenerateCoarseProblem.hpp"
#include "GenerateGeometry.hpp"
#include "GenerateProblem.hpp"
#include "Geometry.hpp"
#include "OptimizeProblem.hpp"
#include "ReportResults.hpp"
#include "SetupHalo.hpp"
#include "SparseMatrix.hpp"
#include "TestCG.hpp"
#include "TestNorms.hpp"
#include "TestSymmetry.hpp"
#include "Vector.hpp"
#include "WriteProblem.hpp"
#include "hpcg.hpp"
#include "mytimer.hpp"
#ifdef HPCG_DETAILED_DEBUG
using std::cin;
#endif
using std::endl;
// Prints in a file or terminal
extern int use_output_file;
#ifdef USE_CUDA
cusparseHandle_t cusparsehandle;
cublasHandle_t cublashandle;
cudaStream_t stream;
cudaEvent_t copy_done;
cudaStream_t copy_stream;
int* ranktoId;
#endif
#ifdef USE_GRACE
nvpl_sparse_handle_t nvpl_sparse_handle;
#endif
// The communication mode used to send point-to-point messages
#ifndef HPCG_NO_MPI
p2p_comm_mode_t P2P_Mode;
#endif
// USE CUDA L2 compression
bool Use_Compression;
// USE HPCG aggressive memory reduction
bool Use_Hpcg_Mem_Reduction;
#ifndef HPCG_NO_MPI
// Used to find ranks for CPU and GPU programs
int* rankToId_h;
int* idToRank_h;
extern int* physical_rank_dims;
extern int* logical_rank_to_phys;
#endif
/*!
Main driver program: Construct synthetic problem, run V&V tests, compute benchmark parameters, run benchmark, report
results.
@param[in] argc Standard argument count. Should equal 1 (no arguments passed in) or 4 (nx, ny, nz passed in)
@param[in] argv Standard argument array. If argc==1, argv is unused. If argc==4, argv[1], argv[2], argv[3] will be
interpreted as nx, ny, nz, resp.
@return Returns zero on success and a non-zero value otherwise.
*/
int main(int argc, char* argv[])
{
#ifndef HPCG_NO_MPI
MPI_Init(&argc, &argv);
#endif
// Here I read all the parameters, including the execution mode (CPUONLY, GPUONLY, GPUCPU)
HPCG_Params params;
HPCG_Init(&argc, &argv, params);
bool quickPath = (params.runningTime == 0);
int size = params.comm_size, rank = params.comm_rank; // Number of MPI processes, My process ID
bool benchmark_mode = params.benchmark_mode;
Use_Compression = params.use_l2compression;
Use_Hpcg_Mem_Reduction = true; // params.use_hpcg_mem_reduction;
P2P_Mode = params.p2_mode;
if (rank == 0)
{
printf("Build v0.6.0 \n");
#ifdef HPCG_ENG_VERSION
printf("\n%s%s\n", "========================================", "========================================");
#ifdef HPCG_COMMIT_HASH
printf("Engineering version of HPCG-NVIDIA. Results cannot be shared with third parties\nCommit: %s\n",
XSTR(HPCG_COMMIT_HASH));
#else
printf("Engineering version of HPCG-NVIDIA. Results cannot be shared with third parties\nCommit:\n");
#endif
printf("%s%s\n", "========================================", "========================================");
#endif
printf("\nStart of application (%s) ...\n",
params.exec_mode == GPUONLY ? "GPU-Only"
: params.exec_mode == CPUONLY ? "Grace-Only"
: "GPU+Grace");
if (benchmark_mode)
printf(" | Benchmark Mode !!!! CPU reference code is not performed \n");
if (params.exec_mode == GPUONLY || params.exec_mode == GPUCPU)
if (Use_Compression)
printf(
" | L2 compression is activated !!!! Currently, it is not legal to submit HPCG results with L2 "
"compression\n");
#ifdef INDEX_64
printf(" | Using INT64 Indexing \n");
#endif
}
// Check P2P comm mode
// if (params.exec_mode == CPUONLY || params.exec_mode == GPUCPU)
// {
// #ifndef USE_GRACE
// if (rank == 0)
// printf(
// "Error: HPCG was not compiled for Grace execution. USE --exm=0 for GPU-only execution or add "
// "-DUSE_GRACE. Exiting ...\n");
// #ifndef HPCG_NO_MPI
// MPI_Finalize();
// #endif
// return 0;
// #endif // USE_GRACE
bool invalid = false;
if (P2P_Mode == NCCL)
{
if (rank == 0)
printf("Invalid P2P communication mode (NCCL) for CPUs, Exiting ...\n");
invalid = true;
}
if (P2P_Mode == MPI_GPU_All2allv)
{
if (rank == 0)
printf("Invalid P2P communication mode (MPI GPU All2allv) for CPUs, Exiting ...\n");
invalid = true;
}
if (P2P_Mode == MPI_CUDA_AWARE)
{
if (rank == 0)
printf("Invalid P2P communication mode (CUDA-Aware MPI) for CPUs, Exiting ...\n");
invalid = true;
}
if (invalid)
{
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}
}
#ifndef USE_NCCL
if (params.exec_mode == GPUONLY)
{
if (rank == 0)
printf(
"Error: HPCG was not compiled with NCCL. USE --exm=1 for Grace-only execution or add -DUSE_NCCL. "
"Exiting ...\n");
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}
#endif // USE_NCCL
// Check whether total number of ranks == npx*npy*npz
auto rank_grid_size = params.npx * params.npy * params.npz;
if (rank_grid_size > 0 && size != rank_grid_size)
{
if (rank == 0)
printf("Error: Total Number of ranks != npx*npy*npz. Exiting ...\n");
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}
#ifndef USE_CUDA
if (params.exec_mode != CPUONLY)
{
if (rank == 0)
printf(
"Error: HPCG was not compiled for GPU execution. USE --exm=1 for Grace-only execution or add "
"-DUSE_CUDA. Exiting ...\n");
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}
#endif
// Here, we decide the rank type
// assign a rank to GPU and CPU
InitializeRanks(params);
// Check if QuickPath option is enabled.
// If the running time is set to zero, we minimize all paths through the program
#ifdef HPCG_DETAILED_DEBUG
if (size < 100 && rank == 0)
HPCG_fout << "Process " << rank << " of " << size << " is alive with " << params.numThreads << " threads."
<< endl;
if (rank == 0)
{
char c;
std::cout << "Press key to continue" << std::endl;
std::cin.get(c);
}
#ifndef HPCG_NO_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
#endif
/////////////////////////
// Problem setup Phase //
/////////////////////////
#ifdef HPCG_DEBUG
double t1 = mytimer();
#endif
// Construct the geometry and linear system
Geometry* geom = new Geometry;
GenerateGeometry(params, geom);
int ierr = CheckAspectRatio(0.125, geom->nx, geom->ny, geom->nz, "local problem", rank == 0);
if (ierr)
return ierr;
ierr = CheckAspectRatio(0.125, geom->npx, geom->npy, geom->npz, "process grid", rank == 0);
if (ierr)
return ierr;
// Sync All Ranks
#ifndef HPCG_NO_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
// Test Library versions for cuSPARSE or NVPL Sparse
// The two library versions have to be tested in
// GPU or Grace ranks
int cusparseMajor = 0, cusparseMinor = 0;
if (params.exec_mode == GPUONLY || params.exec_mode == GPUCPU)
{
#ifdef USE_CUDA
// Cusparse Version
cusparseGetProperty(MAJOR_VERSION, &cusparseMajor);
cusparseGetProperty(MINOR_VERSION, &cusparseMinor);
if (cusparseMajor < 12 || (cusparseMajor == 12 && cusparseMinor < 2))
{
if (rank == 0)
printf("cuSPARSE version must be 12.2 or higher (found v%d.%d) \n", cusparseMajor, cusparseMinor);
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}
#endif
}
int nvspMajor = 0, nvspMinor = 0, nvspPatch = 0, nvspVersion = 0;
// if (params.exec_mode == CPUONLY || params.exec_mode == GPUCPU)
// {
// #ifdef USE_GRACE
// // NVPL Sparse Version
// nvpl_sparse_create(&(nvpl_sparse_handle));
// nvpl_sparse_get_version(nvpl_sparse_handle, &nvspVersion);
// nvspMajor = nvspVersion / 1000;
// nvspMinor = (nvspVersion % 1000) / 100;
// nvspPatch = nvspVersion % 100;
// if (nvspMajor < 0 || (nvspMajor == 0 && nvspMinor < 2))
// {
// if (rank == 0)
// printf("NVPL Sparse version must be 0.2 or higher (found v%d.%d) \n", nvspMajor, nvspMinor);
// #ifndef HPCG_NO_MPI
// MPI_Finalize();
// #endif
// return 0;
// }
// #endif // USE_GRACE
// }
SparseMatrix A;
Vector x_overlap, b_computed;
Vector b, x, xexact;
std::vector<double> times(10, 0.0);
CGData data;
InitializeSparseMatrix(A, geom);
size_t cpuRefMemory = 0;
int numberOfMgLevels = 4; // Number of levels including first
SparseMatrix* curLevelMatrix = &A;
if (params.rank_type == GPU)
{
A.rankType = GPU;
A.slice_size = params.gpu_slice_size;
cublasCreate(&(cublashandle));
cusparseCreate(&(cusparsehandle));
cudaStreamCreate(&(stream));
cudaStreamCreate(&(copy_stream));
cusparseSetStream(cusparsehandle, stream);
cublasSetStream(cublashandle, stream);
cusparseSetPointerMode(cusparsehandle, CUSPARSE_POINTER_MODE_HOST);
cublasSetPointerMode(cublashandle, CUBLAS_POINTER_MODE_HOST);
cudaEventCreate(&copy_done);
// Allocate GPU related data
AllocateMemCuda(A);
double setup_time = mytimer();
GenerateProblem(A, &b, &x, &xexact);
SetupHalo(A);
for (int level = 1; level < numberOfMgLevels; ++level)
{
GenerateCoarseProblem(*curLevelMatrix);
curLevelMatrix = curLevelMatrix->Ac; // Make the just-constructed coarse grid the next level
}
setup_time = mytimer() - setup_time; // Capture total time of setup
delete[] physical_rank_dims;
delete[] logical_rank_to_phys;
times[9] = setup_time; // Save it for reporting
// Copy data from Device to Host.
// Note: exclude this from setup_time, as soon as it is needed only for reference calls.
cpuRefMemory = CopyDataToHostCuda(A, &b, &x, &xexact);
// Allocate the GPU data for optimized data structures
AllocateMemOptCuda(A);
}
// else
// {
// #ifdef USE_GRACE
// A.rankType = CPU;
// A.slice_size = params.cpu_slice_size;
// // Use this array for collecting timing information
// double setup_time = mytimer();
// GenerateProblem(A, &b, &x, &xexact);
// SetupHalo(A);
// for (int level = 1; level < numberOfMgLevels; ++level)
// {
// GenerateCoarseProblem(*curLevelMatrix);
// curLevelMatrix = curLevelMatrix->Ac; // Make the just-constructed coarse grid the next level
// }
// // These global buffers only needed for problem setup
// delete[] rankToId_h;
// delete[] idToRank_h;
// delete[] physical_rank_dims;
// delete[] logical_rank_to_phys;
// setup_time = mytimer() - setup_time; // Capture total time of setup
// times[9] = setup_time; // Save it for reporting
// #endif // USE_GRACE
// }
curLevelMatrix = &A;
Vector* curb = &b;
Vector* curx = &x;
Vector* curxexact = &xexact;
for (int level = 0; level < numberOfMgLevels; ++level)
{
// Doesn't work for GPU or GPUCPU cases
// Data would need to be transferred between CPU and GPU, which is not feasible
if (params.exec_mode == CPUONLY)
{
CheckProblem(*curLevelMatrix, curb, curx, curxexact);
//Delete mtxIndG since it is not needed anymore
delete [] curLevelMatrix->mtxIndG[0];
}
curLevelMatrix = curLevelMatrix->Ac; // Make the next coarse grid the next level
curb = 0; // No vectors after the top level
curx = 0;
curxexact = 0;
}
InitializeSparseCGData(A, data);
////////////////////////////////////
// Reference SpMV+MG Timing Phase //
////////////////////////////////////
// Call Reference SpMV and MG. Compute Optimization time as ratio of times in these routines
local_int_t nrow = A.localNumberOfRows;
local_int_t ncol = A.localNumberOfColumns;
InitializeVector(x_overlap, ncol, A.rankType); // Overlapped copy of x vector
InitializeVector(b_computed, nrow, A.rankType); // Computed RHS vector
// Record execution time of reference SpMV and MG kernels for reporting times
// First load vector with random values
FillRandomVector(x_overlap);
int numberOfCalls = 10;
if (quickPath)
numberOfCalls = 1; // QuickPath means we do only one call of each block of repetitive code
if (!benchmark_mode)
{
double t_begin = mytimer();
for (int i = 0; i < numberOfCalls; ++i)
{
ierr = ComputeSPMV_ref(A, x_overlap, b_computed); // b_computed = A*x_overlap
if (ierr)
if (use_output_file)
{
HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl;
}
else
{
std::cout << "Error in call to SpMV: " << ierr << ".\n" << endl;
}
ierr = ComputeMG_ref(A, b_computed, x_overlap); // b_computed = Minv*y_overlap
if (ierr)
if (use_output_file)
{
HPCG_fout << "Error in call to MG: " << ierr << ".\n" << endl;
}
else
{
std::cout << "Error in call to MG: " << ierr << ".\n" << endl;
}
}
times[8] = (mytimer() - t_begin) / ((double) numberOfCalls); // Total time divided by number of calls.
#ifdef HPCG_DEBUG
if (rank == 0)
HPCG_fout << "Total SpMV+MG timing phase execution time in main (sec) = " << mytimer() - t1 << endl;
#endif
}
///////////////////////////////
// Reference CG Timing Phase //
///////////////////////////////
#ifdef HPCG_DEBUG
t1 = mytimer();
#endif
int global_failure = 0; // assume all is well: no failures
int niters = 0;
int totalNiters_ref = 0;
double normr = 1.0;
double normr0 = 1.0;
int refMaxIters = 50;
numberOfCalls = 1; // Only need to run the residual reduction analysis once
// Compute the residual reduction for the natural ordering and reference kernels
std::vector<double> ref_times(9, 0.0);
double tolerance = 0.0; // Set tolerance to zero to make all runs do maxIters iterations
int err_count = 0;
double refTolerance = 0.0055;
if (!benchmark_mode)
{
for (int i = 0; i < numberOfCalls; ++i)
{
ZeroVector(x);
ierr = CG_ref(A, data, b, x, refMaxIters, tolerance, niters, normr, normr0, &ref_times[0], true,
i == 0); // TODO: TRUE
if (ierr)
++err_count; // count the number of errors in CG
totalNiters_ref += niters;
}
if (rank == 0 && err_count)
if (use_output_file)
{
HPCG_fout << err_count << " error(s) in call(s) to reference CG." << endl;
}
else
{
std::cout << err_count << " error(s) in call(s) to reference CG." << endl;
}
refTolerance = normr / normr0;
}
if (params.exec_mode == GPUONLY || params.exec_mode == GPUCPU)
{
#ifdef USE_CUDA
if (cusparseMajor < 12 || (cusparseMajor == 12 && cusparseMinor < 5))
{
// Test for the coarsest-level matrix
if(A.localNumberOfRows/(8 * 8 * 8) < A.slice_size) {
if (rank == 0)
printf("cuSPARSE version must be 12.5 or higher (found v%d.%d) to allow a GPU slice size (%lld) larger than the matrix number of rows (%lld). Use --gss to set GPU slice size \n",
cusparseMajor, cusparseMinor, (long long)A.slice_size, (long long)(A.localNumberOfRows/(8*8*8)));
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}
}
#endif
}
// Call user-tunable set up function.
double t7 = mytimer();
size_t opt_mem = OptimizeProblem(A, data, b, x, xexact);
t7 = mytimer() - t7;
times[7] = t7;
#ifdef HPCG_DEBUG
if (rank == 0)
std::cout << "Total problem optimize in main (sec) = " << t7 << endl;
#endif
if (params.rank_type == GPU)
{
#ifdef USE_CUDA
int dev;
cudaDeviceProp props;
CHECK_CUDART(cudaGetDevice(&dev));
CHECK_CUDART(cudaGetDeviceProperties(&props, dev));
size_t free_bytes, total_bytes;
CHECK_CUDART(cudaMemGetInfo(&free_bytes, &total_bytes));
//Find the number of SMS
int numSMS = props.multiProcessorCount;
if (rank == 0)
printf(
"GPU Rank Info:\n"
" | cuSPARSE version %d.%d\n%s"
" | Reference CPU memory = %.2f MB\n"
" | GPU Name: '%s'\n"
" | Number of SMs: %d\n"
" | GPU Memory Use: %ld MB / %ld MB\n"
" | Process Grid: %dx%dx%d\n"
" | Local Domain: %dx%dx%d\n"
" | Number of CPU Threads: %d\n"
" | Slice Size: %lld\n",
cusparseMajor, cusparseMinor, Use_Compression ? " | L2 compression is activated\n" : "",
cpuRefMemory / 1024.0 / 1024.0, props.name, numSMS, (total_bytes - free_bytes) >> 20, total_bytes >> 20,
A.geom->npx, A.geom->npy, A.geom->npz, (int)A.geom->nx, (int)A.geom->ny, (int)A.geom->nz, params.numThreads, (long long)A.slice_size);
CHECK_CUDART(cudaDeviceSynchronize());
#endif
}
else
{
#ifdef USE_GRACE
cpuRefMemory = EstimateCpuRefMem(A);
if (rank == 0 || (params.exec_mode == GPUCPU && params.cpu_allowed_to_print))
printf(
"CPU Rank Info:\n"
" | NVPL Sparse version %d.%d.%d\n"
" | Reference CPU memory = %.2f MB\n"
" | Optimization Memory Use: %.2f MB\n"
" | Process Grid: %dx%dx%d\n"
" | Local Domain: %dx%dx%d\n"
" | Number of CPU Threads: %d\n"
" | Slice Size: %d\n",
nvspMajor, nvspMinor, nvspPatch, cpuRefMemory / 1024.0 / 1024.0, opt_mem / 1024.0 / 1024.0, A.geom->npx,
A.geom->npy, A.geom->npz, A.geom->nx, A.geom->ny, A.geom->nz, params.numThreads, A.slice_size);
#endif // USE_GRACE
}
#ifdef HPCG_DETAILED_DEBUG
if (geom->size == 1)
WriteProblem(*geom, A, b, x, xexact);
#endif
MPI_Barrier(MPI_COMM_WORLD);
//////////////////////////////
// Validation Testing Phase //
//////////////////////////////
#ifdef HPCG_DEBUG
t1 = mytimer();
#endif
TestCGData testcg_data;
testcg_data.count_pass = testcg_data.count_fail = 0;
TestCG(A, data, b, x, testcg_data);
TestSymmetryData testsymmetry_data;
TestSymmetry(A, b, xexact, testsymmetry_data);
#ifdef HPCG_DEBUG
if (rank == 0)
HPCG_fout << "Total validation (TestCG and TestSymmetry) execution time in main (sec) = " << mytimer() - t1
<< endl;
#endif
//////////////////////////////
// Optimized CG Setup Phase //
//////////////////////////////
// Need to permute the b vector
if (A.rankType == GPU)
{
#ifdef USE_CUDA
PermVectorCuda(A.opt2ref, b, A.localNumberOfRows);
#endif
}
else
{
#ifdef USE_GRACE
PermVectorCpu(A.opt2ref, b, A.localNumberOfRows);
#endif
}
niters = 0;
normr = 0.0;
normr0 = 0.0;
err_count = 0;
int tolerance_failures = 0;
int optMaxIters = 10 * refMaxIters;
int optNiters = refMaxIters;
double opt_worst_time = 0.0;
double opt_best_time = 9999999.0;
std::vector<double> bleh_times(9, 0.0);
ZeroVector(x); // start x at all zeros
ierr = CG(A, data, b, x, optMaxIters, refTolerance, niters, normr, normr0, &bleh_times[0], true, 1);
std::vector<double> opt_times(9, 0.0);
numberOfCalls = 1;
// Compute the residual reduction and residual count for the user ordering and optimized kernels.
for (int i = 0; i < numberOfCalls; ++i)
{
ZeroVector(x); // start x at all zeros
double last_cummulative_time = opt_times[0];
ierr = CG(A, data, b, x, optMaxIters, refTolerance, niters, normr, normr0, &opt_times[0], true, 0); // TODO:
// TRUE
if (ierr)
++err_count; // count the number of errors in CG
if (normr / normr0 > refTolerance)
++tolerance_failures; // the number of failures to reduce residual
// pick the largest number of iterations to guarantee convergence
if (niters > optNiters)
optNiters = niters;
double current_time = opt_times[0] - last_cummulative_time;
if (current_time > opt_worst_time)
opt_worst_time = current_time;
if (current_time < opt_best_time)
opt_best_time = current_time;
}
#ifndef HPCG_NO_MPI
// Get the absolute worst time across all MPI ranks (time in CG can be different)
double local_opt_worst_time = opt_worst_time;
MPI_Allreduce(&local_opt_worst_time, &opt_worst_time, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
#endif
if (rank == 0 && err_count)
if (use_output_file)
{
HPCG_fout << err_count << " error(s) in call(s) to optimized CG." << endl;
}
else
{
std::cout << err_count << " error(s) in call(s) to optimized CG." << endl;
}
if (tolerance_failures)
{
global_failure = 1;
if (rank == 0)
if (use_output_file)
{
HPCG_fout << "Failed to reduce the residual " << tolerance_failures << " times." << endl;
}
else
{
std::cout << "Failed to reduce the residual " << tolerance_failures << " times." << endl;
}
}
///////////////////////////////
// Optimized CG Timing Phase //
///////////////////////////////
// Here we finally run the benchmark phase
// The variable total_runtime is the target benchmark execution time in seconds
double total_runtime = params.runningTime;
int numberOfCgSets = int(total_runtime / opt_worst_time) + 1; // Run at least once, account for rounding
#ifdef HPCG_DEBUG
if (rank == 0)
{
HPCG_fout << "Projected running time: " << total_runtime << " seconds" << endl;
HPCG_fout << "Number of CG sets: " << numberOfCgSets << endl;
}
#endif
/* This is the timed run for a specified amount of time. */
optMaxIters = optNiters;
double optTolerance = 0.0; // Force optMaxIters iterations
TestNormsData testnorms_data;
testnorms_data.samples = numberOfCgSets;
testnorms_data.values = new double[numberOfCgSets];
#ifndef HPCG_NO_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
for (int i = 0; i < numberOfCgSets; ++i)
{
ZeroVector(x); // Zero out x
ierr = CG(A, data, b, x, optMaxIters, optTolerance, niters, normr, normr0, &times[0], true, 0); // TODO: TRUE
if (ierr)
if (use_output_file)
{
HPCG_fout << "Error in call to CG: " << ierr << ".\n" << endl;
}
else
{
std::cout << "Error in call to CG: " << ierr << ".\n" << endl;
}
if (rank == 0)
if (use_output_file)
{
HPCG_fout << "Call [" << i << "] Scaled Residual [" << normr / normr0 << "]" << endl;
}
else
{
std::cout << "Call [" << i << "] Scaled Residual [" << normr / normr0 << "]" << endl;
}
testnorms_data.values[i] = normr / normr0; // Record scaled residual from this run
}
if (params.rank_type == GPU)
{
#ifdef USE_CUDA
PermVectorCuda(A.ref2opt, x, A.localNumberOfRows);
CopyVectorD2H(x);
#endif
}
else
{
#ifdef USE_GRACE
// Reorder vector
Vector xOrdered;
InitializeVector(xOrdered, x.localLength, A.rankType);
CopyVector(x, xOrdered);
CopyAndReorderVector(xOrdered, x, A.ref2opt);
DeleteVector(xOrdered);
#endif
}
// Compute difference between known exact solution and computed solution
// All processors are needed here.
#ifdef HPCG_DEBUG
double residual = 0;
ierr = ComputeResidual(A.localNumberOfRows, x, xexact, residual);
if (ierr)
HPCG_fout << "Error in call to compute_residual: " << ierr << ".\n" << endl;
if (rank == 0)
HPCG_fout << "Difference between computed and exact = " << residual << ".\n" << endl;
#endif
// Test Norm Results
ierr = TestNorms(testnorms_data);
//////////////////
// Report Results //
//////////////////
// Report results to YAML file
ReportResults(A, numberOfMgLevels, numberOfCgSets, refMaxIters, optMaxIters, &times[0], testcg_data,
testsymmetry_data, testnorms_data, global_failure, quickPath);
if (params.rank_type == GPU)
{
#ifdef USE_CUDA
DeleteMatrixGpu(A); // This delete will recursively delete all coarse grid data
#endif
}
else
{
#ifdef USE_GRACE
DeleteMatrixCpu(A); // This delete will recursively delete all coarse grid data
#endif
}
DeleteCGData(data);
DeleteVector(x);
DeleteVector(b);
DeleteVector(xexact);
DeleteVector(x_overlap);
DeleteVector(b_computed);
delete[] testnorms_data.values;
// Clean cuSPARSE data
if (params.rank_type == GPU)
{
#ifdef USE_CUDA
cublasDestroy(cublashandle);
cusparseDestroy(cusparsehandle);
cudaStreamDestroy(stream);
cudaStreamDestroy(copy_stream);
cudaEventDestroy(copy_done);
#endif
}
// We create the handle even on GPU ranks to find the library version
if (params.exec_mode == CPUONLY || params.exec_mode == GPUCPU)
{
#ifdef USE_GRACE
nvpl_sparse_destroy(nvpl_sparse_handle);
#endif
}
HPCG_Finalize();
// Finish up
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}

59
src/mytimer.cpp Normal file
View File

@@ -0,0 +1,59 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/////////////////////////////////////////////////////////////////////////
// Function to return time in seconds.
// Returns elapsed wall-clock time in seconds: MPI_Wtime when MPI is
// enabled, omp_get_wtime under OpenMP, otherwise a gettimeofday-based timer.
/////////////////////////////////////////////////////////////////////////
#ifndef HPCG_NO_MPI
#include <mpi.h>
// MPI build: MPI_Wtime returns elapsed (wall-clock) seconds since an
// arbitrary fixed point in the past; resolution is implementation defined.
double mytimer(void)
{
return MPI_Wtime();
}
#elif !defined(HPCG_NO_OPENMP)
// If this routine is compiled with HPCG_NO_MPI defined and not compiled with HPCG_NO_OPENMP then use the OpenMP timer
#include <omp.h>
// omp_get_wtime also reports elapsed wall-clock seconds.
double mytimer(void)
{
return omp_get_wtime();
}
#else
// Fallback for builds with neither MPI nor OpenMP: a gettimeofday-based
// wall-clock timer measured relative to the first call.
#include <cstdlib>
#include <sys/resource.h>
#include <sys/time.h>
double mytimer(void)
{
struct timeval tp;
// The first call latches its time-of-day in these function-local statics
// and returns 0.0; subsequent calls return seconds elapsed since then.
// NOTE(review): the first-call initialization is not thread-safe —
// confirm timing always starts on a single thread before relying on it.
static long start = 0, startu;
if (!start)
{
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
return 0.0;
}
gettimeofday(&tp, NULL);
return ((double) (tp.tv_sec - start)) + (tp.tv_usec - startu) / 1000000.0;
}
#endif

18
src/mytimer.hpp Normal file
View File

@@ -0,0 +1,18 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef MYTIMER_HPP
#define MYTIMER_HPP
double mytimer(void); //!< Elapsed wall-clock time in seconds; backend (MPI/OpenMP/gettimeofday) is selected at build time in mytimer.cpp
#endif // MYTIMER_HPP