first commit
This commit is contained in:
309
src/ComputeSYMGS.cpp
Normal file
309
src/ComputeSYMGS.cpp
Normal file
@@ -0,0 +1,309 @@
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file ComputeSYMGS.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
#ifdef USE_CUDA
|
||||
#include "Cuda.hpp"
|
||||
#endif
|
||||
#ifndef HPCG_NO_MPI
|
||||
#include "ExchangeHalo.hpp"
|
||||
#endif
|
||||
#include "ComputeSPMV.hpp"
|
||||
#include "ComputeSYMGS.hpp"
|
||||
#include "CpuKernels.hpp"
|
||||
#include "CudaKernels.hpp"
|
||||
|
||||
/*!
|
||||
Routine to compute one step of symmetric Gauss-Seidel:
|
||||
|
||||
Assumption about the structure of matrix A:
|
||||
- Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i]
|
||||
- Entries in row 'i' are ordered such that:
|
||||
- lower triangular terms are stored before the diagonal element.
|
||||
- upper triangular terms are stored after the diagonal element.
|
||||
- No other assumptions are made about entry ordering.
|
||||
|
||||
Symmetric Gauss-Seidel notes:
|
||||
- We use the input vector x as the RHS and start with an initial guess for y of all zeros.
|
||||
- We perform one forward sweep. Since y is initially zero we can ignore the upper triangular terms of A.
|
||||
- We then perform one back sweep.
|
||||
- For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after
|
||||
|
||||
@param[in] A the known system matrix
|
||||
@param[in] r the input vector
|
||||
@param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep
|
||||
with r as the RHS.
|
||||
|
||||
@return returns 0 upon success and non-zero otherwise
|
||||
|
||||
@warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out
|
||||
of sync with other kernels.
|
||||
|
||||
@see ComputeSYMGS_ref
|
||||
*/
|
||||
|
||||
#ifdef USE_CUDA
/*!
  GPU (cuSPARSE) implementation of one symmetric Gauss-Seidel step.

  The sweep is expressed as two sparse triangular solves (SpSV) around a
  diagonal scaling, using the single matrix descriptor A.cusparseOpt.matA whose
  fill-mode attribute is toggled between LOWER and UPPER before each solve.

  @param[in]    A    the known system matrix (device-resident data)
  @param[in]    r    the input (RHS) vector
  @param[inout] x    on exit, the result of one symmetric GS sweep with r as RHS
  @param[in]    step sweep variant: step == 1 assumes a zero initial guess
                     (pre-smoothing), step == 0 uses the current x (post-smoothing)

  @return returns 0 upon success and non-zero otherwise
*/
int ComputeSYMGS_Gpu(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
{
    // Scratch vector: reuse the coarse-grid workspace Axf when it exists for
    // the pre-smoothing step (its contents feed the later SPMV), otherwise
    // fall back to the matrix-owned temporary buffer.
    double* tmp_d;
    if (step == 1 && A.mgData != 0)
    {
        tmp_d = (*A.mgData->Axf).values_d;
    }
    else
    {
        tmp_d = A.tempBuffer;
    }
    const local_int_t nrow = A.localNumberOfRows;
    double alpha = 1.0;
    cusparseFillMode_t fillmode_l = CUSPARSE_FILL_MODE_LOWER;
    cusparseFillMode_t fillmode_u = CUSPARSE_FILL_MODE_UPPER;

    if (step == 1)
    {
        // TRSV(D+L, r, t): forward sweep with zero initial guess.
        cusparseDnVecSetValues(A.cusparseOpt.vecX, r.values_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, tmp_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrL);

        // SPMV(D, t, t): scale by the diagonal in place.
        SpmvDiagCuda(nrow, tmp_d, A.diagonal);

        // TRSV(D+U, t, x): backward sweep.
        cusparseDnVecSetValues(A.cusparseOpt.vecX, tmp_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrU);

        if (A.mgData != 0)
        {
#ifndef HPCG_NO_MPI
            cudaStreamSynchronize(stream);
            PackSendBufferCuda(A, x, false, copy_stream);
#endif

            // SPMV(L, x, t): t = t + L * x (both alpha and beta are 1.0,
            // reusing the outer alpha — the original shadowed it here).
            cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
            cusparseDnVecSetValues(A.cusparseOpt.vecY, (*A.mgData->Axf).values_d);
            cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matL,
                A.cusparseOpt.vecX, &alpha, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);

#ifndef HPCG_NO_MPI
            if (A.totalToBeSent > 0)
            {
                ExchangeHaloCuda(A, x, copy_stream);
                // Accumulate the halo (external) contribution into Axf.
                double one = 1.0;
                ExtSpMVCuda((SparseMatrix&) A, one, x.values_d + A.localNumberOfRows, (*A.mgData->Axf).values_d);
            }
#endif
        }
    }
    else
    { // step == 0
#ifndef HPCG_NO_MPI
        cudaStreamSynchronize(stream);
        PackSendBufferCuda(A, x, false, copy_stream);
#endif

        // SPMV(U, x, t): t = U * x (beta = 0 overwrites the output vector).
        double beta = 0.0;
        cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, (*A.mgData->Axf).values_d);
        cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matU, A.cusparseOpt.vecX,
            &beta, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);

        // tmp = rv - t
        AxpbyCuda(nrow, r.values_d, (*A.mgData->Axf).values_d, tmp_d);

#ifndef HPCG_NO_MPI
        if (A.totalToBeSent > 0)
        {
            // MPI_Ibarrier --> will help improve MPI_Allreduce in dot product
            ExchangeHaloCuda(A, x, copy_stream, A.level == 0 ? 1 /*call MPI_Ibarrier*/ : 0);
            double mone = -1.0;
            ExtSpMVCuda((SparseMatrix&) A, mone, x.values_d + A.localNumberOfRows, tmp_d);
        }
#endif

        // TRSV(D+L, r-t, x): forward sweep.
        cusparseDnVecSetValues(A.cusparseOpt.vecX, tmp_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrL);

        // SPMV(D, x, t): t += D*x
        SpFmaCuda(nrow, x.values_d, A.diagonal, (*A.mgData->Axf).values_d);

        // TRSV(D+U, x, x): backward sweep.
        cusparseDnVecSetValues(A.cusparseOpt.vecX, (*A.mgData->Axf).values_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrU);
    }
    return 0;
}
#endif
|
||||
|
||||
#ifdef USE_GRACE
/*!
  CPU (NVPL Sparse) implementation of one symmetric Gauss-Seidel step.

  Mirrors the GPU path: two triangular solves (spsv) around a diagonal
  scaling, with the fill-mode attribute of the matL/matU descriptors set
  before each solve.

  @param[in]    A    the known system matrix
  @param[in]    r    the input (RHS) vector
  @param[inout] x    on exit, the result of one symmetric GS sweep with r as RHS
  @param[in]    step sweep variant: step == 1 assumes a zero initial guess
                     (pre-smoothing), step == 0 uses the current x (post-smoothing)

  @return returns 0 upon success and non-zero otherwise
*/
int ComputeSYMGS_Cpu(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
{
    const local_int_t nRows = A.localNumberOfRows;

    // Scratch vector: reuse the coarse-grid workspace Axf for the
    // pre-smoothing step when it exists, else the matrix-owned buffer.
    double* scratch = (step == 1 && A.mgData != 0) ? (*A.mgData->Axf).values : A.tempBuffer;

    double* xVals = x.values;
    double* rVals = r.values;
    double scalarOne = 1.0;
    double scalarZero = 0.0;
    nvpl_sparse_fill_mode_t lowerFill = NVPL_SPARSE_FILL_MODE_LOWER;
    nvpl_sparse_fill_mode_t upperFill = NVPL_SPARSE_FILL_MODE_UPPER;

    if (step == 1)
    {
        // TRSV(L, r, x): forward sweep with zero initial guess.
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, r.values);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xVals);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(lowerFill), sizeof(lowerFill));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &scalarOne,
            A.nvplSparseOpt.matL, A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
            NVPL_SPARSE_SPSV_ALG_DEFAULT, A.nvplSparseOpt.spsvDescrL);

        // SPMV(D, x, t): t = D*x
        SpmvDiagCpu(nRows, A.diagonal, xVals, scratch);

        // TRSV(U, x, x): backward sweep.
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, scratch);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xVals);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(upperFill), sizeof(upperFill));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &scalarOne,
            A.nvplSparseOpt.matU, A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
            NVPL_SPARSE_SPSV_ALG_DEFAULT, A.nvplSparseOpt.spsvDescrU);

        if (A.mgData != 0)
        {
            // SPMV(L, x, t): t += L*x  (beta = 1 accumulates into scratch)
            nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, xVals);
            nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, scratch);
            nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &scalarOne,
                A.nvplSparseOpt.matL, A.nvplSparseOpt.vecX, &scalarOne, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY,
                NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvLDescr);

#ifndef HPCG_NO_MPI
            ExchangeHaloCpu(A, x);
            if (A.totalToBeSent > 0)
            {
                // Add the halo (external) contribution.
                ExtSpMVCpu(A, nRows, 1.0, xVals, scratch);
            }
#endif
        }
    }
    else if (step == 0)
    {
        // SPMV(U, x, t): t = U*x  (beta = 0 overwrites the output)
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, xVals);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, (*A.mgData->Axf).values);

        nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &scalarOne,
            A.nvplSparseOpt.matU, A.nvplSparseOpt.vecX, &scalarZero, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY,
            NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvUDescr);

        // axpy: t = r-t
        AxpbyCpu(nRows, rVals, (*A.mgData->Axf).values, scratch);

#ifndef HPCG_NO_MPI
        // MPI_Ibarrier --> will help improve MPI_Allreduce in dot product
        ExchangeHaloCpu(A, x, A.level == 0 ? 1 /*call MPI_Ibarrier*/ : 0);
        if (A.totalToBeSent > 0)
        {
            // Subtract the halo (external) contribution.
            ExtSpMVCpu(A, nRows, -1.0, xVals, scratch);
        }
#endif

        // TRSV(L, r-t, x): forward sweep.
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, scratch);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xVals);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(lowerFill), sizeof(lowerFill));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &scalarOne,
            A.nvplSparseOpt.matL, A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
            NVPL_SPARSE_SPSV_ALG_DEFAULT, A.nvplSparseOpt.spsvDescrL);

        // SPMV(D, x, t): t += D*x
        SpFmaCpu(nRows, A.diagonal, xVals, (*A.mgData->Axf).values);

        // TRSV(U, x, x): backward sweep.
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, (*A.mgData->Axf).values);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xVals);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(upperFill), sizeof(upperFill));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &scalarOne,
            A.nvplSparseOpt.matU, A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
            NVPL_SPARSE_SPSV_ALG_DEFAULT, A.nvplSparseOpt.spsvDescrU);
    }

    return 0;
}
#endif // USE_GRACE
|
||||
|
||||
int ComputeSYMGS(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
|
||||
{
|
||||
if (A.rankType == GPU)
|
||||
{
|
||||
#ifdef USE_CUDA
|
||||
ComputeSYMGS_Gpu(A, r, x, step);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef USE_GRACE
|
||||
ComputeSYMGS_Cpu(A, r, x, step);
|
||||
#endif
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user