tssc-hpcg/src/TestCG.cpp


//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

/*
 * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*!
 @file TestCG.cpp

 HPCG routine
 */

// Changelog
//
// Version 0.4
// - Added timing of setup time for sparse MV
// - Corrected percentages reported for sparse MV with overhead
//
/////////////////////////////////////////////////////////////////////////

#include <fstream>
#include <iostream>
using std::endl;
#include "hpcg.hpp"
#include <vector>

#include "CG.hpp"
#include "CG_ref.hpp"
#include "TestCG.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"

extern int use_output_file;

/*!
  Test the correctness of the Preconditined CG implementation by using a system matrix with a dominant diagonal.

  @param[in]    geom The description of the problem's geometry.
  @param[in]    A    The known system matrix
  @param[in]    data the data structure with all necessary CG vectors preallocated
  @param[in]    b    The known right hand side vector
  @param[inout] x    On entry: the initial guess; on exit: the new approximate solution
  @param[out]   testcg_data the data structure with the results of the test including pass/fail information

  @return Returns zero on success and a non-zero value otherwise.

  @see CG()
 */

int TestCG(SparseMatrix& A, CGData& data, Vector& b, Vector& x, TestCGData& testcg_data)
{
    // Use this array for collecting timing information
    std::vector<double> times(8, 0.0);
    // Temporary storage for holding original diagonal and RHS
    Vector origDiagA, exaggeratedDiagA, origB;
    InitializeVector(origDiagA, A.localNumberOfRows, A.rankType);
    InitializeVector(exaggeratedDiagA, A.localNumberOfRows, A.rankType);
    InitializeVector(origB, A.localNumberOfRows, A.rankType);
    CopyMatrixDiagonal(A, origDiagA);
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        CopyMatrixDiagonalCuda(A, origDiagA);
#endif
    }
    CopyVector(origDiagA, exaggeratedDiagA);
    CopyVector(b, origB);

    // Modify the matrix diagonal to greatly exaggerate diagonal values.
    // CG should converge in about 10 iterations for this problem, regardless of problem size
    for (local_int_t i = 0; i < A.localNumberOfRows; ++i)
    {
        global_int_t globalRowID = A.localToGlobalMap[i];
        if (globalRowID < 9)
        {
            double scale = (globalRowID + 2) * 1.0e6;
            ScaleVectorValue(exaggeratedDiagA, i, scale);
            ScaleVectorValue(b, i, scale);
        }
        else
        {
            ScaleVectorValue(exaggeratedDiagA, i, 1.0e6);
            ScaleVectorValue(b, i, 1.0e6);
        }
    }

    // Reference Matrix
    ReplaceMatrixDiagonal(A, exaggeratedDiagA);

    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        CopyVectorH2D(exaggeratedDiagA);
        PermVectorCuda(A.opt2ref, b, A.localNumberOfRows);
        PermVectorCuda(A.opt2ref, exaggeratedDiagA, A.localNumberOfRows);
        ReplaceMatrixDiagonalCuda(A, exaggeratedDiagA);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrL, exaggeratedDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrU, exaggeratedDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    else
    {
#ifdef USE_GRACE
        PermVectorCpu(A.opt2ref, b, A.localNumberOfRows);
        PermVectorCpu(A.opt2ref, exaggeratedDiagA, A.localNumberOfRows);
        ReplaceMatrixDiagonalCpu(A, exaggeratedDiagA);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrL, exaggeratedDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrU, exaggeratedDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }

    ////////////////////////////////

    int niters = 0;
    double normr = 0.0;
    double normr0 = 0.0;
    int maxIters = 50;
    int numberOfCgCalls = 2;
    double tolerance = 1.0e-12; // Set tolerance to reasonable value for grossly scaled diagonal terms
    testcg_data.expected_niters_no_prec
        = 12; // For the unpreconditioned CG call, we should take about 10 iterations, permit 12
    testcg_data.expected_niters_prec = 2; // For the preconditioned case, we should take about 1 iteration, permit 2
    testcg_data.niters_max_no_prec = 0;
    testcg_data.niters_max_prec = 0;
    for (int k = 0; k < 2; ++k)
    { // This loop tests both unpreconditioned and preconditioned runs
        int expected_niters = testcg_data.expected_niters_no_prec;
        if (k == 1)
            expected_niters = testcg_data.expected_niters_prec;
        for (int i = 0; i < numberOfCgCalls; ++i)
        {
            ZeroVector(x); // Zero out x
            int ierr = CG(A, data, b, x, maxIters, tolerance, niters, normr, normr0, &times[0], k == 1, 0);
            if (ierr)
                if (use_output_file)
                {
                    HPCG_fout << "Error in call to CG: " << ierr << ".\n" << endl;
                }
                else
                {
                    std::cout << "Error in call to CG: " << ierr << ".\n" << endl;
                }
            if (niters <= expected_niters)
            {
                ++testcg_data.count_pass;
            }
            else
            {
                ++testcg_data.count_fail;
            }
            if (k == 0 && niters > testcg_data.niters_max_no_prec)
                testcg_data.niters_max_no_prec = niters; // Keep track of largest iter count
            if (k == 1 && niters > testcg_data.niters_max_prec)
                testcg_data.niters_max_prec = niters; // Same for preconditioned run
            if (A.geom->rank == 0)
            {
                if (use_output_file)
                {
                    HPCG_fout << "Call [" << i << "] Number of Iterations [" << niters << "] Scaled Residual ["
                              << normr / normr0 << "]" << endl;
                }
                else
                {
                    std::cout << "Call [" << i << "] Number of Iterations [" << niters << "] Scaled Residual ["
                              << normr / normr0 << "]" << endl;
                }
                if (niters > expected_niters)
                    if (use_output_file)
                    {
                        HPCG_fout << " Expected " << expected_niters << " iterations.  Performed " << niters << "."
                                  << endl;
                    }
                    else
                    {
                        std::cout << " Expected " << expected_niters << " iterations.  Performed " << niters << "."
                                  << endl;
                    }
            }
        }
    }

    // Restore matrix diagonal and RHS
    ReplaceMatrixDiagonal(A, origDiagA);

    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        ReplaceMatrixDiagonalCuda(A, origDiagA);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrL, origDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrU, origDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    else
    {
#ifdef USE_GRACE
        ReplaceMatrixDiagonalCpu(A, origDiagA);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrL, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrU, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }

    CopyVector(origB, b);
    // Delete vectors
    DeleteVector(origDiagA);
    DeleteVector(exaggeratedDiagA);
    DeleteVector(origB);
    testcg_data.normr = normr;

    return 0;
}