first commit — adds src/TestCG.cpp (new file, 243 lines)
|
||||
|
||||
//@HEADER
|
||||
// ***************************************************
|
||||
//
|
||||
// HPCG: High Performance Conjugate Gradient Benchmark
|
||||
//
|
||||
// Contact:
|
||||
// Michael A. Heroux ( maherou@sandia.gov)
|
||||
// Jack Dongarra (dongarra@eecs.utk.edu)
|
||||
// Piotr Luszczek (luszczek@eecs.utk.edu)
|
||||
//
|
||||
// ***************************************************
|
||||
//@HEADER
|
||||
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*!
|
||||
@file TestCG.cpp
|
||||
|
||||
HPCG routine
|
||||
*/
|
||||
|
||||
// Changelog
|
||||
//
|
||||
// Version 0.4
|
||||
// - Added timing of setup time for sparse MV
|
||||
// - Corrected percentages reported for sparse MV with overhead
|
||||
//
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
using std::endl;
|
||||
#include "hpcg.hpp"
|
||||
#include <vector>
|
||||
|
||||
#include "CG.hpp"
|
||||
#include "CG_ref.hpp"
|
||||
#include "TestCG.hpp"
|
||||
#include "CpuKernels.hpp"
|
||||
#include "CudaKernels.hpp"
|
||||
|
||||
extern int use_output_file;
|
||||
|
||||
/*!
  Test the correctness of the preconditioned CG implementation by using a
  system matrix with a dominant diagonal.

  @param[in]    A            The known system matrix
  @param[in]    data         The data structure with all necessary CG vectors preallocated
  @param[in]    b            The known right hand side vector
  @param[inout] x            On entry: the initial guess; on exit: the new approximate solution
  @param[out]   testcg_data  The data structure with the results of the test including pass/fail information

  @return Returns zero on success and a non-zero value otherwise.

  @see CG()
*/
|
||||
|
||||
int TestCG(SparseMatrix& A, CGData& data, Vector& b, Vector& x, TestCGData& testcg_data)
{
    // Use this array for collecting timing information
    std::vector<double> times(8, 0.0);

    // Temporary storage for holding original diagonal and RHS so they can be
    // restored after the exaggerated-diagonal test runs.
    Vector origDiagA, exaggeratedDiagA, origB;
    InitializeVector(origDiagA, A.localNumberOfRows, A.rankType);
    InitializeVector(exaggeratedDiagA, A.localNumberOfRows, A.rankType);
    InitializeVector(origB, A.localNumberOfRows, A.rankType);
    CopyMatrixDiagonal(A, origDiagA);
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        // Capture the device-side copy of the diagonal as well.
        CopyMatrixDiagonalCuda(A, origDiagA);
#endif
    }
    CopyVector(origDiagA, exaggeratedDiagA);
    CopyVector(b, origB);

    // Modify the matrix diagonal to greatly exaggerate diagonal values.
    // CG should converge in about 10 iterations for this problem, regardless of problem size
    for (local_int_t i = 0; i < A.localNumberOfRows; ++i)
    {
        global_int_t globalRowID = A.localToGlobalMap[i];
        if (globalRowID < 9)
        {
            double scale = (globalRowID + 2) * 1.0e6;
            ScaleVectorValue(exaggeratedDiagA, i, scale);
            ScaleVectorValue(b, i, scale);
        }
        else
        {
            ScaleVectorValue(exaggeratedDiagA, i, 1.0e6);
            ScaleVectorValue(b, i, 1.0e6);
        }
    }

    // Reference Matrix
    ReplaceMatrixDiagonal(A, exaggeratedDiagA);

    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        // The optimized GPU data layout is permuted relative to the reference
        // layout (A.opt2ref), so permute b and the new diagonal to match
        // before pushing them into the matrix and the SpSV descriptors.
        CopyVectorH2D(exaggeratedDiagA);
        PermVectorCuda(A.opt2ref, b, A.localNumberOfRows);
        PermVectorCuda(A.opt2ref, exaggeratedDiagA, A.localNumberOfRows);
        ReplaceMatrixDiagonalCuda(A, exaggeratedDiagA);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrL, exaggeratedDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrU, exaggeratedDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    else
    {
#ifdef USE_GRACE
        // CPU (Grace) path: same permutation + diagonal update through NVPL Sparse.
        PermVectorCpu(A.opt2ref, b, A.localNumberOfRows);
        PermVectorCpu(A.opt2ref, exaggeratedDiagA, A.localNumberOfRows);
        ReplaceMatrixDiagonalCpu(A, exaggeratedDiagA);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrL, exaggeratedDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrU, exaggeratedDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }

    ////////////////////////////////

    int niters = 0;
    double normr = 0.0;
    double normr0 = 0.0;
    int maxIters = 50;
    int numberOfCgCalls = 2;
    double tolerance = 1.0e-12; // Set tolerance to reasonable value for grossly scaled diagonal terms
    testcg_data.expected_niters_no_prec
        = 12; // For the unpreconditioned CG call, we should take about 10 iterations, permit 12
    testcg_data.expected_niters_prec = 2; // For the preconditioned case, we should take about 1 iteration, permit 2
    testcg_data.niters_max_no_prec = 0;
    testcg_data.niters_max_prec = 0;
    for (int k = 0; k < 2; ++k)
    { // This loop tests both unpreconditioned (k==0) and preconditioned (k==1) runs
        int expected_niters = testcg_data.expected_niters_no_prec;
        if (k == 1)
            expected_niters = testcg_data.expected_niters_prec;
        for (int i = 0; i < numberOfCgCalls; ++i)
        {
            ZeroVector(x); // Zero out x
            // Fixed: the scraped source had the HTML-entity residue "&times;[0]"
            // here; CG takes a pointer to the timing array, i.e. &times[0].
            int ierr = CG(A, data, b, x, maxIters, tolerance, niters, normr, normr0, &times[0], k == 1, 0);
            if (ierr)
            { // braces added: previously relied on dangling-else binding the else to the inner if
                if (use_output_file)
                {
                    HPCG_fout << "Error in call to CG: " << ierr << ".\n" << endl;
                }
                else
                {
                    std::cout << "Error in call to CG: " << ierr << ".\n" << endl;
                }
            }
            if (niters <= expected_niters)
            {
                ++testcg_data.count_pass;
            }
            else
            {
                ++testcg_data.count_fail;
            }
            if (k == 0 && niters > testcg_data.niters_max_no_prec)
                testcg_data.niters_max_no_prec = niters; // Keep track of largest iter count
            if (k == 1 && niters > testcg_data.niters_max_prec)
                testcg_data.niters_max_prec = niters; // Same for preconditioned run
            if (A.geom->rank == 0)
            {
                if (use_output_file)
                {
                    HPCG_fout << "Call [" << i << "] Number of Iterations [" << niters << "] Scaled Residual ["
                              << normr / normr0 << "]" << endl;
                }
                else
                {
                    std::cout << "Call [" << i << "] Number of Iterations [" << niters << "] Scaled Residual ["
                              << normr / normr0 << "]" << endl;
                }
                if (niters > expected_niters)
                { // braces added: previously relied on dangling-else binding the else to the inner if
                    if (use_output_file)
                    {
                        HPCG_fout << " Expected " << expected_niters << " iterations.  Performed " << niters << "."
                                  << endl;
                    }
                    else
                    {
                        std::cout << " Expected " << expected_niters << " iterations.  Performed " << niters << "."
                                  << endl;
                    }
                }
            }
        }
    }

    // Restore matrix diagonal and RHS
    ReplaceMatrixDiagonal(A, origDiagA);

    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        ReplaceMatrixDiagonalCuda(A, origDiagA);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrL, origDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrU, origDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    else
    {
#ifdef USE_GRACE
        ReplaceMatrixDiagonalCpu(A, origDiagA);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrL, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrU, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }

    CopyVector(origB, b);
    // Delete vectors
    DeleteVector(origDiagA);
    DeleteVector(exaggeratedDiagA);
    DeleteVector(origB);
    testcg_data.normr = normr;

    return 0;
}
|
||||
Reference in New Issue
Block a user