first commit

This commit is contained in:
2026-01-18 20:37:50 +08:00
commit fff9f18287
123 changed files with 1385491 additions and 0 deletions

241
src/CG.cpp Normal file
View File

@@ -0,0 +1,241 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file CG.cpp
HPCG routine
*/
#include <fstream>
#include <cmath>
#include "hpcg.hpp"
#include "CG.hpp"
#include "ComputeDotProduct.hpp"
#include "ComputeMG.hpp"
#include "ComputeSPMV.hpp"
#include "ComputeWAXPBY.hpp"
#include "mytimer.hpp"
#include <iostream>
#include "CpuKernels.hpp"
#include <mpi.h>
extern int use_output_file;
#define TICKD() t0 = mytimer() //!< record current time in 't0'
#define TOCKD(t) t += mytimer() - t0 //!< store time difference in 't' using time in 't0'
/*!
Routine to compute an approximate solution to Ax = b
@param[in] geom The description of the problem's geometry.
@param[inout] A The known system matrix
@param[inout] data The data structure with all necessary CG vectors preallocated
@param[in] b The known right hand side vector
@param[inout] x On entry: the initial guess; on exit: the new approximate solution
@param[in] max_iter The maximum number of iterations to perform, even if tolerance is not met.
@param[in] tolerance The stopping criterion to assert convergence: if norm of residual is <= to tolerance.
@param[out] niters The number of iterations actually performed.
@param[out] normr The 2-norm of the residual vector after the last iteration.
@param[out] normr0 The 2-norm of the residual vector before the first iteration.
@param[out] times The 7-element vector of the timing information accumulated during all of the iterations.
@param[in] doPreconditioning The flag to indicate whether the preconditioner should be invoked at each iteration.
@return Returns zero on success and a non-zero value otherwise.
@see CG_ref()
*/
int CG(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
    int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag)
{
    double t_begin = mytimer(); // Start timing right away
    normr = 0.0;
    double rtz = 0.0, oldrtz = 0.0, alpha = 0.0, beta = 0.0, pAp = 0.0;
    double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0;
    // #ifndef HPCG_NO_MPI
    //     double t6 = 0.0;
    // #endif
    local_int_t nrow = A.localNumberOfRows;
    Vector& r = data.r;   // Residual vector
    Vector& z = data.z;   // Preconditioned residual vector
    Vector& p = data.p;   // Direction vector (in MPI mode ncol>=nrow)
    Vector& Ap = data.Ap; // Work vector holding A*p each iteration

    // Braces added around the outer 'if' bodies below: the original code relied
    // on dangling-else binding (the 'else' attaches to the nearest unmatched
    // 'if'), which is correct but ambiguous to readers and is flagged by
    // -Wdangling-else. Behavior is unchanged.
    if (!doPreconditioning && A.geom->rank == 0)
    {
        if (use_output_file)
        {
            HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }
        else
        {
            std::cout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }
    }

    // Residual print frequency; the clamps are no-ops for the fixed value 1 but
    // are kept so the constant can be tuned without reintroducing them.
    int print_freq = 1;
    if (print_freq > 50)
        print_freq = 50;
    if (print_freq < 1)
        print_freq = 1;

    // p is of length ncols, copy x to p for sparse MV operation
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        CopyVectorD2D(x, p);
#endif
    }
    else
    {
        CopyVector(x, p);
    }
    TICKD();
    ComputeSPMV(A, p, Ap);
    TOCKD(t3); // Ap = A*p
    TICKD();
    ComputeWAXPBY(nrow, 1.0, b, -1.0, Ap, r, A.isWaxpbyOptimized, A.rankType);
    TOCKD(t2); // r = b - Ax (x stored in p)
    TICKD();
    ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized, A.rankType);
    TOCKD(t1);
    normr = sqrt(normr);
    if (A.geom->rank == 0 && flag)
    {
        if (use_output_file)
        {
            HPCG_fout << "Initial Residual = " << normr << std::endl;
        }
        else
        {
            std::cout << "Initial Residual = " << normr << std::endl;
        }
    }
    // Record initial residual for convergence testing
    normr0 = normr;

    // Start iterations. The (1.0 + 1.0e-6) slack lets the loop stop once the
    // scaled residual is within rounding distance of the tolerance.
    for (int k = 1; k <= max_iter && normr / normr0 * (1.0 + 1.0e-6) > tolerance; k++)
    {
        TICKD();
        if (doPreconditioning)
        {
            ComputeMG(A, r, z); // Apply preconditioner
            if (A.rankType == GPU)
            {
#ifdef USE_CUDA
                // Make sure the asynchronous MG kernels have finished before
                // the preconditioner timer is stopped.
                cudaStreamSynchronize(stream);
#endif
            }
        }
        else
        {
            if (A.rankType == GPU)
            {
#ifdef USE_CUDA
                CopyVectorD2D(r, z); // copy r to z (no preconditioning)
#endif
            }
            else
            {
                CopyVector(r, z); // copy r to z (no preconditioning)
            }
        }
        TOCKD(t5); // Preconditioner apply time
        if (k == 1)
        {
            TICKD();
            ComputeWAXPBY(nrow, 1.0, z, 0.0, z, p, A.isWaxpbyOptimized, A.rankType);
            TOCKD(t2); // Copy Mr to p
            TICKD();
            ComputeDotProduct(nrow, r, z, rtz, t4, A.isDotProductOptimized, A.rankType);
            TOCKD(t1); // rtz = r'*z
        }
        else
        {
            oldrtz = rtz;
            TICKD();
            ComputeDotProduct(nrow, r, z, rtz, t4, A.isDotProductOptimized, A.rankType);
            TOCKD(t1); // rtz = r'*z
            beta = rtz / oldrtz;
            TICKD();
            ComputeWAXPBY(nrow, 1.0, z, beta, p, p, A.isWaxpbyOptimized, A.rankType);
            TOCKD(t2); // p = beta*p + z
        }
        TICKD();
        ComputeSPMV(A, p, Ap);
        TOCKD(t3); // Ap = A*p
        TICKD();
        ComputeDotProduct(nrow, p, Ap, pAp, t4, A.isDotProductOptimized, A.rankType);
        TOCKD(t1); // alpha = p'*Ap
        alpha = rtz / pAp;
        TICKD();
        ComputeWAXPBY(nrow, 1.0, x, alpha, p, x, A.isWaxpbyOptimized, A.rankType); // x = x + alpha*p
        ComputeWAXPBY(nrow, 1.0, r, -alpha, Ap, r, A.isWaxpbyOptimized, A.rankType);
        TOCKD(t2); // r = r - alpha*Ap
        TICKD();
        ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized, A.rankType);
        TOCKD(t1);
        normr = sqrt(normr);
        if (flag && A.geom->rank == 0 && (k % print_freq == 0 || k == max_iter))
        {
            if (use_output_file)
            {
                HPCG_fout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
            }
            else
            {
                std::cout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
            }
        }
        niters = k;
    }

    // Store times
    times[1] += t1; // dot-product time
    times[2] += t2; // WAXPBY time
    times[3] += t3; // SPMV time
    times[4] += t4; // AllReduce time
    times[5] += t5; // preconditioner apply time
    // #ifndef HPCG_NO_MPI
    //     times[6] += t6; // exchange halo time
    // #endif
    times[0] += mytimer() - t_begin; // Total time. All done...
    return 0;
}

55
src/CG.hpp Normal file
View File

@@ -0,0 +1,55 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CG_HPP
#define CG_HPP
#include "CGData.hpp"
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int CG(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
    int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag);
// Optimized Conjugate Gradient solver (implemented in CG.cpp).
// A - matrix; also carries the domain geometry / processor topology via A.geom
// data - preallocated CG work vectors
// b - right-hand-side vector (constant)
// x - on entry the initial guess; on exit the approximate solution
// max_iter - maximum number of iterations to perform
// tolerance - stopping tolerance on the scaled residual norm
// niters - number of iterations actually performed
// normr - computed residual norm after the last iteration
// normr0 - residual norm before the first iteration
// times - array of accumulated timing information
// doPreconditioning - whether the MG preconditioner (symmetric GS smoothing) is applied
// flag - when non-zero, residual progress is printed each iteration
#endif // CG_HPP

84
src/CGData.hpp Normal file
View File

@@ -0,0 +1,84 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file CGData.hpp
HPCG data structure
*/
#ifndef CGDATA_HPP
#define CGDATA_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Bundle of work vectors used by the CG iteration (see InitializeSparseCGData
// for the sizes: r and Ap are row-length; z and p are column-length so they can
// hold halo entries in MPI mode).
struct CGData_STRUCT
{
    Vector r;  //!< residual vector (a Vector value, not a pointer)
    Vector z;  //!< preconditioned residual vector
    Vector p;  //!< direction vector
    Vector Ap; //!< Krylov work vector holding A*p
};
typedef struct CGData_STRUCT CGData;
/*!
Constructor for the data structure of CG vectors.
@param[in] A the data structure that describes the problem matrix and its structure
@param[out] data the data structure for CG vectors that will be allocated to get it ready for use in CG iterations
*/
/*!
  Allocates the four CG work vectors for the given problem matrix.
  r and Ap use the local row count; z and p use the local column count so they
  can carry halo entries for the sparse matrix-vector product in MPI mode.
  @param[in]  A    matrix providing local sizes and the rank type
  @param[out] data CG vector bundle to be allocated
*/
inline void InitializeSparseCGData(SparseMatrix& A, CGData& data)
{
    const local_int_t numRows = A.localNumberOfRows;
    const local_int_t numCols = A.localNumberOfColumns;
    InitializeVector(data.r, numRows, A.rankType);
    InitializeVector(data.z, numCols, A.rankType, true /*Only when rank type is GPU*/);
    InitializeVector(data.p, numCols, A.rankType, true);
    InitializeVector(data.Ap, numRows, A.rankType);
}
/*!
Destructor for the CG vectors data.
@param[inout] data the CG vectors data structure whose storage is deallocated
*/
/*!
  Releases the storage held by the four CG work vectors.
  @param[inout] data the CG vector bundle whose storage is deallocated
*/
inline void DeleteCGData(CGData& data)
{
    // Each vector is torn down independently in allocation order.
    DeleteVector(data.r);
    DeleteVector(data.z);
    DeleteVector(data.p);
    DeleteVector(data.Ap);
}
#endif // CGDATA_HPP

198
src/CG_ref.cpp Normal file
View File

@@ -0,0 +1,198 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file CG_ref.cpp
HPCG routine
*/
#include <cmath>
#include <fstream>
#include <iostream>
#include "hpcg.hpp"
#include "CG_ref.hpp"
#include "ComputeDotProduct_ref.hpp"
#include "ComputeMG_ref.hpp"
#include "ComputeSPMV_ref.hpp"
#include "ComputeWAXPBY_ref.hpp"
#include "mytimer.hpp"
extern int use_output_file;
// Use TICK and TOCK to time a code section in MATLAB-like fashion
#define TICK() t0 = mytimer() //!< record current time in 't0'
#define TOCK(t) t += mytimer() - t0 //!< store time difference in 't' using time in 't0'
/*!
Reference routine to compute an approximate solution to Ax = b
@param[inout] A The known system matrix
@param[inout] data The data structure with all necessary CG vectors preallocated
@param[in] b The known right hand side vector
@param[inout] x On entry: the initial guess; on exit: the new approximate solution
@param[in] max_iter The maximum number of iterations to perform, even if tolerance is not met.
@param[in] tolerance The stopping criterion to assert convergence: if norm of residual is <= to tolerance.
@param[out] niters The number of iterations actually performed.
@param[out] normr The 2-norm of the residual vector after the last iteration.
@param[out] normr0 The 2-norm of the residual vector before the first iteration.
@param[out] times The 7-element vector of the timing information accumulated during all of the iterations.
@param[in] doPreconditioning The flag to indicate whether the preconditioner should be invoked at each iteration.
@return Returns zero on success and a non-zero value otherwise.
@see CG()
*/
int CG_ref(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
    int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag)
{
    // NOTE: this is the reference CG implementation used to validate the
    // optimized CG(); its numerical steps are intentionally left untouched.
    double t_begin = mytimer(); // Start timing right away
    normr = 0.0;
    double rtz = 0.0, oldrtz = 0.0, alpha = 0.0, beta = 0.0, pAp = 0.0;
    double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0;
    // #ifndef HPCG_NO_MPI
    //     double t6 = 0.0;
    // #endif
    local_int_t nrow = A.localNumberOfRows;
    Vector& r = data.r;   // Residual vector
    Vector& z = data.z;   // Preconditioned residual vector
    Vector& p = data.p;   // Direction vector (in MPI mode ncol>=nrow)
    Vector& Ap = data.Ap; // Work vector holding A*p
    // Dangling-else: the 'else' below binds to the inner 'if (use_output_file)';
    // the warning is emitted only on rank 0 when preconditioning is disabled.
    if (!doPreconditioning && A.geom->rank == 0)
        if (use_output_file)
        {
            HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }
        else
        {
            std::cout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;
        }
#if 1
    // def HPCG_DEBUG  (print path is compiled in unconditionally here)
    int print_freq = 1;
    if (print_freq > 50)
        print_freq = 50;
    if (print_freq < 1)
        print_freq = 1;
#endif
    // p is of length ncols, copy x to p for sparse MV operation
    CopyVector(x, p);
    TICK();
    ComputeSPMV_ref(A, p, Ap);
    TOCK(t3); // Ap = A*p
    TICK();
    ComputeWAXPBY_ref(nrow, 1.0, b, -1.0, Ap, r);
    TOCK(t2); // r = b - Ax (x stored in p)
    TICK();
    ComputeDotProduct_ref(nrow, r, r, normr, t4);
    TOCK(t1);
    normr = sqrt(normr);
#if 1
    // def HPCG_DEBUG
    if (A.geom->rank == 0 && flag)
        if (use_output_file)
        {
            HPCG_fout << "Initial Residual = " << normr << std::endl;
        }
        else
        {
            std::cout << "Initial Residual = " << normr << std::endl;
        }
#endif
    // Record initial residual for convergence testing
    normr0 = normr;
    // Start iterations
    for (int k = 1; k <= max_iter && normr / normr0 > tolerance; k++)
    {
        TICK();
        if (doPreconditioning)
            ComputeMG_ref(A, r, z); // Apply preconditioner
        else
            ComputeWAXPBY_ref(nrow, 1.0, r, 0.0, r, z); // copy r to z (no preconditioning)
        TOCK(t5);                                       // Preconditioner apply time
        if (k == 1)
        {
            // NOTE(review): there is no TICK() before this copy, so t2 also
            // absorbs the interval measured since the TICK above the
            // preconditioner branch -- left as-is to preserve reference timing.
            CopyVector(z, p);
            TOCK(t2); // Copy Mr to p
            TICK();
            ComputeDotProduct_ref(nrow, r, z, rtz, t4);
            TOCK(t1); // rtz = r'*z
        }
        else
        {
            oldrtz = rtz;
            TICK();
            ComputeDotProduct_ref(nrow, r, z, rtz, t4);
            TOCK(t1); // rtz = r'*z
            beta = rtz / oldrtz;
            TICK();
            ComputeWAXPBY_ref(nrow, 1.0, z, beta, p, p);
            TOCK(t2); // p = beta*p + z
        }
        TICK();
        ComputeSPMV_ref(A, p, Ap);
        TOCK(t3); // Ap = A*p
        TICK();
        ComputeDotProduct_ref(nrow, p, Ap, pAp, t4);
        TOCK(t1); // alpha = p'*Ap
        alpha = rtz / pAp;
        TICK();
        ComputeWAXPBY_ref(nrow, 1.0, x, alpha, p, x); // x = x + alpha*p
        ComputeWAXPBY_ref(nrow, 1.0, r, -alpha, Ap, r);
        TOCK(t2); // r = r - alpha*Ap
        TICK();
        ComputeDotProduct_ref(nrow, r, r, normr, t4);
        TOCK(t1);
        normr = sqrt(normr);
#if 1
        // def HPCG_DEBUG
        if (flag && A.geom->rank == 0 && (k % print_freq == 0 || k == max_iter))
            if (use_output_file)
            {
                HPCG_fout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
            }
            else
            {
                std::cout << "Iteration = " << k << " Scaled Residual = " << normr / normr0 << std::endl;
            }
#endif
        niters = k;
    }
    // Store times
    times[1] += t1; // dot product time
    times[2] += t2; // WAXPBY time
    times[3] += t3; // SPMV time
    times[4] += t4; // AllReduce time
    times[5] += t5; // preconditioner apply time
    // #ifndef HPCG_NO_MPI
    //     times[6] += t6; // exchange halo time
    // #endif
    times[0] += mytimer() - t_begin; // Total time. All done...
    return 0;
}

42
src/CG_ref.hpp Normal file
View File

@@ -0,0 +1,42 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef CG_REF_HPP
#define CG_REF_HPP
#include "CGData.hpp"
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// The use of CPU and GPU Sparse Matrix is intended to resolve
// the linked list structures for MG coarse levels.
// There is no change to the reference code itself.
int CG_ref(const SparseMatrix& A, CGData& data, const Vector& b, Vector& x, const int max_iter, const double tolerance,
    int& niters, double& normr, double& normr0, double* times, bool doPreconditioning, int flag);
// Reference Conjugate Gradient solver (implemented in CG_ref.cpp).
// A - matrix; also carries the domain geometry / processor topology via A.geom
// b - right-hand-side vector (constant)
// x - on entry the initial guess; on exit the approximate solution
// max_iter - maximum number of iterations to perform
// tolerance - stopping tolerance for preconditioned iterations
// niters - number of iterations performed
// normr - computed residual norm
// normr0 - original residual norm
// times - array of timing information
// doPreconditioning - whether or not symmetric GS preconditioning will be applied
// flag - when non-zero, residual progress is printed each iteration
#endif // CG_REF_HPP

84
src/CheckAspectRatio.cpp Normal file
View File

@@ -0,0 +1,84 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file CheckAspectRatio.cpp
HPCG routine
*/
#include <algorithm>
#include <iostream>
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#include "hpcg.hpp"
#include "CheckAspectRatio.hpp"
extern int use_output_file;
/*!
  Verifies that the (x,y,z) extents of a grid are acceptably cube-like.
  @param[in] smallest_ratio minimum allowed value of min(x,y,z)/max(x,y,z)
  @param[in] x, y, z        the dimensions to check
  @param[in] what           label used in the diagnostic message
  @param[in] DoIo           whether this rank should print the diagnostic
  @return 0 when the ratio is acceptable; 127 otherwise (after MPI_Abort when
          MPI is enabled)
*/
int CheckAspectRatio(double smallest_ratio, int x, int y, int z, const char* what, bool DoIo)
{
    const int smallest_dim = std::min(std::min(x, y), z);
    const int largest_dim = std::max(std::max(x, y), z);
    const double current_ratio = smallest_dim / double(largest_dim);
    // Ratio of the smallest to the largest dimension must meet the threshold.
    if (current_ratio >= smallest_ratio)
        return 0;
    if (DoIo)
    {
        // Route the diagnostic to the run's log file or to stdout.
        std::ostream& os
            = use_output_file ? static_cast<std::ostream&>(HPCG_fout) : static_cast<std::ostream&>(std::cout);
        os << "The " << what << " sizes (" << x << "," << y << "," << z
           << ") are invalid because the ratio min(x,y,z)/max(x,y,z)=" << current_ratio
           << " is too small (at least " << smallest_ratio << " is required)." << std::endl;
        os << "The shape should resemble a 3D cube. Please adjust and try again." << std::endl;
        os.flush();
    }
#ifndef HPCG_NO_MPI
    MPI_Abort(MPI_COMM_WORLD, 127);
#endif
    return 127;
}

18
src/CheckAspectRatio.hpp Normal file
View File

@@ -0,0 +1,18 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef CHECKASPECTRATIO_HPP
#define CHECKASPECTRATIO_HPP
// Returns 0 when min(x,y,z)/max(x,y,z) >= smallest_ratio; otherwise prints a
// diagnostic (when DoIo is true), aborts MPI when enabled, and returns 127.
extern int CheckAspectRatio(double smallest_ratio, int x, int y, int z, const char* what, bool DoIo);
#endif // CHECKASPECTRATIO_HPP

192
src/CheckProblem.cpp Normal file
View File

@@ -0,0 +1,192 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file CheckProblem.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG)
#include <fstream>
using std::endl;
#include "hpcg.hpp"
#endif
#include <cassert>
#include "CheckProblem.hpp"
/*!
Check the contents of the generated sparse matrix to see if values match expected contents.
@param[in] A The known system matrix
@param[inout] b The newly allocated and generated right hand side vector (if b!=0 on entry)
@param[inout] x The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
@param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if the xexact!=0
non-zero on entry)
@see GenerateGeometry
*/
void CheckProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
    // Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;
    local_int_t localNumberOfRows = nx * ny * nz;     // This is the size of our subblock
    global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
    double* bv = 0;
    double* xv = 0;
    double* xexactv = 0;
    if (b != 0)
        bv = b->values; // Only compute exact solution if requested
    if (x != 0)
        xv = x->values; // Only compute exact solution if requested
    if (xexact != 0)
        xexactv = xexact->values; // Only compute exact solution if requested
    local_int_t localNumberOfNonzeros = 0;
    // TODO: This triply nested loop could be flattened or use nested parallelism
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t iz = 0; iz < nz; iz++)
    {
        global_int_t giz = giz0 + iz;
        for (local_int_t iy = 0; iy < ny; iy++)
        {
            global_int_t giy = giy0 + iy;
            for (local_int_t ix = 0; ix < nx; ix++)
            {
                global_int_t gix = gix0 + ix;
                local_int_t currentLocalRow = iz * nx * ny + iy * nx + ix;
                global_int_t currentGlobalRow = giz * gnx * gny + giy * gnx + gix;
                assert(A.localToGlobalMap[currentLocalRow] == currentGlobalRow);
#ifdef HPCG_DETAILED_DEBUG
                HPCG_fout << " rank, globalRow, localRow = " << A.geom->rank << " " << currentGlobalRow << " "
                          << A.globalToLocalMap.find(currentGlobalRow)->second << endl;
#endif
                // A 27-point stencil row has at most 27 entries, so 'char' is
                // wide enough for this per-row counter.
                char numberOfNonzerosInRow = 0;
                double* currentValuePointer
                    = A.matrixValues[currentLocalRow]; // Pointer to current value in current row
                global_int_t* currentIndexPointerG
                    = A.mtxIndG[currentLocalRow]; // Pointer to current index in current row
                // Walk the 3x3x3 stencil neighborhood of this grid point,
                // skipping neighbors that fall outside the global grid.
                for (int sz = -1; sz <= 1; sz++)
                {
                    if (giz + sz > -1 && giz + sz < gnz)
                    {
                        for (int sy = -1; sy <= 1; sy++)
                        {
                            if (giy + sy > -1 && giy + sy < gny)
                            {
                                for (int sx = -1; sx <= 1; sx++)
                                {
                                    if (gix + sx > -1 && gix + sx < gnx)
                                    {
                                        global_int_t curcol = currentGlobalRow + sz * gnx * gny + sy * gnx + sx;
                                        if (curcol == currentGlobalRow)
                                        {
                                            // Diagonal entry: value 26, and the stored
                                            // diagonal pointer must reference this slot.
                                            assert(A.matrixDiagonal[currentLocalRow] == currentValuePointer);
                                            assert(*currentValuePointer++ == 26.0);
                                        }
                                        else
                                        {
                                            // Off-diagonal stencil entries are all -1.
                                            assert(*currentValuePointer++ == -1.0);
                                        }
                                        assert(*currentIndexPointerG++ == curcol);
                                        numberOfNonzerosInRow++;
                                    } // end x bounds test
                                } // end sx loop
                            } // end y bounds test
                        } // end sy loop
                    } // end z bounds test
                } // end sz loop
                assert(A.nonzerosInRow[currentLocalRow] == numberOfNonzerosInRow);
#ifndef HPCG_NO_OPENMP
#pragma omp critical
#endif
                localNumberOfNonzeros += numberOfNonzerosInRow; // Protect this with an atomic
                if (b != 0)
                    assert(bv[currentLocalRow] == 26.0 - ((double) (numberOfNonzerosInRow - 1)));
                if (x != 0)
                    assert(xv[currentLocalRow] == 0.0);
                if (xexact != 0)
                    assert(xexactv[currentLocalRow] == 1.0);
            } // end ix loop
        } // end iy loop
    } // end iz loop
#ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
              << endl
              << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
              << " nonzeros." << endl;
#endif
    global_int_t totalNumberOfNonzeros = 0;
#ifndef HPCG_NO_MPI
    // Use MPI's reduce function to sum all nonzeros
#ifdef HPCG_NO_LONG_LONG
    MPI_Allreduce(&localNumberOfNonzeros, &totalNumberOfNonzeros, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
#else
    long long lnnz = localNumberOfNonzeros, gnnz = 0; // convert to 64 bit for MPI call
    MPI_Allreduce(&lnnz, &gnnz, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
    totalNumberOfNonzeros = gnnz; // Copy back
#endif
#else
    totalNumberOfNonzeros = localNumberOfNonzeros;
#endif
    // The regenerated counts must agree with the totals recorded in A.
    assert(A.totalNumberOfRows == totalNumberOfRows);
    assert(A.totalNumberOfNonzeros == totalNumberOfNonzeros);
    assert(A.localNumberOfRows == localNumberOfRows);
    assert(A.localNumberOfNonzeros == localNumberOfNonzeros);
    return;
}

21
src/CheckProblem.hpp Normal file
View File

@@ -0,0 +1,21 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef CHECKPROBLEM_HPP
#define CHECKPROBLEM_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Asserts that the generated 27-point stencil matrix A -- and, when non-null,
// the vectors b, x, and xexact -- match the expected synthetic problem contents.
void CheckProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
#endif // CHECKPROBLEM_HPP

114
src/ComputeDotProduct.cpp Normal file
View File

@@ -0,0 +1,114 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeDotProduct.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include "mytimer.hpp"
#include <mpi.h>
#endif
#include "ComputeDotProduct.hpp"
#include "ComputeDotProduct_ref.hpp"
#ifdef USE_CUDA
#include "Cuda.hpp"
#define CHECK_CUBLAS(x) \
do \
{ \
cublasStatus_t cublasStatus = (x); \
if (cublasStatus != CUBLAS_STATUS_SUCCESS) \
{ \
fprintf(stderr, "CUBLAS: %s = %d at (%s:%d)\n", #x, cublasStatus, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
#endif
#ifdef USE_GRACE
#include "CpuKernels.hpp"
#endif
/*!
Routine to compute the dot product of two vectors.
This routine calls the reference dot-product implementation by default, but
can be replaced by a custom routine that is optimized and better suited for
the target system.
@param[in] n the number of vector elements (on this processor)
@param[in] x, y the input vectors
@param[out] result a pointer to scalar value, on exit will contain the result.
@param[out] time_allreduce the time it took to perform the communication between processes
@param[out] isOptimized should be set to false if this routine uses the reference implementation (is not optimized);
otherwise leave it unchanged
@return returns 0 upon success and non-zero otherwise
@see ComputeDotProduct_ref
*/
int ComputeDotProduct(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce,
    bool& isOptimized, rank_type_t rt)
{
    // Partial sum over this rank's n local entries.
    double local_result = 0.0;
    if (rt == GPU)
    {
#ifdef USE_CUDA
        // Check the cuBLAS status instead of dropping it into an unused local:
        // the original stored it in 'cublasStatus_t t' and never inspected it,
        // silently ignoring failures (the file defines CHECK_CUBLAS for this).
        CHECK_CUBLAS(cublasDdot(cublashandle, n, x.values_d, 1, y.values_d, 1, &local_result));
#endif
    }
    else
    {
#ifdef USE_GRACE
        // Consider replacing with NVPL BLAS dot product
        ComputeDotProductCpu(n, x, y, local_result, isOptimized);
#endif
    }
#ifndef HPCG_NO_MPI
    // Use MPI's reduce function to collect all partial sums
    double t0 = mytimer();
    double global_result = 0.0;
    MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    result = global_result;
    t0 = mytimer() - t0;
    time_allreduce += t0; // accumulate communication time for reporting
#else
    time_allreduce += 0.0;
    result = local_result;
#endif
    return 0;
}

39
src/ComputeDotProduct.hpp Normal file
View File

@@ -0,0 +1,39 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef COMPUTEDOTPRODUCT_HPP
#define COMPUTEDOTPRODUCT_HPP
#include "Vector.hpp"
// Optimized dot product of x and y over the local n entries; in MPI builds the
// partial sums are combined with an all-reduce whose cost is accumulated into
// time_allreduce. rt selects the CPU or GPU code path (see ComputeDotProduct.cpp).
int ComputeDotProduct(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce,
    bool& isOptimized, rank_type_t rt);
#endif // COMPUTEDOTPRODUCT_HPP

View File

@@ -0,0 +1,84 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeDotProduct_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include "mytimer.hpp"
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeDotProduct_ref.hpp"
#include <cassert>
/*!
Routine to compute the dot product of two vectors where:
This is the reference dot-product implementation. It _CANNOT_ be modified for the
purposes of this benchmark.
@param[in] n the number of vector elements (on this processor)
@param[in] x, y the input vectors
@param[in] result a pointer to scalar value, on exit will contain result.
@param[out] time_allreduce the time it took to perform the communication between processes
@return returns 0 upon success and non-zero otherwise
@see ComputeDotProduct
*/
int ComputeDotProduct_ref(const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce)
{
assert(x.localLength >= n); // Test vector lengths
assert(y.localLength >= n);
double local_result = 0.0;
double* xv = x.values;
double* yv = y.values;
// Special-case x == y (a norm computation): one load per element instead of two.
if (yv == xv)
{
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction(+ : local_result)
#endif
for (local_int_t i = 0; i < n; i++)
local_result += xv[i] * xv[i];
}
else
{
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction(+ : local_result)
#endif
for (local_int_t i = 0; i < n; i++)
local_result += xv[i] * yv[i];
}
#ifndef HPCG_NO_MPI
// Use MPI's reduce function to collect all partial sums
double t0 = mytimer();
double global_result = 0.0;
MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
result = global_result;
// Only the communication is charged to time_allreduce (timer brackets the Allreduce).
time_allreduce += mytimer() - t0;
#else
time_allreduce += 0.0; // keep the accumulator well-defined in the serial build
result = local_result;
#endif
return 0;
}

View File

@@ -0,0 +1,21 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEDOTPRODUCT_REF_HPP
#define COMPUTEDOTPRODUCT_REF_HPP
#include "Vector.hpp"
// Reference (non-optimized) dot product; implementation in ComputeDotProduct_ref.cpp.
int ComputeDotProduct_ref(
const local_int_t n, const Vector& x, const Vector& y, double& result, double& time_allreduce);
#endif // COMPUTEDOTPRODUCT_REF_HPP

96
src/ComputeMG.cpp Normal file
View File

@@ -0,0 +1,96 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeMG.cpp
HPCG routine
*/
#include "ComputeMG.hpp"
#include "ComputeProlongation.hpp"
#include "ComputeRestriction.hpp"
#include "ComputeSYMGS.hpp"
#include "CudaKernels.hpp"
/*!
@param[in] A the known system matrix
@param[in] r the input vector
@param[inout] x On exit contains the result of the multigrid V-cycle with r as the RHS, x is the approximation to Ax =
r.
@return returns 0 upon success and non-zero otherwise
@see ComputeMG_ref
*/
int ComputeMG(const SparseMatrix& A, const Vector& r, Vector& x)
{
    int ierr = 0;
    if (A.mgData != 0)
    { // Go to next coarse level if defined
        // Pre-smoothing sweep (step flag = 1).
        ComputeSYMGS(A, r, x, 1);
        // Restrict the fine-grid residual to the coarse grid on whichever
        // device owns this rank's data.
        if (A.rankType == GPU)
        {
#ifdef USE_CUDA
            ComputeRestrictionCuda(A, r);
#endif
        }
        else
        {
#ifdef USE_GRACE
            ComputeRestriction(A, r);
#endif
        }
        // Recursive coarse-grid solve. Fix: the original stored the return
        // code in 'ierr' but never examined it, silently swallowing failures
        // from coarser levels; propagate it instead.
        ierr = ComputeMG(*A.Ac, *A.mgData->rc, *A.mgData->xc);
        if (ierr != 0)
            return ierr;
        // Prolongate the coarse-grid correction back into x on this level.
        if (A.rankType == GPU)
        {
#ifdef USE_CUDA
            ComputeProlongationCuda(A, x);
#endif
        }
        else
        {
#ifdef USE_GRACE
            ComputeProlongation(A, x);
#endif
        }
        // Post-smoothing sweep (step flag = 0).
        ComputeSYMGS(A, r, x, 0);
    }
    else
    {
        // Coarsest level: smoothing only, no further recursion.
        ComputeSYMGS(A, r, x, 1);
    }
    return 0;
}

22
src/ComputeMG.hpp Normal file
View File

@@ -0,0 +1,22 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEMG_HPP
#define COMPUTEMG_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Optimized multigrid V-cycle preconditioner; implementation in ComputeMG.cpp.
int ComputeMG(const SparseMatrix& A, const Vector& r, Vector& x);
#endif // COMPUTEMG_HPP

81
src/ComputeMG_ref.cpp Normal file
View File

@@ -0,0 +1,81 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeSYMGS_ref.cpp
HPCG routine
*/
#include "ComputeMG_ref.hpp"
#include "ComputeProlongation_ref.hpp"
#include "ComputeRestriction_ref.hpp"
#include "ComputeSPMV_ref.hpp"
#include "ComputeSYMGS_ref.hpp"
#include <cassert>
#include <iostream>
/*!
@param[in] A the known system matrix
@param[in] r the input vector
@param[inout] x On exit contains the result of the multigrid V-cycle with r as the RHS, x is the approximation to Ax =
r.
@return returns 0 upon success and non-zero otherwise
@see ComputeMG
*/
int ComputeMG_ref(const SparseMatrix& A, const Vector& r, Vector& x)
{
assert(x.localLength == A.localNumberOfColumns); // Make sure x contain space for halo values
ZeroVector(x); // initialize x to zero
int ierr = 0;
if (A.mgData != 0)
{ // Go to next coarse level if defined
// Pre-smoothing sweeps on this level.
int numberOfPresmootherSteps = A.mgData->numberOfPresmootherSteps;
for (int i = 0; i < numberOfPresmootherSteps; ++i)
ierr += ComputeSYMGS_ref(A, r, x);
if (ierr != 0)
return ierr;
// Axf = A*x is needed to form the fine-grid residual used by the restriction.
ierr = ComputeSPMV_ref(A, x, *A.mgData->Axf);
if (ierr != 0)
return ierr;
// Perform restriction operation using simple injection
ierr = ComputeRestriction_ref(A, r);
if (ierr != 0)
return ierr;
// Recursively apply the V-cycle to the coarse problem.
ierr = ComputeMG_ref(*A.Ac, *A.mgData->rc, *A.mgData->xc);
if (ierr != 0)
return ierr;
// Add the coarse-grid correction back into x.
ierr = ComputeProlongation_ref(A, x);
if (ierr != 0)
return ierr;
// Post-smoothing sweeps on this level.
int numberOfPostsmootherSteps = A.mgData->numberOfPostsmootherSteps;
for (int i = 0; i < numberOfPostsmootherSteps; ++i)
ierr += ComputeSYMGS_ref(A, r, x);
if (ierr != 0)
return ierr;
}
else
{
// Coarsest level: a single symmetric Gauss-Seidel sweep.
ierr = ComputeSYMGS_ref(A, r, x);
if (ierr != 0)
return ierr;
}
return 0;
}

26
src/ComputeMG_ref.hpp Normal file
View File

@@ -0,0 +1,26 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEMG_REF_HPP
#define COMPUTEMG_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// The use of CPU and GPU Sparse Matrix is intended to resolve
// the linked list structures for MG coarse levels (A->Ac).
// There is no change to the reference code.
int ComputeMG_ref(const SparseMatrix& A, const Vector& r, Vector& x);
#endif // COMPUTEMG_REF_HPP

View File

@@ -0,0 +1,175 @@
#include <cmath>
#include <cstdlib>
#ifdef HPCG_CUBIC_RADICAL_SEARCH
#include <algorithm>
#endif
#include <map>
#include "ComputeOptimalShapeXYZ.hpp"
#include "MixedBaseCounter.hpp"
#ifdef HPCG_CUBIC_RADICAL_SEARCH
// Smallest of three ints.
static int min3(int a, int b, int c)
{
    return std::min({a, b, c});
}

// Largest of three ints.
static int max3(int a, int b, int c)
{
    return std::max({a, b, c});
}

// Exhaustively factor n into f1*f2*f3, keeping the triple whose min/max
// ratio is closest to 1 (the most cube-like box). Outputs are written only
// when a candidate factorization is found.
static void cubic_radical_search(int n, int& x, int& y, int& z)
{
    double best_ratio = 0.0;
    for (int f1 = (int) (pow(n, 1.0 / 3.0) + 0.5); f1 > 0; --f1)
    {
        if (n % f1 != 0)
            continue;
        const int rest = n / f1;
        for (int f2 = (int) (pow(rest, 0.5) + 0.5); f2 > 0; --f2)
        {
            if (rest % f2 != 0)
                continue;
            const int f3 = rest / f2;
            const double ratio = (double) min3(f1, f2, f3) / max3(f1, f2, f3);
            if (ratio > best_ratio)
            {
                best_ratio = ratio;
                x = f1;
                y = f2;
                z = f3;
            }
        }
    }
}
#else
// Accumulate the prime factorization of n into 'factors' (prime -> multiplicity).
// For n == 1 (or when no factor was recorded) a single entry factors[n] is added,
// matching the original behavior.
static void ComputePrimeFactors(int n, std::map<int, int>& factors)
{
    // Trial-division bound, computed once from the original n.
    const int limit = int(sqrt(double(n)) + 1L);
    // Strip all factors of two first (shift instead of divide).
    while (n > 1 && (n & 1) == 0)
    {
        ++factors[2];
        n >>= 1;
    }
    // Trial-divide by successive odd candidates.
    for (int d = 3; d <= limit; d += 2)
    {
        while (n % d == 0)
        {
            ++factors[d];
            n /= d;
        }
    }
    // Leftover n > 1 is prime; also record n itself when nothing was found.
    if (n > 1 || factors.empty())
        ++factors[n];
}
// Integer power x^p by binary exponentiation.
// Special cases (kept from the original): 0 and 1 are returned unchanged for
// any exponent, and a negative exponent yields 0.
static int pow_i(int x, int p)
{
    if (x == 0 || x == 1)
        return x; // fixed points of exponentiation
    if (p < 0)
        return 0; // integer result of x^(-p) truncates to zero
    int result = 1;
    int base = x;
    for (int e = p; e != 0; e >>= 1)
    {
        if (e & 1)
            result *= base;
        base *= base;
    }
    return result;
}
#endif
// Split the total rank count 'xyz' into a 3-D decomposition x*y*z == xyz that
// is as balanced (cube-like) as possible, based on the prime factorization.
void ComputeOptimalShapeXYZ(int xyz, int& x, int& y, int& z)
{
#ifdef HPCG_CUBIC_RADICAL_SEARCH
cubic_radical_search(xyz, x, y, z);
#else
std::map<int, int> factors;
ComputePrimeFactors(xyz, factors); // factors are sorted: ascending order
std::map<int, int>::iterator iter = factors.begin();
// there is at least one prime factor
x = (iter++)->first; // cache the first factor, move to the next one
y = iter != factors.end() ? (iter++)->first : y; // try to cache the second factor in "y"
if (factors.size() == 1)
{ // only a single factor p: distribute p^k as evenly as possible over x, y, z
z = pow_i(x, factors[x] / 3);
y = pow_i(x, factors[x] / 3 + ((factors[x] % 3) >= 2 ? 1 : 0));
x = pow_i(x, factors[x] / 3 + ((factors[x] % 3) >= 1 ? 1 : 0));
}
else if (factors.size() == 2 && factors[x] == 1 && factors[y] == 1)
{ // two distinct prime factors
z = 1;
}
else if (factors.size() == 2 && factors[x] + factors[y] == 3)
{ // three prime factors, one repeated
z = factors[x] == 2 ? x : y; // test which factor is repeated
}
else if (factors.size() == 3 && factors[x] == 1 && factors[y] == 1 && iter->second == 1)
{ // three distinct and single prime factors
z = iter->first;
}
else
{ // 3 or more prime factors so try all possible 3-subsets
// NOTE: arrays are fixed at 33 entries -- assumes at most 32 distinct
// prime factors, which any 'int' satisfies.
int i, distinct_factors[32 + 1], count_factors[32 + 1];
i = 0;
for (std::map<int, int>::iterator iter = factors.begin(); iter != factors.end(); ++iter, ++i)
{
distinct_factors[i] = iter->first;
count_factors[i] = iter->second;
}
// count total number of prime factors in "c_main" and distribute some factors into "c1"
MixedBaseCounter c_main(count_factors, factors.size()), c1(count_factors, factors.size());
// at the beginning, minimum area is the maximum area
double area, min_area = 2.0 * xyz + 1.0;
// Enumerate all ways to split the multiset of prime factors between the
// first two dimensions; minimize the surface area of the resulting box.
for (c1.next(); !c1.is_zero(); c1.next())
{
MixedBaseCounter c2(c_main, c1); // "c2" gets the factors remaining in "c_main" that "c1" doesn't have
for (c2.next(); !c2.is_zero(); c2.next())
{
int tf1 = c1.product(distinct_factors);
int tf2 = c2.product(distinct_factors);
int tf3 = xyz / tf1 / tf2; // we derive the third dimension, we don't keep track of the factors it has
area = tf1 * double(tf2) + tf2 * double(tf3) + tf1 * double(tf3);
if (area < min_area)
{
min_area = area;
x = tf1;
y = tf2;
z = tf3;
}
}
}
}
#endif
}

View File

@@ -0,0 +1,2 @@
// Factor 'xyz' into dimensions x*y*z that are as close to a cube as possible.
void ComputeOptimalShapeXYZ(int xyz, int& x, int& y, int& z);

View File

@@ -0,0 +1,72 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeProlongation.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeProlongation.hpp"
/*!
Routine to compute the coarse residual vector.
@param[in] Af - Fine grid sparse matrix object containing pointers to current coarse grid correction and the f2c
operator.
@param[inout] xf - Fine grid solution vector, update with coarse grid correction.
Note that the fine grid residual is never explicitly constructed.
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeProlongation(const SparseMatrix& Af, Vector& xf)
{
    double* xfv = xf.values;                       // fine-grid solution (updated in place)
    double* xcv = Af.mgData->xc->values;           // coarse-grid correction
    // Fix: removed the unused local 'f2c' (= Af.mgData->f2cOperator); this
    // optimized path indexes through the permuted map Af.f2cPerm instead.
    local_int_t nc = Af.mgData->rc->localLength;   // number of coarse-grid points
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < nc; ++i)
    {
        // Scatter-add each coarse value into its injected fine-grid point.
        xfv[Af.f2cPerm[i]] += xcv[i];
    }
    return 0;
}

View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEPROLONGATION_HPP
#define COMPUTEPROLONGATION_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Optimized prolongation: adds the coarse-grid correction into the fine-grid xf.
int ComputeProlongation(const SparseMatrix& Af, Vector& xf);
#endif // COMPUTEPROLONGATION_HPP

View File

@@ -0,0 +1,55 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeProlongation_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeProlongation_ref.hpp"
/*!
Routine to compute the coarse residual vector.
@param[in] Af - Fine grid sparse matrix object containing pointers to current coarse grid correction and the f2c
operator.
@param[inout] xf - Fine grid solution vector, update with coarse grid correction.
Note that the fine grid residual is never explicitly constructed.
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeProlongation_ref(const SparseMatrix& Af, Vector& xf)
{
double* xfv = xf.values; // fine-grid solution (updated in place)
double* xcv = Af.mgData->xc->values; // coarse-grid correction
local_int_t* f2c = Af.mgData->f2cOperator; // injection map: coarse index -> fine index
local_int_t nc = Af.mgData->rc->localLength; // number of coarse-grid points
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
// TODO: Somehow note that this loop can be safely vectorized since f2c has no repeated indices
for (local_int_t i = 0; i < nc; ++i)
xfv[f2c[i]] += xcv[i]; // This loop is safe to vectorize
return 0;
}

View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEPROLONGATION_REF_HPP
#define COMPUTEPROLONGATION_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Reference prolongation; implementation in ComputeProlongation_ref.cpp.
int ComputeProlongation_ref(const SparseMatrix& Af, Vector& xf);
#endif // COMPUTEPROLONGATION_REF_HPP

95
src/ComputeResidual.cpp Normal file
View File

@@ -0,0 +1,95 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeResidual.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "Vector.hpp"
#ifdef HPCG_DETAILED_DEBUG
#include "hpcg.hpp"
#include <fstream>
#endif
#include "ComputeResidual.hpp"
#include <cmath> // needed for fabs
#ifdef HPCG_DETAILED_DEBUG
#include <iostream>
#endif
/*!
Routine to compute the inf-norm difference between two vectors where:
@param[in] n number of vector elements (local to this processor)
@param[in] v1, v2 input vectors
@param[out] residual pointer to scalar value; on exit, will contain result: inf-norm difference
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeResidual(const local_int_t n, const Vector& v1, const Vector& v2, double& residual)
{
double* v1v = v1.values;
double* v2v = v2.values;
double local_residual = 0.0;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel shared(local_residual, v1v, v2v)
{
// Each thread tracks its own maximum; merged under a critical section below.
double threadlocal_residual = 0.0;
#pragma omp for
for (local_int_t i = 0; i < n; i++)
{
double diff = std::fabs(v1v[i] - v2v[i]);
if (diff > threadlocal_residual)
threadlocal_residual = diff;
}
#pragma omp critical
{
if (threadlocal_residual > local_residual)
local_residual = threadlocal_residual;
}
}
#else // No threading
for (local_int_t i = 0; i < n; i++)
{
double diff = std::fabs(v1v[i] - v2v[i]);
if (diff > local_residual)
local_residual = diff;
#ifdef HPCG_DETAILED_DEBUG
HPCG_fout << " Computed, exact, diff = " << v1v[i] << " " << v2v[i] << " " << diff << std::endl;
#endif
}
#endif
#ifndef HPCG_NO_MPI
// Use MPI's reduce function to collect all partial maxima (MPI_MAX -- this is
// an inf-norm, not a sum).
double global_residual = 0;
MPI_Allreduce(&local_residual, &global_residual, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
residual = global_residual;
#else
residual = local_residual;
#endif
return 0;
}

19
src/ComputeResidual.hpp Normal file
View File

@@ -0,0 +1,19 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTERESIDUAL_HPP
#define COMPUTERESIDUAL_HPP
#include "Vector.hpp"
// Inf-norm of (v1 - v2), reduced across all ranks; see ComputeResidual.cpp.
int ComputeResidual(const local_int_t n, const Vector& v1, const Vector& v2, double& residual);
#endif // COMPUTERESIDUAL_HPP

View File

@@ -0,0 +1,75 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeRestriction.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeRestriction.hpp"
/*!
Routine to compute the coarse residual vector.
@param[inout] A - Sparse matrix object containing pointers to mgData->Axf, the fine grid matrix-vector product and
mgData->rc the coarse residual vector.
@param[in] rf - Fine grid RHS.
Note that the fine grid residual is never explicitly constructed.
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeRestriction(const SparseMatrix& A, const Vector& rf)
{
    double* Axfv = A.mgData->Axf->values;          // fine-grid A*x product
    double* rfv = rf.values;                       // fine-grid RHS
    double* rcv = A.mgData->rc->values;            // coarse residual (output)
    // Fix: removed the unused local 'f2c' (= A.mgData->f2cOperator); this
    // optimized path indexes through the permuted map A.f2cPerm instead.
    local_int_t nc = A.mgData->rc->localLength;    // number of coarse-grid points
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < nc; ++i)
    {
        // Inject the fine-grid residual r - A*x at each selected fine point.
        rcv[i] = rfv[A.f2cPerm[i]] - Axfv[A.f2cPerm[i]];
    }
    return 0;
}

View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTERESTRICTION_HPP
#define COMPUTERESTRICTION_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Optimized restriction: forms the coarse residual from rf and mgData->Axf.
int ComputeRestriction(const SparseMatrix& A, const Vector& rf);
#endif // COMPUTERESTRICTION_HPP

View File

@@ -0,0 +1,56 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeRestriction_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "ComputeRestriction_ref.hpp"
/*!
Routine to compute the coarse residual vector.
@param[inout] A - Sparse matrix object containing pointers to mgData->Axf, the fine grid matrix-vector product and
mgData->rc the coarse residual vector.
@param[in] rf - Fine grid RHS.
Note that the fine grid residual is never explicitly constructed.
We only compute it for the fine grid points that will be injected into corresponding coarse grid points.
@return Returns zero on success and a non-zero value otherwise.
*/
int ComputeRestriction_ref(const SparseMatrix& A, const Vector& rf)
{
double* Axfv = A.mgData->Axf->values; // fine-grid A*x product
double* rfv = rf.values; // fine-grid RHS
double* rcv = A.mgData->rc->values; // coarse residual (output)
local_int_t* f2c = A.mgData->f2cOperator; // injection map: coarse index -> fine index
local_int_t nc = A.mgData->rc->localLength; // number of coarse-grid points
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < nc; ++i)
rcv[i] = rfv[f2c[i]] - Axfv[f2c[i]]; // inject residual r - A*x at selected fine points
return 0;
}

View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTERESTRICTION_REF_HPP
#define COMPUTERESTRICTION_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Reference restriction; implementation in ComputeRestriction_ref.cpp.
int ComputeRestriction_ref(const SparseMatrix& A, const Vector& rf);
#endif // COMPUTERESTRICTION_REF_HPP

111
src/ComputeSPMV.cpp Normal file
View File

@@ -0,0 +1,111 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeSPMV.cpp
HPCG routine
*/
#include "ComputeSPMV.hpp"
#include "ComputeSPMV_ref.hpp"
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#endif
#ifdef USE_CUDA
#include "Cuda.hpp"
#include "CudaKernels.hpp"
#endif
#include "CpuKernels.hpp"
/*!
Routine to compute sparse matrix vector product y = Ax where:
Precondition: First call exchange_externals to get off-processor values of x
This routine calls the reference SpMV implementation by default, but
can be replaced by a custom, optimized routine suited for
the target system.
@param[in] A the known system matrix
@param[in] x the known vector
@param[out] y the On exit contains the result: Ax.
@return returns 0 upon success and non-zero otherwise
@see ComputeSPMV_ref
*/
int ComputeSPMV(const SparseMatrix& A, Vector& x, Vector& y)
{
double one = 1.0, zero = 0.0;
if (A.rankType == GPU)
{
// NOTE(review): the USE_CUDA guard below is commented out, so this CUDA-only
// path compiles unconditionally for GPU ranks -- confirm this is intended.
// #ifdef USE_CUDA
#ifndef HPCG_NO_MPI
// Stage this rank's boundary values into the send buffer before the local SpMV.
PackSendBufferCuda(A, x, false, copy_stream);
#endif
cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
cusparseDnVecSetValues(A.cusparseOpt.vecY, y.values_d);
// y = 1.0 * A * x + 0.0 * y over the local columns.
cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, A.cusparseOpt.matA, A.cusparseOpt.vecX,
&zero, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
#ifndef HPCG_NO_MPI
if (A.totalToBeSent > 0)
{
// Complete the halo exchange, then fold in the external-column contributions.
ExchangeHaloCuda(A, x, copy_stream);
ExtSpMVCuda((SparseMatrix&) A, one, x.values_d + A.localNumberOfRows, y.values_d);
}
#endif
cudaStreamSynchronize(stream);
// #endif
}
// NOTE(review): the CPU (USE_GRACE) branch below is commented out, so non-GPU
// ranks currently leave y untouched -- verify the CPU path is handled elsewhere
// before relying on this function for CPU ranks.
// else
// {
// #ifdef USE_GRACE
// nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, x.values);
// nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, y.values);
// nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matA,
// A.nvplSparseOpt.vecX, &zero, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
// NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvADescr);
// #ifndef HPCG_NO_MPI
// if (A.totalToBeSent > 0)
// {
// ExchangeHaloCpu(A, x);
// ExtSpMVCpu(A, A.localNumberOfRows, 1.0, x.values, y.values);
// }
// #endif
// #endif // USE_GRACE
// }
return 0;
}

22
src/ComputeSPMV.hpp Normal file
View File

@@ -0,0 +1,22 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTESPMV_HPP
#define COMPUTESPMV_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Optimized sparse matrix-vector product y = A*x; implementation in ComputeSPMV.cpp.
int ComputeSPMV(const SparseMatrix& A, Vector& x, Vector& y);
#endif // COMPUTESPMV_HPP

74
src/ComputeSPMV_ref.cpp Normal file
View File

@@ -0,0 +1,74 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeSPMV_ref.cpp
HPCG routine
*/
#include "ComputeSPMV_ref.hpp"
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include <cassert>
/*!
Routine to compute matrix vector product y = Ax where:
Precondition: First call exchange_externals to get off-processor values of x
This is the reference SPMV implementation. It CANNOT be modified for the
purposes of this benchmark.
@param[in] A the known system matrix
@param[in] x the known vector
@param[out] y the On exit contains the result: Ax.
@return returns 0 upon success and non-zero otherwise
@see ComputeSPMV
*/
int ComputeSPMV_ref(const SparseMatrix& A, Vector& x, Vector& y)
{
assert(x.localLength >= A.localNumberOfColumns); // Test vector lengths
assert(y.localLength >= A.localNumberOfRows);
#ifndef HPCG_NO_MPI
// Bring in off-processor (halo) values of x before the local computation.
ExchangeHalo(A, x);
#endif
const double* const xv = x.values;
double* const yv = y.values;
const local_int_t nrow = A.localNumberOfRows;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < nrow; i++)
{
// Row-wise traversal: yv[i] = sum over this row's nonzeros of A(i,j)*x(j).
double sum = 0.0;
const double* const cur_vals = A.matrixValues[i];
const local_int_t* const cur_inds = A.mtxIndL[i];
const int cur_nnz = A.nonzerosInRow[i];
for (int j = 0; j < cur_nnz; j++)
sum += cur_vals[j] * xv[cur_inds[j]];
yv[i] = sum;
}
return 0;
}

22
src/ComputeSPMV_ref.hpp Normal file
View File

@@ -0,0 +1,22 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTESPMV_REF_HPP
#define COMPUTESPMV_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Reference sparse matrix-vector product; implementation in ComputeSPMV_ref.cpp.
int ComputeSPMV_ref(const SparseMatrix& A, Vector& x, Vector& y);
#endif // COMPUTESPMV_REF_HPP

309
src/ComputeSYMGS.cpp Normal file
View File

@@ -0,0 +1,309 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeSYMGS.cpp
HPCG routine
*/
#ifdef USE_CUDA
#include "Cuda.hpp"
#endif
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#endif
#include "ComputeSPMV.hpp"
#include "ComputeSYMGS.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
/*!
Routine to compute one step of symmetric Gauss-Seidel:
Assumption about the structure of matrix A:
- Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i]
- Entries in row 'i' are ordered such that:
- lower triangular terms are stored before the diagonal element.
- upper triangular terms are stored after the diagonal element.
- No other assumptions are made about entry ordering.
Symmetric Gauss-Seidel notes:
- We use the input vector x as the RHS and start with an initial guess for y of all zeros.
- We perform one forward sweep. Since y is initially zero we can ignore the upper triangular terms of A.
- We then perform one back sweep.
- For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after
@param[in] A the known system matrix
@param[in] r the input vector
@param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep
with r as the RHS.
@return returns 0 upon success and non-zero otherwise
@warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out
of sync with other kernels.
@see ComputeSYMGS_ref
*/
#ifdef USE_CUDA
/*!
  GPU (cuSPARSE) implementation of one half of a symmetric Gauss-Seidel sweep.

  The phase is selected by 'step':
  - step == 1 (pre-smoothing): solves (D+L) t = r via SpSV, scales t by the
    diagonal, then solves (D+U) x = t. If a coarse level exists
    (A.mgData != 0), it additionally accumulates Axf += L*x (plus halo
    contributions from neighbor ranks) so the residual needed by the
    restriction is available afterwards.
  - step == 0 (post-smoothing): forms t = r - U*x (including halo
    contributions), solves (D+L) y = t, applies the diagonal into Axf, and
    finishes with the (D+U) solve back into x.

  @param[in]    A    the known system matrix; device-side cuSPARSE descriptors
                     and buffers must already be set up
  @param[in]    r    the input (RHS) vector; device values in r.values_d
  @param[inout] x    on entry the current iterate, on exit the smoothed iterate
  @param[in]    step selects pre- (1) or post- (0) smoothing phase
  @return 0 upon success
*/
int ComputeSYMGS_Gpu(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
{
    // Scratch vector: reuse the coarse-grid work vector Axf when available in
    // the pre-smoothing phase, otherwise fall back to the matrix temp buffer.
    double* tmp_d;
    if (step == 1 && A.mgData != 0)
    {
        tmp_d = (*A.mgData->Axf).values_d;
    }
    else
    {
        tmp_d = A.tempBuffer;
    }
    const local_int_t nrow = A.localNumberOfRows;
    double alpha = 1.0; // unit scale used by every SpSV/SpMV below
    cusparseFillMode_t fillmode_l = CUSPARSE_FILL_MODE_LOWER;
    cusparseFillMode_t fillmode_u = CUSPARSE_FILL_MODE_UPPER;
    if (step == 1)
    {
        // TRSV(D+L, r, t)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, r.values_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, tmp_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrL);
        // SPMV(D, t, t)
        SpmvDiagCuda(nrow, tmp_d, A.diagonal);
        // TRSV(D+U, t, x)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, tmp_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrU);
        if (A.mgData != 0)
        {
#ifndef HPCG_NO_MPI
            // Overlap: pack the halo of x on a side stream while the local
            // SpMV below runs on the main stream.
            cudaStreamSynchronize(stream);
            PackSendBufferCuda(A, x, false, copy_stream);
#endif
            // SPMV(L, x, t): t = t + L * x
            // (uses the outer unit 'alpha' for both scale factors; the former
            // inner re-declaration of alpha was a redundant shadow)
            cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
            cusparseDnVecSetValues(A.cusparseOpt.vecY, (*A.mgData->Axf).values_d);
            cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matL,
                A.cusparseOpt.vecX, &alpha, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
#ifndef HPCG_NO_MPI
            if (A.totalToBeSent > 0)
            {
                ExchangeHaloCuda(A, x, copy_stream);
                // Add contributions of the just-received halo entries of x.
                double one = 1.0;
                ExtSpMVCuda((SparseMatrix&) A, one, x.values_d + A.localNumberOfRows, (*A.mgData->Axf).values_d);
            }
#endif
        }
    }
    else
    { // step == 0
#ifndef HPCG_NO_MPI
        cudaStreamSynchronize(stream);
        PackSendBufferCuda(A, x, false, copy_stream);
#endif
        // SPMV(U, x, t): t = U * x
        double beta = 0.0; // overwrite Axf (the former local 'alpha' shadowed the outer one)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, x.values_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, (*A.mgData->Axf).values_d);
        cusparseSpMV(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matU, A.cusparseOpt.vecX,
            &beta, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, A.bufferMvA);
        // tmp = rv - t
        AxpbyCuda(nrow, r.values_d, (*A.mgData->Axf).values_d, tmp_d);
#ifndef HPCG_NO_MPI
        if (A.totalToBeSent > 0)
        {
            // MPI_Ibarrier --> will help improve MPI_Allreduce in dot product
            ExchangeHaloCuda(A, x, copy_stream, A.level == 0 ? 1 /*call MPI_Ibarrier*/ : 0);
            // Subtract halo contributions of U*x from tmp.
            double mone = -1.0;
            ExtSpMVCuda((SparseMatrix&) A, mone, x.values_d + A.localNumberOfRows, tmp_d);
        }
#endif
        // TRSV(D+L, r-t, x)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, tmp_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrL);
        // SPMV(D, x, t) t += D*x
        SpFmaCuda(nrow, x.values_d, A.diagonal, (*A.mgData->Axf).values_d);
        // TRSV(D+U, x, x)
        cusparseDnVecSetValues(A.cusparseOpt.vecX, (*A.mgData->Axf).values_d);
        cusparseDnVecSetValues(A.cusparseOpt.vecY, x.values_d);
        cusparseSpMatSetAttribute(A.cusparseOpt.matA, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        cusparseSpSV_solve(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A.cusparseOpt.matA,
            A.cusparseOpt.vecX, A.cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A.cusparseOpt.spsvDescrU);
    }
    return 0;
}
#endif
#ifdef USE_GRACE
// CPU (NVPL Sparse) implementation of one half of a symmetric Gauss-Seidel
// sweep, mirroring the structure of the CUDA path above:
//   step == 1: TRSV(L) -> diagonal scale -> TRSV(U); if a coarse level
//              exists, additionally accumulate Axf (+ halo contributions)
//              for the subsequent restriction.
//   step == 0: t = r - U*x (+ halo), TRSV(L), diagonal FMA, TRSV(U).
// NOTE(review): the call sequence reuses the shared vecX/vecY descriptors
// in a fixed order; do not reorder these calls.
int ComputeSYMGS_Cpu(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
{
    local_int_t nrow = A.localNumberOfRows;
    // Scratch vector: reuse the coarse-grid work vector Axf when available in
    // the pre-smoothing phase, otherwise use the matrix-owned temp buffer.
    double* temp;
    if (step == 1 && A.mgData != 0)
    {
        temp = (*A.mgData->Axf).values;
    }
    else
    {
        temp = A.tempBuffer;
    }
    double* xv = x.values;
    double* rv = r.values;
    double one = 1.0, zero = 0.0;
    nvpl_sparse_fill_mode_t fillmode_l = NVPL_SPARSE_FILL_MODE_LOWER;
    nvpl_sparse_fill_mode_t fillmode_u = NVPL_SPARSE_FILL_MODE_UPPER;
    if (step == 1)
    {
        // TRSV(L, r, x)
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, r.values);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
            A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
            A.nvplSparseOpt.spsvDescrL);
        // SPMV(D, x, t) t = D*x
        SpmvDiagCpu(nrow, A.diagonal, xv, temp);
        // TRSV(U, x, x)
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, temp);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
            A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
            A.nvplSparseOpt.spsvDescrU);
        if (A.mgData != 0)
        {
            // SPMV(L, x, t): t += L*x  (accumulate into Axf for restriction)
            nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, xv);
            nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, temp);
            nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
                A.nvplSparseOpt.vecX, &one, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvLDescr);
#ifndef HPCG_NO_MPI
            // Refresh halo entries of x, then add their contributions.
            ExchangeHaloCpu(A, x);
            if (A.totalToBeSent > 0)
            {
                ExtSpMVCpu(A, nrow, 1.0, xv, temp);
            }
#endif
        }
    }
    else if (step == 0)
    {
        // SPMV(U, x, t) t = U*x
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, xv);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, (*A.mgData->Axf).values);
        nvpl_sparse_spmv(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
            A.nvplSparseOpt.vecX, &zero, A.nvplSparseOpt.vecY, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
            NVPL_SPARSE_SPMV_ALG_DEFAULT, A.nvplSparseOpt.spmvUDescr);
        // axpy: t = r-t
        AxpbyCpu(nrow, rv, (*A.mgData->Axf).values, temp);
#ifndef HPCG_NO_MPI
        // MPI_Ibarrier --> will help improve MPI_Allreduce in dot product
        ExchangeHaloCpu(A, x, A.level == 0 ? 1 /*call MPI_Ibarrier*/ : 0);
        if (A.totalToBeSent > 0)
        {
            // Subtract halo contributions of U*x from tmp.
            ExtSpMVCpu(A, nrow, -1.0, xv, temp);
        }
#endif
        // TRSV(L, r-t, x)
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, temp);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matL,
            A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
            A.nvplSparseOpt.spsvDescrL);
        // SPMV(D, x, t) t += D*x
        SpFmaCpu(nrow, A.diagonal, xv, (*A.mgData->Axf).values);
        // TRSV(U, x, x)
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecX, (*A.mgData->Axf).values);
        nvpl_sparse_dn_vec_set_values(A.nvplSparseOpt.vecY, xv);
        nvpl_sparse_sp_mat_set_attribute(
            A.nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        nvpl_sparse_spsv_solve(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &one, A.nvplSparseOpt.matU,
            A.nvplSparseOpt.vecX, A.nvplSparseOpt.vecY, NVPL_SPARSE_R_64F, NVPL_SPARSE_SPSV_ALG_DEFAULT,
            A.nvplSparseOpt.spsvDescrU);
    }
    return 0;
}
#endif // USE_GRACE
/*!
  Dispatches one symmetric Gauss-Seidel smoothing step to the GPU (cuSPARSE)
  or CPU (NVPL Sparse) implementation according to the rank type.

  @param[in]    A    the known system matrix
  @param[in]    r    the input (RHS) vector
  @param[inout] x    on entry the current iterate; on exit the smoothed iterate
  @param[in]    step selects the pre- (1) or post- (0) smoothing phase
  @return the status code of the selected implementation (0 upon success);
          0 when no implementation is compiled in for this rank type
*/
int ComputeSYMGS(const SparseMatrix& A, const Vector& r, Vector& x, bool step)
{
    int ierr = 0; // stays 0 if the matching backend was not compiled in
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        ierr = ComputeSYMGS_Gpu(A, r, x, step);
#endif
    }
    else
    {
#ifdef USE_GRACE
        ierr = ComputeSYMGS_Cpu(A, r, x, step);
#endif
    }
    // Propagate the backend's status; previously it was silently discarded.
    return ierr;
}

39
src/ComputeSYMGS.hpp Normal file
View File

@@ -0,0 +1,39 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef COMPUTESYMGS_HPP
#define COMPUTESYMGS_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int ComputeSYMGS(const SparseMatrix& A, const Vector& r, Vector& x, bool step);
#endif // COMPUTESYMGS_HPP

110
src/ComputeSYMGS_ref.cpp Normal file
View File

@@ -0,0 +1,110 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeSYMGS_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#endif
#include "ComputeSYMGS_ref.hpp"
#include <cassert>
/*!
Computes one step of symmetric Gauss-Seidel:
Assumption about the structure of matrix A:
- Each row 'i' of the matrix has nonzero diagonal value whose address is matrixDiagonal[i]
- Entries in row 'i' are ordered such that:
- lower triangular terms are stored before the diagonal element.
- upper triangular terms are stored after the diagonal element.
- No other assumptions are made about entry ordering.
Symmetric Gauss-Seidel notes:
- We use the input vector x as the RHS and start with an initial guess for y of all zeros.
- We perform one forward sweep. x should be initially zero on the first GS sweep, but we do not attempt to exploit
this fact.
- We then perform one back sweep.
- For simplicity we include the diagonal contribution in the for-j loop, then correct the sum after
@param[in] A the known system matrix
@param[in] r the input vector
@param[inout] x On entry, x should contain relevant values, on exit x contains the result of one symmetric GS sweep
with r as the RHS.
@warning Early versions of this kernel (Version 1.1 and earlier) had the r and x arguments in reverse order, and out
of sync with other kernels.
@return returns 0 upon success and non-zero otherwise
@see ComputeSYMGS
*/
// Reference symmetric Gauss-Seidel sweep (forward then backward); kept
// unoptimized on purpose — it is the correctness baseline the optimized
// kernels are validated against.
int ComputeSYMGS_ref(const SparseMatrix& A, const Vector& r, Vector& x)
{
    assert(x.localLength == A.localNumberOfColumns); // Make sure x contains space for halo values
#ifndef HPCG_NO_MPI
    ExchangeHalo(A, x); // refresh off-rank (halo) entries of x before smoothing
#endif
    const local_int_t nrow = A.localNumberOfRows;
    double** matrixDiagonal = A.matrixDiagonal; // An array of pointers to the diagonal entries A.matrixValues
    const double* const rv = r.values;
    double* const xv = x.values;
    // Forward sweep: rows are updated in increasing order, so each row sees
    // the values of xv already updated earlier in this same sweep.
    for (local_int_t i = 0; i < nrow; i++)
    {
        const double* const currentValues = A.matrixValues[i];
        const local_int_t* const currentColIndices = A.mtxIndL[i];
        const int currentNumberOfNonzeros = A.nonzerosInRow[i];
        const double currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value
        double sum = rv[i];                                  // RHS value
        // Subtract the full row dot product, diagonal included ...
        for (int j = 0; j < currentNumberOfNonzeros; j++)
        {
            local_int_t curCol = currentColIndices[j];
            sum -= currentValues[j] * xv[curCol];
        }
        sum += xv[i] * currentDiagonal; // Remove diagonal contribution from previous loop
        xv[i] = sum / currentDiagonal;
    }
    // Now the back sweep: same update, rows in decreasing order.
    for (local_int_t i = nrow - 1; i >= 0; i--)
    {
        const double* const currentValues = A.matrixValues[i];
        const local_int_t* const currentColIndices = A.mtxIndL[i];
        const int currentNumberOfNonzeros = A.nonzerosInRow[i];
        const double currentDiagonal = matrixDiagonal[i][0]; // Current diagonal value
        double sum = rv[i];                                  // RHS value
        for (int j = 0; j < currentNumberOfNonzeros; j++)
        {
            local_int_t curCol = currentColIndices[j];
            sum -= currentValues[j] * xv[curCol];
        }
        sum += xv[i] * currentDiagonal; // Remove diagonal contribution from previous loop
        xv[i] = sum / currentDiagonal;
    }
    return 0;
}

22
src/ComputeSYMGS_ref.hpp Normal file
View File

@@ -0,0 +1,22 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTESYMGS_REF_HPP
#define COMPUTESYMGS_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
int ComputeSYMGS_ref(const SparseMatrix& A, const Vector& r, Vector& x);
#endif // COMPUTESYMGS_REF_HPP

89
src/ComputeWAXPBY.cpp Normal file
View File

@@ -0,0 +1,89 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ComputeWAXPBY.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include "mytimer.hpp"
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#ifdef USE_CUDA
#include "Cuda.hpp"
#endif
#include "ComputeWAXPBY.hpp"
#include "ComputeWAXPBY_ref.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
#include "SparseMatrix.hpp"
/*!
Routine to compute the update of a vector with the sum of two
scaled vectors where: w = alpha*x + beta*y
This routine calls the reference WAXPBY implementation by default, but
can be replaced by a custom, optimized routine suited for
the target system.
@param[in] n the number of vector elements (on this processor)
@param[in] alpha, beta the scalars applied to x and y respectively.
@param[in] x, y the input vectors
@param[out] w the output vector
@param[out] isOptimized should be set to false if this routine uses the reference implementation (is not optimized);
otherwise leave it unchanged
@return returns 0 upon success and non-zero otherwise
@see ComputeWAXPBY_ref
*/
/*!
  Computes w = alpha*x + beta*y, dispatching to the CUDA or Grace (CPU)
  kernel according to the rank type 'rt'.

  @param[in]  n            number of local vector elements
  @param[in]  alpha, beta  scalars applied to x and y respectively
  @param[in]  x, y         input vectors
  @param[out] w            output vector
  @param[out] isOptimized  forwarded to the CPU kernel (left untouched on GPU)
  @param[in]  rt           rank type selecting the backend
  @return 0 upon success
*/
int ComputeWAXPBY(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
    Vector& w, bool& isOptimized, rank_type_t rt)
{
    const bool runsOnGpu = (rt == GPU);
    if (!runsOnGpu)
    {
#ifdef USE_GRACE
        ComputeWAXPBYCpu(n, alpha, x, beta, y, w, isOptimized);
#endif
    }
    else
    {
#ifdef USE_CUDA
        ComputeWAXPBYCuda(n, alpha, x, beta, y, w);
#endif
    }
    return 0;
}

39
src/ComputeWAXPBY.hpp Normal file
View File

@@ -0,0 +1,39 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef COMPUTEWAXPBY_HPP
#define COMPUTEWAXPBY_HPP
#include "Vector.hpp"
int ComputeWAXPBY(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
Vector& w, bool& isOptimized, rank_type_t rt);
#endif // COMPUTEWAXPBY_HPP

79
src/ComputeWAXPBY_ref.cpp Normal file
View File

@@ -0,0 +1,79 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file ComputeWAXPBY_ref.cpp
HPCG routine
*/
#include "ComputeWAXPBY_ref.hpp"
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include <cassert>
/*!
Routine to compute the update of a vector with the sum of two
scaled vectors where: w = alpha*x + beta*y
This is the reference WAXPBY implementation. It CANNOT be modified for the
purposes of this benchmark.
@param[in] n the number of vector elements (on this processor)
@param[in] alpha, beta the scalars applied to x and y respectively.
@param[in] x, y the input vectors
@param[out] w the output vector.
@return returns 0 upon success and non-zero otherwise
@see ComputeWAXPBY
*/
// Reference w = alpha*x + beta*y. Per the HPCG rules this routine must not
// be modified; the alpha==1 / beta==1 special cases merely skip one multiply
// per element.
int ComputeWAXPBY_ref(
    const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w)
{
    assert(x.localLength >= n); // Test vector lengths
    assert(y.localLength >= n);
    const double* const xv = x.values;
    const double* const yv = y.values;
    double* const wv = w.values;
    if (alpha == 1.0)
    {
        // w = x + beta*y
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < n; i++)
            wv[i] = xv[i] + beta * yv[i];
    }
    else if (beta == 1.0)
    {
        // w = alpha*x + y
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < n; i++)
            wv[i] = alpha * xv[i] + yv[i];
    }
    else
    {
        // General case: w = alpha*x + beta*y
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < n; i++)
            wv[i] = alpha * xv[i] + beta * yv[i];
    }
    return 0;
}

20
src/ComputeWAXPBY_ref.hpp Normal file
View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef COMPUTEWAXPBY_REF_HPP
#define COMPUTEWAXPBY_REF_HPP
#include "Vector.hpp"
int ComputeWAXPBY_ref(
const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w);
#endif // COMPUTEWAXPBY_REF_HPP

1351
src/CpuKernels.cpp Normal file

File diff suppressed because it is too large Load Diff

92
src/CpuKernels.hpp Normal file
View File

@@ -0,0 +1,92 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CPUKERNELS_HPP
#define CPUKERNELS_HPP
#ifdef USE_GRACE
#include <nvpl_sparse.h>
extern nvpl_sparse_handle_t nvpl_sparse_handle;
#include "SparseMatrix.hpp"
#include "Vector.hpp"
#include <algorithm>
#include <random>
#include <vector>
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
///////// Deallocate CPU Memory for data structures //
void DeleteMatrixCpu(SparseMatrix& A);
///////// Find the size of CPU reference allocated memory //
size_t EstimateCpuRefMem(SparseMatrix& A);
/*
Translation of a 3D point in all directions
27 possibilities
*/
constexpr int tid2indCpu[32][4] = {{-1, -1, -1, 0}, {0, -1, -1, 0}, {1, -1, -1, 0}, {-1, 0, -1, 0}, {0, 0, -1, 0},
{1, 0, -1, 0}, {-1, 1, -1, 0}, {0, 1, -1, 0}, {1, 1, -1, 0}, {-1, -1, 0, 0}, {0, -1, 0, 0}, {1, -1, 0, 0},
{-1, 0, 0, 0}, {0, 0, 0, 0}, {1, 0, 0, 0}, {-1, 1, 0, 0}, {0, 1, 0, 0}, {1, 1, 0, 0}, {-1, -1, 1, 0}, {0, -1, 1, 0},
{1, -1, 1, 0}, {-1, 0, 1, 0}, {0, 0, 1, 0}, {1, 0, 1, 0}, {-1, 1, 1, 0}, {0, 1, 1, 0}, {1, 1, 1, 0}, {0, 0, 0, 0},
{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}};
// Generate Problem
// Inclusive Prefix Sum
void PrefixsumCpu(int* x, int N);
// Optimize Problem
size_t AllocateMemCpu(SparseMatrix& A_in);
void ColorMatrixCpu(SparseMatrix& A, int* num_colors);
void CreateSellPermCpu(SparseMatrix& A);
void F2cPermCpu(local_int_t nrow_c, local_int_t* f2c, local_int_t* f2c_perm, local_int_t* perm_f, local_int_t* iperm_c);
// Permute a vector using coloring buffer
void PermVectorCpu(local_int_t* perm, Vector& x, local_int_t length);
// Test CG
void ReplaceMatrixDiagonalCpu(SparseMatrix& A, Vector diagonal);
// CG Support Kernels
// Dot-product Per single rank
void ComputeDotProductCpu(const local_int_t n, const Vector& x, const Vector& y, double& result, bool& isOptimized);
// WAXPBY
int ComputeWAXPBYCpu(const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y,
Vector& w, bool& isOptimized);
// SYMGS
void SpmvDiagCpu(local_int_t n, const double* x, double* y, double* z);
void AxpbyCpu(local_int_t n, double* x, double* y, double* z);
void SpFmaCpu(local_int_t n, const double* x, double* y, double* z);
// External Matrix SpMV + Scatter
void ExtSpMVCpu(const SparseMatrix& A, const local_int_t n, const double alpha, const double* x, double* y);
#endif // USE_GRACE
#endif // CPUKERNELS_HPP

87
src/Cuda.hpp Normal file
View File

@@ -0,0 +1,87 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef USE_CUDA
#include "cublas_v2.h"
#include "cuda_runtime_api.h"
#include "cusparse.h"
#include <cuda.h>
#ifdef USE_NCCL
#include "nccl.h"
#endif
#ifdef USE_NVTX
#include <nvToolsExt.h>
#endif
#include <unistd.h>
extern cusparseHandle_t cusparsehandle;
extern cublasHandle_t cublashandle;
extern cudaStream_t stream;
extern cudaEvent_t copy_done;
extern cudaStream_t copy_stream;
extern int* ranktoId; // DEV:Compress rank in MPI_WORLD to Neighbors
extern int* rankToId_h; // HOST:Compress rank in MPI_WORLD to Neighbors
extern int* idToRank_h;
extern bool Use_Compression; /*USE CUDA L2 compression*/
extern bool Use_Hpcg_Mem_Reduction; /*USE HPCG aggresive memory reduction*/
#endif
#ifdef USE_CUDA
#define CHECK_CUDART(x) \
do \
{ \
cudaError_t res = (x); \
if (res != cudaSuccess) \
{ \
char rank_name[1024]; \
gethostname(rank_name, 1024); \
fprintf(stderr, "CUDART: %s = %d (%s) on %s at (%s:%d)\n", #x, res, cudaGetErrorString(res), rank_name, \
__FILE__, __LINE__); \
exit(1); \
} \
} while (0)
// IF NVTX is needed for profiling, please define USE_NVTX
// Then, add PUSH_RANGE and POP_RANGE around the target code block
// See, https://developer.nvidia.com/blog/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/
// #define USE_NVTX
#ifdef USE_NVTX
const uint32_t colors[] = {0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff};
const int num_colors = sizeof(colors) / sizeof(uint32_t);
#define PUSH_RANGE(name, cid) \
{ \
int color_id = cid; \
color_id = color_id % num_colors; \
nvtxEventAttributes_t eventAttrib = {0}; \
eventAttrib.version = NVTX_VERSION; \
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
eventAttrib.colorType = NVTX_COLOR_ARGB; \
eventAttrib.color = colors[color_id]; \
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
eventAttrib.message.ascii = name; \
nvtxRangePushEx(&eventAttrib); \
}
#define POP_RANGE nvtxRangePop();
#else
#define PUSH_RANGE(name, cid) \
{ \
}
#define POP_RANGE
#endif
#endif

2613
src/CudaKernels.cu Normal file

File diff suppressed because it is too large Load Diff

92
src/CudaKernels.hpp Normal file
View File

@@ -0,0 +1,92 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef USE_CUDA
#include "SparseMatrix.hpp"
///////// L2 Memory Compression Allocation Support Routines //
cudaError_t setProp(CUmemAllocationProp* prop);
cudaError_t cudaMallocCompressible(void** adr, size_t size);
cudaError_t cudaFreeCompressible(void* ptr, size_t size);
///////// Allocate CUDA Memory for data structures //
local_int_t EstimateLUmem(local_int_t n, local_int_t padded_n, local_int_t level);
void AllocateMemCuda(SparseMatrix& A_in);
void AllocateMemOptCuda(SparseMatrix& A_in);
///////// Deallocate CUDA Memory for data structures //
void DeleteMatrixGpu(SparseMatrix& A);
///////// Genrerate Problem //
void GenerateProblemCuda(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
// Halo Exchange
void SetupHaloCuda(SparseMatrix& A, local_int_t sendbufld, local_int_t* sendlen, local_int_t* sendbuff,
local_int_t* tot_to_send, int* nneighs, int* neighs_h, local_int_t* sendlen_h, local_int_t** elem_to_send_d);
void ExtToLocMapCuda(
local_int_t localNumberOfRows, local_int_t str, local_int_t end, local_int_t* extToLocMap, local_int_t* eltsToRecv);
void ExtTolocCuda(local_int_t localNumberOfRows, int neighborId, local_int_t ext_nnz, local_int_t* csr_ext_columns,
double* csr_ext_values, local_int_t* ext2csr_offsets, local_int_t* extToLocMap, local_int_t* csrColumns);
void PackSendBufferCuda(const SparseMatrix& A, Vector& x, bool cpu_data, cudaStream_t stream1);
void ExchangeHaloCuda(const SparseMatrix& A, Vector& x, cudaStream_t stream1, int use_ibarrier = 0);
// Optimize Problem
void SetVectorAscCuda(local_int_t* arr, local_int_t n);
void ColorMatrixCuda(double* A_vals, local_int_t* A_col, local_int_t* nnzPerRow, local_int_t rows, local_int_t* color,
int* num_colors, int* count_colors, int max_colors, local_int_t* ref2opt, local_int_t* opt2ref, int rank, int nx,
int* rowhash);
void PermElemToSendCuda(local_int_t totalToBeSent, local_int_t* elementsToSend, local_int_t* perm);
void EllPermColumnsValuesCuda(local_int_t localNumberOfRows, local_int_t* nnzPerRow, local_int_t* csrColumns,
double* csrValues, local_int_t* permOffsets, local_int_t* permColumns, double* permValues, local_int_t* opt2ref,
local_int_t* ref2opt, local_int_t* diagonalIdx, local_int_t* permLOffsets, local_int_t* permUOffsets, bool diag);
void TransposeCuda(local_int_t n, local_int_t slice_size, local_int_t* sellCollIndex, double* sellValues);
void EllMaxRowLenPerBlockCuda(local_int_t nrow, int sliceSize, local_int_t* sellLPermOffsets,
local_int_t* sellUPermOffsets, local_int_t* sellLSliceMrl, local_int_t* sellUSliceMrl);
void PrefixsumCuda(local_int_t localNumberOfRows, local_int_t* arr);
void MultiplyBySliceSizeCUDA(local_int_t nrow, int slice_size, local_int_t* arr);
void CreateAMatrixSliceOffsetsCuda(local_int_t nrow, local_int_t slice_size, local_int_t* arr);
void CreateSellLUColumnsValuesCuda(const local_int_t n, int sliceSize, local_int_t* columns, double* values,
local_int_t* sellLSliceOffset, local_int_t* sellLColumns, double* sellLValues, local_int_t* sellUSliceOffset,
local_int_t* sellUColumns, double* sellUValues, int level);
void PermVectorCuda(local_int_t* perm, Vector& x, local_int_t length);
void F2cPermCuda(local_int_t nrow_c, local_int_t* f2c, local_int_t* f2cPerm, local_int_t* permF, local_int_t* ipermC);
// Test CG
void ReplaceMatrixDiagonalCuda(SparseMatrix& A, Vector& diagonal);
void CopyMatrixDiagonalCuda(SparseMatrix& A, Vector& diagonal);
// CG Support Kernels
// 1. MG
void ComputeRestrictionCuda(const SparseMatrix& A, const Vector& r);
void ComputeProlongationCuda(const SparseMatrix& A, Vector& x);
// 2. WAXPBY
void ComputeWAXPBYCuda(
const local_int_t n, const double alpha, const Vector& x, const double beta, const Vector& y, Vector& w);
// 3.SYMGS
void SpmvDiagCuda(local_int_t n, double* x, double* d);
void AxpbyCuda(local_int_t n, double* x, double* y, double* z);
void SpFmaCuda(local_int_t n, double* x, double* y, double* z);
// 4.External Matrix SpMV + Scatter
void ExtSpMVCuda(SparseMatrix& A, double alpha, double* x, double* y);
// Transfer Problem to CPU
size_t CopyDataToHostCuda(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
#endif

205
src/ExchangeHalo.cpp Normal file
View File

@@ -0,0 +1,205 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ExchangeHalo.cpp
HPCG routine
*/
// Compile this routine only if running with MPI
#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#include "Geometry.hpp"
#include <cstdlib>
#include <mpi.h>
extern p2p_comm_mode_t P2P_Mode;
/*!
Communicates data that is at the border of the part of the domain assigned to this processor.
@param[in] A The known system matrix
@param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
non-local entries updated by other processors
*/
void ExchangeHalo(const SparseMatrix& A, Vector& x)
{
local_int_t localNumberOfRows = A.localNumberOfRows;
int num_neighbors = A.numberOfSendNeighbors;
local_int_t * receiveLength = A.receiveLength;
local_int_t * sendLength = A.sendLength;
int * neighbors = A.neighbors;
double * sendBuffer = A.sendBuffer;
local_int_t totalToBeSent = A.totalToBeSent;
local_int_t * elementsToSend = A.elementsToSend;
double * const xv = x.values;
int size, rank; // Number of MPI processes, My process ID
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//
// first post receives, these are immediate receives
// Do not wait for result to come, will do that at the
// wait call below.
//
int MPI_MY_TAG = 99;
MPI_Request * request = new MPI_Request[num_neighbors];
//
// Externals are at end of locals
//
double * x_external = (double *) xv + localNumberOfRows;
// Post receives first
// TODO: Thread this loop
for (int i = 0; i < num_neighbors; i++) {
local_int_t n_recv = receiveLength[i];
MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request+i);
x_external += n_recv;
}
//
// Fill up send buffer
//
// TODO: Thread this loop
for (local_int_t i=0; i<totalToBeSent; i++) sendBuffer[i] = xv[elementsToSend[i]];
//
// Send to each neighbor
//
// TODO: Thread this loop
for (int i = 0; i < num_neighbors; i++) {
local_int_t n_send = sendLength[i];
MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
sendBuffer += n_send;
}
//
// Complete the reads issued above
//
MPI_Status status;
// TODO: Thread this loop
for (int i = 0; i < num_neighbors; i++) {
if ( MPI_Wait(request+i, &status) ) {
std::exit(-1); // TODO: have better error exit
}
}
delete [] request;
return;
}
/*!
Communicates data that is at the border of the part of the domain assigned to this processor. A more optimized version of ExchangeHalo that is used for the Grace path.
@param[in] A The known system matrix
@param[inout] x On entry: the local vector entries followed by entries to be communicated; on exit: the vector with
non-local entries updated by other processors
@param[in] use_ibarrier [Experimental] If 1, call MPI_Ibarrier after the communication is complete. A smart trick to improve MPI_Allreduce in DDOT,
by calling MPI_Ibarrier once at the last routine call in MG.
*/
void ExchangeHaloCpu(const SparseMatrix& A, Vector& x, int use_ibarrier)
{
// Extract Matrix pieces
local_int_t localNumberOfRows = A.localNumberOfRows;
int num_neighbors = A.numberOfSendNeighbors;
local_int_t* receiveLength = A.receiveLength;
local_int_t* sendLength = A.sendLength;
int* neighbors = A.neighborsPhysical;
double* sendBuffer = A.sendBuffer;
local_int_t totalToBeSent = A.totalToBeSent;
local_int_t* elementsToSend = A.elementsToSend;
if (P2P_Mode == MPI_CPU)
{
double* const xv = x.values;
double* x_external = (double*) xv + localNumberOfRows;
int MPI_MY_TAG = 99;
MPI_Request* request = new MPI_Request[num_neighbors];
// Post receives first
for (int i = 0; i < num_neighbors; i++)
{
local_int_t n_recv = receiveLength[i];
MPI_Irecv(x_external, n_recv, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
x_external += n_recv;
}
for (local_int_t i = 0; i < totalToBeSent; i++)
sendBuffer[i] = xv[elementsToSend[i]];
//
// Send to each neighbor
//
for (int i = 0; i < num_neighbors; i++)
{
local_int_t n_send = sendLength[i];
MPI_Send(sendBuffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
sendBuffer += n_send;
}
//
// Complete the reads issued above
//
MPI_Waitall(num_neighbors, request, MPI_STATUSES_IGNORE);
//[Experimental] Can improve MPI_Allreduce performance
#if 0
if (use_ibarrier == 1)
MPI_Ibarrier(MPI_COMM_WORLD, request);
#endif
delete[] request;
}
else if (P2P_Mode == MPI_CPU_All2allv)
{
double* const xv = x.values;
double* x_external = (double*) xv + localNumberOfRows;
for (local_int_t i = 0; i < totalToBeSent; i++)
sendBuffer[i] = xv[elementsToSend[i]];
MPI_Alltoallv(
sendBuffer, A.scounts, A.sdispls, MPI_DOUBLE, x_external, A.rcounts, A.rdispls, MPI_DOUBLE, MPI_COMM_WORLD);
}
return;
}
#endif
// ifndef HPCG_NO_MPI

38
src/ExchangeHalo.hpp Normal file
View File

@@ -0,0 +1,38 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef EXCHANGEHALO_HPP
#define EXCHANGEHALO_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Reference halo exchange: updates the non-local (ghost) entries of x using
// the communication pattern stored in A (Irecv / pack / Send / Wait).
void ExchangeHalo(const SparseMatrix& A, Vector& x);
// Optimized CPU-path halo exchange; dispatches on the global P2P_Mode
// (point-to-point or MPI_Alltoallv). use_ibarrier is experimental — the
// corresponding MPI_Ibarrier call is currently compiled out.
void ExchangeHaloCpu(const SparseMatrix& A, Vector& x, int use_ibarrier = 0);
#endif // EXCHANGEHALO_HPP

View File

@@ -0,0 +1,158 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file GenerateProblem.cpp
HPCG routine
*/
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "GenerateCoarseProblem.hpp"
#include "GenerateGeometry.hpp"
#include "GenerateProblem.hpp"
#include "SetupHalo.hpp"
#include <cassert>
#ifndef HPCG_NO_MPI
// Used to find ranks for CPU and GPU programs
extern int global_total_ranks;
extern int* physical_rank_dims;
#endif
/*!
Routine to construct a prolongation/restriction operator for a given fine grid matrix
solution (as computed by a direct solver).
@param[inout] Af - The known system matrix, on output its coarse operator, fine-to-coarse operator and auxiliary
vectors will be defined.
Note that the matrix Af is considered const because the attributes we are modifying are declared as mutable.
*/
void GenerateCoarseProblem(const SparseMatrix& Af)
{
    // Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nxf = Af.geom->nx;
    global_int_t nyf = Af.geom->ny;
    global_int_t nzf = Af.geom->nz;
    local_int_t nxc, nyc, nzc; // Coarse nx, ny, nz
    assert(nxf % 2 == 0);
    assert(nyf % 2 == 0);
    assert(nzf % 2 == 0); // Need fine grid dimensions to be divisible by 2
    nxc = nxf / 2;
    nyc = nyf / 2;
    nzc = nzf / 2;
    // Fine-to-coarse map; ownership is handed to InitializeMGData below
    // (presumably freed with the MG data — TODO confirm).
    local_int_t* f2cOperator = new local_int_t[Af.localNumberOfRows];
    local_int_t localNumberOfRows = nxc * nyc * nzc; // This is the size of our subblock
    // If this assert fails, it most likely means that local_int_t is set to int and should be set to long long
    assert(localNumberOfRows
        > 0); // Throw an exception if the number of rows is less than zero (can happen if "int" overflows)
    // Halve every rank's recorded dimensions so the global table matches the
    // coarse level before GenerateGeometry reads it. NOTE(review): this
    // mutates shared global state; it must run exactly once per level.
    for (int i = 0; i < 3 * global_total_ranks; i++)
        physical_rank_dims[i] = physical_rank_dims[i] / 2;
    // Construct the geometry and linear system
    Geometry* geomc = new Geometry;
    GenerateGeometry(Af.geom->size, Af.geom->rank, Af.geom->numThreads, nxc, nyc, nzc, Af.geom->npx, Af.geom->npy,
        Af.geom->npz, Af.geom->different_dim, geomc);
    Vector* rc = new Vector;   // coarse residual
    Vector* xc = new Vector;   // coarse solution/correction
    Vector* Axf = new Vector;  // A*x on the fine level (sized for fine columns)
    MGData* mgData = new MGData;
    if (Af.rankType == GPU)
    {
        // GPU path: the coarse matrix object already exists on Af; the
        // fine-to-coarse map was built on the device and is copied back here.
        SparseMatrix* Ac = Af.Ac;
        Ac->rankType = GPU;
        InitializeSparseMatrix(*Ac, geomc);
        GenerateProblem(*Ac, 0, 0, 0);
        SetupHalo(*Ac);
        InitializeVector(*rc, Ac->localNumberOfRows, Ac->rankType);
        InitializeVector(*xc, Ac->localNumberOfColumns, Ac->rankType);
        InitializeVector(*Axf, Af.localNumberOfColumns, Ac->rankType);
#ifdef USE_CUDA
        cudaMemcpy(f2cOperator, Af.gpuAux.f2c, sizeof(local_int_t) * localNumberOfRows, cudaMemcpyDeviceToHost);
#endif
    }
    else
    {
        // CPU path: allocate the coarse matrix and compute f2c on the host.
        SparseMatrix* Ac = new SparseMatrix;
        InitializeSparseMatrix(*Ac, geomc);
        Ac->rankType = CPU;
        (*Ac).Ac = 0;
        GenerateProblem(*Ac, 0, 0, 0);
        SetupHalo(*Ac);
        InitializeVector(*rc, Ac->localNumberOfRows, Ac->rankType);
        InitializeVector(*xc, Ac->localNumberOfColumns, Ac->rankType);
        InitializeVector(*Axf, Af.localNumberOfColumns, Ac->rankType);
        Af.Ac = Ac; // legal: Ac is a mutable member of the const Af
        // Use a parallel loop to do initial assignment:
        // distributes the physical placement of arrays of pointers across the memory system
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < localNumberOfRows; ++i)
        {
            f2cOperator[i] = 0;
        }
        // Each coarse row maps to the fine row at twice its (ix,iy,iz) index.
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < nzc * nyc * nxc; i++)
        {
            local_int_t izc = (i / (nxc * nyc));
            local_int_t iyc = (i - izc * nxc * nyc) / nxc;
            local_int_t ixc = i - (izc * nyc + iyc) * nxc;
            local_int_t izf = 2 * izc;
            local_int_t iyf = 2 * iyc;
            local_int_t ixf = 2 * ixc;
            local_int_t currentCoarseRow = izc * nxc * nyc + iyc * nxc + ixc;
            local_int_t currentFineRow = izf * nxf * nyf + iyf * nxf + ixf;
            f2cOperator[currentCoarseRow] = currentFineRow;
        }
    }
    InitializeMGData(f2cOperator, rc, xc, Axf, *mgData);
    Af.mgData = mgData; // mgData is a mutable member of the const Af
    return;
}

View File

@@ -0,0 +1,19 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef GENERATECOARSEPROBLEM_HPP
#define GENERATECOARSEPROBLEM_HPP
#include "SparseMatrix.hpp"
void GenerateCoarseProblem(const SparseMatrix& A);
#endif // GENERATECOARSEPROBLEM_HPP

801
src/GenerateGeometry.cpp Normal file
View File

@@ -0,0 +1,801 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file GenerateGeometry.cpp
HPCG routine
*/
#include <cassert>
#include <cmath>
#include <cstdlib>
#include "ComputeOptimalShapeXYZ.hpp"
#include "GenerateGeometry.hpp"
#include <cstdio>
#ifdef HPCG_DEBUG
#include "hpcg.hpp"
#include <fstream>
using std::endl;
#endif
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_MPI
// Used to find ranks for CPU and GPU programs
extern int global_total_ranks;
extern int* physical_rank_dims;
extern int* logical_rank_to_phys;
#endif
/*!
Computes the factorization of the total number of processes into a
3-dimensional process grid that is as close as possible to a cube. The
quality of the factorization depends on the prime number structure of the
total number of processes. It then stores this decomposition together with the
parallel parameters of the run in the geometry data structure.
@param[in] size total number of MPI processes
@param[in] rank this process' rank among other MPI processes
@param[in] numThreads number of OpenMP threads in this process
@param[in] nx, ny, nz number of grid points for each local block in the x, y, and z dimensions, respectively
@param[out] geom data structure that will store the above parameters and the factoring of total number of processes
into three dimensions
*/
// Level 0 Generation, we need to decide nx, ny, nz based on
// G2C ratio and npx, npy, npz
// Remap rank IDs to logical IDs to enforce 3D shape correctness when exec_mode is GPUCPU
void GenerateGeometry(HPCG_Params& params, Geometry* geom)
{
    int size = params.comm_size, rank = params.comm_rank; // Number of MPI processes, My process ID
    int nx = params.nx, ny = params.ny, nz = params.nz;
    int npx = params.npx, npy = params.npy, npz = params.npz;
    // If npx, npy, and npz are not provided by the user, find the optimal shape
    if (npx * npy * npz <= 0 || npx * npy * npz > size)
        ComputeOptimalShapeXYZ(size, npx, npy, npz);
    // When search_for_same0 is true, finds the next rank whose local problem size
    // matches rank 0's. When false, finds the next rank whose size differs from
    // rank 0's. Records the match in logical_rank_to_phys[lp] and returns the
    // physical index to resume the next search from.
    auto loop_over_ranks = [](int index, int lp, bool search_for_same0) -> int
    {
        for (int p = index; p < global_total_ranks; p++)
        {
            int nnpx = physical_rank_dims[3 * p];
            int nnpy = physical_rank_dims[3 * p + 1];
            int nnpz = physical_rank_dims[3 * p + 2];
            bool same_zero = false;
            if (nnpx == physical_rank_dims[0] && nnpy == physical_rank_dims[1] && nnpz == physical_rank_dims[2])
                same_zero = true;
            if (same_zero == search_for_same0)
            {
                logical_rank_to_phys[lp] = p;
                index = p + 1;
                break;
            }
        }
        return index;
    };
    // Shrink one local dimension according to the requested GPU/CPU work split
    // (local_problem_def and g2c). Only the branch matching this rank's type
    // fires. This logic was previously copy-pasted for each of X, Y, Z in both
    // the user-selected and the automatic paths.
    auto apply_g2c_split = [&params](int& dim)
    {
        if (params.local_problem_def == GPU_RATIO)
        {
            if (params.rank_type == CPU)
                dim = dim / params.g2c;
        }
        else if (params.local_problem_def == GPU_ABS)
        {
            if (params.rank_type == CPU)
                dim = params.g2c;
        }
        else if (params.local_problem_def == GPU_CPU_RATIO)
        {
            if (params.rank_type == CPU)
                dim = dim / params.g2c;
            if (params.rank_type == GPU)
                dim = dim - (dim / params.g2c);
        }
        else
        { /*GPU_CPU_ABS*/
            if (params.rank_type == CPU)
                dim = params.g2c;
            if (params.rank_type == GPU)
                dim = dim - params.g2c;
        }
    };
    // Here decide and broadcast nx, ny, nz
    // 1 Check for GPU and CPU execution modes
    auto user_diff_dim = NONE;
    if (params.exec_mode == GPUCPU)
    {
        // Honor the user's preferred split dimension first; a dimension is only
        // usable when the process count along it is even.
        if (params.diff_dim == Z && (npz & 1) == 0)
        {
            user_diff_dim = Z;
            apply_g2c_split(nz);
        }
        else if (params.diff_dim == Y && (npy & 1) == 0)
        {
            user_diff_dim = Y;
            apply_g2c_split(ny);
        }
        else if (params.diff_dim == X && (npx & 1) == 0)
        {
            user_diff_dim = X;
            apply_g2c_split(nx);
        }
        // Automatic partition direction, used when the user did not specify a
        // usable diff dimension: prefer Z, then Y, then X.
        if (user_diff_dim == NONE)
        {
            if ((npz & 1) == 0)
                apply_g2c_split(nz);
            else if ((npy & 1) == 0)
                apply_g2c_split(ny);
            else if ((npx & 1) == 0)
                apply_g2c_split(nx);
        }
    }
    // Now let us exchange dimensions
    int sendBuf[] = {nx, ny, nz};
#ifndef HPCG_NO_MPI
    MPI_Allgather(sendBuf, 3, MPI_INT, physical_rank_dims, 3, MPI_INT, MPI_COMM_WORLD);
#endif
    // My logical rank Id. Initialized to 0 so it is well-defined even if the
    // lookup below finds no match (it was previously read uninitialized).
    int logical_rank = 0;
    // last physical position for the rank that has the same size as 0
    int same_as_0_position = 0;
    // last physical position for the rank that does not have the same size as 0
    int not_same_as_0_position = 0;
    auto different_dim = NONE;
    bool all_same = true;
    int num_ranks_same = 1;
    int num_ranks_not_same = 0;
    int x0 = physical_rank_dims[0];
    int y0 = physical_rank_dims[1];
    int z0 = physical_rank_dims[2];
    // Count how many ranks share rank 0's local dimensions.
    for (int p = 1; p < global_total_ranks; p++)
    {
        int x = physical_rank_dims[3 * p];
        int y = physical_rank_dims[3 * p + 1];
        int z = physical_rank_dims[3 * p + 2];
        if (x != x0 || y != y0 || z != z0)
            num_ranks_not_same++;
        else
            num_ranks_same++;
    }
    if (num_ranks_not_same > 0)
        all_same = false;
    if (!all_same)
    {
        // Determine which dimension actually differs across ranks.
        // Try twice: user-based first, then automatic.
        for (int i = 0; i < 2; i++)
        {
            bool z_condition = (i == 0) ? user_diff_dim == Z && (npz & 1) == 0 : (npz & 1) == 0;
            bool y_condition = (i == 0) ? user_diff_dim == Y && (npy & 1) == 0 : (npy & 1) == 0;
            bool x_condition = (i == 0) ? user_diff_dim == X && (npx & 1) == 0 : (npx & 1) == 0;
            if (z_condition)
            { // Z is even; all ranks must agree on X and Y
                different_dim = Z;
                for (int p = 1; p < global_total_ranks; p++)
                    assert(physical_rank_dims[3 * p] == x0 && physical_rank_dims[3 * p + 1] == y0);
            }
            else if (y_condition)
            { // Y is even; all ranks must agree on X and Z
                different_dim = Y;
                for (int p = 1; p < global_total_ranks; p++)
                    assert(physical_rank_dims[3 * p] == x0 && physical_rank_dims[3 * p + 2] == z0);
            }
            else if (x_condition)
            { // X is even; all ranks must agree on Y and Z
                different_dim = X;
                for (int p = 1; p < global_total_ranks; p++)
                    assert(physical_rank_dims[3 * p + 2] == z0 && physical_rank_dims[3 * p + 1] == y0);
            }
            if (z_condition || y_condition || x_condition)
                break;
        }
    }
    // When exec_mode is GPUCPU, GPU and CPU ranks can have different dims. Therefore,
    // we must rearrange the ranks such that the 3D shape is correct: ranks whose size
    // matches rank 0 occupy the even layers along the split dimension, the others the
    // odd layers (until the same-sized ranks run out).
    int same_rank_counter = 0;
    if (different_dim != NONE)
    {
        for (int iz = 0; iz < npz; iz++)
            for (int iy = 0; iy < npy; iy++)
                for (int ix = 0; ix < npx; ix++)
                {
                    int logical_position = iz * npy * npx + iy * npx + ix;
                    // Index along the split dimension; its parity picks the layer.
                    int idx_along_diff = (different_dim == Z) ? iz : (different_dim == Y) ? iy : ix;
                    if ((idx_along_diff & 1) == 0 && same_rank_counter < num_ranks_same)
                    { // same as 0
                        same_as_0_position = loop_over_ranks(same_as_0_position, logical_position, true);
                        same_rank_counter++;
                    }
                    else
                    { // Not same as 0
                        not_same_as_0_position = loop_over_ranks(not_same_as_0_position, logical_position, false);
                    }
                }
    }
    else
    {
        // Keep rank Ids the same if all ranks have the same problem size
        for (int p = 0; p < global_total_ranks; p++)
            logical_rank_to_phys[p] = p;
    }
    // Translate my physical rank into its logical id.
    for (int p = 0; p < global_total_ranks; p++)
    {
        if (rank == logical_rank_to_phys[p])
        {
            logical_rank = p;
        }
    }
    // Now compute this process's indices in the 3D cube
    int ipz = logical_rank / (npx * npy);
    int ipy = (logical_rank - ipz * npx * npy) / npx;
    int ipx = logical_rank % npx;
#ifdef HPCG_DEBUG
    if (rank == 0)
        HPCG_fout << "size = " << size << endl
                  << "nx = " << nx << endl
                  << "ny = " << ny << endl
                  << "nz = " << nz << endl
                  << "npx = " << npx << endl
                  << "npy = " << npy << endl
                  << "npz = " << npz << endl;
    HPCG_fout << "For rank = " << rank << endl
              << "ipx = " << ipx << endl
              << "ipy = " << ipy << endl
              << "ipz = " << ipz << endl;
    assert(size >= npx * npy * npz);
#endif
    geom->size = size;
    geom->rank = rank;
    geom->logical_rank = logical_rank;
    geom->different_dim = different_dim;
    geom->numThreads = params.numThreads;
    geom->nx = nx;
    geom->ny = ny;
    geom->nz = nz;
    geom->npx = npx;
    geom->npy = npy;
    geom->npz = npz;
    geom->ipx = ipx;
    geom->ipy = ipy;
    geom->ipz = ipz;
    // These values should be defined to take into account changes in nx, ny, nz values
    // due to variable local grid sizes
    global_int_t gnx = 0;
    global_int_t gny = 0;
    global_int_t gnz = 0;
    // Find the global NX, NY, and NZ.
    // For the diff dim, accumulate the per-rank sizes along one pencil;
    // for uniform dims, just multiply the process count by the local dim.
    if (different_dim == X)
        for (int i = 0; i < npx; i++)
        {
            int r = ipz * npx * npy + ipy * npx + i;
            int p = logical_rank_to_phys[r];
            gnx += physical_rank_dims[p * 3];
        }
    else
        gnx = npx * nx;
    if (different_dim == Y)
        for (int i = 0; i < npy; i++)
        {
            int r = ipz * npx * npy + i * npx + ipx;
            int p = logical_rank_to_phys[r];
            gny += physical_rank_dims[p * 3 + 1];
        }
    else
        gny = npy * ny;
    if (different_dim == Z)
        for (int i = 0; i < npz; i++)
        {
            int r = i * npx * npy + ipy * npx + ipx;
            int p = logical_rank_to_phys[r];
            gnz += physical_rank_dims[p * 3 + 2];
        }
    else
        gnz = npz * nz;
    // Here, we find the initial global indices (gix0, giy0, and giz0)
    // for each rank based on its 3d location in the grid.
    // Also, for the diff dim find the previous and next neighbor sizes.
    // Notice, on the diff dim the previous and next neighbors have
    // a different local dimension!
    int prev_n = 0;
    int next_n = 0;
    global_int_t giz0 = 0;
    global_int_t gix0 = 0;
    global_int_t giy0 = 0;
    if (different_dim == X)
    {
        for (int i = 0; i < ipx; i++)
        {
            int r = ipz * npx * npy + ipy * npx + i;
            int p = logical_rank_to_phys[r];
            gix0 += physical_rank_dims[p * 3];
            if (i == ipx - 1)
            {
                prev_n = physical_rank_dims[p * 3];
            }
        }
        if (ipx + 1 < npx)
        {
            int r = ipz * npx * npy + ipy * npx + (ipx + 1);
            int p = logical_rank_to_phys[r];
            next_n = physical_rank_dims[p * 3];
        }
    }
    else
        gix0 = ipx * nx;
    if (different_dim == Y)
    {
        for (int i = 0; i < ipy; i++)
        {
            int r = ipz * npx * npy + i * npx + ipx;
            int p = logical_rank_to_phys[r];
            giy0 += physical_rank_dims[p * 3 + 1];
            if (i == ipy - 1)
            {
                prev_n = physical_rank_dims[p * 3 + 1];
            }
        }
        if (ipy + 1 < npy)
        {
            int r = ipz * npx * npy + (ipy + 1) * npx + ipx;
            int p = logical_rank_to_phys[r];
            next_n = physical_rank_dims[p * 3 + 1];
        }
    }
    else
        giy0 = ipy * ny;
    if (different_dim == Z)
    {
        for (int i = 0; i < ipz; i++)
        {
            int r = i * npx * npy + ipy * npx + ipx;
            int p = logical_rank_to_phys[r];
            giz0 += physical_rank_dims[p * 3 + 2];
            if (i == ipz - 1)
            {
                prev_n = physical_rank_dims[p * 3 + 2];
            }
        }
        if (ipz + 1 < npz)
        {
            int r = (ipz + 1) * npx * npy + ipy * npx + ipx;
            int p = logical_rank_to_phys[r];
            next_n = physical_rank_dims[p * 3 + 2];
        }
    }
    else
        giz0 = ipz * nz;
    // Keep these values for later
    geom->gnx = gnx;
    geom->gny = gny;
    geom->gnz = gnz;
    geom->gix0 = gix0;
    geom->giy0 = giy0;
    geom->giz0 = giz0;
    geom->previous_neighbor_dim = prev_n;
    geom->next_neighbor_dim = next_n;
    return;
}
// Simpler generation for the next/coarse levels
// Do not need to find nx, ny, nz for CPU and GPU based on parameters
// Do not need to find logical rank IDs
void GenerateGeometry(int size, int rank, int numThreads, local_int_t nx, local_int_t ny, local_int_t nz, int npx,
int npy, int npz, dim_3d_t different_dim, Geometry* geom)
{
// My logical rank Id
int logical_rank;
for (int p = 0; p < global_total_ranks; p++)
{
if (rank == logical_rank_to_phys[p])
{
logical_rank = p;
}
}
// Now compute this process's indices in the 3D cube
int ipz = logical_rank / (npx * npy);
int ipy = (logical_rank - ipz * npx * npy) / npx;
int ipx = logical_rank % npx;
#ifdef HPCG_DEBUG
if (rank == 0)
HPCG_fout << "size = " << size << endl
<< "nx = " << nx << endl
<< "ny = " << ny << endl
<< "nz = " << nz << endl
<< "npx = " << npx << endl
<< "npy = " << npy << endl
<< "npz = " << npz << endl;
HPCG_fout << "For rank = " << rank << endl
<< "ipx = " << ipx << endl
<< "ipy = " << ipy << endl
<< "ipz = " << ipz << endl;
assert(size >= npx * npy * npz);
#endif
geom->size = size;
geom->rank = rank;
geom->logical_rank = logical_rank;
geom->different_dim = different_dim;
geom->numThreads = numThreads;
geom->nx = nx;
geom->ny = ny;
geom->nz = nz;
geom->npx = npx;
geom->npy = npy;
geom->npz = npz;
geom->ipx = ipx;
geom->ipy = ipy;
geom->ipz = ipz;
// Find the global NX. NY, and NZ
// For diff dims, accumulate sequentially
// For similar dims, just multiply rank 3D location by the local dim
global_int_t gnx = 0;
global_int_t gny = 0;
global_int_t gnz = 0;
if (different_dim == X)
for (int i = 0; i < npx; i++)
{
int r = ipz * npx * npy + ipy * npx + i;
int p = logical_rank_to_phys[r];
gnx += physical_rank_dims[p * 3];
}
else
gnx = npx * nx;
if (different_dim == Y)
for (int i = 0; i < npy; i++)
{
int r = ipz * npx * npy + i * npx + ipx;
int p = logical_rank_to_phys[r];
gny += physical_rank_dims[p * 3 + 1];
}
else
gny = npy * ny;
if (different_dim == Z)
for (int i = 0; i < npz; i++)
{
int r = i * npx * npy + ipy * npx + ipx;
int p = logical_rank_to_phys[r];
gnz += physical_rank_dims[p * 3 + 2];
}
else
gnz = npz * nz;
// Here, we find the initial global indices (gix0, giy0, and giz0)
// for each rank based on its 3d location in the grid
// Also, for the diff dim find the previous and next neighbor IDs
// Notice, on the diff dims the previous and next neighbors have
// the different dimension!
int prev_n = 0;
int next_n = 0;
global_int_t giz0 = 0;
global_int_t gix0 = 0;
global_int_t giy0 = 0;
if (different_dim == X)
{
for (int i = 0; i < ipx; i++)
{
int r = ipz * npx * npy + ipy * npx + i;
int p = logical_rank_to_phys[r];
gix0 += physical_rank_dims[p * 3];
if (i == ipx - 1)
{
prev_n = physical_rank_dims[p * 3];
}
}
if (ipx + 1 < npx)
{
int r = ipz * npx * npy + ipy * npx + (ipx + 1);
int p = logical_rank_to_phys[r];
next_n = physical_rank_dims[p * 3];
}
}
else
gix0 = ipx * nx;
if (different_dim == Y)
{
for (int i = 0; i < ipy; i++)
{
int r = ipz * npx * npy + i * npx + ipx;
int p = logical_rank_to_phys[r];
giy0 += physical_rank_dims[p * 3 + 1];
if (i == ipy - 1)
{
prev_n = physical_rank_dims[p * 3 + 1];
}
}
if (ipy + 1 < npy)
{
int r = ipz * npx * npy + (ipy + 1) * npx + ipx;
int p = logical_rank_to_phys[r];
next_n = physical_rank_dims[p * 3 + 1];
}
}
else
giy0 = ipy * ny;
if (different_dim == Z)
{
for (int i = 0; i < ipz; i++)
{
int r = i * npx * npy + ipy * npx + ipx;
int p = logical_rank_to_phys[r];
giz0 += physical_rank_dims[p * 3 + 2];
if (i == ipz - 1)
{
prev_n = physical_rank_dims[p * 3 + 2];
}
}
if (ipz + 1 < npz)
{
int r = (ipz + 1) * npx * npy + ipy * npx + ipx;
int p = logical_rank_to_phys[r];
next_n = physical_rank_dims[p * 3 + 2];
}
}
else
giz0 = ipz * nz;
// Keep these values for later
geom->gnx = gnx;
geom->gny = gny;
geom->gnz = gnz;
geom->gix0 = gix0;
geom->giy0 = giy0;
geom->giz0 = giz0;
geom->previous_neighbor_dim = prev_n;
geom->next_neighbor_dim = next_n;
return;
}

39
src/GenerateGeometry.hpp Normal file
View File

@@ -0,0 +1,39 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef GENERATEGEOMETRY_HPP
#define GENERATEGEOMETRY_HPP
#include "Geometry.hpp"
#include "hpcg.hpp"
void GenerateGeometry(HPCG_Params& params, Geometry* geom);
void GenerateGeometry(int size, int rank, int numThreads, local_int_t nx, local_int_t ny, local_int_t nz, int npx,
int npy, int npz, dim_3d_t partition_by, Geometry* geom);
#endif // GENERATEGEOMETRY_HPP

404
src/GenerateProblem.cpp Normal file
View File

@@ -0,0 +1,404 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file GenerateProblem.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "mytimer.hpp"
#include "GenerateProblem.hpp"
#include "GenerateProblem_ref.hpp"
#ifdef USE_CUDA
#include "Cuda.hpp"
#include "CudaKernels.hpp"
#endif
#ifdef USE_GRACE
#include "CpuKernels.hpp"
#endif
/*!
Routine to generate a sparse matrix, right hand side, initial guess, and exact solution.
@param[in] A The generated system matrix
@param[inout] b The newly allocated and generated right hand side vector (if b!=0 on entry)
@param[inout] x The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
@param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if the xexact!=0
non-zero on entry)
@see GenerateGeometry
*/
#ifdef USE_CUDA
/// GPU-side problem generation: allocates device vectors (when requested)
/// and delegates matrix construction to GenerateProblemCuda, then fills in
/// the global/local size bookkeeping on the matrix.
/// Interface matches GenerateProblem_Cpu; b/x/xexact may be null.
void GenerateProblem_Gpu(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
    // Local copies of geometry information. Use global_int_t since the
    // products below (gnx*gny*gnz) may exceed the local integer range.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;

    local_int_t localNumberOfRows = nx * ny * nz;     // size of this rank's subblock
    global_int_t totalNumberOfRows = gnx * gny * gnz; // total grid points in the mesh

    // Allocate GPU-resident vectors only when the caller asked for them.
    if (b != 0)
        InitializeVector(*b, localNumberOfRows, GPU);
    if (x != 0)
        InitializeVector(*x, localNumberOfRows, GPU);
    if (xexact != 0)
        InitializeVector(*xexact, localNumberOfRows, GPU);

    // Builds the sparse matrix (and vectors) on the device; sets
    // A.localNumberOfNonzeros as a side effect.
    GenerateProblemCuda(A, b, x, xexact);

    // Closed-form global nonzero count for the 27-point stencil: interior
    // points contribute 27, face points 18, edge points 12, the 8 corners 8.
    global_int_t totalNumberOfNonzeros = 27LL * ((gnx - 2LL) * (gny - 2LL) * (gnz - 2LL))
        + 18LL
            * (2LL * ((gnx - 2LL) * (gny - 2LL)) + 2LL * ((gnx - 2LL) * (gnz - 2LL))
                + 2LL * ((gny - 2LL) * (gnz - 2LL)))
        + 12LL * (4LL * (gnx - 2LL) + 4LL * (gny - 2LL) + 4LL * (gnz - 2LL)) + 8LL * 8LL;

    A.title = 0;
    A.totalNumberOfRows = totalNumberOfRows;
    A.totalNumberOfNonzeros = totalNumberOfNonzeros;
    A.localNumberOfRows = localNumberOfRows;
    A.localNumberOfColumns = localNumberOfRows;
    // A.localNumberOfNonzeros was already set by GenerateProblemCuda; the
    // former copy-out/copy-back of that field was a no-op and was removed.
    return;
}
#endif
#ifdef USE_GRACE
// Neighbor rank to sequential ID and vice versa
extern int *rankToId_h, *idToRank_h;
// GenerateProblem_Cpu is called 4 times for each level
// Sometimes we need to perform actions based on the level (global across the applications)
int global_steps = 0;
/// CPU-side problem generation for the 27-point stencil.
/// Builds the local rows of A (values, global column indices, diagonal
/// pointers), the optional vectors b/x/xexact, and — on the first few calls
/// (tracked by 'global_steps') — the neighbor-rank <-> sequential-ID maps
/// rankToId_h / idToRank_h used later by the halo exchange.
void GenerateProblem_Cpu(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
    // Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;
    int npx = A.geom->npx;
    int npy = A.geom->npy;
    local_int_t localNumberOfRows = nx * ny * nz; // This is the size of our subblock
    // If this assert fails, it most likely means that the local_int_t is set to int and should be set to long long
    assert(localNumberOfRows
        > 0); // Throw an exception of the number of rows is less than zero (can happen if int overflow)
    local_int_t numberOfNonzerosPerRow
        = 27; // We are approximating a 27-point finite element/volume/difference 3D stencil
    global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    assert(totalNumberOfRows
        > 0); // Throw an exception of the number of rows is less than zero (can happen if int overflow)
    // First call overall: allocate the rank <-> sequential-ID maps once.
    if (global_steps == 0)
    {
        rankToId_h = new int[A.geom->size + 1];
        idToRank_h = new int[27];
        global_steps++;
    }
    // Allocate arrays that are of length localNumberOfRows
    local_int_t* nonzerosInRow = new local_int_t[localNumberOfRows];
    global_int_t** mtxIndG = new global_int_t*[localNumberOfRows];
    local_int_t** mtxIndL = new local_int_t*[localNumberOfRows];
    double** matrixValues = new double*[localNumberOfRows];
    double** matrixDiagonal = new double*[localNumberOfRows];
    if (b != 0)
        InitializeVector(*b, localNumberOfRows, CPU);
    if (x != 0)
        InitializeVector(*x, localNumberOfRows, CPU);
    if (xexact != 0)
        InitializeVector(*xexact, localNumberOfRows, CPU);
    double* bv = 0;
    double* xv = 0;
    double* xexactv = 0;
    if (b != 0)
        bv = b->values; // Only compute exact solution if requested
    if (x != 0)
        xv = x->values; // Only compute exact solution if requested
    if (xexact != 0)
        xexactv = xexact->values; // Only compute exact solution if requested
    A.localToGlobalMap.resize(localNumberOfRows);
    // Use a parallel loop to do initial assignment:
    // distributes the physical placement of arrays of pointers across the memory system
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
    {
        matrixValues[i] = 0;
        matrixDiagonal[i] = 0;
        mtxIndG[i] = 0;
        mtxIndL[i] = 0;
    }
    // Second call: zero the rank map before rows mark their halo neighbors.
    if (global_steps == 1)
    {
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
        for (local_int_t i = 0; i < A.geom->size + 1; i++)
        {
            rankToId_h[i] = 0;
        }
        global_steps++;
    }
    // Now allocate the arrays pointed to (one contiguous slab per array)
    mtxIndL[0] = new local_int_t[localNumberOfRows * numberOfNonzerosPerRow];
    matrixValues[0] = new double[localNumberOfRows * numberOfNonzerosPerRow];
    mtxIndG[0] = new global_int_t[localNumberOfRows * numberOfNonzerosPerRow];
    local_int_t localNumberOfNonzeros = 0;
    local_int_t ext_nnz = 0; // nonzeros whose column lives on another rank
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for reduction(+ : localNumberOfNonzeros) reduction(+ : ext_nnz)
#endif
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        mtxIndL[i] = mtxIndL[0] + i * numberOfNonzerosPerRow;
        matrixValues[i] = matrixValues[0] + i * numberOfNonzerosPerRow;
        mtxIndG[i] = mtxIndG[0] + i * numberOfNonzerosPerRow;
        // Decompose the flat local row index into (ix, iy, iz), x fastest.
        const local_int_t iz = (i / (nx * ny));
        const local_int_t iy = (i - iz * nx * ny) / nx;
        const local_int_t ix = i - (iz * ny + iy) * nx;
        const global_int_t gix = ix + gix0;
        const global_int_t giy = iy + giy0;
        const global_int_t giz = iz + giz0;
        local_int_t currentLocalRow = i;
        global_int_t currentGlobalRow = gix + giy * gnx + giz * gnx * gny;
        A.localToGlobalMap[currentLocalRow] = currentGlobalRow;
        char numberOfNonzerosInRow = 0; // at most 27, fits in a char
        double* currentValuePointer = matrixValues[currentLocalRow];
        global_int_t* currentIndexPointerG = mtxIndG[currentLocalRow];
        double* diagonalPointer = nullptr;
        // Go through all the neighbors around a 3D point to decide
        // which one is a halo and which one is local to the rank
        for (int k = 0; k < 27; k++)
        {
            // Neighbor global Ids
            long long int cgix = gix + tid2indCpu[k][0];
            long long int cgiy = giy + tid2indCpu[k][1];
            long long int cgiz = giz + tid2indCpu[k][2];
            // Is the global 3D point inside the global problem?
            int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
            if (ok /*Yes this a valid point globally*/)
            {
                *currentIndexPointerG++ = cgix + cgiy * gnx + cgiz * gnx * gny;
                if (k == 13)
                {
                    // k == 13 is the center of the 3x3x3 stencil: the diagonal.
                    *currentValuePointer = 26.0;
                    diagonalPointer = currentValuePointer;
                }
                else
                {
                    *currentValuePointer = -1.0;
                }
                // Rank Id in the global domain
                int ipz = cgiz / nz;
                int ipy = cgiy / ny;
                int ipx = cgix / nx;
                // For GPUCPU exec mode, when the CPU and GPU have diff dims in a direction,
                // we need to find the point rank manually, not based on its local dimension
                // but based on its physical location to the local problem
                // Note the halo size is always 1
                if (A.geom->different_dim == Z)
                {
                    long long int local = cgiz - giz0;
                    if (local >= 0 && local < nz)
                        ipz = A.geom->ipz;
                    else if (local < 0)
                        ipz = A.geom->ipz - 1;
                    else if (local >= nz)
                        ipz = A.geom->ipz + 1;
                }
                else if (A.geom->different_dim == Y)
                {
                    long long int local = cgiy - giy0;
                    if (local >= 0 && local < ny)
                        ipy = A.geom->ipy;
                    else if (local < 0)
                        ipy = A.geom->ipy - 1;
                    else if (local >= ny)
                        ipy = A.geom->ipy + 1;
                }
                else if (A.geom->different_dim == X)
                {
                    long long int local = cgix - gix0;
                    if (local >= 0 && local < nx)
                        ipx = A.geom->ipx;
                    else if (local < 0)
                        ipx = A.geom->ipx - 1;
                    else if (local >= nx)
                        ipx = A.geom->ipx + 1;
                }
                // Now, after find the point rank from the location
                // in the 3D grid (ranks domain NPXxNPYxNPZ)
                int col_rank = ipx + ipy * npx + ipz * npy * npx;
                // The neighbor point rank is diff than the current point rank
                if (A.geom->logical_rank != col_rank)
                {
                    if (global_steps == 2)
                        rankToId_h[col_rank + 1] = 1; // To find its sequential Id (will be prefix summed later)
                    ext_nnz++;
                }
                currentValuePointer++;
                numberOfNonzerosInRow++;
            }
        }
        matrixDiagonal[currentLocalRow] = diagonalPointer;
        nonzerosInRow[currentLocalRow] = numberOfNonzerosInRow;
        localNumberOfNonzeros += numberOfNonzerosInRow;
        if (b != 0)
            bv[currentLocalRow] = 26.0 - ((double) (numberOfNonzerosInRow - 1));
        if (x != 0)
            xv[currentLocalRow] = 0.0;
        if (xexact != 0)
            xexactv[currentLocalRow] = 1.0;
    }
    // Prefixsum to RakToId
    // Map physical neighbor ranks to sequential IDs
    // less memory consumption
    if (global_steps == 2)
    {
        PrefixsumCpu(rankToId_h + 1, A.geom->size);
        int counter = 1;
        for (int i = 1; i < A.geom->size + 1; i++)
        {
            if (rankToId_h[i] == counter)
            {
                idToRank_h[counter - 1] = i - 1;
                counter++;
            }
        }
        global_steps++;
    }
#ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
              << endl
              << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
              << " nonzeros." << endl;
#endif
    // Closed-form global nonzero count for the 27-point stencil: interior
    // points contribute 27, face points 18, edge points 12, the 8 corners 8.
    global_int_t totalNumberOfNonzeros = 27LL * ((gnx - 2LL) * (gny - 2LL) * (gnz - 2LL))
        + 18LL
            * (2LL * ((gnx - 2LL) * (gny - 2LL)) + 2LL * ((gnx - 2LL) * (gnz - 2LL))
                + 2LL * ((gny - 2LL) * (gnz - 2LL)))
        + 12LL * (4LL * (gnx - 2LL) + 4LL * (gny - 2LL) + 4LL * (gnz - 2LL)) + 8LL * 8LL;
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    // This assert is usually the first to fail as problem size increases beyond the 32-bit integer range.
    assert(totalNumberOfNonzeros
        > 0); // Throw an exception of the number of nonzeros is less than zero (can happen if int overflow)
    A.title = 0;
    A.totalNumberOfRows = totalNumberOfRows;
    A.totalNumberOfNonzeros = totalNumberOfNonzeros;
    A.localNumberOfRows = localNumberOfRows;
    A.localNumberOfColumns = localNumberOfRows;
    A.localNumberOfNonzeros = localNumberOfNonzeros;
    A.nonzerosInRow = nonzerosInRow;
    A.mtxIndG = mtxIndG;
    A.mtxIndL = mtxIndL;
    A.matrixValues = matrixValues;
    A.matrixDiagonal = matrixDiagonal;
    A.extNnz = ext_nnz;
    return;
}
#endif // USE_GRACE
/// Entry point for problem generation: dispatches to the GPU or CPU
/// implementation based on the matrix's rank type. b/x/xexact may be null,
/// in which case the corresponding vector is not generated.
void GenerateProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
    const bool onGpu = (A.rankType == GPU);
    if (onGpu)
    {
#ifdef USE_CUDA
        GenerateProblem_Gpu(A, b, x, xexact);
#endif
    }
    if (!onGpu)
    {
#ifdef USE_GRACE
        GenerateProblem_Cpu(A, b, x, xexact);
#endif
    }
}

20
src/GenerateProblem.hpp Normal file
View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef GENERATEPROBLEM_HPP
#define GENERATEPROBLEM_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Generates the sparse matrix A and, when the pointers are non-null, the
// right-hand side b, initial guess x, and exact solution xexact. Dispatches
// to a GPU or CPU implementation based on A.rankType.
void GenerateProblem(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
#endif // GENERATEPROBLEM_HPP

251
src/GenerateProblem_ref.cpp Normal file
View File

@@ -0,0 +1,251 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file GenerateProblem_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG)
#include <fstream>
using std::endl;
#include "hpcg.hpp"
#endif
#include <cassert>
#include "GenerateProblem_ref.hpp"
/*!
Reference version of GenerateProblem to generate the sparse matrix, right hand side, initial guess, and exact
solution.
@param[in] A The known system matrix
@param[inout] b The newly allocated and generated right hand side vector (if b!=0 on entry)
@param[inout] x The newly allocated solution vector with entries set to 0.0 (if x!=0 on entry)
@param[inout] xexact The newly allocated solution vector with entries set to the exact solution (if the xexact!=0
non-zero on entry)
@see GenerateGeometry
*/
void GenerateProblem_ref(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact)
{
    // Make local copies of geometry information. Use global_int_t since the RHS products in the calculations
    // below may result in global range values.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;
    local_int_t localNumberOfRows = nx * ny * nz; // This is the size of our subblock
    // If this assert fails, it most likely means that the local_int_t is set to int and should be set to long long
    assert(localNumberOfRows
        > 0); // Throw an exception of the number of rows is less than zero (can happen if int overflow)
    local_int_t numberOfNonzerosPerRow
        = 27; // We are approximating a 27-point finite element/volume/difference 3D stencil
    global_int_t totalNumberOfRows = gnx * gny * gnz; // Total number of grid points in mesh
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    assert(totalNumberOfRows
        > 0); // Throw an exception of the number of rows is less than zero (can happen if int overflow)
    // Allocate arrays that are of length localNumberOfRows
    local_int_t* nonzerosInRow = new local_int_t[localNumberOfRows];
    global_int_t** mtxIndG = new global_int_t*[localNumberOfRows]; // per-row global column indices
    local_int_t** mtxIndL = new local_int_t*[localNumberOfRows];   // per-row local column indices (filled later)
    double** matrixValues = new double*[localNumberOfRows];
    double** matrixDiagonal = new double*[localNumberOfRows];      // pointer into each row's diagonal entry
    if (b != 0)
        InitializeVector(*b, localNumberOfRows, CPU);
    if (x != 0)
        InitializeVector(*x, localNumberOfRows, CPU);
    if (xexact != 0)
        InitializeVector(*xexact, localNumberOfRows, CPU);
    double* bv = 0;
    double* xv = 0;
    double* xexactv = 0;
    if (b != 0)
        bv = b->values; // Only compute exact solution if requested
    if (x != 0)
        xv = x->values; // Only compute exact solution if requested
    if (xexact != 0)
        xexactv = xexact->values; // Only compute exact solution if requested
    A.localToGlobalMap.resize(localNumberOfRows);
    // Use a parallel loop to do initial assignment:
    // distributes the physical placement of arrays of pointers across the memory system
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
    {
        matrixValues[i] = 0;
        matrixDiagonal[i] = 0;
        mtxIndG[i] = 0;
        mtxIndL[i] = 0;
    }
#ifndef HPCG_CONTIGUOUS_ARRAYS
    // Now allocate the arrays pointed to (one allocation per row)
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
        mtxIndL[i] = new local_int_t[numberOfNonzerosPerRow];
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
        matrixValues[i] = new double[numberOfNonzerosPerRow];
    for (local_int_t i = 0; i < localNumberOfRows; ++i)
        mtxIndG[i] = new global_int_t[numberOfNonzerosPerRow];
#else
    // Now allocate the arrays pointed to (single contiguous slab per array)
    mtxIndL[0] = new local_int_t[localNumberOfRows * numberOfNonzerosPerRow];
    matrixValues[0] = new double[localNumberOfRows * numberOfNonzerosPerRow];
    mtxIndG[0] = new global_int_t[localNumberOfRows * numberOfNonzerosPerRow];
    for (local_int_t i = 1; i < localNumberOfRows; ++i)
    {
        mtxIndL[i] = mtxIndL[0] + i * numberOfNonzerosPerRow;
        matrixValues[i] = matrixValues[0] + i * numberOfNonzerosPerRow;
        mtxIndG[i] = mtxIndG[0] + i * numberOfNonzerosPerRow;
    }
#endif
    local_int_t localNumberOfNonzeros = 0;
    // Each local (ix, iy, iz) grid point owns one matrix row; the 3x3x3
    // neighborhood that stays inside the global box forms its nonzeros.
    // TODO: This triply nested loop could be flattened or use nested parallelism
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t iz = 0; iz < nz; iz++)
    {
        global_int_t giz = giz0 + iz;
        for (local_int_t iy = 0; iy < ny; iy++)
        {
            global_int_t giy = giy0 + iy;
            for (local_int_t ix = 0; ix < nx; ix++)
            {
                global_int_t gix = gix0 + ix;
                local_int_t currentLocalRow = iz * nx * ny + iy * nx + ix;
                global_int_t currentGlobalRow = giz * gnx * gny + giy * gnx + gix;
#ifndef HPCG_NO_OPENMP
                // C++ std::map is not threadsafe for writing
#pragma omp critical
#endif
                A.globalToLocalMap[currentGlobalRow] = currentLocalRow;
                A.localToGlobalMap[currentLocalRow] = currentGlobalRow;
#ifdef HPCG_DETAILED_DEBUG
                HPCG_fout << " rank, globalRow, localRow = " << A.geom->rank << " " << currentGlobalRow << " "
                          << A.globalToLocalMap[currentGlobalRow] << endl;
#endif
                char numberOfNonzerosInRow = 0; // at most 27, fits in a char
                double* currentValuePointer = matrixValues[currentLocalRow]; // Pointer to current value in current row
                global_int_t* currentIndexPointerG
                    = mtxIndG[currentLocalRow]; // Pointer to current index in current row
                for (int sz = -1; sz <= 1; sz++)
                {
                    if (giz + sz > -1 && giz + sz < gnz)
                    {
                        for (int sy = -1; sy <= 1; sy++)
                        {
                            if (giy + sy > -1 && giy + sy < gny)
                            {
                                for (int sx = -1; sx <= 1; sx++)
                                {
                                    if (gix + sx > -1 && gix + sx < gnx)
                                    {
                                        global_int_t curcol = currentGlobalRow + sz * gnx * gny + sy * gnx + sx;
                                        if (curcol == currentGlobalRow)
                                        {
                                            // (sx, sy, sz) == (0, 0, 0): the diagonal entry.
                                            matrixDiagonal[currentLocalRow] = currentValuePointer;
                                            *currentValuePointer++ = 26.0;
                                        }
                                        else
                                        {
                                            *currentValuePointer++ = -1.0;
                                        }
                                        *currentIndexPointerG++ = curcol;
                                        numberOfNonzerosInRow++;
                                    } // end x bounds test
                                }     // end sx loop
                            }         // end y bounds test
                        }             // end sy loop
                    }                 // end z bounds test
                }                     // end sz loop
                nonzerosInRow[currentLocalRow] = numberOfNonzerosInRow;
#ifndef HPCG_NO_OPENMP
#pragma omp critical
#endif
                localNumberOfNonzeros += numberOfNonzerosInRow; // Protect this with an atomic
                if (b != 0)
                    bv[currentLocalRow] = 26.0 - ((double) (numberOfNonzerosInRow - 1));
                if (x != 0)
                    xv[currentLocalRow] = 0.0;
                if (xexact != 0)
                    xexactv[currentLocalRow] = 1.0;
            } // end ix loop
        }     // end iy loop
    }         // end iz loop
#ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfRows << " rows."
              << endl
              << "Process " << A.geom->rank << " of " << A.geom->size << " has " << localNumberOfNonzeros
              << " nonzeros." << endl;
#endif
    global_int_t totalNumberOfNonzeros = 0;
#ifndef HPCG_NO_MPI
    // Use MPI's reduce function to sum all nonzeros
#ifdef HPCG_NO_LONG_LONG
    MPI_Allreduce(&localNumberOfNonzeros, &totalNumberOfNonzeros, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
#else
    long long lnnz = localNumberOfNonzeros, gnnz = 0; // convert to 64 bit for MPI call
    MPI_Allreduce(&lnnz, &gnnz, 1, MPI_LONG_LONG_INT, MPI_SUM, MPI_COMM_WORLD);
    totalNumberOfNonzeros = gnnz; // Copy back
#endif
#else
    totalNumberOfNonzeros = localNumberOfNonzeros;
#endif
    // If this assert fails, it most likely means that the global_int_t is set to int and should be set to long long
    // This assert is usually the first to fail as problem size increases beyond the 32-bit integer range.
    assert(totalNumberOfNonzeros
        > 0); // Throw an exception of the number of nonzeros is less than zero (can happen if int overflow)
    A.title = 0;
    A.totalNumberOfRows = totalNumberOfRows;
    A.totalNumberOfNonzeros = totalNumberOfNonzeros;
    A.localNumberOfRows = localNumberOfRows;
    A.localNumberOfColumns = localNumberOfRows;
    A.localNumberOfNonzeros = localNumberOfNonzeros;
    A.nonzerosInRow = nonzerosInRow;
    A.mtxIndG = mtxIndG;
    A.mtxIndL = mtxIndL;
    A.matrixValues = matrixValues;
    A.matrixDiagonal = matrixDiagonal;
    return;
}

View File

@@ -0,0 +1,21 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef GENERATEPROBLEM_REF_HPP
#define GENERATEPROBLEM_REF_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
// Reference (CPU, unoptimized) generator for the matrix, right-hand side,
// initial guess, and exact solution; see GenerateProblem_ref.cpp.
void GenerateProblem_ref(SparseMatrix& A, Vector* b, Vector* x, Vector* xexact);
#endif // GENERATEPROBLEM_REF_HPP

207
src/Geometry.hpp Normal file
View File

@@ -0,0 +1,207 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file Geometry.hpp
HPCG data structure for problem geometry
*/
#ifndef GEOMETRY_HPP
#define GEOMETRY_HPP
/*!
This defines the type for integers that have local subdomain dimension.
Define as "long long" when local problem dimension is > 2^31
*/
// #define INDEX_64
#ifndef INDEX_64
typedef int local_int_t;
#else
typedef long long local_int_t;
#endif
/*!
This defines the type for integers that have global dimension
Define as "long long" when global problem dimension is > 2^31
*/
#ifdef HPCG_NO_LONG_LONG
typedef int global_int_t;
#else
typedef long long global_int_t;
#endif
#define HPCG_MAX_ROW_LEN 27
// Enums
// Selector for one of the three spatial axes; NONE means no axis selected.
typedef enum
{
    X = 0,
    Y = 1,
    Z = 2,
    NONE = 3
} dim_3d_t;
// Point-to-point communication transport used for the halo exchange.
typedef enum
{
    MPI_CPU,
    MPI_CUDA_AWARE,
    MPI_GPU_All2allv,
    MPI_CPU_All2allv,
    NCCL /*GPUONLY*/
} p2p_comm_mode_t;
// Whether a rank's local problem is executed on the CPU or the GPU.
typedef enum
{
    CPU,
    GPU
} rank_type_t;
// Overall execution mode of the benchmark run.
typedef enum
{
    GPUONLY = 0,
    CPUONLY = 1,
    GPUCPU = 2
} exec_mode_t;
// How the NX/NY/NZ inputs and the g2c parameter define the local problem split
// between GPU and CPU (ratio vs. absolute size, GPU-only vs. combined dims).
typedef enum
{
    GPU_RATIO = 0 /*NX, NY, NZ are local to GPU and g2c is a ratio*/,
    GPU_ABS = 1 /*NX, NY, NZ are local to GPU and g2c is absolute dimension size*/,
    GPU_CPU_RATIO = 2 /*NX, NY, NZ are local to GPU+CPU and g2c is ratio*/,
    GPU_CPU_ABS = 3 /*NX, NY, NZ are local to GPU+CPU and g2c is absolute dimension size*/
} local_problem_def_t;
// This macro should be defined if the global_int_t is not long long
// in order to stop complaints from non-C++11 compliant compilers.
// #define HPCG_NO_LONG_LONG
/*!
This is a data structure to contain all processor geometry information
*/
struct Geometry_STRUCT
{
    int size;         //!< Number of MPI processes
    int rank;         //!< This process' rank in the range [0 to size - 1]
    int logical_rank; //!< Rank used for ownership tests in the heterogeneous (GPU+CPU) setup
                      //!< NOTE(review): compared against the computed column rank in
                      //!< GenerateProblem_Cpu; presumably a renumbering of 'rank' — confirm
                      //!< against GenerateGeometry.
    int numThreads;   //!< This process' number of threads
    local_int_t nx;   //!< Number of x-direction grid points for each local subdomain
    local_int_t ny;   //!< Number of y-direction grid points for each local subdomain
    local_int_t nz;   //!< Number of z-direction grid points for each local subdomain
    int npx;          //!< Number of processors in x-direction
    int npy;          //!< Number of processors in y-direction
    int npz;          //!< Number of processors in z-direction
    int pz;           //!< partition ID of z-dimension process that starts the second region of nz values
    int npartz;       //!< Number of partitions with varying nz values
    int* partz_ids;   //!< Array of partition ids of processor in z-direction where new value of nz starts (valid values
                      //!< are 1 to npz)
    local_int_t* partz_nz; //!< Array of length npartz containing the nz values for each partition
    int ipx;               //!< Current rank's x location in the npx by npy by npz processor grid
    int ipy;               //!< Current rank's y location in the npx by npy by npz processor grid
    int ipz;               //!< Current rank's z location in the npx by npy by npz processor grid
    global_int_t gnx;      //!< Global number of x-direction grid points
    global_int_t gny;      //!< Global number of y-direction grid points
    global_int_t gnz;      //!< Global number of z-direction grid points
    global_int_t gix0;     //!< Base global x index for this rank in the npx by npy by npz processor grid
    global_int_t giy0;     //!< Base global y index for this rank in the npx by npy by npz processor grid
    global_int_t giz0;     //!< Base global z index for this rank in the npx by npy by npz processor grid
    dim_3d_t different_dim; //!< The dimension that the GPU and CPU rank are partitioned along
    int previous_neighbor_dim; //!< NOTE(review): set from 'prev_n' in GenerateGeometry; appears to describe
                               //!< the previous neighbor along the partitioned axis — confirm.
    int next_neighbor_dim;     //!< NOTE(review): set from 'next_n' in GenerateGeometry; appears to describe
                               //!< the next neighbor along the partitioned axis — confirm.
};
typedef struct Geometry_STRUCT Geometry;
/*!
Returns the rank of the MPI process that is assigned the global row index
given as the input argument.
@param[in] geom The description of the problem's geometry.
@param[in] index The global row index
@return Returns the MPI rank of the process assigned the row
*/
inline int ComputeRankOfMatrixRow(const Geometry& geom, global_int_t index)
{
    global_int_t gnx = geom.gnx;
    global_int_t gny = geom.gny;
    // Recover the global (ix, iy, iz) grid coordinates of the row
    // (rows are numbered x-fastest: index = ix + iy*gnx + iz*gnx*gny).
    global_int_t iz = index / (gny * gnx);
    global_int_t iy = (index - iz * gny * gnx) / gnx;
    global_int_t ix = index % gnx;
    // We now permit varying values for nz for any nx-by-ny plane of MPI processes.
    // npartz is the number of different groups of nx-by-ny groups of processes.
    // partz_ids is an array of length npartz where each value indicates the z process of the last process in the ith
    // nx-by-ny group. partz_nz is an array of length npartz containing the value of nz for the ith group.
    // With no variation, npartz = 1, partz_ids[0] = npz, partz_nz[0] = nz
    int ipz = 0;
    int ipartz_ids = 0;
    for (int i = 0; i < geom.npartz; ++i)
    {
        int ipart_nz = geom.partz_nz[i];
        // Number of z-process planes in this group (partz_ids entries are cumulative).
        ipartz_ids = geom.partz_ids[i] - ipartz_ids;
        if (iz <= ipart_nz * ipartz_ids)
        {
            // Row lies within this group: finish the z-process coordinate and stop.
            ipz += iz / ipart_nz;
            break;
        }
        else
        {
            // Row lies past this group: skip its z-planes and keep searching.
            ipz += ipartz_ids;
            iz -= ipart_nz * ipartz_ids;
        }
    }
    // global_int_t ipz = iz/geom.nz;
    int ipy = iy / geom.ny;
    int ipx = ix / geom.nx;
    // Flatten the (ipx, ipy, ipz) processor coordinates into the owning rank.
    int rank = ipx + ipy * geom.npx + ipz * geom.npy * geom.npx;
    return rank;
}
/*!
Destructor for geometry data.
@param[inout] data the geometry data structure whose storage is deallocated
*/
inline void DeleteGeometry(Geometry& geom)
{
    // Intentionally a no-op: partz_nz / partz_ids are no longer
    // heap-allocated here, so there is nothing to release.
    (void) geom;
}
#endif // GEOMETRY_HPP

81
src/MGData.hpp Normal file
View File

@@ -0,0 +1,81 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file MGData.hpp
HPCG data structure
*/
#ifndef MGDATA_HPP
#define MGDATA_HPP
#include "SparseMatrix.hpp"
#include "Vector.hpp"
#include <cassert>
// Per-level multigrid data: smoother settings, the fine-to-coarse injection
// operator, and the coarse-grid work vectors.
struct MGData_STRUCT
{
    int numberOfPresmootherSteps;  // Call ComputeSYMGS this many times prior to coarsening
    int numberOfPostsmootherSteps; // Call ComputeSYMGS this many times after coarsening
    local_int_t*
        f2cOperator; //!< 1D array containing the fine operator local IDs that will be injected into coarse space.
    Vector* rc;      // coarse grid residual vector
    Vector* xc;      // coarse grid solution vector
    Vector* Axf;     // fine grid residual vector
    /*!
      This is for storing optimized data structures created in OptimizeProblem and
      used inside optimized ComputeSPMV().
    */
    void* optimizationData;
};
typedef struct MGData_STRUCT MGData;
/*!
Constructor for the data structure of CG vectors.
@param[in] Ac - Fully-formed coarse matrix
@param[in] f2cOperator -
@param[out] data the data structure for CG vectors that will be allocated to get it ready for use in CG iterations
*/
inline void InitializeMGData(local_int_t* f2cOperator, Vector* rc, Vector* xc, Vector* Axf, MGData& data)
{
    // Wire up the injection operator and the coarse-grid work vectors.
    data.f2cOperator = f2cOperator; // Space for injection operator
    data.rc = rc;
    data.xc = xc;
    data.Axf = Axf;
    // Default smoother schedule: one sweep before and one after coarsening.
    data.numberOfPresmootherSteps = 1;
    data.numberOfPostsmootherSteps = 1;
}
/*!
Destructor for the CG vectors data.
@param[inout] data the MG data structure whose storage is deallocated
*/
inline void DeleteMGData(MGData& data)
{
    // The injection operator is a plain array.
    delete[] data.f2cOperator;
    // Release vector contents first, then the Vector objects themselves,
    // in the same order as their contents were released (Axf, rc, xc).
    Vector* work[] = {data.Axf, data.rc, data.xc};
    for (Vector* v : work)
        DeleteVector(*v);
    for (Vector* v : work)
        delete v;
}
#endif // MGDATA_HPP

66
src/MixedBaseCounter.cpp Normal file
View File

@@ -0,0 +1,66 @@
#include <map>
#include "MixedBaseCounter.hpp"
// Initializes the mixed-radix counter: digit i may count up to counts[i];
// all current digits start at zero.
// Fix: the original loop copied counts[i] for i in [0, 32) regardless of
// 'length', reading past the end of the caller's array when length < 32.
// Only 'length' entries are copied now; the remainder (including the
// terminating slot at index 'length' and index 32) is zero-filled, which the
// original guaranteed only for indices 'length' and 32.
MixedBaseCounter::MixedBaseCounter(int* counts, int length)
{
    this->length = length;
    int i;
    for (i = 0; i < length; ++i)
    {
        this->max_counts[i] = counts[i];
        this->cur_counts[i] = 0;
    }
    // terminate with 0's (digits beyond 'length' are never touched by
    // next/is_zero/product, so zeroing them is safe and removes the garbage
    // the old code left there)
    for (; i <= 32; ++i)
        this->max_counts[i] = this->cur_counts[i] = 0;
}
// Builds a "difference" counter: each digit's maximum is left's maximum
// reduced by right's current count; all current digits start at zero.
MixedBaseCounter::MixedBaseCounter(MixedBaseCounter& left, MixedBaseCounter& right)
{
    this->length = left.length;
    for (int d = 0; d < left.length; ++d)
    {
        this->max_counts[d] = left.max_counts[d] - right.cur_counts[d];
        this->cur_counts[d] = 0;
    }
}
// Advances the counter by one: bump the lowest digit and ripple the carry
// upward while digits overflow their per-digit maximum.
void MixedBaseCounter::next()
{
    for (int d = 0; d < this->length; ++d)
    {
        if (++this->cur_counts[d] <= this->max_counts[d])
            return;                  // no carry needed; done
        this->cur_counts[d] = 0;     // overflow: reset this digit, carry on
    }
}
// Returns 1 when every digit is currently zero (i.e. the counter has
// wrapped back to its starting state), otherwise 0.
int MixedBaseCounter::is_zero()
{
    for (int d = 0; d < this->length; ++d)
        if (this->cur_counts[d] != 0)
            return 0;
    return 1;
}
// Multiplies multipliers[d] into the result cur_counts[d] times for every
// digit. Returns 0 when all digits are zero (no factor was applied),
// matching the original's k-flag semantics.
int MixedBaseCounter::product(int* multipliers)
{
    int applied = 0; // becomes 1 once any factor is multiplied in
    int result = 1;
    for (int d = 0; d < this->length; ++d)
        for (int rep = 0; rep < this->cur_counts[d]; ++rep)
        {
            applied = 1;
            result *= multipliers[d];
        }
    return result * applied;
}

16
src/MixedBaseCounter.hpp Normal file
View File

@@ -0,0 +1,16 @@
// Mixed-radix counter over per-digit maxima; the field comments indicate the
// digits track prime-factor counts (used when enumerating factor combinations).
class MixedBaseCounter
{
private:
    int length;             //!< number of prime factor counts (cannot exceed 32 for a 32-bit integer)
    int max_counts[32 + 1]; //!< maximum value for prime factor counts
    int cur_counts[32 + 1]; //!< current prime factor counts
public:
    // Initializes digit maxima from 'counts' with all current digits at zero.
    MixedBaseCounter(int* counts, int length);
    // Difference counter: maxima are left.max_counts - right.cur_counts.
    MixedBaseCounter(MixedBaseCounter& left, MixedBaseCounter& right);
    // Advances to the next combination (increment with carry).
    void next();
    // Returns 1 when every digit is zero, else 0.
    int is_zero();
    // Product of multipliers[i] taken cur_counts[i] times; 0 when all digits are zero.
    int product(int* multipliers);
};

427
src/OptimizeProblem.cpp Normal file
View File

@@ -0,0 +1,427 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file OptimizeProblem.cpp
HPCG routine
*/
#include "OptimizeProblem.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
#include "Cuda.hpp"
#include "WriteProblem.hpp"
#include "mytimer.hpp"
extern bool Use_Hpcg_Mem_Reduction; /*USE HPCG aggresive memory reduction*/
/*!
Optimizes the data structures used for CG iteration to increase the
performance of the benchmark version of the preconditioned CG algorithm.
@param[inout] A The known system matrix, also contains the MG hierarchy in attributes Ac and mgData.
@param[inout] data The data structure with all necessary CG vectors preallocated
@param[inout] b The known right hand side vector
@param[inout] x The solution vector to be computed in future CG iteration
@param[inout] xexact The exact solution vector
@return returns 0 upon success and non-zero otherwise
@see GenerateGeometry
@see GenerateProblem
*/
#ifdef USE_CUDA
size_t OptimizeProblemGpu(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
{
    // Build the GPU-side optimized data structures for every level of the MG
    // hierarchy: row coloring, ref<->opt permutations, sliced-ELLPACK (SELL)
    // storage for A and its strictly-lower/upper triangles, and the cuSPARSE
    // SpMV/SpSV descriptors plus their work buffers used during the timed run.
    // Returns 0 (GPU memory accounting is reported elsewhere).
    // NOTE(review): data and xexact are currently unused here; x and b only
    // lend their device pointers to the dense-vector descriptors.
    SparseMatrix* A = &A_in;
    local_int_t numberOfMgLevels = 4; // fixed-depth V-cycle in this implementation
    local_int_t slice_size = A->slice_size;
    for (int level = 0; level < numberOfMgLevels; ++level)
    {
        const local_int_t nrow = A->localNumberOfRows;
        int totalColors = 8;
        // Initialize perm (ref2opt) and iperm (opt2ref) to the identity
        SetVectorAscCuda(A->ref2opt, nrow);
        SetVectorAscCuda(A->opt2ref, nrow);
        // Color the matrix
        int num_colors = 0;
        ColorMatrixCuda(NULL, A->gpuAux.columns, A->gpuAux.nnzPerRow, A->localNumberOfRows, A->gpuAux.color,
            &(num_colors), A->gpuAux.colorCountCpu, 8, A->ref2opt, A->opt2ref, A->geom->rank, A->geom->nx, NULL);
        // NOTE(review): the fixed value 8 is stored rather than the num_colors
        // returned by ColorMatrixCuda -- presumably the coloring always yields
        // exactly 8 colors for this stencil; confirm before changing.
        A->totalColors = totalColors;
        PermElemToSendCuda(A->totalToBeSent, A->gpuAux.elementsToSend, A->ref2opt);
        // Create (S)ELL
        local_int_t TranslateIndex = slice_size * HPCG_MAX_ROW_LEN;
        local_int_t* translated_ell_col_index = A->sellAPermColumns + TranslateIndex;
        double* translated_ell_values = A->sellAPermValues + TranslateIndex;
        EllPermColumnsValuesCuda(nrow, A->gpuAux.nnzPerRow, A->gpuAux.columns, A->gpuAux.values,
            A->gpuAux.csrAPermOffsets, translated_ell_col_index, translated_ell_values, A->opt2ref, A->ref2opt,
            A->gpuAux.sellADiagonalIdx, A->gpuAux.csrLPermOffsets, A->gpuAux.csrUPermOffsets, false);
        // Column-major blocked/sliced ELLPACK
        TransposeCuda(nrow, slice_size, A->sellAPermColumns, A->sellAPermValues);
        // Per-slice max row length
        local_int_t num_slices = (nrow + slice_size - 1) / slice_size;
        EllMaxRowLenPerBlockCuda(nrow, slice_size, A->gpuAux.csrLPermOffsets, A->gpuAux.csrUPermOffsets,
            A->sellLSliceMrl, A->sellUSliceMrl);
        // Prefix-sum the per-slice lengths into SELL slice offsets
        PrefixsumCuda(num_slices, A->sellLSliceMrl);
        MultiplyBySliceSizeCUDA(num_slices, slice_size, A->sellLSliceMrl + 1);
        PrefixsumCuda(num_slices, A->sellUSliceMrl);
        MultiplyBySliceSizeCUDA(num_slices, slice_size, A->sellUSliceMrl + 1);
        // Set the general matrix slice_offsets
        CreateAMatrixSliceOffsetsCuda(num_slices + 1, A->slice_size, A->sellASliceMrl);
        // Lower/Upper ELL variant parts
        CreateSellLUColumnsValuesCuda(nrow, slice_size, A->sellAPermColumns, A->sellAPermValues, A->sellLSliceMrl,
            A->sellLPermColumns, A->sellLPermValues, A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, level);
        local_int_t sell_slices = num_slices; // same slice count as computed above
        // Strictly-lower (== strictly-upper) nonzeros: total minus diagonal
        // (nrow) and halo (extNnz) entries, halved by symmetry.
        const local_int_t half_nnz = (A->localNumberOfNonzeros - nrow - A->extNnz) / 2;
        local_int_t sell_l_nnz = 0;
        cudaMemcpyAsync(
            &sell_l_nnz, &(A->sellLSliceMrl[sell_slices]), sizeof(local_int_t), cudaMemcpyDeviceToHost, stream);
        local_int_t sell_u_nnz = 0;
        cudaMemcpyAsync(
            &sell_u_nnz, &(A->sellUSliceMrl[sell_slices]), sizeof(local_int_t), cudaMemcpyDeviceToHost, stream);
        // FIX: ensure both counts have landed on the host before they are
        // consumed below; relying on cudaMemcpyAsync's implicit host
        // synchronization for pageable destinations is fragile.
        cudaStreamSynchronize(stream);
        auto INDEX_TYPE = CUSPARSE_INDEX_32I;
#ifdef INDEX_64 // In src/Geometry
        INDEX_TYPE = CUSPARSE_INDEX_64I;
#endif
        cusparseCreateSlicedEll(&(A->cusparseOpt.matL), nrow, nrow, half_nnz, sell_l_nnz, slice_size,
            A->sellLSliceMrl, A->sellLPermColumns, A->sellLPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
            CUDA_R_64F);
        cusparseCreateSlicedEll(&(A->cusparseOpt.matU), nrow, nrow, half_nnz, sell_u_nnz, slice_size,
            A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
            CUDA_R_64F);
        local_int_t sell_nnz = sell_slices * slice_size * HPCG_MAX_ROW_LEN;
        cusparseCreateSlicedEll(&(A->cusparseOpt.matA), nrow, nrow, A->localNumberOfNonzeros, sell_nnz, slice_size,
            A->sellASliceMrl, A->sellAPermColumns, A->sellAPermValues, INDEX_TYPE, INDEX_TYPE, CUSPARSE_INDEX_BASE_ZERO,
            CUDA_R_64F);
        double alpha = 1.0, beta = 0.0;
        size_t e_buf_size = 0;
        size_t l_buf_size = 0, u_buf_size = 0, i_buf_size = 0, max_buf_size = 0;
        cusparseDnVecDescr_t dummy1, dummy2;
        cusparseCreateDnVec(&dummy1, nrow, x.values_d, CUDA_R_64F);
        cusparseCreateDnVec(&dummy2, nrow, b.values_d, CUDA_R_64F);
        cusparseCreateDnVec(&(A->cusparseOpt.vecX), nrow, x.values_d, CUDA_R_64F);
        cusparseCreateDnVec(&(A->cusparseOpt.vecY), nrow, b.values_d, CUDA_R_64F);
        max_buf_size = e_buf_size;
        // Work-buffer sizes for SpMV with L, U and the full matrix A
        cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL, dummy1,
            &beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &l_buf_size);
        cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU, dummy1,
            &beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &u_buf_size);
        cusparseSpMV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matA, dummy1,
            &beta, dummy2, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &i_buf_size);
        max_buf_size = std::max(std::max(i_buf_size, e_buf_size), std::max(u_buf_size, l_buf_size));
        // Triangular-solve (SpSV) setup
        size_t buffer_size_sv_l, buffer_size_sv_u;
        cusparseFillMode_t fillmode_l = CUSPARSE_FILL_MODE_LOWER;
        cusparseFillMode_t fillmode_u = CUSPARSE_FILL_MODE_UPPER;
        cusparseDiagType_t diagtype = CUSPARSE_DIAG_TYPE_NON_UNIT;
        cusparseSpSV_createDescr(&A->cusparseOpt.spsvDescrL);
        cusparseSpSV_createDescr(&A->cusparseOpt.spsvDescrU);
        cusparseSpMatSetAttribute(A->cusparseOpt.matL, CUSPARSE_SPMAT_DIAG_TYPE, &(diagtype), sizeof(diagtype));
        cusparseSpMatSetAttribute(A->cusparseOpt.matL, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        // Without aggressive memory reduction (or when nrow is not a multiple
        // of 8) the lower solve needs its own scratch buffer.
        if (!Use_Hpcg_Mem_Reduction || (nrow % 8 != 0))
        {
            cusparseSpSV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL,
                A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT,
                A->cusparseOpt.spsvDescrL, &buffer_size_sv_l);
            cudaMalloc(&A->bufferSvL, buffer_size_sv_l);
        }
        cusparseSpSV_analysis(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matL,
            A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A->cusparseOpt.spsvDescrL,
            A->bufferSvL);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A->cusparseOpt.spsvDescrL, A->diagonal, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        cusparseSpMatSetAttribute(A->cusparseOpt.matU, CUSPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        if (!Use_Hpcg_Mem_Reduction || (nrow % 8 != 0))
        {
            cusparseSpSV_bufferSize(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU,
                A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT,
                A->cusparseOpt.spsvDescrU, &buffer_size_sv_u);
            cudaMalloc(&A->bufferSvU, buffer_size_sv_u);
        }
        cusparseSpSV_analysis(cusparsehandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A->cusparseOpt.matU,
            A->cusparseOpt.vecX, A->cusparseOpt.vecY, CUDA_R_64F, CUSPARSE_SPSV_ALG_DEFAULT, A->cusparseOpt.spsvDescrU,
            A->bufferSvU);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A->cusparseOpt.spsvDescrU, A->diagonal, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        if (max_buf_size > 0)
            cudaMalloc(&(A->bufferMvA), max_buf_size);
        cusparseDestroyDnVec(dummy1);
        cusparseDestroyDnVec(dummy2);
        A = A->Ac; // descend to the next-coarser level
    }
    A = &A_in;
    for (int level = 1; level < numberOfMgLevels; ++level)
    {
        // Permute the fine-to-coarse injection operator into the optimized
        // orderings of both the fine and coarse levels.
        const local_int_t nrow_c = A->Ac->localNumberOfRows;
        F2cPermCuda(nrow_c, A->gpuAux.f2c, A->f2cPerm, A->ref2opt, A->Ac->opt2ref);
        A = A->Ac;
    }
    return 0;
}
#endif
#ifdef USE_GRACE
size_t OptimizeProblemCpu(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
{
    // Build the CPU-side (NVPL Sparse) optimized data structures for every MG
    // level: matrix coloring, permutation to color order, sliced-ELLPACK
    // storage, and the SpMV/SpSV descriptors used during the timed run.
    // Returns the number of bytes retained by these structures.
    // NOTE(review): data and xexact are unused here; x and b only lend their
    // host pointers to the dense-vector descriptors.
    // Initialize data structures
    size_t mem = AllocateMemCpu(A_in);
    SparseMatrix* A = &A_in;
    local_int_t numberOfMgLevels = 4; // fixed-depth V-cycle in this implementation
    local_int_t slice_size = A->slice_size;
    for (int level = 0; level < numberOfMgLevels; ++level)
    {
        // Color the matrix
        int num_colors;
        ColorMatrixCpu(*A, &num_colors);
        A->totalColors = num_colors;
        // Compute when each color starts (running sum of rows per color)
        A->cpuAux.firstRowOfColor[0] = 0;
        for (int c = 1; c < A->totalColors; c++)
        {
            A->cpuAux.firstRowOfColor[c] = A->cpuAux.firstRowOfColor[c - 1] + A->cpuAux.nRowsWithColor[c - 1];
        }
        // Reorder the matrix
        CreateSellPermCpu(*A);
#ifndef HPCG_NO_MPI
        // Translate row IDs that will be sent to neighbours into the permuted
        // (color-ordered) numbering
#pragma omp parallel for
        for (local_int_t i = 0; i < A->totalToBeSent; i++)
        {
            local_int_t orig = A->elementsToSend[i];
            A->elementsToSend[i] = A->ref2opt[orig];
        }
#endif
        local_int_t numberOfNonzerosPerRow = HPCG_MAX_ROW_LEN;
        local_int_t nrow = A->localNumberOfRows;
        // Strictly-lower (== strictly-upper) nonzeros: total minus diagonal
        // (nrow) and halo (extNnz) entries, halved by symmetry.
        local_int_t half_nnz = (A->localNumberOfNonzeros - nrow - A->extNnz) / 2;
        local_int_t num_slices = (nrow + slice_size - 1) / slice_size;
        local_int_t sell_l_nnz = A->sellLSliceMrl[num_slices];
        local_int_t sell_u_nnz = A->sellUSliceMrl[num_slices];
        local_int_t sell_nnz = num_slices * slice_size * numberOfNonzerosPerRow;
        auto INDEX_TYPE = NVPL_SPARSE_INDEX_32I;
#ifdef INDEX_64 // In src/Geometry
        INDEX_TYPE = NVPL_SPARSE_INDEX_64I;
#endif
        // Sliced-ELL descriptors for the strictly-lower part, strictly-upper
        // part, and the full matrix.
        nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matL), nrow, nrow, half_nnz, sell_l_nnz, slice_size,
            A->sellLSliceMrl, A->sellLPermColumns, A->sellLPermValues, INDEX_TYPE, INDEX_TYPE,
            NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);
        nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matU), nrow, nrow, half_nnz, sell_u_nnz, slice_size,
            A->sellUSliceMrl, A->sellUPermColumns, A->sellUPermValues, INDEX_TYPE, INDEX_TYPE,
            NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);
        nvpl_sparse_create_sliced_ell(&(A->nvplSparseOpt.matA), nrow, nrow, A->localNumberOfNonzeros, sell_nnz,
            slice_size, A->sellASliceMrl, A->sellAPermColumns, A->sellAPermValues, INDEX_TYPE, INDEX_TYPE,
            NVPL_SPARSE_INDEX_BASE_ZERO, NVPL_SPARSE_R_64F);
        double alpha = 1.0, beta = 0.0;
        size_t e_buf_size = 0;
        size_t l_buf_size = 0, u_buf_size = 0, i_buf_size = 0, max_buf_size = 0;
        nvpl_sparse_create_dn_vec(&(A->nvplSparseOpt.vecX), nrow, x.values, NVPL_SPARSE_R_64F);
        nvpl_sparse_create_dn_vec(&(A->nvplSparseOpt.vecY), nrow, b.values, NVPL_SPARSE_R_64F);
        max_buf_size = e_buf_size;
        // SpMV buffer sizing
        // Lower
        nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvLDescr);
        nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
            A->nvplSparseOpt.matL, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
            NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvLDescr, &l_buf_size);
        // Upper
        nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvUDescr);
        nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
            A->nvplSparseOpt.matU, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
            NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvUDescr, &u_buf_size);
        // L+D+U (full matrix)
        nvpl_sparse_spmv_create_descr(&A->nvplSparseOpt.spmvADescr);
        nvpl_sparse_spmv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
            A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, &beta, A->nvplSparseOpt.vecY, A->nvplSparseOpt.vecY,
            NVPL_SPARSE_R_64F, NVPL_SPARSE_SPMV_ALG_DEFAULT, A->nvplSparseOpt.spmvADescr, &i_buf_size);
        max_buf_size = std::max(std::max(i_buf_size, e_buf_size), std::max(u_buf_size, l_buf_size));
        // Triangular-solve (SpSV) setup
        // Lower
        size_t buffer_size_sv_l, buffer_size_sv_u;
        nvpl_sparse_fill_mode_t fillmode_l = NVPL_SPARSE_FILL_MODE_LOWER;
        nvpl_sparse_fill_mode_t fillmode_u = NVPL_SPARSE_FILL_MODE_UPPER;
        nvpl_sparse_diag_type_t diagtype = NVPL_SPARSE_DIAG_TYPE_NON_UNIT;
        nvpl_sparse_spsv_create_descr(&A->nvplSparseOpt.spsvDescrL);
        nvpl_sparse_spsv_create_descr(&A->nvplSparseOpt.spsvDescrU);
        nvpl_sparse_sp_mat_set_attribute(
            A->nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_DIAG_TYPE, &(diagtype), sizeof(diagtype));
        nvpl_sparse_sp_mat_set_attribute(
            A->nvplSparseOpt.matL, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
        // Keep the unpermuted diagonal so spsv_update_matrix can restore it
        // after the strictly-triangular analysis passes below.
        Vector origDiagA;
        InitializeVector(origDiagA, A->localNumberOfRows, CPU);
        CopyMatrixDiagonal(*A, origDiagA);
        // Pass strictly L, and then update the diagonal
        if (!Use_Hpcg_Mem_Reduction || A->localNumberOfRows % 8 != 0)
        {
            // General path: analyze the full matrix restricted to its lower
            // triangle and allocate a dedicated scratch buffer.
            nvpl_sparse_sp_mat_set_attribute(
                A->nvplSparseOpt.matA, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_l), sizeof(fillmode_l));
            nvpl_sparse_spsv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, &buffer_size_sv_l);
            A->bufferSvL = new char[buffer_size_sv_l];
            mem += buffer_size_sv_l;
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, A->bufferSvL);
        }
        else
        {
            // Memory-reduced path: analyze the strict-L matrix directly.
            // NOTE(review): A->bufferSvL is used here without a visible
            // allocation in this branch -- presumably AllocateMemCpu(A_in)
            // provided it; confirm.
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matL, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrL, A->bufferSvL);
            nvpl_sparse_spsv_update_matrix(
                nvpl_sparse_handle, A->nvplSparseOpt.spsvDescrL, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        }
        // Pass strictly U, and then update diagonal
        nvpl_sparse_sp_mat_set_attribute(
            A->nvplSparseOpt.matU, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
        if (!Use_Hpcg_Mem_Reduction || A->localNumberOfRows % 8 != 0)
        {
            nvpl_sparse_sp_mat_set_attribute(
                A->nvplSparseOpt.matA, NVPL_SPARSE_SPMAT_FILL_MODE, &(fillmode_u), sizeof(fillmode_u));
            nvpl_sparse_spsv_buffer_size(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, &buffer_size_sv_u);
            A->bufferSvU = new char[buffer_size_sv_u];
            mem += buffer_size_sv_u;
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matA, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, A->bufferSvU);
        }
        else
        {
            nvpl_sparse_spsv_analysis(nvpl_sparse_handle, NVPL_SPARSE_OPERATION_NON_TRANSPOSE, &alpha,
                A->nvplSparseOpt.matU, A->nvplSparseOpt.vecX, A->nvplSparseOpt.vecY, NVPL_SPARSE_R_64F,
                NVPL_SPARSE_SPSV_ALG_DEFAULT, A->nvplSparseOpt.spsvDescrU, A->bufferSvU);
            nvpl_sparse_spsv_update_matrix(
                nvpl_sparse_handle, A->nvplSparseOpt.spsvDescrU, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        }
        DeleteVector(origDiagA);
        //////////////////////////////////////////////////////////////////////////////////////////////////////////
        A = A->Ac; // descend to the next-coarser level
    }
    A = &A_in;
    for (int level = 1; level < numberOfMgLevels; level++)
    {
        local_int_t nrow_c = A->Ac->localNumberOfRows;
        local_int_t nrow_f = A->localNumberOfRows;
        // Permute space injector operator
        F2cPermCpu(nrow_c, A->mgData->f2cOperator, A->f2cPerm, A->ref2opt, A->Ac->opt2ref);
        A = A->Ac;
    }
    return mem;
}
#endif // USE_GRACE
size_t OptimizeProblem(SparseMatrix& A_in, CGData& data, Vector& b, Vector& x, Vector& xexact)
{
    // Dispatch to the backend matching this rank's device type. A build
    // without the corresponding backend leaves the problem untouched and
    // reports zero retained bytes.
    size_t retainedBytes = 0;
    const bool isGpuRank = (A_in.rankType == GPU);
    if (isGpuRank)
    {
#ifdef USE_CUDA
        retainedBytes = OptimizeProblemGpu(A_in, data, b, x, xexact);
#endif
    }
    else
    {
#ifdef USE_GRACE
        retainedBytes = OptimizeProblemCpu(A_in, data, b, x, xexact);
#endif
    }
    return retainedBytes;
}
// Helper function (see OptimizeProblem.hpp for details)
double OptimizeProblemMemoryUse(const SparseMatrix& A)
{
    // No additional retained bytes are attributed through this hook;
    // the backend optimizers account for their own allocations.
    static_cast<void>(A); // unreferenced by design
    return 0.0;
}

30
src/OptimizeProblem.hpp Normal file
View File

@@ -0,0 +1,30 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef OPTIMIZEPROBLEM_HPP
#define OPTIMIZEPROBLEM_HPP
#include "CGData.hpp"
#include "SparseMatrix.hpp"
#include "Vector.hpp"
size_t OptimizeProblem(SparseMatrix& A, CGData& data, Vector& b, Vector& x, Vector& xexact);
// This helper function should be implemented in a non-trivial way if OptimizeProblem is non-trivial
// It should return as type double, the total number of bytes allocated and retained after calling OptimizeProblem.
// This value will be used to report Gbytes used in ReportResults (the value returned will be divided by 1000000000.0).
double OptimizeProblemMemoryUse(const SparseMatrix& A);
#endif // OPTIMIZEPROBLEM_HPP

176
src/OutputFile.cpp Normal file
View File

@@ -0,0 +1,176 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <fstream>
#include <iostream>
#include <list>
#include <sstream>
#include <string>
#include "OutputFile.hpp"
using std::string;
using std::stringstream;
using std::list;
using std::ofstream;
extern int use_output_file;
// Root-node constructor: the name/version pair becomes the report header and
// is also used to build the output file name in generate().
OutputFile::OutputFile(const string& name_arg, const string& version_arg)
    : name(name_arg)
    , version(version_arg)
    , eol("\n")
    , keySeparator("::")
{
}
// Descendant-node constructor: key/value are filled in later via
// setKeyValue(); name/version stay empty (only the root uses them).
OutputFile::OutputFile(void)
    : eol("\n")
    , keySeparator("::")
{
}
OutputFile::~OutputFile()
{
    // The tree owns its children: deleting the root recursively releases
    // every descendant through their own destructors.
    for (OutputFile* child : descendants)
        delete child;
}
// Append a key/value leaf (string value) to this node's list of descendants.
void OutputFile::add(const string& key_arg, const string& value_arg)
{
    descendants.push_back(allocKeyVal(key_arg, value_arg));
}
void OutputFile::add(const string& key_arg, double value_arg)
{
stringstream ss;
ss << value_arg;
descendants.push_back(allocKeyVal(key_arg, ss.str()));
}
void OutputFile::add(const string& key_arg, int value_arg)
{
    // std::to_string yields the same text as stream insertion for integers.
    descendants.push_back(allocKeyVal(key_arg, std::to_string(value_arg)));
}
#ifndef HPCG_NO_LONG_LONG
void OutputFile::add(const string& key_arg, long long value_arg)
{
    // std::to_string yields the same text as stream insertion for integers.
    descendants.push_back(allocKeyVal(key_arg, std::to_string(value_arg)));
}
#endif
void OutputFile::add(const string& key_arg, size_t value_arg)
{
    // std::to_string yields the same text as stream insertion for integers.
    descendants.push_back(allocKeyVal(key_arg, std::to_string(value_arg)));
}
// Set this node's own key/value pair (used on descendant nodes; the root
// prints name/version instead).
void OutputFile::setKeyValue(const string& key_arg, const string& value_arg)
{
    key = key_arg;
    value = value_arg;
}
OutputFile* OutputFile::get(const string& key_arg)
{
    // Linear scan over direct children; first match wins, null when absent.
    for (OutputFile* child : descendants)
    {
        if (child->key == key_arg)
            return child;
    }
    return nullptr;
}
string OutputFile::generateRecursive(string prefix)
{
    // Emit this node's "prefix+key=value" line, then recurse with the key
    // appended to the prefix so children print fully-qualified keys.
    string out = prefix + key + "=" + value + eol;
    const string childPrefix = prefix + key + keySeparator;
    for (OutputFile* child : descendants)
        out += child->generateRecursive(childPrefix);
    return out;
}
string OutputFile::generate(void)
{
    // Render the full report: a header line with the benchmark name/version,
    // then every key=value line of the hierarchy in insertion order. The
    // result is written to a timestamped .txt file (or stdout, depending on
    // use_output_file) and also returned to the caller.
    string result = name + "\nversion=" + version + eol;
    for (list<OutputFile*>::iterator it = descendants.begin(); it != descendants.end(); ++it)
    {
        result += (*it)->generateRecursive("");
    }
    // Timestamp for the output file name (YYYY-MM-DD_HH-MM-SS).
    time_t rawtime;
    time(&rawtime);
    tm* ptm = localtime(&rawtime);
    char sdate[64] = "unknown-date";
    if (ptm != NULL)
    {
        // use tm_mon+1 because tm_mon is 0 .. 11 instead of 1 .. 12
        // FIX: snprintf (not sprintf) bounds the write to the buffer, and the
        // localtime result is checked before being dereferenced.
        snprintf(sdate, sizeof(sdate), "%04d-%02d-%02d_%02d-%02d-%02d", ptm->tm_year + 1900, ptm->tm_mon + 1,
            ptm->tm_mday, ptm->tm_hour, ptm->tm_min, ptm->tm_sec);
    }
    string filename = name + "_" + version + "_";
    filename += string(sdate) + ".txt";
    if (use_output_file)
    {
        ofstream myfile(filename.c_str());
        myfile << result;
        myfile.close();
    }
    else
    {
        std::cout << result << std::flush;
    }
    return result;
}
OutputFile* OutputFile::allocKeyVal(const std::string& key_arg, const std::string& value_arg)
{
    // Factory for leaf nodes; ownership passes to the caller (normally a
    // parent's descendants list, released in ~OutputFile).
    OutputFile* node = new OutputFile();
    node->setKeyValue(key_arg, value_arg);
    return node;
}

161
src/OutputFile.hpp Normal file
View File

@@ -0,0 +1,161 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file Output_File.hpp
HPCG output file classes
*/
#ifndef OUTPUTFILE_HPP
#define OUTPUTFILE_HPP
#include <list>
#include <string>
//! The OutputFile class for the uniform collecting and reporting of performance data for HPCG
/*!
The OutputFile class facilitates easy collecting and reporting of
key-value-formatted data that can be then registered with the HPCG results
collection website. The keys may have hierarchy key1::key2::key3=val with
double colon :: as a separator. A sample output may look like this (note how
"major" and "micro" keys repeat with different ancestor keys):
\code
version=3.2.1alpha
version::major=3
version::minor=2
version::micro=1
version::release=alpha
axis=xyz
axis::major=x
axis::minor=y
\endcode
*/
class OutputFile
{
protected:
    std::list<OutputFile*> descendants; //!< descendant elements (owned; deleted recursively by the destructor)
    std::string name;                   //!< name of the benchmark
    std::string version;                //!< version of the benchmark
    std::string key;                    //!< the key under which the element is stored
    std::string value;                  //!< the value of the stored element
    std::string eol;                    //!< end-of-line character sequence in the output file
    std::string keySeparator;           //!< character sequence to separate keys in the output file
    //! Recursively generate output string from descendant list, and their descendants and so on
    std::string generateRecursive(std::string prefix);

public:
    //! Allocate a new key/value leaf node; the caller assumes ownership
    //! (typically by appending it to a parent's descendant list).
    static OutputFile* allocKeyVal(const std::string& key, const std::string& value);
    //! Constructor: accepts name and version as strings that are used to create a file name for printing results.
    /*!
      This constructor accepts and name and version number for the benchmark that
      are used to form a file name information for results that are generated by
      the generate() method.
      \param name (in) string containing name of the benchmark
      \param version (in) string containing the version of the benchmark
    */
    OutputFile(const std::string& name, const std::string& version);
    //! Default constructor: no-arguments accepted, should be used for descendant nodes
    /*!
      This no-argument constructor can be used for descendant nodes to provide
      key1::key2::key3=val output. Unlike the root node, descendant nodes do not
      have name and version but only store key-value pairs.
    */
    OutputFile(void);
    ~OutputFile();
    //! Create and add a descendant element with value of type "string"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.
      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, const std::string& value);
    //! Create and add a descendant element with value of type "double"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.
      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, double value);
    //! Create and add a descendant element with value of type "int"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.
      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, int value);
#ifndef HPCG_NO_LONG_LONG
    //! Create and add a descendant element with value of type "long long"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.
      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, long long value);
#endif
    //! Create and add a descendant element with value of type "size_t"
    /*!
      Create and add a descendant element identified by "key" and associated with
      "value". The element is added at the end of a list of previously added
      elements.
      @param[in] key The key that identifies the added element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void add(const std::string& key, size_t value);
    //! Key-Value setter method
    /*!
      Set the key and the value of this element.
      @param[in] key The key that identifies this element and under which the element is stored
      @param[in] value The value stored by the element
    */
    void setKeyValue(const std::string& key, const std::string& value);
    //! Get the element in the list with the given key or return NULL if not found
    OutputFile* get(const std::string& key);
    //! Generate output string with results based on the stored key-value hierarchy
    //! (also writes it to a timestamped file or to stdout, per use_output_file)
    std::string generate(void);
};
#endif // OUTPUTFILE_HPP

79
src/ReadHpcgDat.cpp Normal file
View File

@@ -0,0 +1,79 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#include <cstdio>
#include "ReadHpcgDat.hpp"
// Consume characters until end-of-line (or EOF). A Windows "\r\n" pair is
// consumed as a single line ending. Returns the terminating character
// ('\n', '\r', or EOF).
static int SkipUntilEol(FILE* stream)
{
    int ch = fgetc(stream);
    while (ch != EOF && ch != '\n' && ch != '\r')
        ch = fgetc(stream);
    if (ch == '\r')
    { // on Windows, \r might be followed by \n
        int peek = fgetc(stream);
        if (peek == '\n' || peek == EOF)
            ch = peek;
        else
            ungetc(peek, stream);
    }
    return ch;
}
// Parse hpcg.dat: two title lines, then local grid dimensions (min 16 each),
// then run time in seconds (optional; default 30 minutes), then the process
// grid dimensions (0 = unspecified, resolved later).
// Returns 0 on success, -1 if the file cannot be opened.
int ReadHpcgDat(int* localDimensions, int* secondsPerRun, int* localProcDimensions, char* filename)
{
    FILE* hpcgStream = fopen(filename, "r");
    if (hpcgStream == NULL)
    {
        printf("Cannot open input file: %s\n", filename);
        return -1;
    }
    SkipUntilEol(hpcgStream); // skip the first title line
    SkipUntilEol(hpcgStream); // skip the second title line
    // Third line: nx ny nz, each clamped up to the minimum of 16
    for (int d = 0; d < 3; d++)
    {
        int dim;
        if (fscanf(hpcgStream, "%d", &dim) == 1 && dim >= 16)
            localDimensions[d] = dim;
        else
            localDimensions[d] = 16;
    }
    SkipUntilEol(hpcgStream); // skip the rest of the dimensions line
    if (secondsPerRun != 0)
    { // Only read number of seconds if the pointer is non-zero
        int seconds;
        if (fscanf(hpcgStream, "%d", &seconds) == 1 && seconds >= 0)
            *secondsPerRun = seconds;
        else
            *secondsPerRun = 30 * 60; // default: 30 minutes
    }
    SkipUntilEol(hpcgStream); // skip the rest of the run-time line
    // Fifth line: process grid dimensions; invalid or missing values mean
    // "not specified" and are fixed up later.
    for (int d = 0; d < 3; d++)
    {
        int procDim;
        if (fscanf(hpcgStream, "%d", &procDim) == 1 && procDim >= 1)
            localProcDimensions[d] = procDim;
        else
            localProcDimensions[d] = 0;
    }
    fclose(hpcgStream);
    return 0;
}

20
src/ReadHpcgDat.hpp Normal file
View File

@@ -0,0 +1,20 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef READHPCGDAT_HPP
#define READHPCGDAT_HPP
int ReadHpcgDat(int* localDimensions, int* secondsPerRun, int* localProcDimensions, char* filename);
#endif // READHPCGDAT_HPP

512
src/ReportResults.cpp Normal file
View File

@@ -0,0 +1,512 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file ReportResults.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#include "OptimizeProblem.hpp"
#include "OutputFile.hpp"
#include "ReportResults.hpp"
#include <vector>
#ifdef HPCG_DEBUG
#include <fstream>
using std::endl;
#include "hpcg.hpp"
#endif
extern int use_output_file;
/*!
Creates a YAML file and writes the information about the HPCG run, its results, and validity.
@param[in] geom The description of the problem's geometry.
@param[in] A The known system matrix
@param[in] numberOfMgLevels Number of levels in multigrid V cycle
@param[in] numberOfCgSets Number of CG runs performed
@param[in] niters Number of preconditioned CG iterations performed to lower the residual below a threshold
@param[in] times Vector of cumulative timings for each of the phases of a preconditioned CG iteration
@param[in] testcg_data the data structure with the results of the CG-correctness test including pass/fail
information
@param[in] testsymmetry_data the data structure with the results of the CG symmetry test including pass/fail
information
@param[in] testnorms_data the data structure with the results of the CG norm test including pass/fail information
@param[in] global_failure indicates whether a failure occurred during the correctness tests of CG
@see YAML_Doc
*/
void ReportResults(const SparseMatrix& A, int numberOfMgLevels, int numberOfCgSets, int refMaxIters, int optMaxIters,
    double times[], const TestCGData& testcg_data, const TestSymmetryData& testsymmetry_data,
    const TestNormsData& testnorms_data, int global_failure, bool quickPath)
{
    const double minOfficialTime = 1800; // Any official benchmark result must run at least this many seconds
#ifndef HPCG_NO_MPI
    // Reduce the DDOT MPI_Allreduce time (times[4]) across all ranks so rank 0
    // can report its min/max/avg variation.
    double t4 = times[4];
    double t4min = 0.0;
    double t4max = 0.0;
    double t4avg = 0.0;
    MPI_Allreduce(&t4, &t4min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);
    MPI_Allreduce(&t4, &t4max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    MPI_Allreduce(&t4, &t4avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    t4avg = t4avg / ((double) A.geom->size);
#endif
    if (A.geom->rank == 0)
    { // Only PE 0 needs to compute and report timing results
        // TODO: Put the FLOP count, Memory BW and Memory Usage models into separate functions
        // ======================== FLOP count model =======================================
        double fNumberOfCgSets = numberOfCgSets;
        double fniters = fNumberOfCgSets * (double) optMaxIters;
        double fnrow = A.totalNumberOfRows;
        double fnnz = A.totalNumberOfNonzeros;
        // Op counts come from implementation of CG in CG.cpp (include 1 extra for the CG preamble ops)
        double fnops_ddot = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow; // 3 ddots with nrow adds and nrow mults
        double fnops_waxpby
            = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow; // 3 WAXPBYs with nrow adds and nrow mults
        double fnops_sparsemv = (fniters + fNumberOfCgSets) * 2.0 * fnnz; // 1 SpMV with nnz adds and nnz mults
        // Op counts from the multigrid preconditioners
        double fnops_precond = 0.0;
        const SparseMatrix* Af = &A;
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            double fnnz_Af = Af->totalNumberOfNonzeros;
            double fnumberOfPresmootherSteps = Af->mgData->numberOfPresmootherSteps;
            double fnumberOfPostsmootherSteps = Af->mgData->numberOfPostsmootherSteps;
            fnops_precond += fnumberOfPresmootherSteps * fniters * 4.0 * fnnz_Af; // number of presmoother flops
            fnops_precond += fniters * 2.0 * fnnz_Af; // cost of fine grid residual calculation
            fnops_precond += fnumberOfPostsmootherSteps * fniters * 4.0 * fnnz_Af; // number of postsmoother flops
            Af = Af->Ac; // Go to next coarse level
        }
        fnops_precond
            += fniters * 4.0 * ((double) Af->totalNumberOfNonzeros); // One symmetric GS sweep at the coarsest level
        double fnops = fnops_ddot + fnops_waxpby + fnops_sparsemv + fnops_precond;
        // Scale op count to the reference iteration count to charge for any extra optimized iterations
        double frefnops = fnops * ((double) refMaxIters) / ((double) optMaxIters);
        // ======================== Memory bandwidth model =======================================
        // Read/Write counts come from implementation of CG in CG.cpp (include 1 extra for the CG preamble ops)
        double fnreads_ddot
            = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow * sizeof(double); // 3 ddots with 2 nrow reads
        double fnwrites_ddot = (3.0 * fniters + fNumberOfCgSets) * sizeof(double); // 3 ddots with 1 write
        double fnreads_waxpby = (3.0 * fniters + fNumberOfCgSets) * 2.0 * fnrow
            * sizeof(double); // 3 WAXPBYs with nrow adds and nrow mults
        double fnwrites_waxpby
            = (3.0 * fniters + fNumberOfCgSets) * fnrow * sizeof(double); // 3 WAXPBYs with nrow adds and nrow mults
        double fnreads_sparsemv = (fniters + fNumberOfCgSets)
            * (fnnz * (sizeof(double) + sizeof(local_int_t))
                + fnrow * sizeof(double)); // 1 SpMV with nnz reads of values, nnz reads indices,
                                           // plus nrow reads of x
        double fnwrites_sparsemv = (fniters + fNumberOfCgSets) * fnrow * sizeof(double); // 1 SpMV nrow writes
        // Byte counts from the multigrid preconditioners
        double fnreads_precond = 0.0;
        double fnwrites_precond = 0.0;
        Af = &A;
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            double fnnz_Af = Af->totalNumberOfNonzeros;
            double fnrow_Af = Af->totalNumberOfRows;
            double fnumberOfPresmootherSteps = Af->mgData->numberOfPresmootherSteps;
            double fnumberOfPostsmootherSteps = Af->mgData->numberOfPostsmootherSteps;
            fnreads_precond += fnumberOfPresmootherSteps * fniters
                * (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t))
                    + fnrow_Af * sizeof(double)); // number of presmoother reads
            fnwrites_precond
                += fnumberOfPresmootherSteps * fniters * fnrow_Af * sizeof(double); // number of presmoother writes
            fnreads_precond += fniters
                * (fnnz_Af * (sizeof(double) + sizeof(local_int_t))
                    + fnrow_Af * sizeof(double)); // Number of reads for fine grid residual calculation
            fnwrites_precond
                += fniters * fnnz_Af * sizeof(double); // Number of writes for fine grid residual calculation
            fnreads_precond += fnumberOfPostsmootherSteps * fniters
                * (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t))
                    + fnrow_Af * sizeof(double)); // number of postsmoother reads
            fnwrites_precond
                += fnumberOfPostsmootherSteps * fniters * fnnz_Af * sizeof(double); // number of postsmoother writes
            Af = Af->Ac; // Go to next coarse level
        }
        double fnnz_Af = Af->totalNumberOfNonzeros;
        double fnrow_Af = Af->totalNumberOfRows;
        fnreads_precond += fniters
            * (2.0 * fnnz_Af * (sizeof(double) + sizeof(local_int_t))
                + fnrow_Af * sizeof(double)); // One symmetric GS sweep at the coarsest level
        fnwrites_precond += fniters * fnrow_Af * sizeof(double); // One symmetric GS sweep at the coarsest level
        double fnreads = fnreads_ddot + fnreads_waxpby + fnreads_sparsemv + fnreads_precond;
        double fnwrites = fnwrites_ddot + fnwrites_waxpby + fnwrites_sparsemv + fnwrites_precond;
        double frefnreads = fnreads * ((double) refMaxIters) / ((double) optMaxIters);
        double frefnwrites = fnwrites * ((double) refMaxIters) / ((double) optMaxIters);
        // ======================== Memory usage model =======================================
        // Data in GenerateProblem_ref
        double numberOfNonzerosPerRow
            = 27.0; // We are approximating a 27-point finite element/volume/difference 3D stencil
        double size = ((double) A.geom->size); // Needed for estimating size of halo
        double fnbytes = ((double) sizeof(Geometry)); // Geometry struct in main.cpp
        fnbytes += ((double) sizeof(double) * fNumberOfCgSets); // testnorms_data in main.cpp
        // Model for GenerateProblem_ref.cpp
        fnbytes += fnrow * sizeof(char); // array nonzerosInRow
        fnbytes += fnrow * ((double) sizeof(global_int_t*)); // mtxIndG
        fnbytes += fnrow * ((double) sizeof(local_int_t*)); // mtxIndL
        fnbytes += fnrow * ((double) sizeof(double*)); // matrixValues
        fnbytes += fnrow * ((double) sizeof(double*)); // matrixDiagonal
        fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(local_int_t)); // mtxIndL[1..nrows]
        fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(double)); // matrixValues[1..nrows]
        fnbytes += fnrow * numberOfNonzerosPerRow * ((double) sizeof(global_int_t)); // mtxIndG[1..nrows]
        fnbytes += fnrow * ((double) 3 * sizeof(double)); // x, b, xexact
        // Model for CGData.hpp
        double fncol = ((global_int_t) A.localNumberOfColumns)
            * size; // Estimate of the global number of columns using the value from rank 0
        fnbytes += fnrow * ((double) 2 * sizeof(double)); // r, Ap
        fnbytes += fncol * ((double) 2 * sizeof(double)); // z, p
        std::vector<double> fnbytesPerLevel(numberOfMgLevels); // Count byte usage per level (level 0 is main CG level)
        fnbytesPerLevel[0] = fnbytes;
        // Benchmarker-provided model for OptimizeProblem.cpp
        double fnbytes_OptimizedProblem = OptimizeProblemMemoryUse(A);
        fnbytes += fnbytes_OptimizedProblem;
        Af = A.Ac;
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            double fnrow_Af = Af->totalNumberOfRows;
            double fncol_Af = ((global_int_t) Af->localNumberOfColumns)
                * size; // Estimate of the global number of columns using the value from rank 0
            double fnbytes_Af = 0.0;
            // Model for GenerateCoarseProblem.cpp
            fnbytes_Af += fnrow_Af * ((double) sizeof(local_int_t)); // f2cOperator
            fnbytes_Af += fnrow_Af * ((double) sizeof(double)); // rc
            fnbytes_Af += 2.0 * fncol_Af
                * ((double) sizeof(double)); // xc, Axf are estimated based on the size of these arrays on rank 0
            fnbytes_Af += ((double) (sizeof(Geometry) + sizeof(SparseMatrix) + 3 * sizeof(Vector)
                + sizeof(MGData))); // Account for structs geomc, Ac, rc, xc, Axf - (minor)
            // Model for GenerateProblem.cpp (called within GenerateCoarseProblem.cpp)
            fnbytes_Af += fnrow_Af * sizeof(char); // array nonzerosInRow
            fnbytes_Af += fnrow_Af * ((double) sizeof(global_int_t*)); // mtxIndG
            fnbytes_Af += fnrow_Af * ((double) sizeof(local_int_t*)); // mtxIndL
            fnbytes_Af += fnrow_Af * ((double) sizeof(double*)); // matrixValues
            fnbytes_Af += fnrow_Af * ((double) sizeof(double*)); // matrixDiagonal
            fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(local_int_t)); // mtxIndL[1..nrows]
            fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(double)); // matrixValues[1..nrows]
            fnbytes_Af += fnrow_Af * numberOfNonzerosPerRow * ((double) sizeof(global_int_t)); // mtxIndG[1..nrows]
            // Model for SetupHalo_ref.cpp
#ifndef HPCG_NO_MPI
            fnbytes_Af += ((double) sizeof(double) * Af->totalToBeSent); // sendBuffer
            fnbytes_Af += ((double) sizeof(local_int_t) * Af->totalToBeSent); // elementsToSend
            fnbytes_Af += ((double) sizeof(int) * Af->numberOfSendNeighbors); // neighbors
            fnbytes_Af += ((double) sizeof(local_int_t) * Af->numberOfSendNeighbors); // receiveLength, sendLength
#endif
            fnbytesPerLevel[i] = fnbytes_Af;
            fnbytes += fnbytes_Af; // Running sum
            Af = Af->Ac; // Go to next coarse level
        }
        assert(Af == 0); // Make sure we got to the lowest grid level
        // Count number of bytes used per equation
        double fnbytesPerEquation = fnbytes / fnrow;
        // Instantiate YAML document
        OutputFile doc("HPCG-Benchmark", "3.1");
        doc.add("Release date", "March 28, 2019");
        doc.add("Machine Summary", "");
        doc.get("Machine Summary")->add("Distributed Processes", A.geom->size);
        doc.get("Machine Summary")->add("Threads per processes", A.geom->numThreads);
        doc.add("Global Problem Dimensions", "");
        doc.get("Global Problem Dimensions")->add("Global nx", A.geom->gnx);
        doc.get("Global Problem Dimensions")->add("Global ny", A.geom->gny);
        doc.get("Global Problem Dimensions")->add("Global nz", A.geom->gnz);
        doc.add("Processor Dimensions", "");
        doc.get("Processor Dimensions")->add("npx", A.geom->npx);
        doc.get("Processor Dimensions")->add("npy", A.geom->npy);
        doc.get("Processor Dimensions")->add("npz", A.geom->npz);
        doc.add("Local Domain Dimensions", "");
        doc.get("Local Domain Dimensions")->add("nx", A.geom->nx);
        doc.get("Local Domain Dimensions")->add("ny", A.geom->ny);
        doc.get("Local Domain Dimensions")->add("nz", A.geom->nz); // was missing; report all three local dims
        doc.add("########## Problem Summary  ##########", "");
        doc.add("Setup Information", "");
        doc.get("Setup Information")->add("Setup Time", times[9]);
        doc.add("Linear System Information", "");
        doc.get("Linear System Information")->add("Number of Equations", A.totalNumberOfRows);
        doc.get("Linear System Information")->add("Number of Nonzero Terms", A.totalNumberOfNonzeros);
        doc.add("Multigrid Information", "");
        doc.get("Multigrid Information")->add("Number of coarse grid levels", numberOfMgLevels - 1);
        Af = &A;
        doc.get("Multigrid Information")->add("Coarse Grids", "");
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            doc.get("Multigrid Information")->get("Coarse Grids")->add("Grid Level", i);
            doc.get("Multigrid Information")
                ->get("Coarse Grids")
                ->add("Number of Equations", Af->Ac->totalNumberOfRows);
            doc.get("Multigrid Information")
                ->get("Coarse Grids")
                ->add("Number of Nonzero Terms", Af->Ac->totalNumberOfNonzeros);
            doc.get("Multigrid Information")
                ->get("Coarse Grids")
                ->add("Number of Presmoother Steps", Af->mgData->numberOfPresmootherSteps);
            doc.get("Multigrid Information")
                ->get("Coarse Grids")
                ->add("Number of Postsmoother Steps", Af->mgData->numberOfPostsmootherSteps);
            Af = Af->Ac;
        }
        doc.add("########## Memory Use Summary  ##########", "");
        doc.add("Memory Use Information", "");
        doc.get("Memory Use Information")->add("Total memory used for data (Gbytes)", fnbytes / 1000000000.0);
        doc.get("Memory Use Information")
            ->add("Memory used for OptimizeProblem data (Gbytes)", fnbytes_OptimizedProblem / 1000000000.0);
        doc.get("Memory Use Information")
            ->add("Bytes per equation (Total memory / Number of Equations)", fnbytesPerEquation);
        doc.get("Memory Use Information")
            ->add("Memory used for linear system and CG (Gbytes)", fnbytesPerLevel[0] / 1000000000.0);
        doc.get("Memory Use Information")->add("Coarse Grids", "");
        for (int i = 1; i < numberOfMgLevels; ++i)
        {
            doc.get("Memory Use Information")->get("Coarse Grids")->add("Grid Level", i);
            doc.get("Memory Use Information")
                ->get("Coarse Grids")
                ->add("Memory used", fnbytesPerLevel[i] / 1000000000.0);
        }
        doc.add("########## V&V Testing Summary  ##########", "");
        doc.add("Spectral Convergence Tests", "");
        if (testcg_data.count_fail == 0)
            doc.get("Spectral Convergence Tests")->add("Result", "PASSED");
        else
            doc.get("Spectral Convergence Tests")->add("Result", "FAILED");
        doc.get("Spectral Convergence Tests")->add("Unpreconditioned", "");
        doc.get("Spectral Convergence Tests")
            ->get("Unpreconditioned")
            ->add("Maximum iteration count", testcg_data.niters_max_no_prec);
        doc.get("Spectral Convergence Tests")
            ->get("Unpreconditioned")
            ->add("Expected iteration count", testcg_data.expected_niters_no_prec);
        doc.get("Spectral Convergence Tests")->add("Preconditioned", "");
        doc.get("Spectral Convergence Tests")
            ->get("Preconditioned")
            ->add("Maximum iteration count", testcg_data.niters_max_prec);
        doc.get("Spectral Convergence Tests")
            ->get("Preconditioned")
            ->add("Expected iteration count", testcg_data.expected_niters_prec);
        const char DepartureFromSymmetry[] = "Departure from Symmetry |x'Ay-y'Ax|/(2*||x||*||A||*||y||)/epsilon";
        doc.add(DepartureFromSymmetry, "");
        if (testsymmetry_data.count_fail == 0)
            doc.get(DepartureFromSymmetry)->add("Result", "PASSED");
        else
            doc.get(DepartureFromSymmetry)->add("Result", "FAILED");
        doc.get(DepartureFromSymmetry)->add("Departure for SpMV", testsymmetry_data.depsym_spmv);
        doc.get(DepartureFromSymmetry)->add("Departure for MG", testsymmetry_data.depsym_mg);
        doc.add("########## Iterations Summary  ##########", "");
        doc.add("Iteration Count Information", "");
        if (!global_failure)
            doc.get("Iteration Count Information")->add("Result", "PASSED");
        else
            doc.get("Iteration Count Information")->add("Result", "FAILED");
        doc.get("Iteration Count Information")->add("Reference CG iterations per set", refMaxIters);
        doc.get("Iteration Count Information")->add("Optimized CG iterations per set", optMaxIters);
        doc.get("Iteration Count Information")
            ->add("Total number of reference iterations", refMaxIters * numberOfCgSets);
        doc.get("Iteration Count Information")
            ->add("Total number of optimized iterations", optMaxIters * numberOfCgSets);
        doc.add("########## Reproducibility Summary  ##########", "");
        doc.add("Reproducibility Information", "");
        if (testnorms_data.pass)
            doc.get("Reproducibility Information")->add("Result", "PASSED");
        else
            doc.get("Reproducibility Information")->add("Result", "FAILED");
        doc.get("Reproducibility Information")->add("Scaled residual mean", testnorms_data.mean);
        doc.get("Reproducibility Information")->add("Scaled residual variance", testnorms_data.variance);
        doc.add("########## Performance Summary (times in sec) ##########", "");
        doc.add("Benchmark Time Summary", "");
        doc.get("Benchmark Time Summary")->add("Optimization phase", times[7]);
        doc.get("Benchmark Time Summary")->add("DDOT", times[1]);
        doc.get("Benchmark Time Summary")->add("WAXPBY", times[2]);
        doc.get("Benchmark Time Summary")->add("SpMV", times[3]);
        doc.get("Benchmark Time Summary")->add("MG", times[5]);
        doc.get("Benchmark Time Summary")->add("Total", times[0]);
        doc.add("Floating Point Operations Summary", "");
        doc.get("Floating Point Operations Summary")->add("Raw DDOT", fnops_ddot);
        doc.get("Floating Point Operations Summary")->add("Raw WAXPBY", fnops_waxpby);
        doc.get("Floating Point Operations Summary")->add("Raw SpMV", fnops_sparsemv);
        doc.get("Floating Point Operations Summary")->add("Raw MG", fnops_precond);
        doc.get("Floating Point Operations Summary")->add("Total", fnops);
        doc.get("Floating Point Operations Summary")->add("Total with convergence overhead", frefnops);
        doc.add("GB/s Summary", "");
        doc.get("GB/s Summary")->add("Raw Read B/W", fnreads / times[0] / 1.0E9);
        doc.get("GB/s Summary")->add("Raw Write B/W", fnwrites / times[0] / 1.0E9);
        doc.get("GB/s Summary")->add("Raw Total B/W", (fnreads + fnwrites) / (times[0]) / 1.0E9);
        doc.get("GB/s Summary")
            ->add("Total with convergence and optimization phase overhead",
                (frefnreads + frefnwrites) / (times[0] + fNumberOfCgSets * (times[7] / 10.0 + times[9] / 10.0))
                    / 1.0E9);
        doc.add("GFLOP/s Summary", "");
        doc.get("GFLOP/s Summary")->add("Raw DDOT", fnops_ddot / times[1] / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Raw WAXPBY", fnops_waxpby / times[2] / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Raw SpMV", fnops_sparsemv / (times[3]) / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Raw MG", fnops_precond / (times[5]) / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Raw Total", fnops / times[0] / 1.0E9);
        doc.get("GFLOP/s Summary")->add("Total with convergence overhead", frefnops / times[0] / 1.0E9);
        // This final GFLOP/s rating includes the overhead of problem setup and optimizing the data structures vs ten
        // sets of 50 iterations of CG
        double totalGflops = frefnops / (times[0] + fNumberOfCgSets * (times[7] / 10.0 + times[9] / 10.0)) / 1.0E9;
        double totalGflops24 = frefnops / (times[0] + fNumberOfCgSets * times[7] / 10.0) / 1.0E9;
        doc.get("GFLOP/s Summary")->add("Total with convergence and optimization phase overhead", totalGflops);
        doc.add("User Optimization Overheads", "");
        doc.get("User Optimization Overheads")->add("Optimization phase time (sec)", (times[7]));
        doc.get("User Optimization Overheads")
            ->add("Optimization phase time vs reference SpMV+MG time", times[7] / times[8]);
#ifndef HPCG_NO_MPI
        doc.add("DDOT Timing Variations", "");
        doc.get("DDOT Timing Variations")->add("Min DDOT MPI_Allreduce time", t4min);
        doc.get("DDOT Timing Variations")->add("Max DDOT MPI_Allreduce time", t4max);
        doc.get("DDOT Timing Variations")->add("Avg DDOT MPI_Allreduce time", t4avg);
        // doc.get("Sparse Operations Overheads")->add("Halo exchange time (sec)", (times[6]));
        // doc.get("Sparse Operations Overheads")->add("Halo exchange as percentage of SpMV time",
        // (times[6])/totalSparseMVTime*100.0);
#endif
        doc.add("Final Summary", "");
        // A run is valid only if every correctness/reproducibility test passed.
        bool isValidRun = (testcg_data.count_fail == 0) && (testsymmetry_data.count_fail == 0) && (testnorms_data.pass)
            && (!global_failure);
        if (isValidRun)
        {
            doc.get("Final Summary")->add("HPCG result is VALID with a GFLOP/s rating of", totalGflops);
            doc.get("Final Summary")->add("HPCG 2.4 rating for historical reasons is", totalGflops24);
            if (!A.isDotProductOptimized)
            {
                doc.get("Final Summary")
                    ->add("Reference version of ComputeDotProduct used",
                        "Performance results are most likely suboptimal");
            }
            if (!A.isSpmvOptimized)
            {
                doc.get("Final Summary")
                    ->add("Reference version of ComputeSPMV used", "Performance results are most likely suboptimal");
            }
            if (!A.isMgOptimized)
            {
                if (A.geom->numThreads > 1)
                    doc.get("Final Summary")
                        ->add("Reference version of ComputeMG used and number of threads greater than 1",
                            "Performance results are severely suboptimal");
                else // numThreads ==1
                    doc.get("Final Summary")
                        ->add("Reference version of ComputeMG used", "Performance results are most likely suboptimal");
            }
            if (!A.isWaxpbyOptimized)
            {
                doc.get("Final Summary")
                    ->add("Reference version of ComputeWAXPBY used", "Performance results are most likely suboptimal");
            }
            if (times[0] >= minOfficialTime)
            {
                doc.get("Final Summary")
                    ->add("Please upload results from the YAML file contents to", "http://hpcg-benchmark.org");
            }
            else
            {
                doc.get("Final Summary")->add("Results are valid but execution time (sec) is", times[0]);
                if (quickPath)
                {
                    doc.get("Final Summary")
                        ->add("You have selected the QuickPath option",
                            "Results are official for legacy installed systems with confirmation from the HPCG "
                            "Benchmark leaders.");
                    doc.get("Final Summary")
                        ->add("After confirmation please upload results from the YAML file contents to",
                            "http://hpcg-benchmark.org");
                }
                else
                {
                    doc.get("Final Summary")
                        ->add("Official results execution time (sec) must be at least", minOfficialTime);
                }
            }
        }
        else
        {
            doc.get("Final Summary")->add("HPCG result is", "INVALID.");
            doc.get("Final Summary")
                ->add("Please review the YAML file contents", "You may NOT submit these results for consideration.");
        }
        std::string yaml = doc.generate();
#ifdef HPCG_DEBUG
        HPCG_fout << yaml;
#endif
    }
    return;
}

26
src/ReportResults.hpp Normal file
View File

@@ -0,0 +1,26 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef REPORTRESULTS_HPP
#define REPORTRESULTS_HPP
#include "SparseMatrix.hpp"
#include "TestCG.hpp"
#include "TestNorms.hpp"
#include "TestSymmetry.hpp"
// Writes the YAML report describing the HPCG run: problem geometry, FLOP and
// memory models, validation/reproducibility outcomes, and final GFLOP/s rating.
// Only rank 0 produces output; see ReportResults.cpp for full parameter docs.
void ReportResults(const SparseMatrix& A, int numberOfMgLevels, int numberOfCgSets, int refMaxIters, int optMaxIters,
    double times[], const TestCGData& testcg_data, const TestSymmetryData& testsymmetry_data,
    const TestNormsData& testnorms_data, int global_failure, bool quickPath);
#endif // REPORTRESULTS_HPP

729
src/SetupHalo.cpp Normal file
View File

@@ -0,0 +1,729 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file SetupHalo.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <map>
#include <mpi.h>
#include <set>
#endif
#include <algorithm>
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#include "SetupHalo.hpp"
#include "SetupHalo_ref.hpp"
#ifdef USE_CUDA
#include "Cuda.hpp"
#include "CudaKernels.hpp"
#endif
#ifdef USE_GRACE
#include "CpuKernels.hpp"
#endif
#ifndef HPCG_NO_MPI
// Used to find ranks for CPU and GPU programs
extern int global_total_ranks;
extern int* physical_rank_dims;
extern int* logical_rank_to_phys;
extern int* rankToId_h;
extern int* idToRank_h;
extern p2p_comm_mode_t P2P_Mode;
#endif
/*!
  Prepares the system matrix data structure and creates the data necessary
  for communication of boundary values of this process.
@param[inout] A The known system matrix
@see ExchangeHalo
*/
#ifdef USE_CUDA
// Builds the halo-exchange data structures for the GPU execution path and
// stores them in A (neighbors, elementsToSend, send/receive lengths, buffers).
// Boundary rows and neighbor ranks are found by CUDA kernels (SetupHaloCuda);
// external column indices in the device matrix are then remapped so that each
// neighbor's entries occupy a contiguous range starting at localNumberOfRows.
void SetupHalo_Gpu(SparseMatrix& A)
{
    // Local/global grid dimensions and offsets.
    // NOTE(review): none of these locals are referenced below — presumably kept
    // for symmetry with SetupHalo_Cpu; candidates for removal.
    global_int_t nx = A.geom->nx;
    global_int_t ny = A.geom->ny;
    global_int_t nz = A.geom->nz;
    global_int_t gnx = A.geom->gnx;
    global_int_t gny = A.geom->gny;
    global_int_t gnz = A.geom->gnz;
    global_int_t gix0 = A.geom->gix0;
    global_int_t giy0 = A.geom->giy0;
    global_int_t giz0 = A.geom->giz0;
#ifndef HPCG_NO_MPI
    local_int_t localNumberOfRows = A.localNumberOfRows;
    local_int_t* send_buffer_d;
    // Per-neighbor stride in the device send buffer: the largest face of the
    // local subdomain bounds how many rows can be sent to any one of the up to
    // 27 neighbors (including self slot).
    local_int_t sendbufld
        = std::max(std::max(A.geom->nx * A.geom->ny, A.geom->nx * A.geom->nz), A.geom->ny * A.geom->nz);
    int* neighbors = new int[27];
    int* neighborsPhysical = new int[27];
    CHECK_CUDART(cudaMalloc((void**) &(send_buffer_d), 27 * sendbufld * sizeof(local_int_t)));
    local_int_t* sendLength = new local_int_t[27];
    local_int_t totalToBeSent = 0;
    int neiCount = 0;
    int numberOfExternalValues = 0;
    // NOTE(review): sendcounts2 is allocated and zeroed but never read or freed
    // in this function — appears to be dead code and a small leak; confirm.
    local_int_t* sendcounts2 = new local_int_t[27];
    local_int_t* receiveLength = new local_int_t[27];
    memset(sendcounts2, 0, sizeof(local_int_t) * (27));
    local_int_t* sendcounts_d = NULL;
    local_int_t* elementsToSendGpu;
    // NOTE(review): unlike the other device allocations, these two calls are
    // not wrapped in CHECK_CUDART; errors here would go unnoticed.
    cudaMalloc(&sendcounts_d, sizeof(local_int_t) * (27));
    cudaMemsetAsync(sendcounts_d, 0, sizeof(local_int_t) * (27), stream);
    // Finds elements to send and neighbors
    SetupHaloCuda(A, sendbufld, sendcounts_d, send_buffer_d, &totalToBeSent, &neiCount, neighbors, sendLength,
        &elementsToSendGpu);
    // Host copy of the row indices this rank must send during halo exchange.
    local_int_t* elementsToSend = new local_int_t[totalToBeSent];
    double* sendBuffer = nullptr;
    if (totalToBeSent > 0)
    {
        cudaMemcpyAsync(
            elementsToSend, elementsToSendGpu, sizeof(local_int_t) * totalToBeSent, cudaMemcpyDeviceToHost, stream);
        // Prefix sums of per-neighbor send lengths: sendcounts[i]..sendcounts[i+1]
        // delimits neighbor i's slice of elementsToSend / eltsToRecv.
        // NOTE(review): sendcounts is malloc'd but never freed — leak; confirm.
        local_int_t* sendcounts = (local_int_t*) malloc(sizeof(local_int_t) * (A.geom->size + 1));
        memset(sendcounts, 0, sizeof(local_int_t) * (A.geom->size + 1));
        local_int_t *eltsToRecv_d = NULL, *extToLocMap = NULL;
        sendcounts[0] = 0;
        for (int i = 0; i < neiCount; i++)
        {
            // Halo exchange is symmetric: we receive exactly as many values as we send.
            receiveLength[i] = sendLength[i];
            sendcounts[i + 1] = sendcounts[i] + sendLength[i];
            int neighborId = neighbors[i];
            neighborsPhysical[i] = logical_rank_to_phys[neighborId];
        }
        CHECK_CUDART(cudaMalloc(&extToLocMap, sizeof(local_int_t) * localNumberOfRows));
        CHECK_CUDART(cudaMalloc(&eltsToRecv_d, sizeof(local_int_t) * totalToBeSent));
        CHECK_CUDART(cudaMallocHost(&(sendBuffer), sizeof(double) * totalToBeSent));
        CHECK_CUDART(cudaMalloc(&(A.gpuAux.sendBuffer), sizeof(double) * totalToBeSent));
        // NOTE(review): eltsToRecv is never deleted after use — leak; confirm.
        local_int_t* eltsToRecv = new local_int_t[totalToBeSent];
        // Exchange elements to send with neighbors
        auto INDEX_TYPE = MPI_INT;
#ifdef INDEX_64 // In src/Geometry
        INDEX_TYPE = MPI_LONG;
#endif
        MPI_Status status;
        int MPI_MY_TAG = 93;
        MPI_Request* request = new MPI_Request[neiCount];
        // Ensure the device->host copy of elementsToSend has completed before
        // handing the buffer to MPI_Send below.
        cudaStreamSynchronize(stream);
        local_int_t* recv_ptr = eltsToRecv;
        for (int i = 0; i < neiCount; i++)
        {
            auto n_recv = sendLength[i];
            MPI_Irecv(recv_ptr, n_recv, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
            recv_ptr += n_recv;
        }
        local_int_t* elts_ptr = elementsToSend;
        for (int i = 0; i < neiCount; i++)
        {
            auto n_send = sendLength[i];
            MPI_Send(elts_ptr, n_send, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD);
            elts_ptr += n_send;
        }
        for (int i = 0; i < neiCount; i++)
        {
            MPI_Wait(request + i, &status);
        }
        delete[] request;
        cudaMemcpyAsync(
            eltsToRecv_d, eltsToRecv, sizeof(local_int_t) * (totalToBeSent), cudaMemcpyHostToDevice, stream);
        // Add the sorted indices from neighbors. For each neighbor, add its indices sequentially
        // before the next neighbor's indices. The indices will be adjusted to be
        // localNumberOfRows + its sequential location
        for (int neighborCount = 0; neighborCount < neiCount; ++neighborCount)
        {
            int neighborId = neighbors[neighborCount];
            cudaMemsetAsync(extToLocMap, 0, sizeof(local_int_t) * localNumberOfRows, stream);
            local_int_t str = sendcounts[neighborCount];
            local_int_t end = sendcounts[neighborCount + 1];
            ExtToLocMapCuda(localNumberOfRows, str, end, extToLocMap, eltsToRecv_d);
            ExtTolocCuda(localNumberOfRows, neighborId, A.extNnz, A.csrExtColumns, A.csrExtValues,
                A.gpuAux.ext2csrOffsets, extToLocMap, A.gpuAux.columns);
        }
        CHECK_CUDART(cudaFree(sendcounts_d));
        CHECK_CUDART(cudaFree(extToLocMap));
        CHECK_CUDART(cudaFree(eltsToRecv_d));
        // NOTE(review): when totalToBeSent == 0 this branch is skipped and
        // sendcounts_d is never freed; send_buffer_d is never freed on any path
        // (unless SetupHaloCuda takes ownership) — verify against CudaKernels.
        // For P2P Alltoallv communication
        if (P2P_Mode == MPI_GPU_All2allv || P2P_Mode == MPI_CPU_All2allv)
        {
            // Build the counts/displacements arrays MPI_Alltoallv expects,
            // indexed by physical rank; zero for non-neighbors.
            int* sdispls = new int[A.geom->size];
            int* rdispls = new int[A.geom->size];
            int* scounts = new int[A.geom->size];
            int* rcounts = new int[A.geom->size];
            int tmp_s = 0, tmp_r = 0;
            if (sdispls == NULL || rdispls == NULL || scounts == NULL || rcounts == NULL)
                return;
            for (local_int_t i = 0; i < A.geom->size; i++)
            {
                scounts[i] = 0;
                rcounts[i] = 0;
                sdispls[i] = 0;
                rdispls[i] = 0;
            }
            for (local_int_t i = 0; i < neiCount; i++)
            {
                local_int_t root = neighborsPhysical[i];
                scounts[root] = sendLength[i];
                rcounts[root] = receiveLength[i];
                sdispls[root] = tmp_s;
                tmp_s += sendLength[i];
                rdispls[root] = tmp_r;
                tmp_r += receiveLength[i];
            }
            A.scounts = scounts;
            A.rcounts = rcounts;
            A.sdispls = sdispls;
            A.rdispls = rdispls;
        }
    }
    // Store contents in our matrix struct; A takes ownership of these arrays.
    A.numberOfExternalValues = totalToBeSent;
    A.localNumberOfColumns = A.localNumberOfRows + A.numberOfExternalValues;
    A.numberOfSendNeighbors = neiCount;
    A.totalToBeSent = totalToBeSent;
    A.elementsToSend = elementsToSend;
    A.gpuAux.elementsToSend = elementsToSendGpu;
    A.neighbors = neighbors;
    A.neighborsPhysical = neighborsPhysical;
    A.receiveLength = receiveLength;
    A.sendLength = sendLength;
    A.sendBuffer = sendBuffer;
#endif
    return;
}
#endif
#ifdef USE_GRACE
void SetupHalo_Cpu(SparseMatrix& A)
{
// Extract Matrix pieces
global_int_t nx = A.geom->nx;
global_int_t ny = A.geom->ny;
global_int_t nz = A.geom->nz;
global_int_t gnx = A.geom->gnx;
global_int_t gny = A.geom->gny;
global_int_t gnz = A.geom->gnz;
global_int_t gix0 = A.geom->gix0;
global_int_t giy0 = A.geom->giy0;
global_int_t giz0 = A.geom->giz0;
int npx = A.geom->npx;
int npy = A.geom->npy;
local_int_t localNumberOfRows = A.localNumberOfRows;
local_int_t* nonzerosInRow = A.nonzerosInRow;
global_int_t** mtxIndG = A.mtxIndG;
local_int_t** mtxIndL = A.mtxIndL;
#ifdef HPCG_NO_MPI // In the non-MPI case we simply copy global indices to local index storage
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (local_int_t i = 0; i < localNumberOfRows; i++)
{
int cur_nnz = nonzerosInRow[i];
for (int j = 0; j < cur_nnz; j++)
mtxIndL[i][j] = mtxIndG[i][j];
}
#else // Run this section if compiling for MPI
// Scan global IDs of the nonzeros in the matrix. Determine if the column ID matches a row ID. If not:
// 1) We call the ComputeRankOfMatrixRow function, which tells us the rank of the processor owning the row ID.
// We need to receive this value of the x vector during the halo exchange.
// 2) We record our row ID since we know that the other processor will need this value from us, due to symmetry.
std::map<local_int_t, std::map<global_int_t, local_int_t>> externalToLocalMap;
local_int_t* extTemp = new local_int_t[localNumberOfRows];
// Okay Let us git rid of the map
local_int_t sendbufld
= std::max(std::max(A.geom->nx * A.geom->ny, A.geom->nx * A.geom->nz), A.geom->ny * A.geom->nz);
local_int_t* send_buffer = new local_int_t[27 * sendbufld];
char* has_external = new char[localNumberOfRows];
local_int_t* sendcounter = new local_int_t[27];
for (local_int_t i = 0; i < 27; i++)
sendcounter[i] = 0;
// Goes through all local rows, for each local point
// find its 27 3D neighbors (including the point itself).
// For each neibor decide if it is on a different rank (halo) or local
// If external, add to the send buffer
// If local, create the local matrix
#pragma omp parallel for
for (local_int_t i = 0; i < localNumberOfRows; i++)
{
const local_int_t iz = (i / (nx * ny));
const local_int_t iy = (i - iz * nx * ny) / nx;
const local_int_t ix = i - (iz * ny + iy) * nx;
const global_int_t gix = ix + gix0;
const global_int_t giy = iy + giy0;
const global_int_t giz = iz + giz0;
global_int_t curcol;
int nnz_c = 0;
bool rank_set[27];
for (int j = 0; j < 27; j++)
{
rank_set[j] = false;
}
has_external[i] = 0;
for (int k = 0; k < 27; k++)
{
long long int cgix = gix + tid2indCpu[k][0];
long long int cgiy = giy + tid2indCpu[k][1];
long long int cgiz = giz + tid2indCpu[k][2];
int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
if (ok)
{
int ipz = cgiz / nz;
int ipy = cgiy / ny;
int ipx = cgix / nx;
// For GPUCPU exec mode, find the 3D rank coordinates.
// For diff dim between CPU and GPU, we cannot
// just divide on the local dim to find ipx/ipy/ipz
// We must find it manually based on neighbor 3d coordinates
// Note the halo size is always 1
if (A.geom->different_dim == Z)
{
long long int local = cgiz - giz0;
if (local >= 0 && local < nz)
ipz = A.geom->ipz;
else if (local < 0)
ipz = A.geom->ipz - 1;
else if (local >= nz)
ipz = A.geom->ipz + 1;
}
else if (A.geom->different_dim == Y)
{
long long int local = cgiy - giy0;
if (local >= 0 && local < ny)
ipy = A.geom->ipy;
else if (local < 0)
ipy = A.geom->ipy - 1;
else if (local >= ny)
ipy = A.geom->ipy + 1;
}
else if (A.geom->different_dim == X)
{
long long int local = cgix - gix0;
if (local >= 0 && local < nx)
ipx = A.geom->ipx;
else if (local < 0)
ipx = A.geom->ipx - 1;
else if (local >= nx)
ipx = A.geom->ipx + 1;
}
// Global rank Id
int col_rank = ipx + ipy * npx + ipz * npy * npx;
// The neighbor point rank is diff than the current point rank
if (A.geom->logical_rank != col_rank)
{
has_external[i] = 1;
int rankId = rankToId_h[col_rank];
local_int_t* p = &(sendcounter[rankId]);
// Add the halo point atomically to send_buffer
// For all the cols in a row that has the same rank,
// we add the row once to the rank buffer
if (!rank_set[rankId])
{
rank_set[rankId] = true;
local_int_t t;
#pragma omp atomic capture
{
t = *p;
*p += 1;
}
send_buffer[rankId * sendbufld + t] = i;
}
}
else
{
// local neighbor, add it to the local matrix
local_int_t zi = cgiz - giz0;
local_int_t yi = cgiy - giy0;
local_int_t xi = cgix - gix0;
local_int_t lcol = zi * ny * nx + yi * nx + xi;
mtxIndL[i][nnz_c] = lcol;
}
nnz_c++;
}
}
}
// Now external data structures
// 1 Create elements to send buffer (Sort the indicies for each neighbor)
local_int_t totalToBeSent = 0;
local_int_t* sendcounts = new local_int_t[A.geom->size + 1];
sendcounts[0] = 0;
int neighborCount = 0;
#pragma omp parallel for
for (local_int_t i = 0; i < 27; i++)
{
if (sendcounter[i] > 0)
{
std::sort(send_buffer + i * sendbufld, send_buffer + i * sendbufld + sendcounter[i]);
}
}
for (local_int_t i = 0; i < 27; i++)
{
if (sendcounter[i] > 0)
{
totalToBeSent += sendcounter[i];
sendcounts[neighborCount + 1] = sendcounts[neighborCount] + sendcounter[i];
neighborCount++;
}
}
// 2 Now find neighbor Ids, neighbor physical Ids (see GenerateGeometry), and elemets to send
local_int_t sendEntryCount = 0;
local_int_t* receiveLength = new local_int_t[neighborCount];
local_int_t* sendLength = new local_int_t[neighborCount];
// Build the arrays and lists needed by the ExchangeHalo function.
double* sendBuffer = new double[totalToBeSent];
int* neighbors = new int[neighborCount];
int* neighborsPhysical = new int[neighborCount];
local_int_t* elementsToSend = new local_int_t[totalToBeSent];
neighborCount = 0;
for (local_int_t i = 0; i < 27; i++)
{
if (sendcounter[i] > 0)
{
int neighborId = idToRank_h[i]; // logical Id
int phys_neiId = logical_rank_to_phys[neighborId];
neighbors[neighborCount] = neighborId; // store rank ID of current neighbor
neighborsPhysical[neighborCount] = phys_neiId;
receiveLength[neighborCount] = sendcounter[i];
sendLength[neighborCount] = sendcounter[i];
for (int j = 0; j < sendcounter[i]; j++)
{
elementsToSend[sendEntryCount] = send_buffer[i * sendbufld + j];
sendEntryCount++;
}
neighborCount++;
}
}
delete[] send_buffer;
delete[] sendcounter;
// Exchange elements to send wit other neighbors
auto INDEX_TYPE = MPI_INT;
#ifdef INDEX_64 // In src/Geometry
INDEX_TYPE = MPI_LONG;
#endif
MPI_Status status;
int MPI_MY_TAG = 93;
MPI_Request* request = new MPI_Request[neighborCount];
local_int_t* eltsToRecv = new local_int_t[totalToBeSent];
local_int_t* recv_ptr = eltsToRecv;
for (int i = 0; i < neighborCount; i++)
{
int n_recv = sendLength[i];
MPI_Irecv(recv_ptr, n_recv, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD, request + i);
recv_ptr += n_recv;
}
local_int_t* elts_ptr = elementsToSend;
for (int i = 0; i < neighborCount; i++)
{
local_int_t n_send = sendLength[i];
MPI_Send(elts_ptr, n_send, INDEX_TYPE, neighborsPhysical[i], MPI_MY_TAG, MPI_COMM_WORLD);
elts_ptr += n_send;
}
for (int i = 0; i < neighborCount; i++)
{
MPI_Wait(request + i, &status);
}
delete[] request;
// Create a map to be used in the optimization step
// Any external column index will be given a sequntail Id
// after the number of rows (Will be used to access x vector)
int prev_dim = 0;
for (int nc = 0; nc < neighborCount; ++nc)
{
int neighborId = neighbors[nc];
int phys_neiId = neighborsPhysical[nc];
local_int_t str = sendcounts[nc];
local_int_t end = sendcounts[nc + 1];
for (int j = str; j < end; j++)
{
const local_int_t col = eltsToRecv[j];
externalToLocalMap[neighborId][col] = localNumberOfRows + j;
}
}
delete[] eltsToRecv;
delete[] sendcounts;
if (totalToBeSent > 0)
{
// Last step sort all external IDs per rank Id, elements of neighbor 0 first, then 1, and so on
#pragma omp parallel for
for (local_int_t i = 0; i < localNumberOfRows; i++)
{
int nnz_ext = 0;
if (has_external[i] == 1)
{
const local_int_t iz = (i / (nx * ny));
const local_int_t iy = (i - iz * nx * ny) / nx;
const local_int_t ix = i - (iz * ny + iy) * nx;
const global_int_t gix = ix + gix0;
const global_int_t giy = iy + giy0;
const global_int_t giz = iz + giz0;
int nnz_c = 0;
for (int k = 0; k < 27; k++)
{
long long int cgix = gix + tid2indCpu[k][0];
long long int cgiy = giy + tid2indCpu[k][1];
long long int cgiz = giz + tid2indCpu[k][2];
local_int_t zi = (cgiz) % nz;
local_int_t yi = (cgiy) % ny;
local_int_t xi = (cgix) % nx;
int ok = cgiz > -1 && cgiz < gnz && cgiy > -1 && cgiy < gny && cgix > -1 && cgix < gnx;
int ipz = cgiz / nz;
int ipy = cgiy / ny;
int ipx = cgix / nx;
// The indices sent by the neighbor uses the neighbor's nx, ny, and nz which can
// be deffirent than the current neighbor's dims. Thus, based on neighor location
// and the diffrent_dim we adjust the indices if needed.
// Also, the ipx, ipy, and ipz must be updated accordingly
global_int_t new_nx = A.geom->nx;
global_int_t new_ny = A.geom->ny;
if (A.geom->different_dim == Z)
{
long long int local = cgiz - giz0;
if (local >= 0 && local < nz)
{
ipz = A.geom->ipz;
zi = local;
}
else if (local < 0)
{
ipz = A.geom->ipz - 1;
zi = A.geom->previous_neighbor_dim - 1;
}
else if (local >= nz)
{
ipz = A.geom->ipz + 1;
zi = 0;
}
}
else if (A.geom->different_dim == Y)
{
long long int local = cgiy - giy0;
if (local >= 0 && local < ny)
{
ipy = A.geom->ipy;
yi = local;
}
else if (local < 0)
{
ipy = A.geom->ipy - 1;
yi = A.geom->previous_neighbor_dim - 1;
new_ny = A.geom->previous_neighbor_dim;
}
else if (local >= ny)
{
ipy = A.geom->ipy + 1;
yi = 0;
new_ny = A.geom->next_neighbor_dim;
}
}
else if (A.geom->different_dim == X)
{
long long int local = cgix - gix0;
if (local >= 0 && local < nx)
{
ipx = A.geom->ipx;
xi = local;
}
else if (local < 0)
{
ipx = A.geom->ipx - 1;
xi = A.geom->previous_neighbor_dim - 1;
new_nx = A.geom->previous_neighbor_dim;
}
else if (local >= nx)
{
ipx = A.geom->ipx + 1;
xi = 0;
new_nx = A.geom->next_neighbor_dim;
}
}
local_int_t lcol = zi * new_ny * new_nx + yi * new_nx + xi;
int row_rank = ipx + ipy * npx + ipz * npy * npx;
if (ok)
{
if (externalToLocalMap.find(row_rank) != externalToLocalMap.end())
{
mtxIndL[i][nnz_c] = externalToLocalMap[row_rank][lcol];
nnz_ext++;
}
nnz_c++;
}
}
}
extTemp[i] = nnz_ext;
}
}
if (P2P_Mode == MPI_CPU_All2allv)
{
int* sdispls = new int[A.geom->size];
int* rdispls = new int[A.geom->size];
int* scounts = new int[A.geom->size];
int* rcounts = new int[A.geom->size];
int tmp_s = 0, tmp_r = 0;
if (sdispls == NULL || rdispls == NULL || scounts == NULL || rcounts == NULL)
return;
for (local_int_t i = 0; i < A.geom->size; i++)
{
scounts[i] = 0;
rcounts[i] = 0;
sdispls[i] = 0;
rdispls[i] = 0;
}
for (local_int_t i = 0; i < neighborCount; i++)
{
local_int_t root = neighborsPhysical[i];
scounts[root] = sendLength[i];
rcounts[root] = receiveLength[i];
sdispls[root] = tmp_s;
tmp_s += sendLength[i];
rdispls[root] = tmp_r;
tmp_r += receiveLength[i];
}
A.scounts = scounts;
A.rcounts = rcounts;
A.sdispls = sdispls;
A.rdispls = rdispls;
}
delete[] has_external;
// Store contents in our matrix struct
A.numberOfExternalValues = totalToBeSent;
A.localNumberOfColumns = A.localNumberOfRows + A.numberOfExternalValues;
A.numberOfSendNeighbors = neighborCount;
A.totalToBeSent = totalToBeSent;
A.elementsToSend = elementsToSend;
A.neighbors = neighbors;
A.neighborsPhysical = neighborsPhysical;
A.receiveLength = receiveLength;
A.sendLength = sendLength;
A.sendBuffer = sendBuffer;
A.cpuAux.tempIndex = extTemp;
#ifdef HPCG_DETAILED_DEBUG
HPCG_fout << " For rank " << A.geom->rank << " of " << A.geom->size
<< ", number of neighbors = " << A.numberOfSendNeighbors << endl;
for (int i = 0; i < A.numberOfSendNeighbors; i++)
{
HPCG_fout << " rank " << A.geom->rank << " neighbor " << neighbors[i]
<< " send/recv length = " << sendLength[i] << "/" << receiveLength[i] << endl;
for (local_int_t j = 0; j < sendLength[i]; ++j)
HPCG_fout << " rank " << A.geom->rank << " elementsToSend[" << j << "] = " << elementsToSend[j]
<< endl;
}
#endif
#endif
// ifdef HPCG_NO_MPI
return;
}
#endif // USE_GRACE
// Entry point for halo setup: forwards to the GPU or CPU implementation
// depending on the rank type this process was assigned. When the matching
// backend is not compiled in, the call is a no-op.
void SetupHalo(SparseMatrix& A)
{
    const bool gpuRank = (A.rankType == GPU);
    if (!gpuRank)
    {
#ifdef USE_GRACE
        SetupHalo_Cpu(A);
#endif
        return;
    }
#ifdef USE_CUDA
    SetupHalo_Gpu(A);
#endif
}

21
src/SetupHalo.hpp Normal file
View File

@@ -0,0 +1,21 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef SETUPHALO_HPP
#define SETUPHALO_HPP
#include "SparseMatrix.hpp"
void SetupHalo(SparseMatrix& A);
#endif // SETUPHALO_HPP

212
src/SetupHalo_ref.cpp Normal file
View File

@@ -0,0 +1,212 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file SetupHalo_ref.cpp
HPCG routine
*/
#ifndef HPCG_NO_MPI
#include <map>
#include <mpi.h>
#include <set>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#ifdef HPCG_DETAILED_DEBUG
#include <fstream>
using std::endl;
#include "hpcg.hpp"
#include <cassert>
#endif
#include <cstdio>
#include "SetupHalo_ref.hpp"
#include "mytimer.hpp"
extern int use_output_file;
/*!
Reference version of SetupHalo that prepares system matrix data structure and creates data necessary
for communication of boundary values of this process.
@param[inout] A The known system matrix
@see ExchangeHalo
*/
void SetupHalo_ref(SparseMatrix& A)
{
    // Extract Matrix pieces
    local_int_t localNumberOfRows = A.localNumberOfRows;
    local_int_t* nonzerosInRow = A.nonzerosInRow;
    global_int_t** mtxIndG = A.mtxIndG;
    local_int_t** mtxIndL = A.mtxIndL;
#ifdef HPCG_NO_MPI // In the non-MPI case we simply copy global indices to local index storage
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        int cur_nnz = nonzerosInRow[i];
        for (int j = 0; j < cur_nnz; j++)
            mtxIndL[i][j] = mtxIndG[i][j];
    }
#else // Run this section if compiling for MPI
    // Scan global IDs of the nonzeros in the matrix. Determine if the column ID matches a row ID. If not:
    // 1) We call the ComputeRankOfMatrixRow function, which tells us the rank of the processor owning the row ID.
    //    We need to receive this value of the x vector during the halo exchange.
    // 2) We record our row ID since we know that the other processor will need this value from us, due to symmetry.
    std::map<int, std::set<global_int_t>> sendList, receiveList;
    typedef std::map<int, std::set<global_int_t>>::iterator map_iter;
    typedef std::set<global_int_t>::iterator set_iter;
    std::map<global_int_t, local_int_t> externalToLocalMap;
    // TODO: With proper critical and atomic regions, this loop could be threaded, but not attempting it at this time
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        global_int_t currentGlobalRow = A.localToGlobalMap[i];
        for (int j = 0; j < nonzerosInRow[i]; j++)
        {
            global_int_t curIndex = mtxIndG[i][j];
            int rankIdOfColumnEntry = ComputeRankOfMatrixRow(*(A.geom), curIndex);
#ifdef HPCG_DETAILED_DEBUG
            HPCG_fout << "rank, row , col, globalToLocalMap[col] = " << A.geom->rank << " " << currentGlobalRow << " "
                      << curIndex << " " << A.globalToLocalMap[curIndex] << endl;
#endif
            if (A.geom->rank != rankIdOfColumnEntry)
            { // If column index is not a row index, then it comes from another processor
                receiveList[rankIdOfColumnEntry].insert(curIndex);
                sendList[rankIdOfColumnEntry].insert(
                    currentGlobalRow); // Matrix symmetry means we know the neighbor process wants my value
            }
        }
    }
    // Count number of matrix entries to send and receive
    local_int_t totalToBeSent = 0;
    for (map_iter curNeighbor = sendList.begin(); curNeighbor != sendList.end(); ++curNeighbor)
    {
        totalToBeSent += (curNeighbor->second).size();
    }
    local_int_t totalToBeReceived = 0;
    for (map_iter curNeighbor = receiveList.begin(); curNeighbor != receiveList.end(); ++curNeighbor)
    {
        totalToBeReceived += (curNeighbor->second).size();
    }
#ifdef HPCG_DETAILED_DEBUG
    // These are all attributes that should be true, due to symmetry
    HPCG_fout << "totalToBeSent = " << totalToBeSent << " totalToBeReceived = " << totalToBeReceived << endl;
    assert(totalToBeSent == totalToBeReceived); // Number of sent entry should equal number of received
    assert(sendList.size() == receiveList.size()); // Number of send-to neighbors should equal number of receive-from
    // Each receive-from neighbor should be a send-to neighbor, and send the same number of entries
    for (map_iter curNeighbor = receiveList.begin(); curNeighbor != receiveList.end(); ++curNeighbor)
    {
        assert(sendList.find(curNeighbor->first) != sendList.end());
        assert(sendList[curNeighbor->first].size() == receiveList[curNeighbor->first].size());
    }
#endif
    // Build the arrays and lists needed by the ExchangeHalo function.
    double* sendBuffer = new double[totalToBeSent];
    local_int_t* elementsToSend = new local_int_t[totalToBeSent];
    int* neighbors = new int[sendList.size()];
    local_int_t* receiveLength = new local_int_t[receiveList.size()];
    local_int_t* sendLength = new local_int_t[sendList.size()];
    int neighborCount = 0;
    local_int_t receiveEntryCount = 0;
    local_int_t sendEntryCount = 0;
    for (map_iter curNeighbor = receiveList.begin(); curNeighbor != receiveList.end(); ++curNeighbor, ++neighborCount)
    {
        int neighborId = curNeighbor->first; // rank of current neighbor we are processing
        neighbors[neighborCount] = neighborId; // store rank ID of current neighbor
        receiveLength[neighborCount] = receiveList[neighborId].size();
        sendLength[neighborCount] = sendList[neighborId].size(); // Get count if sends/receives
        for (set_iter i = receiveList[neighborId].begin(); i != receiveList[neighborId].end(); ++i, ++receiveEntryCount)
        {
            externalToLocalMap[*i]
                = localNumberOfRows + receiveEntryCount; // The remote columns are indexed at end of internals
        }
        for (set_iter i = sendList[neighborId].begin(); i != sendList[neighborId].end(); ++i, ++sendEntryCount)
        {
            // if (geom.rank==1) HPCG_fout << "*i, globalToLocalMap[*i], sendEntryCount = " << *i << " " <<
            // A.globalToLocalMap[*i] << " " << sendEntryCount << endl;
            elementsToSend[sendEntryCount] = A.globalToLocalMap[*i]; // store local ids of entry to send
        }
    }
#if 1
    // Convert matrix indices to local IDs.
    // NOTE: operator[] lookups below never insert: every owned column index is a
    // local row (present in globalToLocalMap) and every external index was
    // inserted into externalToLocalMap in the serial loop above, so the
    // parallel loop only performs read-only accesses.
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localNumberOfRows; i++)
    {
        for (int j = 0; j < nonzerosInRow[i]; j++)
        {
            global_int_t curIndex = mtxIndG[i][j];
            int rankIdOfColumnEntry = ComputeRankOfMatrixRow(*(A.geom), curIndex);
            if (A.geom->rank == rankIdOfColumnEntry)
            { // My column index, so convert to local index
                mtxIndL[i][j] = A.globalToLocalMap[curIndex];
            }
            else
            { // If column index is not a row index, then it comes from another processor
                mtxIndL[i][j] = externalToLocalMap[curIndex];
            }
        }
    }
#endif
    // Store contents in our matrix struct.
    // (A leftover debug printf of localNumberOfRows/numberOfExternalValues was
    // removed here: it polluted stdout on every rank and used "%d" for
    // local_int_t, which is undefined behavior when INDEX_64 is enabled.)
    A.numberOfExternalValues = externalToLocalMap.size();
    A.localNumberOfColumns = A.localNumberOfRows + A.numberOfExternalValues;
    A.numberOfSendNeighbors = sendList.size();
    A.totalToBeSent = totalToBeSent;
    A.elementsToSend = elementsToSend;
    A.neighbors = neighbors;
    A.receiveLength = receiveLength;
    A.sendLength = sendLength;
    A.sendBuffer = sendBuffer;
#ifdef HPCG_DETAILED_DEBUG
    HPCG_fout << " For rank " << A.geom->rank << " of " << A.geom->size
              << ", number of neighbors = " << A.numberOfSendNeighbors << endl;
    for (int i = 0; i < A.numberOfSendNeighbors; i++)
    {
        HPCG_fout << " rank " << A.geom->rank << " neighbor " << neighbors[i]
                  << " send/recv length = " << sendLength[i] << "/" << receiveLength[i] << endl;
        for (local_int_t j = 0; j < sendLength[i]; ++j)
            HPCG_fout << " rank " << A.geom->rank << " elementsToSend[" << j << "] = " << elementsToSend[j]
                      << endl;
    }
#endif
#endif
    // ifdef HPCG_NO_MPI
    return;
}

21
src/SetupHalo_ref.hpp Normal file
View File

@@ -0,0 +1,21 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef SETUPHALO_REF_HPP
#define SETUPHALO_REF_HPP
#include "SparseMatrix.hpp"
void SetupHalo_ref(SparseMatrix& A);
#endif // SETUPHALO_REF_HPP

306
src/SparseMatrix.hpp Normal file
View File

@@ -0,0 +1,306 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file SparseMatrix.hpp
HPCG data structures for the sparse matrix
*/
#ifndef SPARSEMATRIX_HPP
#define SPARSEMATRIX_HPP
#ifdef USE_CUDA
#include <cuda.h>
#include <cusparse.h>
#endif
#ifdef USE_GRACE
#include <nvpl_sparse.h>
#endif
#include "Cuda.hpp"
#include "Geometry.hpp"
#include "MGData.hpp"
#include "Vector.hpp"
#include <cassert>
#include <vector>
extern bool Use_Hpcg_Mem_Reduction;
#ifndef HPCG_NO_MPI
extern p2p_comm_mode_t P2P_Mode;
#endif
#if __cplusplus < 201103L
// for C++03
#include <map>
typedef std::map<global_int_t, local_int_t> GlobalToLocalMap;
#else
// for C++11 or greater
#include <unordered_map>
using GlobalToLocalMap = std::unordered_map<global_int_t, local_int_t>;
#endif
#ifdef USE_CUDA
// Opaque cuSPARSE descriptors kept per matrix so they can be created once
// during optimization and reused across iterations.
struct CUSPARSE_STRUCT
{
    cusparseDnVecDescr_t vecX; // dense vector descriptor (input side)
    cusparseDnVecDescr_t vecY; // dense vector descriptor (output side)
    cusparseSpMatDescr_t matA; // sparse matrix descriptor for A
    cusparseSpMatDescr_t matL; // sparse matrix descriptor (presumably lower factor) — used with SpSV
    cusparseSpMatDescr_t matU; // sparse matrix descriptor (presumably upper factor) — used with SpSV
    // CUSPARSE SpSV
    cusparseSpSVDescr_t spsvDescrL, spsvDescrU; // triangular-solve descriptors (see cusparseSpSV_updateMatrix in TestCG)
};
// Auxiliary device-side arrays used by the GPU implementation.
// NOTE(review): pointer lifetimes/ownership are managed elsewhere in the
// project; none of them are documented here — confirm against the allocators.
struct GPU_AUX_STRUCT
{
    // Uncolored row related info
    local_int_t* nnzPerRow;
    local_int_t* columns;
    double* values;
    local_int_t* csrAPermOffsets;
    local_int_t* csrLPermOffsets;
    local_int_t* csrUPermOffsets;
    local_int_t* diagonalIdx;
    // Sliced EllPACK Aux
    local_int_t* sellADiagonalIdx;
    // Auxiliary data
    local_int_t* f2c; // fine-to-coarse mapping (presumably; see MG usage)
    local_int_t* color;
    int* colorCountCpu;
    // MULTI-GPU Aux data
    local_int_t* map;
    local_int_t* ext2csrOffsets;
    local_int_t* elementsToSend;
    global_int_t* localToGlobalMap;
    local_int_t compressNumberOfRows;
    double* sendBuffer;
};
#endif
#ifdef USE_GRACE
// Opaque NVPL Sparse descriptors (CPU/Grace backend), mirroring the role of
// CUSPARSE_STRUCT on the GPU side: created once, reused every iteration.
struct NVPL_SPARSE_STRUCT
{
    nvpl_sparse_dn_vec_descr_t vecX; // dense vector descriptor (input side)
    nvpl_sparse_dn_vec_descr_t vecY; // dense vector descriptor (output side)
    nvpl_sparse_sp_mat_descr_t matL; // sparse matrix descriptor (presumably lower factor)
    nvpl_sparse_sp_mat_descr_t matU; // sparse matrix descriptor (presumably upper factor)
    nvpl_sparse_sp_mat_descr_t matA; // sparse matrix descriptor for A
    nvpl_sparse_spsv_descr_t spsvDescrL, spsvDescrU; // triangular-solve descriptors (see nvpl_sparse_spsv_update_matrix in TestCG)
    nvpl_sparse_spmv_descr_t spmvADescr, spmvLDescr, spmvUDescr; // SpMV descriptors
};
struct CPU_AUX_STRUCT
{
    // Auxiliary data
    // Coloring info as number of colors and where each color starts
    // Also keep information on how many consecutive rows share the same color
    // This assumes matrix reordering (rows with same color are packed)
    local_int_t* color;           // per-row color
    local_int_t* firstRowOfColor; // first (reordered) row index of each color
    local_int_t* nRowsWithColor;  // number of consecutive rows carrying each color
    local_int_t* tempIndex;       // scratch per-row counts (set from SetupHalo_Cpu's extTemp)
};
#endif
struct SparseMatrix_STRUCT
{
rank_type_t rankType;
int level;
char* title; //!< name of the sparse matrix
Geometry* geom; //!< geometry associated with this matrix
global_int_t totalNumberOfRows; //!< total number of matrix rows across all processes
global_int_t totalNumberOfNonzeros; //!< total number of matrix nonzeros across all processes
local_int_t localNumberOfRows; //!< number of rows local to this process
local_int_t localNumberOfColumns; //!< number of columns local to this process
local_int_t localNumberOfNonzeros; //!< number of nonzeros local to this process
local_int_t* nonzerosInRow; //!< The number of nonzeros in a row will always be 27 or fewer
global_int_t** mtxIndG; //!< matrix indices as global values
local_int_t** mtxIndL; //!< matrix indices as local values
double** matrixValues; //!< values of matrix entries
double** matrixDiagonal; //!< values of matrix diagonal entries
GlobalToLocalMap globalToLocalMap; //!< global-to-local mapping
std::vector<global_int_t> localToGlobalMap; //!< local-to-global mapping
mutable bool isDotProductOptimized;
mutable bool isSpmvOptimized;
mutable bool isMgOptimized;
mutable bool isWaxpbyOptimized;
mutable MGData* mgData; // Pointer to the coarse level data for this fine matrix
void* optimizationData; // pointer that can be used to store implementation-specific data
local_int_t totalToBeSent; //!< total number of entries to be sent
local_int_t slice_size;
#ifndef HPCG_NO_MPI
local_int_t numberOfExternalValues; //!< number of entries that are external to this process
int numberOfSendNeighbors; //!< number of neighboring processes that will be send local data
local_int_t* elementsToSend; //!< elements to send to neighboring processes
int* neighbors; //!< neighboring processes
int* neighborsPhysical;
local_int_t* receiveLength; //!< lenghts of messages received from neighboring processes
local_int_t* sendLength; //!< lenghts of messages sent to neighboring processes
double* sendBuffer; //!< send buffer for non-blocking sends
local_int_t extNnz;
#endif
// Optmization Data common between CPU and GPU
// Coloring permutations
local_int_t totalColors;
local_int_t* ref2opt;
local_int_t* opt2ref;
local_int_t* f2cPerm;
// Sliced EllPACK
local_int_t *sellASliceMrl, *sellLSliceMrl, *sellUSliceMrl;
local_int_t *sellAPermColumns, *sellLPermColumns, *sellUPermColumns;
double *sellAPermValues, *sellLPermValues, *sellUPermValues;
double* diagonal;
char* bufferSvL = nullptr;
char* bufferSvU = nullptr;
char* bufferMvA = nullptr;
char* bufferMvL = nullptr;
char* bufferMvU = nullptr;
// MULTI-GPU data
local_int_t* csrExtOffsets;
local_int_t* csrExtColumns;
double* csrExtValues;
double* tempBuffer;
// When MPI_All2allv is used for P2P communication
int* scounts;
int* rcounts;
int* sdispls;
int* rdispls;
#ifdef USE_CUDA
CUSPARSE_STRUCT cusparseOpt;
GPU_AUX_STRUCT gpuAux;
#endif
// #ifdef USE_GRACE
// NVPL_SPARSE_STRUCT nvplSparseOpt;
// CPU_AUX_STRUCT cpuAux;
// #endif
mutable struct SparseMatrix_STRUCT* Ac; // Coarse grid matrix
};
typedef struct SparseMatrix_STRUCT SparseMatrix;
/*!
Initializes the known system matrix data structure members to 0.
@param[in] A the known system matrix
*/
/*!
  Initializes the known system matrix data structure members to 0.

  @param[inout] A    the known system matrix
  @param[in]    geom geometry associated with this matrix (stored, not copied)
*/
inline void InitializeSparseMatrix(SparseMatrix& A, Geometry* geom)
{
    A.title = 0;
    A.geom = geom;
    A.totalNumberOfRows = 0;
    A.totalNumberOfNonzeros = 0;
    A.localNumberOfRows = 0;
    A.localNumberOfColumns = 0;
    A.localNumberOfNonzeros = 0;
    A.nonzerosInRow = 0;
    A.mtxIndG = 0;
    A.mtxIndL = 0;
    A.matrixValues = 0;
    A.matrixDiagonal = 0;
    // Optimization is ON by default. The code that switches it OFF is in the
    // functions that are meant to be optimized.
    A.isDotProductOptimized = true;
    A.isSpmvOptimized = true;
    A.isMgOptimized = true;
    A.isWaxpbyOptimized = true;
    // totalToBeSent exists in both MPI and non-MPI builds, so it is zeroed
    // unconditionally here (the redundant second assignment inside the MPI
    // block was removed).
    A.totalToBeSent = 0;
#ifndef HPCG_NO_MPI
    A.numberOfExternalValues = 0;
    A.numberOfSendNeighbors = 0;
    A.elementsToSend = 0;
    A.neighbors = 0;
    A.neighborsPhysical = 0;
    A.receiveLength = 0;
    A.sendLength = 0;
    A.sendBuffer = 0;
#endif
    A.mgData = 0; // Fine-to-coarse grid transfer initially not defined.
    return;
}
/*!
Copy values from matrix diagonal into user-provided vector.
@param[in] A the known system matrix.
@param[inout] diagonal Vector of diagonal values (must be allocated before call to this function).
*/
inline void CopyMatrixDiagonal(SparseMatrix& A, Vector& diagonal)
{
    // The destination vector must match the local row count exactly.
    assert(A.localNumberOfRows == diagonal.localLength);
    const local_int_t nrows = A.localNumberOfRows;
    for (local_int_t row = 0; row < nrows; ++row)
    {
        // matrixDiagonal holds a pointer per row into the row's values array.
        diagonal.values[row] = *(A.matrixDiagonal[row]);
    }
}
/*!
Replace specified matrix diagonal value.
@param[inout] A The system matrix.
@param[in] diagonal Vector of diagonal values that will replace existing matrix diagonal values.
*/
inline void ReplaceMatrixDiagonal(SparseMatrix& A, Vector& diagonal)
{
    // The source vector must match the local row count exactly.
    assert(A.localNumberOfRows == diagonal.localLength);
    const local_int_t nrows = A.localNumberOfRows;
    const double* src = diagonal.values;
    for (local_int_t row = 0; row < nrows; ++row)
    {
        // Write through the per-row diagonal pointer into the matrix storage.
        *(A.matrixDiagonal[row]) = src[row];
    }
}
#endif // SPARSEMATRIX_HPP

243
src/TestCG.cpp Normal file
View File

@@ -0,0 +1,243 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file TestCG.cpp
HPCG routine
*/
// Changelog
//
// Version 0.4
// - Added timing of setup time for sparse MV
// - Corrected percentages reported for sparse MV with overhead
//
/////////////////////////////////////////////////////////////////////////
#include <fstream>
#include <iostream>
using std::endl;
#include "hpcg.hpp"
#include <vector>
#include "CG.hpp"
#include "CG_ref.hpp"
#include "TestCG.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
extern int use_output_file;
/*!
Test the correctness of the Preconditined CG implementation by using a system matrix with a dominant diagonal.
@param[in] geom The description of the problem's geometry.
@param[in] A The known system matrix
@param[in] data the data structure with all necessary CG vectors preallocated
@param[in] b The known right hand side vector
@param[inout] x On entry: the initial guess; on exit: the new approximate solution
@param[out] testcg_data the data structure with the results of the test including pass/fail information
@return Returns zero on success and a non-zero value otherwise.
@see CG()
*/
int TestCG(SparseMatrix& A, CGData& data, Vector& b, Vector& x, TestCGData& testcg_data)
{
    // Use this array for collecting timing information
    std::vector<double> times(8, 0.0);
    // Temporary storage for holding original diagonal and RHS
    Vector origDiagA, exaggeratedDiagA, origB;
    InitializeVector(origDiagA, A.localNumberOfRows, A.rankType);
    InitializeVector(exaggeratedDiagA, A.localNumberOfRows, A.rankType);
    InitializeVector(origB, A.localNumberOfRows, A.rankType);
    // Snapshot the diagonal (and, on GPU ranks, the device-side copy) and the
    // RHS so both can be restored after the test runs.
    CopyMatrixDiagonal(A, origDiagA);
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        CopyMatrixDiagonalCuda(A, origDiagA);
#endif
    }
    CopyVector(origDiagA, exaggeratedDiagA);
    CopyVector(b, origB);
    // Modify the matrix diagonal to greatly exaggerate diagonal values.
    // CG should converge in about 10 iterations for this problem, regardless of problem size
    for (local_int_t i = 0; i < A.localNumberOfRows; ++i)
    {
        global_int_t globalRowID = A.localToGlobalMap[i];
        if (globalRowID < 9)
        {
            // First nine global rows get distinct scale factors 2e6..10e6.
            double scale = (globalRowID + 2) * 1.0e6;
            ScaleVectorValue(exaggeratedDiagA, i, scale);
            ScaleVectorValue(b, i, scale);
        }
        else
        {
            ScaleVectorValue(exaggeratedDiagA, i, 1.0e6);
            ScaleVectorValue(b, i, 1.0e6);
        }
    }
    // Reference Matrix
    ReplaceMatrixDiagonal(A, exaggeratedDiagA);
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        // Push the exaggerated diagonal to the device, permute b/diagonal into
        // the optimized (reordered) layout, then refresh the SpSV descriptors
        // that cache the diagonal.
        CopyVectorH2D(exaggeratedDiagA);
        PermVectorCuda(A.opt2ref, b, A.localNumberOfRows);
        PermVectorCuda(A.opt2ref, exaggeratedDiagA, A.localNumberOfRows);
        ReplaceMatrixDiagonalCuda(A, exaggeratedDiagA);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrL, exaggeratedDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrU, exaggeratedDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    else
    {
#ifdef USE_GRACE
        // Same sequence for the CPU (NVPL Sparse) backend.
        PermVectorCpu(A.opt2ref, b, A.localNumberOfRows);
        PermVectorCpu(A.opt2ref, exaggeratedDiagA, A.localNumberOfRows);
        ReplaceMatrixDiagonalCpu(A, exaggeratedDiagA);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrL, exaggeratedDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrU, exaggeratedDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    ////////////////////////////////
    int niters = 0;
    double normr = 0.0;
    double normr0 = 0.0;
    int maxIters = 50;
    int numberOfCgCalls = 2;
    double tolerance = 1.0e-12; // Set tolerance to reasonable value for grossly scaled diagonal terms
    testcg_data.expected_niters_no_prec
        = 12; // For the unpreconditioned CG call, we should take about 10 iterations, permit 12
    testcg_data.expected_niters_prec = 2; // For the preconditioned case, we should take about 1 iteration, permit 2
    testcg_data.niters_max_no_prec = 0;
    testcg_data.niters_max_prec = 0;
    for (int k = 0; k < 2; ++k)
    { // This loop tests both unpreconditioned (k==0) and preconditioned (k==1) runs
        int expected_niters = testcg_data.expected_niters_no_prec;
        if (k == 1)
            expected_niters = testcg_data.expected_niters_prec;
        for (int i = 0; i < numberOfCgCalls; ++i)
        {
            ZeroVector(x); // Zero out x
            // The (k == 1) argument enables the preconditioner inside CG.
            int ierr = CG(A, data, b, x, maxIters, tolerance, niters, normr, normr0, &times[0], k == 1, 0);
            if (ierr)
                // Error is reported but not fatal: the pass/fail bookkeeping below still runs.
                if (use_output_file)
                {
                    HPCG_fout << "Error in call to CG: " << ierr << ".\n" << endl;
                }
                else
                {
                    std::cout << "Error in call to CG: " << ierr << ".\n" << endl;
                }
            if (niters <= expected_niters)
            {
                ++testcg_data.count_pass;
            }
            else
            {
                ++testcg_data.count_fail;
            }
            if (k == 0 && niters > testcg_data.niters_max_no_prec)
                testcg_data.niters_max_no_prec = niters; // Keep track of largest iter count
            if (k == 1 && niters > testcg_data.niters_max_prec)
                testcg_data.niters_max_prec = niters; // Same for preconditioned run
            if (A.geom->rank == 0)
            {
                if (use_output_file)
                {
                    HPCG_fout << "Call [" << i << "] Number of Iterations [" << niters << "] Scaled Residual ["
                              << normr / normr0 << "]" << endl;
                }
                else
                {
                    std::cout << "Call [" << i << "] Number of Iterations [" << niters << "] Scaled Residual ["
                              << normr / normr0 << "]" << endl;
                }
                if (niters > expected_niters)
                    if (use_output_file)
                    {
                        HPCG_fout << " Expected " << expected_niters << " iterations. Performed " << niters << "."
                                  << endl;
                    }
                    else
                    {
                        std::cout << " Expected " << expected_niters << " iterations. Performed " << niters << "."
                                  << endl;
                    }
            }
        }
    }
    // Restore matrix diagonal and RHS
    ReplaceMatrixDiagonal(A, origDiagA);
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        // Restore the device diagonal and refresh the cached SpSV diagonals.
        ReplaceMatrixDiagonalCuda(A, origDiagA);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrL, origDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
        cusparseSpSV_updateMatrix(
            cusparsehandle, A.cusparseOpt.spsvDescrU, origDiagA.values_d, CUSPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    else
    {
#ifdef USE_GRACE
        ReplaceMatrixDiagonalCpu(A, origDiagA);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrL, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
        nvpl_sparse_spsv_update_matrix(
            nvpl_sparse_handle, A.nvplSparseOpt.spsvDescrU, origDiagA.values, NVPL_SPARSE_SPSV_UPDATE_DIAGONAL);
#endif
    }
    CopyVector(origB, b);
    // Delete vectors
    DeleteVector(origDiagA);
    DeleteVector(exaggeratedDiagA);
    DeleteVector(origB);
    testcg_data.normr = normr;
    return 0;
}

45
src/TestCG.hpp Normal file
View File

@@ -0,0 +1,45 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file TestCG.hpp
HPCG data structure
*/
#ifndef TESTCG_HPP
#define TESTCG_HPP
#include "CGData.hpp"
#include "SparseMatrix.hpp"
#include "Vector.hpp"
#include "hpcg.hpp"
struct TestCGData_STRUCT
{
    int count_pass;              //!< number of successful tests
    int count_fail;              //!< number of failed tests
    int expected_niters_no_prec; //!< expected number of test CG iterations without preconditioning with diagonally
                                 //!< dominant matrix (~12)
    int expected_niters_prec;    //!< expected number of test CG iterations with preconditioning and with diagonally
                                 //!< dominant matrix (~1-2)
    int niters_max_no_prec;      //!< maximum observed test CG iteration count without preconditioner (k==0 runs)
    int niters_max_prec;         //!< maximum observed test CG iteration count with preconditioner (k==1 runs)
    double normr;                //!< residual norm achieved during test CG iterations
};
typedef struct TestCGData_STRUCT TestCGData;
extern int TestCG(SparseMatrix& A, CGData& data, Vector& b, Vector& x, TestCGData& testcg_data);
#endif // TESTCG_HPP

49
src/TestNorms.cpp Normal file
View File

@@ -0,0 +1,49 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file TestNorms.cpp
HPCG routine
*/
#include "TestNorms.hpp"
#include <cmath>
/*!
Computes the mean and standard deviation of the array of norm results.
@param[in] testnorms_data data structure with the results of norm test
@return Returns 0 upon success or non-zero otherwise
*/
int TestNorms(TestNormsData& testnorms_data)
{
    const int nsamples = testnorms_data.samples;
    const double* vals = testnorms_data.values;
    // Mean: accumulate deviations from the first sample, then shift back.
    // This reduces cancellation when all samples are close to each other.
    double delta_sum = 0.0;
    for (int k = 0; k < nsamples; ++k)
        delta_sum += (vals[k] - vals[0]);
    const double mean = vals[0] + delta_sum / (double) nsamples;
    testnorms_data.mean = mean;
    // Population variance of the samples about the mean.
    double sq_sum = 0.0;
    for (int k = 0; k < nsamples; ++k)
        sq_sum += (vals[k] - mean) * (vals[k] - mean);
    testnorms_data.variance = sq_sum / (double) nsamples;
    // Runs pass when the norms agree to within a tight variance tolerance.
    testnorms_data.pass = (testnorms_data.variance < 1.0e-6);
    return 0;
}

36
src/TestNorms.hpp Normal file
View File

@@ -0,0 +1,36 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file TestNorms.hpp
HPCG data structure
*/
#ifndef TESTNORMS_HPP
#define TESTNORMS_HPP
struct TestNormsData_STRUCT
{
    double* values;  //!< sample values (one residual norm per benchmark run)
    double mean;     //!< mean of all samples
    double variance; //!< variance of the samples about the mean
    int samples;     //!< number of samples
    bool pass;       //!< pass/fail indicator (true when variance < 1.0e-6)
};
typedef struct TestNormsData_STRUCT TestNormsData;
extern int TestNorms(TestNormsData& testnorms_data);
#endif // TESTNORMS_HPP

298
src/TestSymmetry.cpp Normal file
View File

@@ -0,0 +1,298 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file TestSymmetry.cpp
HPCG routine
*/
// The MPI include must be first for Windows platforms
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#include <cfloat>
#include <fstream>
#include <iostream>
using std::endl;
#include <cmath>
#include <vector>
#include "hpcg.hpp"
#include "ComputeDotProduct.hpp"
#include "ComputeMG.hpp"
#include "ComputeResidual.hpp"
#include "ComputeSPMV.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
#include "Geometry.hpp"
#include "SparseMatrix.hpp"
#include "TestSymmetry.hpp"
extern int use_output_file;
/*!
Tests symmetry-preserving properties of the sparse matrix vector multiply and multi-grid routines.
@param[in] geom The description of the problem's geometry.
@param[in] A The known system matrix
@param[in] b The known right hand side vector
@param[in] xexact The exact solution vector
@param[inout] testsymmetry_data The data structure with the results of the CG symmetry test including pass/fail
information
@return returns 0 upon success and non-zero otherwise
@see ComputeDotProduct
@see ComputeDotProduct_ref
@see ComputeSPMV
@see ComputeSPMV_ref
@see ComputeMG
@see ComputeMG_ref
*/
int TestSymmetry(SparseMatrix& A, Vector& b, Vector& xexact, TestSymmetryData& testsymmetry_data)
{
    local_int_t nrow = A.localNumberOfRows;
    local_int_t ncol = A.localNumberOfColumns;
    // Work vectors are sized to the column count so halo entries fit for SpMV.
    Vector x_ncol, y_ncol, z_ncol;
    InitializeVector(x_ncol, ncol, A.rankType);
    InitializeVector(y_ncol, ncol, A.rankType);
    InitializeVector(z_ncol, ncol, A.rankType);
    double t4 = 0.0; // Needed for dot-product call, otherwise unused
    testsymmetry_data.count_fail = 0;
    // Test symmetry of matrix
    // First load vectors with random values
    FillRandomVector(x_ncol);
    FillRandomVector(y_ncol);
    if (A.rankType == GPU)
    {
#ifdef USE_CUDA
        // Random values were produced on the host; mirror them to the device.
        CopyVectorH2D(y_ncol);
        CopyVectorH2D(x_ncol);
#endif
    }
    int ierr;
    double xNorm2, yNorm2;
    double ANorm = 2 * 26.0; // presumably a bound on ||A|| for the 27-point operator — confirm
    // Next, compute x'*A*y
    ComputeDotProduct(nrow, y_ncol, y_ncol, yNorm2, t4, A.isDotProductOptimized, A.rankType);
    ierr = ComputeSPMV(A, y_ncol, z_ncol); // z_nrow = A*y_overlap
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to SpMV: " << ierr << ".\n" << endl;
        }
    double xtAy = 0.0;
    ierr = ComputeDotProduct(nrow, x_ncol, z_ncol, xtAy, t4, A.isDotProductOptimized, A.rankType); // x'*A*y
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
    // Next, compute y'*A*x
    ComputeDotProduct(nrow, x_ncol, x_ncol, xNorm2, t4, A.isDotProductOptimized, A.rankType);
    ierr = ComputeSPMV(A, x_ncol, z_ncol); // b_computed = A*x_overlap
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to SpMV: " << ierr << ".\n" << endl;
        }
    double ytAx = 0.0;
    ierr = ComputeDotProduct(nrow, y_ncol, z_ncol, ytAx, t4, A.isDotProductOptimized, A.rankType); // y'*A*x
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
    // Scaled departure from symmetry: |x'Ay - y'Ax| divided by a rounding-error
    // bound for the two bilinear forms (in units of machine epsilon).
    testsymmetry_data.depsym_spmv = std::fabs((long double) (xtAy - ytAx))
        / ((xNorm2 * ANorm * yNorm2 + yNorm2 * ANorm * xNorm2) * (DBL_EPSILON));
    if (testsymmetry_data.depsym_spmv > 1.0)
        ++testsymmetry_data.count_fail; // If the difference is > 1, count it wrong
    if (A.geom->rank == 0)
        if (use_output_file)
        {
            HPCG_fout << "Departure from symmetry (scaled) for SpMV abs(x'*A*y - y'*A*x) = "
                      << testsymmetry_data.depsym_spmv << endl;
        }
        else
        {
            std::cout << "Departure from symmetry (scaled) for SpMV abs(x'*A*y - y'*A*x) = "
                      << testsymmetry_data.depsym_spmv << endl;
        }
    // Test symmetry of multi-grid
    // Compute x'*Minv*y
    ierr = ComputeMG(A, y_ncol, z_ncol); // z_ncol = Minv*y_ncol
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to MG: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to MG: " << ierr << ".\n" << endl;
        }
    double xtMinvy = 0.0;
    ierr = ComputeDotProduct(nrow, x_ncol, z_ncol, xtMinvy, t4, A.isDotProductOptimized, A.rankType); // x'*Minv*y
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
    // Next, compute z'*Minv*x
    ierr = ComputeMG(A, x_ncol, z_ncol); // z_ncol = Minv*x_ncol
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to MG: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to MG: " << ierr << ".\n" << endl;
        }
    double ytMinvx = 0.0;
    ierr = ComputeDotProduct(nrow, y_ncol, z_ncol, ytMinvx, t4, A.isDotProductOptimized, A.rankType); // y'*Minv*x
    if (ierr)
        if (use_output_file)
        {
            HPCG_fout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
        else
        {
            std::cout << "Error in call to dot: " << ierr << ".\n" << endl;
        }
    // Same scaled symmetry measure, now for the multigrid preconditioner.
    testsymmetry_data.depsym_mg = std::fabs((long double) (xtMinvy - ytMinvx))
        / ((xNorm2 * ANorm * yNorm2 + yNorm2 * ANorm * xNorm2) * (DBL_EPSILON));
    if (testsymmetry_data.depsym_mg > 1.0)
        ++testsymmetry_data.count_fail; // If the difference is > 1, count it wrong
    if (A.geom->rank == 0)
        if (use_output_file)
        {
            HPCG_fout << "Departure from symmetry (scaled) for MG abs(x'*Minv*y - y'*Minv*x) = "
                      << testsymmetry_data.depsym_mg << endl;
        }
        else
        {
            std::cout << "Departure from symmetry (scaled) for MG abs(x'*Minv*y - y'*Minv*x) = "
                      << testsymmetry_data.depsym_mg << endl;
        }
    CopyVector(xexact, x_ncol); // Copy exact answer into overlap vector
    int numberOfCalls = 2;
    double residual = 0.0;
    for (int i = 0; i < numberOfCalls; ++i)
    {
        if (A.rankType == GPU)
        {
#ifdef USE_CUDA
            CopyVectorH2D(x_ncol);
#endif
        }
        ierr = ComputeSPMV(A, x_ncol, z_ncol); // b_computed = A*x_overlap
        // Undo the optimized-kernel row ordering (ref2opt) so the result can be
        // compared against the reference-ordered right-hand side b below.
        if (A.rankType == GPU)
        {
#ifdef USE_CUDA
            PermVectorCuda(A.ref2opt, z_ncol, nrow);
            CopyVectorD2H(z_ncol);
#endif
        }
        else
        {
#ifdef USE_GRACE
            PermVectorCpu(A.ref2opt, z_ncol, nrow);
#endif
        }
        if (ierr)
            if (use_output_file)
            {
                HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl;
            }
            else
            {
                std::cout << "Error in call to SpMV: " << ierr << ".\n" << endl;
            }
        if ((ierr = ComputeResidual(A.localNumberOfRows, b, z_ncol, residual)))
            if (use_output_file)
            {
                HPCG_fout << "Error in call to compute_residual: " << ierr << ".\n" << endl;
            }
            else
            {
                std::cout << "Error in call to compute_residual: " << ierr << ".\n" << endl;
            }
        if (A.geom->rank == 0)
            if (use_output_file)
            {
                HPCG_fout << "SpMV call [" << i << "] Residual [" << residual << "]" << endl;
            }
            else
            {
                std::cout << "SpMV call [" << i << "] Residual [" << residual << "]" << endl;
            }
    }
    // Release work vectors (device buffers are freed too when present).
    DeleteVector(x_ncol);
    DeleteVector(y_ncol);
    DeleteVector(z_ncol);
    return 0;
}

38
src/TestSymmetry.hpp Normal file
View File

@@ -0,0 +1,38 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file TestSymmetry.hpp
HPCG data structures for symmetry testing
*/
#ifndef TESTSYMMETRY_HPP
#define TESTSYMMETRY_HPP
#include "CGData.hpp"
#include "SparseMatrix.hpp"
#include "hpcg.hpp"
// Results of the SpMV/MG symmetry checks (filled in by TestSymmetry).
struct TestSymmetryData_STRUCT
{
    double depsym_spmv; //!< scaled departure from symmetry for the SPMV kernel
    double depsym_mg;   //!< scaled departure from symmetry for the MG kernel
    int count_fail;     //!< number of failures in the symmetry tests
};
typedef struct TestSymmetryData_STRUCT TestSymmetryData;
extern int TestSymmetry(SparseMatrix& A, Vector& b, Vector& xexact, TestSymmetryData& testsymmetry_data);
#endif // TESTSYMMETRY_HPP

240
src/Vector.hpp Normal file
View File

@@ -0,0 +1,240 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file Vector.hpp
HPCG data structures for dense vectors
*/
#ifndef VECTOR_HPP
#define VECTOR_HPP
#include <cassert>
#include <cstdlib>
#include <cuda_runtime.h>
#include <omp.h>
#include <vector>
#include "Geometry.hpp"
struct Vector_STRUCT
{
    rank_type_t rt;          //!< rank type (GPU or CPU) this vector was initialized for
    local_int_t localLength; //!< length of local portion of the vector
    bool isCudaHost;         //!< true if host storage was allocated with cudaMallocHost (pinned)
    double* values;          //!< array of values (host side)
    /*!
      This is for storing optimized data structures created in OptimizeProblem and
      used inside optimized ComputeSPMV().
    */
    void* optimizationData;
#ifdef USE_CUDA
    double* values_d = nullptr; //!< device-side copy of the values (allocated for GPU ranks only)
#endif
    bool initialized = false; //!< set by InitializeVector; asserted by ZeroVector/CopyVector
};
typedef struct Vector_STRUCT Vector;
/*!
Initializes input vector.
@param[in] v
@param[in] localLength Length of local portion of input vector
*/
inline void InitializeVector(Vector& v, local_int_t localLength, rank_type_t rt, bool isCudaHost = false)
{
    v.localLength = localLength;
    v.isCudaHost = isCudaHost;
    v.rt = rt;
#ifdef USE_CUDA
    // GPU ranks may request pinned (page-locked) host memory for faster H2D/D2H transfers.
    if (v.rt == GPU && v.isCudaHost)
        cudaMallocHost(&(v.values), sizeof(double) * localLength);
    else
#endif
        v.values = new double[localLength]; // plain heap storage otherwise
    v.optimizationData = 0;
#ifdef USE_CUDA
    // GPU ranks also carry a device-side buffer of the same length.
    if (v.rt == GPU)
        cudaMalloc((void**) &(v.values_d), sizeof(double) * localLength);
#endif
    v.initialized = true;
    return;
}
/*!
Fill the input vector with zero values.
@param[inout] v - On entrance v is initialized, on exit all its values are zero.
*/
inline void ZeroVector(Vector& v)
{
    assert(v.initialized);
    local_int_t localLength = v.localLength;
    double* vv = v.values;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localLength; ++i)
        vv[i] = 0.0;
#ifdef USE_CUDA
    // Keep the device copy consistent with the zeroed host values.
    if (v.rt == GPU)
    {
        cudaMemset(v.values_d, 0, sizeof(double) * localLength);
    }
#endif
    return;
}
/*!
Multiply (scale) a specific vector entry by a given value.
@param[inout] v Vector to be modified
@param[in] index Local index of entry to scale
@param[in] value Value to scale by
*/
// Scale one host-side entry of v in place; index must be a valid local index.
inline void ScaleVectorValue(Vector& v, local_int_t index, double value)
{
    assert(index >= 0 && index < v.localLength);
    v.values[index] *= value;
    return;
}
/*!
Fill the input vector with pseudo-random values.
@param[in] v
*/
inline void FillRandomVector(Vector& v)
{
    local_int_t localLength = v.localLength;
    double* vv = v.values;
    // NOTE(review): rand() uses hidden shared state and is not required to be
    // thread-safe; under OpenMP the generated sequence is nondeterministic
    // across runs — confirm this is acceptable for the benchmark.
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < localLength; ++i)
        vv[i] = rand() / (double) (RAND_MAX) + 1.0; // values in [1, 2]
    return;
}
/*!
Copy input vector to output vector.
@param[in] v Input vector
@param[in] w Output vector
*/
inline void CopyVector(const Vector& v, Vector& w)
{
    // Copy only the overlapping prefix when the two lengths differ.
    local_int_t len = std::min(v.localLength, w.localLength);
    assert(v.initialized && w.initialized);
    double* vv = v.values;
    double* wv = w.values;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
    for (local_int_t i = 0; i < len; ++i)
        wv[i] = vv[i];
#ifdef USE_CUDA
    // Mirror the copy on the device only when both vectors live on the GPU.
    if (v.rt == GPU && w.rt == GPU)
    {
        cudaMemcpy(w.values_d, v.values_d, sizeof(double) * len, cudaMemcpyDeviceToDevice);
    }
#endif
    return;
}
#ifdef USE_CUDA
// Synchronize v's host array from its device buffer (device -> host).
inline void CopyVectorD2H(const Vector& v)
{
    local_int_t localLength = v.localLength;
    cudaMemcpy(v.values, v.values_d, sizeof(double) * localLength, cudaMemcpyDeviceToHost);
    return;
}
// Device-to-device copy of v's buffer into w's buffer.
// NOTE(review): length is taken from v only — assumes w.values_d holds at
// least v.localLength doubles; confirm against callers.
inline void CopyVectorD2D(const Vector& v, Vector& w)
{
    local_int_t localLength = v.localLength;
    cudaMemcpy(w.values_d, v.values_d, sizeof(double) * localLength, cudaMemcpyDeviceToDevice);
    return;
}
// Synchronize v's device buffer from its host array (host -> device).
inline void CopyVectorH2D(const Vector& v)
{
    local_int_t localLength = v.localLength;
    cudaMemcpy(v.values_d, v.values, sizeof(double) * localLength, cudaMemcpyHostToDevice);
    return;
}
#endif
inline void CopyAndReorderVector(const Vector& v, Vector& w, local_int_t* perm)
{
local_int_t localLength = v.localLength;
assert(w.localLength >= localLength);
double* vv = v.values;
double* wv = w.values;
local_int_t i;
#ifndef HPCG_NO_OPENMP
#pragma omp parallel for
#endif
for (i = 0; i < localLength; ++i)
{
wv[i] = vv[perm[i]];
}
return;
}
/*!
Deallocates the members of the data structure of the known system matrix provided they are not 0.
@param[in] A the known system matrix
*/
/*!
  Deallocates the storage owned by a vector and resets it to an empty,
  uninitialized state so accidental reuse or a second DeleteVector call
  cannot double-free or dereference stale pointers.
  @param[inout] v the vector to tear down
*/
inline void DeleteVector(Vector& v)
{
    // Release host storage with the allocator that created it (see InitializeVector).
    if (v.isCudaHost)
        cudaFreeHost(v.values);
    else
    {
        delete[] v.values;
    }
    v.values = nullptr; // guard against dangling pointer / double free
    v.localLength = 0;
#ifdef USE_CUDA
    // Device buffer exists only for GPU-rank vectors.
    if (v.values_d)
    {
        cudaFree(v.values_d);
        v.values_d = nullptr;
    }
#endif
    v.initialized = false; // so ZeroVector/CopyVector asserts catch use-after-delete
    return;
}
#endif // VECTOR_HPP

98
src/WriteProblem.cpp Normal file
View File

@@ -0,0 +1,98 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file WriteProblem.cpp
HPCG routine
*/
#include "WriteProblem.hpp"
#include <cstdio>
/*!
Routine to dump:
- matrix in row, col, val format for analysis with MATLAB
- x, xexact, b as simple arrays of numbers.
Writes to A.dat, x.dat, xexact.dat and b.dat, respectivly.
NOTE: THIS CODE ONLY WORKS ON SINGLE PROCESSOR RUNS
Read into MATLAB using:
load A.dat
A=spconvert(A);
load x.dat
load xexact.dat
load b.dat
@param[in] geom The description of the problem's geometry.
@param[in] A The known system matrix
@param[in] b The known right hand side vector
@param[in] x The solution vector computed by CG iteration
@param[in] xexact Generated exact solution
@return Returns with -1 if used with more than one MPI process. Returns with 0 otherwise.
@see GenerateProblem
*/
int WriteProblem(const Geometry& geom, const SparseMatrix& A, const Vector b, const Vector x, const Vector xexact)
{
    if (geom.size != 1)
        return -1; // TODO Only works on one processor. Need better error handler
    const global_int_t nrow = A.totalNumberOfRows;
    FILE *fA = 0, *fx = 0, *fxexact = 0, *fb = 0;
    fA = fopen("A.dat", "w");
    fx = fopen("x.dat", "w");
    fxexact = fopen("xexact.dat", "w");
    fb = fopen("b.dat", "w");
    // If any file failed to open, close those that did open and bail out.
    if (!fA || !fx || !fxexact || !fb)
    {
        if (fb)
            fclose(fb);
        if (fxexact)
            fclose(fxexact);
        if (fx)
            fclose(fx);
        if (fA)
            fclose(fA);
        return -1;
    }
    for (global_int_t i = 0; i < nrow; i++)
    {
        const double* const currentRowValues = A.matrixValues[i];
        const global_int_t* const currentRowIndices = A.mtxIndG[i];
        const int currentNumberOfNonzeros = A.nonzerosInRow[i];
        // Matrix entries are written 1-based (row col value) for MATLAB's spconvert.
        for (int j = 0; j < currentNumberOfNonzeros; j++)
#ifdef HPCG_NO_LONG_LONG
            fprintf(fA, " %d %d %22.16e\n", i + 1, (global_int_t) (currentRowIndices[j] + 1), currentRowValues[j]);
#else
            fprintf(fA, " %lld %lld %22.16e\n", i + 1, (global_int_t) (currentRowIndices[j] + 1), currentRowValues[j]);
#endif
        fprintf(fx, "%22.16e\n", x.values[i]);
        fprintf(fxexact, "%22.16e\n", xexact.values[i]);
        fprintf(fb, "%22.16e\n", b.values[i]);
    }
    fclose(fA);
    fclose(fx);
    fclose(fxexact);
    fclose(fb);
    return 0;
}

22
src/WriteProblem.hpp Normal file
View File

@@ -0,0 +1,22 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef WRITEPROBLEM_HPP
#define WRITEPROBLEM_HPP
#include "Geometry.hpp"
#include "SparseMatrix.hpp"
#include <string>
int WriteProblem(const Geometry& geom, const SparseMatrix& A, const Vector b, const Vector x, const Vector xexact);
#endif // WRITEPROBLEM_HPP

107
src/YAML_Doc.cpp Normal file
View File

@@ -0,0 +1,107 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "YAML_Doc.hpp"
#include <cassert>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
using namespace std;
/*!
Sets the application name and version which will become part of the YAML doc.
@param[in] miniApp_Name application name
@param[in] miniApp_Version application name
@param[in] destination_Directory destination directory for the YAML document
@param[in] destination_FileName file name for the YAML document
*/
// Constructor: records the application identity and the optional output
// destination; initializes members directly instead of default-constructing
// then assigning (declaration order matches YAML_Doc.hpp).
YAML_Doc::YAML_Doc(const std::string& miniApp_Name, const std::string& miniApp_Version,
    const std::string& destination_Directory, const std::string& destination_FileName)
    : miniAppName(miniApp_Name)
    , miniAppVersion(miniApp_Version)
    , destinationDirectory(destination_Directory)
    , destinationFileName(destination_FileName)
{
}
// Destructor: nothing to do here — the YAML_Element base destructor deletes
// all child elements.
YAML_Doc::~YAML_Doc(void) {}
/*!
Generates YAML from the elements of the document and saves it to a file.
@return returns the complete YAML document as a string
*/
string YAML_Doc::generateYAML()
{
    string yaml;
    // Document header: "<app> version: <version>" followed by every element tree.
    yaml = yaml + miniAppName + " version: " + miniAppVersion + "\n";
    for (size_t i = 0; i < children.size(); i++)
    {
        yaml = yaml + children[i]->printYAML("");
    }
    // Build a timestamp used to make the default file name unique per run.
    time_t rawtime;
    tm* ptm;
    time(&rawtime);
    ptm = localtime(&rawtime); // NOTE(review): localtime returns a shared static buffer — confirm single-threaded use
    char sdate[64];
    // use tm_mon+1 because tm_mon is 0 .. 11 instead of 1 .. 12
    sprintf(sdate, "%04d.%02d.%02d.%02d.%02d.%02d", ptm->tm_year + 1900, ptm->tm_mon + 1, ptm->tm_mday, ptm->tm_hour,
        ptm->tm_min, ptm->tm_sec);
    string filename;
    if (destinationFileName == "")
        filename = miniAppName + "-" + miniAppVersion + "_";
    else
        filename = destinationFileName;
    filename = filename + string(sdate) + ".yaml";
    if (destinationDirectory != "" && destinationDirectory != ".")
    {
        // NOTE(review): the directory path is passed to the shell unescaped, and the
        // assert fires when mkdir returns nonzero (e.g. directory already exists) —
        // confirm this is the intended behavior.
        string mkdir_cmd = "mkdir " + destinationDirectory;
        int result = system(mkdir_cmd.c_str());
        assert(result == 0);
        // NOTE(review): this discards the timestamped name (and ".yaml" suffix)
        // built above and uses the bare destinationFileName — verify intended.
        filename = destinationDirectory + "/" + destinationFileName;
    }
    else
        filename = "./" + filename;
    ofstream myfile;
    myfile.open(filename.c_str());
    myfile << yaml;
    myfile.close();
    return yaml;
}

117
src/YAML_Doc.hpp Normal file
View File

@@ -0,0 +1,117 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file YAML_Doc.hpp
HPCG YAML classes
*/
// Changelog
//
// Version 0.1
// - Initial version.
//
/////////////////////////////////////////////////////////////////////////
#ifndef YAML_DOC_HPP
#define YAML_DOC_HPP
#include "YAML_Element.hpp"
#include <string>
//! The YAML_Doc class for the uniform collecting and reporting of performance data for HPCG
/*!
The YAML_Doc class works in conjunction with the YAML_Element class to facilitate easy collecting and reporting of
YAML-formatted data that can be then registered with the HPCG results collection website.
\code
//EXAMPLE CODE FOR GENERATING YAML
YAML_Doc doc("hpcg","0.1");
doc.add("final_residual",1.4523e-13);
doc.add("time","4.893");
//note: the following line will remove the data (4.890) associated with "time"
doc.get("time")->add("total",4.243);
//note: the following line will likewise remove the data (1.243) associated with "time"
doc.get("time")->get("total")->add("time",2.457);
doc.get("time")->get("total")->add("flops",4.88e5);
doc.get("time")->add("ddot",1.243);
doc.get("time")->add("sparsemv","");
doc.get("time")->get("sparsemv")->add("time",0.3445);
doc.get("time")->get("sparsemv")->add("overhead","");
doc.get("time")->get("sparsemv")->get("overhead")->add("time",0.0123);
doc.get("time")->get("sparsemv")->get("overhead")->add("percentage",0.034);
cout << doc.generateYAML() << endl;
return 0;
\endcode
Below is the output generated by the above code:
\verbatim
final_residual: 1.4523e-13
time:
total:
time: 2.457
flops: 4.88e5
ddot: 1.243
sparsemv:
time: 0.3445
overhead:
time: 0.0123
percentage: 0.034
\endverbatim
\note {No value is allowed to be attached to a key that has children. If children are added to a key, the value is
simply set to "".}
*/
class YAML_Doc : public YAML_Element
{
public:
//! Constructor: accepts mini-application name and version as strings, optionally accepts directory and file name
//! for printing results.
/*!
The sole constructor for this class accepts and name and version number for the mini-application as well as
optional directory and file name information for results that are generated by the generateYAML() method. \param
miniApp_Name (in) string containing name of the mini-application \param miniApp_Version (in) string containing the
version of the mini-application \param destination_Directory (in, optional) path of directory where results file
will be stored, relative to current working directory. If this value is not supplied, the results file will be
stored in the current working directory. If the directory does not exist it will be created. \param
destination_FileName (in, optional) root name of the results file. A suffix of ".yaml" will be automatically
appended. If no file name is specified the filename will be constructed by concatenating the miniAppName +
miniAppVersion + ".yaml" strings.
*/
YAML_Doc(const std::string& miniApp_Name, const std::string& miniApp_Version,
const std::string& destination_Directory = "", const std::string& destination_FileName = "");
//! Destructor
~YAML_Doc();
//! Generate YAML results to standard out and to a file using specified directory and filename, using current
//! directory and miniAppName + miniAppVersion + ".yaml" by default
std::string generateYAML();
protected:
std::string miniAppName; //!< the name of the application that generated the YAML output
std::string miniAppVersion; //!< the version of the application that generated the YAML output
std::string destinationDirectory; //!< the destination directory for the generated the YAML output
std::string destinationFileName; //!< the filename for the generated the YAML output
};
#endif // YAML_DOC_HPP

220
src/YAML_Element.cpp Normal file
View File

@@ -0,0 +1,220 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file YAML_Element.cpp
HPCG routine
*/
#include "YAML_Element.hpp"
#include <fstream>
#include <iostream>
#include <sstream>
using namespace std;
// Constructor: stores the key/value pair directly via the member-initializer
// list instead of default-constructing the strings and then assigning.
YAML_Element::YAML_Element(const std::string& key_arg, const std::string& value_arg)
    : key(key_arg)
    , value(value_arg)
{
}
// Destructor: this element owns its children, so release them all.
YAML_Element::~YAML_Element()
{
    for (YAML_Element* child : children)
    {
        delete child;
    }
    children.clear();
}
/*!
Add an element to the vector
QUESTION: if an element is not added because the key already exists,
will this lead to memory leakage?
@param[in] key_arg The key under which the element is stored
@param[in] value_arg The value of the element
@return Returns the added element
*/
YAML_Element* YAML_Element::add(const std::string& key_arg, double value_arg)
{
    // A key with children carries no scalar value of its own.
    this->value = "";
    YAML_Element* child = new YAML_Element(key_arg, convert_double_to_string(value_arg));
    children.push_back(child);
    return child;
}
/*!
Add an element to the vector
@param[in] key_arg The key under which the element is stored
@param[in] value_arg The value of the element
@return Returns the added element
*/
YAML_Element* YAML_Element::add(const std::string& key_arg, int value_arg)
{
    // A key with children carries no scalar value of its own.
    this->value = "";
    YAML_Element* child = new YAML_Element(key_arg, convert_int_to_string(value_arg));
    children.push_back(child);
    return child;
}
#ifndef HPCG_NO_LONG_LONG
/*!
Add an element to the vector
@param[in] key_arg The key under which the element is stored
@param[in] value_arg The value of the element
@return Returns the added element
*/
YAML_Element* YAML_Element::add(const std::string& key_arg, long long value_arg)
{
    // A key with children carries no scalar value of its own.
    this->value = "";
    YAML_Element* child = new YAML_Element(key_arg, convert_long_long_to_string(value_arg));
    children.push_back(child);
    return child;
}
#endif
/*!
Add an element to the vector
@param[in] key_arg The key under which the element is stored
@param[in] value_arg The value of the element
@return Returns the added element
*/
YAML_Element* YAML_Element::add(const std::string& key_arg, size_t value_arg)
{
    // A key with children carries no scalar value of its own.
    this->value = "";
    YAML_Element* child = new YAML_Element(key_arg, convert_size_t_to_string(value_arg));
    children.push_back(child);
    return child;
}
/*!
Add an element to the vector
@param[in] key_arg The key under which the element is stored
@param[in] value_arg The value of the element
@return Returns the added element
*/
YAML_Element* YAML_Element::add(const std::string& key_arg, const std::string& value_arg)
{
    // A key with children carries no scalar value of its own.
    this->value = "";
    YAML_Element* child = new YAML_Element(key_arg, value_arg);
    children.push_back(child);
    return child;
}
/*!
Returns the pointer to the YAML_Element for the given key.
@param[in] key_arg The key under which the element was stored
@return If found, returns the element, otherwise returns NULL
*/
YAML_Element* YAML_Element::get(const std::string& key_arg)
{
    // Linear scan over direct children; first match wins.
    for (YAML_Element* child : children)
    {
        if (child->getKey() == key_arg)
            return child;
    }
    return 0; // not found
}
/*!
Prints a line of a YAML document. Correct YAML depends on
correct spacing; the parameter space should be the proper
amount of space for the parent element
@param[in] space spacing inserted at the beginning of the line
@return Returns a single line of the YAML document without the leading white space
*/
string YAML_Element::printYAML(std::string space)
{
    // This element's own line, then each child indented two extra spaces.
    string result = space + key + ": " + value + "\n";
    const string child_space = space + "  ";
    for (size_t c = 0; c < children.size(); c++)
    {
        result = result + children[c]->printYAML(child_space);
    }
    return result;
}
/*!
  Converts a double precision value to a string using default stream formatting.
  @param[in] value_arg The value to be converted.
  @return The textual representation of value_arg.
*/
string YAML_Element::convert_double_to_string(double value_arg)
{
    std::ostringstream text;
    text << value_arg;
    return text.str();
}
/*!
  Converts an integer value to a string using default stream formatting.
  @param[in] value_arg The value to be converted.
  @return The textual representation of value_arg.
*/
string YAML_Element::convert_int_to_string(int value_arg)
{
    std::ostringstream text;
    text << value_arg;
    return text.str();
}
#ifndef HPCG_NO_LONG_LONG
/*!
  Converts a "long long" integer value to a string using default stream formatting.
  @param[in] value_arg The value to be converted.
  @return The textual representation of value_arg.
*/
string YAML_Element::convert_long_long_to_string(long long value_arg)
{
    std::ostringstream text;
    text << value_arg;
    return text.str();
}
#endif
/*!
  Converts a "size_t" integer value to a string using default stream formatting.
  @param[in] value_arg The value to be converted.
  @return The textual representation of value_arg.
*/
string YAML_Element::convert_size_t_to_string(size_t value_arg)
{
    std::ostringstream text;
    text << value_arg;
    return text.str();
}

87
src/YAML_Element.hpp Normal file
View File

@@ -0,0 +1,87 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*!
@file YAML_Element.hpp
HPCG data structures for YAML output
*/
// Changelog
//
// Version 0.1
// - Initial version.
//
/////////////////////////////////////////////////////////////////////////
#ifndef YAML_ELEMENT_HPP
#define YAML_ELEMENT_HPP
#include "Geometry.hpp"
#include <string>
#include <vector>
//! HPCG YAML_Element class, from the HPCG YAML_Element class for registering key-value pairs of performance data
/*!
HPCG generates a collection of performance data for each run of the executable. YAML_Element, and
the related YAML_Doc class, provide a uniform facility for gathering and reporting this data using the YAML text
format.
*/
class YAML_Element
{
public:
    //! Default constructor: creates an element with empty key and value.
    YAML_Element()
    {
        key = "";
        value = "";
    }
    //! Construct with known key-value pair
    YAML_Element(const std::string& key_arg, const std::string& value_arg);
    //! Destructor
    // NOTE(review): children are raw pointers allocated with new in add();
    // presumably the out-of-line destructor releases them - confirm in YAML_Element.cpp
    ~YAML_Element();
    //! Key accessor method
    std::string getKey()
    {
        return key;
    }
    //! Add a child element to an element list associated with this element, value of type double
    YAML_Element* add(const std::string& key_arg, double value_arg);
    //! Add a child element to an element list associated with this element, value of type int
    YAML_Element* add(const std::string& key_arg, int value_arg);
#ifndef HPCG_NO_LONG_LONG
    //! Add a child element to an element list associated with this element, value of type long long
    YAML_Element* add(const std::string& key_arg, long long value_arg);
#endif
    //! Add a child element to an element list associated with this element, value of type size_t
    YAML_Element* add(const std::string& key_arg, size_t value_arg);
    //! Add a child element to an element list associated with this element, value of type string
    YAML_Element* add(const std::string& key_arg, const std::string& value_arg);
    //! Get the element in the list with the given key; returns NULL when the key is absent
    YAML_Element* get(const std::string& key_arg);
    //! Render this element and, recursively, its children as YAML text;
    //! 'space' is the indentation prefix for this element's own line
    std::string printYAML(std::string space);
protected:
    std::string key;                     //!< the key under which the element is stored
    std::string value;                   //!< the value of the stored element
    std::vector<YAML_Element*> children; //!< children elements of this element
private:
    // Helpers converting each supported value type to its textual form
    std::string convert_double_to_string(double value_arg);
    std::string convert_int_to_string(int value_arg);
#ifndef HPCG_NO_LONG_LONG
    std::string convert_long_long_to_string(long long value_arg);
#endif
    std::string convert_size_t_to_string(size_t value_arg);
};
#endif // YAML_ELEMENT_HPP

49
src/finalize.cpp Normal file
View File

@@ -0,0 +1,49 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <fstream>
#include "hpcg.hpp"
extern int use_output_file;
/*!
Closes the I/O stream used for logging information throughout the HPCG run.
@return returns 0 upon success and non-zero otherwise
@see HPCG_Init
*/
int HPCG_Finalize(void)
{
    // Nothing to release when logging went to stdout.
    if (!use_output_file)
    {
        return 0;
    }
    HPCG_fout.close();
    return 0;
}

150
src/hpcg.hpp Normal file
View File

@@ -0,0 +1,150 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file hpcg.hpp
HPCG data structures and functions
*/
/*
History:
*05.28.2023: HPC-Benchmark 23.5 release
*/
#ifndef HPCG_HPP
#define HPCG_HPP
#include "Geometry.hpp"
#include <fstream>
#ifndef USE_CUDA
#if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64) || defined(__amd64__) || defined(__amd64) \
|| defined(_M_X64)
#define USE_CUDA
#endif
#endif
#ifdef USE_CUDA
#include "Cuda.hpp"
#endif
#define XSTR(s) STR(s)
#define STR(s) #s
#define EMPTY_MACRO_ 1
#define CHECK_EMPTY_MACRO_(x) EMPTY_MACRO_##x
#define CHECK_EMPTY_MACRO(x) CHECK_EMPTY_MACRO_(x)
#ifndef make_HPCG_VER_MAJOR
#define HPCG_VER_MAJOR 24
#elif CHECK_EMPTY_MACRO(make_HPCG_VER_MAJOR) == 1
#define HPCG_VER_MAJOR 24
#else
#define HPCG_VER_MAJOR make_HPCG_VER_MAJOR
#endif
#ifndef make_HPCG_VER_MINOR
#define HPCG_VER_MINOR 09
#elif CHECK_EMPTY_MACRO(make_HPCG_VER_MINOR) == 1
#define HPCG_VER_MINOR 09
#else
#define HPCG_VER_MINOR make_HPCG_VER_MINOR
#endif
#ifndef make_HPCG_VER_PATCH
#define HPCG_VER_PATCH 0
#elif CHECK_EMPTY_MACRO(make_HPCG_VER_PATCH) == 1
#define HPCG_VER_PATCH 0
#else
#define HPCG_VER_PATCH make_HPCG_VER_PATCH
#endif
#ifndef make_HPCG_VER_BUILD
#define HPCG_VER_BUILD 0
#elif CHECK_EMPTY_MACRO(make_HPCG_VER_BUILD) == 1
#define HPCG_VER_BUILD 0
#else
#define HPCG_VER_BUILD make_HPCG_VER_BUILD
#endif
#define HPCG_VERSION (HPCG_VER_MAJOR * 1000 + HPCG_VER_MINOR * 100 + HPCG_VER_PATCH)
#define VER_HEADER \
"HPCG-NVIDIA " XSTR(HPCG_VER_MAJOR)"." XSTR(HPCG_VER_MINOR) "." XSTR(HPCG_VER_PATCH) " -- NVIDIA accelerated HPCG benchmark -- NVIDIA\n"
#define HPCG_LINE_MAX 256
extern std::ofstream HPCG_fout;
// Refer to src/init.cpp for possible user-defined values
struct HPCG_Params_STRUCT
{
    int comm_size;   //!< Number of MPI processes in MPI_COMM_WORLD
    int comm_rank;   //!< This process' MPI rank in the range [0 to comm_size - 1]
    int numThreads;  //!< This process' number of threads
    local_int_t nx;  //!< Number of x-direction grid points for each local subdomain (forced to >= 16 in init.cpp)
    local_int_t ny;  //!< Number of y-direction grid points for each local subdomain
    local_int_t nz;  //!< Number of z-direction grid points for each local subdomain
    int runningTime; //!< Number of seconds to run the timed portion of the benchmark
    int npx;         //!< Number of processes in x-direction of 3D process grid (npx*npy*npz must equal comm_size)
    int npy;         //!< Number of processes in y-direction of 3D process grid
    int npz;         //!< Number of processes in z-direction of 3D process grid
    int pz;          //!< Partition in the z processor dimension, default is npz
    local_int_t zl;  //!< nz for processors in the z dimension with value less than pz
    local_int_t zu;  //!< nz for processors in the z dimension with value greater than pz
    bool benchmark_mode;         // !< Skips running reference code
    bool use_l2compression;      // !< Activates GPU L2 Compression
    bool use_hpcg_mem_reduction; // !< Not passed as parameter. Set in main to true. Activates aggressive memory
                                 // reduction optimizations
    rank_type_t rank_type;       // !< Not passed as parameter. GPU or CPU; filled in by InitializeRanks()
    p2p_comm_mode_t p2_mode;     // !< We have 4 methods to do p2p comm in MV and MG, refer to Geometry.hpp
    exec_mode_t exec_mode = GPUONLY; // !< Three modes supported: GPUONLY, CPUONLY, GPUCPU.
    int g2c;                     // !< Related to GPU/CPU local problem definition
    dim_3d_t diff_dim;           // !< Specifies the dim that is different for the CPU and GPU ranks
    local_problem_def_t local_problem_def; // !< Specifies how nx, ny, nz, and g2c are interpreted (4 possibilities)
    bool cpu_allowed_to_print; // !< Not passed as parameter. Specifies the CPU rank (opposite to GPU rank) that is
                               // allowed to print
    bool use_output_file; // !< There is a global variable with the same name defined in src/init.cpp and used
                          // throughout the files; HPCG_Init keeps the two in sync
    local_int_t gpu_slice_size; //!< Slice size for GPU ranks (--gss; defaults to 4096 in init.cpp)
    local_int_t cpu_slice_size; //!< Slice size for CPU ranks (--css; defaults to 8 in init.cpp)
};
/*!
HPCG_Params is a shorthand for HPCG_Params_STRUCT
*/
typedef HPCG_Params_STRUCT HPCG_Params;
extern void InitializeRanks(HPCG_Params& params);
extern int HPCG_Init(int* argc_p, char*** argv_p, HPCG_Params& params);
extern int HPCG_Finalize(void);
#endif // HPCG_HPP

444
src/init.cpp Normal file
View File

@@ -0,0 +1,444 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#ifndef HPCG_NO_OPENMP
#include <omp.h>
#endif
#ifdef _WIN32
const char* NULLDEVICE = "nul";
#else
const char* NULLDEVICE = "/dev/null";
#endif
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include "hpcg.hpp"
#include "ReadHpcgDat.hpp"
int use_output_file = 0;
std::ofstream HPCG_fout; //!< output file stream for logging activities during HPCG run
#if defined(USE_CUDA) && defined(USE_NCCL)
ncclComm_t Nccl_Comm;
#endif
#ifndef HPCG_NO_MPI
char host_name[MPI_MAX_PROCESSOR_NAME];
char pro_name[MPI_MAX_PROCESSOR_NAME];
MPI_Comm proComm;
int global_rank = 0;
int global_total_ranks = 0;
int program_rank = 0;
int program_total_ranks = 0;
int* physical_rank_dims;
int* logical_rank_to_phys;
int* physical_rank_dims_d;
int* logical_rank_to_phys_d;
#else
char host_name[1000];
char pro_name[1000];
#endif
// Returns 1 when 's' begins with 'prefix' (an empty prefix always matches), 0 otherwise.
static int startswith(const char* s, const char* prefix)
{
    return strncmp(s, prefix, strlen(prefix)) == 0 ? 1 : 0;
}
// qsort-compatible comparator: treats both arguments as C strings and
// orders them with strcmp.
int stringCmp(const void* a, const void* b)
{
    const char* lhs = static_cast<const char*>(a);
    const char* rhs = static_cast<const char*>(b);
    return strcmp(lhs, rhs);
}
/*!
Initializes an HPCG run by obtaining problem parameters (from a file or
command line) and then broadcasts them to all nodes. It also initializes
logging I/O streams that are used throughout the HPCG run. Only MPI rank 0
performs I/O operations.
The function assumes that MPI has already been initialized for MPI runs.
@param[in] argc_p the pointer to the "argc" parameter passed to the main() function
@param[in] argv_p the pointer to the "argv" parameter passed to the main() function
@param[out] params the reference to the data structures that is filled the basic parameters of the run
@return returns 0 upon success and non-zero otherwise
@see HPCG_Finalize
*/
/*!
  Assigns a role (GPU rank or CPU rank) to this process and sets up the
  communicators that role needs.

  Ranks are grouped twice: first by program name (supporting launches that mix
  separate CPU and GPU binaries) into proComm, then by host name into a
  per-node communicator that yields a node-local rank. The node-local rank
  selects the CUDA device for GPU ranks and, in mixed GPU+CPU mode, decides
  which ranks drive GPUs.

  @param[inout] params On entry exec_mode and p2_mode must be set; on exit
                       rank_type (and cpu_allowed_to_print in GPUCPU mode) are filled in.
*/
void InitializeRanks(HPCG_Params& params)
{
    char(*host_names)[MPI_MAX_PROCESSOR_NAME];
    char(*program_names)[MPI_MAX_PROCESSOR_NAME];
    MPI_Comm nodeComm;
    int n, namelen, color, local_procs;
    size_t bytes;
    int deviceCount; // NOTE(review): only assigned under USE_CUDA but read in the
                     // GPUCPU branch below - uninitialized in non-CUDA builds; confirm
                     // GPUCPU mode implies USE_CUDA
    int local_rank = 0;
    // 1) Find global rank / total rank count across all processes (CPU and GPU)
    MPI_Comm_rank(MPI_COMM_WORLD, &global_rank);        // Global rank for CPU and GPU
    MPI_Comm_size(MPI_COMM_WORLD, &global_total_ranks); // Global Number of ranks for CPU and GPU
    physical_rank_dims = new int[3 * global_total_ranks];
    logical_rank_to_phys = new int[global_total_ranks];
    bytes = global_total_ranks * sizeof(char[MPI_MAX_PROCESSOR_NAME]);
    // Color ranks by program name (if more than one binary executed, e.g., one for CPU and one for GPU)
    program_names = (char(*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes);
    // __FILE__ acts as the program identifier; it differs between binaries built
    // from different sources
    strcpy(program_names[global_rank], __FILE__);
    // Every rank broadcasts its program name so all ranks hold the full table
    for (n = 0; n < global_total_ranks; n++)
    {
        MPI_Bcast(&(program_names[n]), MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD);
    }
    qsort(program_names, global_total_ranks, sizeof(char[MPI_MAX_PROCESSOR_NAME]), stringCmp);
    // color = index of this rank's program name among the distinct sorted names
    color = 0;
    for (n = 0; n < global_total_ranks; n++)
    {
        if (n > 0 && strcmp(program_names[n - 1], program_names[n]))
            color++;
        if (strcmp(__FILE__, program_names[n]) == 0)
            break;
    }
    // proComm groups all ranks running the same binary
    MPI_Comm_split(MPI_COMM_WORLD, color, 0, &proComm);
    MPI_Comm_rank(proComm, &program_rank);
    MPI_Comm_size(proComm, &program_total_ranks);
    free(program_names);
    // Repeat the same coloring, now by host name, to build a per-node communicator
    MPI_Get_processor_name(host_name, &namelen); // Host name
    host_names = (char(*)[MPI_MAX_PROCESSOR_NAME]) malloc(bytes);
    strcpy(host_names[global_rank], host_name);
    for (n = 0; n < global_total_ranks; n++)
    {
        MPI_Bcast(&(host_names[n]), MPI_MAX_PROCESSOR_NAME, MPI_CHAR, n, MPI_COMM_WORLD);
    }
    qsort(host_names, global_total_ranks, sizeof(char[MPI_MAX_PROCESSOR_NAME]), stringCmp);
    color = 0;
    for (n = 0; n < global_total_ranks; n++)
    {
        if (n > 0 && strcmp(host_names[n - 1], host_names[n]))
            color++;
        if (strcmp(host_name, host_names[n]) == 0)
            break;
    }
    MPI_Comm_split(proComm, color, 0, &nodeComm);
    MPI_Comm_rank(nodeComm, &local_rank);  // Rank within this node
    MPI_Comm_size(nodeComm, &local_procs); // Number of ranks on this node
    free(host_names);
#ifdef USE_CUDA
    cudaGetDeviceCount(&deviceCount);
#endif
    // Figure out the rank type, based on execution mode (params.exec_mode)
    if (params.exec_mode == CPUONLY)
    {
        params.rank_type = CPU;
    }
    else if (params.exec_mode == GPUONLY)
    {
        params.rank_type = GPU;
#ifdef USE_CUDA
        cudaGetDeviceCount(&deviceCount);
        // Round-robin node-local ranks over the visible devices
        cudaSetDevice(local_rank % deviceCount);
        // Touch Pinned Memory
        double* t;
        cudaMallocHost((void**) (&(t)), sizeof(double));
        cudaFreeHost(t);
        if (params.p2_mode == NCCL)
        {
#ifdef USE_NCCL
            // Rank 0 creates the NCCL unique id and shares it so every rank
            // joins the same NCCL communicator
            ncclUniqueId id;
            if (global_rank == 0)
                ncclGetUniqueId(&id);
            MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
            ncclCommInitRank(&Nccl_Comm, global_total_ranks, id, global_rank);
#endif // USE_NCCL
        }
#endif // USE_CUDA
    }
    else /*CPUGPU*/
    {
        // Here we assume that a node has the same number of GPU and CPU ranks
        // This design is rigid but it is difficult to assign ranks automatically
        // to GPUs and CPUs otherwise
        params.cpu_allowed_to_print = false; // Enable printing for the first CPU rank only
        int ranks_for_numa = local_procs / deviceCount;
        if (ranks_for_numa == 1)
        {
            if (global_rank == 0)
                printf("Warning: All Ranks will be Assigned to GPUs, check the total number of ranks\n");
        }
        // The first rank of each group of 'ranks_for_numa' node-local ranks drives a GPU
        if (local_rank % ranks_for_numa == 0)
        {
            params.rank_type = GPU;
#ifdef USE_CUDA
            cudaSetDevice(local_rank / ranks_for_numa);
            // Touch Pinned Memory
            double* t;
            cudaMallocHost((void**) (&(t)), sizeof(double));
            cudaFreeHost(t);
#endif
        }
        else
        {
            params.rank_type = CPU;
            // Exactly one CPU rank (second node-local rank on the first node) may print
            if (local_rank == 1 && color == 0)
            {
                params.cpu_allowed_to_print = true;
            }
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
}
/*!
  Parses run parameters (positional args, --key= args, or hpcg.dat), fills
  'params', and opens the optional log output stream. See the file-level
  comment above InitializeRanks for the full contract.
*/
int HPCG_Init(int* argc_p, char*** argv_p, HPCG_Params& params)
{
    int argc = *argc_p;
    char** argv = *argv_p;
    char fname[80];
    int i, j, *iparams;
    // Recognized option prefixes; index k in cparams corresponds to iparams[k]
    char cparams[][9] = {"--nx=", "--ny=", "--nz=", "--rt=", "--npx=", "--npy=", "--npz=", "--b=", "--l2cmp=", "--mr=",
        "--exm=", "--g2c=", "--ddm=", "--lpm=", "--p2p=", "--of=", "--gss=", "--css="};
    time_t rawtime;
    tm* ptm;
    const int nparams = (sizeof cparams) / (sizeof cparams[0]);
    bool broadcastParams = false; // Make true if parameters read from file.
    // Environment variable may pre-select file logging (can be overridden by --of below)
    const char* name = "HPCG_USE_OUTPUT_FILE";
    char* value;
    value = getenv(name);
    if (value != NULL)
    {
        use_output_file = atoi(value);
    }
    iparams = (int*) malloc(sizeof(int) * nparams);
    // Initialize iparams
    for (i = 0; i < nparams; ++i)
        iparams[i] = 0;
    /* for sequential and some MPI implementations it's OK to read first three args */
    // Positional form; values below 11 are rejected as invalid
    for (i = 0; i < nparams; ++i)
        if (argc <= i + 1 || sscanf(argv[i + 1], "%d", iparams + i) != 1 || iparams[i] < 11)
            iparams[i] = 0;
    /* for some MPI environments, command line arguments may get complicated so we need a prefix */
    // argv[argc] is NULL per the C standard, which terminates this scan safely
    for (i = 1; i <= argc && argv[i]; ++i)
        for (j = 0; j < nparams; ++j)
            if (startswith(argv[i], cparams[j]))
                if (sscanf(argv[i] + strlen(cparams[j]), "%d", iparams + j) != 1)
                    iparams[j] = 0;
    // Check if --rt was specified on the command line
    int* rt = iparams + 3; // Assume runtime was not specified and will be read from the hpcg.dat file
    if (iparams[3])
        rt = 0; // If --rt was specified, we already have the runtime, so don't read it from file
    if (!iparams[0] && !iparams[1] && !iparams[2])
    { /* no geometry arguments on the command line */
        char HPCG_DAT_FILE[HPCG_LINE_MAX];
        // argv[1] (if present) is interpreted as a path to the .dat file
        if (argc > 1)
        {
            strcpy(HPCG_DAT_FILE, argv[1]);
        }
        else
        {
            strcpy(HPCG_DAT_FILE, "./hpcg.dat");
        }
        if (ReadHpcgDat(iparams, rt, iparams + 7, HPCG_DAT_FILE) == -1)
        {
            printf("No input data. Possible options:\n");
            fflush(0);
            printf("\t1) Specify path to input file: ./xhpcg <path to *.dat file>\n");
            printf("\t2) Copy hpcg.dat to the run directory\n");
            printf("\t3) Use command line parameters: ./xhpcg --nx <x> --ny <y> --nz <z> --rt <t>\n");
            exit(-1);
        }
        broadcastParams = true;
    }
    // Check for small or unspecified nx, ny, nz values
    // If any dimension is less than 16, make it the max over the other two dimensions, or 16, whichever is largest
    for (i = 0; i < 3; ++i)
    {
        if (iparams[i] < 16)
            for (j = 1; j <= 2; ++j)
                if (iparams[(i + j) % 3] > iparams[i])
                    iparams[i] = iparams[(i + j) % 3];
        if (iparams[i] < 16)
            iparams[i] = 16;
    }
#ifndef HPCG_NO_MPI
    MPI_Comm_rank(MPI_COMM_WORLD, &params.comm_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &params.comm_size);
#else
    params.comm_rank = 0;
    params.comm_size = 1;
#endif
    // Broadcast values of iparams to all MPI processes
#ifndef HPCG_NO_MPI
    if (broadcastParams)
    {
        MPI_Bcast(iparams, nparams, MPI_INT, 0, MPI_COMM_WORLD);
    }
#endif
    params.nx = iparams[0];
    params.ny = iparams[1];
    params.nz = iparams[2];
    params.runningTime = iparams[3];
    params.npx = iparams[4];
    params.npy = iparams[5];
    params.npz = iparams[6];
    params.benchmark_mode = iparams[7] > 0;
    params.use_l2compression = iparams[8] > 0;
    params.use_hpcg_mem_reduction = iparams[9] > 0;
    /* --exm: 1 selects CPUONLY, 2 selects GPUCPU, any other value (incl. the 0 default) selects GPUONLY */
    // NOTE(review): docs elsewhere may describe --exm as 0:CPU / 1:GPU; the code
    // below maps 1 to CPUONLY - confirm against bin/RUNNING-*
    params.exec_mode = iparams[10] == 2 ? GPUCPU : (iparams[10] == 1 ? CPUONLY : GPUONLY);
    params.g2c = iparams[11] == 0 ? 1 : iparams[11]; // 0 means "unspecified": default to 1
    /* --ddm: 0: NONE | 1: X | 2: Y | 3: Z */
    params.diff_dim = iparams[12] == 3 ? Z : (iparams[12] == 2 ? Y : (iparams[12] == 1 ? X : NONE));
    // GPU_RATIO=0/*NX, NY, NZ are local to GPU and g2c is a ratio*/
    // GPU_ABS=1/*NX, NY, NZ are local to GPU and g2c is absolute dimension size*/,
    // GPU_CPU_RATIO=2/*NX, NY, NZ are local to GPU+CPU and g2c is ratio*/,
    // GPU_CPU_ABS=3/*NX, NY, NZ are local to GPU+CPU and g2c is absolute dimension size*/
    if (iparams[13] == 1)
        params.local_problem_def = GPU_ABS;
    else if (iparams[13] == 2)
        params.local_problem_def = GPU_CPU_RATIO;
    else if (iparams[13] == 3)
        params.local_problem_def = GPU_CPU_ABS;
    else
        params.local_problem_def = GPU_RATIO;
    // P2P Communication method
    if (iparams[14] == 1)
        params.p2_mode = MPI_CPU_All2allv;
    else if (iparams[14] == 2)
        params.p2_mode = MPI_CUDA_AWARE;
    else if (iparams[14] == 3)
        params.p2_mode = MPI_GPU_All2allv;
    else if (iparams[14] == 4)
        params.p2_mode = NCCL;
    else
        params.p2_mode = MPI_CPU;
    // --of: mirror the choice into both the params struct and the file-scope global
    if (iparams[15] == 1)
    {
        params.use_output_file = 1;
        use_output_file = 1;
    }
    else
    {
        // NOTE(review): this overrides HPCG_USE_OUTPUT_FILE (read above) whenever
        // --of is absent or not 1 - confirm that precedence is intended
        params.use_output_file = 0;
        use_output_file = 0;
    }
    // --gss
    params.gpu_slice_size = iparams[16] > 0 ? iparams[16] : 4096;
    // --css
    params.cpu_slice_size = iparams[17] > 0 ? iparams[17] : 8;
    if (params.comm_rank == 0)
    {
        printf("%s", VER_HEADER);
    }
#ifdef HPCG_NO_OPENMP
    params.numThreads = 1;
#else
#pragma omp parallel
#pragma omp single
    params.numThreads = omp_get_num_threads();
#endif
    // Build a timestamped log file name, e.g. hpcg20240101T120000.txt
    time(&rawtime);
    ptm = localtime(&rawtime);
    sprintf(fname, "hpcg%04d%02d%02dT%02d%02d%02d.txt", 1900 + ptm->tm_year, ptm->tm_mon + 1, ptm->tm_mday,
        ptm->tm_hour, ptm->tm_min, ptm->tm_sec);
    if (use_output_file)
    {
        if (0 == params.comm_rank)
        {
            HPCG_fout.open(fname);
        }
        else
        {
#if defined(HPCG_DEBUG) || defined(HPCG_DETAILED_DEBUG)
            // Debug builds: every rank logs into its own rank-suffixed file
            sprintf(fname, "hpcg%04d%02d%02dT%02d%02d%02d_%d.txt", 1900 + ptm->tm_year, ptm->tm_mon + 1, ptm->tm_mday,
                ptm->tm_hour, ptm->tm_min, ptm->tm_sec, params.comm_rank);
            HPCG_fout.open(fname);
#else
            // Non-root ranks discard their log output
            HPCG_fout.open(NULLDEVICE);
#endif
        }
    }
    free(iparams);
    return 0;
}

878
src/main.cpp Normal file
View File

@@ -0,0 +1,878 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
@file main.cpp
- All enums are in Geometry.hpp
- Supports GPU-only, Grace-only, and GPU-Grace. GPU and Grace are different MPI ranks.
- The dimensions of GPU rank and CPU rank can only differ in one dimension (nx, ny, or nz).
- Parameters are explained in bin/RUNNING-*
*/
// Main routine of a program that calls the HPCG conjugate gradient
// solver to solve the problem, and then prints results.
#ifndef HPCG_NO_MPI
#include <mpi.h>
#endif
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>
#ifdef USE_GRACE
#include <nvpl_sparse.h>
#endif
#include "CG.hpp"
#include "CGData.hpp"
#include "CG_ref.hpp"
#include "CheckAspectRatio.hpp"
#include "CheckProblem.hpp"
#include "ComputeMG_ref.hpp"
#include "ComputeResidual.hpp"
#include "ComputeSPMV_ref.hpp"
#include "CpuKernels.hpp"
#include "CudaKernels.hpp"
#include "ExchangeHalo.hpp"
#include "GenerateCoarseProblem.hpp"
#include "GenerateGeometry.hpp"
#include "GenerateProblem.hpp"
#include "Geometry.hpp"
#include "OptimizeProblem.hpp"
#include "ReportResults.hpp"
#include "SetupHalo.hpp"
#include "SparseMatrix.hpp"
#include "TestCG.hpp"
#include "TestNorms.hpp"
#include "TestSymmetry.hpp"
#include "Vector.hpp"
#include "WriteProblem.hpp"
#include "hpcg.hpp"
#include "mytimer.hpp"
#ifdef HPCG_DETAILED_DEBUG
using std::cin;
#endif
using std::endl;
// Prints in a file or terminal
extern int use_output_file;
#ifdef USE_CUDA
cusparseHandle_t cusparsehandle;
cublasHandle_t cublashandle;
cudaStream_t stream;
cudaEvent_t copy_done;
cudaStream_t copy_stream;
int* ranktoId;
#endif
#ifdef USE_GRACE
nvpl_sparse_handle_t nvpl_sparse_handle;
#endif
// The communication mode used to send point-to-point messages
#ifndef HPCG_NO_MPI
p2p_comm_mode_t P2P_Mode;
#endif
// USE CUDA L2 compression
bool Use_Compression;
// USE HPCG aggressive memory reduction
bool Use_Hpcg_Mem_Reduction;
#ifndef HPCG_NO_MPI
// Used to find ranks for CPU and GPU programs
int* rankToId_h;
int* idToRank_h;
extern int* physical_rank_dims;
extern int* logical_rank_to_phys;
#endif
/*!
Main driver program: Construct synthetic problem, run V&V tests, compute benchmark parameters, run benchmark, report
results.
@param[in] argc Standard argument count. Should equal 1 (no arguments passed in) or 4 (nx, ny, nz passed in)
@param[in] argv Standard argument array. If argc==1, argv is unused. If argc==4, argv[1], argv[2], argv[3] will be
interpreted as nx, ny, nz, resp.
@return Returns zero on success and a non-zero value otherwise.
*/
int main(int argc, char* argv[])
{
#ifndef HPCG_NO_MPI
MPI_Init(&argc, &argv);
#endif
// Here I read all the parameters, including the execution mode (CPUONLY, GPUONLY, GPUCPU)
HPCG_Params params;
HPCG_Init(&argc, &argv, params);
bool quickPath = (params.runningTime == 0);
int size = params.comm_size, rank = params.comm_rank; // Number of MPI processes, My process ID
bool benchmark_mode = params.benchmark_mode;
Use_Compression = params.use_l2compression;
Use_Hpcg_Mem_Reduction = true; // params.use_hpcg_mem_reduction;
P2P_Mode = params.p2_mode;
if (rank == 0)
{
printf("Build v0.6.0 \n");
#ifdef HPCG_ENG_VERSION
printf("\n%s%s\n", "========================================", "========================================");
#ifdef HPCG_COMMIT_HASH
printf("Engineering version of HPCG-NVIDIA. Results cannot be shared with third parties\nCommit: %s\n",
XSTR(HPCG_COMMIT_HASH));
#else
printf("Engineering version of HPCG-NVIDIA. Results cannot be shared with third parties\nCommit:\n");
#endif
printf("%s%s\n", "========================================", "========================================");
#endif
printf("\nStart of application (%s) ...\n",
params.exec_mode == GPUONLY ? "GPU-Only"
: params.exec_mode == CPUONLY ? "Grace-Only"
: "GPU+Grace");
if (benchmark_mode)
printf(" | Benchmark Mode !!!! CPU reference code is not performed \n");
if (params.exec_mode == GPUONLY || params.exec_mode == GPUCPU)
if (Use_Compression)
printf(
" | L2 compression is activated !!!! Currently, it is not legal to submit HPCG results with L2 "
"compression\n");
#ifdef INDEX_64
printf(" | Using INT64 Indexing \n");
#endif
}
// Check P2P comm mode
// if (params.exec_mode == CPUONLY || params.exec_mode == GPUCPU)
// {
// #ifndef USE_GRACE
// if (rank == 0)
// printf(
// "Error: HPCG was not compiled for Grace execution. USE --exm=0 for GPU-only execution or add "
// "-DUSE_GRACE. Exiting ...\n");
// #ifndef HPCG_NO_MPI
// MPI_Finalize();
// #endif
// return 0;
// #endif // USE_GRACE
bool invalid = false;
if (P2P_Mode == NCCL)
{
if (rank == 0)
printf("Invalid P2P communication mode (NCCL) for CPUs, Exiting ...\n");
invalid = true;
}
if (P2P_Mode == MPI_GPU_All2allv)
{
if (rank == 0)
printf("Invalid P2P communication mode (MPI GPU All2allv) for CPUs, Exiting ...\n");
invalid = true;
}
if (P2P_Mode == MPI_CUDA_AWARE)
{
if (rank == 0)
printf("Invalid P2P communication mode (CUDA-Aware MPI) for CPUs, Exiting ...\n");
invalid = true;
}
if (invalid)
{
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}
}
#ifndef USE_NCCL
if (params.exec_mode == GPUONLY)
{
if (rank == 0)
printf(
"Error: HPCG was not compiled with NCCL. USE --exm=1 for Grace-only execution or add -DUSE_NCCL. "
"Exiting ...\n");
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}
#endif // USE_NCCL
// Check whether total number of ranks == npx*npy*npz
auto rank_grid_size = params.npx * params.npy * params.npz;
if (rank_grid_size > 0 && size != rank_grid_size)
{
if (rank == 0)
printf("Error: Total Number of ranks != npx*npy*npz. Exiting ...\n");
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}
#ifndef USE_CUDA
if (params.exec_mode != CPUONLY)
{
if (rank == 0)
printf(
"Error: HPCG was not compiled for GPU execution. USE --exm=1 for Grace-only execution or add "
"-DUSE_CUDA. Exiting ...\n");
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}
#endif
// Here, we decide the rank type
// assign a rank to GPU and CPU
InitializeRanks(params);
// Check if QuickPath option is enabled.
// If the running time is set to zero, we minimize all paths through the program
#ifdef HPCG_DETAILED_DEBUG
if (size < 100 && rank == 0)
HPCG_fout << "Process " << rank << " of " << size << " is alive with " << params.numThreads << " threads."
<< endl;
if (rank == 0)
{
char c;
std::cout << "Press key to continue" << std::endl;
std::cin.get(c);
}
#ifndef HPCG_NO_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
#endif
/////////////////////////
// Problem setup Phase //
/////////////////////////
#ifdef HPCG_DEBUG
double t1 = mytimer();
#endif
// Construct the geometry and linear system
Geometry* geom = new Geometry;
GenerateGeometry(params, geom);
int ierr = CheckAspectRatio(0.125, geom->nx, geom->ny, geom->nz, "local problem", rank == 0);
if (ierr)
return ierr;
ierr = CheckAspectRatio(0.125, geom->npx, geom->npy, geom->npz, "process grid", rank == 0);
if (ierr)
return ierr;
// Sync All Ranks
#ifndef HPCG_NO_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
// Test Library versions for cuSPARSE or NVPL Sparse
// The two library versions have to be tested in
// GPU or Grace ranks
int cusparseMajor = 0, cusparseMinor = 0;
if (params.exec_mode == GPUONLY || params.exec_mode == GPUCPU)
{
#ifdef USE_CUDA
// Cusparse Version
cusparseGetProperty(MAJOR_VERSION, &cusparseMajor);
cusparseGetProperty(MINOR_VERSION, &cusparseMinor);
if (cusparseMajor < 12 || (cusparseMajor == 12 && cusparseMinor < 2))
{
if (rank == 0)
printf("cuSPARSE version must be 12.2 or higher (found v%d.%d) \n", cusparseMajor, cusparseMinor);
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}
#endif
}
int nvspMajor = 0, nvspMinor = 0, nvspPatch = 0, nvspVersion = 0;
// if (params.exec_mode == CPUONLY || params.exec_mode == GPUCPU)
// {
// #ifdef USE_GRACE
// // NVPL Sparse Version
// nvpl_sparse_create(&(nvpl_sparse_handle));
// nvpl_sparse_get_version(nvpl_sparse_handle, &nvspVersion);
// nvspMajor = nvspVersion / 1000;
// nvspMinor = (nvspVersion % 1000) / 100;
// nvspPatch = nvspVersion % 100;
// if (nvspMajor < 0 || (nvspMajor == 0 && nvspMinor < 2))
// {
// if (rank == 0)
// printf("NVPL Sparse version must be 0.2 or higher (found v%d.%d) \n", nvspMajor, nvspMinor);
// #ifndef HPCG_NO_MPI
// MPI_Finalize();
// #endif
// return 0;
// }
// #endif // USE_GRACE
// }
SparseMatrix A;
Vector x_overlap, b_computed;
Vector b, x, xexact;
std::vector<double> times(10, 0.0);
CGData data;
InitializeSparseMatrix(A, geom);
size_t cpuRefMemory = 0;
int numberOfMgLevels = 4; // Number of levels including first
SparseMatrix* curLevelMatrix = &A;
if (params.rank_type == GPU)
{
A.rankType = GPU;
A.slice_size = params.gpu_slice_size;
cublasCreate(&(cublashandle));
cusparseCreate(&(cusparsehandle));
cudaStreamCreate(&(stream));
cudaStreamCreate(&(copy_stream));
cusparseSetStream(cusparsehandle, stream);
cublasSetStream(cublashandle, stream);
cusparseSetPointerMode(cusparsehandle, CUSPARSE_POINTER_MODE_HOST);
cublasSetPointerMode(cublashandle, CUBLAS_POINTER_MODE_HOST);
cudaEventCreate(&copy_done);
// Allocate GPU related data
AllocateMemCuda(A);
double setup_time = mytimer();
GenerateProblem(A, &b, &x, &xexact);
SetupHalo(A);
for (int level = 1; level < numberOfMgLevels; ++level)
{
GenerateCoarseProblem(*curLevelMatrix);
curLevelMatrix = curLevelMatrix->Ac; // Make the just-constructed coarse grid the next level
}
setup_time = mytimer() - setup_time; // Capture total time of setup
delete[] physical_rank_dims;
delete[] logical_rank_to_phys;
times[9] = setup_time; // Save it for reporting
// Copy data from Device to Host.
// Note: exclude this from setup_time, as soon as it is needed only for reference calls.
cpuRefMemory = CopyDataToHostCuda(A, &b, &x, &xexact);
// Allocate the GPU data for optimized data structures
AllocateMemOptCuda(A);
}
// else
// {
// #ifdef USE_GRACE
// A.rankType = CPU;
// A.slice_size = params.cpu_slice_size;
// // Use this array for collecting timing information
// double setup_time = mytimer();
// GenerateProblem(A, &b, &x, &xexact);
// SetupHalo(A);
// for (int level = 1; level < numberOfMgLevels; ++level)
// {
// GenerateCoarseProblem(*curLevelMatrix);
// curLevelMatrix = curLevelMatrix->Ac; // Make the just-constructed coarse grid the next level
// }
// // These global buffers only needed for problem setup
// delete[] rankToId_h;
// delete[] idToRank_h;
// delete[] physical_rank_dims;
// delete[] logical_rank_to_phys;
// setup_time = mytimer() - setup_time; // Capture total time of setup
// times[9] = setup_time; // Save it for reporting
// #endif // USE_GRACE
// }
curLevelMatrix = &A;
Vector* curb = &b;
Vector* curx = &x;
Vector* curxexact = &xexact;
for (int level = 0; level < numberOfMgLevels; ++level)
{
// Doesn't work for GPU or GPUCPU cases
// Data would need to be transferred between CPU and GPU, which is not feasible
if (params.exec_mode == CPUONLY)
{
CheckProblem(*curLevelMatrix, curb, curx, curxexact);
//Delete mtxIndG since it is not needed anymore
delete [] curLevelMatrix->mtxIndG[0];
}
curLevelMatrix = curLevelMatrix->Ac; // Make the next coarse grid the next level
curb = 0; // No vectors after the top level
curx = 0;
curxexact = 0;
}
InitializeSparseCGData(A, data);
////////////////////////////////////
// Reference SpMV+MG Timing Phase //
////////////////////////////////////
// Call Reference SpMV and MG. Compute Optimization time as ratio of times in these routines
local_int_t nrow = A.localNumberOfRows;
local_int_t ncol = A.localNumberOfColumns;
InitializeVector(x_overlap, ncol, A.rankType); // Overlapped copy of x vector
InitializeVector(b_computed, nrow, A.rankType); // Computed RHS vector
// Record execution time of reference SpMV and MG kernels for reporting times
// First load vector with random values
FillRandomVector(x_overlap);
int numberOfCalls = 10;
if (quickPath)
numberOfCalls = 1; // QuickPath means we do only one call of each block of repetitive code
if (!benchmark_mode)
{
double t_begin = mytimer();
for (int i = 0; i < numberOfCalls; ++i)
{
ierr = ComputeSPMV_ref(A, x_overlap, b_computed); // b_computed = A*x_overlap
if (ierr)
if (use_output_file)
{
HPCG_fout << "Error in call to SpMV: " << ierr << ".\n" << endl;
}
else
{
std::cout << "Error in call to SpMV: " << ierr << ".\n" << endl;
}
ierr = ComputeMG_ref(A, b_computed, x_overlap); // b_computed = Minv*y_overlap
if (ierr)
if (use_output_file)
{
HPCG_fout << "Error in call to MG: " << ierr << ".\n" << endl;
}
else
{
std::cout << "Error in call to MG: " << ierr << ".\n" << endl;
}
}
times[8] = (mytimer() - t_begin) / ((double) numberOfCalls); // Total time divided by number of calls.
#ifdef HPCG_DEBUG
if (rank == 0)
HPCG_fout << "Total SpMV+MG timing phase execution time in main (sec) = " << mytimer() - t1 << endl;
#endif
}
///////////////////////////////
// Reference CG Timing Phase //
///////////////////////////////
#ifdef HPCG_DEBUG
t1 = mytimer();
#endif
int global_failure = 0; // assume all is well: no failures
int niters = 0;
int totalNiters_ref = 0;
double normr = 1.0;
double normr0 = 1.0;
int refMaxIters = 50;
numberOfCalls = 1; // Only need to run the residual reduction analysis once
// Compute the residual reduction for the natural ordering and reference kernels
std::vector<double> ref_times(9, 0.0);
double tolerance = 0.0; // Set tolerance to zero to make all runs do maxIters iterations
int err_count = 0;
double refTolerance = 0.0055;
if (!benchmark_mode)
{
for (int i = 0; i < numberOfCalls; ++i)
{
ZeroVector(x);
ierr = CG_ref(A, data, b, x, refMaxIters, tolerance, niters, normr, normr0, &ref_times[0], true,
i == 0); // TODO: TRUE
if (ierr)
++err_count; // count the number of errors in CG
totalNiters_ref += niters;
}
if (rank == 0 && err_count)
if (use_output_file)
{
HPCG_fout << err_count << " error(s) in call(s) to reference CG." << endl;
}
else
{
std::cout << err_count << " error(s) in call(s) to reference CG." << endl;
}
refTolerance = normr / normr0;
}
if (params.exec_mode == GPUONLY || params.exec_mode == GPUCPU)
{
#ifdef USE_CUDA
if (cusparseMajor < 12 || (cusparseMajor == 12 && cusparseMinor < 5))
{
// Test for the coarsest-level matrix
if(A.localNumberOfRows/(8 * 8 * 8) < A.slice_size) {
if (rank == 0)
printf("cuSPARSE version must be 12.5 or higher (found v%d.%d) to allow a GPU slice size (%lld) larger than the matrix number of rows (%lld). Use --gss to set GPU slice size \n",
cusparseMajor, cusparseMinor, (long long)A.slice_size, (long long)(A.localNumberOfRows/(8*8*8)));
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}
}
#endif
}
// Call user-tunable set up function.
double t7 = mytimer();
size_t opt_mem = OptimizeProblem(A, data, b, x, xexact);
t7 = mytimer() - t7;
times[7] = t7;
#ifdef HPCG_DEBUG
if (rank == 0)
std::cout << "Total problem optimize in main (sec) = " << t7 << endl;
#endif
if (params.rank_type == GPU)
{
#ifdef USE_CUDA
int dev;
cudaDeviceProp props;
CHECK_CUDART(cudaGetDevice(&dev));
CHECK_CUDART(cudaGetDeviceProperties(&props, dev));
size_t free_bytes, total_bytes;
CHECK_CUDART(cudaMemGetInfo(&free_bytes, &total_bytes));
//Find the number of SMS
int numSMS = props.multiProcessorCount;
if (rank == 0)
printf(
"GPU Rank Info:\n"
" | cuSPARSE version %d.%d\n%s"
" | Reference CPU memory = %.2f MB\n"
" | GPU Name: '%s'\n"
" | Number of SMs: %d\n"
" | GPU Memory Use: %ld MB / %ld MB\n"
" | Process Grid: %dx%dx%d\n"
" | Local Domain: %dx%dx%d\n"
" | Number of CPU Threads: %d\n"
" | Slice Size: %lld\n",
cusparseMajor, cusparseMinor, Use_Compression ? " | L2 compression is activated\n" : "",
cpuRefMemory / 1024.0 / 1024.0, props.name, numSMS, (total_bytes - free_bytes) >> 20, total_bytes >> 20,
A.geom->npx, A.geom->npy, A.geom->npz, (int)A.geom->nx, (int)A.geom->ny, (int)A.geom->nz, params.numThreads, (long long)A.slice_size);
CHECK_CUDART(cudaDeviceSynchronize());
#endif
}
else
{
#ifdef USE_GRACE
cpuRefMemory = EstimateCpuRefMem(A);
if (rank == 0 || (params.exec_mode == GPUCPU && params.cpu_allowed_to_print))
printf(
"CPU Rank Info:\n"
" | NVPL Sparse version %d.%d.%d\n"
" | Reference CPU memory = %.2f MB\n"
" | Optimization Memory Use: %.2f MB\n"
" | Process Grid: %dx%dx%d\n"
" | Local Domain: %dx%dx%d\n"
" | Number of CPU Threads: %d\n"
" | Slice Size: %d\n",
nvspMajor, nvspMinor, nvspPatch, cpuRefMemory / 1024.0 / 1024.0, opt_mem / 1024.0 / 1024.0, A.geom->npx,
A.geom->npy, A.geom->npz, A.geom->nx, A.geom->ny, A.geom->nz, params.numThreads, A.slice_size);
#endif // USE_GRACE
}
#ifdef HPCG_DETAILED_DEBUG
if (geom->size == 1)
WriteProblem(*geom, A, b, x, xexact);
#endif
MPI_Barrier(MPI_COMM_WORLD);
//////////////////////////////
// Validation Testing Phase //
//////////////////////////////
#ifdef HPCG_DEBUG
t1 = mytimer();
#endif
TestCGData testcg_data;
testcg_data.count_pass = testcg_data.count_fail = 0;
TestCG(A, data, b, x, testcg_data);
TestSymmetryData testsymmetry_data;
TestSymmetry(A, b, xexact, testsymmetry_data);
#ifdef HPCG_DEBUG
if (rank == 0)
HPCG_fout << "Total validation (TestCG and TestSymmetry) execution time in main (sec) = " << mytimer() - t1
<< endl;
#endif
//////////////////////////////
// Optimized CG Setup Phase //
//////////////////////////////
// Need to permute the b vector
if (A.rankType == GPU)
{
#ifdef USE_CUDA
PermVectorCuda(A.opt2ref, b, A.localNumberOfRows);
#endif
}
else
{
#ifdef USE_GRACE
PermVectorCpu(A.opt2ref, b, A.localNumberOfRows);
#endif
}
niters = 0;
normr = 0.0;
normr0 = 0.0;
err_count = 0;
int tolerance_failures = 0;
int optMaxIters = 10 * refMaxIters;
int optNiters = refMaxIters;
double opt_worst_time = 0.0;
double opt_best_time = 9999999.0;
std::vector<double> bleh_times(9, 0.0);
ZeroVector(x); // start x at all zeros
ierr = CG(A, data, b, x, optMaxIters, refTolerance, niters, normr, normr0, &bleh_times[0], true, 1);
std::vector<double> opt_times(9, 0.0);
numberOfCalls = 1;
// Compute the residual reduction and residual count for the user ordering and optimized kernels.
for (int i = 0; i < numberOfCalls; ++i)
{
ZeroVector(x); // start x at all zeros
double last_cummulative_time = opt_times[0];
ierr = CG(A, data, b, x, optMaxIters, refTolerance, niters, normr, normr0, &opt_times[0], true, 0); // TODO:
// TRUE
if (ierr)
++err_count; // count the number of errors in CG
if (normr / normr0 > refTolerance)
++tolerance_failures; // the number of failures to reduce residual
// pick the largest number of iterations to guarantee convergence
if (niters > optNiters)
optNiters = niters;
double current_time = opt_times[0] - last_cummulative_time;
if (current_time > opt_worst_time)
opt_worst_time = current_time;
if (current_time < opt_best_time)
opt_best_time = current_time;
}
#ifndef HPCG_NO_MPI
// Get the absolute worst time across all MPI ranks (time in CG can be different)
double local_opt_worst_time = opt_worst_time;
MPI_Allreduce(&local_opt_worst_time, &opt_worst_time, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
#endif
if (rank == 0 && err_count)
if (use_output_file)
{
HPCG_fout << err_count << " error(s) in call(s) to optimized CG." << endl;
}
else
{
std::cout << err_count << " error(s) in call(s) to optimized CG." << endl;
}
if (tolerance_failures)
{
global_failure = 1;
if (rank == 0)
if (use_output_file)
{
HPCG_fout << "Failed to reduce the residual " << tolerance_failures << " times." << endl;
}
else
{
std::cout << "Failed to reduce the residual " << tolerance_failures << " times." << endl;
}
}
///////////////////////////////
// Optimized CG Timing Phase //
///////////////////////////////
// Here we finally run the benchmark phase
// The variable total_runtime is the target benchmark execution time in seconds
double total_runtime = params.runningTime;
int numberOfCgSets = int(total_runtime / opt_worst_time) + 1; // Run at least once, account for rounding
#ifdef HPCG_DEBUG
if (rank == 0)
{
HPCG_fout << "Projected running time: " << total_runtime << " seconds" << endl;
HPCG_fout << "Number of CG sets: " << numberOfCgSets << endl;
}
#endif
/* This is the timed run for a specified amount of time. */
optMaxIters = optNiters;
double optTolerance = 0.0; // Force optMaxIters iterations
TestNormsData testnorms_data;
testnorms_data.samples = numberOfCgSets;
testnorms_data.values = new double[numberOfCgSets];
#ifndef HPCG_NO_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
for (int i = 0; i < numberOfCgSets; ++i)
{
ZeroVector(x); // Zero out x
ierr = CG(A, data, b, x, optMaxIters, optTolerance, niters, normr, normr0, &times[0], true, 0); // TODO: TRUE
if (ierr)
if (use_output_file)
{
HPCG_fout << "Error in call to CG: " << ierr << ".\n" << endl;
}
else
{
std::cout << "Error in call to CG: " << ierr << ".\n" << endl;
}
if (rank == 0)
if (use_output_file)
{
HPCG_fout << "Call [" << i << "] Scaled Residual [" << normr / normr0 << "]" << endl;
}
else
{
std::cout << "Call [" << i << "] Scaled Residual [" << normr / normr0 << "]" << endl;
}
testnorms_data.values[i] = normr / normr0; // Record scaled residual from this run
}
if (params.rank_type == GPU)
{
#ifdef USE_CUDA
PermVectorCuda(A.ref2opt, x, A.localNumberOfRows);
CopyVectorD2H(x);
#endif
}
else
{
#ifdef USE_GRACE
// Reorder vector
Vector xOrdered;
InitializeVector(xOrdered, x.localLength, A.rankType);
CopyVector(x, xOrdered);
CopyAndReorderVector(xOrdered, x, A.ref2opt);
DeleteVector(xOrdered);
#endif
}
// Compute difference between known exact solution and computed solution
// All processors are needed here.
#ifdef HPCG_DEBUG
double residual = 0;
ierr = ComputeResidual(A.localNumberOfRows, x, xexact, residual);
if (ierr)
HPCG_fout << "Error in call to compute_residual: " << ierr << ".\n" << endl;
if (rank == 0)
HPCG_fout << "Difference between computed and exact = " << residual << ".\n" << endl;
#endif
// Test Norm Results
ierr = TestNorms(testnorms_data);
//////////////////
// Report Results //
//////////////////
// Report results to YAML file
ReportResults(A, numberOfMgLevels, numberOfCgSets, refMaxIters, optMaxIters, &times[0], testcg_data,
testsymmetry_data, testnorms_data, global_failure, quickPath);
if (params.rank_type == GPU)
{
#ifdef USE_CUDA
DeleteMatrixGpu(A); // This delete will recursively delete all coarse grid data
#endif
}
else
{
#ifdef USE_GRACE
DeleteMatrixCpu(A); // This delete will recursively delete all coarse grid data
#endif
}
DeleteCGData(data);
DeleteVector(x);
DeleteVector(b);
DeleteVector(xexact);
DeleteVector(x_overlap);
DeleteVector(b_computed);
delete[] testnorms_data.values;
// Clean cuSPARSE data
if (params.rank_type == GPU)
{
#ifdef USE_CUDA
cublasDestroy(cublashandle);
cusparseDestroy(cusparsehandle);
cudaStreamDestroy(stream);
cudaStreamDestroy(copy_stream);
cudaEventDestroy(copy_done);
#endif
}
// We create the handle even on GPU ranks to find the library version
if (params.exec_mode == CPUONLY || params.exec_mode == GPUCPU)
{
#ifdef USE_GRACE
nvpl_sparse_destroy(nvpl_sparse_handle);
#endif
}
HPCG_Finalize();
// Finish up
#ifndef HPCG_NO_MPI
MPI_Finalize();
#endif
return 0;
}

59
src/mytimer.cpp Normal file
View File

@@ -0,0 +1,59 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
/////////////////////////////////////////////////////////////////////////
// Function to return time in seconds.
// Returns elapsed wall-clock time in seconds: MPI_Wtime when MPI is
// enabled, omp_get_wtime under OpenMP, otherwise a gettimeofday-based timer.
/////////////////////////////////////////////////////////////////////////
#ifndef HPCG_NO_MPI
#include <mpi.h>
// MPI build: MPI_Wtime returns elapsed (wall-clock) seconds since an
// arbitrary fixed point in the past; resolution is implementation defined.
double mytimer(void)
{
return MPI_Wtime();
}
#elif !defined(HPCG_NO_OPENMP)
// If this routine is compiled with HPCG_NO_MPI defined and not compiled with HPCG_NO_OPENMP then use the OpenMP timer
#include <omp.h>
// omp_get_wtime also reports elapsed wall-clock seconds.
double mytimer(void)
{
return omp_get_wtime();
}
#else
// Fallback for builds with neither MPI nor OpenMP: a gettimeofday-based
// wall-clock timer measured relative to the first call.
#include <cstdlib>
#include <sys/resource.h>
#include <sys/time.h>
double mytimer(void)
{
struct timeval tp;
// The first call latches its time-of-day in these function-local statics
// and returns 0.0; subsequent calls return seconds elapsed since then.
// NOTE(review): the first-call initialization is not thread-safe —
// confirm timing always starts on a single thread before relying on it.
static long start = 0, startu;
if (!start)
{
gettimeofday(&tp, NULL);
start = tp.tv_sec;
startu = tp.tv_usec;
return 0.0;
}
gettimeofday(&tp, NULL);
return ((double) (tp.tv_sec - start)) + (tp.tv_usec - startu) / 1000000.0;
}
#endif

18
src/mytimer.hpp Normal file
View File

@@ -0,0 +1,18 @@
//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra (dongarra@eecs.utk.edu)
// Piotr Luszczek (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER
#ifndef MYTIMER_HPP
#define MYTIMER_HPP
double mytimer(void); //!< Elapsed wall-clock time in seconds; backend (MPI/OpenMP/gettimeofday) is selected at build time in mytimer.cpp
#endif // MYTIMER_HPP